xref: /openbmc/qemu/target/arm/tcg/vec_helper.c (revision bb153e7960b24fb31b36b4230e4d1d008ae2f129)
1 /*
2  * ARM AdvSIMD / SVE Vector Operations
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "crypto/clmul.h"
27 #include "vec_internal.h"
28 
29 /*
30  * Data for expanding active predicate bits to bytes, for byte elements.
31  *
32  *  for (i = 0; i < 256; ++i) {
33  *      unsigned long m = 0;
34  *      for (j = 0; j < 8; j++) {
35  *          if ((i >> j) & 1) {
36  *              m |= 0xfful << (j << 3);
37  *          }
38  *      }
39  *      printf("0x%016lx,\n", m);
40  *  }
41  */
42 const uint64_t expand_pred_b_data[256] = {
43     0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
44     0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
45     0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
46     0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
47     0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
48     0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
49     0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
50     0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
51     0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
52     0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
53     0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
54     0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
55     0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
56     0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
57     0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
58     0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
59     0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
60     0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
61     0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
62     0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
63     0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
64     0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
65     0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
66     0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
67     0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
68     0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
69     0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
70     0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
71     0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
72     0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
73     0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
74     0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
75     0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
76     0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
77     0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
78     0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
79     0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
80     0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
81     0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
82     0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
83     0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
84     0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
85     0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
86     0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
87     0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
88     0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
89     0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
90     0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
91     0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
92     0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
93     0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
94     0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
95     0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
96     0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
97     0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
98     0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
99     0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
100     0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
101     0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
102     0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
103     0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
104     0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
105     0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
106     0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
107     0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
108     0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
109     0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
110     0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
111     0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
112     0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
113     0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
114     0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
115     0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
116     0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
117     0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
118     0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
119     0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
120     0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
121     0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
122     0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
123     0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
124     0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
125     0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
126     0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
127     0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
128     0xffffffffffffffff,
129 };
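/*
 * For example, predicate value 0x05 has bits 0 and 2 set, so bytes 0
 * and 2 of the expansion are all-ones:
 * expand_pred_b_data[0x05] == 0x0000000000ff00ff.
 */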
130 
131 /*
132  * Similarly for half-word elements.
133  *  for (i = 0; i < 256; ++i) {
134  *      unsigned long m = 0;
135  *      if (i & 0xaa) {
136  *          continue;
137  *      }
138  *      for (j = 0; j < 8; j += 2) {
139  *          if ((i >> j) & 1) {
140  *              m |= 0xfffful << (j << 3);
141  *          }
142  *      }
143  *      printf("[0x%x] = 0x%016lx,\n", i, m);
144  *  }
145  */
146 const uint64_t expand_pred_h_data[0x55 + 1] = {
147     [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
148     [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
149     [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
150     [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
151     [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
152     [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
153     [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
154     [0x55] = 0xffffffffffffffff,
155 };
156 
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
159                      bool neg, bool round)
160 {
161     /*
162      * Simplify:
163      * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
164      * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
165      */
166     int32_t ret = (int32_t)src1 * src2;
167     if (neg) {
168         ret = -ret;
169     }
170     ret += ((int32_t)src3 << 7) + (round << 6);
171     ret >>= 7;
172 
173     if (ret != (int8_t)ret) {
174         ret = (ret < 0 ? INT8_MIN : INT8_MAX);
175     }
176     return ret;
177 }
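/*
 * A worked instance of the simplification above (values chosen only for
 * illustration): with src1 = src2 = 64, src3 = 0 and round set,
 * ret = 64 * 64 + (1 << 6) = 4160 and 4160 >> 7 = 32, matching the
 * unsimplified ((2 * 4096 + 128) >> 8).  With src1 = src2 = -128 the
 * shifted result is 128, which does not fit in int8_t, so the helper
 * saturates to INT8_MAX.
 */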
178 
179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180                              void *va, uint32_t desc)
181 {
182     intptr_t i, opr_sz = simd_oprsz(desc);
183     int8_t *d = vd, *n = vn, *m = vm, *a = va;
184 
185     for (i = 0; i < opr_sz; ++i) {
186         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187     }
188 }
189 
190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191                              void *va, uint32_t desc)
192 {
193     intptr_t i, opr_sz = simd_oprsz(desc);
194     int8_t *d = vd, *n = vn, *m = vm, *a = va;
195 
196     for (i = 0; i < opr_sz; ++i) {
197         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198     }
199 }
200 
201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202 {
203     intptr_t i, opr_sz = simd_oprsz(desc);
204     int8_t *d = vd, *n = vn, *m = vm;
205 
206     for (i = 0; i < opr_sz; ++i) {
207         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208     }
209 }
210 
211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212 {
213     intptr_t i, opr_sz = simd_oprsz(desc);
214     int8_t *d = vd, *n = vn, *m = vm;
215 
216     for (i = 0; i < opr_sz; ++i) {
217         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218     }
219 }
220 
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
223                       bool neg, bool round, uint32_t *sat)
224 {
225     /* Simplify similarly to do_sqrdmlah_b above.  */
226     int32_t ret = (int32_t)src1 * src2;
227     if (neg) {
228         ret = -ret;
229     }
230     ret += ((int32_t)src3 << 15) + (round << 14);
231     ret >>= 15;
232 
233     if (ret != (int16_t)ret) {
234         *sat = 1;
235         ret = (ret < 0 ? INT16_MIN : INT16_MAX);
236     }
237     return ret;
238 }
239 
240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241                                   uint32_t src2, uint32_t src3)
242 {
243     uint32_t *sat = &env->vfp.qc[0];
244     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246                                 false, true, sat);
247     return deposit32(e1, 16, 16, e2);
248 }
249 
250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251                               void *vq, uint32_t desc)
252 {
253     uintptr_t opr_sz = simd_oprsz(desc);
254     int16_t *d = vd;
255     int16_t *n = vn;
256     int16_t *m = vm;
257     uintptr_t i;
258 
259     for (i = 0; i < opr_sz / 2; ++i) {
260         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261     }
262     clear_tail(d, opr_sz, simd_maxsz(desc));
263 }
264 
265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266                                   uint32_t src2, uint32_t src3)
267 {
268     uint32_t *sat = &env->vfp.qc[0];
269     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271                                 true, true, sat);
272     return deposit32(e1, 16, 16, e2);
273 }
274 
275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276                               void *vq, uint32_t desc)
277 {
278     uintptr_t opr_sz = simd_oprsz(desc);
279     int16_t *d = vd;
280     int16_t *n = vn;
281     int16_t *m = vm;
282     uintptr_t i;
283 
284     for (i = 0; i < opr_sz / 2; ++i) {
285         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286     }
287     clear_tail(d, opr_sz, simd_maxsz(desc));
288 }
289 
290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291                             void *vq, uint32_t desc)
292 {
293     intptr_t i, opr_sz = simd_oprsz(desc);
294     int16_t *d = vd, *n = vn, *m = vm;
295 
296     for (i = 0; i < opr_sz / 2; ++i) {
297         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298     }
299     clear_tail(d, opr_sz, simd_maxsz(desc));
300 }
301 
302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303                              void *vq, uint32_t desc)
304 {
305     intptr_t i, opr_sz = simd_oprsz(desc);
306     int16_t *d = vd, *n = vn, *m = vm;
307 
308     for (i = 0; i < opr_sz / 2; ++i) {
309         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310     }
311     clear_tail(d, opr_sz, simd_maxsz(desc));
312 }
313 
314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
315                                 void *vq, uint32_t desc)
316 {
317     intptr_t i, j, opr_sz = simd_oprsz(desc);
318     int idx = simd_data(desc);
319     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
320     intptr_t elements = opr_sz / 2;
321     intptr_t eltspersegment = MIN(16 / 2, elements);
322 
323     for (i = 0; i < elements; i += 16 / 2) {
324         int16_t mm = m[i];
325         for (j = 0; j < eltspersegment; ++j) {
326             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
327         }
328     }
329     clear_tail(d, opr_sz, simd_maxsz(desc));
330 }
331 
332 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
333                                  void *vq, uint32_t desc)
334 {
335     intptr_t i, j, opr_sz = simd_oprsz(desc);
336     int idx = simd_data(desc);
337     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
338     intptr_t elements = opr_sz / 2;
339     intptr_t eltspersegment = MIN(16 / 2, elements);
340 
341     for (i = 0; i < elements; i += 16 / 2) {
342         int16_t mm = m[i];
343         for (j = 0; j < eltspersegment; ++j) {
344             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
345         }
346     }
347     clear_tail(d, opr_sz, simd_maxsz(desc));
348 }
349 
350 void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
351                                  void *vq, uint32_t desc)
352 {
353     intptr_t i, j, opr_sz = simd_oprsz(desc);
354     int idx = simd_data(desc);
355     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
356     intptr_t elements = opr_sz / 2;
357     intptr_t eltspersegment = MIN(16 / 2, elements);
358 
359     for (i = 0; i < elements; i += 16 / 2) {
360         int16_t mm = m[i];
361         for (j = 0; j < eltspersegment; ++j) {
362             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
363         }
364     }
365     clear_tail(d, opr_sz, simd_maxsz(desc));
366 }
367 
368 void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
369                                  void *vq, uint32_t desc)
370 {
371     intptr_t i, j, opr_sz = simd_oprsz(desc);
372     int idx = simd_data(desc);
373     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
374     intptr_t elements = opr_sz / 2;
375     intptr_t eltspersegment = MIN(16 / 2, elements);
376 
377     for (i = 0; i < elements; i += 16 / 2) {
378         int16_t mm = m[i];
379         for (j = 0; j < eltspersegment; ++j) {
380             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
381         }
382     }
383     clear_tail(d, opr_sz, simd_maxsz(desc));
384 }
385 
386 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
387                              void *va, uint32_t desc)
388 {
389     intptr_t i, opr_sz = simd_oprsz(desc);
390     int16_t *d = vd, *n = vn, *m = vm, *a = va;
391     uint32_t discard;
392 
393     for (i = 0; i < opr_sz / 2; ++i) {
394         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
395     }
396 }
397 
398 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
399                              void *va, uint32_t desc)
400 {
401     intptr_t i, opr_sz = simd_oprsz(desc);
402     int16_t *d = vd, *n = vn, *m = vm, *a = va;
403     uint32_t discard;
404 
405     for (i = 0; i < opr_sz / 2; ++i) {
406         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
407     }
408 }
409 
410 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
411 {
412     intptr_t i, opr_sz = simd_oprsz(desc);
413     int16_t *d = vd, *n = vn, *m = vm;
414     uint32_t discard;
415 
416     for (i = 0; i < opr_sz / 2; ++i) {
417         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
418     }
419 }
420 
421 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
422 {
423     intptr_t i, opr_sz = simd_oprsz(desc);
424     int16_t *d = vd, *n = vn, *m = vm;
425     uint32_t discard;
426 
427     for (i = 0; i < opr_sz / 2; ++i) {
428         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
429     }
430 }
431 
432 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
433 {
434     intptr_t i, j, opr_sz = simd_oprsz(desc);
435     int idx = simd_data(desc);
436     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
437     uint32_t discard;
438 
439     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
440         int16_t mm = m[i];
441         for (j = 0; j < 16 / 2; ++j) {
442             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
443         }
444     }
445 }
446 
447 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
448 {
449     intptr_t i, j, opr_sz = simd_oprsz(desc);
450     int idx = simd_data(desc);
451     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
452     uint32_t discard;
453 
454     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
455         int16_t mm = m[i];
456         for (j = 0; j < 16 / 2; ++j) {
457             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
458         }
459     }
460 }
461 
462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
463 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
464                       bool neg, bool round, uint32_t *sat)
465 {
466     /* Simplify similarly to do_sqrdmlah_b above.  */
467     int64_t ret = (int64_t)src1 * src2;
468     if (neg) {
469         ret = -ret;
470     }
471     ret += ((int64_t)src3 << 31) + (round << 30);
472     ret >>= 31;
473 
474     if (ret != (int32_t)ret) {
475         *sat = 1;
476         ret = (ret < 0 ? INT32_MIN : INT32_MAX);
477     }
478     return ret;
479 }
480 
481 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
482                                   int32_t src2, int32_t src3)
483 {
484     uint32_t *sat = &env->vfp.qc[0];
485     return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
486 }
487 
488 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
489                               void *vq, uint32_t desc)
490 {
491     uintptr_t opr_sz = simd_oprsz(desc);
492     int32_t *d = vd;
493     int32_t *n = vn;
494     int32_t *m = vm;
495     uintptr_t i;
496 
497     for (i = 0; i < opr_sz / 4; ++i) {
498         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
499     }
500     clear_tail(d, opr_sz, simd_maxsz(desc));
501 }
502 
503 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
504                                   int32_t src2, int32_t src3)
505 {
506     uint32_t *sat = &env->vfp.qc[0];
507     return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
508 }
509 
510 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
511                               void *vq, uint32_t desc)
512 {
513     uintptr_t opr_sz = simd_oprsz(desc);
514     int32_t *d = vd;
515     int32_t *n = vn;
516     int32_t *m = vm;
517     uintptr_t i;
518 
519     for (i = 0; i < opr_sz / 4; ++i) {
520         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
521     }
522     clear_tail(d, opr_sz, simd_maxsz(desc));
523 }
524 
525 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
526                             void *vq, uint32_t desc)
527 {
528     intptr_t i, opr_sz = simd_oprsz(desc);
529     int32_t *d = vd, *n = vn, *m = vm;
530 
531     for (i = 0; i < opr_sz / 4; ++i) {
532         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
533     }
534     clear_tail(d, opr_sz, simd_maxsz(desc));
535 }
536 
537 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
538                              void *vq, uint32_t desc)
539 {
540     intptr_t i, opr_sz = simd_oprsz(desc);
541     int32_t *d = vd, *n = vn, *m = vm;
542 
543     for (i = 0; i < opr_sz / 4; ++i) {
544         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
545     }
546     clear_tail(d, opr_sz, simd_maxsz(desc));
547 }
548 
549 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
550                                 void *vq, uint32_t desc)
551 {
552     intptr_t i, j, opr_sz = simd_oprsz(desc);
553     int idx = simd_data(desc);
554     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
555     intptr_t elements = opr_sz / 4;
556     intptr_t eltspersegment = MIN(16 / 4, elements);
557 
558     for (i = 0; i < elements; i += 16 / 4) {
559         int32_t mm = m[i];
560         for (j = 0; j < eltspersegment; ++j) {
561             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
562         }
563     }
564     clear_tail(d, opr_sz, simd_maxsz(desc));
565 }
566 
567 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
568                                  void *vq, uint32_t desc)
569 {
570     intptr_t i, j, opr_sz = simd_oprsz(desc);
571     int idx = simd_data(desc);
572     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
573     intptr_t elements = opr_sz / 4;
574     intptr_t eltspersegment = MIN(16 / 4, elements);
575 
576     for (i = 0; i < elements; i += 16 / 4) {
577         int32_t mm = m[i];
578         for (j = 0; j < eltspersegment; ++j) {
579             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
580         }
581     }
582     clear_tail(d, opr_sz, simd_maxsz(desc));
583 }
584 
585 void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
586                                  void *vq, uint32_t desc)
587 {
588     intptr_t i, j, opr_sz = simd_oprsz(desc);
589     int idx = simd_data(desc);
590     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
591     intptr_t elements = opr_sz / 4;
592     intptr_t eltspersegment = MIN(16 / 4, elements);
593 
594     for (i = 0; i < elements; i += 16 / 4) {
595         int32_t mm = m[i];
596         for (j = 0; j < eltspersegment; ++j) {
597             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
598         }
599     }
600     clear_tail(d, opr_sz, simd_maxsz(desc));
601 }
602 
603 void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
604                                  void *vq, uint32_t desc)
605 {
606     intptr_t i, j, opr_sz = simd_oprsz(desc);
607     int idx = simd_data(desc);
608     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
609     intptr_t elements = opr_sz / 4;
610     intptr_t eltspersegment = MIN(16 / 4, elements);
611 
612     for (i = 0; i < elements; i += 16 / 4) {
613         int32_t mm = m[i];
614         for (j = 0; j < eltspersegment; ++j) {
615             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
616         }
617     }
618     clear_tail(d, opr_sz, simd_maxsz(desc));
619 }
620 
621 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
622                              void *va, uint32_t desc)
623 {
624     intptr_t i, opr_sz = simd_oprsz(desc);
625     int32_t *d = vd, *n = vn, *m = vm, *a = va;
626     uint32_t discard;
627 
628     for (i = 0; i < opr_sz / 4; ++i) {
629         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
630     }
631 }
632 
633 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
634                              void *va, uint32_t desc)
635 {
636     intptr_t i, opr_sz = simd_oprsz(desc);
637     int32_t *d = vd, *n = vn, *m = vm, *a = va;
638     uint32_t discard;
639 
640     for (i = 0; i < opr_sz / 4; ++i) {
641         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
642     }
643 }
644 
645 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
646 {
647     intptr_t i, opr_sz = simd_oprsz(desc);
648     int32_t *d = vd, *n = vn, *m = vm;
649     uint32_t discard;
650 
651     for (i = 0; i < opr_sz / 4; ++i) {
652         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
653     }
654 }
655 
656 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
657 {
658     intptr_t i, opr_sz = simd_oprsz(desc);
659     int32_t *d = vd, *n = vn, *m = vm;
660     uint32_t discard;
661 
662     for (i = 0; i < opr_sz / 4; ++i) {
663         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
664     }
665 }
666 
667 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
668 {
669     intptr_t i, j, opr_sz = simd_oprsz(desc);
670     int idx = simd_data(desc);
671     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
672     uint32_t discard;
673 
674     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
675         int32_t mm = m[i];
676         for (j = 0; j < 16 / 4; ++j) {
677             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
678         }
679     }
680 }
681 
682 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
683 {
684     intptr_t i, j, opr_sz = simd_oprsz(desc);
685     int idx = simd_data(desc);
686     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
687     uint32_t discard;
688 
689     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
690         int32_t mm = m[i];
691         for (j = 0; j < 16 / 4; ++j) {
692             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
693         }
694     }
695 }
696 
697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
698 static int64_t do_sat128_d(Int128 r)
699 {
700     int64_t ls = int128_getlo(r);
701     int64_t hs = int128_gethi(r);
702 
703     if (unlikely(hs != (ls >> 63))) {
704         return hs < 0 ? INT64_MIN : INT64_MAX;
705     }
706     return ls;
707 }
708 
709 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
710 {
711     uint64_t l, h;
712     Int128 r, t;
713 
714     /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
715     muls64(&l, &h, m, n);
716     r = int128_make128(l, h);
717     if (neg) {
718         r = int128_neg(r);
719     }
720     if (a) {
721         t = int128_exts64(a);
722         t = int128_lshift(t, 63);
723         r = int128_add(r, t);
724     }
725     if (round) {
726         t = int128_exts64(1ll << 62);
727         r = int128_add(r, t);
728     }
729     r = int128_rshift(r, 63);
730 
731     return do_sat128_d(r);
732 }
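/*
 * The saturation test in do_sat128_d relies on a 128-bit value fitting
 * in int64_t exactly when its high half equals the sign extension of
 * its low half, i.e. hs == (ls >> 63); any mismatch saturates towards
 * the sign of the high half.
 */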
733 
734 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
735                              void *va, uint32_t desc)
736 {
737     intptr_t i, opr_sz = simd_oprsz(desc);
738     int64_t *d = vd, *n = vn, *m = vm, *a = va;
739 
740     for (i = 0; i < opr_sz / 8; ++i) {
741         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
742     }
743 }
744 
745 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
746                              void *va, uint32_t desc)
747 {
748     intptr_t i, opr_sz = simd_oprsz(desc);
749     int64_t *d = vd, *n = vn, *m = vm, *a = va;
750 
751     for (i = 0; i < opr_sz / 8; ++i) {
752         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
753     }
754 }
755 
756 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
757 {
758     intptr_t i, opr_sz = simd_oprsz(desc);
759     int64_t *d = vd, *n = vn, *m = vm;
760 
761     for (i = 0; i < opr_sz / 8; ++i) {
762         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
763     }
764 }
765 
766 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
767 {
768     intptr_t i, opr_sz = simd_oprsz(desc);
769     int64_t *d = vd, *n = vn, *m = vm;
770 
771     for (i = 0; i < opr_sz / 8; ++i) {
772         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
773     }
774 }
775 
776 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
777 {
778     intptr_t i, j, opr_sz = simd_oprsz(desc);
779     int idx = simd_data(desc);
780     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
781 
782     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
783         int64_t mm = m[i];
784         for (j = 0; j < 16 / 8; ++j) {
785             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
786         }
787     }
788 }
789 
790 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
791 {
792     intptr_t i, j, opr_sz = simd_oprsz(desc);
793     int idx = simd_data(desc);
794     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
795 
796     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
797         int64_t mm = m[i];
798         for (j = 0; j < 16 / 8; ++j) {
799             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
800         }
801     }
802 }
803 
804 /* Integer 8 and 16-bit dot-product.
805  *
806  * Note that for the loops herein, host endianness does not matter
807  * with respect to the ordering of data within the quad-width lanes.
808  * All elements are treated equally, no matter where they are.
809  */
810 
811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
813 {                                                                         \
814     intptr_t i, opr_sz = simd_oprsz(desc);                                \
815     TYPED *d = vd, *a = va;                                               \
816     TYPEN *n = vn;                                                        \
817     TYPEM *m = vm;                                                        \
818     for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
819         d[i] = (a[i] +                                                    \
820                 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
821                 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
822                 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
823                 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
824     }                                                                     \
825     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
826 }
827 
828 DO_DOT(gvec_sdot_4b, int32_t, int8_t, int8_t)
829 DO_DOT(gvec_udot_4b, uint32_t, uint8_t, uint8_t)
830 DO_DOT(gvec_usdot_4b, uint32_t, uint8_t, int8_t)
831 DO_DOT(gvec_sdot_4h, int64_t, int16_t, int16_t)
832 DO_DOT(gvec_udot_4h, uint64_t, uint16_t, uint16_t)
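/*
 * For a single 32-bit lane of gvec_sdot_4b the macro reduces to the
 * following sketch (illustrative only, not part of the helper):
 *
 *     d[i] = a[i] + (int32_t)n[4 * i + 0] * m[4 * i + 0]
 *                 + (int32_t)n[4 * i + 1] * m[4 * i + 1]
 *                 + (int32_t)n[4 * i + 2] * m[4 * i + 2]
 *                 + (int32_t)n[4 * i + 3] * m[4 * i + 3];
 *
 * i.e. four widened byte products accumulated into one 32-bit element.
 */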
833 
834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
836 {                                                                         \
837     intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
838     intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
839     /*                                                                    \
840      * Special case: opr_sz == 8 from AA64/AA32 advsimd means the         \
841      * first iteration might not be a full 16 byte segment. But           \
842      * for vector lengths beyond that this must be SVE and we know        \
843      * opr_sz is a multiple of 16, so we need not clamp segend            \
844      * to opr_sz_n when we advance it at the end of the loop.             \
845      */                                                                   \
846     intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
847     intptr_t index = simd_data(desc);                                     \
848     TYPED *d = vd, *a = va;                                               \
849     TYPEN *n = vn;                                                        \
850     TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
851     do {                                                                  \
852         TYPED m0 = m_indexed[i * 4 + 0];                                  \
853         TYPED m1 = m_indexed[i * 4 + 1];                                  \
854         TYPED m2 = m_indexed[i * 4 + 2];                                  \
855         TYPED m3 = m_indexed[i * 4 + 3];                                  \
856         do {                                                              \
857             d[i] = (a[i] +                                                \
858                     n[i * 4 + 0] * m0 +                                   \
859                     n[i * 4 + 1] * m1 +                                   \
860                     n[i * 4 + 2] * m2 +                                   \
861                     n[i * 4 + 3] * m3);                                   \
862         } while (++i < segend);                                           \
863         segend = i + (16 / sizeof(TYPED));                                \
864     } while (i < opr_sz_n);                                               \
865     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
866 }
867 
868 DO_DOT_IDX(gvec_sdot_idx_4b, int32_t, int8_t, int8_t, H4)
869 DO_DOT_IDX(gvec_udot_idx_4b, uint32_t, uint8_t, uint8_t, H4)
870 DO_DOT_IDX(gvec_sudot_idx_4b, int32_t, int8_t, uint8_t, H4)
871 DO_DOT_IDX(gvec_usdot_idx_4b, int32_t, uint8_t, int8_t, H4)
872 DO_DOT_IDX(gvec_sdot_idx_4h, int64_t, int16_t, int16_t, H8)
873 DO_DOT_IDX(gvec_udot_idx_4h, uint64_t, uint16_t, uint16_t, H8)
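/*
 * Concretely: with TYPED = int32_t and an 8-byte AdvSIMD operation,
 * opr_sz_n == 2, so the initial segend is clamped to 2 and the single
 * outer iteration covers only that half segment; for SVE sizes, which
 * are multiples of 16 bytes, every segment holds the full
 * 16 / sizeof(TYPED) elements.
 */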
874 
875 #undef DO_DOT
876 #undef DO_DOT_IDX
877 
878 /* Similar for 2-way dot product */
879 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
880 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
881 {                                                                         \
882     intptr_t i, opr_sz = simd_oprsz(desc);                                \
883     TYPED *d = vd, *a = va;                                               \
884     TYPEN *n = vn;                                                        \
885     TYPEM *m = vm;                                                        \
886     for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
887         d[i] = (a[i] +                                                    \
888                 (TYPED)n[i * 2 + 0] * m[i * 2 + 0] +                      \
889                 (TYPED)n[i * 2 + 1] * m[i * 2 + 1]);                      \
890     }                                                                     \
891     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
892 }
893 
894 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
895 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
896 {                                                                         \
897     intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
898     intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
899     intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
900     intptr_t index = simd_data(desc);                                     \
901     TYPED *d = vd, *a = va;                                               \
902     TYPEN *n = vn;                                                        \
903     TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 2;                       \
904     do {                                                                  \
905         TYPED m0 = m_indexed[i * 2 + 0];                                  \
906         TYPED m1 = m_indexed[i * 2 + 1];                                  \
907         do {                                                              \
908             d[i] = (a[i] +                                                \
909                     n[i * 2 + 0] * m0 +                                   \
910                     n[i * 2 + 1] * m1);                                   \
911         } while (++i < segend);                                           \
912         segend = i + (16 / sizeof(TYPED));                                \
913     } while (i < opr_sz_n);                                               \
914     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
915 }
916 
917 DO_DOT(gvec_sdot_2h, int32_t, int16_t, int16_t)
918 DO_DOT(gvec_udot_2h, uint32_t, uint16_t, uint16_t)
919 
920 DO_DOT_IDX(gvec_sdot_idx_2h, int32_t, int16_t, int16_t, H4)
921 DO_DOT_IDX(gvec_udot_idx_2h, uint32_t, uint16_t, uint16_t, H4)
922 
923 #undef DO_DOT
924 #undef DO_DOT_IDX
925 
926 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
927                          float_status *fpst, uint32_t desc)
928 {
929     uintptr_t opr_sz = simd_oprsz(desc);
930     float16 *d = vd;
931     float16 *n = vn;
932     float16 *m = vm;
933     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
934     bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
935     uintptr_t i;
936 
937     for (i = 0; i < opr_sz / 2; i += 2) {
938         float16 e0 = n[H2(i)];
939         float16 e1 = m[H2(i + 1)];
940         float16 e2 = n[H2(i + 1)];
941         float16 e3 = m[H2(i)];
942 
943         if (rot) {
944             e3 = float16_maybe_ah_chs(e3, fpcr_ah);
945         } else {
946             e1 = float16_maybe_ah_chs(e1, fpcr_ah);
947         }
948 
949         d[H2(i)] = float16_add(e0, e1, fpst);
950         d[H2(i + 1)] = float16_add(e2, e3, fpst);
951     }
952     clear_tail(d, opr_sz, simd_maxsz(desc));
953 }
954 
955 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
956                          float_status *fpst, uint32_t desc)
957 {
958     uintptr_t opr_sz = simd_oprsz(desc);
959     float32 *d = vd;
960     float32 *n = vn;
961     float32 *m = vm;
962     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
963     bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
964     uintptr_t i;
965 
966     for (i = 0; i < opr_sz / 4; i += 2) {
967         float32 e0 = n[H4(i)];
968         float32 e1 = m[H4(i + 1)];
969         float32 e2 = n[H4(i + 1)];
970         float32 e3 = m[H4(i)];
971 
972         if (rot) {
973             e3 = float32_maybe_ah_chs(e3, fpcr_ah);
974         } else {
975             e1 = float32_maybe_ah_chs(e1, fpcr_ah);
976         }
977 
978         d[H4(i)] = float32_add(e0, e1, fpst);
979         d[H4(i + 1)] = float32_add(e2, e3, fpst);
980     }
981     clear_tail(d, opr_sz, simd_maxsz(desc));
982 }
983 
984 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
985                          float_status *fpst, uint32_t desc)
986 {
987     uintptr_t opr_sz = simd_oprsz(desc);
988     float64 *d = vd;
989     float64 *n = vn;
990     float64 *m = vm;
991     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
992     bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
993     uintptr_t i;
994 
995     for (i = 0; i < opr_sz / 8; i += 2) {
996         float64 e0 = n[i];
997         float64 e1 = m[i + 1];
998         float64 e2 = n[i + 1];
999         float64 e3 = m[i];
1000 
1001         if (rot) {
1002             e3 = float64_maybe_ah_chs(e3, fpcr_ah);
1003         } else {
1004             e1 = float64_maybe_ah_chs(e1, fpcr_ah);
1005         }
1006 
1007         d[i] = float64_add(e0, e1, fpst);
1008         d[i + 1] = float64_add(e2, e3, fpst);
1009     }
1010     clear_tail(d, opr_sz, simd_maxsz(desc));
1011 }
1012 
1013 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
1014                          float_status *fpst, uint32_t desc)
1015 {
1016     uintptr_t opr_sz = simd_oprsz(desc);
1017     float16 *d = vd, *n = vn, *m = vm, *a = va;
1018     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1019     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
1020     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1021     uint32_t negf_real = flip ^ negf_imag;
1022     float16 negx_imag, negx_real;
1023     uintptr_t i;
1024 
1025     /* With AH=0, use negx; with AH=1 use negf. */
1026     negx_real = (negf_real & ~fpcr_ah) << 15;
1027     negx_imag = (negf_imag & ~fpcr_ah) << 15;
1028     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1029     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1030 
1031     for (i = 0; i < opr_sz / 2; i += 2) {
1032         float16 e2 = n[H2(i + flip)];
1033         float16 e1 = m[H2(i + flip)] ^ negx_real;
1034         float16 e4 = e2;
1035         float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag;
1036 
1037         d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], negf_real, fpst);
1038         d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], negf_imag, fpst);
1039     }
1040     clear_tail(d, opr_sz, simd_maxsz(desc));
1041 }
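/*
 * Note the two negation mechanisms used above: negx_real/negx_imag are
 * XORed into the sign bit of the m operand before the multiply (the
 * AH=0 path), while negf_real/negf_imag are instead passed to
 * float16_muladd as float_muladd_negate_product flags (the AH=1 path).
 */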
1042 
1043 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
1044                              float_status *fpst, uint32_t desc)
1045 {
1046     uintptr_t opr_sz = simd_oprsz(desc);
1047     float16 *d = vd, *n = vn, *m = vm, *a = va;
1048     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1049     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1050     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1051     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
1052     uint32_t negf_real = flip ^ negf_imag;
1053     intptr_t elements = opr_sz / sizeof(float16);
1054     intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
1055     float16 negx_imag, negx_real;
1056     intptr_t i, j;
1057 
1058     /* With AH=0, use negx; with AH=1 use negf. */
1059     negx_real = (negf_real & ~fpcr_ah) << 15;
1060     negx_imag = (negf_imag & ~fpcr_ah) << 15;
1061     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1062     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1063 
1064     for (i = 0; i < elements; i += eltspersegment) {
1065         float16 mr = m[H2(i + 2 * index + 0)];
1066         float16 mi = m[H2(i + 2 * index + 1)];
1067         float16 e1 = negx_real ^ (flip ? mi : mr);
1068         float16 e3 = negx_imag ^ (flip ? mr : mi);
1069 
1070         for (j = i; j < i + eltspersegment; j += 2) {
1071             float16 e2 = n[H2(j + flip)];
1072             float16 e4 = e2;
1073 
1074             d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], negf_real, fpst);
1075             d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], negf_imag, fpst);
1076         }
1077     }
1078     clear_tail(d, opr_sz, simd_maxsz(desc));
1079 }
1080 
1081 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
1082                          float_status *fpst, uint32_t desc)
1083 {
1084     uintptr_t opr_sz = simd_oprsz(desc);
1085     float32 *d = vd, *n = vn, *m = vm, *a = va;
1086     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1087     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
1088     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1089     uint32_t negf_real = flip ^ negf_imag;
1090     float32 negx_imag, negx_real;
1091     uintptr_t i;
1092 
1093     /* With AH=0, use negx; with AH=1 use negf. */
1094     negx_real = (negf_real & ~fpcr_ah) << 31;
1095     negx_imag = (negf_imag & ~fpcr_ah) << 31;
1096     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1097     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1098 
1099     for (i = 0; i < opr_sz / 4; i += 2) {
1100         float32 e2 = n[H4(i + flip)];
1101         float32 e1 = m[H4(i + flip)] ^ negx_real;
1102         float32 e4 = e2;
1103         float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag;
1104 
1105         d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], negf_real, fpst);
1106         d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], negf_imag, fpst);
1107     }
1108     clear_tail(d, opr_sz, simd_maxsz(desc));
1109 }
1110 
1111 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
1112                              float_status *fpst, uint32_t desc)
1113 {
1114     uintptr_t opr_sz = simd_oprsz(desc);
1115     float32 *d = vd, *n = vn, *m = vm, *a = va;
1116     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1117     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1118     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1119     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
1120     uint32_t negf_real = flip ^ negf_imag;
1121     intptr_t elements = opr_sz / sizeof(float32);
1122     intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
1123     float32 negx_imag, negx_real;
1124     intptr_t i, j;
1125 
1126     /* With AH=0, use negx; with AH=1 use negf. */
1127     negx_real = (negf_real & ~fpcr_ah) << 31;
1128     negx_imag = (negf_imag & ~fpcr_ah) << 31;
1129     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1130     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1131 
1132     for (i = 0; i < elements; i += eltspersegment) {
1133         float32 mr = m[H4(i + 2 * index + 0)];
1134         float32 mi = m[H4(i + 2 * index + 1)];
1135         float32 e1 = negx_real ^ (flip ? mi : mr);
1136         float32 e3 = negx_imag ^ (flip ? mr : mi);
1137 
1138         for (j = i; j < i + eltspersegment; j += 2) {
1139             float32 e2 = n[H4(j + flip)];
1140             float32 e4 = e2;
1141 
1142             d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], negf_real, fpst);
1143             d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], negf_imag, fpst);
1144         }
1145     }
1146     clear_tail(d, opr_sz, simd_maxsz(desc));
1147 }
1148 
1149 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
1150                          float_status *fpst, uint32_t desc)
1151 {
1152     uintptr_t opr_sz = simd_oprsz(desc);
1153     float64 *d = vd, *n = vn, *m = vm, *a = va;
1154     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1155     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
1156     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1157     uint32_t negf_real = flip ^ negf_imag;
1158     float64 negx_real, negx_imag;
1159     uintptr_t i;
1160 
1161     /* With AH=0, use negx; with AH=1 use negf. */
1162     negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
1163     negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
1164     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1165     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1166 
1167     for (i = 0; i < opr_sz / 8; i += 2) {
1168         float64 e2 = n[i + flip];
1169         float64 e1 = m[i + flip] ^ negx_real;
1170         float64 e4 = e2;
1171         float64 e3 = m[i + 1 - flip] ^ negx_imag;
1172 
1173         d[i] = float64_muladd(e2, e1, a[i], negf_real, fpst);
1174         d[i + 1] = float64_muladd(e4, e3, a[i + 1], negf_imag, fpst);
1175     }
1176     clear_tail(d, opr_sz, simd_maxsz(desc));
1177 }
1178 
1179 /*
1180  * Floating point comparisons producing an integer result (all 1s or all 0s).
1181  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1182  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1183  */
1184 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
1185 {
1186     return -float16_eq_quiet(op1, op2, stat);
1187 }
1188 
1189 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
1190 {
1191     return -float32_eq_quiet(op1, op2, stat);
1192 }
1193 
1194 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
1195 {
1196     return -float64_eq_quiet(op1, op2, stat);
1197 }
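/*
 * For instance, float32_ceq() on two equal operands returns
 * (uint32_t)-1 == 0xffffffff, the all-ones lane value Neon expects,
 * while unequal operands or a quiet NaN yield 0.
 */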
1198 
1199 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
1200 {
1201     return -float16_le(op2, op1, stat);
1202 }
1203 
1204 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
1205 {
1206     return -float32_le(op2, op1, stat);
1207 }
1208 
1209 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
1210 {
1211     return -float64_le(op2, op1, stat);
1212 }
1213 
1214 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
1215 {
1216     return -float16_lt(op2, op1, stat);
1217 }
1218 
1219 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
1220 {
1221     return -float32_lt(op2, op1, stat);
1222 }
1223 
1224 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
1225 {
1226     return -float64_lt(op2, op1, stat);
1227 }
1228 
1229 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
1230 {
1231     return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1232 }
1233 
1234 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1235 {
1236     return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1237 }
1238 
1239 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
1240 {
1241     return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1242 }
1243 
1244 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1245 {
1246     return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1247 }
1248 
1249 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1250 {
1251     return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1252 }
1253 
1254 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
1255 {
1256     return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1257 }
1258 
1259 static int16_t vfp_tosszh(float16 x, float_status *fpst)
1260 {
1261     if (float16_is_any_nan(x)) {
1262         float_raise(float_flag_invalid, fpst);
1263         return 0;
1264     }
1265     return float16_to_int16_round_to_zero(x, fpst);
1266 }
1267 
1268 static uint16_t vfp_touszh(float16 x, float_status *fpst)
1269 {
1270     if (float16_is_any_nan(x)) {
1271         float_raise(float_flag_invalid, fpst);
1272         return 0;
1273     }
1274     return float16_to_uint16_round_to_zero(x, fpst);
1275 }
1276 
1277 #define DO_2OP(NAME, FUNC, TYPE) \
1278 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc)  \
1279 {                                                                 \
1280     intptr_t i, oprsz = simd_oprsz(desc);                         \
1281     TYPE *d = vd, *n = vn;                                        \
1282     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1283         d[i] = FUNC(n[i], stat);                                  \
1284     }                                                             \
1285     clear_tail(d, oprsz, simd_maxsz(desc));                       \
1286 }
1287 
1288 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1289 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1290 DO_2OP(gvec_frecpe_rpres_s, helper_recpe_rpres_f32, float32)
1291 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1292 
1293 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1294 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1295 DO_2OP(gvec_frsqrte_rpres_s, helper_rsqrte_rpres_f32, float32)
1296 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1297 
1298 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1299 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1300 
1301 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1302 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1303 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1304 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1305 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1306 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1307 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1308 DO_2OP(gvec_touszh, vfp_touszh, float16)
1309 
1310 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1311     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1312     {                                                           \
1313         return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1314     }
1315 
1316 #define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1317     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)    \
1318     {                                                           \
1319         return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1320     }
1321 
1322 #define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1323     WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1324     WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1325     WRAP_CMP0_##DIRN(FN, CMPOP, float64)                \
1326     DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1327     DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)   \
1328     DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)
1329 
1330 DO_2OP_CMP0(cgt, cgt, FWD)
1331 DO_2OP_CMP0(cge, cge, FWD)
1332 DO_2OP_CMP0(ceq, ceq, FWD)
1333 DO_2OP_CMP0(clt, cgt, REV)
1334 DO_2OP_CMP0(cle, cge, REV)
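/*
 * The REV wrappers reuse the forward comparison with operands swapped,
 * so e.g. float32_clt0(op) expands to float32_cgt(float32_zero, op),
 * which is -float32_lt(op, float32_zero).
 */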
1335 
1336 #undef DO_2OP
1337 #undef DO_2OP_CMP0
1338 
1339 /* Floating-point trigonometric starting value.
1340  * See the ARM ARM pseudocode function FPTrigSMul.
1341  */
1342 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1343 {
1344     float16 result = float16_mul(op1, op1, stat);
1345     if (!float16_is_any_nan(result)) {
1346         result = float16_set_sign(result, op2 & 1);
1347     }
1348     return result;
1349 }
1350 
1351 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1352 {
1353     float32 result = float32_mul(op1, op1, stat);
1354     if (!float32_is_any_nan(result)) {
1355         result = float32_set_sign(result, op2 & 1);
1356     }
1357     return result;
1358 }
1359 
1360 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1361 {
1362     float64 result = float64_mul(op1, op1, stat);
1363     if (!float64_is_any_nan(result)) {
1364         result = float64_set_sign(result, op2 & 1);
1365     }
1366     return result;
1367 }
1368 
1369 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1370 {
1371     return float16_abs(float16_sub(op1, op2, stat));
1372 }
1373 
1374 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1375 {
1376     return float32_abs(float32_sub(op1, op2, stat));
1377 }
1378 
1379 static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
1380 {
1381     return float64_abs(float64_sub(op1, op2, stat));
1382 }
1383 
1384 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
1385 static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat)
1386 {
1387     float16 r = float16_sub(op1, op2, stat);
1388     return float16_is_any_nan(r) ? r : float16_abs(r);
1389 }
1390 
1391 static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat)
1392 {
1393     float32 r = float32_sub(op1, op2, stat);
1394     return float32_is_any_nan(r) ? r : float32_abs(r);
1395 }
1396 
1397 static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat)
1398 {
1399     float64 r = float64_sub(op1, op2, stat);
1400     return float64_is_any_nan(r) ? r : float64_abs(r);
1401 }
1402 
1403 /*
1404  * Reciprocal step. These are the AArch32 versions, which use a
1405  * non-fused multiply-and-subtract.
1406  */
1407 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1408 {
1409     op1 = float16_squash_input_denormal(op1, stat);
1410     op2 = float16_squash_input_denormal(op2, stat);
1411 
1412     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1413         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1414         return float16_two;
1415     }
1416     return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1417 }
1418 
1419 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1420 {
1421     op1 = float32_squash_input_denormal(op1, stat);
1422     op2 = float32_squash_input_denormal(op2, stat);
1423 
1424     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1425         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1426         return float32_two;
1427     }
1428     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1429 }
1430 
1431 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1432 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1433 {
1434     op1 = float16_squash_input_denormal(op1, stat);
1435     op2 = float16_squash_input_denormal(op2, stat);
1436 
1437     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1438         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1439         return float16_one_point_five;
1440     }
1441     op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1442     return float16_div(op1, float16_two, stat);
1443 }
1444 
1445 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1446 {
1447     op1 = float32_squash_input_denormal(op1, stat);
1448     op2 = float32_squash_input_denormal(op2, stat);
1449 
1450     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1451         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1452         return float32_one_point_five;
1453     }
1454     op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1455     return float32_div(op1, float32_two, stat);
1456 }
1457 
1458 #define DO_3OP(NAME, FUNC, TYPE) \
1459 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1460                   float_status *stat, uint32_t desc)                       \
1461 {                                                                          \
1462     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1463     TYPE *d = vd, *n = vn, *m = vm;                                        \
1464     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1465         d[i] = FUNC(n[i], m[i], stat);                                     \
1466     }                                                                      \
1467     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1468 }
1469 
1470 DO_3OP(gvec_fadd_b16, bfloat16_add, float16)
1471 DO_3OP(gvec_fadd_h, float16_add, float16)
1472 DO_3OP(gvec_fadd_s, float32_add, float32)
1473 DO_3OP(gvec_fadd_d, float64_add, float64)
1474 DO_3OP(gvec_bfadd, bfloat16_add, bfloat16)
1475 
1476 DO_3OP(gvec_fsub_b16, bfloat16_sub, float16)
1477 DO_3OP(gvec_fsub_h, float16_sub, float16)
1478 DO_3OP(gvec_fsub_s, float32_sub, float32)
1479 DO_3OP(gvec_fsub_d, float64_sub, float64)
1480 DO_3OP(gvec_bfsub, bfloat16_sub, bfloat16)
1481 
1482 DO_3OP(gvec_fmul_b16, bfloat16_mul, float16)
1483 DO_3OP(gvec_fmul_h, float16_mul, float16)
1484 DO_3OP(gvec_fmul_s, float32_mul, float32)
1485 DO_3OP(gvec_fmul_d, float64_mul, float64)
1486 
1487 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1488 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1489 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1490 
1491 DO_3OP(gvec_fabd_h, float16_abd, float16)
1492 DO_3OP(gvec_fabd_s, float32_abd, float32)
1493 DO_3OP(gvec_fabd_d, float64_abd, float64)
1494 
1495 DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16)
1496 DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32)
1497 DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64)
1498 
1499 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1500 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1501 DO_3OP(gvec_fceq_d, float64_ceq, float64)
1502 
1503 DO_3OP(gvec_fcge_h, float16_cge, float16)
1504 DO_3OP(gvec_fcge_s, float32_cge, float32)
1505 DO_3OP(gvec_fcge_d, float64_cge, float64)
1506 
1507 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1508 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1509 DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1510 
1511 DO_3OP(gvec_facge_h, float16_acge, float16)
1512 DO_3OP(gvec_facge_s, float32_acge, float32)
1513 DO_3OP(gvec_facge_d, float64_acge, float64)
1514 
1515 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1516 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1517 DO_3OP(gvec_facgt_d, float64_acgt, float64)
1518 
1519 DO_3OP(gvec_fmax_h, float16_max, float16)
1520 DO_3OP(gvec_fmax_s, float32_max, float32)
1521 DO_3OP(gvec_fmax_d, float64_max, float64)
1522 
1523 DO_3OP(gvec_fmin_h, float16_min, float16)
1524 DO_3OP(gvec_fmin_s, float32_min, float32)
1525 DO_3OP(gvec_fmin_d, float64_min, float64)
1526 
1527 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1528 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1529 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1530 
1531 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1532 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1533 DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1534 
1535 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1536 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1537 
1538 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1539 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1540 
1541 #ifdef TARGET_AARCH64
1542 DO_3OP(gvec_fdiv_h, float16_div, float16)
1543 DO_3OP(gvec_fdiv_s, float32_div, float32)
1544 DO_3OP(gvec_fdiv_d, float64_div, float64)
1545 
1546 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1547 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1548 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1549 
1550 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1551 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1552 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1553 
1554 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1555 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1556 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1557 
1558 DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16)
1559 DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32)
1560 DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64)
1561 
1562 DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16)
1563 DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32)
1564 DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64)
1565 
1566 DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
1567 DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
1568 DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)
1569 
1570 DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
1571 DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
1572 DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)
1573 
1574 DO_3OP(gvec_fmax_b16, bfloat16_max, bfloat16)
1575 DO_3OP(gvec_fmin_b16, bfloat16_min, bfloat16)
1576 DO_3OP(gvec_fmaxnum_b16, bfloat16_maxnum, bfloat16)
1577 DO_3OP(gvec_fminnum_b16, bfloat16_minnum, bfloat16)
1578 DO_3OP(gvec_ah_fmax_b16, helper_sme2_ah_fmax_b16, bfloat16)
1579 DO_3OP(gvec_ah_fmin_b16, helper_sme2_ah_fmin_b16, bfloat16)
1580 
1581 #endif
1582 #undef DO_3OP
1583 
1584 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1585 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1586                                  float_status *stat)
1587 {
1588     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1589 }
1590 
1591 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1592                                  float_status *stat)
1593 {
1594     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1595 }
1596 
1597 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1598                                  float_status *stat)
1599 {
1600     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1601 }
1602 
1603 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1604                                  float_status *stat)
1605 {
1606     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1607 }
1608 
1609 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1610 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1611                                 float_status *stat)
1612 {
1613     return float16_muladd(op1, op2, dest, 0, stat);
1614 }
1615 
1616 static bfloat16 bfloat16_muladd_f(bfloat16 dest, bfloat16 op1, bfloat16 op2,
1617                                   float_status *stat)
1618 {
1619     return bfloat16_muladd(op1, op2, dest, 0, stat);
1620 }
1621 
1622 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1623                                  float_status *stat)
1624 {
1625     return float32_muladd(op1, op2, dest, 0, stat);
1626 }
1627 
1628 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1629                                  float_status *stat)
1630 {
1631     return float64_muladd(op1, op2, dest, 0, stat);
1632 }
1633 
1634 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1635                                  float_status *stat)
1636 {
1637     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1638 }
1639 
1640 static bfloat16 bfloat16_mulsub_f(bfloat16 dest, bfloat16 op1, bfloat16 op2,
1641                                   float_status *stat)
1642 {
1643     return bfloat16_muladd(bfloat16_chs(op1), op2, dest, 0, stat);
1644 }
1645 
1646 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1647                                  float_status *stat)
1648 {
1649     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1650 }
1651 
1652 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1653                                  float_status *stat)
1654 {
1655     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1656 }
1657 
1658 static float16 float16_ah_mulsub_f(float16 dest, float16 op1, float16 op2,
1659                                  float_status *stat)
1660 {
1661     return float16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1662 }
1663 
1664 static bfloat16 bfloat16_ah_mulsub_f(bfloat16 dest, bfloat16 op1, bfloat16 op2,
1665                                      float_status *stat)
1666 {
1667     return bfloat16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1668 }
1669 
1670 static float32 float32_ah_mulsub_f(float32 dest, float32 op1, float32 op2,
1671                                  float_status *stat)
1672 {
1673     return float32_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1674 }
1675 
1676 static float64 float64_ah_mulsub_f(float64 dest, float64 op1, float64 op2,
1677                                  float_status *stat)
1678 {
1679     return float64_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1680 }
1681 
1682 #define DO_MULADD(NAME, FUNC, TYPE)                                        \
1683 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1684                   float_status *stat, uint32_t desc)                       \
1685 {                                                                          \
1686     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1687     TYPE *d = vd, *n = vn, *m = vm;                                        \
1688     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1689         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1690     }                                                                      \
1691     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1692 }
1693 
1694 DO_MULADD(gvec_fmla_nf_h, float16_muladd_nf, float16)
1695 DO_MULADD(gvec_fmla_nf_s, float32_muladd_nf, float32)
1696 
1697 DO_MULADD(gvec_fmls_nf_h, float16_mulsub_nf, float16)
1698 DO_MULADD(gvec_fmls_nf_s, float32_mulsub_nf, float32)
1699 
1700 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1701 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1702 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1703 DO_MULADD(gvec_bfmla, bfloat16_muladd_f, bfloat16)
1704 
1705 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1706 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1707 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1708 DO_MULADD(gvec_bfmls, bfloat16_mulsub_f, bfloat16)
1709 
1710 DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16)
1711 DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32)
1712 DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64)
1713 DO_MULADD(gvec_ah_bfmls, bfloat16_ah_mulsub_f, bfloat16)
1714 
1715 #undef DO_MULADD
1716 
1717 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1718  * For AdvSIMD, there is of course only one such vector segment.
1719  */
1720 
1721 #define DO_MUL_IDX(NAME, TYPE, H) \
1722 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1723 {                                                                          \
1724     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1725     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1726     intptr_t idx = simd_data(desc);                                        \
1727     TYPE *d = vd, *n = vn, *m = vm;                                        \
1728     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1729         TYPE mm = m[H(i + idx)];                                           \
1730         for (j = 0; j < segment; j++) {                                    \
1731             d[i + j] = n[i + j] * mm;                                      \
1732         }                                                                  \
1733     }                                                                      \
1734     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1735 }
1736 
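/*
 * For example, with a 256-bit SVE vector of 32-bit elements, segment is 4,
 * so the first four results all use lane idx of m and the next four use
 * lane 4 + idx.
 */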
1737 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1738 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1739 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1740 
1741 #undef DO_MUL_IDX
1742 
1743 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1744 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1745 {                                                                          \
1746     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1747     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1748     intptr_t idx = simd_data(desc);                                        \
1749     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1750     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1751         TYPE mm = m[H(i + idx)];                                           \
1752         for (j = 0; j < segment; j++) {                                    \
1753             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1754         }                                                                  \
1755     }                                                                      \
1756     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1757 }
1758 
1759 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1760 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1761 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1762 
1763 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1764 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1765 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1766 
1767 #undef DO_MLA_IDX
1768 
1769 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1770 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1771                   float_status *stat, uint32_t desc)                       \
1772 {                                                                          \
1773     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1774     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1775     intptr_t idx = simd_data(desc);                                        \
1776     TYPE *d = vd, *n = vn, *m = vm;                                        \
1777     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1778         TYPE mm = m[H(i + idx)];                                           \
1779         for (j = 0; j < segment; j++) {                                    \
1780             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1781         }                                                                  \
1782     }                                                                      \
1783     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1784 }
1785 
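/*
 * For the plain multiply-by-index forms below, the ADD slot is a no-op
 * that simply returns the product M.
 */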
1786 #define nop(N, M, S) (M)
1787 
1788 DO_FMUL_IDX(gvec_fmul_idx_b16, nop, bfloat16_mul, float16, H2)
1789 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1790 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1791 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1792 
1793 #ifdef TARGET_AARCH64
1794 
1795 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1796 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1797 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1798 
1799 #endif
1800 
1801 #undef nop
1802 
1803 /*
1804  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1805  * the fused ops below, these accumulate both from and into Vd.
1806  */
1807 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1808 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1809 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1810 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1811 
1812 #undef DO_FMUL_IDX
1813 
1814 #define DO_FMLA_IDX(NAME, TYPE, H, NEGX, NEGF)                             \
1815 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1816                   float_status *stat, uint32_t desc)                       \
1817 {                                                                          \
1818     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1819     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1820     intptr_t idx = simd_data(desc);                                        \
1821     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1822     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1823         TYPE mm = m[H(i + idx)];                                           \
1824         for (j = 0; j < segment; j++) {                                    \
1825             d[i + j] = TYPE##_muladd(n[i + j] ^ NEGX, mm,                  \
1826                                      a[i + j], NEGF, stat);                \
1827         }                                                                  \
1828     }                                                                      \
1829     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1830 }
1831 
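/*
 * In the instantiations below, NEGX is XORed into the first multiplicand:
 * INT16_MIN/INT32_MIN/INT64_MIN flips the sign bit, negating op1 up front
 * for the FMLS forms when FPCR.AH == 0.  The AH == 1 forms instead pass
 * float_muladd_negate_product as NEGF, so the negation happens inside the
 * fused multiply-add and the sign of a NaN input is left untouched.
 */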
1832 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0)
1833 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0)
1834 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0)
1835 DO_FMLA_IDX(gvec_bfmla_idx, bfloat16, H2, 0, 0)
1836 
1837 DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0)
1838 DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0)
1839 DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0)
1840 DO_FMLA_IDX(gvec_bfmls_idx, bfloat16, H2, INT16_MIN, 0)
1841 
1842 DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product)
1843 DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product)
1844 DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product)
1845 DO_FMLA_IDX(gvec_ah_bfmls_idx, bfloat16, H2, 0, float_muladd_negate_product)
1846 
1847 #undef DO_FMLA_IDX
1848 
1849 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1850 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1851 {                                                                          \
1852     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1853     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1854     bool q = false;                                                        \
1855     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1856         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1857         if (dd < MIN) {                                                    \
1858             dd = MIN;                                                      \
1859             q = true;                                                      \
1860         } else if (dd > MAX) {                                             \
1861             dd = MAX;                                                      \
1862             q = true;                                                      \
1863         }                                                                  \
1864         d[i] = dd;                                                         \
1865     }                                                                      \
1866     if (q) {                                                               \
1867         uint32_t *qc = vq;                                                 \
1868         qc[0] = 1;                                                         \
1869     }                                                                      \
1870     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1871 }
1872 
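/*
 * WTYPE is chosen wide enough to hold any unclamped result: plain int for
 * the 8- and 16-bit forms and int64_t for the 32-bit forms, so OP itself
 * never overflows before the comparison against MIN/MAX.
 */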
1873 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1874 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1875 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1876 
1877 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1878 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1879 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1880 
1881 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1882 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1883 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1884 
1885 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1886 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1887 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1888 
1889 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1890 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1891 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1892 
1893 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1894 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1895 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1896 
1897 #undef DO_SAT
1898 
1899 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1900                           void *vm, uint32_t desc)
1901 {
1902     intptr_t i, oprsz = simd_oprsz(desc);
1903     uint64_t *d = vd, *n = vn, *m = vm;
1904     bool q = false;
1905 
1906     for (i = 0; i < oprsz / 8; i++) {
1907         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1908         if (dd < nn) {
1909             dd = UINT64_MAX;
1910             q = true;
1911         }
1912         d[i] = dd;
1913     }
1914     if (q) {
1915         uint32_t *qc = vq;
1916         qc[0] = 1;
1917     }
1918     clear_tail(d, oprsz, simd_maxsz(desc));
1919 }
1920 
1921 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1922                           void *vm, uint32_t desc)
1923 {
1924     intptr_t i, oprsz = simd_oprsz(desc);
1925     uint64_t *d = vd, *n = vn, *m = vm;
1926     bool q = false;
1927 
1928     for (i = 0; i < oprsz / 8; i++) {
1929         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1930         if (nn < mm) {
1931             dd = 0;
1932             q = true;
1933         }
1934         d[i] = dd;
1935     }
1936     if (q) {
1937         uint32_t *qc = vq;
1938         qc[0] = 1;
1939     }
1940     clear_tail(d, oprsz, simd_maxsz(desc));
1941 }
1942 
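/*
 * In the signed 64-bit helpers below, overflow of nn +/- mm is detected
 * from the sign bits: for addition the result overflowed iff the operands
 * have the same sign and the sum's sign differs, hence
 * ((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN; subtraction uses (nn ^ mm) instead.
 * On overflow, (nn >> 63) ^ ~INT64_MIN saturates to INT64_MAX for
 * non-negative nn and to INT64_MIN otherwise, without a branch.
 */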
1943 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1944                           void *vm, uint32_t desc)
1945 {
1946     intptr_t i, oprsz = simd_oprsz(desc);
1947     int64_t *d = vd, *n = vn, *m = vm;
1948     bool q = false;
1949 
1950     for (i = 0; i < oprsz / 8; i++) {
1951         int64_t nn = n[i], mm = m[i], dd = nn + mm;
1952         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1953             dd = (nn >> 63) ^ ~INT64_MIN;
1954             q = true;
1955         }
1956         d[i] = dd;
1957     }
1958     if (q) {
1959         uint32_t *qc = vq;
1960         qc[0] = 1;
1961     }
1962     clear_tail(d, oprsz, simd_maxsz(desc));
1963 }
1964 
1965 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1966                           void *vm, uint32_t desc)
1967 {
1968     intptr_t i, oprsz = simd_oprsz(desc);
1969     int64_t *d = vd, *n = vn, *m = vm;
1970     bool q = false;
1971 
1972     for (i = 0; i < oprsz / 8; i++) {
1973         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1974         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1975             dd = (nn >> 63) ^ ~INT64_MIN;
1976             q = true;
1977         }
1978         d[i] = dd;
1979     }
1980     if (q) {
1981         uint32_t *qc = vq;
1982         qc[0] = 1;
1983     }
1984     clear_tail(d, oprsz, simd_maxsz(desc));
1985 }
1986 
1987 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1988                            void *vm, uint32_t desc)
1989 {
1990     intptr_t i, oprsz = simd_oprsz(desc);
1991     uint64_t *d = vd, *n = vn, *m = vm;
1992     bool q = false;
1993 
1994     for (i = 0; i < oprsz / 8; i++) {
1995         uint64_t nn = n[i];
1996         int64_t mm = m[i];
1997         uint64_t dd = nn + mm;
1998 
1999         if (mm < 0) {
2000             if (nn < (uint64_t)-mm) {
2001                 dd = 0;
2002                 q = true;
2003             }
2004         } else {
2005             if (dd < nn) {
2006                 dd = UINT64_MAX;
2007                 q = true;
2008             }
2009         }
2010         d[i] = dd;
2011     }
2012     if (q) {
2013         uint32_t *qc = vq;
2014         qc[0] = 1;
2015     }
2016     clear_tail(d, oprsz, simd_maxsz(desc));
2017 }
2018 
2019 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
2020                            void *vm, uint32_t desc)
2021 {
2022     intptr_t i, oprsz = simd_oprsz(desc);
2023     uint64_t *d = vd, *n = vn, *m = vm;
2024     bool q = false;
2025 
2026     for (i = 0; i < oprsz / 8; i++) {
2027         int64_t nn = n[i];
2028         uint64_t mm = m[i];
2029         int64_t dd = nn + mm;
2030 
2031         if (mm > (uint64_t)(INT64_MAX - nn)) {
2032             dd = INT64_MAX;
2033             q = true;
2034         }
2035         d[i] = dd;
2036     }
2037     if (q) {
2038         uint32_t *qc = vq;
2039         qc[0] = 1;
2040     }
2041     clear_tail(d, oprsz, simd_maxsz(desc));
2042 }
2043 
2044 #define DO_SRA(NAME, TYPE)                              \
2045 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2046 {                                                       \
2047     intptr_t i, oprsz = simd_oprsz(desc);               \
2048     int shift = simd_data(desc);                        \
2049     TYPE *d = vd, *n = vn;                              \
2050     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2051         d[i] += n[i] >> shift;                          \
2052     }                                                   \
2053     clear_tail(d, oprsz, simd_maxsz(desc));             \
2054 }
2055 
2056 DO_SRA(gvec_ssra_b, int8_t)
2057 DO_SRA(gvec_ssra_h, int16_t)
2058 DO_SRA(gvec_ssra_s, int32_t)
2059 DO_SRA(gvec_ssra_d, int64_t)
2060 
2061 DO_SRA(gvec_usra_b, uint8_t)
2062 DO_SRA(gvec_usra_h, uint16_t)
2063 DO_SRA(gvec_usra_s, uint32_t)
2064 DO_SRA(gvec_usra_d, uint64_t)
2065 
2066 #undef DO_SRA
2067 
2068 #define DO_RSHR(NAME, TYPE)                             \
2069 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2070 {                                                       \
2071     intptr_t i, oprsz = simd_oprsz(desc);               \
2072     int shift = simd_data(desc);                        \
2073     TYPE *d = vd, *n = vn;                              \
2074     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2075         TYPE tmp = n[i] >> (shift - 1);                 \
2076         d[i] = (tmp >> 1) + (tmp & 1);                  \
2077     }                                                   \
2078     clear_tail(d, oprsz, simd_maxsz(desc));             \
2079 }
2080 
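/*
 * The rounding shift above computes (n + (1 << (shift - 1))) >> shift
 * without risking overflow from adding the rounding bias first.  For
 * example, n = 7 and shift = 2 give tmp = 3, so (3 >> 1) + (3 & 1) = 2,
 * the round-to-nearest result of 7 / 4.
 */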
2081 DO_RSHR(gvec_srshr_b, int8_t)
2082 DO_RSHR(gvec_srshr_h, int16_t)
2083 DO_RSHR(gvec_srshr_s, int32_t)
2084 DO_RSHR(gvec_srshr_d, int64_t)
2085 
2086 DO_RSHR(gvec_urshr_b, uint8_t)
2087 DO_RSHR(gvec_urshr_h, uint16_t)
2088 DO_RSHR(gvec_urshr_s, uint32_t)
2089 DO_RSHR(gvec_urshr_d, uint64_t)
2090 
2091 #undef DO_RSHR
2092 
2093 #define DO_RSRA(NAME, TYPE)                             \
2094 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2095 {                                                       \
2096     intptr_t i, oprsz = simd_oprsz(desc);               \
2097     int shift = simd_data(desc);                        \
2098     TYPE *d = vd, *n = vn;                              \
2099     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2100         TYPE tmp = n[i] >> (shift - 1);                 \
2101         d[i] += (tmp >> 1) + (tmp & 1);                 \
2102     }                                                   \
2103     clear_tail(d, oprsz, simd_maxsz(desc));             \
2104 }
2105 
2106 DO_RSRA(gvec_srsra_b, int8_t)
2107 DO_RSRA(gvec_srsra_h, int16_t)
2108 DO_RSRA(gvec_srsra_s, int32_t)
2109 DO_RSRA(gvec_srsra_d, int64_t)
2110 
2111 DO_RSRA(gvec_ursra_b, uint8_t)
2112 DO_RSRA(gvec_ursra_h, uint16_t)
2113 DO_RSRA(gvec_ursra_s, uint32_t)
2114 DO_RSRA(gvec_ursra_d, uint64_t)
2115 
2116 #undef DO_RSRA
2117 
2118 #define DO_SRI(NAME, TYPE)                              \
2119 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2120 {                                                       \
2121     intptr_t i, oprsz = simd_oprsz(desc);               \
2122     int shift = simd_data(desc);                        \
2123     TYPE *d = vd, *n = vn;                              \
2124     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2125         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
2126     }                                                   \
2127     clear_tail(d, oprsz, simd_maxsz(desc));             \
2128 }
2129 
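/*
 * SRI replaces the low (esize - shift) bits of each destination element
 * with n >> shift, preserving the top shift bits of d.  SLI below is the
 * mirror image: it keeps the low shift bits of d and inserts n << shift
 * above them.
 */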
2130 DO_SRI(gvec_sri_b, uint8_t)
2131 DO_SRI(gvec_sri_h, uint16_t)
2132 DO_SRI(gvec_sri_s, uint32_t)
2133 DO_SRI(gvec_sri_d, uint64_t)
2134 
2135 #undef DO_SRI
2136 
2137 #define DO_SLI(NAME, TYPE)                              \
2138 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2139 {                                                       \
2140     intptr_t i, oprsz = simd_oprsz(desc);               \
2141     int shift = simd_data(desc);                        \
2142     TYPE *d = vd, *n = vn;                              \
2143     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2144         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
2145     }                                                   \
2146     clear_tail(d, oprsz, simd_maxsz(desc));             \
2147 }
2148 
2149 DO_SLI(gvec_sli_b, uint8_t)
2150 DO_SLI(gvec_sli_h, uint16_t)
2151 DO_SLI(gvec_sli_s, uint32_t)
2152 DO_SLI(gvec_sli_d, uint64_t)
2153 
2154 #undef DO_SLI
2155 
2156 /*
2157  * Convert float16 to float32, raising no exceptions and
2158  * preserving exceptional values, including SNaN.
2159  * This is effectively an unpack+repack operation.
2160  */
2161 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
2162 {
2163     const int f16_bias = 15;
2164     const int f32_bias = 127;
2165     uint32_t sign = extract32(f16, 15, 1);
2166     uint32_t exp = extract32(f16, 10, 5);
2167     uint32_t frac = extract32(f16, 0, 10);
2168 
2169     if (exp == 0x1f) {
2170         /* Inf or NaN */
2171         exp = 0xff;
2172     } else if (exp == 0) {
2173         /* Zero or denormal.  */
2174         if (frac != 0) {
2175             if (fz16) {
2176                 frac = 0;
2177             } else {
2178                 /*
2179                  * Denormal; these are all normal float32.
2180                  * Shift the fraction so that the msb is at bit 11,
2181                  * then remove bit 11 as the implicit bit of the
2182                  * normalized float32.  Note that we still go through
2183                  * the shift for normal numbers below, to put the
2184                  * float32 fraction at the right place.
2185                  */
2186                 int shift = clz32(frac) - 21;
2187                 frac = (frac << shift) & 0x3ff;
2188                 exp = f32_bias - f16_bias - shift + 1;
2189             }
2190         }
2191     } else {
2192         /* Normal number; adjust the bias.  */
2193         exp += f32_bias - f16_bias;
2194     }
2195     sign <<= 31;
2196     exp <<= 23;
2197     frac <<= 23 - 10;
2198 
2199     return sign | exp | frac;
2200 }
2201 
2202 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2203 {
2204     /*
2205      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2206      * Load the 2nd qword iff is_q & is_2.
2207      * Shift to the 2nd dword iff !is_q & is_2.
2208      * For !is_q & !is_2, the upper bits of the result are garbage.
2209      */
2210     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2211 }
2212 
2213 /*
2214  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2215  * as there are no SVE versions yet that might use blocking.
2216  */
2217 
2218 static void do_fmlal(float32 *d, void *vn, void *vm,
2219                      CPUARMState *env, uint32_t desc,
2220                      ARMFPStatusFlavour fpst_idx,
2221                      uint64_t negx, int negf)
2222 {
2223     float_status *fpst = &env->vfp.fp_status[fpst_idx];
2224     bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2225     intptr_t i, oprsz = simd_oprsz(desc);
2226     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2227     int is_q = oprsz == 16;
2228     uint64_t n_4, m_4;
2229 
2230     /*
2231      * Pre-load all of the f16 data, avoiding overlap issues.
2232      * Negate all inputs for AH=0 FMLSL at once.
2233      */
2234     n_4 = load4_f16(vn, is_q, is_2) ^ negx;
2235     m_4 = load4_f16(vm, is_q, is_2);
2236 
2237     for (i = 0; i < oprsz / 4; i++) {
2238         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2239         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2240         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
2241     }
2242     clear_tail(d, oprsz, simd_maxsz(desc));
2243 }
2244 
2245 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2246                             CPUARMState *env, uint32_t desc)
2247 {
2248     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2249     uint64_t negx = is_s ? 0x8000800080008000ull : 0;
2250 
2251     do_fmlal(vd, vn, vm, env, desc, FPST_STD, negx, 0);
2252 }
2253 
2254 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2255                             CPUARMState *env, uint32_t desc)
2256 {
2257     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2258     uint64_t negx = 0;
2259     int negf = 0;
2260 
2261     if (is_s) {
2262         if (env->vfp.fpcr & FPCR_AH) {
2263             negf = float_muladd_negate_product;
2264         } else {
2265             negx = 0x8000800080008000ull;
2266         }
2267     }
2268     do_fmlal(vd, vn, vm, env, desc, FPST_A64, negx, negf);
2269 }
2270 
2271 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2272                                CPUARMState *env, uint32_t desc)
2273 {
2274     intptr_t i, oprsz = simd_oprsz(desc);
2275     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2276     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2277     bool za = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
2278     float_status *status = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64];
2279     bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2280     int negx = 0, negf = 0;
2281 
2282     if (is_s) {
2283         if (env->vfp.fpcr & FPCR_AH) {
2284             negf = float_muladd_negate_product;
2285         } else {
2286             negx = 0x8000;
2287         }
2288     }
2289 
2290     for (i = 0; i < oprsz; i += sizeof(float32)) {
2291         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negx;
2292         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2293         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2294         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2295         float32 aa = *(float32 *)(va + H1_4(i));
2296 
2297         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, negf, status);
2298     }
2299 }
2300 
2301 static void do_fmlal_idx(float32 *d, void *vn, void *vm,
2302                          CPUARMState *env, uint32_t desc,
2303                          ARMFPStatusFlavour fpst_idx,
2304                          uint64_t negx, int negf)
2305 {
2306     float_status *fpst = &env->vfp.fp_status[fpst_idx];
2307     bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2308     intptr_t i, oprsz = simd_oprsz(desc);
2309     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2310     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2311     int is_q = oprsz == 16;
2312     uint64_t n_4;
2313     float32 m_1;
2314 
2315     /*
2316      * Pre-load all of the f16 data, avoiding overlap issues.
2317      * Negate all inputs for AH=0 FMLSL at once.
2318      */
2319     n_4 = load4_f16(vn, is_q, is_2) ^ negx;
2320     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2321 
2322     for (i = 0; i < oprsz / 4; i++) {
2323         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2324         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
2325     }
2326     clear_tail(d, oprsz, simd_maxsz(desc));
2327 }
2328 
2329 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2330                                 CPUARMState *env, uint32_t desc)
2331 {
2332     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2333     uint64_t negx = is_s ? 0x8000800080008000ull : 0;
2334 
2335     do_fmlal_idx(vd, vn, vm, env, desc, FPST_STD, negx, 0);
2336 }
2337 
2338 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2339                                 CPUARMState *env, uint32_t desc)
2340 {
2341     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2342     uint64_t negx = 0;
2343     int negf = 0;
2344 
2345     if (is_s) {
2346         if (env->vfp.fpcr & FPCR_AH) {
2347             negf = float_muladd_negate_product;
2348         } else {
2349             negx = 0x8000800080008000ull;
2350         }
2351     }
2352     do_fmlal_idx(vd, vn, vm, env, desc, FPST_A64, negx, negf);
2353 }
2354 
2355 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2356                                CPUARMState *env, uint32_t desc)
2357 {
2358     intptr_t i, j, oprsz = simd_oprsz(desc);
2359     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2360     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2361     bool za = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
2362     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 3, 3) * sizeof(float16);
2363     float_status *status = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64];
2364     bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2365     int negx = 0, negf = 0;
2366 
2367     if (is_s) {
2368         if (env->vfp.fpcr & FPCR_AH) {
2369             negf = float_muladd_negate_product;
2370         } else {
2371             negx = 0x8000;
2372         }
2373     }
2374     for (i = 0; i < oprsz; i += 16) {
2375         float16 mm_16 = *(float16 *)(vm + i + idx);
2376         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2377 
2378         for (j = 0; j < 16; j += sizeof(float32)) {
2379             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negx;
2380             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2381             float32 aa = *(float32 *)(va + H1_4(i + j));
2382 
2383             *(float32 *)(vd + H1_4(i + j)) =
2384                 float32_muladd(nn, mm, aa, negf, status);
2385         }
2386     }
2387 }
2388 
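/*
 * Variable shifts: a negative count in m selects a right shift by -m.
 * Shifting left by the element width or more yields zero; for the signed
 * right shifts the count is clamped to esize - 1 so the result is still
 * filled with the sign bit, while the unsigned forms yield zero.
 */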
2389 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2390 {
2391     intptr_t i, opr_sz = simd_oprsz(desc);
2392     int8_t *d = vd, *n = vn, *m = vm;
2393 
2394     for (i = 0; i < opr_sz; ++i) {
2395         int8_t mm = m[i];
2396         int8_t nn = n[i];
2397         int8_t res = 0;
2398         if (mm >= 0) {
2399             if (mm < 8) {
2400                 res = nn << mm;
2401             }
2402         } else {
2403             res = nn >> (mm > -8 ? -mm : 7);
2404         }
2405         d[i] = res;
2406     }
2407     clear_tail(d, opr_sz, simd_maxsz(desc));
2408 }
2409 
2410 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2411 {
2412     intptr_t i, opr_sz = simd_oprsz(desc);
2413     int16_t *d = vd, *n = vn, *m = vm;
2414 
2415     for (i = 0; i < opr_sz / 2; ++i) {
2416         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2417         int16_t nn = n[i];
2418         int16_t res = 0;
2419         if (mm >= 0) {
2420             if (mm < 16) {
2421                 res = nn << mm;
2422             }
2423         } else {
2424             res = nn >> (mm > -16 ? -mm : 15);
2425         }
2426         d[i] = res;
2427     }
2428     clear_tail(d, opr_sz, simd_maxsz(desc));
2429 }
2430 
2431 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2432 {
2433     intptr_t i, opr_sz = simd_oprsz(desc);
2434     uint8_t *d = vd, *n = vn, *m = vm;
2435 
2436     for (i = 0; i < opr_sz; ++i) {
2437         int8_t mm = m[i];
2438         uint8_t nn = n[i];
2439         uint8_t res = 0;
2440         if (mm >= 0) {
2441             if (mm < 8) {
2442                 res = nn << mm;
2443             }
2444         } else {
2445             if (mm > -8) {
2446                 res = nn >> -mm;
2447             }
2448         }
2449         d[i] = res;
2450     }
2451     clear_tail(d, opr_sz, simd_maxsz(desc));
2452 }
2453 
2454 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2455 {
2456     intptr_t i, opr_sz = simd_oprsz(desc);
2457     uint16_t *d = vd, *n = vn, *m = vm;
2458 
2459     for (i = 0; i < opr_sz / 2; ++i) {
2460         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2461         uint16_t nn = n[i];
2462         uint16_t res = 0;
2463         if (mm >= 0) {
2464             if (mm < 16) {
2465                 res = nn << mm;
2466             }
2467         } else {
2468             if (mm > -16) {
2469                 res = nn >> -mm;
2470             }
2471         }
2472         d[i] = res;
2473     }
2474     clear_tail(d, opr_sz, simd_maxsz(desc));
2475 }
2476 
2477 /*
2478  * 8x8->8 polynomial multiply.
2479  *
2480  * Polynomial multiplication is like integer multiplication except the
2481  * partial products are XORed, not added.
2482  *
2483  * TODO: expose this as a generic vector operation, as it is a common
2484  * crypto building block.
2485  */
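/*
 * For example, the 8-bit polynomial product of 0x03 and 0x03 is 0x05:
 * (x + 1) * (x + 1) = x^2 + 2x + 1, and the middle term cancels under XOR.
 */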
2486 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2487 {
2488     intptr_t i, opr_sz = simd_oprsz(desc);
2489     uint64_t *d = vd, *n = vn, *m = vm;
2490 
2491     for (i = 0; i < opr_sz / 8; ++i) {
2492         d[i] = clmul_8x8_low(n[i], m[i]);
2493     }
2494     clear_tail(d, opr_sz, simd_maxsz(desc));
2495 }
2496 
2497 /*
2498  * 64x64->128 polynomial multiply.
2499  * Because the lanes are not accessed in strict columns,
2500  * this probably cannot be turned into a generic helper.
2501  */
2502 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2503 {
2504     intptr_t i, opr_sz = simd_oprsz(desc);
2505     intptr_t hi = simd_data(desc);
2506     uint64_t *d = vd, *n = vn, *m = vm;
2507 
2508     for (i = 0; i < opr_sz / 8; i += 2) {
2509         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2510         d[i] = int128_getlo(r);
2511         d[i + 1] = int128_gethi(r);
2512     }
2513     clear_tail(d, opr_sz, simd_maxsz(desc));
2514 }
2515 
2516 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2517 {
2518     int hi = simd_data(desc);
2519     uint64_t *d = vd, *n = vn, *m = vm;
2520     uint64_t nn = n[hi], mm = m[hi];
2521 
2522     d[0] = clmul_8x4_packed(nn, mm);
2523     nn >>= 32;
2524     mm >>= 32;
2525     d[1] = clmul_8x4_packed(nn, mm);
2526 
2527     clear_tail(d, 16, simd_maxsz(desc));
2528 }
2529 
2530 #ifdef TARGET_AARCH64
2531 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2532 {
2533     int shift = simd_data(desc) * 8;
2534     intptr_t i, opr_sz = simd_oprsz(desc);
2535     uint64_t *d = vd, *n = vn, *m = vm;
2536 
2537     for (i = 0; i < opr_sz / 8; ++i) {
2538         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2539     }
2540 }
2541 
2542 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2543 {
2544     intptr_t sel = H4(simd_data(desc));
2545     intptr_t i, opr_sz = simd_oprsz(desc);
2546     uint32_t *n = vn, *m = vm;
2547     uint64_t *d = vd;
2548 
2549     for (i = 0; i < opr_sz / 8; ++i) {
2550         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2551     }
2552 }
2553 #endif
2554 
2555 #define DO_CMP0(NAME, TYPE, OP)                         \
2556 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2557 {                                                       \
2558     intptr_t i, opr_sz = simd_oprsz(desc);              \
2559     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2560         TYPE nn = *(TYPE *)(vn + i);                    \
2561         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2562     }                                                   \
2563     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2564 }
2565 
2566 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2567 DO_CMP0(gvec_clt0_b, int8_t, <)
2568 DO_CMP0(gvec_cle0_b, int8_t, <=)
2569 DO_CMP0(gvec_cgt0_b, int8_t, >)
2570 DO_CMP0(gvec_cge0_b, int8_t, >=)
2571 
2572 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2573 DO_CMP0(gvec_clt0_h, int16_t, <)
2574 DO_CMP0(gvec_cle0_h, int16_t, <=)
2575 DO_CMP0(gvec_cgt0_h, int16_t, >)
2576 DO_CMP0(gvec_cge0_h, int16_t, >=)
2577 
2578 #undef DO_CMP0
2579 
2580 #define DO_ABD(NAME, TYPE)                                      \
2581 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2582 {                                                               \
2583     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2584     TYPE *d = vd, *n = vn, *m = vm;                             \
2585                                                                 \
2586     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2587         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2588     }                                                           \
2589     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2590 }
2591 
2592 DO_ABD(gvec_sabd_b, int8_t)
2593 DO_ABD(gvec_sabd_h, int16_t)
2594 DO_ABD(gvec_sabd_s, int32_t)
2595 DO_ABD(gvec_sabd_d, int64_t)
2596 
2597 DO_ABD(gvec_uabd_b, uint8_t)
2598 DO_ABD(gvec_uabd_h, uint16_t)
2599 DO_ABD(gvec_uabd_s, uint32_t)
2600 DO_ABD(gvec_uabd_d, uint64_t)
2601 
2602 #undef DO_ABD
2603 
2604 #define DO_ABA(NAME, TYPE)                                      \
2605 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2606 {                                                               \
2607     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2608     TYPE *d = vd, *n = vn, *m = vm;                             \
2609                                                                 \
2610     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2611         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2612     }                                                           \
2613     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2614 }
2615 
2616 DO_ABA(gvec_saba_b, int8_t)
2617 DO_ABA(gvec_saba_h, int16_t)
2618 DO_ABA(gvec_saba_s, int32_t)
2619 DO_ABA(gvec_saba_d, int64_t)
2620 
2621 DO_ABA(gvec_uaba_b, uint8_t)
2622 DO_ABA(gvec_uaba_h, uint16_t)
2623 DO_ABA(gvec_uaba_s, uint32_t)
2624 DO_ABA(gvec_uaba_d, uint64_t)
2625 
2626 #undef DO_ABA
2627 
2628 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2629 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
2630                   float_status *stat, uint32_t desc)                       \
2631 {                                                                          \
2632     ARMVectorReg scratch;                                                  \
2633     intptr_t oprsz = simd_oprsz(desc);                                     \
2634     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2635     TYPE *d = vd, *n = vn, *m = vm;                                        \
2636     if (unlikely(d == m)) {                                                \
2637         m = memcpy(&scratch, m, oprsz);                                    \
2638     }                                                                      \
2639     for (intptr_t i = 0; i < half; ++i) {                                  \
2640         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2641     }                                                                      \
2642     for (intptr_t i = 0; i < half; ++i) {                                  \
2643         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2644     }                                                                      \
2645     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2646 }
2647 
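/*
 * Only the d == m overlap needs the scratch copy: the first loop reads
 * the n pair for index i before writing d[i], and later iterations only
 * read n lanes beyond i, so writes through d never clobber n elements
 * still to be read; they would, however, overwrite the low half of m
 * before the second loop consumes it.
 */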
2648 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2649 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2650 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2651 
2652 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2653 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2654 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2655 
2656 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2657 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2658 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2659 
2660 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2661 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2662 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2663 
2664 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2665 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2666 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2667 
2668 #ifdef TARGET_AARCH64
2669 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2)
2670 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4)
2671 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, )
2672 
2673 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2)
2674 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4)
2675 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, )
2676 #endif
2677 
2678 #undef DO_3OP_PAIR
2679 
2680 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2681 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2682 {                                                               \
2683     ARMVectorReg scratch;                                       \
2684     intptr_t oprsz = simd_oprsz(desc);                          \
2685     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2686     TYPE *d = vd, *n = vn, *m = vm;                             \
2687     if (unlikely(d == m)) {                                     \
2688         m = memcpy(&scratch, m, oprsz);                         \
2689     }                                                           \
2690     for (intptr_t i = 0; i < half; ++i) {                       \
2691         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2692     }                                                           \
2693     for (intptr_t i = 0; i < half; ++i) {                       \
2694         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2695     }                                                           \
2696     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2697 }
2698 
2699 #define ADD(A, B) (A + B)
2700 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2701 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2702 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2703 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2704 #undef  ADD
2705 
2706 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2707 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2708 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2709 
2710 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2711 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2712 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2713 
2714 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2715 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2716 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2717 
2718 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2719 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2720 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2721 
2722 #undef DO_3OP_PAIR
2723 
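/*
 * Fixed-point <-> floating-point conversions.  The number of
 * fractional bits is taken from simd_data(desc) and passed as the
 * shift argument to the scalar VFP conversion helper for each element.
 */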
2724 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2725     void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
2726     {                                                                   \
2727         intptr_t i, oprsz = simd_oprsz(desc);                           \
2728         int shift = simd_data(desc);                                    \
2729         TYPE *d = vd, *n = vn;                                          \
2730         float_status *fpst = stat;                                      \
2731         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2732             d[i] = FUNC(n[i], shift, fpst);                             \
2733         }                                                               \
2734         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2735     }
2736 
2737 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
2738 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
2739 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2740 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2741 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2742 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2743 
2744 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t)
2745 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t)
2746 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2747 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
2748 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2749 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2750 
2751 #undef DO_VCVT_FIXED
2752 
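/*
 * Float to integer conversions using an explicit rounding mode.
 * The rounding mode comes from simd_data(desc); the scalar helper is
 * called with a zero shift, and the previous rounding mode of the
 * float_status is saved and restored around the loop.
 */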
2753 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2754     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2755     {                                                                   \
2756         intptr_t i, oprsz = simd_oprsz(desc);                           \
2757         uint32_t rmode = simd_data(desc);                               \
2758         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2759         TYPE *d = vd, *n = vn;                                          \
2760         set_float_rounding_mode(rmode, fpst);                           \
2761         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2762             d[i] = FUNC(n[i], 0, fpst);                                 \
2763         }                                                               \
2764         set_float_rounding_mode(prev_rmode, fpst);                      \
2765         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2766     }
2767 
2768 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t)
2769 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t)
2770 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2771 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2772 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2773 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2774 
2775 #undef DO_VCVT_RMODE
2776 
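/*
 * Round to integral-valued floating point, using the rounding mode
 * from simd_data(desc) and again saving and restoring the previous
 * mode of the float_status.
 */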
2777 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2778     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2779     {                                                                   \
2780         intptr_t i, oprsz = simd_oprsz(desc);                           \
2781         uint32_t rmode = simd_data(desc);                               \
2782         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2783         TYPE *d = vd, *n = vn;                                          \
2784         set_float_rounding_mode(rmode, fpst);                           \
2785         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2786             d[i] = FUNC(n[i], fpst);                                    \
2787         }                                                               \
2788         set_float_rounding_mode(prev_rmode, fpst);                      \
2789         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2790     }
2791 
2792 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2793 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2794 
2795 #undef DO_VRINT_RMODE
2796 
2797 #ifdef TARGET_AARCH64
2798 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc)
2799 {
2800     const uint8_t *indices = vm;
2801     size_t oprsz = simd_oprsz(desc);
2802     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2803     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2804     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2805     union {
2806         uint8_t b[16];
2807         uint64_t d[2];
2808     } result;
2809 
2810     /*
2811      * We must construct the final result in a temp, lest the output
2812      * overlap the input table.  For TBL, begin with zero; for TBX,
2813      * begin with the original register contents.  Note that we always
2814      * copy 16 bytes here to avoid an extra branch; clearing the high
2815      * bits of the register for oprsz == 8 is handled below.
2816      */
2817     if (is_tbx) {
2818         memcpy(&result, vd, 16);
2819     } else {
2820         memset(&result, 0, 16);
2821     }
2822 
2823     for (size_t i = 0; i < oprsz; ++i) {
2824         uint32_t index = indices[H1(i)];
2825 
2826         if (index < table_len) {
2827             /*
2828              * Convert index (a byte offset into the virtual table
2829              * which is a series of 128-bit vectors concatenated)
2830              * into the correct register element, bearing in mind
2831              * that the table can wrap around from V31 to V0.
2832              */
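            /*
             * For example, with rn == 30 and a three-register table
             * (table_len == 48), index 0x23 selects byte 3 of register
             * V((30 + 2) % 32) == V0, illustrating the V31 -> V0 wrap.
             */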
2833             const uint8_t *table = (const uint8_t *)
2834                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2835             result.b[H1(i)] = table[H1(index % 16)];
2836         }
2837     }
2838 
2839     memcpy(vd, &result, 16);
2840     clear_tail(vd, oprsz, simd_maxsz(desc));
2841 }
2842 #endif
2843 
2844 /*
2845  * NxN -> N highpart multiply
2846  *
2847  * TODO: expose this as a generic vector operation.
2848  */
2849 
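/*
 * E.g. for the signed byte case: n = -128, m = 2 gives the widened
 * product -256 (0xff00), whose high byte 0xff (-1) is the result.
 */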
2850 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2851 {
2852     intptr_t i, opr_sz = simd_oprsz(desc);
2853     int8_t *d = vd, *n = vn, *m = vm;
2854 
2855     for (i = 0; i < opr_sz; ++i) {
2856         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2857     }
2858     clear_tail(d, opr_sz, simd_maxsz(desc));
2859 }
2860 
2861 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2862 {
2863     intptr_t i, opr_sz = simd_oprsz(desc);
2864     int16_t *d = vd, *n = vn, *m = vm;
2865 
2866     for (i = 0; i < opr_sz / 2; ++i) {
2867         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2868     }
2869     clear_tail(d, opr_sz, simd_maxsz(desc));
2870 }
2871 
2872 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2873 {
2874     intptr_t i, opr_sz = simd_oprsz(desc);
2875     int32_t *d = vd, *n = vn, *m = vm;
2876 
2877     for (i = 0; i < opr_sz / 4; ++i) {
2878         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2879     }
2880     clear_tail(d, opr_sz, simd_maxsz(desc));
2881 }
2882 
2883 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2884 {
2885     intptr_t i, opr_sz = simd_oprsz(desc);
2886     uint64_t *d = vd, *n = vn, *m = vm;
2887     uint64_t discard;
2888 
2889     for (i = 0; i < opr_sz / 8; ++i) {
2890         muls64(&discard, &d[i], n[i], m[i]);
2891     }
2892     clear_tail(d, opr_sz, simd_maxsz(desc));
2893 }
2894 
2895 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2896 {
2897     intptr_t i, opr_sz = simd_oprsz(desc);
2898     uint8_t *d = vd, *n = vn, *m = vm;
2899 
2900     for (i = 0; i < opr_sz; ++i) {
2901         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2902     }
2903     clear_tail(d, opr_sz, simd_maxsz(desc));
2904 }
2905 
2906 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2907 {
2908     intptr_t i, opr_sz = simd_oprsz(desc);
2909     uint16_t *d = vd, *n = vn, *m = vm;
2910 
2911     for (i = 0; i < opr_sz / 2; ++i) {
2912         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2913     }
2914     clear_tail(d, opr_sz, simd_maxsz(desc));
2915 }
2916 
2917 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2918 {
2919     intptr_t i, opr_sz = simd_oprsz(desc);
2920     uint32_t *d = vd, *n = vn, *m = vm;
2921 
2922     for (i = 0; i < opr_sz / 4; ++i) {
2923         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2924     }
2925     clear_tail(d, opr_sz, simd_maxsz(desc));
2926 }
2927 
2928 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2929 {
2930     intptr_t i, opr_sz = simd_oprsz(desc);
2931     uint64_t *d = vd, *n = vn, *m = vm;
2932     uint64_t discard;
2933 
2934     for (i = 0; i < opr_sz / 8; ++i) {
2935         mulu64(&discard, &d[i], n[i], m[i]);
2936     }
2937     clear_tail(d, opr_sz, simd_maxsz(desc));
2938 }
2939 
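/*
 * XAR: rotate the exclusive-OR of the two inputs right by
 * simd_data(desc) bits, per 64-bit element.
 */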
2940 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2941 {
2942     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2943     int shr = simd_data(desc);
2944     uint64_t *d = vd, *n = vn, *m = vm;
2945 
2946     for (i = 0; i < opr_sz; ++i) {
2947         d[i] = ror64(n[i] ^ m[i], shr);
2948     }
2949     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2950 }
2951 
2952 /*
2953  * Integer matrix-multiply accumulate
2954  */
2955 
2956 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2957 {
2958     int8_t *n = vn, *m = vm;
2959 
2960     for (intptr_t k = 0; k < 8; ++k) {
2961         sum += n[H1(k)] * m[H1(k)];
2962     }
2963     return sum;
2964 }
2965 
2966 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2967 {
2968     uint8_t *n = vn, *m = vm;
2969 
2970     for (intptr_t k = 0; k < 8; ++k) {
2971         sum += n[H1(k)] * m[H1(k)];
2972     }
2973     return sum;
2974 }
2975 
2976 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2977 {
2978     uint8_t *n = vn;
2979     int8_t *m = vm;
2980 
2981     for (intptr_t k = 0; k < 8; ++k) {
2982         sum += n[H1(k)] * m[H1(k)];
2983     }
2984     return sum;
2985 }
2986 
2987 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2988                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2989 {
2990     intptr_t seg, opr_sz = simd_oprsz(desc);
2991 
2992     for (seg = 0; seg < opr_sz; seg += 16) {
2993         uint32_t *d = vd + seg;
2994         uint32_t *a = va + seg;
2995         uint32_t sum0, sum1, sum2, sum3;
2996 
2997         /*
2998          * Process the entire segment at once, writing back the
2999          * results only after we've consumed all of the inputs.
3000          *
3001          * Key to indices by column:
3002          *          i   j                  i             j
3003          */
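        /*
         * In matrix terms each 16-byte segment computes, for i,j in {0,1},
         *
         *     d[2*i + j] = a[2*i + j] + sum_k n[8*i + k] * m[8*j + k]
         *
         * i.e. a 2x8 by 8x2 product (N * M^T) accumulated into the
         * 2x2 block of 32-bit results.
         */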
3004         sum0 = a[H4(0 + 0)];
3005         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
3006         sum1 = a[H4(0 + 1)];
3007         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
3008         sum2 = a[H4(2 + 0)];
3009         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
3010         sum3 = a[H4(2 + 1)];
3011         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
3012 
3013         d[H4(0)] = sum0;
3014         d[H4(1)] = sum1;
3015         d[H4(2)] = sum2;
3016         d[H4(3)] = sum3;
3017     }
3018     clear_tail(vd, opr_sz, simd_maxsz(desc));
3019 }
3020 
3021 #define DO_MMLA_B(NAME, INNER) \
3022     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
3023     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
3024 
3025 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
3026 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
3027 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
3028 
3029 /*
3030  * BFloat16 Dot Product
3031  */
3032 
3033 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
3034 {
3035     /*
3036      * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
3037      * For EBF = 0, we ignore the FPCR bits which determine rounding
3038      * mode and denormal-flushing, and we do unfused multiplies and
3039      * additions with intermediate rounding of all products and sums.
3040      * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
3041      * and we perform a fused two-way sum-of-products without intermediate
3042      * rounding of the products.
3043      * In either case, we don't set fp exception flags.
3044      *
3045      * EBF is AArch64 only, so even if it's set in the FPCR it has
3046      * no effect on AArch32 instructions.
3047      */
3048     bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
3049 
3050     *statusp = env->vfp.fp_status[is_a64(env) ? FPST_A64 : FPST_A32];
3051     set_default_nan_mode(true, statusp);
3052 
3053     if (ebf) {
3054         /* EBF=1 needs to do a step with round-to-odd semantics */
3055         *oddstatusp = *statusp;
3056         set_float_rounding_mode(float_round_to_odd, oddstatusp);
3057     } else {
3058         set_flush_to_zero(true, statusp);
3059         set_flush_inputs_to_zero(true, statusp);
3060         set_float_rounding_mode(float_round_to_odd_inf, statusp);
3061     }
3062     return ebf;
3063 }
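/*
 * A minimal usage sketch (cf. gvec_bfdot below): set up both statuses
 * once per helper, then pick the dot-product flavour from the
 * returned EBF flag.
 *
 *     float_status fpst, fpst_odd;
 *     if (is_ebf(env, &fpst, &fpst_odd)) {
 *         sum = bfdotadd_ebf(sum, e1, e2, &fpst, &fpst_odd);
 *     } else {
 *         sum = bfdotadd(sum, e1, e2, &fpst);
 *     }
 */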
3064 
3065 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
3066 {
3067     float32 t1, t2;
3068 
3069     /*
3070      * Extract each BFloat16 from the element pair, and shift
3071      * them such that they become float32.
3072      */
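    /*
     * A bfloat16 is the high 16 bits of a float32, so the low half of
     * each pair is shifted up into place and the high half is simply
     * masked in place; no conversion step is required.
     */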
3073     t1 = float32_mul(e1 << 16, e2 << 16, fpst);
3074     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
3075     t1 = float32_add(t1, t2, fpst);
3076     t1 = float32_add(sum, t1, fpst);
3077 
3078     return t1;
3079 }
3080 
3081 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
3082                      float_status *fpst, float_status *fpst_odd)
3083 {
3084     float32 s1r = e1 << 16;
3085     float32 s1c = e1 & 0xffff0000u;
3086     float32 s2r = e2 << 16;
3087     float32 s2c = e2 & 0xffff0000u;
3088     float32 t32;
3089 
3090     /* Cf. FPProcessNaNs4 */
3091     if (float32_is_any_nan(s1r) || float32_is_any_nan(s1c) ||
3092         float32_is_any_nan(s2r) || float32_is_any_nan(s2c)) {
3093         if (float32_is_signaling_nan(s1r, fpst)) {
3094             t32 = s1r;
3095         } else if (float32_is_signaling_nan(s1c, fpst)) {
3096             t32 = s1c;
3097         } else if (float32_is_signaling_nan(s2r, fpst)) {
3098             t32 = s2r;
3099         } else if (float32_is_signaling_nan(s2c, fpst)) {
3100             t32 = s2c;
3101         } else if (float32_is_any_nan(s1r)) {
3102             t32 = s1r;
3103         } else if (float32_is_any_nan(s1c)) {
3104             t32 = s1c;
3105         } else if (float32_is_any_nan(s2r)) {
3106             t32 = s2r;
3107         } else {
3108             t32 = s2c;
3109         }
3110         /*
3111          * FPConvertNaN(FPProcessNaN(t32)) will be done as part
3112          * of the final addition below.
3113          */
3114     } else {
3115         /*
3116          * Compare f16_dotadd() in sme_helper.c, but here we have
3117          * bfloat16 inputs. In particular that means that we do not
3118          * want the FPCR.FZ16 flush semantics, so we use the normal
3119          * float_status for the input handling here.
3120          */
3121         float64 e1r = float32_to_float64(s1r, fpst);
3122         float64 e1c = float32_to_float64(s1c, fpst);
3123         float64 e2r = float32_to_float64(s2r, fpst);
3124         float64 e2c = float32_to_float64(s2c, fpst);
3125         float64 t64;
3126 
3127         /*
3128          * The ARM pseudocode function FPDot performs both multiplies
3129          * and the add with a single rounding operation.  Emulate this
3130          * by performing the first multiply in round-to-odd, then doing
3131          * the second multiply as fused multiply-add, and rounding to
3132          * float32 all in one step.
3133          */
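        /*
         * Round-to-odd keeps a sticky low bit in the first product, so
         * the only rounding that can affect the float32 result is the
         * final float64r32 rounding; the net effect matches the single
         * rounding required by FPDot.
         */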
3134         t64 = float64_mul(e1r, e2r, fpst_odd);
3135         t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
3136 
3137         /* This conversion is exact, because we've already rounded. */
3138         t32 = float64_to_float32(t64, fpst);
3139     }
3140 
3141     /* The final accumulation step is not fused. */
3142     return float32_add(sum, t32, fpst);
3143 }
3144 
3145 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
3146                         CPUARMState *env, uint32_t desc)
3147 {
3148     intptr_t i, opr_sz = simd_oprsz(desc);
3149     float32 *d = vd, *a = va;
3150     uint32_t *n = vn, *m = vm;
3151     float_status fpst, fpst_odd;
3152 
3153     if (is_ebf(env, &fpst, &fpst_odd)) {
3154         for (i = 0; i < opr_sz / 4; ++i) {
3155             d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
3156         }
3157     } else {
3158         for (i = 0; i < opr_sz / 4; ++i) {
3159             d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
3160         }
3161     }
3162     clear_tail(d, opr_sz, simd_maxsz(desc));
3163 }
3164 
3165 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
3166                             void *va, CPUARMState *env, uint32_t desc)
3167 {
3168     intptr_t i, j, opr_sz = simd_oprsz(desc);
3169     intptr_t index = simd_data(desc);
3170     intptr_t elements = opr_sz / 4;
3171     intptr_t eltspersegment = MIN(16 / 4, elements);
3172     float32 *d = vd, *a = va;
3173     uint32_t *n = vn, *m = vm;
3174     float_status fpst, fpst_odd;
3175 
3176     if (is_ebf(env, &fpst, &fpst_odd)) {
3177         for (i = 0; i < elements; i += eltspersegment) {
3178             uint32_t m_idx = m[i + H4(index)];
3179 
3180             for (j = i; j < i + eltspersegment; j++) {
3181                 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
3182             }
3183         }
3184     } else {
3185         for (i = 0; i < elements; i += eltspersegment) {
3186             uint32_t m_idx = m[i + H4(index)];
3187 
3188             for (j = i; j < i + eltspersegment; j++) {
3189                 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
3190             }
3191         }
3192     }
3193     clear_tail(d, opr_sz, simd_maxsz(desc));
3194 }
3195 
3196 void HELPER(sme2_bfvdot_idx)(void *vd, void *vn, void *vm,
3197                              void *va, CPUARMState *env, uint32_t desc)
3198 {
3199     intptr_t i, j, opr_sz = simd_oprsz(desc);
3200     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT, 2);
3201     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
3202     intptr_t elements = opr_sz / 4;
3203     intptr_t eltspersegment = MIN(16 / 4, elements);
3204     float32 *d = vd, *a = va;
3205     uint16_t *n0 = vn;
3206     uint16_t *n1 = vn + sizeof(ARMVectorReg);
3207     uint32_t *m = vm;
3208     float_status fpst, fpst_odd;
3209 
3210     if (is_ebf(env, &fpst, &fpst_odd)) {
3211         for (i = 0; i < elements; i += eltspersegment) {
3212             uint32_t m_idx = m[i + H4(idx)];
3213 
3214             for (j = 0; j < eltspersegment; j++) {
3215                 uint32_t nn = (n0[H2(2 * (i + j) + sel)])
3216                             | (n1[H2(2 * (i + j) + sel)] << 16);
3217                 d[i + H4(j)] = bfdotadd_ebf(a[i + H4(j)], nn, m_idx,
3218                                             &fpst, &fpst_odd);
3219             }
3220         }
3221     } else {
3222         for (i = 0; i < elements; i += eltspersegment) {
3223             uint32_t m_idx = m[i + H4(idx)];
3224 
3225             for (j = 0; j < eltspersegment; j++) {
3226                 uint32_t nn = (n0[H2(2 * (i + j) + sel)])
3227                             | (n1[H2(2 * (i + j) + sel)] << 16);
3228                 d[i + H4(j)] = bfdotadd(a[i + H4(j)], nn, m_idx, &fpst);
3229             }
3230         }
3231     }
3232     clear_tail(d, opr_sz, simd_maxsz(desc));
3233 }
3234 
3235 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
3236                          CPUARMState *env, uint32_t desc)
3237 {
3238     intptr_t s, opr_sz = simd_oprsz(desc);
3239     float32 *d = vd, *a = va;
3240     uint32_t *n = vn, *m = vm;
3241     float_status fpst, fpst_odd;
3242 
3243     if (is_ebf(env, &fpst, &fpst_odd)) {
3244         for (s = 0; s < opr_sz / 4; s += 4) {
3245             float32 sum00, sum01, sum10, sum11;
3246 
3247             /*
3248              * Process the entire segment at once, writing back the
3249              * results only after we've consumed all of the inputs.
3250              *
3251              * Key to indices by column:
3252              *               i   j               i   k             j   k
3253              */
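            /*
             * As in do_mmla_b above: for i,j in {0,1},
             *
             *     d[2*i + j] = a[2*i + j] + sum_k bfdot(n[2*i + k], m[2*j + k])
             *
             * where each bfdot term is a two-way bfloat16 dot product
             * and k runs over the two 32-bit pairs of the row.
             */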
3254             sum00 = a[s + H4(0 + 0)];
3255             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
3256             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
3257 
3258             sum01 = a[s + H4(0 + 1)];
3259             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
3260             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
3261 
3262             sum10 = a[s + H4(2 + 0)];
3263             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
3264             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
3265 
3266             sum11 = a[s + H4(2 + 1)];
3267             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
3268             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
3269 
3270             d[s + H4(0 + 0)] = sum00;
3271             d[s + H4(0 + 1)] = sum01;
3272             d[s + H4(2 + 0)] = sum10;
3273             d[s + H4(2 + 1)] = sum11;
3274         }
3275     } else {
3276         for (s = 0; s < opr_sz / 4; s += 4) {
3277             float32 sum00, sum01, sum10, sum11;
3278 
3279             /*
3280              * Process the entire segment at once, writing back the
3281              * results only after we've consumed all of the inputs.
3282              *
3283              * Key to indices by column:
3284              *               i   j           i   k             j   k
3285              */
3286             sum00 = a[s + H4(0 + 0)];
3287             sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
3288             sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);
3289 
3290             sum01 = a[s + H4(0 + 1)];
3291             sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
3292             sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);
3293 
3294             sum10 = a[s + H4(2 + 0)];
3295             sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
3296             sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);
3297 
3298             sum11 = a[s + H4(2 + 1)];
3299             sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
3300             sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);
3301 
3302             d[s + H4(0 + 0)] = sum00;
3303             d[s + H4(0 + 1)] = sum01;
3304             d[s + H4(2 + 0)] = sum10;
3305             d[s + H4(2 + 1)] = sum11;
3306         }
3307     }
3308     clear_tail(d, opr_sz, simd_maxsz(desc));
3309 }
3310 
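/*
 * Shared routine for BFMLAL[BT] and the BFMLSL variants.  'sel' picks
 * the even or odd bfloat16 elements; 'negx' is a sign-bit mask XORed
 * into the raw bfloat16 operand (negation of the input, used when
 * FPCR.AH == 0), while 'negf' passes float_muladd_negate_* flags for
 * the FPCR.AH flavour of negating the product (cf. the gvec_ah_*
 * entry points below).
 */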
3311 static void do_bfmlal(float32 *d, bfloat16 *n, bfloat16 *m, float32 *a,
3312                       float_status *stat, uint32_t desc, int negx, int negf)
3313 {
3314     intptr_t i, opr_sz = simd_oprsz(desc);
3315     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
3316 
3317     for (i = 0; i < opr_sz / 4; ++i) {
3318         float32 nn = (negx ^ n[H2(i * 2 + sel)]) << 16;
3319         float32 mm = m[H2(i * 2 + sel)] << 16;
3320         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], negf, stat);
3321     }
3322     clear_tail(d, opr_sz, simd_maxsz(desc));
3323 }
3324 
3325 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
3326                          float_status *stat, uint32_t desc)
3327 {
3328     do_bfmlal(vd, vn, vm, va, stat, desc, 0, 0);
3329 }
3330 
3331 void HELPER(gvec_bfmlsl)(void *vd, void *vn, void *vm, void *va,
3332                          float_status *stat, uint32_t desc)
3333 {
3334     do_bfmlal(vd, vn, vm, va, stat, desc, 0x8000, 0);
3335 }
3336 
3337 void HELPER(gvec_ah_bfmlsl)(void *vd, void *vn, void *vm, void *va,
3338                             float_status *stat, uint32_t desc)
3339 {
3340     do_bfmlal(vd, vn, vm, va, stat, desc, 0, float_muladd_negate_product);
3341 }
3342 
3343 static void do_bfmlal_idx(float32 *d, bfloat16 *n, bfloat16 *m, float32 *a,
3344                           float_status *stat, uint32_t desc, int negx, int negf)
3345 {
3346     intptr_t i, j, opr_sz = simd_oprsz(desc);
3347     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
3348     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
3349     intptr_t elements = opr_sz / 4;
3350     intptr_t eltspersegment = MIN(16 / 4, elements);
3351 
3352     for (i = 0; i < elements; i += eltspersegment) {
3353         float32 m_idx = m[H2(2 * i + index)] << 16;
3354 
3355         for (j = i; j < i + eltspersegment; j++) {
3356             float32 n_j = (negx ^ n[H2(2 * j + sel)]) << 16;
3357             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], negf, stat);
3358         }
3359     }
3360     clear_tail(d, opr_sz, simd_maxsz(desc));
3361 }
3362 
3363 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, void *va,
3364                              float_status *stat, uint32_t desc)
3365 {
3366     do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0, 0);
3367 }
3368 
3369 void HELPER(gvec_bfmlsl_idx)(void *vd, void *vn, void *vm, void *va,
3370                              float_status *stat, uint32_t desc)
3371 {
3372     do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0x8000, 0);
3373 }
3374 
3375 void HELPER(gvec_ah_bfmlsl_idx)(void *vd, void *vn, void *vm, void *va,
3376                                 float_status *stat, uint32_t desc)
3377 {
3378     do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0, float_muladd_negate_product);
3379 }
3380 
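/*
 * SCLAMP/UCLAMP: clamp the accumulator element between a lower bound
 * from the first source and an upper bound from the second, i.e.
 * d = MIN(MAX(a, n), m).
 */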
3381 #define DO_CLAMP(NAME, TYPE) \
3382 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
3383 {                                                                       \
3384     intptr_t i, opr_sz = simd_oprsz(desc);                              \
3385     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
3386         TYPE aa = *(TYPE *)(a + i);                                     \
3387         TYPE nn = *(TYPE *)(n + i);                                     \
3388         TYPE mm = *(TYPE *)(m + i);                                     \
3389         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
3390         *(TYPE *)(d + i) = dd;                                          \
3391     }                                                                   \
3392     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
3393 }
3394 
3395 DO_CLAMP(gvec_sclamp_b, int8_t)
3396 DO_CLAMP(gvec_sclamp_h, int16_t)
3397 DO_CLAMP(gvec_sclamp_s, int32_t)
3398 DO_CLAMP(gvec_sclamp_d, int64_t)
3399 
3400 DO_CLAMP(gvec_uclamp_b, uint8_t)
3401 DO_CLAMP(gvec_uclamp_h, uint16_t)
3402 DO_CLAMP(gvec_uclamp_s, uint32_t)
3403 DO_CLAMP(gvec_uclamp_d, uint64_t)
3404 
3405 /* Bit count in each 8-bit word. */
3406 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
3407 {
3408     intptr_t i, opr_sz = simd_oprsz(desc);
3409     uint8_t *d = vd, *n = vn;
3410 
3411     for (i = 0; i < opr_sz; ++i) {
3412         d[i] = ctpop8(n[i]);
3413     }
3414     clear_tail(d, opr_sz, simd_maxsz(desc));
3415 }
3416 
3417 /* Reverse bits in each 8-bit word. */
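/*
 * revbit64() reverses both the bit order within each byte and the
 * byte order; applying bswap64() first pre-reverses the byte order so
 * that it cancels out, leaving only the per-byte bit reversal.
 */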
3418 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
3419 {
3420     intptr_t i, opr_sz = simd_oprsz(desc);
3421     uint64_t *d = vd, *n = vn;
3422 
3423     for (i = 0; i < opr_sz / 8; ++i) {
3424         d[i] = revbit64(bswap64(n[i]));
3425     }
3426     clear_tail(d, opr_sz, simd_maxsz(desc));
3427 }
3428 
3429 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
3430 {
3431     intptr_t i, opr_sz = simd_oprsz(desc);
3432     uint32_t *d = vd, *n = vn;
3433 
3434     for (i = 0; i < opr_sz / 4; ++i) {
3435         d[i] = helper_recpe_u32(n[i]);
3436     }
3437     clear_tail(d, opr_sz, simd_maxsz(desc));
3438 }
3439 
3440 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
3441 {
3442     intptr_t i, opr_sz = simd_oprsz(desc);
3443     uint32_t *d = vd, *n = vn;
3444 
3445     for (i = 0; i < opr_sz / 4; ++i) {
3446         d[i] = helper_rsqrte_u32(n[i]);
3447     }
3448     clear_tail(d, opr_sz, simd_maxsz(desc));
3449 }
3450 
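/*
 * SME2 LUTI2/LUTI4 table lookups: each destination element is selected
 * from the ZT0 table using a 2- or 4-bit index drawn from Zn.
 * 'segbase' gives the starting index position for destination register
 * r within a consecutive or strided group of 'nreg' destinations.
 */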
3451 static inline void do_lut_b(void *zd, uint64_t *indexes, uint64_t *table,
3452                             unsigned elements, unsigned segbase,
3453                             unsigned dstride, unsigned isize,
3454                             unsigned tsize, unsigned nreg)
3455 {
3456     for (unsigned r = 0; r < nreg; ++r) {
3457         uint8_t *dst = zd + dstride * r;
3458         unsigned base = segbase + r * elements;
3459 
3460         for (unsigned e = 0; e < elements; ++e) {
3461             unsigned index = extractn(indexes, (base + e) * isize, isize);
3462             dst[H1(e)] = extractn(table, index * tsize, 8);
3463         }
3464     }
3465 }
3466 
3467 static inline void do_lut_h(void *zd, uint64_t *indexes, uint64_t *table,
3468                             unsigned elements, unsigned segbase,
3469                             unsigned dstride, unsigned isize,
3470                             unsigned tsize, unsigned nreg)
3471 {
3472     for (unsigned r = 0; r < nreg; ++r) {
3473         uint16_t *dst = zd + dstride * r;
3474         unsigned base = segbase + r * elements;
3475 
3476         for (unsigned e = 0; e < elements; ++e) {
3477             unsigned index = extractn(indexes, (base + e) * isize, isize);
3478             dst[H2(e)] = extractn(table, index * tsize, 16);
3479         }
3480     }
3481 }
3482 
3483 static inline void do_lut_s(void *zd, uint64_t *indexes, uint32_t *table,
3484                             unsigned elements, unsigned segbase,
3485                             unsigned dstride, unsigned isize,
3486                             unsigned tsize, unsigned nreg)
3487 {
3488     for (unsigned r = 0; r < nreg; ++r) {
3489         uint32_t *dst = zd + dstride * r;
3490         unsigned base = segbase + r * elements;
3491 
3492         for (unsigned e = 0; e < elements; ++e) {
3493             unsigned index = extractn(indexes, (base + e) * isize, isize);
3494             dst[H4(e)] = table[H4(index)];
3495         }
3496     }
3497 }
3498 
3499 #define DO_SME2_LUT(ISIZE, NREG, SUFF, ESIZE) \
3500 void helper_sme2_luti##ISIZE##_##NREG##SUFF                             \
3501     (void *zd, void *zn, CPUARMState *env, uint32_t desc)               \
3502 {                                                                       \
3503     unsigned vl = simd_oprsz(desc);                                     \
3504     unsigned strided = extract32(desc, SIMD_DATA_SHIFT, 1);             \
3505     unsigned idx = extract32(desc, SIMD_DATA_SHIFT + 1, 4);             \
3506     unsigned elements = vl / ESIZE;                                     \
3507     unsigned dstride = (!strided ? 1 : NREG == 4 ? 4 : 8);              \
3508     unsigned segments = (ESIZE * 8) / (ISIZE * NREG);                   \
3509     unsigned segment = idx & (segments - 1);                            \
3510     ARMVectorReg indexes;                                               \
3511     memcpy(&indexes, zn, vl);                                           \
3512     do_lut_##SUFF(zd, indexes.d, (void *)env->za_state.zt0, elements,   \
3513                   segment * NREG * elements,                            \
3514                   dstride * sizeof(ARMVectorReg), ISIZE, 32, NREG);     \
3515 }
3516 
3517 DO_SME2_LUT(2,1,b, 1)
3518 DO_SME2_LUT(2,1,h, 2)
3519 DO_SME2_LUT(2,1,s, 4)
3520 DO_SME2_LUT(2,2,b, 1)
3521 DO_SME2_LUT(2,2,h, 2)
3522 DO_SME2_LUT(2,2,s, 4)
3523 DO_SME2_LUT(2,4,b, 1)
3524 DO_SME2_LUT(2,4,h, 2)
3525 DO_SME2_LUT(2,4,s, 4)
3526 
3527 DO_SME2_LUT(4,1,b, 1)
3528 DO_SME2_LUT(4,1,h, 2)
3529 DO_SME2_LUT(4,1,s, 4)
3530 DO_SME2_LUT(4,2,b, 1)
3531 DO_SME2_LUT(4,2,h, 2)
3532 DO_SME2_LUT(4,2,s, 4)
3533 DO_SME2_LUT(4,4,h, 2)
3534 DO_SME2_LUT(4,4,s, 4)
3535 
3536 #undef DO_SME2_LUT
3537