1 /*
2 * ARM AdvSIMD / SVE Vector Operations
3 *
4 * Copyright (c) 2018 Linaro
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "crypto/clmul.h"
27 #include "vec_internal.h"
28
29 /*
30 * Data for expanding active predicate bits to bytes, for byte elements.
31 *
32 * for (i = 0; i < 256; ++i) {
33 * unsigned long m = 0;
34 * for (j = 0; j < 8; j++) {
35 * if ((i >> j) & 1) {
36 * m |= 0xfful << (j << 3);
37 * }
38 * }
39 * printf("0x%016lx,\n", m);
40 * }
41 */
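/*
 * Illustration (not from the original source): a predicate byte of 0x05
 * has bits 0 and 2 set, so expand_pred_b_data[0x05] == 0x0000000000ff00ff,
 * i.e. bytes 0 and 2 of the expanded mask are all-ones.
 */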
42 const uint64_t expand_pred_b_data[256] = {
43 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
44 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
45 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
46 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
47 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
48 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
49 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
50 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
51 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
52 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
53 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
54 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
55 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
56 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
57 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
58 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
59 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
60 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
61 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
62 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
63 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
64 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
65 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
66 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
67 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
68 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
69 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
70 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
71 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
72 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
73 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
74 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
75 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
76 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
77 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
78 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
79 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
80 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
81 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
82 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
83 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
84 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
85 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
86 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
87 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
88 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
89 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
90 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
91 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
92 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
93 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
94 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
95 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
96 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
97 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
98 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
99 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
100 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
101 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
102 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
103 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
104 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
105 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
106 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
107 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
108 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
109 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
110 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
111 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
112 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
113 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
114 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
115 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
116 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
117 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
118 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
119 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
120 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
121 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
122 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
123 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
124 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
125 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
126 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
127 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
128 0xffffffffffffffff,
129 };
130
131 /*
132 * Similarly for half-word elements.
133 * for (i = 0; i < 256; ++i) {
134 * unsigned long m = 0;
135 * if (i & 0xaa) {
136 * continue;
137 * }
138 * for (j = 0; j < 8; j += 2) {
139 * if ((i >> j) & 1) {
140 * m |= 0xfffful << (j << 3);
141 * }
142 * }
143 * printf("[0x%x] = 0x%016lx,\n", i, m);
144 * }
145 */
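/*
 * Illustration (not from the original source): only predicate values with
 * the odd bits clear are valid half-word predicates, so this table is
 * sparse; expand_pred_h_data[0x11] == 0x0000ffff0000ffff selects
 * half-words 0 and 2.
 */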
146 const uint64_t expand_pred_h_data[0x55 + 1] = {
147 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
148 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
149 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
150 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
151 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
152 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
153 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
154 [0x55] = 0xffffffffffffffff,
155 };
156
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
159 bool neg, bool round)
160 {
161 /*
162 * Simplify:
163 * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
164 * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
165 */
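    /*
     * Worked example (illustrative): with src1 = src2 = 0x40, src3 = 0
     * and round set, (0x40 * 0x40 + (1 << 6)) >> 7 = 4160 >> 7 = 32.
     * With src1 = src2 = INT8_MIN the shifted result is 128, which does
     * not fit in int8_t and saturates to INT8_MAX below.
     */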
166 int32_t ret = (int32_t)src1 * src2;
167 if (neg) {
168 ret = -ret;
169 }
170 ret += ((int32_t)src3 << 7) + (round << 6);
171 ret >>= 7;
172
173 if (ret != (int8_t)ret) {
174 ret = (ret < 0 ? INT8_MIN : INT8_MAX);
175 }
176 return ret;
177 }
178
void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180 void *va, uint32_t desc)
181 {
182 intptr_t i, opr_sz = simd_oprsz(desc);
183 int8_t *d = vd, *n = vn, *m = vm, *a = va;
184
185 for (i = 0; i < opr_sz; ++i) {
186 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187 }
188 }
189
void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191 void *va, uint32_t desc)
192 {
193 intptr_t i, opr_sz = simd_oprsz(desc);
194 int8_t *d = vd, *n = vn, *m = vm, *a = va;
195
196 for (i = 0; i < opr_sz; ++i) {
197 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198 }
199 }
200
void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202 {
203 intptr_t i, opr_sz = simd_oprsz(desc);
204 int8_t *d = vd, *n = vn, *m = vm;
205
206 for (i = 0; i < opr_sz; ++i) {
207 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208 }
209 }
210
void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212 {
213 intptr_t i, opr_sz = simd_oprsz(desc);
214 int8_t *d = vd, *n = vn, *m = vm;
215
216 for (i = 0; i < opr_sz; ++i) {
217 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218 }
219 }
220
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
223 bool neg, bool round, uint32_t *sat)
224 {
225 /* Simplify similarly to do_sqrdmlah_b above. */
226 int32_t ret = (int32_t)src1 * src2;
227 if (neg) {
228 ret = -ret;
229 }
230 ret += ((int32_t)src3 << 15) + (round << 14);
231 ret >>= 15;
232
233 if (ret != (int16_t)ret) {
234 *sat = 1;
235 ret = (ret < 0 ? INT16_MIN : INT16_MAX);
236 }
237 return ret;
238 }
239
uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241 uint32_t src2, uint32_t src3)
242 {
243 uint32_t *sat = &env->vfp.qc[0];
244 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246 false, true, sat);
247 return deposit32(e1, 16, 16, e2);
248 }
249
void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251 void *vq, uint32_t desc)
252 {
253 uintptr_t opr_sz = simd_oprsz(desc);
254 int16_t *d = vd;
255 int16_t *n = vn;
256 int16_t *m = vm;
257 uintptr_t i;
258
259 for (i = 0; i < opr_sz / 2; ++i) {
260 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261 }
262 clear_tail(d, opr_sz, simd_maxsz(desc));
263 }
264
uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266 uint32_t src2, uint32_t src3)
267 {
268 uint32_t *sat = &env->vfp.qc[0];
269 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271 true, true, sat);
272 return deposit32(e1, 16, 16, e2);
273 }
274
void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276 void *vq, uint32_t desc)
277 {
278 uintptr_t opr_sz = simd_oprsz(desc);
279 int16_t *d = vd;
280 int16_t *n = vn;
281 int16_t *m = vm;
282 uintptr_t i;
283
284 for (i = 0; i < opr_sz / 2; ++i) {
285 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286 }
287 clear_tail(d, opr_sz, simd_maxsz(desc));
288 }
289
void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291 void *vq, uint32_t desc)
292 {
293 intptr_t i, opr_sz = simd_oprsz(desc);
294 int16_t *d = vd, *n = vn, *m = vm;
295
296 for (i = 0; i < opr_sz / 2; ++i) {
297 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298 }
299 clear_tail(d, opr_sz, simd_maxsz(desc));
300 }
301
void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303 void *vq, uint32_t desc)
304 {
305 intptr_t i, opr_sz = simd_oprsz(desc);
306 int16_t *d = vd, *n = vn, *m = vm;
307
308 for (i = 0; i < opr_sz / 2; ++i) {
309 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310 }
311 clear_tail(d, opr_sz, simd_maxsz(desc));
312 }
313
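/*
 * For the "_idx" forms, the multiplier is a single element taken at
 * offset simd_data(desc) within each 128-bit segment of vm; 16 / 2 is
 * the number of 16-bit elements in such a segment.
 */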
void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
315 void *vq, uint32_t desc)
316 {
317 intptr_t i, j, opr_sz = simd_oprsz(desc);
318 int idx = simd_data(desc);
319 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
320 intptr_t elements = opr_sz / 2;
321 intptr_t eltspersegment = MIN(16 / 2, elements);
322
323 for (i = 0; i < elements; i += 16 / 2) {
324 int16_t mm = m[i];
325 for (j = 0; j < eltspersegment; ++j) {
326 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
327 }
328 }
329 clear_tail(d, opr_sz, simd_maxsz(desc));
330 }
331
void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
333 void *vq, uint32_t desc)
334 {
335 intptr_t i, j, opr_sz = simd_oprsz(desc);
336 int idx = simd_data(desc);
337 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
338 intptr_t elements = opr_sz / 2;
339 intptr_t eltspersegment = MIN(16 / 2, elements);
340
341 for (i = 0; i < elements; i += 16 / 2) {
342 int16_t mm = m[i];
343 for (j = 0; j < eltspersegment; ++j) {
344 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
345 }
346 }
347 clear_tail(d, opr_sz, simd_maxsz(desc));
348 }
349
void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
351 void *vq, uint32_t desc)
352 {
353 intptr_t i, j, opr_sz = simd_oprsz(desc);
354 int idx = simd_data(desc);
355 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
356 intptr_t elements = opr_sz / 2;
357 intptr_t eltspersegment = MIN(16 / 2, elements);
358
359 for (i = 0; i < elements; i += 16 / 2) {
360 int16_t mm = m[i];
361 for (j = 0; j < eltspersegment; ++j) {
362 d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
363 }
364 }
365 clear_tail(d, opr_sz, simd_maxsz(desc));
366 }
367
void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
369 void *vq, uint32_t desc)
370 {
371 intptr_t i, j, opr_sz = simd_oprsz(desc);
372 int idx = simd_data(desc);
373 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
374 intptr_t elements = opr_sz / 2;
375 intptr_t eltspersegment = MIN(16 / 2, elements);
376
377 for (i = 0; i < elements; i += 16 / 2) {
378 int16_t mm = m[i];
379 for (j = 0; j < eltspersegment; ++j) {
380 d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
381 }
382 }
383 clear_tail(d, opr_sz, simd_maxsz(desc));
384 }
385
void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
387 void *va, uint32_t desc)
388 {
389 intptr_t i, opr_sz = simd_oprsz(desc);
390 int16_t *d = vd, *n = vn, *m = vm, *a = va;
391 uint32_t discard;
392
393 for (i = 0; i < opr_sz / 2; ++i) {
394 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
395 }
396 }
397
void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
399 void *va, uint32_t desc)
400 {
401 intptr_t i, opr_sz = simd_oprsz(desc);
402 int16_t *d = vd, *n = vn, *m = vm, *a = va;
403 uint32_t discard;
404
405 for (i = 0; i < opr_sz / 2; ++i) {
406 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
407 }
408 }
409
void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
411 {
412 intptr_t i, opr_sz = simd_oprsz(desc);
413 int16_t *d = vd, *n = vn, *m = vm;
414 uint32_t discard;
415
416 for (i = 0; i < opr_sz / 2; ++i) {
417 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
418 }
419 }
420
void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
422 {
423 intptr_t i, opr_sz = simd_oprsz(desc);
424 int16_t *d = vd, *n = vn, *m = vm;
425 uint32_t discard;
426
427 for (i = 0; i < opr_sz / 2; ++i) {
428 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
429 }
430 }
431
void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
433 {
434 intptr_t i, j, opr_sz = simd_oprsz(desc);
435 int idx = simd_data(desc);
436 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
437 uint32_t discard;
438
439 for (i = 0; i < opr_sz / 2; i += 16 / 2) {
440 int16_t mm = m[i];
441 for (j = 0; j < 16 / 2; ++j) {
442 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
443 }
444 }
445 }
446
void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
448 {
449 intptr_t i, j, opr_sz = simd_oprsz(desc);
450 int idx = simd_data(desc);
451 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
452 uint32_t discard;
453
454 for (i = 0; i < opr_sz / 2; i += 16 / 2) {
455 int16_t mm = m[i];
456 for (j = 0; j < 16 / 2; ++j) {
457 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
458 }
459 }
460 }
461
462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
464 bool neg, bool round, uint32_t *sat)
465 {
466 /* Simplify similarly to do_sqrdmlah_b above. */
467 int64_t ret = (int64_t)src1 * src2;
468 if (neg) {
469 ret = -ret;
470 }
471 ret += ((int64_t)src3 << 31) + (round << 30);
472 ret >>= 31;
473
474 if (ret != (int32_t)ret) {
475 *sat = 1;
476 ret = (ret < 0 ? INT32_MIN : INT32_MAX);
477 }
478 return ret;
479 }
480
uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
482 int32_t src2, int32_t src3)
483 {
484 uint32_t *sat = &env->vfp.qc[0];
485 return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
486 }
487
void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
489 void *vq, uint32_t desc)
490 {
491 uintptr_t opr_sz = simd_oprsz(desc);
492 int32_t *d = vd;
493 int32_t *n = vn;
494 int32_t *m = vm;
495 uintptr_t i;
496
497 for (i = 0; i < opr_sz / 4; ++i) {
498 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
499 }
500 clear_tail(d, opr_sz, simd_maxsz(desc));
501 }
502
uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
504 int32_t src2, int32_t src3)
505 {
506 uint32_t *sat = &env->vfp.qc[0];
507 return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
508 }
509
void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
511 void *vq, uint32_t desc)
512 {
513 uintptr_t opr_sz = simd_oprsz(desc);
514 int32_t *d = vd;
515 int32_t *n = vn;
516 int32_t *m = vm;
517 uintptr_t i;
518
519 for (i = 0; i < opr_sz / 4; ++i) {
520 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
521 }
522 clear_tail(d, opr_sz, simd_maxsz(desc));
523 }
524
void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
526 void *vq, uint32_t desc)
527 {
528 intptr_t i, opr_sz = simd_oprsz(desc);
529 int32_t *d = vd, *n = vn, *m = vm;
530
531 for (i = 0; i < opr_sz / 4; ++i) {
532 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
533 }
534 clear_tail(d, opr_sz, simd_maxsz(desc));
535 }
536
void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
538 void *vq, uint32_t desc)
539 {
540 intptr_t i, opr_sz = simd_oprsz(desc);
541 int32_t *d = vd, *n = vn, *m = vm;
542
543 for (i = 0; i < opr_sz / 4; ++i) {
544 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
545 }
546 clear_tail(d, opr_sz, simd_maxsz(desc));
547 }
548
void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
550 void *vq, uint32_t desc)
551 {
552 intptr_t i, j, opr_sz = simd_oprsz(desc);
553 int idx = simd_data(desc);
554 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
555 intptr_t elements = opr_sz / 4;
556 intptr_t eltspersegment = MIN(16 / 4, elements);
557
558 for (i = 0; i < elements; i += 16 / 4) {
559 int32_t mm = m[i];
560 for (j = 0; j < eltspersegment; ++j) {
561 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
562 }
563 }
564 clear_tail(d, opr_sz, simd_maxsz(desc));
565 }
566
void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
568 void *vq, uint32_t desc)
569 {
570 intptr_t i, j, opr_sz = simd_oprsz(desc);
571 int idx = simd_data(desc);
572 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
573 intptr_t elements = opr_sz / 4;
574 intptr_t eltspersegment = MIN(16 / 4, elements);
575
576 for (i = 0; i < elements; i += 16 / 4) {
577 int32_t mm = m[i];
578 for (j = 0; j < eltspersegment; ++j) {
579 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
580 }
581 }
582 clear_tail(d, opr_sz, simd_maxsz(desc));
583 }
584
void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
586 void *vq, uint32_t desc)
587 {
588 intptr_t i, j, opr_sz = simd_oprsz(desc);
589 int idx = simd_data(desc);
590 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
591 intptr_t elements = opr_sz / 4;
592 intptr_t eltspersegment = MIN(16 / 4, elements);
593
594 for (i = 0; i < elements; i += 16 / 4) {
595 int32_t mm = m[i];
596 for (j = 0; j < eltspersegment; ++j) {
597 d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
598 }
599 }
600 clear_tail(d, opr_sz, simd_maxsz(desc));
601 }
602
void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
604 void *vq, uint32_t desc)
605 {
606 intptr_t i, j, opr_sz = simd_oprsz(desc);
607 int idx = simd_data(desc);
608 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
609 intptr_t elements = opr_sz / 4;
610 intptr_t eltspersegment = MIN(16 / 4, elements);
611
612 for (i = 0; i < elements; i += 16 / 4) {
613 int32_t mm = m[i];
614 for (j = 0; j < eltspersegment; ++j) {
615 d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
616 }
617 }
618 clear_tail(d, opr_sz, simd_maxsz(desc));
619 }
620
void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
622 void *va, uint32_t desc)
623 {
624 intptr_t i, opr_sz = simd_oprsz(desc);
625 int32_t *d = vd, *n = vn, *m = vm, *a = va;
626 uint32_t discard;
627
628 for (i = 0; i < opr_sz / 4; ++i) {
629 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
630 }
631 }
632
void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
634 void *va, uint32_t desc)
635 {
636 intptr_t i, opr_sz = simd_oprsz(desc);
637 int32_t *d = vd, *n = vn, *m = vm, *a = va;
638 uint32_t discard;
639
640 for (i = 0; i < opr_sz / 4; ++i) {
641 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
642 }
643 }
644
void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
646 {
647 intptr_t i, opr_sz = simd_oprsz(desc);
648 int32_t *d = vd, *n = vn, *m = vm;
649 uint32_t discard;
650
651 for (i = 0; i < opr_sz / 4; ++i) {
652 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
653 }
654 }
655
void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
657 {
658 intptr_t i, opr_sz = simd_oprsz(desc);
659 int32_t *d = vd, *n = vn, *m = vm;
660 uint32_t discard;
661
662 for (i = 0; i < opr_sz / 4; ++i) {
663 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
664 }
665 }
666
void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
668 {
669 intptr_t i, j, opr_sz = simd_oprsz(desc);
670 int idx = simd_data(desc);
671 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
672 uint32_t discard;
673
674 for (i = 0; i < opr_sz / 4; i += 16 / 4) {
675 int32_t mm = m[i];
676 for (j = 0; j < 16 / 4; ++j) {
677 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
678 }
679 }
680 }
681
void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
683 {
684 intptr_t i, j, opr_sz = simd_oprsz(desc);
685 int idx = simd_data(desc);
686 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
687 uint32_t discard;
688
689 for (i = 0; i < opr_sz / 4; i += 16 / 4) {
690 int32_t mm = m[i];
691 for (j = 0; j < 16 / 4; ++j) {
692 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
693 }
694 }
695 }
696
697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
699 {
700 int64_t ls = int128_getlo(r);
701 int64_t hs = int128_gethi(r);
702
703 if (unlikely(hs != (ls >> 63))) {
704 return hs < 0 ? INT64_MIN : INT64_MAX;
705 }
706 return ls;
707 }
708
int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
710 {
711 uint64_t l, h;
712 Int128 r, t;
713
714 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
715 muls64(&l, &h, m, n);
716 r = int128_make128(l, h);
717 if (neg) {
718 r = int128_neg(r);
719 }
720 if (a) {
721 t = int128_exts64(a);
722 t = int128_lshift(t, 63);
723 r = int128_add(r, t);
724 }
725 if (round) {
726 t = int128_exts64(1ll << 62);
727 r = int128_add(r, t);
728 }
729 r = int128_rshift(r, 63);
730
731 return do_sat128_d(r);
732 }
733
void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
735 void *va, uint32_t desc)
736 {
737 intptr_t i, opr_sz = simd_oprsz(desc);
738 int64_t *d = vd, *n = vn, *m = vm, *a = va;
739
740 for (i = 0; i < opr_sz / 8; ++i) {
741 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
742 }
743 }
744
void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
746 void *va, uint32_t desc)
747 {
748 intptr_t i, opr_sz = simd_oprsz(desc);
749 int64_t *d = vd, *n = vn, *m = vm, *a = va;
750
751 for (i = 0; i < opr_sz / 8; ++i) {
752 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
753 }
754 }
755
void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
757 {
758 intptr_t i, opr_sz = simd_oprsz(desc);
759 int64_t *d = vd, *n = vn, *m = vm;
760
761 for (i = 0; i < opr_sz / 8; ++i) {
762 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
763 }
764 }
765
void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
767 {
768 intptr_t i, opr_sz = simd_oprsz(desc);
769 int64_t *d = vd, *n = vn, *m = vm;
770
771 for (i = 0; i < opr_sz / 8; ++i) {
772 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
773 }
774 }
775
void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
777 {
778 intptr_t i, j, opr_sz = simd_oprsz(desc);
779 int idx = simd_data(desc);
780 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
781
782 for (i = 0; i < opr_sz / 8; i += 16 / 8) {
783 int64_t mm = m[i];
784 for (j = 0; j < 16 / 8; ++j) {
785 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
786 }
787 }
788 }
789
void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
791 {
792 intptr_t i, j, opr_sz = simd_oprsz(desc);
793 int idx = simd_data(desc);
794 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
795
796 for (i = 0; i < opr_sz / 8; i += 16 / 8) {
797 int64_t mm = m[i];
798 for (j = 0; j < 16 / 8; ++j) {
799 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
800 }
801 }
802 }
803
804 /* Integer 8 and 16-bit dot-product.
805 *
806 * Note that for the loops herein, host endianness does not matter
807 * with respect to the ordering of data within the quad-width lanes.
808 * All elements are treated equally, no matter where they are.
809 */
810
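/*
 * Illustration (not from the original source): for the 4-way byte dot
 * product, each 32-bit result lane is
 *     d[i] = a[i] + n[4i]*m[4i] + n[4i+1]*m[4i+1]
 *                 + n[4i+2]*m[4i+2] + n[4i+3]*m[4i+3]
 * e.g. n = {1,2,3,4}, m = {5,6,7,8}, a[i] = 0 gives d[i] = 70.
 */
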
811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
813 { \
814 intptr_t i, opr_sz = simd_oprsz(desc); \
815 TYPED *d = vd, *a = va; \
816 TYPEN *n = vn; \
817 TYPEM *m = vm; \
818 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
819 d[i] = (a[i] + \
820 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \
821 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \
822 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \
823 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \
824 } \
825 clear_tail(d, opr_sz, simd_maxsz(desc)); \
826 }
827
DO_DOT(gvec_sdot_4b, int32_t, int8_t, int8_t)
829 DO_DOT(gvec_udot_4b, uint32_t, uint8_t, uint8_t)
830 DO_DOT(gvec_usdot_4b, uint32_t, uint8_t, int8_t)
831 DO_DOT(gvec_sdot_4h, int64_t, int16_t, int16_t)
832 DO_DOT(gvec_udot_4h, uint64_t, uint16_t, uint16_t)
833
834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
836 { \
837 intptr_t i = 0, opr_sz = simd_oprsz(desc); \
838 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
839 /* \
840 * Special case: opr_sz == 8 from AA64/AA32 advsimd means the \
841 * first iteration might not be a full 16 byte segment. But \
842 * for vector lengths beyond that this must be SVE and we know \
843 * opr_sz is a multiple of 16, so we need not clamp segend \
844 * to opr_sz_n when we advance it at the end of the loop. \
845 */ \
846 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
847 intptr_t index = simd_data(desc); \
848 TYPED *d = vd, *a = va; \
849 TYPEN *n = vn; \
850 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \
851 do { \
852 TYPED m0 = m_indexed[i * 4 + 0]; \
853 TYPED m1 = m_indexed[i * 4 + 1]; \
854 TYPED m2 = m_indexed[i * 4 + 2]; \
855 TYPED m3 = m_indexed[i * 4 + 3]; \
856 do { \
857 d[i] = (a[i] + \
858 n[i * 4 + 0] * m0 + \
859 n[i * 4 + 1] * m1 + \
860 n[i * 4 + 2] * m2 + \
861 n[i * 4 + 3] * m3); \
862 } while (++i < segend); \
863 segend = i + (16 / sizeof(TYPED)); \
864 } while (i < opr_sz_n); \
865 clear_tail(d, opr_sz, simd_maxsz(desc)); \
866 }
867
868 DO_DOT_IDX(gvec_sdot_idx_4b, int32_t, int8_t, int8_t, H4)
869 DO_DOT_IDX(gvec_udot_idx_4b, uint32_t, uint8_t, uint8_t, H4)
870 DO_DOT_IDX(gvec_sudot_idx_4b, int32_t, int8_t, uint8_t, H4)
871 DO_DOT_IDX(gvec_usdot_idx_4b, int32_t, uint8_t, int8_t, H4)
872 DO_DOT_IDX(gvec_sdot_idx_4h, int64_t, int16_t, int16_t, H8)
873 DO_DOT_IDX(gvec_udot_idx_4h, uint64_t, uint16_t, uint16_t, H8)
874
875 #undef DO_DOT
876 #undef DO_DOT_IDX
877
878 /* Similar for 2-way dot product */
879 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
880 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
881 { \
882 intptr_t i, opr_sz = simd_oprsz(desc); \
883 TYPED *d = vd, *a = va; \
884 TYPEN *n = vn; \
885 TYPEM *m = vm; \
886 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
887 d[i] = (a[i] + \
888 (TYPED)n[i * 2 + 0] * m[i * 2 + 0] + \
889 (TYPED)n[i * 2 + 1] * m[i * 2 + 1]); \
890 } \
891 clear_tail(d, opr_sz, simd_maxsz(desc)); \
892 }
893
894 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
895 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
896 { \
897 intptr_t i = 0, opr_sz = simd_oprsz(desc); \
898 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
899 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
900 intptr_t index = simd_data(desc); \
901 TYPED *d = vd, *a = va; \
902 TYPEN *n = vn; \
903 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 2; \
904 do { \
905 TYPED m0 = m_indexed[i * 2 + 0]; \
906 TYPED m1 = m_indexed[i * 2 + 1]; \
907 do { \
908 d[i] = (a[i] + \
909 n[i * 2 + 0] * m0 + \
910 n[i * 2 + 1] * m1); \
911 } while (++i < segend); \
912 segend = i + (16 / sizeof(TYPED)); \
913 } while (i < opr_sz_n); \
914 clear_tail(d, opr_sz, simd_maxsz(desc)); \
915 }
916
917 DO_DOT(gvec_sdot_2h, int32_t, int16_t, int16_t)
918 DO_DOT(gvec_udot_2h, uint32_t, uint16_t, uint16_t)
919
920 DO_DOT_IDX(gvec_sdot_idx_2h, int32_t, int16_t, int16_t, H4)
921 DO_DOT_IDX(gvec_udot_idx_2h, uint32_t, uint16_t, uint16_t, H4)
922
923 #undef DO_DOT
924 #undef DO_DOT_IDX
925
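/*
 * Floating-point complex add. Each pair of elements is treated as
 * (real, imag) and the "rot" bit in desc selects the rotation applied to
 * the second operand. Sketch of the arithmetic (ignoring the FPCR.AH
 * handling of NaN negation, and assuming rot = 0 corresponds to
 * FCADD #90 and rot = 1 to FCADD #270):
 *   rot = 0:  d_re = n_re - m_im,  d_im = n_im + m_re
 *   rot = 1:  d_re = n_re + m_im,  d_im = n_im - m_re
 */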
926 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
927 float_status *fpst, uint32_t desc)
928 {
929 uintptr_t opr_sz = simd_oprsz(desc);
930 float16 *d = vd;
931 float16 *n = vn;
932 float16 *m = vm;
933 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
934 bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
935 uintptr_t i;
936
937 for (i = 0; i < opr_sz / 2; i += 2) {
938 float16 e0 = n[H2(i)];
939 float16 e1 = m[H2(i + 1)];
940 float16 e2 = n[H2(i + 1)];
941 float16 e3 = m[H2(i)];
942
943 if (rot) {
944 e3 = float16_maybe_ah_chs(e3, fpcr_ah);
945 } else {
946 e1 = float16_maybe_ah_chs(e1, fpcr_ah);
947 }
948
949 d[H2(i)] = float16_add(e0, e1, fpst);
950 d[H2(i + 1)] = float16_add(e2, e3, fpst);
951 }
952 clear_tail(d, opr_sz, simd_maxsz(desc));
953 }
954
void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
956 float_status *fpst, uint32_t desc)
957 {
958 uintptr_t opr_sz = simd_oprsz(desc);
959 float32 *d = vd;
960 float32 *n = vn;
961 float32 *m = vm;
962 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
963 bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
964 uintptr_t i;
965
966 for (i = 0; i < opr_sz / 4; i += 2) {
967 float32 e0 = n[H4(i)];
968 float32 e1 = m[H4(i + 1)];
969 float32 e2 = n[H4(i + 1)];
970 float32 e3 = m[H4(i)];
971
972 if (rot) {
973 e3 = float32_maybe_ah_chs(e3, fpcr_ah);
974 } else {
975 e1 = float32_maybe_ah_chs(e1, fpcr_ah);
976 }
977
978 d[H4(i)] = float32_add(e0, e1, fpst);
979 d[H4(i + 1)] = float32_add(e2, e3, fpst);
980 }
981 clear_tail(d, opr_sz, simd_maxsz(desc));
982 }
983
void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
985 float_status *fpst, uint32_t desc)
986 {
987 uintptr_t opr_sz = simd_oprsz(desc);
988 float64 *d = vd;
989 float64 *n = vn;
990 float64 *m = vm;
991 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
992 bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
993 uintptr_t i;
994
995 for (i = 0; i < opr_sz / 8; i += 2) {
996 float64 e0 = n[i];
997 float64 e1 = m[i + 1];
998 float64 e2 = n[i + 1];
999 float64 e3 = m[i];
1000
1001 if (rot) {
1002 e3 = float64_maybe_ah_chs(e3, fpcr_ah);
1003 } else {
1004 e1 = float64_maybe_ah_chs(e1, fpcr_ah);
1005 }
1006
1007 d[i] = float64_add(e0, e1, fpst);
1008 d[i + 1] = float64_add(e2, e3, fpst);
1009 }
1010 clear_tail(d, opr_sz, simd_maxsz(desc));
1011 }
1012
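/*
 * Floating-point complex multiply-accumulate. The rotation is encoded in
 * desc as a flip bit (use the imaginary rather than the real part of n)
 * and a negate-imaginary bit. A sketch of the mapping, assuming the usual
 * FCMLA rotations:
 *   rot 0:   d_re += n_re * m_re,   d_im += n_re * m_im
 *   rot 90:  d_re -= n_im * m_im,   d_im += n_im * m_re
 *   rot 180: d_re -= n_re * m_re,   d_im -= n_re * m_im
 *   rot 270: d_re += n_im * m_im,   d_im -= n_im * m_re
 * With FPCR.AH = 0 the negation is applied by flipping the sign bit of
 * the multiplicand (negx); with FPCR.AH = 1 it is applied inside the
 * fused multiply-add via float_muladd_negate_product (negf), so that a
 * NaN input does not have its sign flipped.
 */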
void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
1014 float_status *fpst, uint32_t desc)
1015 {
1016 uintptr_t opr_sz = simd_oprsz(desc);
1017 float16 *d = vd, *n = vn, *m = vm, *a = va;
1018 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1019 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
1020 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1021 uint32_t negf_real = flip ^ negf_imag;
1022 float16 negx_imag, negx_real;
1023 uintptr_t i;
1024
1025 /* With AH=0, use negx; with AH=1 use negf. */
1026 negx_real = (negf_real & ~fpcr_ah) << 15;
1027 negx_imag = (negf_imag & ~fpcr_ah) << 15;
1028 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1029 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1030
1031 for (i = 0; i < opr_sz / 2; i += 2) {
1032 float16 e2 = n[H2(i + flip)];
1033 float16 e1 = m[H2(i + flip)] ^ negx_real;
1034 float16 e4 = e2;
1035 float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag;
1036
1037 d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], negf_real, fpst);
1038 d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], negf_imag, fpst);
1039 }
1040 clear_tail(d, opr_sz, simd_maxsz(desc));
1041 }
1042
void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
1044 float_status *fpst, uint32_t desc)
1045 {
1046 uintptr_t opr_sz = simd_oprsz(desc);
1047 float16 *d = vd, *n = vn, *m = vm, *a = va;
1048 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1049 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1050 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1051 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
1052 uint32_t negf_real = flip ^ negf_imag;
1053 intptr_t elements = opr_sz / sizeof(float16);
1054 intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
1055 float16 negx_imag, negx_real;
1056 intptr_t i, j;
1057
1058 /* With AH=0, use negx; with AH=1 use negf. */
1059 negx_real = (negf_real & ~fpcr_ah) << 15;
1060 negx_imag = (negf_imag & ~fpcr_ah) << 15;
1061 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1062 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1063
1064 for (i = 0; i < elements; i += eltspersegment) {
1065 float16 mr = m[H2(i + 2 * index + 0)];
1066 float16 mi = m[H2(i + 2 * index + 1)];
1067 float16 e1 = negx_real ^ (flip ? mi : mr);
1068 float16 e3 = negx_imag ^ (flip ? mr : mi);
1069
1070 for (j = i; j < i + eltspersegment; j += 2) {
1071 float16 e2 = n[H2(j + flip)];
1072 float16 e4 = e2;
1073
1074 d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], negf_real, fpst);
1075 d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], negf_imag, fpst);
1076 }
1077 }
1078 clear_tail(d, opr_sz, simd_maxsz(desc));
1079 }
1080
void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
1082 float_status *fpst, uint32_t desc)
1083 {
1084 uintptr_t opr_sz = simd_oprsz(desc);
1085 float32 *d = vd, *n = vn, *m = vm, *a = va;
1086 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1087 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
1088 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1089 uint32_t negf_real = flip ^ negf_imag;
1090 float32 negx_imag, negx_real;
1091 uintptr_t i;
1092
1093 /* With AH=0, use negx; with AH=1 use negf. */
1094 negx_real = (negf_real & ~fpcr_ah) << 31;
1095 negx_imag = (negf_imag & ~fpcr_ah) << 31;
1096 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1097 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1098
1099 for (i = 0; i < opr_sz / 4; i += 2) {
1100 float32 e2 = n[H4(i + flip)];
1101 float32 e1 = m[H4(i + flip)] ^ negx_real;
1102 float32 e4 = e2;
1103 float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag;
1104
1105 d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], negf_real, fpst);
1106 d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], negf_imag, fpst);
1107 }
1108 clear_tail(d, opr_sz, simd_maxsz(desc));
1109 }
1110
void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
1112 float_status *fpst, uint32_t desc)
1113 {
1114 uintptr_t opr_sz = simd_oprsz(desc);
1115 float32 *d = vd, *n = vn, *m = vm, *a = va;
1116 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1117 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1118 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1119 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
1120 uint32_t negf_real = flip ^ negf_imag;
1121 intptr_t elements = opr_sz / sizeof(float32);
1122 intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
1123 float32 negx_imag, negx_real;
1124 intptr_t i, j;
1125
1126 /* With AH=0, use negx; with AH=1 use negf. */
1127 negx_real = (negf_real & ~fpcr_ah) << 31;
1128 negx_imag = (negf_imag & ~fpcr_ah) << 31;
1129 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1130 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1131
1132 for (i = 0; i < elements; i += eltspersegment) {
1133 float32 mr = m[H4(i + 2 * index + 0)];
1134 float32 mi = m[H4(i + 2 * index + 1)];
1135 float32 e1 = negx_real ^ (flip ? mi : mr);
1136 float32 e3 = negx_imag ^ (flip ? mr : mi);
1137
1138 for (j = i; j < i + eltspersegment; j += 2) {
1139 float32 e2 = n[H4(j + flip)];
1140 float32 e4 = e2;
1141
1142 d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], negf_real, fpst);
1143 d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], negf_imag, fpst);
1144 }
1145 }
1146 clear_tail(d, opr_sz, simd_maxsz(desc));
1147 }
1148
void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
1150 float_status *fpst, uint32_t desc)
1151 {
1152 uintptr_t opr_sz = simd_oprsz(desc);
1153 float64 *d = vd, *n = vn, *m = vm, *a = va;
1154 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1155 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
1156 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1157 uint32_t negf_real = flip ^ negf_imag;
1158 float64 negx_real, negx_imag;
1159 uintptr_t i;
1160
1161 /* With AH=0, use negx; with AH=1 use negf. */
1162 negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
1163 negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
1164 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1165 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1166
1167 for (i = 0; i < opr_sz / 8; i += 2) {
1168 float64 e2 = n[i + flip];
1169 float64 e1 = m[i + flip] ^ negx_real;
1170 float64 e4 = e2;
1171 float64 e3 = m[i + 1 - flip] ^ negx_imag;
1172
1173 d[i] = float64_muladd(e2, e1, a[i], negf_real, fpst);
1174 d[i + 1] = float64_muladd(e4, e3, a[i + 1], negf_imag, fpst);
1175 }
1176 clear_tail(d, opr_sz, simd_maxsz(desc));
1177 }
1178
1179 /*
1180 * Floating point comparisons producing an integer result (all 1s or all 0s).
1181 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1182 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1183 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
1185 {
1186 return -float16_eq_quiet(op1, op2, stat);
1187 }
1188
static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
1190 {
1191 return -float32_eq_quiet(op1, op2, stat);
1192 }
1193
static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
1195 {
1196 return -float64_eq_quiet(op1, op2, stat);
1197 }
1198
static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
1200 {
1201 return -float16_le(op2, op1, stat);
1202 }
1203
static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
1205 {
1206 return -float32_le(op2, op1, stat);
1207 }
1208
static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
1210 {
1211 return -float64_le(op2, op1, stat);
1212 }
1213
static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
1215 {
1216 return -float16_lt(op2, op1, stat);
1217 }
1218
static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
1220 {
1221 return -float32_lt(op2, op1, stat);
1222 }
1223
static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
1225 {
1226 return -float64_lt(op2, op1, stat);
1227 }
1228
static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
1230 {
1231 return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1232 }
1233
static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1235 {
1236 return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1237 }
1238
static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
1240 {
1241 return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1242 }
1243
static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1245 {
1246 return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1247 }
1248
static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1250 {
1251 return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1252 }
1253
static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
1255 {
1256 return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1257 }
1258
static int16_t vfp_tosszh(float16 x, float_status *fpst)
1260 {
1261 if (float16_is_any_nan(x)) {
1262 float_raise(float_flag_invalid, fpst);
1263 return 0;
1264 }
1265 return float16_to_int16_round_to_zero(x, fpst);
1266 }
1267
static uint16_t vfp_touszh(float16 x, float_status *fpst)
1269 {
1270 if (float16_is_any_nan(x)) {
1271 float_raise(float_flag_invalid, fpst);
1272 return 0;
1273 }
1274 return float16_to_uint16_round_to_zero(x, fpst);
1275 }
1276
1277 #define DO_2OP(NAME, FUNC, TYPE) \
1278 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
1279 { \
1280 intptr_t i, oprsz = simd_oprsz(desc); \
1281 TYPE *d = vd, *n = vn; \
1282 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1283 d[i] = FUNC(n[i], stat); \
1284 } \
1285 clear_tail(d, oprsz, simd_maxsz(desc)); \
1286 }
1287
DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1289 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1290 DO_2OP(gvec_frecpe_rpres_s, helper_recpe_rpres_f32, float32)
1291 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1292
1293 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1294 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1295 DO_2OP(gvec_frsqrte_rpres_s, helper_rsqrte_rpres_f32, float32)
1296 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1297
1298 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1299 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1300
1301 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1302 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1303 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1304 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1305 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1306 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1307 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1308 DO_2OP(gvec_touszh, vfp_touszh, float16)
1309
1310 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
1311 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
1312 { \
1313 return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
1314 }
1315
1316 #define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
1317 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
1318 { \
1319 return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
1320 }
1321
1322 #define DO_2OP_CMP0(FN, CMPOP, DIRN) \
1323 WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
1324 WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
1325 WRAP_CMP0_##DIRN(FN, CMPOP, float64) \
1326 DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
1327 DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) \
1328 DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)
1329
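/*
 * Compare-against-zero: clt and cle reuse the cgt/cge comparisons with
 * the operands reversed, e.g. float32_clt0(x) computes float32_cgt(0, x).
 */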
1330 DO_2OP_CMP0(cgt, cgt, FWD)
1331 DO_2OP_CMP0(cge, cge, FWD)
1332 DO_2OP_CMP0(ceq, ceq, FWD)
1333 DO_2OP_CMP0(clt, cgt, REV)
1334 DO_2OP_CMP0(cle, cge, REV)
1335
1336 #undef DO_2OP
1337 #undef DO_2OP_CMP0
1338
1339 /* Floating-point trigonometric starting value.
1340 * See the ARM ARM pseudocode function FPTrigSMul.
1341 */
1342 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1343 {
1344 float16 result = float16_mul(op1, op1, stat);
1345 if (!float16_is_any_nan(result)) {
1346 result = float16_set_sign(result, op2 & 1);
1347 }
1348 return result;
1349 }
1350
static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1352 {
1353 float32 result = float32_mul(op1, op1, stat);
1354 if (!float32_is_any_nan(result)) {
1355 result = float32_set_sign(result, op2 & 1);
1356 }
1357 return result;
1358 }
1359
static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1361 {
1362 float64 result = float64_mul(op1, op1, stat);
1363 if (!float64_is_any_nan(result)) {
1364 result = float64_set_sign(result, op2 & 1);
1365 }
1366 return result;
1367 }
1368
static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1370 {
1371 return float16_abs(float16_sub(op1, op2, stat));
1372 }
1373
static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1375 {
1376 return float32_abs(float32_sub(op1, op2, stat));
1377 }
1378
static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
1380 {
1381 return float64_abs(float64_sub(op1, op2, stat));
1382 }
1383
1384 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat)
1386 {
1387 float16 r = float16_sub(op1, op2, stat);
1388 return float16_is_any_nan(r) ? r : float16_abs(r);
1389 }
1390
static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat)
1392 {
1393 float32 r = float32_sub(op1, op2, stat);
1394 return float32_is_any_nan(r) ? r : float32_abs(r);
1395 }
1396
static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat)
1398 {
1399 float64 r = float64_sub(op1, op2, stat);
1400 return float64_is_any_nan(r) ? r : float64_abs(r);
1401 }
1402
1403 /*
1404 * Reciprocal step. These are the AArch32 version which uses a
1405 * non-fused multiply-and-subtract.
1406 */
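/*
 * (2 - op1 * op2) is the correction factor of a Newton-Raphson step for
 * 1/x: given an estimate e of 1/d, e * (2 - d * e) is a better estimate.
 */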
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1408 {
1409 op1 = float16_squash_input_denormal(op1, stat);
1410 op2 = float16_squash_input_denormal(op2, stat);
1411
1412 if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1413 (float16_is_infinity(op2) && float16_is_zero(op1))) {
1414 return float16_two;
1415 }
1416 return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1417 }
1418
static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1420 {
1421 op1 = float32_squash_input_denormal(op1, stat);
1422 op2 = float32_squash_input_denormal(op2, stat);
1423
1424 if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1425 (float32_is_infinity(op2) && float32_is_zero(op1))) {
1426 return float32_two;
1427 }
1428 return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1429 }
1430
1431 /* Reciprocal square-root step. AArch32 non-fused semantics. */
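/*
 * (3 - op1 * op2) / 2 is likewise the Newton-Raphson correction factor
 * for 1/sqrt(x): given an estimate e of 1/sqrt(d), e * (3 - d * e * e) / 2
 * is a better estimate.
 */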
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1433 {
1434 op1 = float16_squash_input_denormal(op1, stat);
1435 op2 = float16_squash_input_denormal(op2, stat);
1436
1437 if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1438 (float16_is_infinity(op2) && float16_is_zero(op1))) {
1439 return float16_one_point_five;
1440 }
1441 op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1442 return float16_div(op1, float16_two, stat);
1443 }
1444
static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1446 {
1447 op1 = float32_squash_input_denormal(op1, stat);
1448 op2 = float32_squash_input_denormal(op2, stat);
1449
1450 if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1451 (float32_is_infinity(op2) && float32_is_zero(op1))) {
1452 return float32_one_point_five;
1453 }
1454 op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1455 return float32_div(op1, float32_two, stat);
1456 }
1457
1458 #define DO_3OP(NAME, FUNC, TYPE) \
1459 void HELPER(NAME)(void *vd, void *vn, void *vm, \
1460 float_status *stat, uint32_t desc) \
1461 { \
1462 intptr_t i, oprsz = simd_oprsz(desc); \
1463 TYPE *d = vd, *n = vn, *m = vm; \
1464 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1465 d[i] = FUNC(n[i], m[i], stat); \
1466 } \
1467 clear_tail(d, oprsz, simd_maxsz(desc)); \
1468 }
1469
DO_3OP(gvec_fadd_b16, bfloat16_add, float16)
1471 DO_3OP(gvec_fadd_h, float16_add, float16)
1472 DO_3OP(gvec_fadd_s, float32_add, float32)
1473 DO_3OP(gvec_fadd_d, float64_add, float64)
1474 DO_3OP(gvec_bfadd, bfloat16_add, bfloat16)
1475
1476 DO_3OP(gvec_fsub_b16, bfloat16_sub, float16)
1477 DO_3OP(gvec_fsub_h, float16_sub, float16)
1478 DO_3OP(gvec_fsub_s, float32_sub, float32)
1479 DO_3OP(gvec_fsub_d, float64_sub, float64)
1480 DO_3OP(gvec_bfsub, bfloat16_sub, bfloat16)
1481
1482 DO_3OP(gvec_fmul_b16, bfloat16_mul, float16)
1483 DO_3OP(gvec_fmul_h, float16_mul, float16)
1484 DO_3OP(gvec_fmul_s, float32_mul, float32)
1485 DO_3OP(gvec_fmul_d, float64_mul, float64)
1486
1487 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1488 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1489 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1490
1491 DO_3OP(gvec_fabd_h, float16_abd, float16)
1492 DO_3OP(gvec_fabd_s, float32_abd, float32)
1493 DO_3OP(gvec_fabd_d, float64_abd, float64)
1494
1495 DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16)
1496 DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32)
1497 DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64)
1498
1499 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1500 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1501 DO_3OP(gvec_fceq_d, float64_ceq, float64)
1502
1503 DO_3OP(gvec_fcge_h, float16_cge, float16)
1504 DO_3OP(gvec_fcge_s, float32_cge, float32)
1505 DO_3OP(gvec_fcge_d, float64_cge, float64)
1506
1507 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1508 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1509 DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1510
1511 DO_3OP(gvec_facge_h, float16_acge, float16)
1512 DO_3OP(gvec_facge_s, float32_acge, float32)
1513 DO_3OP(gvec_facge_d, float64_acge, float64)
1514
1515 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1516 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1517 DO_3OP(gvec_facgt_d, float64_acgt, float64)
1518
1519 DO_3OP(gvec_fmax_h, float16_max, float16)
1520 DO_3OP(gvec_fmax_s, float32_max, float32)
1521 DO_3OP(gvec_fmax_d, float64_max, float64)
1522
1523 DO_3OP(gvec_fmin_h, float16_min, float16)
1524 DO_3OP(gvec_fmin_s, float32_min, float32)
1525 DO_3OP(gvec_fmin_d, float64_min, float64)
1526
1527 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1528 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1529 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1530
1531 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1532 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1533 DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1534
1535 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1536 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1537
1538 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1539 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1540
1541 #ifdef TARGET_AARCH64
1542 DO_3OP(gvec_fdiv_h, float16_div, float16)
1543 DO_3OP(gvec_fdiv_s, float32_div, float32)
1544 DO_3OP(gvec_fdiv_d, float64_div, float64)
1545
1546 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1547 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1548 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1549
1550 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1551 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1552 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1553
1554 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1555 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1556 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1557
1558 DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16)
1559 DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32)
1560 DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64)
1561
1562 DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16)
1563 DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32)
1564 DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64)
1565
1566 DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
1567 DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
1568 DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)
1569
1570 DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
1571 DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
1572 DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)
1573
1574 DO_3OP(gvec_fmax_b16, bfloat16_max, bfloat16)
1575 DO_3OP(gvec_fmin_b16, bfloat16_min, bfloat16)
1576 DO_3OP(gvec_fmaxnum_b16, bfloat16_maxnum, bfloat16)
1577 DO_3OP(gvec_fminnum_b16, bfloat16_minnum, bfloat16)
1578 DO_3OP(gvec_ah_fmax_b16, helper_sme2_ah_fmax_b16, bfloat16)
1579 DO_3OP(gvec_ah_fmin_b16, helper_sme2_ah_fmin_b16, bfloat16)
1580
1581 #endif
1582 #undef DO_3OP
1583
1584 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1585 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1586 float_status *stat)
1587 {
1588 return float16_add(dest, float16_mul(op1, op2, stat), stat);
1589 }
1590
1591 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1592 float_status *stat)
1593 {
1594 return float32_add(dest, float32_mul(op1, op2, stat), stat);
1595 }
1596
1597 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1598 float_status *stat)
1599 {
1600 return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1601 }
1602
1603 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1604 float_status *stat)
1605 {
1606 return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1607 }
1608
1609 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1610 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1611 float_status *stat)
1612 {
1613 return float16_muladd(op1, op2, dest, 0, stat);
1614 }
1615
1616 static bfloat16 bfloat16_muladd_f(bfloat16 dest, bfloat16 op1, bfloat16 op2,
1617 float_status *stat)
1618 {
1619 return bfloat16_muladd(op1, op2, dest, 0, stat);
1620 }
1621
1622 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1623 float_status *stat)
1624 {
1625 return float32_muladd(op1, op2, dest, 0, stat);
1626 }
1627
1628 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1629 float_status *stat)
1630 {
1631 return float64_muladd(op1, op2, dest, 0, stat);
1632 }
1633
1634 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1635 float_status *stat)
1636 {
1637 return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1638 }
1639
1640 static bfloat16 bfloat16_mulsub_f(bfloat16 dest, bfloat16 op1, bfloat16 op2,
1641 float_status *stat)
1642 {
1643 return bfloat16_muladd(bfloat16_chs(op1), op2, dest, 0, stat);
1644 }
1645
1646 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1647 float_status *stat)
1648 {
1649 return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1650 }
1651
1652 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1653 float_status *stat)
1654 {
1655 return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1656 }
1657
1658 static float16 float16_ah_mulsub_f(float16 dest, float16 op1, float16 op2,
1659 float_status *stat)
1660 {
1661 return float16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1662 }
1663
1664 static bfloat16 bfloat16_ah_mulsub_f(bfloat16 dest, bfloat16 op1, bfloat16 op2,
1665 float_status *stat)
1666 {
1667 return bfloat16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1668 }
1669
1670 static float32 float32_ah_mulsub_f(float32 dest, float32 op1, float32 op2,
1671 float_status *stat)
1672 {
1673 return float32_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1674 }
1675
1676 static float64 float64_ah_mulsub_f(float64 dest, float64 op1, float64 op2,
1677 float_status *stat)
1678 {
1679 return float64_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1680 }
1681
1682 #define DO_MULADD(NAME, FUNC, TYPE) \
1683 void HELPER(NAME)(void *vd, void *vn, void *vm, \
1684 float_status *stat, uint32_t desc) \
1685 { \
1686 intptr_t i, oprsz = simd_oprsz(desc); \
1687 TYPE *d = vd, *n = vn, *m = vm; \
1688 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1689 d[i] = FUNC(d[i], n[i], m[i], stat); \
1690 } \
1691 clear_tail(d, oprsz, simd_maxsz(desc)); \
1692 }
1693
1694 DO_MULADD(gvec_fmla_nf_h, float16_muladd_nf, float16)
1695 DO_MULADD(gvec_fmla_nf_s, float32_muladd_nf, float32)
1696
1697 DO_MULADD(gvec_fmls_nf_h, float16_mulsub_nf, float16)
1698 DO_MULADD(gvec_fmls_nf_s, float32_mulsub_nf, float32)
1699
1700 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1701 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1702 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1703 DO_MULADD(gvec_bfmla, bfloat16_muladd_f, bfloat16)
1704
1705 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1706 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1707 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1708 DO_MULADD(gvec_bfmls, bfloat16_mulsub_f, bfloat16)
1709
1710 DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16)
1711 DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32)
1712 DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64)
1713 DO_MULADD(gvec_ah_bfmls, bfloat16_ah_mulsub_f, bfloat16)
1714
1715 #undef DO_MULADD
1716
1717 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1718 * For AdvSIMD, there is of course only one such vector segment.
1719 */
1720
1721 #define DO_MUL_IDX(NAME, TYPE, H) \
1722 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1723 { \
1724 intptr_t i, j, oprsz = simd_oprsz(desc); \
1725 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1726 intptr_t idx = simd_data(desc); \
1727 TYPE *d = vd, *n = vn, *m = vm; \
1728 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1729 TYPE mm = m[H(i + idx)]; \
1730 for (j = 0; j < segment; j++) { \
1731 d[i + j] = n[i + j] * mm; \
1732 } \
1733 } \
1734 clear_tail(d, oprsz, simd_maxsz(desc)); \
1735 }
1736
1737 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1738 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1739 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1740
1741 #undef DO_MUL_IDX
1742
1743 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1744 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1745 { \
1746 intptr_t i, j, oprsz = simd_oprsz(desc); \
1747 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1748 intptr_t idx = simd_data(desc); \
1749 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1750 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1751 TYPE mm = m[H(i + idx)]; \
1752 for (j = 0; j < segment; j++) { \
1753 d[i + j] = a[i + j] OP n[i + j] * mm; \
1754 } \
1755 } \
1756 clear_tail(d, oprsz, simd_maxsz(desc)); \
1757 }
1758
1759 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1760 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1761 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1762
1763 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1764 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1765 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1766
1767 #undef DO_MLA_IDX
1768
1769 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \
1770 void HELPER(NAME)(void *vd, void *vn, void *vm, \
1771 float_status *stat, uint32_t desc) \
1772 { \
1773 intptr_t i, j, oprsz = simd_oprsz(desc); \
1774 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1775 intptr_t idx = simd_data(desc); \
1776 TYPE *d = vd, *n = vn, *m = vm; \
1777 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1778 TYPE mm = m[H(i + idx)]; \
1779 for (j = 0; j < segment; j++) { \
1780 d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \
1781 } \
1782 } \
1783 clear_tail(d, oprsz, simd_maxsz(desc)); \
1784 }
1785
1786 #define nop(N, M, S) (M)
1787
1788 DO_FMUL_IDX(gvec_fmul_idx_b16, nop, bfloat16_mul, float16, H2)
1789 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1790 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1791 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1792
1793 #ifdef TARGET_AARCH64
1794
1795 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1796 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1797 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1798
1799 #endif
1800
1801 #undef nop
1802
1803 /*
1804 * Non-fused multiply-accumulate operations, for Neon. NB that unlike the
1805 * fused ops below, these accumulate both from and into Vd (no separate addend).
1806 */
1807 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1808 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1809 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1810 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1811
1812 #undef DO_FMUL_IDX
1813
1814 #define DO_FMLA_IDX(NAME, TYPE, H, NEGX, NEGF) \
1815 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
1816 float_status *stat, uint32_t desc) \
1817 { \
1818 intptr_t i, j, oprsz = simd_oprsz(desc); \
1819 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1820 intptr_t idx = simd_data(desc); \
1821 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1822 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1823 TYPE mm = m[H(i + idx)]; \
1824 for (j = 0; j < segment; j++) { \
1825 d[i + j] = TYPE##_muladd(n[i + j] ^ NEGX, mm, \
1826 a[i + j], NEGF, stat); \
1827 } \
1828 } \
1829 clear_tail(d, oprsz, simd_maxsz(desc)); \
1830 }
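
/*
 * NEGX is XORed into each n element before the multiply, so passing the
 * sign-bit constant (e.g. INT32_MIN) gives the AH == 0 form of FMLS.
 * NEGF goes straight into the muladd flags, so float_muladd_negate_product
 * gives the AH == 1 form, which leaves the sign of a NaN operand alone.
 */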
1831
1832 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0)
1833 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0)
1834 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0)
1835 DO_FMLA_IDX(gvec_bfmla_idx, bfloat16, H2, 0, 0)
1836
1837 DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0)
1838 DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0)
1839 DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0)
1840 DO_FMLA_IDX(gvec_bfmls_idx, bfloat16, H2, INT16_MIN, 0)
1841
1842 DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product)
1843 DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product)
1844 DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product)
1845 DO_FMLA_IDX(gvec_ah_bfmls_idx, bfloat16, H2, 0, float_muladd_negate_product)
1846
1847 #undef DO_FMLA_IDX
1848
1849 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1850 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
1851 { \
1852 intptr_t i, oprsz = simd_oprsz(desc); \
1853 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \
1854 bool q = false; \
1855 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \
1856 WTYPE dd = (WTYPE)n[i] OP m[i]; \
1857 if (dd < MIN) { \
1858 dd = MIN; \
1859 q = true; \
1860 } else if (dd > MAX) { \
1861 dd = MAX; \
1862 q = true; \
1863 } \
1864 d[i] = dd; \
1865 } \
1866 if (q) { \
1867 uint32_t *qc = vq; \
1868 qc[0] = 1; \
1869 } \
1870 clear_tail(d, oprsz, simd_maxsz(desc)); \
1871 }
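
/*
 * WTYPE is wide enough that the unsaturated result cannot wrap (e.g. for
 * gvec_uqadd_b, int easily holds 255 + 255), so the MIN/MAX comparison
 * detects saturation exactly; vq points at the QC flag word, which is set
 * when any lane saturates.
 */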
1872
1873 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1874 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1875 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1876
1877 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1878 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1879 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1880
1881 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1882 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1883 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1884
1885 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1886 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1887 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1888
1889 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1890 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1891 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1892
1893 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1894 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1895 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1896
1897 #undef DO_SAT
1898
1899 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1900 void *vm, uint32_t desc)
1901 {
1902 intptr_t i, oprsz = simd_oprsz(desc);
1903 uint64_t *d = vd, *n = vn, *m = vm;
1904 bool q = false;
1905
1906 for (i = 0; i < oprsz / 8; i++) {
1907 uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1908 if (dd < nn) {
1909 dd = UINT64_MAX;
1910 q = true;
1911 }
1912 d[i] = dd;
1913 }
1914 if (q) {
1915 uint32_t *qc = vq;
1916 qc[0] = 1;
1917 }
1918 clear_tail(d, oprsz, simd_maxsz(desc));
1919 }
1920
1921 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1922 void *vm, uint32_t desc)
1923 {
1924 intptr_t i, oprsz = simd_oprsz(desc);
1925 uint64_t *d = vd, *n = vn, *m = vm;
1926 bool q = false;
1927
1928 for (i = 0; i < oprsz / 8; i++) {
1929 uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1930 if (nn < mm) {
1931 dd = 0;
1932 q = true;
1933 }
1934 d[i] = dd;
1935 }
1936 if (q) {
1937 uint32_t *qc = vq;
1938 qc[0] = 1;
1939 }
1940 clear_tail(d, oprsz, simd_maxsz(desc));
1941 }
1942
1943 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1944 void *vm, uint32_t desc)
1945 {
1946 intptr_t i, oprsz = simd_oprsz(desc);
1947 int64_t *d = vd, *n = vn, *m = vm;
1948 bool q = false;
1949
1950 for (i = 0; i < oprsz / 8; i++) {
1951 int64_t nn = n[i], mm = m[i], dd = nn + mm;
1952 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1953 dd = (nn >> 63) ^ ~INT64_MIN;
1954 q = true;
1955 }
1956 d[i] = dd;
1957 }
1958 if (q) {
1959 uint32_t *qc = vq;
1960 qc[0] = 1;
1961 }
1962 clear_tail(d, oprsz, simd_maxsz(desc));
1963 }
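
/*
 * Signed addition overflows only when both operands have the same sign and
 * the result's sign differs, which is what (dd ^ nn) & ~(nn ^ mm) tests in
 * the sign bit. On overflow we saturate towards the sign of nn: (nn >> 63)
 * is 0 or -1, and XOR with ~INT64_MIN (== INT64_MAX) yields INT64_MAX or
 * INT64_MIN respectively. gvec_sqsub_d below is the same except that
 * overflow requires the operand signs to differ.
 */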
1964
1965 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1966 void *vm, uint32_t desc)
1967 {
1968 intptr_t i, oprsz = simd_oprsz(desc);
1969 int64_t *d = vd, *n = vn, *m = vm;
1970 bool q = false;
1971
1972 for (i = 0; i < oprsz / 8; i++) {
1973 int64_t nn = n[i], mm = m[i], dd = nn - mm;
1974 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1975 dd = (nn >> 63) ^ ~INT64_MIN;
1976 q = true;
1977 }
1978 d[i] = dd;
1979 }
1980 if (q) {
1981 uint32_t *qc = vq;
1982 qc[0] = 1;
1983 }
1984 clear_tail(d, oprsz, simd_maxsz(desc));
1985 }
1986
1987 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1988 void *vm, uint32_t desc)
1989 {
1990 intptr_t i, oprsz = simd_oprsz(desc);
1991 uint64_t *d = vd, *n = vn, *m = vm;
1992 bool q = false;
1993
1994 for (i = 0; i < oprsz / 8; i++) {
1995 uint64_t nn = n[i];
1996 int64_t mm = m[i];
1997 uint64_t dd = nn + mm;
1998
1999 if (mm < 0) {
2000 if (nn < (uint64_t)-mm) {
2001 dd = 0;
2002 q = true;
2003 }
2004 } else {
2005 if (dd < nn) {
2006 dd = UINT64_MAX;
2007 q = true;
2008 }
2009 }
2010 d[i] = dd;
2011 }
2012 if (q) {
2013 uint32_t *qc = vq;
2014 qc[0] = 1;
2015 }
2016 clear_tail(d, oprsz, simd_maxsz(desc));
2017 }
2018
2019 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
2020 void *vm, uint32_t desc)
2021 {
2022 intptr_t i, oprsz = simd_oprsz(desc);
2023 uint64_t *d = vd, *n = vn, *m = vm;
2024 bool q = false;
2025
2026 for (i = 0; i < oprsz / 8; i++) {
2027 int64_t nn = n[i];
2028 uint64_t mm = m[i];
2029 int64_t dd = nn + mm;
2030
2031 if (mm > (uint64_t)(INT64_MAX - nn)) {
2032 dd = INT64_MAX;
2033 q = true;
2034 }
2035 d[i] = dd;
2036 }
2037 if (q) {
2038 uint32_t *qc = vq;
2039 qc[0] = 1;
2040 }
2041 clear_tail(d, oprsz, simd_maxsz(desc));
2042 }
2043
2044 #define DO_SRA(NAME, TYPE) \
2045 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2046 { \
2047 intptr_t i, oprsz = simd_oprsz(desc); \
2048 int shift = simd_data(desc); \
2049 TYPE *d = vd, *n = vn; \
2050 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2051 d[i] += n[i] >> shift; \
2052 } \
2053 clear_tail(d, oprsz, simd_maxsz(desc)); \
2054 }
2055
2056 DO_SRA(gvec_ssra_b, int8_t)
2057 DO_SRA(gvec_ssra_h, int16_t)
2058 DO_SRA(gvec_ssra_s, int32_t)
2059 DO_SRA(gvec_ssra_d, int64_t)
2060
2061 DO_SRA(gvec_usra_b, uint8_t)
2062 DO_SRA(gvec_usra_h, uint16_t)
2063 DO_SRA(gvec_usra_s, uint32_t)
2064 DO_SRA(gvec_usra_d, uint64_t)
2065
2066 #undef DO_SRA
2067
2068 #define DO_RSHR(NAME, TYPE) \
2069 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2070 { \
2071 intptr_t i, oprsz = simd_oprsz(desc); \
2072 int shift = simd_data(desc); \
2073 TYPE *d = vd, *n = vn; \
2074 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2075 TYPE tmp = n[i] >> (shift - 1); \
2076 d[i] = (tmp >> 1) + (tmp & 1); \
2077 } \
2078 clear_tail(d, oprsz, simd_maxsz(desc)); \
2079 }
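
/*
 * This is (n + (1 << (shift - 1))) >> shift, i.e. a rounding shift, done
 * in two steps so the intermediate sum cannot overflow TYPE: tmp keeps the
 * last bit shifted out in bit 0 and adds it back in.
 * E.g. n == 7, shift == 2: tmp == 3, result == (3 >> 1) + 1 == 2.
 */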
2080
2081 DO_RSHR(gvec_srshr_b, int8_t)
2082 DO_RSHR(gvec_srshr_h, int16_t)
2083 DO_RSHR(gvec_srshr_s, int32_t)
2084 DO_RSHR(gvec_srshr_d, int64_t)
2085
2086 DO_RSHR(gvec_urshr_b, uint8_t)
2087 DO_RSHR(gvec_urshr_h, uint16_t)
2088 DO_RSHR(gvec_urshr_s, uint32_t)
2089 DO_RSHR(gvec_urshr_d, uint64_t)
2090
2091 #undef DO_RSHR
2092
2093 #define DO_RSRA(NAME, TYPE) \
2094 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2095 { \
2096 intptr_t i, oprsz = simd_oprsz(desc); \
2097 int shift = simd_data(desc); \
2098 TYPE *d = vd, *n = vn; \
2099 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2100 TYPE tmp = n[i] >> (shift - 1); \
2101 d[i] += (tmp >> 1) + (tmp & 1); \
2102 } \
2103 clear_tail(d, oprsz, simd_maxsz(desc)); \
2104 }
2105
2106 DO_RSRA(gvec_srsra_b, int8_t)
2107 DO_RSRA(gvec_srsra_h, int16_t)
2108 DO_RSRA(gvec_srsra_s, int32_t)
2109 DO_RSRA(gvec_srsra_d, int64_t)
2110
2111 DO_RSRA(gvec_ursra_b, uint8_t)
2112 DO_RSRA(gvec_ursra_h, uint16_t)
2113 DO_RSRA(gvec_ursra_s, uint32_t)
2114 DO_RSRA(gvec_ursra_d, uint64_t)
2115
2116 #undef DO_RSRA
2117
2118 #define DO_SRI(NAME, TYPE) \
2119 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2120 { \
2121 intptr_t i, oprsz = simd_oprsz(desc); \
2122 int shift = simd_data(desc); \
2123 TYPE *d = vd, *n = vn; \
2124 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2125 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
2126 } \
2127 clear_tail(d, oprsz, simd_maxsz(desc)); \
2128 }
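
/*
 * SRI keeps the top 'shift' bits of each d element and replaces the rest
 * with the shifted-in value; e.g. for bytes with shift == 3,
 * d[i] = (d[i] & 0xe0) | (n[i] >> 3).
 */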
2129
2130 DO_SRI(gvec_sri_b, uint8_t)
2131 DO_SRI(gvec_sri_h, uint16_t)
2132 DO_SRI(gvec_sri_s, uint32_t)
2133 DO_SRI(gvec_sri_d, uint64_t)
2134
2135 #undef DO_SRI
2136
2137 #define DO_SLI(NAME, TYPE) \
2138 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2139 { \
2140 intptr_t i, oprsz = simd_oprsz(desc); \
2141 int shift = simd_data(desc); \
2142 TYPE *d = vd, *n = vn; \
2143 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2144 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
2145 } \
2146 clear_tail(d, oprsz, simd_maxsz(desc)); \
2147 }
2148
2149 DO_SLI(gvec_sli_b, uint8_t)
2150 DO_SLI(gvec_sli_h, uint16_t)
2151 DO_SLI(gvec_sli_s, uint32_t)
2152 DO_SLI(gvec_sli_d, uint64_t)
2153
2154 #undef DO_SLI
2155
2156 /*
2157 * Convert float16 to float32, raising no exceptions and
2158 * preserving exceptional values, including SNaN.
2159 * This is effectively an unpack+repack operation.
2160 */
2161 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
2162 {
2163 const int f16_bias = 15;
2164 const int f32_bias = 127;
2165 uint32_t sign = extract32(f16, 15, 1);
2166 uint32_t exp = extract32(f16, 10, 5);
2167 uint32_t frac = extract32(f16, 0, 10);
2168
2169 if (exp == 0x1f) {
2170 /* Inf or NaN */
2171 exp = 0xff;
2172 } else if (exp == 0) {
2173 /* Zero or denormal. */
2174 if (frac != 0) {
2175 if (fz16) {
2176 frac = 0;
2177 } else {
2178 /*
2179 * Denormal; these are all normal float32.
2180 * Shift the fraction so that the msb is at bit 11,
2181 * then remove bit 11 as the implicit bit of the
2182 * normalized float32. Note that we still go through
2183 * the shift for normal numbers below, to put the
2184 * float32 fraction at the right place.
2185 */
2186 int shift = clz32(frac) - 21;
2187 frac = (frac << shift) & 0x3ff;
2188 exp = f32_bias - f16_bias - shift + 1;
2189 }
2190 }
2191 } else {
2192 /* Normal number; adjust the bias. */
2193 exp += f32_bias - f16_bias;
2194 }
2195 sign <<= 31;
2196 exp <<= 23;
2197 frac <<= 23 - 10;
2198
2199 return sign | exp | frac;
2200 }
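
/*
 * E.g. the smallest f16 denormal, 0x0001: clz32(1) == 31, so shift == 10,
 * frac becomes 0 and exp becomes 127 - 15 - 10 + 1 == 103, giving the
 * float32 0x33800000 == 2^-24, which is exactly the input value.
 */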
2201
2202 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2203 {
2204 /*
2205 * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2206 * Load the 2nd qword iff is_q & is_2.
2207 * Shift to the 2nd dword iff !is_q & is_2.
2208 * For !is_q & !is_2, the upper bits of the result are garbage.
2209 */
2210 return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2211 }
2212
2213 /*
2214 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2215 * as there are not yet SVE versions that might use blocking.
2216 */
2217
2218 static void do_fmlal(float32 *d, void *vn, void *vm,
2219 CPUARMState *env, uint32_t desc,
2220 ARMFPStatusFlavour fpst_idx,
2221 uint64_t negx, int negf)
2222 {
2223 float_status *fpst = &env->vfp.fp_status[fpst_idx];
2224 bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2225 intptr_t i, oprsz = simd_oprsz(desc);
2226 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2227 int is_q = oprsz == 16;
2228 uint64_t n_4, m_4;
2229
2230 /*
2231 * Pre-load all of the f16 data, avoiding overlap issues.
2232 * Negate all inputs for AH=0 FMLSL at once.
2233 */
2234 n_4 = load4_f16(vn, is_q, is_2) ^ negx;
2235 m_4 = load4_f16(vm, is_q, is_2);
2236
2237 for (i = 0; i < oprsz / 4; i++) {
2238 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2239 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2240 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
2241 }
2242 clear_tail(d, oprsz, simd_maxsz(desc));
2243 }
2244
2245 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2246 CPUARMState *env, uint32_t desc)
2247 {
2248 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2249 uint64_t negx = is_s ? 0x8000800080008000ull : 0;
2250
2251 do_fmlal(vd, vn, vm, env, desc, FPST_STD, negx, 0);
2252 }
2253
2254 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2255 CPUARMState *env, uint32_t desc)
2256 {
2257 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2258 uint64_t negx = 0;
2259 int negf = 0;
2260
2261 if (is_s) {
2262 if (env->vfp.fpcr & FPCR_AH) {
2263 negf = float_muladd_negate_product;
2264 } else {
2265 negx = 0x8000800080008000ull;
2266 }
2267 }
2268 do_fmlal(vd, vn, vm, env, desc, FPST_A64, negx, negf);
2269 }
2270
2271 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2272 CPUARMState *env, uint32_t desc)
2273 {
2274 intptr_t i, oprsz = simd_oprsz(desc);
2275 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2276 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2277 bool za = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
2278 float_status *status = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64];
2279 bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2280 int negx = 0, negf = 0;
2281
2282 if (is_s) {
2283 if (env->vfp.fpcr & FPCR_AH) {
2284 negf = float_muladd_negate_product;
2285 } else {
2286 negx = 0x8000;
2287 }
2288 }
2289
2290 for (i = 0; i < oprsz; i += sizeof(float32)) {
2291 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negx;
2292 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2293 float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2294 float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2295 float32 aa = *(float32 *)(va + H1_4(i));
2296
2297 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, negf, status);
2298 }
2299 }
2300
2301 static void do_fmlal_idx(float32 *d, void *vn, void *vm,
2302 CPUARMState *env, uint32_t desc,
2303 ARMFPStatusFlavour fpst_idx,
2304 uint64_t negx, int negf)
2305 {
2306 float_status *fpst = &env->vfp.fp_status[fpst_idx];
2307 bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2308 intptr_t i, oprsz = simd_oprsz(desc);
2309 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2310 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2311 int is_q = oprsz == 16;
2312 uint64_t n_4;
2313 float32 m_1;
2314
2315 /*
2316 * Pre-load all of the f16 data, avoiding overlap issues.
2317 * Negate all inputs for AH=0 FMLSL at once.
2318 */
2319 n_4 = load4_f16(vn, is_q, is_2) ^ negx;
2320 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2321
2322 for (i = 0; i < oprsz / 4; i++) {
2323 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2324 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
2325 }
2326 clear_tail(d, oprsz, simd_maxsz(desc));
2327 }
2328
2329 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2330 CPUARMState *env, uint32_t desc)
2331 {
2332 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2333 uint64_t negx = is_s ? 0x8000800080008000ull : 0;
2334
2335 do_fmlal_idx(vd, vn, vm, env, desc, FPST_STD, negx, 0);
2336 }
2337
2338 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2339 CPUARMState *env, uint32_t desc)
2340 {
2341 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2342 uint64_t negx = 0;
2343 int negf = 0;
2344
2345 if (is_s) {
2346 if (env->vfp.fpcr & FPCR_AH) {
2347 negf = float_muladd_negate_product;
2348 } else {
2349 negx = 0x8000800080008000ull;
2350 }
2351 }
2352 do_fmlal_idx(vd, vn, vm, env, desc, FPST_A64, negx, negf);
2353 }
2354
2355 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2356 CPUARMState *env, uint32_t desc)
2357 {
2358 intptr_t i, j, oprsz = simd_oprsz(desc);
2359 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2360 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2361 bool za = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
2362 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 3, 3) * sizeof(float16);
2363 float_status *status = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64];
2364 bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2365 int negx = 0, negf = 0;
2366
2367 if (is_s) {
2368 if (env->vfp.fpcr & FPCR_AH) {
2369 negf = float_muladd_negate_product;
2370 } else {
2371 negx = 0x8000;
2372 }
2373 }
2374 for (i = 0; i < oprsz; i += 16) {
2375 float16 mm_16 = *(float16 *)(vm + i + idx);
2376 float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2377
2378 for (j = 0; j < 16; j += sizeof(float32)) {
2379 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negx;
2380 float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2381 float32 aa = *(float32 *)(va + H1_4(i + j));
2382
2383 *(float32 *)(vd + H1_4(i + j)) =
2384 float32_muladd(nn, mm, aa, negf, status);
2385 }
2386 }
2387 }
2388
2389 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2390 {
2391 intptr_t i, opr_sz = simd_oprsz(desc);
2392 int8_t *d = vd, *n = vn, *m = vm;
2393
2394 for (i = 0; i < opr_sz; ++i) {
2395 int8_t mm = m[i];
2396 int8_t nn = n[i];
2397 int8_t res = 0;
2398 if (mm >= 0) {
2399 if (mm < 8) {
2400 res = nn << mm;
2401 }
2402 } else {
2403 res = nn >> (mm > -8 ? -mm : 7);
2404 }
2405 d[i] = res;
2406 }
2407 clear_tail(d, opr_sz, simd_maxsz(desc));
2408 }
2409
2410 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2411 {
2412 intptr_t i, opr_sz = simd_oprsz(desc);
2413 int16_t *d = vd, *n = vn, *m = vm;
2414
2415 for (i = 0; i < opr_sz / 2; ++i) {
2416 int8_t mm = m[i]; /* only 8 bits of shift are significant */
2417 int16_t nn = n[i];
2418 int16_t res = 0;
2419 if (mm >= 0) {
2420 if (mm < 16) {
2421 res = nn << mm;
2422 }
2423 } else {
2424 res = nn >> (mm > -16 ? -mm : 15);
2425 }
2426 d[i] = res;
2427 }
2428 clear_tail(d, opr_sz, simd_maxsz(desc));
2429 }
2430
2431 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2432 {
2433 intptr_t i, opr_sz = simd_oprsz(desc);
2434 uint8_t *d = vd, *n = vn, *m = vm;
2435
2436 for (i = 0; i < opr_sz; ++i) {
2437 int8_t mm = m[i];
2438 uint8_t nn = n[i];
2439 uint8_t res = 0;
2440 if (mm >= 0) {
2441 if (mm < 8) {
2442 res = nn << mm;
2443 }
2444 } else {
2445 if (mm > -8) {
2446 res = nn >> -mm;
2447 }
2448 }
2449 d[i] = res;
2450 }
2451 clear_tail(d, opr_sz, simd_maxsz(desc));
2452 }
2453
2454 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2455 {
2456 intptr_t i, opr_sz = simd_oprsz(desc);
2457 uint16_t *d = vd, *n = vn, *m = vm;
2458
2459 for (i = 0; i < opr_sz / 2; ++i) {
2460 int8_t mm = m[i]; /* only 8 bits of shift are significant */
2461 uint16_t nn = n[i];
2462 uint16_t res = 0;
2463 if (mm >= 0) {
2464 if (mm < 16) {
2465 res = nn << mm;
2466 }
2467 } else {
2468 if (mm > -16) {
2469 res = nn >> -mm;
2470 }
2471 }
2472 d[i] = res;
2473 }
2474 clear_tail(d, opr_sz, simd_maxsz(desc));
2475 }
2476
2477 /*
2478 * 8x8->8 polynomial multiply.
2479 *
2480 * Polynomial multiplication is like integer multiplication except the
2481 * partial products are XORed, not added.
2482 *
2483 * TODO: expose this as a generic vector operation, as it is a common
2484 * crypto building block.
2485 */
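/*
 * E.g. 0x03 * 0x03: the partial products 0x03 and 0x06 are XORed to give
 * 0x05, where integer multiplication would have produced 9.
 */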
2486 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2487 {
2488 intptr_t i, opr_sz = simd_oprsz(desc);
2489 uint64_t *d = vd, *n = vn, *m = vm;
2490
2491 for (i = 0; i < opr_sz / 8; ++i) {
2492 d[i] = clmul_8x8_low(n[i], m[i]);
2493 }
2494 clear_tail(d, opr_sz, simd_maxsz(desc));
2495 }
2496
2497 /*
2498 * 64x64->128 polynomial multiply.
2499 * Because the lanes are not accessed in strict columns,
2500 * this probably cannot be turned into a generic helper.
2501 */
2502 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2503 {
2504 intptr_t i, opr_sz = simd_oprsz(desc);
2505 intptr_t hi = simd_data(desc);
2506 uint64_t *d = vd, *n = vn, *m = vm;
2507
2508 for (i = 0; i < opr_sz / 8; i += 2) {
2509 Int128 r = clmul_64(n[i + hi], m[i + hi]);
2510 d[i] = int128_getlo(r);
2511 d[i + 1] = int128_gethi(r);
2512 }
2513 clear_tail(d, opr_sz, simd_maxsz(desc));
2514 }
2515
2516 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2517 {
2518 int hi = simd_data(desc);
2519 uint64_t *d = vd, *n = vn, *m = vm;
2520 uint64_t nn = n[hi], mm = m[hi];
2521
2522 d[0] = clmul_8x4_packed(nn, mm);
2523 nn >>= 32;
2524 mm >>= 32;
2525 d[1] = clmul_8x4_packed(nn, mm);
2526
2527 clear_tail(d, 16, simd_maxsz(desc));
2528 }
2529
2530 #ifdef TARGET_AARCH64
2531 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2532 {
2533 int shift = simd_data(desc) * 8;
2534 intptr_t i, opr_sz = simd_oprsz(desc);
2535 uint64_t *d = vd, *n = vn, *m = vm;
2536
2537 for (i = 0; i < opr_sz / 8; ++i) {
2538 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2539 }
2540 }
2541
2542 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2543 {
2544 intptr_t sel = H4(simd_data(desc));
2545 intptr_t i, opr_sz = simd_oprsz(desc);
2546 uint32_t *n = vn, *m = vm;
2547 uint64_t *d = vd;
2548
2549 for (i = 0; i < opr_sz / 8; ++i) {
2550 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2551 }
2552 }
2553 #endif
2554
2555 #define DO_CMP0(NAME, TYPE, OP) \
2556 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2557 { \
2558 intptr_t i, opr_sz = simd_oprsz(desc); \
2559 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
2560 TYPE nn = *(TYPE *)(vn + i); \
2561 *(TYPE *)(vd + i) = -(nn OP 0); \
2562 } \
2563 clear_tail(vd, opr_sz, simd_maxsz(desc)); \
2564 }
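
/*
 * The relational expression yields 0 or 1; negating it produces the
 * all-zeros or all-ones element mask that the compare-against-zero
 * instructions define.
 */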
2565
2566 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2567 DO_CMP0(gvec_clt0_b, int8_t, <)
2568 DO_CMP0(gvec_cle0_b, int8_t, <=)
2569 DO_CMP0(gvec_cgt0_b, int8_t, >)
2570 DO_CMP0(gvec_cge0_b, int8_t, >=)
2571
2572 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2573 DO_CMP0(gvec_clt0_h, int16_t, <)
2574 DO_CMP0(gvec_cle0_h, int16_t, <=)
2575 DO_CMP0(gvec_cgt0_h, int16_t, >)
2576 DO_CMP0(gvec_cge0_h, int16_t, >=)
2577
2578 #undef DO_CMP0
2579
2580 #define DO_ABD(NAME, TYPE) \
2581 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2582 { \
2583 intptr_t i, opr_sz = simd_oprsz(desc); \
2584 TYPE *d = vd, *n = vn, *m = vm; \
2585 \
2586 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
2587 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2588 } \
2589 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2590 }
2591
2592 DO_ABD(gvec_sabd_b, int8_t)
2593 DO_ABD(gvec_sabd_h, int16_t)
2594 DO_ABD(gvec_sabd_s, int32_t)
2595 DO_ABD(gvec_sabd_d, int64_t)
2596
2597 DO_ABD(gvec_uabd_b, uint8_t)
2598 DO_ABD(gvec_uabd_h, uint16_t)
2599 DO_ABD(gvec_uabd_s, uint32_t)
2600 DO_ABD(gvec_uabd_d, uint64_t)
2601
2602 #undef DO_ABD
2603
2604 #define DO_ABA(NAME, TYPE) \
2605 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2606 { \
2607 intptr_t i, opr_sz = simd_oprsz(desc); \
2608 TYPE *d = vd, *n = vn, *m = vm; \
2609 \
2610 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
2611 d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2612 } \
2613 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2614 }
2615
2616 DO_ABA(gvec_saba_b, int8_t)
2617 DO_ABA(gvec_saba_h, int16_t)
2618 DO_ABA(gvec_saba_s, int32_t)
2619 DO_ABA(gvec_saba_d, int64_t)
2620
2621 DO_ABA(gvec_uaba_b, uint8_t)
2622 DO_ABA(gvec_uaba_h, uint16_t)
2623 DO_ABA(gvec_uaba_s, uint32_t)
2624 DO_ABA(gvec_uaba_d, uint64_t)
2625
2626 #undef DO_ABA
2627
2628 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2629 void HELPER(NAME)(void *vd, void *vn, void *vm, \
2630 float_status *stat, uint32_t desc) \
2631 { \
2632 ARMVectorReg scratch; \
2633 intptr_t oprsz = simd_oprsz(desc); \
2634 intptr_t half = oprsz / sizeof(TYPE) / 2; \
2635 TYPE *d = vd, *n = vn, *m = vm; \
2636 if (unlikely(d == m)) { \
2637 m = memcpy(&scratch, m, oprsz); \
2638 } \
2639 for (intptr_t i = 0; i < half; ++i) { \
2640 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat); \
2641 } \
2642 for (intptr_t i = 0; i < half; ++i) { \
2643 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat); \
2644 } \
2645 clear_tail(d, oprsz, simd_maxsz(desc)); \
2646 }
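
/*
 * The low half of d is built from adjacent pairs of n and the high half
 * from pairs of m. d may safely alias n, since each n pair is consumed
 * before the corresponding (lower-indexed) d element is written, but if
 * d aliased m the second loop would see the first loop's results, hence
 * the scratch copy.
 */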
2647
2648 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2649 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2650 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2651
2652 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2653 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2654 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2655
2656 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2657 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2658 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2659
2660 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2661 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2662 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2663
2664 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2665 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2666 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2667
2668 #ifdef TARGET_AARCH64
2669 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2)
2670 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4)
2671 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, )
2672
2673 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2)
2674 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4)
2675 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, )
2676 #endif
2677
2678 #undef DO_3OP_PAIR
2679
2680 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2681 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2682 { \
2683 ARMVectorReg scratch; \
2684 intptr_t oprsz = simd_oprsz(desc); \
2685 intptr_t half = oprsz / sizeof(TYPE) / 2; \
2686 TYPE *d = vd, *n = vn, *m = vm; \
2687 if (unlikely(d == m)) { \
2688 m = memcpy(&scratch, m, oprsz); \
2689 } \
2690 for (intptr_t i = 0; i < half; ++i) { \
2691 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]); \
2692 } \
2693 for (intptr_t i = 0; i < half; ++i) { \
2694 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]); \
2695 } \
2696 clear_tail(d, oprsz, simd_maxsz(desc)); \
2697 }
2698
2699 #define ADD(A, B) (A + B)
2700 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2701 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2702 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2703 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2704 #undef ADD
2705
2706 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2707 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2708 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2709
2710 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2711 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2712 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2713
2714 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2715 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2716 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2717
2718 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2719 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2720 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2721
2722 #undef DO_3OP_PAIR
2723
2724 #define DO_VCVT_FIXED(NAME, FUNC, TYPE) \
2725 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
2726 { \
2727 intptr_t i, oprsz = simd_oprsz(desc); \
2728 int shift = simd_data(desc); \
2729 TYPE *d = vd, *n = vn; \
2730 float_status *fpst = stat; \
2731 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2732 d[i] = FUNC(n[i], shift, fpst); \
2733 } \
2734 clear_tail(d, oprsz, simd_maxsz(desc)); \
2735 }
2736
2737 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
2738 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
2739 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2740 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2741 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2742 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2743
2744 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t)
2745 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t)
2746 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2747 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
2748 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2749 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2750
2751 #undef DO_VCVT_FIXED
2752
2753 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \
2754 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2755 { \
2756 intptr_t i, oprsz = simd_oprsz(desc); \
2757 uint32_t rmode = simd_data(desc); \
2758 uint32_t prev_rmode = get_float_rounding_mode(fpst); \
2759 TYPE *d = vd, *n = vn; \
2760 set_float_rounding_mode(rmode, fpst); \
2761 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2762 d[i] = FUNC(n[i], 0, fpst); \
2763 } \
2764 set_float_rounding_mode(prev_rmode, fpst); \
2765 clear_tail(d, oprsz, simd_maxsz(desc)); \
2766 }
2767
2768 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t)
2769 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t)
2770 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2771 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2772 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2773 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2774
2775 #undef DO_VCVT_RMODE
2776
2777 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \
2778 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2779 { \
2780 intptr_t i, oprsz = simd_oprsz(desc); \
2781 uint32_t rmode = simd_data(desc); \
2782 uint32_t prev_rmode = get_float_rounding_mode(fpst); \
2783 TYPE *d = vd, *n = vn; \
2784 set_float_rounding_mode(rmode, fpst); \
2785 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2786 d[i] = FUNC(n[i], fpst); \
2787 } \
2788 set_float_rounding_mode(prev_rmode, fpst); \
2789 clear_tail(d, oprsz, simd_maxsz(desc)); \
2790 }
2791
2792 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2793 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2794
2795 #undef DO_VRINT_RMODE
2796
2797 #ifdef TARGET_AARCH64
2798 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc)
2799 {
2800 const uint8_t *indices = vm;
2801 size_t oprsz = simd_oprsz(desc);
2802 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2803 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2804 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2805 union {
2806 uint8_t b[16];
2807 uint64_t d[2];
2808 } result;
2809
2810 /*
2811 * We must construct the final result in a temp, lest the output
2812 * overlap the input table. For TBL, begin with zero; for TBX,
2813 * begin with the original register contents. Note that we always
2814 * copy 16 bytes here to avoid an extra branch; clearing the high
2815 * bits of the register for oprsz == 8 is handled below.
2816 */
2817 if (is_tbx) {
2818 memcpy(&result, vd, 16);
2819 } else {
2820 memset(&result, 0, 16);
2821 }
2822
2823 for (size_t i = 0; i < oprsz; ++i) {
2824 uint32_t index = indices[H1(i)];
2825
2826 if (index < table_len) {
2827 /*
2828 * Convert index (a byte offset into the virtual table
2829 * which is a series of 128-bit vectors concatenated)
2830 * into the correct register element, bearing in mind
2831 * that the table can wrap around from V31 to V0.
2832 */
2833 const uint8_t *table = (const uint8_t *)
2834 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2835 result.b[H1(i)] = table[H1(index % 16)];
2836 }
2837 }
2838
2839 memcpy(vd, &result, 16);
2840 clear_tail(vd, oprsz, simd_maxsz(desc));
2841 }
2842 #endif
2843
2844 /*
2845 * NxN -> N highpart multiply
2846 *
2847 * TODO: expose this as a generic vector operation.
2848 */
2849
2850 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2851 {
2852 intptr_t i, opr_sz = simd_oprsz(desc);
2853 int8_t *d = vd, *n = vn, *m = vm;
2854
2855 for (i = 0; i < opr_sz; ++i) {
2856 d[i] = ((int32_t)n[i] * m[i]) >> 8;
2857 }
2858 clear_tail(d, opr_sz, simd_maxsz(desc));
2859 }
2860
2861 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2862 {
2863 intptr_t i, opr_sz = simd_oprsz(desc);
2864 int16_t *d = vd, *n = vn, *m = vm;
2865
2866 for (i = 0; i < opr_sz / 2; ++i) {
2867 d[i] = ((int32_t)n[i] * m[i]) >> 16;
2868 }
2869 clear_tail(d, opr_sz, simd_maxsz(desc));
2870 }
2871
2872 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2873 {
2874 intptr_t i, opr_sz = simd_oprsz(desc);
2875 int32_t *d = vd, *n = vn, *m = vm;
2876
2877 for (i = 0; i < opr_sz / 4; ++i) {
2878 d[i] = ((int64_t)n[i] * m[i]) >> 32;
2879 }
2880 clear_tail(d, opr_sz, simd_maxsz(desc));
2881 }
2882
2883 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2884 {
2885 intptr_t i, opr_sz = simd_oprsz(desc);
2886 uint64_t *d = vd, *n = vn, *m = vm;
2887 uint64_t discard;
2888
2889 for (i = 0; i < opr_sz / 8; ++i) {
2890 muls64(&discard, &d[i], n[i], m[i]);
2891 }
2892 clear_tail(d, opr_sz, simd_maxsz(desc));
2893 }
2894
2895 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2896 {
2897 intptr_t i, opr_sz = simd_oprsz(desc);
2898 uint8_t *d = vd, *n = vn, *m = vm;
2899
2900 for (i = 0; i < opr_sz; ++i) {
2901 d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2902 }
2903 clear_tail(d, opr_sz, simd_maxsz(desc));
2904 }
2905
2906 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2907 {
2908 intptr_t i, opr_sz = simd_oprsz(desc);
2909 uint16_t *d = vd, *n = vn, *m = vm;
2910
2911 for (i = 0; i < opr_sz / 2; ++i) {
2912 d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2913 }
2914 clear_tail(d, opr_sz, simd_maxsz(desc));
2915 }
2916
2917 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2918 {
2919 intptr_t i, opr_sz = simd_oprsz(desc);
2920 uint32_t *d = vd, *n = vn, *m = vm;
2921
2922 for (i = 0; i < opr_sz / 4; ++i) {
2923 d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2924 }
2925 clear_tail(d, opr_sz, simd_maxsz(desc));
2926 }
2927
2928 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2929 {
2930 intptr_t i, opr_sz = simd_oprsz(desc);
2931 uint64_t *d = vd, *n = vn, *m = vm;
2932 uint64_t discard;
2933
2934 for (i = 0; i < opr_sz / 8; ++i) {
2935 mulu64(&discard, &d[i], n[i], m[i]);
2936 }
2937 clear_tail(d, opr_sz, simd_maxsz(desc));
2938 }
2939
2940 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2941 {
2942 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2943 int shr = simd_data(desc);
2944 uint64_t *d = vd, *n = vn, *m = vm;
2945
2946 for (i = 0; i < opr_sz; ++i) {
2947 d[i] = ror64(n[i] ^ m[i], shr);
2948 }
2949 clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2950 }
2951
2952 /*
2953 * Integer matrix-multiply accumulate
2954 */
2955
2956 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2957 {
2958 int8_t *n = vn, *m = vm;
2959
2960 for (intptr_t k = 0; k < 8; ++k) {
2961 sum += n[H1(k)] * m[H1(k)];
2962 }
2963 return sum;
2964 }
2965
2966 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2967 {
2968 uint8_t *n = vn, *m = vm;
2969
2970 for (intptr_t k = 0; k < 8; ++k) {
2971 sum += n[H1(k)] * m[H1(k)];
2972 }
2973 return sum;
2974 }
2975
2976 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2977 {
2978 uint8_t *n = vn;
2979 int8_t *m = vm;
2980
2981 for (intptr_t k = 0; k < 8; ++k) {
2982 sum += n[H1(k)] * m[H1(k)];
2983 }
2984 return sum;
2985 }
2986
2987 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2988 uint32_t (*inner_loop)(uint32_t, void *, void *))
2989 {
2990 intptr_t seg, opr_sz = simd_oprsz(desc);
2991
2992 for (seg = 0; seg < opr_sz; seg += 16) {
2993 uint32_t *d = vd + seg;
2994 uint32_t *a = va + seg;
2995 uint32_t sum0, sum1, sum2, sum3;
2996
2997 /*
2998 * Process the entire segment at once, writing back the
2999 * results only after we've consumed all of the inputs.
3000 *
3001 * Key to indices by column:
3002 * i j i j
3003 */
3004 sum0 = a[H4(0 + 0)];
3005 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
3006 sum1 = a[H4(0 + 1)];
3007 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
3008 sum2 = a[H4(2 + 0)];
3009 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
3010 sum3 = a[H4(2 + 1)];
3011 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
3012
3013 d[H4(0)] = sum0;
3014 d[H4(1)] = sum1;
3015 d[H4(2)] = sum2;
3016 d[H4(3)] = sum3;
3017 }
3018 clear_tail(vd, opr_sz, simd_maxsz(desc));
3019 }
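
/*
 * Each 16-byte segment is a 2x2 tile of int32 results: with the two
 * 8-byte halves of n and m as rows, d[2*i + j] = a[2*i + j] plus the
 * dot product of row i of n with row j of m.
 */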
3020
3021 #define DO_MMLA_B(NAME, INNER) \
3022 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
3023 { do_mmla_b(vd, vn, vm, va, desc, INNER); }
3024
3025 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
3026 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
3027 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
3028
3029 /*
3030 * BFloat16 Dot Product
3031 */
3032
3033 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
3034 {
3035 /*
3036 * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
3037 * For EBF = 0, we ignore the FPCR bits which determine rounding
3038 * mode and denormal-flushing, and we do unfused multiplies and
3039 * additions with intermediate rounding of all products and sums.
3040 * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
3041 * and we perform a fused two-way sum-of-products without intermediate
3042 * rounding of the products.
3043 * In either case, we don't set fp exception flags.
3044 *
3045 * EBF is AArch64 only, so even if it's set in the FPCR it has
3046 * no effect on AArch32 instructions.
3047 */
3048 bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
3049
3050 *statusp = env->vfp.fp_status[is_a64(env) ? FPST_A64 : FPST_A32];
3051 set_default_nan_mode(true, statusp);
3052
3053 if (ebf) {
3054 /* EBF=1 needs to do a step with round-to-odd semantics */
3055 *oddstatusp = *statusp;
3056 set_float_rounding_mode(float_round_to_odd, oddstatusp);
3057 } else {
3058 set_flush_to_zero(true, statusp);
3059 set_flush_inputs_to_zero(true, statusp);
3060 set_float_rounding_mode(float_round_to_odd_inf, statusp);
3061 }
3062 return ebf;
3063 }
3064
3065 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
3066 {
3067 float32 t1, t2;
3068
3069 /*
3070 * Extract each BFloat16 from the element pair, and shift
3071 * them such that they become float32.
3072 */
3073 t1 = float32_mul(e1 << 16, e2 << 16, fpst);
3074 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
3075 t1 = float32_add(t1, t2, fpst);
3076 t1 = float32_add(sum, t1, fpst);
3077
3078 return t1;
3079 }
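
/*
 * bfloat16 is simply float32 with the low 16 mantissa bits dropped, so
 * placing each 16-bit element of the pair in the top half of a word
 * (e << 16 for one, e & 0xffff0000u for the other) converts it to
 * float32 exactly, with no rounding.
 */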
3080
3081 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
3082 float_status *fpst, float_status *fpst_odd)
3083 {
3084 float32 s1r = e1 << 16;
3085 float32 s1c = e1 & 0xffff0000u;
3086 float32 s2r = e2 << 16;
3087 float32 s2c = e2 & 0xffff0000u;
3088 float32 t32;
3089
3090 /* C.f. FPProcessNaNs4 */
3091 if (float32_is_any_nan(s1r) || float32_is_any_nan(s1c) ||
3092 float32_is_any_nan(s2r) || float32_is_any_nan(s2c)) {
3093 if (float32_is_signaling_nan(s1r, fpst)) {
3094 t32 = s1r;
3095 } else if (float32_is_signaling_nan(s1c, fpst)) {
3096 t32 = s1c;
3097 } else if (float32_is_signaling_nan(s2r, fpst)) {
3098 t32 = s2r;
3099 } else if (float32_is_signaling_nan(s2c, fpst)) {
3100 t32 = s2c;
3101 } else if (float32_is_any_nan(s1r)) {
3102 t32 = s1r;
3103 } else if (float32_is_any_nan(s1c)) {
3104 t32 = s1c;
3105 } else if (float32_is_any_nan(s2r)) {
3106 t32 = s2r;
3107 } else {
3108 t32 = s2c;
3109 }
3110 /*
3111 * FPConvertNaN(FPProcessNaN(t32)) will be done as part
3112 * of the final addition below.
3113 */
3114 } else {
3115 /*
3116 * Compare f16_dotadd() in sme_helper.c, but here we have
3117 * bfloat16 inputs. In particular that means that we do not
3118 * want the FPCR.FZ16 flush semantics, so we use the normal
3119 * float_status for the input handling here.
3120 */
3121 float64 e1r = float32_to_float64(s1r, fpst);
3122 float64 e1c = float32_to_float64(s1c, fpst);
3123 float64 e2r = float32_to_float64(s2r, fpst);
3124 float64 e2c = float32_to_float64(s2c, fpst);
3125 float64 t64;
3126
3127 /*
3128 * The ARM pseudocode function FPDot performs both multiplies
3129 * and the add with a single rounding operation. Emulate this
3130 * by performing the first multiply in round-to-odd, then doing
3131 * the second multiply as fused multiply-add, and rounding to
3132 * float32 all in one step.
3133 */
3134 t64 = float64_mul(e1r, e2r, fpst_odd);
3135 t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
3136
3137 /* This conversion is exact, because we've already rounded. */
3138 t32 = float64_to_float32(t64, fpst);
3139 }
3140
3141 /* The final accumulation step is not fused. */
3142 return float32_add(sum, t32, fpst);
3143 }
3144
3145 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
3146 CPUARMState *env, uint32_t desc)
3147 {
3148 intptr_t i, opr_sz = simd_oprsz(desc);
3149 float32 *d = vd, *a = va;
3150 uint32_t *n = vn, *m = vm;
3151 float_status fpst, fpst_odd;
3152
3153 if (is_ebf(env, &fpst, &fpst_odd)) {
3154 for (i = 0; i < opr_sz / 4; ++i) {
3155 d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
3156 }
3157 } else {
3158 for (i = 0; i < opr_sz / 4; ++i) {
3159 d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
3160 }
3161 }
3162 clear_tail(d, opr_sz, simd_maxsz(desc));
3163 }
3164
3165 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
3166 void *va, CPUARMState *env, uint32_t desc)
3167 {
3168 intptr_t i, j, opr_sz = simd_oprsz(desc);
3169 intptr_t index = simd_data(desc);
3170 intptr_t elements = opr_sz / 4;
3171 intptr_t eltspersegment = MIN(16 / 4, elements);
3172 float32 *d = vd, *a = va;
3173 uint32_t *n = vn, *m = vm;
3174 float_status fpst, fpst_odd;
3175
3176 if (is_ebf(env, &fpst, &fpst_odd)) {
3177 for (i = 0; i < elements; i += eltspersegment) {
3178 uint32_t m_idx = m[i + H4(index)];
3179
3180 for (j = i; j < i + eltspersegment; j++) {
3181 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
3182 }
3183 }
3184 } else {
3185 for (i = 0; i < elements; i += eltspersegment) {
3186 uint32_t m_idx = m[i + H4(index)];
3187
3188 for (j = i; j < i + eltspersegment; j++) {
3189 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
3190 }
3191 }
3192 }
3193 clear_tail(d, opr_sz, simd_maxsz(desc));
3194 }
3195
3196 void HELPER(sme2_bfvdot_idx)(void *vd, void *vn, void *vm,
3197 void *va, CPUARMState *env, uint32_t desc)
3198 {
3199 intptr_t i, j, opr_sz = simd_oprsz(desc);
3200 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT, 2);
3201 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
3202 intptr_t elements = opr_sz / 4;
3203 intptr_t eltspersegment = MIN(16 / 4, elements);
3204 float32 *d = vd, *a = va;
3205 uint16_t *n0 = vn;
3206 uint16_t *n1 = vn + sizeof(ARMVectorReg);
3207 uint32_t *m = vm;
3208 float_status fpst, fpst_odd;
3209
3210 if (is_ebf(env, &fpst, &fpst_odd)) {
3211 for (i = 0; i < elements; i += eltspersegment) {
3212 uint32_t m_idx = m[i + H4(idx)];
3213
3214 for (j = 0; j < eltspersegment; j++) {
3215 uint32_t nn = (n0[H2(2 * (i + j) + sel)])
3216 | (n1[H2(2 * (i + j) + sel)] << 16);
3217 d[i + H4(j)] = bfdotadd_ebf(a[i + H4(j)], nn, m_idx,
3218 &fpst, &fpst_odd);
3219 }
3220 }
3221 } else {
3222 for (i = 0; i < elements; i += eltspersegment) {
3223 uint32_t m_idx = m[i + H4(idx)];
3224
3225 for (j = 0; j < eltspersegment; j++) {
3226 uint32_t nn = (n0[H2(2 * (i + j) + sel)])
3227 | (n1[H2(2 * (i + j) + sel)] << 16);
3228 d[i + H4(j)] = bfdotadd(a[i + H4(j)], nn, m_idx, &fpst);
3229 }
3230 }
3231 }
3232 clear_tail(d, opr_sz, simd_maxsz(desc));
3233 }
3234
3235 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
3236 CPUARMState *env, uint32_t desc)
3237 {
3238 intptr_t s, opr_sz = simd_oprsz(desc);
3239 float32 *d = vd, *a = va;
3240 uint32_t *n = vn, *m = vm;
3241 float_status fpst, fpst_odd;
3242
3243 if (is_ebf(env, &fpst, &fpst_odd)) {
3244 for (s = 0; s < opr_sz / 4; s += 4) {
3245 float32 sum00, sum01, sum10, sum11;
3246
3247 /*
3248 * Process the entire segment at once, writing back the
3249 * results only after we've consumed all of the inputs.
3250 *
3251 * Key to indices by column:
3252 * i j i k j k
3253 */
3254 sum00 = a[s + H4(0 + 0)];
3255 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
3256 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
3257
3258 sum01 = a[s + H4(0 + 1)];
3259 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
3260 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
3261
3262 sum10 = a[s + H4(2 + 0)];
3263 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
3264 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
3265
3266 sum11 = a[s + H4(2 + 1)];
3267 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
3268 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
3269
3270 d[s + H4(0 + 0)] = sum00;
3271 d[s + H4(0 + 1)] = sum01;
3272 d[s + H4(2 + 0)] = sum10;
3273 d[s + H4(2 + 1)] = sum11;
3274 }
3275 } else {
3276 for (s = 0; s < opr_sz / 4; s += 4) {
3277 float32 sum00, sum01, sum10, sum11;
3278
3279 /*
3280 * Process the entire segment at once, writing back the
3281 * results only after we've consumed all of the inputs.
3282 *
3283 * Key to indices by column:
3284 * i j i k j k
3285 */
3286 sum00 = a[s + H4(0 + 0)];
3287 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
3288 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);
3289
3290 sum01 = a[s + H4(0 + 1)];
3291 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
3292 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);
3293
3294 sum10 = a[s + H4(2 + 0)];
3295 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
3296 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);
3297
3298 sum11 = a[s + H4(2 + 1)];
3299 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
3300 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);
3301
3302 d[s + H4(0 + 0)] = sum00;
3303 d[s + H4(0 + 1)] = sum01;
3304 d[s + H4(2 + 0)] = sum10;
3305 d[s + H4(2 + 1)] = sum11;
3306 }
3307 }
3308 clear_tail(d, opr_sz, simd_maxsz(desc));
3309 }
3310
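/*
 * BFMLAL and friends: a bfloat16 value is the high half of a float32,
 * so shifting it left by 16 widens it exactly.  'negx' is XORed into
 * the bfloat16 operand before widening (0x8000 flips its sign bit,
 * giving the multiply-subtract forms), while 'negf' asks float32_muladd
 * to negate the product instead, as used by the gvec_ah_* variants.
 */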
static void do_bfmlal(float32 *d, bfloat16 *n, bfloat16 *m, float32 *a,
                      float_status *stat, uint32_t desc, int negx, int negf)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);

    for (i = 0; i < opr_sz / 4; ++i) {
        float32 nn = (negx ^ n[H2(i * 2 + sel)]) << 16;
        float32 mm = m[H2(i * 2 + sel)] << 16;
        d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], negf, stat);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
                         float_status *stat, uint32_t desc)
{
    do_bfmlal(vd, vn, vm, va, stat, desc, 0, 0);
}

void HELPER(gvec_bfmlsl)(void *vd, void *vn, void *vm, void *va,
                         float_status *stat, uint32_t desc)
{
    do_bfmlal(vd, vn, vm, va, stat, desc, 0x8000, 0);
}

void HELPER(gvec_ah_bfmlsl)(void *vd, void *vn, void *vm, void *va,
                            float_status *stat, uint32_t desc)
{
    do_bfmlal(vd, vn, vm, va, stat, desc, 0, float_muladd_negate_product);
}

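/*
 * Indexed forms of the above: the vm operand is a single bfloat16
 * element selected by 'index' within each 128-bit segment and shared
 * by the four float32 results of that segment.
 */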
static void do_bfmlal_idx(float32 *d, bfloat16 *n, bfloat16 *m, float32 *a,
                          float_status *stat, uint32_t desc, int negx, int negf)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += eltspersegment) {
        float32 m_idx = m[H2(2 * i + index)] << 16;

        for (j = i; j < i + eltspersegment; j++) {
            float32 n_j = (negx ^ n[H2(2 * j + sel)]) << 16;
            d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], negf, stat);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *stat, uint32_t desc)
{
    do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0, 0);
}

void HELPER(gvec_bfmlsl_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *stat, uint32_t desc)
{
    do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0x8000, 0);
}

void HELPER(gvec_ah_bfmlsl_idx)(void *vd, void *vn, void *vm, void *va,
                                float_status *stat, uint32_t desc)
{
    do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0, float_muladd_negate_product);
}

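/*
 * SCLAMP/UCLAMP: clamp each element of 'a' to the range [n, m],
 * i.e. d = MIN(MAX(a, n), m).  For example, with a = 7, n = 0, m = 5
 * the result is 5, and with a = -3 (signed forms) it is 0.
 */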
#define DO_CLAMP(NAME, TYPE) \
void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
        TYPE aa = *(TYPE *)(a + i);                                     \
        TYPE nn = *(TYPE *)(n + i);                                     \
        TYPE mm = *(TYPE *)(m + i);                                     \
        TYPE dd = MIN(MAX(aa, nn), mm);                                 \
        *(TYPE *)(d + i) = dd;                                          \
    }                                                                   \
    clear_tail(d, opr_sz, simd_maxsz(desc));                            \
}

DO_CLAMP(gvec_sclamp_b, int8_t)
DO_CLAMP(gvec_sclamp_h, int16_t)
DO_CLAMP(gvec_sclamp_s, int32_t)
DO_CLAMP(gvec_sclamp_d, int64_t)

DO_CLAMP(gvec_uclamp_b, uint8_t)
DO_CLAMP(gvec_uclamp_h, uint16_t)
DO_CLAMP(gvec_uclamp_s, uint32_t)
DO_CLAMP(gvec_uclamp_d, uint64_t)

/* Bit count in each 8-bit word. */
void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ctpop8(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Reverse bits in each 8-bit word.  revbit64 reverses both the bit order
 * within each byte and the byte order; the inner bswap64 pre-reverses the
 * byte order, so that only the per-byte bit reversal remains.
 */
void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = revbit64(bswap64(n[i]));
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

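/* Unsigned reciprocal estimate (URECPE) of each 32-bit element. */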
void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_recpe_u32(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

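/* Unsigned reciprocal square root estimate (URSQRTE) of each 32-bit element. */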
void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_rsqrte_u32(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

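/*
 * Helpers for the SME2 lookup-table expansions: 'indexes' holds packed
 * unsigned indices of 'isize' bits each, and 'table' is ZT0 viewed as an
 * array of 'tsize'-bit entries (always 32 as instantiated below); byte
 * and halfword results take the low bits of the selected entry.  'nreg'
 * destination registers, spaced 'dstride' registers apart, each receive
 * 'elements' results.  Roughly, for destination register r and element e:
 *
 *     dst[r][e] = table[index_of(segbase + r * elements + e)];
 */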
static inline void do_lut_b(void *zd, uint64_t *indexes, uint64_t *table,
                            unsigned elements, unsigned segbase,
                            unsigned dstride, unsigned isize,
                            unsigned tsize, unsigned nreg)
{
    for (unsigned r = 0; r < nreg; ++r) {
        uint8_t *dst = zd + dstride * r;
        unsigned base = segbase + r * elements;

        for (unsigned e = 0; e < elements; ++e) {
            unsigned index = extractn(indexes, (base + e) * isize, isize);
            dst[H1(e)] = extractn(table, index * tsize, 8);
        }
    }
}

static inline void do_lut_h(void *zd, uint64_t *indexes, uint64_t *table,
                            unsigned elements, unsigned segbase,
                            unsigned dstride, unsigned isize,
                            unsigned tsize, unsigned nreg)
{
    for (unsigned r = 0; r < nreg; ++r) {
        uint16_t *dst = zd + dstride * r;
        unsigned base = segbase + r * elements;

        for (unsigned e = 0; e < elements; ++e) {
            unsigned index = extractn(indexes, (base + e) * isize, isize);
            dst[H2(e)] = extractn(table, index * tsize, 16);
        }
    }
}

static inline void do_lut_s(void *zd, uint64_t *indexes, uint32_t *table,
                            unsigned elements, unsigned segbase,
                            unsigned dstride, unsigned isize,
                            unsigned tsize, unsigned nreg)
{
    for (unsigned r = 0; r < nreg; ++r) {
        uint32_t *dst = zd + dstride * r;
        unsigned base = segbase + r * elements;

        for (unsigned e = 0; e < elements; ++e) {
            unsigned index = extractn(indexes, (base + e) * isize, isize);
            dst[H4(e)] = table[H4(index)];
        }
    }
}

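/*
 * Instantiate the LUT helpers for each index width (ISIZE bits), number
 * of destination registers (NREG) and element size (SUFF, ESIZE bytes).
 * The 'strided' bit selects the strided register form, where the
 * destinations are spaced 8 registers apart (4 for the four-register
 * forms) instead of being consecutive, and 'idx' selects which group of
 * indices from zn feeds this set of results.
 */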
#define DO_SME2_LUT(ISIZE, NREG, SUFF, ESIZE) \
void helper_sme2_luti##ISIZE##_##NREG##SUFF                             \
    (void *zd, void *zn, CPUARMState *env, uint32_t desc)               \
{                                                                       \
    unsigned vl = simd_oprsz(desc);                                     \
    unsigned strided = extract32(desc, SIMD_DATA_SHIFT, 1);             \
    unsigned idx = extract32(desc, SIMD_DATA_SHIFT + 1, 4);             \
    unsigned elements = vl / ESIZE;                                     \
    unsigned dstride = (!strided ? 1 : NREG == 4 ? 4 : 8);              \
    unsigned segments = (ESIZE * 8) / (ISIZE * NREG);                   \
    unsigned segment = idx & (segments - 1);                            \
    ARMVectorReg indexes;                                               \
    memcpy(&indexes, zn, vl);                                           \
    do_lut_##SUFF(zd, indexes.d, (void *)env->za_state.zt0, elements,   \
                  segment * NREG * elements,                            \
                  dstride * sizeof(ARMVectorReg), ISIZE, 32, NREG);     \
}

DO_SME2_LUT(2,1,b, 1)
DO_SME2_LUT(2,1,h, 2)
DO_SME2_LUT(2,1,s, 4)
DO_SME2_LUT(2,2,b, 1)
DO_SME2_LUT(2,2,h, 2)
DO_SME2_LUT(2,2,s, 4)
DO_SME2_LUT(2,4,b, 1)
DO_SME2_LUT(2,4,h, 2)
DO_SME2_LUT(2,4,s, 4)

DO_SME2_LUT(4,1,b, 1)
DO_SME2_LUT(4,1,h, 2)
DO_SME2_LUT(4,1,s, 4)
DO_SME2_LUT(4,2,b, 1)
DO_SME2_LUT(4,2,h, 2)
DO_SME2_LUT(4,2,s, 4)
DO_SME2_LUT(4,4,h, 2)
DO_SME2_LUT(4,4,s, 4)

#undef DO_SME2_LUT