1 /*
2 * ARM NEON vector operations.
3 *
4 * Copyright (c) 2007, 2008 CodeSourcery.
5 * Written by Paul Brook
6 *
7 * This code is licensed under the GNU GPL v2.
8 */
9
10 #include "qemu/osdep.h"
11 #include "cpu.h"
12 #include "exec/helper-proto.h"
13 #include "tcg/tcg-gvec-desc.h"
14 #include "fpu/softfloat.h"
15 #include "vec_internal.h"
16
/* Sign-bit masks for 32-bit and 64-bit values. */
#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

/* Record cumulative saturation in the QC flag; needs 'env' in scope. */
#define SET_QC() env->vfp.qc[0] = 1
21
/*
 * NEON_TYPEn defines a struct holding n vector elements packed into one
 * 32-bit quantity.  v1 is always the least significant element, so the
 * member order must be reversed on big-endian hosts.
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif
58
/* Instantiate the element-group types used by the helpers below. */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
68
/* Copy from a uint32_t to a vector structure type.
 * Punning through a union keeps the reinterpretation well defined. */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t. */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)
88
/* NEON_DOn applies the per-element operation NEON_FN to each of the
 * n elements of vsrc1/vsrc2, writing vdest. */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/* Common body for a binary helper: unpack both 32-bit operands into
 * element structs, apply NEON_FN per element, repack the result. */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Define helper_neon_<name> without / with the CPU env argument.
 * The _ENV variant is used when NEON_FN needs 'env' (e.g. to set QC). */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
120
/* Define a gvec-style helper applying NEON_FN to whole vectors of
 * 'vtype' elements; opr_sz bytes are processed and the tail cleared. */
#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

/* As NEON_GVEC_VOP2, with a CPU env argument for NEON_FNs that set QC. */
#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, void *venv, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    CPUARMState *env = venv; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}
143
/* Pairwise operations. */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same. */
/* NEON_PDOn: each result element is NEON_FN of an adjacent operand
 * pair; vsrc1 supplies the low half of the result, vsrc2 the high. */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

/* Define helper_neon_<name> performing a pairwise operation. */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}
169
/* Unary operators.  NEON_FN must ignore its third (dummy) operand,
 * since no vsrc2 exists here; the result is repacked into arg. */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}
181
/* Pairwise minimum: each result element is the min of an adjacent pair. */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

/* Pairwise maximum. */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN
195
/*
 * Shift helpers: the shift count is the low byte of src2 treated as
 * signed (negative counts shift right).  The do_{s,u}qrshl_* helpers
 * from vec_internal.h take a rounding flag and a QC pointer for
 * saturation; plain/rounding shifts here pass qc = NULL.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

/* Signed rounding shifts (SRSHL). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
#undef NEON_FN

/* Scalar signed rounding shifts, 32- and 64-bit. */
uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}
237
/* Unsigned rounding shifts (URSHL); no saturation (qc = NULL). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN

/* Scalar unsigned rounding shifts, 32- and 64-bit. */
uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}
269
/* Unsigned saturating shifts (UQSHL); saturation is recorded in QC. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
#undef NEON_FN

/* Scalar unsigned saturating shifts, 32- and 64-bit. */
uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
301
/* Signed saturating shifts (SQSHL); saturation is recorded in QC. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
#undef NEON_FN

/* Scalar signed saturating shifts, 32- and 64-bit. */
uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
333
/* Signed-input, unsigned-result saturating shifts (SQSHLU). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
353
/* Unsigned rounding saturating shifts (UQRSHL). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t)
#undef NEON_FN

/* Scalar unsigned rounding saturating shifts, 32- and 64-bit. */
uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
385
/* Signed rounding saturating shifts (SQRSHL). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t)
#undef NEON_FN

/* Scalar signed rounding saturating shifts, 32- and 64-bit. */
uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
417
HELPER(neon_add_u8)418 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
419 {
420 uint32_t mask;
421 mask = (a ^ b) & 0x80808080u;
422 a &= ~0x80808080u;
423 b &= ~0x80808080u;
424 return (a + b) ^ mask;
425 }
426
HELPER(neon_add_u16)427 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
428 {
429 uint32_t mask;
430 mask = (a ^ b) & 0x80008000u;
431 a &= ~0x80008000u;
432 b &= ~0x80008000u;
433 return (a + b) ^ mask;
434 }
435
/* Lane-wise subtract. */
#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise multiply (low half kept). */
#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise test: all-ones if the lanes have any common set bit. */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN
451
452 /* Count Leading Sign/Zero Bits. */
/* Count leading zero bits of an 8-bit value (returns 8 for x == 0). */
static inline int do_clz8(uint8_t x)
{
    int zeros = 8;
    while (x) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
460
/* Count leading zero bits of a 16-bit value (returns 16 for x == 0). */
static inline int do_clz16(uint16_t x)
{
    int zeros = 16;
    while (x) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
468
/* Per-lane count-leading-zeros. */
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* Per-lane count-leading-sign-bits: clz of the complement for negative
 * lanes, minus one because the sign bit itself is not counted. */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN
484
HELPER(neon_cls_s32)485 uint32_t HELPER(neon_cls_s32)(uint32_t x)
486 {
487 int count;
488 if ((int32_t)x < 0)
489 x = ~x;
490 for (count = 32; x; count--)
491 x = x >> 1;
492 return count - 1;
493 }
494
495 /* Bit count. */
HELPER(neon_cnt_u8)496 uint32_t HELPER(neon_cnt_u8)(uint32_t x)
497 {
498 x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
499 x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
500 x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
501 return x;
502 }
503
504 /* Reverse bits in each 8 bit word */
HELPER(neon_rbit_u8)505 uint32_t HELPER(neon_rbit_u8)(uint32_t x)
506 {
507 x = ((x & 0xf0f0f0f0) >> 4)
508 | ((x & 0x0f0f0f0f) << 4);
509 x = ((x & 0x88888888) >> 3)
510 | ((x & 0x44444444) >> 1)
511 | ((x & 0x22222222) << 1)
512 | ((x & 0x11111111) << 3);
513 return x;
514 }
515
/* Saturating doubling multiply returning high half for 16-bit lanes:
 * (2*src1*src2) >> 16, saturating on overflow; 'round' adds the
 * rounding constant (1 << 15) before the final shift.  QC is set on
 * either saturation. */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

/* As NEON_QDMULH16 for 32-bit lanes, using 64-bit intermediates. */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32
567
HELPER(neon_narrow_u8)568 uint32_t HELPER(neon_narrow_u8)(uint64_t x)
569 {
570 return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
571 | ((x >> 24) & 0xff000000u);
572 }
573
HELPER(neon_narrow_u16)574 uint32_t HELPER(neon_narrow_u16)(uint64_t x)
575 {
576 return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
577 }
578
HELPER(neon_narrow_high_u8)579 uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
580 {
581 return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
582 | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
583 }
584
HELPER(neon_narrow_high_u16)585 uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
586 {
587 return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
588 }
589
HELPER(neon_narrow_round_high_u8)590 uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
591 {
592 x &= 0xff80ff80ff80ff80ull;
593 x += 0x0080008000800080ull;
594 return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
595 | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
596 }
597
HELPER(neon_narrow_round_high_u16)598 uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
599 {
600 x &= 0xffff8000ffff8000ull;
601 x += 0x0000800000008000ull;
602 return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
603 }
604
/* Narrow four signed 16-bit lanes to unsigned bytes with saturation:
 * negative lanes saturate to 0 (the result byte is simply left clear),
 * lanes > 0xff saturate to 0xff; QC is set whenever saturation occurs. */
uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Narrow four unsigned 16-bit lanes to bytes, saturating to 0xff. */
uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Narrow four signed 16-bit lanes to signed bytes, saturating to
 * 0x80/0x7f according to the input's sign. */
uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}
677
/* Narrow two signed 32-bit lanes to unsigned 16-bit with saturation
 * (negative -> 0, > 0xffff -> 0xffff); sets QC on saturation. */
uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

/* Narrow two unsigned 32-bit lanes to 16-bit, saturating to 0xffff. */
uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

/* Narrow two signed 32-bit lanes to signed 16-bit, saturating to
 * 0x8000/0x7fff according to sign. */
uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

/* Narrow a signed 64-bit value to unsigned 32-bit with saturation. */
uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Narrow an unsigned 64-bit value to 32-bit, saturating to 0xffffffff. */
uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Narrow a signed 64-bit value to signed 32-bit with saturation. */
uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}
765
HELPER(neon_widen_u8)766 uint64_t HELPER(neon_widen_u8)(uint32_t x)
767 {
768 uint64_t tmp;
769 uint64_t ret;
770 ret = (uint8_t)x;
771 tmp = (uint8_t)(x >> 8);
772 ret |= tmp << 16;
773 tmp = (uint8_t)(x >> 16);
774 ret |= tmp << 32;
775 tmp = (uint8_t)(x >> 24);
776 ret |= tmp << 48;
777 return ret;
778 }
779
HELPER(neon_widen_s8)780 uint64_t HELPER(neon_widen_s8)(uint32_t x)
781 {
782 uint64_t tmp;
783 uint64_t ret;
784 ret = (uint16_t)(int8_t)x;
785 tmp = (uint16_t)(int8_t)(x >> 8);
786 ret |= tmp << 16;
787 tmp = (uint16_t)(int8_t)(x >> 16);
788 ret |= tmp << 32;
789 tmp = (uint16_t)(int8_t)(x >> 24);
790 ret |= tmp << 48;
791 return ret;
792 }
793
HELPER(neon_widen_u16)794 uint64_t HELPER(neon_widen_u16)(uint32_t x)
795 {
796 uint64_t high = (uint16_t)(x >> 16);
797 return ((uint16_t)x) | (high << 32);
798 }
799
HELPER(neon_widen_s16)800 uint64_t HELPER(neon_widen_s16)(uint32_t x)
801 {
802 uint64_t high = (int16_t)(x >> 16);
803 return ((uint32_t)(int16_t)x) | (high << 32);
804 }
805
HELPER(neon_addl_u16)806 uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
807 {
808 uint64_t mask;
809 mask = (a ^ b) & 0x8000800080008000ull;
810 a &= ~0x8000800080008000ull;
811 b &= ~0x8000800080008000ull;
812 return (a + b) ^ mask;
813 }
814
HELPER(neon_addl_u32)815 uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
816 {
817 uint64_t mask;
818 mask = (a ^ b) & 0x8000000080000000ull;
819 a &= ~0x8000000080000000ull;
820 b &= ~0x8000000080000000ull;
821 return (a + b) ^ mask;
822 }
823
/* Pairwise add of 16-bit lanes: each result lane is the (truncated)
 * sum of an adjacent pair; the pairs from a form the low half of the
 * result and the pairs from b the high half. */
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    /* Accumulate each pair sum in a 32-bit-wide field so carries
     * cannot cross lanes, then pick out the low 16 bits of each sum. */
    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return ( tmp & 0xffff)
         | ((tmp >> 16) & 0xffff0000ull)
         | ((tmp2 << 16) & 0xffff00000000ull)
         | ( tmp2 & 0xffff000000000000ull);
}
838
HELPER(neon_paddl_u32)839 uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
840 {
841 uint32_t low = a + (a >> 32);
842 uint32_t high = b + (b >> 32);
843 return low + ((uint64_t)high << 32);
844 }
845
/* Lane-wise 16-bit subtract without borrows crossing lane boundaries:
 * force the top bit of every lane of a so no lane can underflow into
 * its neighbour, then xor the correct lane sign bits back in. */
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

/* Lane-wise 32-bit subtract; same technique as neon_subl_u16. */
uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}
863
/* Lane-wise saturating signed add of two 32-bit lanes; overflow
 * saturates towards INT32_MIN/INT32_MAX and sets QC. */
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    /* Signed overflow iff the operands agree in sign but the sum does not. */
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

/* Saturating signed 64-bit add; overflow saturates and sets QC. */
uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}
897
/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

/* Widening absolute difference: unsigned 8-bit lanes -> 16-bit lanes. */
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

/* Widening absolute difference: signed 8-bit lanes -> 16-bit lanes. */
uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

/* Widening absolute difference: unsigned 16-bit lanes -> 32-bit lanes. */
uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

/* Widening absolute difference: signed 16-bit lanes -> 32-bit lanes. */
uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

/* Widening absolute difference: one unsigned 32-bit value -> 64-bit. */
uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

/* Widening absolute difference: one signed 32-bit value -> 64-bit. */
uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD
968
/* Widening multiply. Named type is the source type. */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

/* Widening multiply: unsigned 8-bit lanes -> 16-bit products. */
uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/* Widening multiply: signed 8-bit lanes -> 16-bit products (the
 * product type is uint16_t so lanes do not overlap when packed). */
uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/* Widening multiply: unsigned 16-bit lanes -> 32-bit products. */
uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

/* Widening multiply: signed 16-bit lanes -> 32-bit products. */
uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}
1025
/* Lane-wise negate of four 16-bit lanes. */
uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

/* Lane-wise negate of two 32-bit lanes. */
uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}
1046
/* Saturating sign manipulation. */
/* ??? Make these use NEON_VOP1 */
/* Saturating absolute value of an 8-bit lane: |INT8_MIN| saturates
 * to INT8_MAX and sets QC. */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

/* Saturating negate of an 8-bit lane: -INT8_MIN saturates to INT8_MAX
 * and sets QC. */
#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

/* Saturating absolute value of a 16-bit lane. */
#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

/* Saturating negate of a 16-bit lane. */
#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16
1124
HELPER(neon_qabs_s32)1125 uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
1126 {
1127 if (x == SIGNBIT) {
1128 SET_QC();
1129 x = ~SIGNBIT;
1130 } else if ((int32_t)x < 0) {
1131 x = -x;
1132 }
1133 return x;
1134 }
1135
HELPER(neon_qneg_s32)1136 uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
1137 {
1138 if (x == SIGNBIT) {
1139 SET_QC();
1140 x = ~SIGNBIT;
1141 } else {
1142 x = -x;
1143 }
1144 return x;
1145 }
1146
HELPER(neon_qabs_s64)1147 uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
1148 {
1149 if (x == SIGNBIT64) {
1150 SET_QC();
1151 x = ~SIGNBIT64;
1152 } else if ((int64_t)x < 0) {
1153 x = -x;
1154 }
1155 return x;
1156 }
1157
HELPER(neon_qneg_s64)1158 uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
1159 {
1160 if (x == SIGNBIT64) {
1161 SET_QC();
1162 x = ~SIGNBIT64;
1163 } else {
1164 x = -x;
1165 }
1166 return x;
1167 }
1168
1169 /* NEON Float helpers. */
1170
1171 /* Floating point comparisons produce an integer result.
1172 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1173 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1174 */
HELPER(neon_ceq_f32)1175 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
1176 {
1177 float_status *fpst = fpstp;
1178 return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
1179 }
1180
HELPER(neon_cge_f32)1181 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
1182 {
1183 float_status *fpst = fpstp;
1184 return -float32_le(make_float32(b), make_float32(a), fpst);
1185 }
1186
HELPER(neon_cgt_f32)1187 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1188 {
1189 float_status *fpst = fpstp;
1190 return -float32_lt(make_float32(b), make_float32(a), fpst);
1191 }
1192
HELPER(neon_acge_f32)1193 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
1194 {
1195 float_status *fpst = fpstp;
1196 float32 f0 = float32_abs(make_float32(a));
1197 float32 f1 = float32_abs(make_float32(b));
1198 return -float32_le(f1, f0, fpst);
1199 }
1200
HELPER(neon_acgt_f32)1201 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1202 {
1203 float_status *fpst = fpstp;
1204 float32 f0 = float32_abs(make_float32(a));
1205 float32 f1 = float32_abs(make_float32(b));
1206 return -float32_lt(f1, f0, fpst);
1207 }
1208
HELPER(neon_acge_f64)1209 uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
1210 {
1211 float_status *fpst = fpstp;
1212 float64 f0 = float64_abs(make_float64(a));
1213 float64 f1 = float64_abs(make_float64(b));
1214 return -float64_le(f1, f0, fpst);
1215 }
1216
HELPER(neon_acgt_f64)1217 uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
1218 {
1219 float_status *fpst = fpstp;
1220 float64 f0 = float64_abs(make_float64(a));
1221 float64 f1 = float64_abs(make_float64(b));
1222 return -float64_lt(f1, f0, fpst);
1223 }
1224
1225 #define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
1226
/*
 * VUZP (unzip) on a pair of 128-bit registers, 8-bit lanes.
 * Viewing d followed by m as one sequence of byte elements, the
 * even-indexed elements are gathered into d and the odd-indexed
 * elements into m.  All inputs are snapshotted into locals before
 * any store, so the operation is safe if vd and vm alias.
 */
void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    /* Even-numbered byte lanes of the original d. */
    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    /* Even-numbered byte lanes of the original m. */
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    /* Odd-numbered byte lanes of the original d. */
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    /* Odd-numbered byte lanes of the original m. */
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1255
/*
 * VUZP (unzip) on a pair of 128-bit registers, 16-bit lanes:
 * even-indexed halfword elements of {d, m} go to d, odd-indexed
 * ones to m.  Inputs are read before any result is stored, so
 * aliased vd/vm are handled correctly.
 */
void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1276
/*
 * VUZP (unzip) on a pair of 128-bit registers, 32-bit lanes:
 * even-indexed word elements of {d, m} go to d, odd-indexed
 * ones to m.  Inputs are snapshotted first, so aliased vd/vm
 * are handled correctly.
 */
void HELPER(neon_qunzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1293
/*
 * VUZP (unzip) on a pair of 64-bit registers, 8-bit lanes:
 * even-indexed byte elements of {d, m} go to d, odd-indexed
 * ones to m.  Inputs are snapshotted before the stores, so
 * aliased vd/vm are handled correctly.
 */
void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}
1311
/*
 * VUZP (unzip) on a pair of 64-bit registers, 16-bit lanes:
 * even-indexed halfword elements of {d, m} go to d, odd-indexed
 * ones to m.  Inputs are snapshotted before the stores, so
 * aliased vd/vm are handled correctly.
 */
void HELPER(neon_unzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}
1325
/*
 * VZIP (interleave) on a pair of 128-bit registers, 8-bit lanes.
 * The low halves of d and m are interleaved element-by-element
 * into d, the high halves into m.  Inputs are snapshotted into
 * locals before any store, so aliased vd/vm are handled correctly.
 */
void HELPER(neon_qzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    /* Interleave of the low 64 bits of d and m. */
    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    /* Interleave of the high 64 bits of d and m. */
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1354
/*
 * VZIP (interleave) on a pair of 128-bit registers, 16-bit lanes:
 * low halves of d and m interleave into d, high halves into m.
 * Inputs are snapshotted before the stores, so aliased vd/vm
 * are handled correctly.
 */
void HELPER(neon_qzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1375
/*
 * VZIP (interleave) on a pair of 128-bit registers, 32-bit lanes:
 * low halves of d and m interleave into d, high halves into m.
 * Inputs are snapshotted before the stores, so aliased vd/vm
 * are handled correctly.
 */
void HELPER(neon_qzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1392
/*
 * VZIP (interleave) on a pair of 64-bit registers, 8-bit lanes:
 * the low four bytes of d and m interleave into d, the high four
 * into m.  Inputs are snapshotted before the stores, so aliased
 * vd/vm are handled correctly.
 */
void HELPER(neon_zip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}
1410
/*
 * VZIP (interleave) on a pair of 64-bit registers, 16-bit lanes:
 * the low two halfwords of d and m interleave into d, the high
 * two into m.  Inputs are snapshotted before the stores, so
 * aliased vd/vm are handled correctly.
 */
void HELPER(neon_zip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}
1424