/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

#define HELPER_H "tcg/helper.h"
#include "exec/helper-proto.h.inc"

#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

#define SET_QC() env->vfp.qc[0] = 1
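/*
 * Note (context beyond this file): qc[] here serves as the sticky
 * cumulative-saturation flag; the FPSCR read path folds a non-zero
 * qc into FPSCR.QC, so helpers only ever set it, never clear it.
 */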

#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type. */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t. */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)
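
/*
 * Illustrative note: because the struct fields are declared in host
 * memory order, type punning through the union always maps v1 to the
 * least significant lane.  On a little-endian host, for example,
 * NEON_UNPACK(neon_u8, vec, 0x44332211) yields vec.v1 == 0x11,
 * vec.v2 == 0x22, vec.v3 == 0x33, vec.v4 == 0x44; the reversed field
 * order above gives the same mapping on big-endian hosts.
 */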

#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, CPUARMState *env, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

#define NEON_GVEC_VOP2i_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, CPUARMState *env, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int imm = simd_data(desc); \
    vtype *d = vd, *n = vn; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], imm); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}
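
/*
 * Sketch of what these gvec macros expand to (see tcg/tcg-gvec-desc.h
 * for the descriptor encoding): 'desc' packs the operation size and
 * maximum vector size, so a generated helper such as
 *
 *     void HELPER(gvec_srshl_b)(void *vd, void *vn, void *vm,
 *                               uint32_t desc);
 *
 * loops over simd_oprsz(desc) bytes of live data and then clears the
 * tail up to simd_maxsz(desc).  simd_data(desc) additionally recovers
 * the immediate operand for the NEON_GVEC_VOP2i_ENV
 * (shift-by-immediate) variants.
 */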

/* Pairwise operations. */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same. */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4);

#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators. */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}

#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN
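
/*
 * Worked example (little-endian lane numbering): pairwise ops pair
 * adjacent lanes within each input, with arg1 supplying the low half
 * of the result and arg2 the high half.  For pmin_u8:
 *
 *     neon_pmin_u8(0x04010203, 0x08070605)
 *       lanes of arg1: {0x03, 0x02, 0x01, 0x04}
 *       lanes of arg2: {0x05, 0x06, 0x07, 0x08}
 *       result lanes:  {min(03,02), min(01,04), min(05,06), min(07,08)}
 *                    = {0x02, 0x01, 0x05, 0x07}  ->  0x07050102
 */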

#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, src2, 16, true, NULL))
NEON_GVEC_VOP2(sme2_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, src2, 32, true, NULL))
NEON_GVEC_VOP2(sme2_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, src2, true, NULL))
NEON_GVEC_VOP2(sme2_srshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int16_t)src2, 16, true, NULL))
NEON_GVEC_VOP2(sme2_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, src2, 32, true, NULL))
NEON_GVEC_VOP2(sme2_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, src2, true, NULL))
NEON_GVEC_VOP2(sme2_urshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
NEON_GVEC_VOP2i_ENV(neon_sqshlui_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
NEON_GVEC_VOP2i_ENV(neon_sqshlui_h, int16_t)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2i_ENV(neon_sqshlui_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2i_ENV(neon_sqshlui_d, int64_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

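/*
 * The two packed-add helpers below use a SWAR (SIMD-within-a-register)
 * trick: clearing the top bit of every lane first means the addition
 * cannot carry across a lane boundary, and XOR-ing the discarded top
 * bits back in computes bit 7 (resp. bit 15) of each lane sum without
 * generating a carry-out.  E.g. per 8-bit lane, 0xff + 0x01 wraps to
 * 0x00 with no carry leaking into the neighbouring lane.
 */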
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}

uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}

#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

/* Count Leading Sign/Zero Bits. */
static inline int do_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}

static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}
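
/*
 * do_clz8/do_clz16 count leading zeros by shifting the value out one
 * bit at a time: each loop iteration accounts for one significant bit,
 * so the loop exits with n == width - bit_length(x).  For example
 * do_clz8(0x10) == 3, and do_clz8(0) == 8.
 */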

#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}

#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16
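
/*
 * Worked example for the saturating doubling multiply-high above:
 * qdmulh_s16(0x8000, 0x8000) computes tmp = 0x40000000; doubling would
 * overflow (bit 31 != bit 30, so (tmp ^ (tmp << 1)) & SIGNBIT is set),
 * hence tmp saturates to 0x7fffffff, QC is set, and the returned high
 * half is 0x7fff -- the architectural SQDMULH result for
 * INT16_MIN * INT16_MIN.
 */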

#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
           | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
           | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}
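
/*
 * Rounding trick used in the two helpers above: the bits below the
 * rounding point are masked off before the rounding constant is added,
 * so a carry out of one 16/32-bit field lands in the (now zero) low
 * bits of the next field and cannot ripple up into the bytes that are
 * actually extracted.
 */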

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return (uint32_t)((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return (uint32_t)x;
}

uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}

/* Pairwise long add: add pairs of adjacent elements into
 * double-width elements in the result (eg _s8 is an 8x8->16 op)
 */
uint64_t HELPER(neon_addlp_s8)(uint64_t a)
{
    uint64_t nsignmask = 0x0080008000800080ULL;
    uint64_t wsignmask = 0x8000800080008000ULL;
    uint64_t elementmask = 0x00ff00ff00ff00ffULL;
    uint64_t tmp1, tmp2;
    uint64_t res, signres;

    /* Extract odd elements, sign extend each to a 16 bit field */
    tmp1 = a & elementmask;
    tmp1 ^= nsignmask;
    tmp1 |= wsignmask;
    tmp1 = (tmp1 - nsignmask) ^ wsignmask;
    /* Ditto for the even elements */
    tmp2 = (a >> 8) & elementmask;
    tmp2 ^= nsignmask;
    tmp2 |= wsignmask;
    tmp2 = (tmp2 - nsignmask) ^ wsignmask;

    /* calculate the result by summing bits 0..14, 16..22, etc,
     * and then adjusting the sign bits 15, 23, etc manually.
     * This ensures the addition can't overflow the 16 bit field.
     */
    signres = (tmp1 ^ tmp2) & wsignmask;
    res = (tmp1 & ~wsignmask) + (tmp2 & ~wsignmask);
    res ^= signres;

    return res;
}

uint64_t HELPER(neon_addlp_s16)(uint64_t a)
{
    int32_t reslo, reshi;

    reslo = (int32_t)(int16_t)a + (int32_t)(int16_t)(a >> 16);
    reshi = (int32_t)(int16_t)(a >> 32) + (int32_t)(int16_t)(a >> 48);

    return (uint32_t)reslo | (((uint64_t)reshi) << 32);
}

uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}
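
/*
 * The saturation test in the two helpers above is the classic
 * two's-complement idiom: signed addition overflows iff the operands
 * have the same sign and the result's sign differs, i.e.
 * ((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT).  On overflow the
 * result is replaced with INT_MIN or INT_MAX according to the sign of
 * the first operand, via ((int)a >> (bits - 1)) ^ ~SIGNBIT.
 */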

/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do { \
    arithtype tmp_x = (intype)(x); \
    arithtype tmp_y = (intype)(y); \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)
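
/*
 * Example of why the wider arithtype matters: with 32-bit inputs
 * INT32_MIN and INT32_MAX the absolute difference is 0xffffffff,
 * which fits in int64_t but would overflow int32_t.
 */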

uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD

/* Widening multiply. Named type is the source type. */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)
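
/*
 * Note on the unsigned result types in the signed widening multiplies
 * below: the product of two sign-extended sources is taken modulo
 * 2^16 (or 2^32), which is exactly the two's-complement bit pattern
 * of the signed product; keeping type2 unsigned stops that value from
 * being sign-extended to 64 bits when it is ORed into the upper lanes
 * of the result.
 */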

uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}

/* Saturating sign manipulation. */
/* ??? Make these use NEON_VOP1 */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16

uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if ((int32_t)x < 0) {
        x = -x;
    }
    return x;
}

uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else if ((int64_t)x < 0) {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else {
        x = -x;
    }
    return x;
}

/* NEON Float helpers. */

/* Floating point comparisons produce an integer result.
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
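/*
 * Since softfloat supplies only le/lt predicates, GE(a, b) is computed
 * below as LE(b, a) and GT(a, b) as LT(b, a); negating the 0/1 result
 * yields the all-zeroes/all-ones lane mask Neon expects.
 */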
uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
}

uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    return -float32_le(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    return -float32_lt(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_le(f1, f0, fpst);
}

uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_lt(f1, f0, fpst);
}

uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, float_status *fpst)
{
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_le(f1, f0, fpst);
}

uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, float_status *fpst)
{
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_lt(f1, f0, fpst);
}

#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
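
/*
 * ELEM extracts element N of width SIZE bits from a 64-bit register
 * image, e.g. ELEM(0x0123456789abcdef, 1, 16) == 0x89ab.  The zip and
 * unzip helpers below use it to implement the Neon VZIP/VUZP register
 * permutes; the 'q' variants operate on 128-bit quad registers, the
 * others on 64-bit doubleword registers.
 */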

void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
                  | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
                  | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
                  | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
                  | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
                  | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
                  | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
                  | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
                  | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
                  | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
                  | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
                  | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
                  | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
                  | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
                  | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
                  | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
                  | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
                  | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
                  | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
                  | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
                  | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
                  | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
                  | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_unzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
                  | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
                  | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_qzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
                  | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
                  | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
                  | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
                  | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
                  | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
                  | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
                  | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
                  | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
                  | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
                  | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
                  | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
                  | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
                  | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
                  | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
                  | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
                  | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_zip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
                  | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
                  | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
                  | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
                  | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
                  | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
                  | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_zip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
                  | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
                  | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}