/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

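/*
 * A non-zero value in any element of vfp.qc records that a saturating
 * operation saturated; it is folded into the cumulative saturation bit
 * FPSCR.QC when the guest reads FPSCR.
 */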
#define SET_QC() env->vfp.qc[0] = 1

#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type.  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while (0)

/* Copy from a vector structure type to a uint32_t.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while (0)

#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

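/*
 * The NEON_VOP/NEON_VOP_ENV macros below build a 32-bit helper out of
 * whatever per-lane NEON_FN is in scope when they are expanded.  As an
 * illustration, NEON_VOP(hadd_s8, neon_s8, 4) defines
 *     uint32_t HELPER(neon_hadd_s8)(uint32_t arg1, uint32_t arg2)
 * which unpacks both arguments into four int8_t lanes, applies NEON_FN
 * to each pair of lanes, and repacks the four results into the returned
 * uint32_t.
 */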
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    vtype *d = vd, *n = vn, *m = vm;                            \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], m[i]);                              \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, void *venv, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    vtype *d = vd, *n = vn, *m = vm;                            \
    CPUARMState *env = venv;                                    \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], m[i]);                              \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}
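
/*
 * The gvec variants operate on whole vectors: simd_oprsz() gives the
 * number of bytes of valid data described by the descriptor, and
 * clear_tail() zeroes everything between that and simd_maxsz() so the
 * unused tail of the destination is cleared.
 */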

/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}
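
/*
 * Illustrative expansion: NEON_POP(pmin_u8, neon_u8, 4) produces a
 * helper whose result lanes are
 *     { min(arg1.v1, arg1.v2), min(arg1.v3, arg1.v4),
 *       min(arg2.v1, arg2.v2), min(arg2.v3, arg2.v4) }
 * i.e. adjacent pairs within each source word are combined, with arg1
 * supplying the low half of the result and arg2 the high half.
 */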

/* Unary operators.  */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}

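/*
 * Halving add: for 8-bit and 16-bit lanes the operands are promoted to
 * int before the addition, so the sum cannot overflow and the shift
 * yields the halved result directly.  The 32-bit forms need the explicit
 * carry handling in the helpers further down.
 */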
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN

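/*
 * For 32-bit lanes the sum might not fit in the lane type, so compute
 * (src1 >> 1) + (src2 >> 1) and add back the bit that is lost when both
 * low bits are set; this equals (src1 + src2) >> 1 without needing a
 * wider intermediate.
 */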
int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1) {
        dest++;
    }
    return dest;
}

uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1) {
        dest++;
    }
    return dest;
}

#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1) {
        dest++;
    }
    return dest;
}

uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1) {
        dest++;
    }
    return dest;
}

#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1) {
        dest--;
    }
    return dest;
}

uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1) {
        dest--;
    }
    return dest;
}

#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

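/*
 * The shift helpers below are thin wrappers around the do_[su]qrshl_bhs
 * and do_[su]qrshl_d routines declared in vec_internal.h.  Their
 * arguments, as used here, are: the value, the shift count taken from
 * the second source operand (negative counts shift right, as the NEON
 * shift instructions require), the element width in bits for the _bhs
 * forms, a rounding flag, and a saturation-flag pointer (NULL when the
 * operation does not saturate).
 */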
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}

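/*
 * Saturating variants pass env->vfp.qc as the saturation-flag pointer so
 * the shared shift code can record when any lane saturates; that is what
 * makes the guest-visible QC bit sticky.
 */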
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

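/*
 * VQSHLU takes signed inputs but saturates to an unsigned result, which
 * is why these use the separate do_suqrshl_* helpers rather than the
 * plain signed or unsigned ones.
 */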
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

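/*
 * Lane-wise addition within one 32-bit word without a loop: clear the
 * top bit of every lane so that a carry cannot cross a lane boundary,
 * do a single 32-bit addition, then restore each lane's top bit from the
 * XOR of the operands' top bits.
 */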
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}

uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}

#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

/* Count Leading Sign/Zero Bits.  */
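/*
 * do_clz8()/do_clz16() return the full element width (8 or 16) for a
 * zero input.  CLS counts the leading bits that match the sign bit,
 * excluding the sign bit itself, hence the "- 1" in the cls_* cases
 * below.
 */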
static inline int do_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--) {
        x >>= 1;
    }
    return n;
}

static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--) {
        x >>= 1;
    }
    return n;
}

#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0) {
        x = ~x;
    }
    for (count = 32; x; count--) {
        x = x >> 1;
    }
    return count - 1;
}

/* Bit count.  */
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
{
    x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
    x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
    x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
    return x;
}

/* Reverse bits in each 8 bit word */
uint32_t HELPER(neon_rbit_u8)(uint32_t x)
{
    x =  ((x & 0xf0f0f0f0) >> 4)
       | ((x & 0x0f0f0f0f) << 4);
    x =  ((x & 0x88888888) >> 3)
       | ((x & 0x44444444) >> 1)
       | ((x & 0x22222222) << 1)
       | ((x & 0x11111111) << 3);
    return x;
}

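/*
 * Saturating doubling multiply returning the high half (VQDMULH), with
 * an optional rounding step for VQRDMULH.  Doubling the product can only
 * overflow for INT16_MIN * INT16_MIN (or INT32_MIN * INT32_MIN), which
 * the (tmp ^ (tmp << 1)) test detects; any saturation sets QC.
 */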
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while (0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while (0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32

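/*
 * Narrowing helpers: the 64-bit input holds four 16-bit (or two 32-bit)
 * lanes.  The plain narrow_* forms keep the low half of each lane, the
 * narrow_high_* forms keep the high half, and the narrow_round_high_*
 * forms first add half of the discarded part as a rounding constant;
 * the preceding mask makes sure a carry out of one lane only lands in
 * bits of the next lane that do not contribute to its result.
 */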
uint32_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

uint32_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

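/*
 * Saturating narrows: the "unarrow" forms take signed input and saturate
 * to an unsigned result (VQMOVUN), narrow_sat_u* are unsigned-to-unsigned
 * and narrow_sat_s* signed-to-signed (VQMOVN); all of them set QC when
 * any lane saturates.
 */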
uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}

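/*
 * Widening helpers: the 32-bit input holds four byte (or two halfword)
 * lanes; the result spreads them into 16-bit (or 32-bit) lanes of a
 * 64-bit value, zero-extending for the _u forms and sign-extending for
 * the _s forms.
 */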
uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}

uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return    ( tmp         & 0xffff)
            | ((tmp  >> 16) & 0xffff0000ull)
            | ((tmp2 << 16) & 0xffff00000000ull)
            | ( tmp2        & 0xffff000000000000ull);
}

uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}

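/*
 * Lane-wise subtraction without inter-lane borrows: forcing the top bit
 * of each lane of a to 1 and of b to 0 guarantees that no lane's
 * subtraction can borrow from its neighbour; the true top bit of each
 * result lane is then restored with the (a ^ ~b) mask.
 */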
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}

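/*
 * Signed saturating add: overflow has occurred iff the operands have the
 * same sign and the result's sign differs from them; in that case the
 * result is forced to the maximum or minimum value of the type according
 * to the operands' sign, and QC is set.
 */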
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}

/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while (0)

uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD

/* Widening multiply. Named type is the source type.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while (0)

uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}

/* Saturating sign manipulation.  */
/* ??? Make these use NEON_VOP1 */
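/*
 * QABS/QNEG can only saturate for the most negative representable value,
 * whose absolute value (or negation) does not fit in the type; in that
 * case the result is clamped to the maximum positive value and QC is
 * set.
 */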
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16

uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if ((int32_t)x < 0) {
        x = -x;
    }
    return x;
}

uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else if ((int64_t)x < 0) {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else {
        x = -x;
    }
    return x;
}

/* NEON Float helpers.  */

/* Floating point comparisons produce an integer result.
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
}

uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_le(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_lt(make_float32(b), make_float32(a), fpst);
}

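/*
 * The "absolute" comparisons (VACGE/VACGT) compare the magnitudes of the
 * operands, hence the float*_abs() applied to both inputs before the
 * ordinary le/lt comparison.
 */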
uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_le(f1, f0, fpst);
}

uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_lt(f1, f0, fpst);
}

uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_le(f1, f0, fpst);
}

uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_lt(f1, f0, fpst);
}

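/*
 * ELEM(V, N, SIZE) extracts element N of width SIZE bits from the 64-bit
 * value V.  The zip/unzip helpers below interleave (zip) or deinterleave
 * (unzip) the elements of the d and m inputs in place; the q* variants
 * handle 128-bit registers passed as two 64-bit words.
 */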
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))

void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_unzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_qzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_zip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_zip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}
1505