/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */
#include "qemu/osdep.h"

#include "cpu.h"
#include "exec/helper-proto.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

#define SET_QC() env->vfp.qc[0] = 1

#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type.  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

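/*
 * Editor's note (illustrative sketch, not part of the original file):
 * NEON_UNPACK/NEON_PACK move a 32-bit register value into and out of a
 * per-lane struct by type-punning through a union.  The hypothetical
 * example_* helper below, under the never-defined NEON_HELPER_EXAMPLES
 * guard, shows the round trip for four unsigned byte lanes.
 */
#ifdef NEON_HELPER_EXAMPLES
static uint32_t example_bump_each_byte(uint32_t arg)
{
    neon_u8 lanes;

    NEON_UNPACK(neon_u8, lanes, arg);   /* split arg into v1..v4 */
    lanes.v1++;
    lanes.v2++;
    lanes.v3++;
    lanes.v4++;
    NEON_PACK(neon_u8, arg, lanes);     /* rebuild the 32-bit value */
    return arg;
}
#endif /* NEON_HELPER_EXAMPLES */
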
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}
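
/*
 * Editor's note (illustrative sketch, not part of the original file):
 * NEON_POP differs from NEON_VOP in how lanes are paired: each result
 * lane combines two adjacent lanes of one source, with the low half of
 * the result coming from arg1 and the high half from arg2.  The
 * hypothetical reference function below (never-defined
 * NEON_HELPER_EXAMPLES guard) mirrors what NEON_POP(pmin_u8, neon_u8, 4),
 * defined later in this file, expands to.
 */
#ifdef NEON_HELPER_EXAMPLES
static uint32_t example_pmin_u8_pairwise(uint32_t arg1, uint32_t arg2)
{
    neon_u8 a, b, d;

    NEON_UNPACK(neon_u8, a, arg1);
    NEON_UNPACK(neon_u8, b, arg2);
    d.v1 = a.v1 < a.v2 ? a.v1 : a.v2;   /* adjacent lanes of arg1 */
    d.v2 = a.v3 < a.v4 ? a.v3 : a.v4;
    d.v3 = b.v1 < b.v2 ? b.v1 : b.v2;   /* adjacent lanes of arg2 */
    d.v4 = b.v3 < b.v4 ? b.v3 : b.v4;
    NEON_PACK(neon_u8, arg1, d);
    return arg1;
}
#endif /* NEON_HELPER_EXAMPLES */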

/* Unary operators.  */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}


#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = ~0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qadd_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qadd_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT

uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    if (res < a) {
        SET_QC();
        res = ~0;
    }
    return res;
}

uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    if (res < src1) {
        SET_QC();
        res = ~(uint64_t)0;
    }
    return res;
}
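
/*
 * Editor's note (illustrative sketch, not part of the original file):
 * The unsigned saturating adds detect overflow in two ways: the narrow
 * NEON_USAT variants compute the sum in 32 bits and check whether it
 * still fits the element type (tmp != (type)tmp), while the full-width
 * helpers use the classic "result wrapped below an operand" test
 * (res < a).  A hypothetical standalone version of the latter, under the
 * never-defined NEON_HELPER_EXAMPLES guard:
 */
#ifdef NEON_HELPER_EXAMPLES
static uint32_t example_uqadd32(uint32_t a, uint32_t b, bool *sat)
{
    uint32_t res = a + b;

    if (res < a) {          /* unsigned wrap-around => saturate */
        *sat = true;
        res = UINT32_MAX;
    }
    return res;
}
#endif /* NEON_HELPER_EXAMPLES */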

#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 > 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qadd_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qadd_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT

uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
        SET_QC();
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}

uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}
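
/*
 * Editor's note (illustrative sketch, not part of the original file):
 * Signed overflow is detected with the usual identity: the sum overflows
 * iff both operands have the same sign and the result has the opposite
 * sign, i.e. ((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT).  The saturated
 * value is then derived from the sign of the first operand without a
 * branch, as the hypothetical helper below (never-defined
 * NEON_HELPER_EXAMPLES guard) isolates.
 */
#ifdef NEON_HELPER_EXAMPLES
static uint32_t example_ssat32_limit(uint32_t a)
{
    /* a = 0x00000001 -> 0x7fffffff; a = 0x80000000 -> 0x80000000 */
    return ~(((int32_t)a >> 31) ^ SIGNBIT);
}
#endif /* NEON_HELPER_EXAMPLES */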

#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = 0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qsub_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qsub_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT

uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a - b;
    if (res > a) {
        SET_QC();
        res = 0;
    }
    return res;
}

uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    if (src1 < src2) {
        SET_QC();
        res = 0;
    } else {
        res = src1 - src2;
    }
    return res;
}

#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 < 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qsub_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qsub_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT

uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a - b;
    if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
        SET_QC();
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}

uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 - src2;
    if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}

#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}
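
/*
 * Editor's note (illustrative sketch, not part of the original file):
 * The 32-bit halving helpers cannot simply compute (a + b) >> 1 because
 * the intermediate sum may not fit in 32 bits.  They halve each operand
 * first and then repair the bit that was shifted out:
 *   hadd:  (a >> 1) + (b >> 1) + (a & b & 1)     carry of the two low bits
 *   rhadd: (a >> 1) + (b >> 1) + ((a | b) & 1)   includes the rounding +1
 *   hsub:  (a >> 1) - (b >> 1) - (~a & b & 1)    borrow of the low bits
 * A quick check of the hadd identity (hypothetical helper under the
 * never-defined NEON_HELPER_EXAMPLES guard):
 */
#ifdef NEON_HELPER_EXAMPLES
static void example_check_hadd(void)
{
    /* 7 + 6 = 13, halved and truncated is 6; low-bit fix-up not needed */
    assert(((7u >> 1) + (6u >> 1) + (7u & 6u & 1u)) == 6u);
    /* 7 + 5 = 12, halved is 6; both low bits set, so the fix-up fires */
    assert(((7u >> 1) + (5u >> 1) + (7u & 5u & 1u)) == 6u);
}
#endif /* NEON_HELPER_EXAMPLES */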

#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}
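
/*
 * Editor's note (illustrative sketch, not part of the original file):
 * The shl/rshl/qshl/qrshl helpers all funnel into the do_*qrshl_* routines
 * from vec_internal.h, passing the element width, a "round" flag and an
 * optional saturation (QC) pointer.  The shift count is taken from the low
 * byte of the second operand and is signed, so a negative count shifts
 * right.  As a rough mental model only (an assumption, not the actual
 * implementation), a rounding unsigned shift of one 32-bit element looks
 * something like the hypothetical helper below (never-defined
 * NEON_HELPER_EXAMPLES guard):
 */
#ifdef NEON_HELPER_EXAMPLES
static uint32_t example_urshl32(uint32_t val, int8_t shift)
{
    if (shift >= 32) {
        return 0;                       /* every bit shifted out */
    } else if (shift >= 0) {
        return val << shift;            /* ordinary left shift */
    } else if (shift >= -32) {
        /* right shift with rounding: add half of the last bit shifted out */
        uint64_t round = 1ull << (-shift - 1);
        return (uint32_t)(((uint64_t)val + round) >> -shift);
    }
    return 0;
}
#endif /* NEON_HELPER_EXAMPLES */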

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}

uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}
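
/*
 * Editor's note (illustrative sketch, not part of the original file):
 * neon_add_u8/u16 add several lanes packed in one scalar without letting
 * a carry spill from one lane into the next: the top bit of each lane is
 * cleared before the add (so no carry can cross a lane boundary) and is
 * restored afterwards with an XOR, since addition of the top bits modulo
 * 2 is exactly XOR.  The addl/subl helpers later in this file use the
 * same trick on 64-bit vectors.  A worked 16-bit-lane check (hypothetical
 * helper, never-defined NEON_HELPER_EXAMPLES guard):
 */
#ifdef NEON_HELPER_EXAMPLES
static void example_check_lanewise_add(void)
{
    uint32_t a = 0x0001ffffu;   /* lanes: hi = 0x0001, lo = 0xffff */
    uint32_t b = 0x00000001u;   /* lanes: hi = 0x0000, lo = 0x0001 */
    uint32_t mask = (a ^ b) & 0x80008000u;
    uint32_t sum = ((a & ~0x80008000u) + (b & ~0x80008000u)) ^ mask;

    /* the low lane wraps to 0 and its carry does NOT reach the high lane */
    assert(sum == 0x00010000u);
}
#endif /* NEON_HELPER_EXAMPLES */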

#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

/* Count Leading Sign/Zero Bits.  */
static inline int do_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}

static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}

#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}

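/*
 * Editor's note (illustrative sketch, not part of the original file):
 * do_clz8/do_clz16 count leading zeros by shifting the value right until
 * it becomes zero; for x == 0 the loop body never runs and the full width
 * is returned.  "Count leading sign bits" is derived from that: negative
 * values are complemented first and one is subtracted because the sign
 * bit itself is not counted.  A few spot checks (hypothetical helper,
 * never-defined NEON_HELPER_EXAMPLES guard):
 */
#ifdef NEON_HELPER_EXAMPLES
static void example_check_clz_cls(void)
{
    assert(do_clz8(0x01) == 7);             /* clz_u8 of 0x01 */
    assert(do_clz8(0x00) == 8);             /* clz of zero is the width */
    /* cls_s8(0xfe): negative, so complement first -> clz(0x01) - 1 == 6 */
    assert(do_clz8((uint8_t)~0xfe) - 1 == 6);
}
#endif /* NEON_HELPER_EXAMPLES */
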
/* Bit count.  */
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
{
    x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
    x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
    x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
    return x;
}
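
/*
 * Editor's note (illustrative sketch, not part of the original file):
 * neon_cnt_u8 is the classic SWAR population count: each step adds
 * neighbouring bit-fields in place (1-bit pairs, then 2-bit fields, then
 * nibbles), so after three steps every byte of the word holds the
 * popcount of that byte.  Stages for a single byte (hypothetical helper,
 * never-defined NEON_HELPER_EXAMPLES guard):
 */
#ifdef NEON_HELPER_EXAMPLES
static void example_check_cnt_u8(void)
{
    uint32_t x = 0xb5;                                  /* 10110101: 5 bits */
    x = (x & 0x55555555) + ((x >> 1) & 0x55555555);     /* 0x65: pair counts */
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);     /* 0x32: nibble counts */
    x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);     /* 0x05: byte count */
    assert(x == 5);
}
#endif /* NEON_HELPER_EXAMPLES */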

/* Reverse bits in each 8 bit word */
uint32_t HELPER(neon_rbit_u8)(uint32_t x)
{
    x =  ((x & 0xf0f0f0f0) >> 4)
       | ((x & 0x0f0f0f0f) << 4);
    x =  ((x & 0x88888888) >> 3)
       | ((x & 0x44444444) >> 1)
       | ((x & 0x22222222) << 1)
       | ((x & 0x11111111) << 3);
    return x;
}

#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32
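
/*
 * Editor's note (illustrative sketch, not part of the original file):
 * NEON_QDMULH16/32 implement the saturating doubling multiply returning
 * the high half.  The full-width product is doubled, which can only
 * overflow when both inputs are the most negative value; that case is
 * caught by (tmp ^ (tmp << 1)) & SIGNBIT (the top two bits differ) and
 * saturated.  The rounding variant then adds 1 << (esize - 1) before
 * taking the high half, again watching for overflow.  A 16-bit sanity
 * check of the non-saturating path (hypothetical helper, never-defined
 * NEON_HELPER_EXAMPLES guard):
 */
#ifdef NEON_HELPER_EXAMPLES
static void example_check_qdmulh16(void)
{
    /* 0x4000 * 0x4000 = 0x10000000; doubled = 0x20000000; high half 0x2000 */
    int32_t tmp = (int32_t)(int16_t)0x4000 * (int16_t)0x4000;
    assert((((uint32_t)(tmp << 1)) >> 16) == 0x2000);
    /* only 0x8000 * 0x8000 makes the doubling overflow and sets QC */
}
#endif /* NEON_HELPER_EXAMPLES */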

uint32_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

uint32_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}
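
/*
 * Editor's note (illustrative sketch, not part of the original file):
 * The *_round_high_* variants keep only the upper half of each lane after
 * adding a rounding constant at the bit just below the kept half.  Masking
 * with 0x..80 first clears the bits below the rounding bit, so a carry
 * that overflows one lane lands in the cleared low bits of the next lane
 * and cannot ripple up into the half that is extracted.  One 16-to-8-bit
 * lane, concretely (hypothetical helper, never-defined
 * NEON_HELPER_EXAMPLES guard):
 */
#ifdef NEON_HELPER_EXAMPLES
static void example_check_round_narrow(void)
{
    /* 0x1280 rounds up to high byte 0x13, 0x127f rounds down to 0x12 */
    assert(((((0x1280u & 0xff80u) + 0x0080u) >> 8) & 0xff) == 0x13);
    assert(((((0x127fu & 0xff80u) + 0x0080u) >> 8) & 0xff) == 0x12);
}
#endif /* NEON_HELPER_EXAMPLES */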

uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else  { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}

uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}

uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return    ( tmp         & 0xffff)
            | ((tmp  >> 16) & 0xffff0000ull)
            | ((tmp2 << 16) & 0xffff00000000ull)
            | ( tmp2        & 0xffff000000000000ull);
}

uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}

uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}

uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}

/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)
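
/*
 * Editor's note (illustrative sketch, not part of the original file):
 * The comment above is why DO_ABD takes both an input type and a wider
 * arithmetic type.  With 32-bit signed inputs such as INT32_MAX and
 * INT32_MIN the true absolute difference is 0xffffffff, which does not
 * fit in int32_t; promoting to int64_t first keeps the subtraction exact.
 * (Hypothetical helper under the never-defined NEON_HELPER_EXAMPLES
 * guard.)
 */
#ifdef NEON_HELPER_EXAMPLES
static void example_check_abd_widening(void)
{
    int64_t x = (int32_t)0x7fffffff;    /* INT32_MAX */
    int64_t y = (int32_t)0x80000000;    /* INT32_MIN */
    uint64_t d = (x > y) ? x - y : y - x;

    assert(d == 0xffffffffull);         /* would overflow in 32-bit math */
}
#endif /* NEON_HELPER_EXAMPLES */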

uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD

/* Widening multiply. Named type is the source type.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}

/* Saturating sign manipulation.  */
/* ??? Make these use NEON_VOP1 */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16

uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if ((int32_t)x < 0) {
        x = -x;
    }
    return x;
}

uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else if ((int64_t)x < 0) {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else {
        x = -x;
    }
    return x;
}

/* NEON Float helpers.  */

/* Floating point comparisons produce an integer result.
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
}

uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_le(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_lt(make_float32(b), make_float32(a), fpst);
}
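
/*
 * Editor's note (illustrative sketch, not part of the original file):
 * As the comment above notes, the softfloat predicates return 0 or 1
 * while Neon comparisons must produce an all-zeros or all-ones lane, so
 * the result is simply negated.  GE and GT are built from the
 * swapped-operand LE/LT predicates so that quiet NaNs signal InvalidOp
 * as the architecture requires, while EQ uses the quiet comparison.
 * The mask convention in isolation (hypothetical helper, never-defined
 * NEON_HELPER_EXAMPLES guard):
 */
#ifdef NEON_HELPER_EXAMPLES
static void example_check_cmp_mask(void)
{
    assert((uint32_t)-(uint32_t)1 == 0xffffffffu);   /* "true" lane */
    assert((uint32_t)-(uint32_t)0 == 0x00000000u);   /* "false" lane */
}
#endif /* NEON_HELPER_EXAMPLES */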

uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_le(f1, f0, fpst);
}

uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_lt(f1, f0, fpst);
}

uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_le(f1, f0, fpst);
}

uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_lt(f1, f0, fpst);
}

#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))

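/*
 * Editor's note (illustrative sketch, not part of the original file):
 * ELEM(V, N, SIZE) extracts element N of width SIZE bits from the 64-bit
 * value V; the zip/unzip helpers below are systematic reshuffles of such
 * elements between the two input registers (interleave for zip,
 * de-interleave for unzip).  For example (hypothetical helper,
 * never-defined NEON_HELPER_EXAMPLES guard):
 */
#ifdef NEON_HELPER_EXAMPLES
static void example_check_elem(void)
{
    uint64_t v = 0x7766554433221100ull;

    assert(ELEM(v, 0, 8) == 0x00);      /* lowest byte  */
    assert(ELEM(v, 7, 8) == 0x77);      /* highest byte */
    assert(ELEM(v, 1, 16) == 0x3322);   /* second 16-bit element */
}
#endif /* NEON_HELPER_EXAMPLES */
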
void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_unzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_qzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_zip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_zip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}