xref: /openbmc/qemu/target/arm/tcg/neon_helper.c (revision 3e964275)
1 /*
2  * ARM NEON vector operations.
3  *
4  * Copyright (c) 2007, 2008 CodeSourcery.
5  * Written by Paul Brook
6  *
7  * This code is licensed under the GNU GPL v2.
8  */
9 #include "qemu/osdep.h"
10 
11 #include "cpu.h"
12 #include "exec/helper-proto.h"
13 #include "fpu/softfloat.h"
14 #include "vec_internal.h"
15 
/*
 * Sign-bit masks for 32-bit and 64-bit values.  Both expansions are fully
 * parenthesized so that they behave as a single operand wherever they are
 * used (for example as the operand of sizeof or next to a postfix operator).
 */
#define SIGNBIT ((uint32_t)0x80000000)
#define SIGNBIT64 ((uint64_t)1 << 63)

/*
 * Set env->vfp.qc[0] to 1.  The macro relies on a variable named "env"
 * being in scope at the point of use and expands to a single parenthesized
 * expression, so "SET_QC();" remains a complete statement.
 */
#define SET_QC() (env->vfp.qc[0] = 1)
20 
/*
 * Lane-view structure types used to pick a 32-bit value apart:
 *   neon_s8 / neon_u8    four  8-bit members v1..v4
 *   neon_s16 / neon_u16  two  16-bit members v1, v2
 *   neon_s32 / neon_u32  one  32-bit member  v1
 * When HOST_BIG_ENDIAN is non-zero the members are declared in the
 * opposite order (v4 first), otherwise v1 is declared first.
 */
#if HOST_BIG_ENDIAN
#define NEON_LANES2(type) type v2; type v1;
#define NEON_LANES4(type) type v4; type v3; type v2; type v1;
#else
#define NEON_LANES2(type) type v1; type v2;
#define NEON_LANES4(type) type v1; type v2; type v3; type v4;
#endif
#define NEON_LANES1(type) type v1;

typedef struct { NEON_LANES4(int8_t) } neon_s8;
typedef struct { NEON_LANES4(uint8_t) } neon_u8;
typedef struct { NEON_LANES2(int16_t) } neon_s16;
typedef struct { NEON_LANES2(uint16_t) } neon_u16;
typedef struct { NEON_LANES1(int32_t) } neon_s32;
typedef struct { NEON_LANES1(uint32_t) } neon_u32;

/* The lane-list macros are only needed for the typedefs above. */
#undef NEON_LANES4
#undef NEON_LANES2
#undef NEON_LANES1
67 
/*
 * NEON_UNPACK: store the uint32_t VAL into DEST, reinterpreted as a lane
 * structure of type VTYPE.  The reinterpretation goes through a local
 * union, so the bit pattern is copied unchanged.
 */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        uint32_t word; \
        vtype lanes; \
    } neon_pun_; \
    neon_pun_.word = (val); \
    dest = neon_pun_.lanes; \
    } while (0)

/*
 * NEON_PACK: the inverse of NEON_UNPACK.  Store the lane structure VAL
 * (of type VTYPE) into the uint32_t DEST, again via a local union.
 */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        uint32_t word; \
        vtype lanes; \
    } neon_pun_; \
    neon_pun_.lanes = (val); \
    dest = neon_pun_.word; \
    } while (0)
87 
/*
 * NEON_DOn: apply the currently defined NEON_FN(dest, src1, src2) to the
 * first n lanes of the local variables vsrc1 and vsrc2, writing the
 * matching lanes of vdest.  Lanes are handled in the order v1, v2, v3, v4;
 * the wider variants are built from the narrower ones.
 */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_DO1 \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_DO2 \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
98 
/*
 * Function body shared by the two-operand helpers.  The parameters arg1
 * and arg2 are unpacked into lane structures of type vtype, NEON_FN is
 * applied to each of the n lanes through NEON_DO<n>, and the resulting
 * lanes are packed back into the uint32_t return value.
 */
#define NEON_VOP_BODY(vtype, n) \
{ \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    uint32_t res; \
    \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Define helper neon_<name> taking only the two packed operands. */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, \
                                  uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/*
 * Same as NEON_VOP, but the helper also receives env, so that NEON_FN
 * may refer to it (for example through SET_QC()).
 */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, \
                                  uint32_t arg1, \
                                  uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
119 
/* Pairwise operations.  */
/*
 * For 32-bit elements each segment only contains a single element, so
 * the elementwise and pairwise operations are the same.
 *
 * NEON_PDOn applies NEON_FN to adjacent lane pairs: the low lanes of
 * vdest are produced from the pairs of vsrc1 and the high lanes from the
 * pairs of vsrc2.  The last line of each macro deliberately has no
 * line-continuation, so the definition ends where it appears to end.
 */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4);
131 
/*
 * Define pairwise helper neon_<name>: unpack arg1 and arg2 into lane
 * structures of type vtype, combine adjacent lanes with NEON_PDO<n> and
 * return the packed result.
 */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, \
                                  uint32_t arg2) \
{ \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    uint32_t res; \
    \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}
145 
/*
 * Unary operators.
 *
 * Define helper neon_<name> taking one packed operand.  NEON_DO<n> still
 * passes vsrc2.vN as the third NEON_FN argument although no vsrc2 exists
 * here, so the NEON_FN used with this macro must ignore that argument.
 */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vdest; \
    vtype vsrc1; \
    \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    /* The parameter is reused to carry the packed result. */ \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}
157 
158 
159 #define NEON_USAT(dest, src1, src2, type) do { \
160     uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
161     if (tmp != (type)tmp) { \
162         SET_QC(); \
163         dest = ~0; \
164     } else { \
165         dest = tmp; \
166     }} while(0)
167 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
168 NEON_VOP_ENV(qadd_u8, neon_u8, 4)
169 #undef NEON_FN
170 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
171 NEON_VOP_ENV(qadd_u16, neon_u16, 2)
172 #undef NEON_FN
173 #undef NEON_USAT
174 
175 uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
176 {
177     uint32_t res = a + b;
178     if (res < a) {
179         SET_QC();
180         res = ~0;
181     }
182     return res;
183 }
184 
185 uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
186 {
187     uint64_t res;
188 
189     res = src1 + src2;
190     if (res < src1) {
191         SET_QC();
192         res = ~(uint64_t)0;
193     }
194     return res;
195 }
196 
197 #define NEON_SSAT(dest, src1, src2, type) do { \
198     int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
199     if (tmp != (type)tmp) { \
200         SET_QC(); \
201         if (src2 > 0) { \
202             tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
203         } else { \
204             tmp = 1 << (sizeof(type) * 8 - 1); \
205         } \
206     } \
207     dest = tmp; \
208     } while(0)
209 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
210 NEON_VOP_ENV(qadd_s8, neon_s8, 4)
211 #undef NEON_FN
212 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
213 NEON_VOP_ENV(qadd_s16, neon_s16, 2)
214 #undef NEON_FN
215 #undef NEON_SSAT
216 
217 uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
218 {
219     uint32_t res = a + b;
220     if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
221         SET_QC();
222         res = ~(((int32_t)a >> 31) ^ SIGNBIT);
223     }
224     return res;
225 }
226 
227 uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
228 {
229     uint64_t res;
230 
231     res = src1 + src2;
232     if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
233         SET_QC();
234         res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
235     }
236     return res;
237 }
238 
239 /* Unsigned saturating accumulate of signed value
240  *
241  * Op1/Rn is treated as signed
242  * Op2/Rd is treated as unsigned
243  *
244  * Explicit casting is used to ensure the correct sign extension of
245  * inputs. The result is treated as a unsigned value and saturated as such.
246  *
247  * We use a macro for the 8/16 bit cases which expects signed integers of va,
248  * vb, and vr for interim calculation and an unsigned 32 bit result value r.
249  */
250 
251 #define USATACC(bits, shift) \
252     do { \
253         va = sextract32(a, shift, bits);                                \
254         vb = extract32(b, shift, bits);                                 \
255         vr = va + vb;                                                   \
256         if (vr > UINT##bits##_MAX) {                                    \
257             SET_QC();                                                   \
258             vr = UINT##bits##_MAX;                                      \
259         } else if (vr < 0) {                                            \
260             SET_QC();                                                   \
261             vr = 0;                                                     \
262         }                                                               \
263         r = deposit32(r, shift, bits, vr);                              \
264    } while (0)
265 
266 uint32_t HELPER(neon_uqadd_s8)(CPUARMState *env, uint32_t a, uint32_t b)
267 {
268     int16_t va, vb, vr;
269     uint32_t r = 0;
270 
271     USATACC(8, 0);
272     USATACC(8, 8);
273     USATACC(8, 16);
274     USATACC(8, 24);
275     return r;
276 }
277 
278 uint32_t HELPER(neon_uqadd_s16)(CPUARMState *env, uint32_t a, uint32_t b)
279 {
280     int32_t va, vb, vr;
281     uint64_t r = 0;
282 
283     USATACC(16, 0);
284     USATACC(16, 16);
285     return r;
286 }
287 
288 #undef USATACC
289 
290 uint32_t HELPER(neon_uqadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
291 {
292     int64_t va = (int32_t)a;
293     int64_t vb = (uint32_t)b;
294     int64_t vr = va + vb;
295     if (vr > UINT32_MAX) {
296         SET_QC();
297         vr = UINT32_MAX;
298     } else if (vr < 0) {
299         SET_QC();
300         vr = 0;
301     }
302     return vr;
303 }
304 
305 uint64_t HELPER(neon_uqadd_s64)(CPUARMState *env, uint64_t a, uint64_t b)
306 {
307     uint64_t res;
308     res = a + b;
309     /* We only need to look at the pattern of SIGN bits to detect
310      * +ve/-ve saturation
311      */
312     if (~a & b & ~res & SIGNBIT64) {
313         SET_QC();
314         res = UINT64_MAX;
315     } else if (a & ~b & res & SIGNBIT64) {
316         SET_QC();
317         res = 0;
318     }
319     return res;
320 }
321 
322 /* Signed saturating accumulate of unsigned value
323  *
324  * Op1/Rn is treated as unsigned
325  * Op2/Rd is treated as signed
326  *
327  * The result is treated as a signed value and saturated as such
328  *
329  * We use a macro for the 8/16 bit cases which expects signed integers of va,
330  * vb, and vr for interim calculation and an unsigned 32 bit result value r.
331  */
332 
333 #define SSATACC(bits, shift) \
334     do { \
335         va = extract32(a, shift, bits);                                 \
336         vb = sextract32(b, shift, bits);                                \
337         vr = va + vb;                                                   \
338         if (vr > INT##bits##_MAX) {                                     \
339             SET_QC();                                                   \
340             vr = INT##bits##_MAX;                                       \
341         } else if (vr < INT##bits##_MIN) {                              \
342             SET_QC();                                                   \
343             vr = INT##bits##_MIN;                                       \
344         }                                                               \
345         r = deposit32(r, shift, bits, vr);                              \
346     } while (0)
347 
348 uint32_t HELPER(neon_sqadd_u8)(CPUARMState *env, uint32_t a, uint32_t b)
349 {
350     int16_t va, vb, vr;
351     uint32_t r = 0;
352 
353     SSATACC(8, 0);
354     SSATACC(8, 8);
355     SSATACC(8, 16);
356     SSATACC(8, 24);
357     return r;
358 }
359 
360 uint32_t HELPER(neon_sqadd_u16)(CPUARMState *env, uint32_t a, uint32_t b)
361 {
362     int32_t va, vb, vr;
363     uint32_t r = 0;
364 
365     SSATACC(16, 0);
366     SSATACC(16, 16);
367 
368     return r;
369 }
370 
371 #undef SSATACC
372 
373 uint32_t HELPER(neon_sqadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
374 {
375     int64_t res;
376     int64_t op1 = (uint32_t)a;
377     int64_t op2 = (int32_t)b;
378     res = op1 + op2;
379     if (res > INT32_MAX) {
380         SET_QC();
381         res = INT32_MAX;
382     } else if (res < INT32_MIN) {
383         SET_QC();
384         res = INT32_MIN;
385     }
386     return res;
387 }
388 
389 uint64_t HELPER(neon_sqadd_u64)(CPUARMState *env, uint64_t a, uint64_t b)
390 {
391     uint64_t res;
392     res = a + b;
393     /* We only need to look at the pattern of SIGN bits to detect an overflow */
394     if (((a & res)
395          | (~b & res)
396          | (a & ~b)) & SIGNBIT64) {
397         SET_QC();
398         res = INT64_MAX;
399     }
400     return res;
401 }
402 
403 
404 #define NEON_USAT(dest, src1, src2, type) do { \
405     uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
406     if (tmp != (type)tmp) { \
407         SET_QC(); \
408         dest = 0; \
409     } else { \
410         dest = tmp; \
411     }} while(0)
412 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
413 NEON_VOP_ENV(qsub_u8, neon_u8, 4)
414 #undef NEON_FN
415 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
416 NEON_VOP_ENV(qsub_u16, neon_u16, 2)
417 #undef NEON_FN
418 #undef NEON_USAT
419 
420 uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b)
421 {
422     uint32_t res = a - b;
423     if (res > a) {
424         SET_QC();
425         res = 0;
426     }
427     return res;
428 }
429 
430 uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
431 {
432     uint64_t res;
433 
434     if (src1 < src2) {
435         SET_QC();
436         res = 0;
437     } else {
438         res = src1 - src2;
439     }
440     return res;
441 }
442 
443 #define NEON_SSAT(dest, src1, src2, type) do { \
444     int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
445     if (tmp != (type)tmp) { \
446         SET_QC(); \
447         if (src2 < 0) { \
448             tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
449         } else { \
450             tmp = 1 << (sizeof(type) * 8 - 1); \
451         } \
452     } \
453     dest = tmp; \
454     } while(0)
455 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
456 NEON_VOP_ENV(qsub_s8, neon_s8, 4)
457 #undef NEON_FN
458 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
459 NEON_VOP_ENV(qsub_s16, neon_s16, 2)
460 #undef NEON_FN
461 #undef NEON_SSAT
462 
463 uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b)
464 {
465     uint32_t res = a - b;
466     if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
467         SET_QC();
468         res = ~(((int32_t)a >> 31) ^ SIGNBIT);
469     }
470     return res;
471 }
472 
473 uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
474 {
475     uint64_t res;
476 
477     res = src1 - src2;
478     if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
479         SET_QC();
480         res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
481     }
482     return res;
483 }
484 
485 #define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
486 NEON_VOP(hadd_s8, neon_s8, 4)
487 NEON_VOP(hadd_u8, neon_u8, 4)
488 NEON_VOP(hadd_s16, neon_s16, 2)
489 NEON_VOP(hadd_u16, neon_u16, 2)
490 #undef NEON_FN
491 
492 int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
493 {
494     int32_t dest;
495 
496     dest = (src1 >> 1) + (src2 >> 1);
497     if (src1 & src2 & 1)
498         dest++;
499     return dest;
500 }
501 
502 uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
503 {
504     uint32_t dest;
505 
506     dest = (src1 >> 1) + (src2 >> 1);
507     if (src1 & src2 & 1)
508         dest++;
509     return dest;
510 }
511 
512 #define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
513 NEON_VOP(rhadd_s8, neon_s8, 4)
514 NEON_VOP(rhadd_u8, neon_u8, 4)
515 NEON_VOP(rhadd_s16, neon_s16, 2)
516 NEON_VOP(rhadd_u16, neon_u16, 2)
517 #undef NEON_FN
518 
519 int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
520 {
521     int32_t dest;
522 
523     dest = (src1 >> 1) + (src2 >> 1);
524     if ((src1 | src2) & 1)
525         dest++;
526     return dest;
527 }
528 
529 uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
530 {
531     uint32_t dest;
532 
533     dest = (src1 >> 1) + (src2 >> 1);
534     if ((src1 | src2) & 1)
535         dest++;
536     return dest;
537 }
538 
539 #define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
540 NEON_VOP(hsub_s8, neon_s8, 4)
541 NEON_VOP(hsub_u8, neon_u8, 4)
542 NEON_VOP(hsub_s16, neon_s16, 2)
543 NEON_VOP(hsub_u16, neon_u16, 2)
544 #undef NEON_FN
545 
546 int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
547 {
548     int32_t dest;
549 
550     dest = (src1 >> 1) - (src2 >> 1);
551     if ((~src1) & src2 & 1)
552         dest--;
553     return dest;
554 }
555 
556 uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
557 {
558     uint32_t dest;
559 
560     dest = (src1 >> 1) - (src2 >> 1);
561     if ((~src1) & src2 & 1)
562         dest--;
563     return dest;
564 }
565 
566 #define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
567 NEON_POP(pmin_s8, neon_s8, 4)
568 NEON_POP(pmin_u8, neon_u8, 4)
569 NEON_POP(pmin_s16, neon_s16, 2)
570 NEON_POP(pmin_u16, neon_u16, 2)
571 #undef NEON_FN
572 
573 #define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
574 NEON_POP(pmax_s8, neon_s8, 4)
575 NEON_POP(pmax_u8, neon_u8, 4)
576 NEON_POP(pmax_s16, neon_s16, 2)
577 NEON_POP(pmax_u16, neon_u16, 2)
578 #undef NEON_FN
579 
580 #define NEON_FN(dest, src1, src2) \
581     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
582 NEON_VOP(shl_u16, neon_u16, 2)
583 #undef NEON_FN
584 
585 #define NEON_FN(dest, src1, src2) \
586     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
587 NEON_VOP(shl_s16, neon_s16, 2)
588 #undef NEON_FN
589 
590 #define NEON_FN(dest, src1, src2) \
591     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
592 NEON_VOP(rshl_s8, neon_s8, 4)
593 #undef NEON_FN
594 
595 #define NEON_FN(dest, src1, src2) \
596     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
597 NEON_VOP(rshl_s16, neon_s16, 2)
598 #undef NEON_FN
599 
600 uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
601 {
602     return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
603 }
604 
605 uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
606 {
607     return do_sqrshl_d(val, (int8_t)shift, true, NULL);
608 }
609 
610 #define NEON_FN(dest, src1, src2) \
611     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
612 NEON_VOP(rshl_u8, neon_u8, 4)
613 #undef NEON_FN
614 
615 #define NEON_FN(dest, src1, src2) \
616     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
617 NEON_VOP(rshl_u16, neon_u16, 2)
618 #undef NEON_FN
619 
620 uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
621 {
622     return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
623 }
624 
625 uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
626 {
627     return do_uqrshl_d(val, (int8_t)shift, true, NULL);
628 }
629 
630 #define NEON_FN(dest, src1, src2) \
631     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
632 NEON_VOP_ENV(qshl_u8, neon_u8, 4)
633 #undef NEON_FN
634 
635 #define NEON_FN(dest, src1, src2) \
636     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
637 NEON_VOP_ENV(qshl_u16, neon_u16, 2)
638 #undef NEON_FN
639 
640 uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
641 {
642     return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
643 }
644 
645 uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
646 {
647     return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
648 }
649 
650 #define NEON_FN(dest, src1, src2) \
651     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
652 NEON_VOP_ENV(qshl_s8, neon_s8, 4)
653 #undef NEON_FN
654 
655 #define NEON_FN(dest, src1, src2) \
656     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
657 NEON_VOP_ENV(qshl_s16, neon_s16, 2)
658 #undef NEON_FN
659 
660 uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
661 {
662     return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
663 }
664 
665 uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
666 {
667     return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
668 }
669 
670 #define NEON_FN(dest, src1, src2) \
671     (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
672 NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
673 #undef NEON_FN
674 
675 #define NEON_FN(dest, src1, src2) \
676     (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
677 NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
678 #undef NEON_FN
679 
680 uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
681 {
682     return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
683 }
684 
685 uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
686 {
687     return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
688 }
689 
690 #define NEON_FN(dest, src1, src2) \
691     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
692 NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
693 #undef NEON_FN
694 
695 #define NEON_FN(dest, src1, src2) \
696     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
697 NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
698 #undef NEON_FN
699 
700 uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
701 {
702     return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
703 }
704 
705 uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
706 {
707     return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
708 }
709 
710 #define NEON_FN(dest, src1, src2) \
711     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
712 NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
713 #undef NEON_FN
714 
715 #define NEON_FN(dest, src1, src2) \
716     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
717 NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
718 #undef NEON_FN
719 
720 uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
721 {
722     return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
723 }
724 
725 uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
726 {
727     return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
728 }
729 
730 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
731 {
732     uint32_t mask;
733     mask = (a ^ b) & 0x80808080u;
734     a &= ~0x80808080u;
735     b &= ~0x80808080u;
736     return (a + b) ^ mask;
737 }
738 
739 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
740 {
741     uint32_t mask;
742     mask = (a ^ b) & 0x80008000u;
743     a &= ~0x80008000u;
744     b &= ~0x80008000u;
745     return (a + b) ^ mask;
746 }
747 
748 #define NEON_FN(dest, src1, src2) dest = src1 - src2
749 NEON_VOP(sub_u8, neon_u8, 4)
750 NEON_VOP(sub_u16, neon_u16, 2)
751 #undef NEON_FN
752 
753 #define NEON_FN(dest, src1, src2) dest = src1 * src2
754 NEON_VOP(mul_u8, neon_u8, 4)
755 NEON_VOP(mul_u16, neon_u16, 2)
756 #undef NEON_FN
757 
758 #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
759 NEON_VOP(tst_u8, neon_u8, 4)
760 NEON_VOP(tst_u16, neon_u16, 2)
761 NEON_VOP(tst_u32, neon_u32, 1)
762 #undef NEON_FN
763 
/* Count Leading Sign/Zero Bits.  */

/* Number of leading zero bits in an 8-bit value; 8 when x is zero. */
static inline int do_clz8(uint8_t x)
{
    int zeros = 8;

    /* Every significant bit shifted out removes one leading zero. */
    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
772 
/* Number of leading zero bits in a 16-bit value; 16 when x is zero. */
static inline int do_clz16(uint16_t x)
{
    int zeros = 16;

    /* Every significant bit shifted out removes one leading zero. */
    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
780 
781 #define NEON_FN(dest, src, dummy) dest = do_clz8(src)
782 NEON_VOP1(clz_u8, neon_u8, 4)
783 #undef NEON_FN
784 
785 #define NEON_FN(dest, src, dummy) dest = do_clz16(src)
786 NEON_VOP1(clz_u16, neon_u16, 2)
787 #undef NEON_FN
788 
789 #define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
790 NEON_VOP1(cls_s8, neon_s8, 4)
791 #undef NEON_FN
792 
793 #define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
794 NEON_VOP1(cls_s16, neon_s16, 2)
795 #undef NEON_FN
796 
797 uint32_t HELPER(neon_cls_s32)(uint32_t x)
798 {
799     int count;
800     if ((int32_t)x < 0)
801         x = ~x;
802     for (count = 32; x; count--)
803         x = x >> 1;
804     return count - 1;
805 }
806 
807 /* Bit count.  */
808 uint32_t HELPER(neon_cnt_u8)(uint32_t x)
809 {
810     x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
811     x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
812     x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
813     return x;
814 }
815 
816 /* Reverse bits in each 8 bit word */
817 uint32_t HELPER(neon_rbit_u8)(uint32_t x)
818 {
819     x =  ((x & 0xf0f0f0f0) >> 4)
820        | ((x & 0x0f0f0f0f) << 4);
821     x =  ((x & 0x88888888) >> 3)
822        | ((x & 0x44444444) >> 1)
823        | ((x & 0x22222222) << 1)
824        | ((x & 0x11111111) << 3);
825     return x;
826 }
827 
828 #define NEON_QDMULH16(dest, src1, src2, round) do { \
829     uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
830     if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
831         SET_QC(); \
832         tmp = (tmp >> 31) ^ ~SIGNBIT; \
833     } else { \
834         tmp <<= 1; \
835     } \
836     if (round) { \
837         int32_t old = tmp; \
838         tmp += 1 << 15; \
839         if ((int32_t)tmp < old) { \
840             SET_QC(); \
841             tmp = SIGNBIT - 1; \
842         } \
843     } \
844     dest = tmp >> 16; \
845     } while(0)
846 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
847 NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
848 #undef NEON_FN
849 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
850 NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
851 #undef NEON_FN
852 #undef NEON_QDMULH16
853 
854 #define NEON_QDMULH32(dest, src1, src2, round) do { \
855     uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
856     if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
857         SET_QC(); \
858         tmp = (tmp >> 63) ^ ~SIGNBIT64; \
859     } else { \
860         tmp <<= 1; \
861     } \
862     if (round) { \
863         int64_t old = tmp; \
864         tmp += (int64_t)1 << 31; \
865         if ((int64_t)tmp < old) { \
866             SET_QC(); \
867             tmp = SIGNBIT64 - 1; \
868         } \
869     } \
870     dest = tmp >> 32; \
871     } while(0)
872 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
873 NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
874 #undef NEON_FN
875 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
876 NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
877 #undef NEON_FN
878 #undef NEON_QDMULH32
879 
880 uint32_t HELPER(neon_narrow_u8)(uint64_t x)
881 {
882     return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
883            | ((x >> 24) & 0xff000000u);
884 }
885 
886 uint32_t HELPER(neon_narrow_u16)(uint64_t x)
887 {
888     return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
889 }
890 
891 uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
892 {
893     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
894             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
895 }
896 
897 uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
898 {
899     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
900 }
901 
902 uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
903 {
904     x &= 0xff80ff80ff80ff80ull;
905     x += 0x0080008000800080ull;
906     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
907             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
908 }
909 
910 uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
911 {
912     x &= 0xffff8000ffff8000ull;
913     x += 0x0000800000008000ull;
914     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
915 }
916 
917 uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
918 {
919     uint16_t s;
920     uint8_t d;
921     uint32_t res = 0;
922 #define SAT8(n) \
923     s = x >> n; \
924     if (s & 0x8000) { \
925         SET_QC(); \
926     } else { \
927         if (s > 0xff) { \
928             d = 0xff; \
929             SET_QC(); \
930         } else  { \
931             d = s; \
932         } \
933         res |= (uint32_t)d << (n / 2); \
934     }
935 
936     SAT8(0);
937     SAT8(16);
938     SAT8(32);
939     SAT8(48);
940 #undef SAT8
941     return res;
942 }
943 
944 uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
945 {
946     uint16_t s;
947     uint8_t d;
948     uint32_t res = 0;
949 #define SAT8(n) \
950     s = x >> n; \
951     if (s > 0xff) { \
952         d = 0xff; \
953         SET_QC(); \
954     } else  { \
955         d = s; \
956     } \
957     res |= (uint32_t)d << (n / 2);
958 
959     SAT8(0);
960     SAT8(16);
961     SAT8(32);
962     SAT8(48);
963 #undef SAT8
964     return res;
965 }
966 
967 uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
968 {
969     int16_t s;
970     uint8_t d;
971     uint32_t res = 0;
972 #define SAT8(n) \
973     s = x >> n; \
974     if (s != (int8_t)s) { \
975         d = (s >> 15) ^ 0x7f; \
976         SET_QC(); \
977     } else  { \
978         d = s; \
979     } \
980     res |= (uint32_t)d << (n / 2);
981 
982     SAT8(0);
983     SAT8(16);
984     SAT8(32);
985     SAT8(48);
986 #undef SAT8
987     return res;
988 }
989 
990 uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
991 {
992     uint32_t high;
993     uint32_t low;
994     low = x;
995     if (low & 0x80000000) {
996         low = 0;
997         SET_QC();
998     } else if (low > 0xffff) {
999         low = 0xffff;
1000         SET_QC();
1001     }
1002     high = x >> 32;
1003     if (high & 0x80000000) {
1004         high = 0;
1005         SET_QC();
1006     } else if (high > 0xffff) {
1007         high = 0xffff;
1008         SET_QC();
1009     }
1010     return low | (high << 16);
1011 }
1012 
1013 uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
1014 {
1015     uint32_t high;
1016     uint32_t low;
1017     low = x;
1018     if (low > 0xffff) {
1019         low = 0xffff;
1020         SET_QC();
1021     }
1022     high = x >> 32;
1023     if (high > 0xffff) {
1024         high = 0xffff;
1025         SET_QC();
1026     }
1027     return low | (high << 16);
1028 }
1029 
1030 uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
1031 {
1032     int32_t low;
1033     int32_t high;
1034     low = x;
1035     if (low != (int16_t)low) {
1036         low = (low >> 31) ^ 0x7fff;
1037         SET_QC();
1038     }
1039     high = x >> 32;
1040     if (high != (int16_t)high) {
1041         high = (high >> 31) ^ 0x7fff;
1042         SET_QC();
1043     }
1044     return (uint16_t)low | (high << 16);
1045 }
1046 
1047 uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
1048 {
1049     if (x & 0x8000000000000000ull) {
1050         SET_QC();
1051         return 0;
1052     }
1053     if (x > 0xffffffffu) {
1054         SET_QC();
1055         return 0xffffffffu;
1056     }
1057     return x;
1058 }
1059 
1060 uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
1061 {
1062     if (x > 0xffffffffu) {
1063         SET_QC();
1064         return 0xffffffffu;
1065     }
1066     return x;
1067 }
1068 
1069 uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
1070 {
1071     if ((int64_t)x != (int32_t)x) {
1072         SET_QC();
1073         return ((int64_t)x >> 63) ^ 0x7fffffff;
1074     }
1075     return x;
1076 }
1077 
1078 uint64_t HELPER(neon_widen_u8)(uint32_t x)
1079 {
1080     uint64_t tmp;
1081     uint64_t ret;
1082     ret = (uint8_t)x;
1083     tmp = (uint8_t)(x >> 8);
1084     ret |= tmp << 16;
1085     tmp = (uint8_t)(x >> 16);
1086     ret |= tmp << 32;
1087     tmp = (uint8_t)(x >> 24);
1088     ret |= tmp << 48;
1089     return ret;
1090 }
1091 
1092 uint64_t HELPER(neon_widen_s8)(uint32_t x)
1093 {
1094     uint64_t tmp;
1095     uint64_t ret;
1096     ret = (uint16_t)(int8_t)x;
1097     tmp = (uint16_t)(int8_t)(x >> 8);
1098     ret |= tmp << 16;
1099     tmp = (uint16_t)(int8_t)(x >> 16);
1100     ret |= tmp << 32;
1101     tmp = (uint16_t)(int8_t)(x >> 24);
1102     ret |= tmp << 48;
1103     return ret;
1104 }
1105 
1106 uint64_t HELPER(neon_widen_u16)(uint32_t x)
1107 {
1108     uint64_t high = (uint16_t)(x >> 16);
1109     return ((uint16_t)x) | (high << 32);
1110 }
1111 
/*
 * Sign-extend the two 16-bit lanes of x into the two 32-bit lanes of
 * the 64-bit result.
 */
uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    /* Each lane is sign-extended to 32 bits and kept as a 32-bit pattern. */
    uint64_t lo = (uint32_t)(int16_t)x;
    uint64_t hi = (uint32_t)(int16_t)(x >> 16);

    return lo | (hi << 32);
}
1117 
/*
 * Add four packed 16-bit lanes of a and b with wrap-around per lane.
 * The top bit of each lane is cleared before the 64-bit add so no carry
 * can cross a lane boundary, then the correct top bits are put back
 * with an XOR.
 */
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    const uint64_t lane_msb = 0x8000800080008000ull;
    uint64_t msb_sum = (a ^ b) & lane_msb;
    uint64_t low_sum = (a & ~lane_msb) + (b & ~lane_msb);

    return low_sum ^ msb_sum;
}
1126 
/*
 * Add two packed 32-bit lanes of a and b with wrap-around per lane.
 * The top bit of each lane is cleared before the 64-bit add so no carry
 * can cross the lane boundary, then the correct top bits are put back
 * with an XOR.
 */
uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    const uint64_t lane_msb = 0x8000000080000000ull;
    uint64_t msb_sum = (a ^ b) & lane_msb;
    uint64_t low_sum = (a & ~lane_msb) + (b & ~lane_msb);

    return low_sum ^ msb_sum;
}
1135 
/*
 * Pairwise add of 16-bit lanes with wrap-around.  Adjacent lanes of a
 * are summed into result lanes 0 and 1, adjacent lanes of b into result
 * lanes 2 and 3.
 */
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    /* Truncating to 16 bits discards anything carried in from above. */
    uint64_t lane0 = (uint16_t)(a + (a >> 16));
    uint64_t lane1 = (uint16_t)((a >> 32) + (a >> 48));
    uint64_t lane2 = (uint16_t)(b + (b >> 16));
    uint64_t lane3 = (uint16_t)((b >> 32) + (b >> 48));

    return lane0 | (lane1 << 16) | (lane2 << 32) | (lane3 << 48);
}
1150 
/*
 * Pairwise add of 32-bit lanes with wrap-around.  The two halves of a
 * are summed into the low result lane, the two halves of b into the
 * high result lane.
 */
uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t sum_a = (uint32_t)a + (uint32_t)(a >> 32);
    uint32_t sum_b = (uint32_t)b + (uint32_t)(b >> 32);

    return ((uint64_t)sum_b << 32) + sum_a;
}
1157 
/*
 * Subtract four packed 16-bit lanes (a - b) with wrap-around per lane.
 * The top bit of every lane is forced to 1 in the minuend and 0 in the
 * subtrahend so no borrow can cross a lane boundary; the correct top
 * bits are then put back with an XOR.
 */
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    const uint64_t lane_msb = 0x8000800080008000ull;
    uint64_t fixup = (a ^ ~b) & lane_msb;
    uint64_t minuend = a | lane_msb;
    uint64_t subtrahend = b & ~lane_msb;

    return (minuend - subtrahend) ^ fixup;
}
1166 
/*
 * Subtract two packed 32-bit lanes (a - b) with wrap-around per lane.
 * The top bit of each lane is forced to 1 in the minuend and 0 in the
 * subtrahend so no borrow can cross the lane boundary; the correct top
 * bits are then put back with an XOR.
 */
uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    const uint64_t lane_msb = 0x8000000080000000ull;
    uint64_t fixup = (a ^ ~b) & lane_msb;
    uint64_t minuend = a | lane_msb;
    uint64_t subtrahend = b & ~lane_msb;

    return (minuend - subtrahend) ^ fixup;
}
1175 
1176 uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
1177 {
1178     uint32_t x, y;
1179     uint32_t low, high;
1180 
1181     x = a;
1182     y = b;
1183     low = x + y;
1184     if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
1185         SET_QC();
1186         low = ((int32_t)x >> 31) ^ ~SIGNBIT;
1187     }
1188     x = a >> 32;
1189     y = b >> 32;
1190     high = x + y;
1191     if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
1192         SET_QC();
1193         high = ((int32_t)x >> 31) ^ ~SIGNBIT;
1194     }
1195     return low | ((uint64_t)high << 32);
1196 }
1197 
1198 uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
1199 {
1200     uint64_t result;
1201 
1202     result = a + b;
1203     if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
1204         SET_QC();
1205         result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
1206     }
1207     return result;
1208 }
1209 
/*
 * Absolute difference of x and y, each first converted to intype.
 * The subtraction is carried out in arithtype, which must be wider
 * than intype: with a signed 32-bit input, for instance, the absolute
 * difference does not fit in a signed 32-bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype abd_lhs = (intype)(x);                          \
    arithtype abd_rhs = (intype)(y);                          \
    if (abd_lhs > abd_rhs) {                                  \
        dest = abd_lhs - abd_rhs;                             \
    } else {                                                  \
        dest = abd_rhs - abd_lhs;                             \
    }                                                         \
    } while (0)
1219 
/*
 * Unsigned absolute difference of the four 8-bit lanes of a and b,
 * each widened into a 16-bit lane of the 64-bit result.
 */
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t result = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        uint32_t lhs = (uint8_t)(a >> (8 * lane));
        uint32_t rhs = (uint8_t)(b >> (8 * lane));
        uint64_t diff = lhs > rhs ? lhs - rhs : rhs - lhs;

        result |= diff << (16 * lane);
    }
    return result;
}
1233 
/*
 * Signed absolute difference of the four 8-bit lanes of a and b, each
 * widened into a 16-bit lane of the 64-bit result.
 */
uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t result = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        int32_t lhs = (int8_t)(a >> (8 * lane));
        int32_t rhs = (int8_t)(b >> (8 * lane));
        /* At most 255, so it always fits the 16-bit result lane. */
        int32_t diff = lhs > rhs ? lhs - rhs : rhs - lhs;

        result |= (uint64_t)diff << (16 * lane);
    }
    return result;
}
1247 
/*
 * Unsigned absolute difference of the two 16-bit lanes of a and b,
 * each widened into a 32-bit lane of the 64-bit result.
 */
uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t result = 0;
    int lane;

    for (lane = 0; lane < 2; lane++) {
        uint32_t lhs = (uint16_t)(a >> (16 * lane));
        uint32_t rhs = (uint16_t)(b >> (16 * lane));
        uint64_t diff = lhs > rhs ? lhs - rhs : rhs - lhs;

        result |= diff << (32 * lane);
    }
    return result;
}
1256 
/*
 * Signed absolute difference of the two 16-bit lanes of a and b, each
 * widened into a 32-bit lane of the 64-bit result.
 */
uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t result = 0;
    int lane;

    for (lane = 0; lane < 2; lane++) {
        int32_t lhs = (int16_t)(a >> (16 * lane));
        int32_t rhs = (int16_t)(b >> (16 * lane));
        /* At most 65535, so it cannot overflow the 32-bit arithmetic. */
        int32_t diff = lhs > rhs ? lhs - rhs : rhs - lhs;

        result |= (uint64_t)diff << (32 * lane);
    }
    return result;
}
1265 
/*
 * Unsigned absolute difference of two 32-bit values, returned widened
 * to 64 bits.
 */
uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t lhs = a;
    uint64_t rhs = b;

    if (lhs > rhs) {
        return lhs - rhs;
    }
    return rhs - lhs;
}
1272 
/*
 * Signed absolute difference of two 32-bit values, returned widened to
 * 64 bits.  The subtraction is done in 64 bits because the difference
 * of two int32_t values can exceed the int32_t range.
 */
uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    int64_t lhs = (int32_t)a;
    int64_t rhs = (int32_t)b;

    if (lhs > rhs) {
        return lhs - rhs;
    }
    return rhs - lhs;
}
1279 #undef DO_ABD
1280 
/*
 * Widening multiply. Named type is the source type.
 *
 * x and y are first converted to type1 (the narrow source type), then
 * multiplied as type2 (the result type), and the product is truncated
 * to type2 before being stored in dest.
 *
 * The leading 1u forces the multiplication into unsigned arithmetic.
 * Without it a 16-bit type2 would be promoted to int, and a product
 * such as 0xffff * 0xffff would overflow a signed int.
 */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)(1u * (type2)tmp_x * (type2)tmp_y); \
    } while(0)
1287 
/*
 * Unsigned widening multiply: the four 8-bit lanes of a and b are
 * multiplied pairwise and each 16-bit product is stored in the
 * corresponding 16-bit lane of the 64-bit result.
 */
uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t result = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        uint8_t lhs = a >> (8 * lane);
        uint8_t rhs = b >> (8 * lane);
        /* At most 255 * 255, which fits in 16 bits. */
        uint64_t product = (uint16_t)(lhs * rhs);

        result |= product << (16 * lane);
    }
    return result;
}
1302 
/*
 * Signed widening multiply: the four 8-bit lanes of a and b are
 * sign-extended, multiplied pairwise, and the low 16 bits of each
 * product are stored in the corresponding 16-bit lane of the 64-bit
 * result.
 *
 * The multiplication is done directly on the sign-extended values in
 * int, where the product of two int8_t values always fits.  Converting
 * the operands to uint16_t first would promote 0xffff * 0xffff to a
 * signed int multiplication that overflows.
 */
uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t result = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        int lhs = (int8_t)(a >> (8 * lane));
        int rhs = (int8_t)(b >> (8 * lane));
        uint64_t product = (uint16_t)(lhs * rhs);

        result |= product << (16 * lane);
    }
    return result;
}
1317 
/*
 * Unsigned widening multiply: the two 16-bit lanes of a and b are
 * multiplied pairwise and each 32-bit product is stored in the
 * corresponding 32-bit lane of the 64-bit result.
 */
uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint32_t lo = (uint32_t)(uint16_t)a * (uint32_t)(uint16_t)b;
    uint32_t hi = (uint32_t)(uint16_t)(a >> 16) * (uint32_t)(uint16_t)(b >> 16);

    return lo | ((uint64_t)hi << 32);
}
1327 
/*
 * Signed widening multiply: the two 16-bit lanes of a and b are
 * sign-extended, multiplied pairwise, and each 32-bit product is stored
 * in the corresponding 32-bit lane of the 64-bit result.
 */
uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    /* The product of two int16_t values always fits in int32_t. */
    int32_t lo = (int32_t)(int16_t)a * (int32_t)(int16_t)b;
    int32_t hi = (int32_t)(int16_t)(a >> 16) * (int32_t)(int16_t)(b >> 16);

    return (uint32_t)lo | ((uint64_t)(uint32_t)hi << 32);
}
1337 
/*
 * Negate each of the four packed 16-bit lanes of x, with wrap-around
 * per lane.
 */
uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint64_t result = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        /* Only the low 16 bits of the negated value are kept. */
        uint16_t negated = -(x >> (16 * lane));

        result |= (uint64_t)negated << (16 * lane);
    }
    return result;
}
1351 
/*
 * Negate each of the two packed 32-bit lanes of x, with wrap-around
 * per lane.
 */
uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t neg_lo = 0u - (uint32_t)x;
    uint32_t neg_hi = 0u - (uint32_t)(x >> 32);

    return ((uint64_t)neg_hi << 32) | neg_lo;
}
1358 
1359 /* Saturating sign manipulation.  */
1360 /* ??? Make these use NEON_VOP1 */
1361 #define DO_QABS8(x) do { \
1362     if (x == (int8_t)0x80) { \
1363         x = 0x7f; \
1364         SET_QC(); \
1365     } else if (x < 0) { \
1366         x = -x; \
1367     }} while (0)
1368 uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
1369 {
1370     neon_s8 vec;
1371     NEON_UNPACK(neon_s8, vec, x);
1372     DO_QABS8(vec.v1);
1373     DO_QABS8(vec.v2);
1374     DO_QABS8(vec.v3);
1375     DO_QABS8(vec.v4);
1376     NEON_PACK(neon_s8, x, vec);
1377     return x;
1378 }
1379 #undef DO_QABS8
1380 
1381 #define DO_QNEG8(x) do { \
1382     if (x == (int8_t)0x80) { \
1383         x = 0x7f; \
1384         SET_QC(); \
1385     } else { \
1386         x = -x; \
1387     }} while (0)
1388 uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
1389 {
1390     neon_s8 vec;
1391     NEON_UNPACK(neon_s8, vec, x);
1392     DO_QNEG8(vec.v1);
1393     DO_QNEG8(vec.v2);
1394     DO_QNEG8(vec.v3);
1395     DO_QNEG8(vec.v4);
1396     NEON_PACK(neon_s8, x, vec);
1397     return x;
1398 }
1399 #undef DO_QNEG8
1400 
1401 #define DO_QABS16(x) do { \
1402     if (x == (int16_t)0x8000) { \
1403         x = 0x7fff; \
1404         SET_QC(); \
1405     } else if (x < 0) { \
1406         x = -x; \
1407     }} while (0)
1408 uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
1409 {
1410     neon_s16 vec;
1411     NEON_UNPACK(neon_s16, vec, x);
1412     DO_QABS16(vec.v1);
1413     DO_QABS16(vec.v2);
1414     NEON_PACK(neon_s16, x, vec);
1415     return x;
1416 }
1417 #undef DO_QABS16
1418 
1419 #define DO_QNEG16(x) do { \
1420     if (x == (int16_t)0x8000) { \
1421         x = 0x7fff; \
1422         SET_QC(); \
1423     } else { \
1424         x = -x; \
1425     }} while (0)
1426 uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
1427 {
1428     neon_s16 vec;
1429     NEON_UNPACK(neon_s16, vec, x);
1430     DO_QNEG16(vec.v1);
1431     DO_QNEG16(vec.v2);
1432     NEON_PACK(neon_s16, x, vec);
1433     return x;
1434 }
1435 #undef DO_QNEG16
1436 
1437 uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
1438 {
1439     if (x == SIGNBIT) {
1440         SET_QC();
1441         x = ~SIGNBIT;
1442     } else if ((int32_t)x < 0) {
1443         x = -x;
1444     }
1445     return x;
1446 }
1447 
1448 uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
1449 {
1450     if (x == SIGNBIT) {
1451         SET_QC();
1452         x = ~SIGNBIT;
1453     } else {
1454         x = -x;
1455     }
1456     return x;
1457 }
1458 
1459 uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
1460 {
1461     if (x == SIGNBIT64) {
1462         SET_QC();
1463         x = ~SIGNBIT64;
1464     } else if ((int64_t)x < 0) {
1465         x = -x;
1466     }
1467     return x;
1468 }
1469 
1470 uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
1471 {
1472     if (x == SIGNBIT64) {
1473         SET_QC();
1474         x = ~SIGNBIT64;
1475     } else {
1476         x = -x;
1477     }
1478     return x;
1479 }
1480 
1481 /* NEON Float helpers.  */
1482 
1483 /* Floating point comparisons produce an integer result.
1484  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1485  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1486  */
1487 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
1488 {
1489     float_status *fpst = fpstp;
1490     return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
1491 }
1492 
1493 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
1494 {
1495     float_status *fpst = fpstp;
1496     return -float32_le(make_float32(b), make_float32(a), fpst);
1497 }
1498 
1499 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1500 {
1501     float_status *fpst = fpstp;
1502     return -float32_lt(make_float32(b), make_float32(a), fpst);
1503 }
1504 
1505 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
1506 {
1507     float_status *fpst = fpstp;
1508     float32 f0 = float32_abs(make_float32(a));
1509     float32 f1 = float32_abs(make_float32(b));
1510     return -float32_le(f1, f0, fpst);
1511 }
1512 
1513 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1514 {
1515     float_status *fpst = fpstp;
1516     float32 f0 = float32_abs(make_float32(a));
1517     float32 f1 = float32_abs(make_float32(b));
1518     return -float32_lt(f1, f0, fpst);
1519 }
1520 
1521 uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
1522 {
1523     float_status *fpst = fpstp;
1524     float64 f0 = float64_abs(make_float64(a));
1525     float64 f1 = float64_abs(make_float64(b));
1526     return -float64_le(f1, f0, fpst);
1527 }
1528 
1529 uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
1530 {
1531     float_status *fpst = fpstp;
1532     float64 f0 = float64_abs(make_float64(a));
1533     float64 f1 = float64_abs(make_float64(b));
1534     return -float64_lt(f1, f0, fpst);
1535 }
1536 
/*
 * Extract element N, SIZE bits wide, from the integer V: shift V right
 * by N * SIZE bits and mask to the low SIZE bits.  The mask is built
 * from a 1ull shift, so SIZE must be less than 64.
 */
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
1538 
/*
 * De-interleave 8-bit elements of two 128-bit operands, each accessed
 * as two uint64_t words at vd and vm.  Even-numbered elements of the
 * vd words followed by those of the vm words are written back to vd;
 * the odd-numbered elements are written to vm in the same order.
 */
void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    /* Snapshot every input word before anything is written back. */
    const uint64_t src[4] = { rd[0], rd[1], rm[0], rm[1] };
    uint64_t even[2] = { 0, 0 };
    uint64_t odd[2] = { 0, 0 };
    int i;

    for (i = 0; i < 32; i++) {
        uint64_t elem = (src[i / 8] >> ((i % 8) * 8)) & 0xff;
        int slot = i / 2;           /* position in its output stream */
        uint64_t *out = (i & 1) ? odd : even;

        out[slot / 8] |= elem << ((slot % 8) * 8);
    }

    rm[0] = odd[0];
    rm[1] = odd[1];
    rd[0] = even[0];
    rd[1] = even[1];
}
1567 
/*
 * De-interleave 16-bit elements of two 128-bit operands, each accessed
 * as two uint64_t words at vd and vm.  Even-numbered elements of the
 * vd words followed by those of the vm words are written back to vd;
 * the odd-numbered elements are written to vm in the same order.
 */
void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    /* Snapshot every input word before anything is written back. */
    const uint64_t src[4] = { rd[0], rd[1], rm[0], rm[1] };
    uint64_t even[2] = { 0, 0 };
    uint64_t odd[2] = { 0, 0 };
    int i;

    for (i = 0; i < 16; i++) {
        uint64_t elem = (src[i / 4] >> ((i % 4) * 16)) & 0xffff;
        int slot = i / 2;           /* position in its output stream */
        uint64_t *out = (i & 1) ? odd : even;

        out[slot / 4] |= elem << ((slot % 4) * 16);
    }

    rm[0] = odd[0];
    rm[1] = odd[1];
    rd[0] = even[0];
    rd[1] = even[1];
}
1588 
/*
 * De-interleave 32-bit elements of two 128-bit operands, each accessed
 * as two uint64_t words at vd and vm.  Even-numbered elements of the
 * vd words followed by those of the vm words are written back to vd;
 * the odd-numbered elements are written to vm in the same order.
 */
void HELPER(neon_qunzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    /* Snapshot every input word before anything is written back. */
    const uint64_t src[4] = { rd[0], rd[1], rm[0], rm[1] };
    uint64_t even[2] = { 0, 0 };
    uint64_t odd[2] = { 0, 0 };
    int i;

    for (i = 0; i < 8; i++) {
        uint64_t elem = (src[i / 2] >> ((i % 2) * 32)) & 0xffffffffull;
        int slot = i / 2;           /* position in its output stream */
        uint64_t *out = (i & 1) ? odd : even;

        out[slot / 2] |= elem << ((slot % 2) * 32);
    }

    rm[0] = odd[0];
    rm[1] = odd[1];
    rd[0] = even[0];
    rd[1] = even[1];
}
1605 
/*
 * De-interleave 8-bit elements of two 64-bit operands (the first
 * uint64_t word at vd and at vm).  Even-numbered elements of the vd
 * word followed by those of the vm word are written back to vd; the
 * odd-numbered elements are written to vm in the same order.
 */
void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    /* Snapshot both input words before anything is written back. */
    const uint64_t src[2] = { rd[0], rm[0] };
    uint64_t even = 0;
    uint64_t odd = 0;
    int i;

    for (i = 0; i < 16; i++) {
        uint64_t elem = (src[i / 8] >> ((i % 8) * 8)) & 0xff;
        int shift = (i / 2) * 8;

        if (i & 1) {
            odd |= elem << shift;
        } else {
            even |= elem << shift;
        }
    }

    rm[0] = odd;
    rd[0] = even;
}
1623 
/*
 * De-interleave 16-bit elements of two 64-bit operands (the first
 * uint64_t word at vd and at vm).  Even-numbered elements of the vd
 * word followed by those of the vm word are written back to vd; the
 * odd-numbered elements are written to vm in the same order.
 */
void HELPER(neon_unzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    /* Snapshot both input words before anything is written back. */
    const uint64_t src[2] = { rd[0], rm[0] };
    uint64_t even = 0;
    uint64_t odd = 0;
    int i;

    for (i = 0; i < 8; i++) {
        uint64_t elem = (src[i / 4] >> ((i % 4) * 16)) & 0xffff;
        int shift = (i / 2) * 16;

        if (i & 1) {
            odd |= elem << shift;
        } else {
            even |= elem << shift;
        }
    }

    rm[0] = odd;
    rd[0] = even;
}
1637 
/*
 * Interleave 8-bit elements of two 128-bit operands, each accessed as
 * two uint64_t words at vd and vm.  Element k of vd and element k of vm
 * become elements 2k and 2k+1 of a 256-bit sequence; its low 128 bits
 * are written back to vd and its high 128 bits to vm.
 */
void HELPER(neon_qzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    /* Snapshot every input word before anything is written back. */
    const uint64_t from_d[2] = { rd[0], rd[1] };
    const uint64_t from_m[2] = { rm[0], rm[1] };
    uint64_t out[4] = { 0, 0, 0, 0 };
    int k;

    for (k = 0; k < 16; k++) {
        uint64_t d_elem = (from_d[k / 8] >> ((k % 8) * 8)) & 0xff;
        uint64_t m_elem = (from_m[k / 8] >> ((k % 8) * 8)) & 0xff;
        int pos = 2 * k;

        out[pos / 8] |= d_elem << ((pos % 8) * 8);
        out[(pos + 1) / 8] |= m_elem << (((pos + 1) % 8) * 8);
    }

    rm[0] = out[2];
    rm[1] = out[3];
    rd[0] = out[0];
    rd[1] = out[1];
}
1666 
/*
 * Interleave 16-bit elements of two 128-bit operands, each accessed as
 * two uint64_t words at vd and vm.  Element k of vd and element k of vm
 * become elements 2k and 2k+1 of a 256-bit sequence; its low 128 bits
 * are written back to vd and its high 128 bits to vm.
 */
void HELPER(neon_qzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    /* Snapshot every input word before anything is written back. */
    const uint64_t from_d[2] = { rd[0], rd[1] };
    const uint64_t from_m[2] = { rm[0], rm[1] };
    uint64_t out[4] = { 0, 0, 0, 0 };
    int k;

    for (k = 0; k < 8; k++) {
        uint64_t d_elem = (from_d[k / 4] >> ((k % 4) * 16)) & 0xffff;
        uint64_t m_elem = (from_m[k / 4] >> ((k % 4) * 16)) & 0xffff;
        int pos = 2 * k;

        out[pos / 4] |= d_elem << ((pos % 4) * 16);
        out[(pos + 1) / 4] |= m_elem << (((pos + 1) % 4) * 16);
    }

    rm[0] = out[2];
    rm[1] = out[3];
    rd[0] = out[0];
    rd[1] = out[1];
}
1687 
/*
 * Interleave 32-bit elements of two 128-bit operands, each accessed as
 * two uint64_t words at vd and vm.  Element k of vd and element k of vm
 * become elements 2k and 2k+1 of a 256-bit sequence; its low 128 bits
 * are written back to vd and its high 128 bits to vm.
 */
void HELPER(neon_qzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    /* Snapshot every input word before anything is written back. */
    const uint64_t from_d[2] = { rd[0], rd[1] };
    const uint64_t from_m[2] = { rm[0], rm[1] };
    uint64_t out[4];
    int k;

    /* Each output word is exactly one vd element below one vm element. */
    for (k = 0; k < 4; k++) {
        uint64_t d_elem = (from_d[k / 2] >> ((k % 2) * 32)) & 0xffffffffull;
        uint64_t m_elem = (from_m[k / 2] >> ((k % 2) * 32)) & 0xffffffffull;

        out[k] = d_elem | (m_elem << 32);
    }

    rm[0] = out[2];
    rm[1] = out[3];
    rd[0] = out[0];
    rd[1] = out[1];
}
1704 
/*
 * Interleave 8-bit elements of two 64-bit operands (the first uint64_t
 * word at vd and at vm).  Element k of vd and element k of vm become
 * elements 2k and 2k+1 of a 128-bit sequence; its low 64 bits are
 * written back to vd and its high 64 bits to vm.
 */
void HELPER(neon_zip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    /* Snapshot both input words before anything is written back. */
    const uint64_t from_d = rd[0];
    const uint64_t from_m = rm[0];
    uint64_t out[2] = { 0, 0 };
    int k;

    for (k = 0; k < 8; k++) {
        uint64_t d_elem = (from_d >> (k * 8)) & 0xff;
        uint64_t m_elem = (from_m >> (k * 8)) & 0xff;
        int pos = 2 * k;

        out[pos / 8] |= d_elem << ((pos % 8) * 8);
        out[(pos + 1) / 8] |= m_elem << (((pos + 1) % 8) * 8);
    }

    rm[0] = out[1];
    rd[0] = out[0];
}
1722 
/*
 * Interleave 16-bit elements of two 64-bit operands (the first uint64_t
 * word at vd and at vm).  Element k of vd and element k of vm become
 * elements 2k and 2k+1 of a 128-bit sequence; its low 64 bits are
 * written back to vd and its high 64 bits to vm.
 */
void HELPER(neon_zip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    /* Snapshot both input words before anything is written back. */
    const uint64_t from_d = rd[0];
    const uint64_t from_m = rm[0];
    uint64_t out[2] = { 0, 0 };
    int k;

    for (k = 0; k < 4; k++) {
        uint64_t d_elem = (from_d >> (k * 16)) & 0xffff;
        uint64_t m_elem = (from_m >> (k * 16)) & 0xffff;
        int pos = 2 * k;

        out[pos / 4] |= d_elem << ((pos % 4) * 16);
        out[(pos + 1) / 4] |= m_elem << (((pos + 1) % 4) * 16);
    }

    rm[0] = out[1];
    rd[0] = out[0];
}
1736