xref: /openbmc/qemu/target/arm/tcg/neon_helper.c (revision 4622c706)
1 /*
2  * ARM NEON vector operations.
3  *
4  * Copyright (c) 2007, 2008 CodeSourcery.
5  * Written by Paul Brook
6  *
7  * This code is licensed under the GNU GPL v2.
8  */
9 #include "qemu/osdep.h"
10 
11 #include "cpu.h"
12 #include "exec/helper-proto.h"
13 #include "fpu/softfloat.h"
14 #include "vec_internal.h"
15 
/*
 * Most significant (sign) bit of a 32-bit and of a 64-bit value.
 * Both expansions are fully parenthesized so they stay a single operand
 * wherever they are pasted.
 */
#define SIGNBIT ((uint32_t)0x80000000)
#define SIGNBIT64 ((uint64_t)1 << 63)

/*
 * Set the first element of env->vfp.qc; the helpers in this file invoke it
 * on the paths where a result was saturated.  Expects a variable named
 * 'env' to be in scope at the point of use.
 */
#define SET_QC() (env->vfp.qc[0] = 1)
20 
/*
 * Struct views of one 32-bit word as one, two or four equal-sized lanes
 * named v1..vN.  On a big-endian host the members are declared in reverse
 * order, so that when a struct is overlaid on a uint32_t (as NEON_UNPACK
 * and NEON_PACK do) v1 is the least significant lane on either host.
 */
#define NEON_TYPE1(name, type) \
typedef struct { type v1; } neon_##name;

#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct { type v2; type v1; } neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct { type v4; type v3; type v2; type v1; } neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct { type v1; type v2; } neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct { type v1; type v2; type v3; type v4; } neon_##name;
#endif

/* One lane struct per element size and signedness. */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)

/* The generator macros are not needed past this point. */
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
67 
/*
 * NEON_UNPACK: reinterpret the uint32_t 'val' as a lane struct of type
 * 'vtype' and store it in 'dest'.  The bits are passed through a union,
 * so no value conversion takes place.
 */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        uint32_t i; \
        vtype v; \
    } cvt_; \
    cvt_.i = (val); \
    dest = cvt_.v; \
    } while (0)

/*
 * NEON_PACK: the inverse of NEON_UNPACK; reinterpret the lane struct 'val'
 * of type 'vtype' as a uint32_t and store it in 'dest'.
 */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        uint32_t i; \
        vtype v; \
    } cvt_; \
    cvt_.v = (val); \
    dest = cvt_.i; \
    } while (0)
87 
/*
 * Apply the currently defined NEON_FN(dest, src1, src2) lane by lane to the
 * local lane structs vdest, vsrc1 and vsrc2.  Each wider variant is built
 * from the narrower one, so lanes are always processed in order v1..vN.
 */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_DO1 \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_DO2 \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
98 
/*
 * Body shared by the two-operand helpers: unpack the uint32_t parameters
 * arg1/arg2 into 'n' lanes of type 'vtype', run NEON_FN on every lane via
 * NEON_DO<n>, and return the repacked result.
 */
#define NEON_VOP_BODY(vtype, n) \
{ \
    vtype vsrc1, vsrc2, vdest; \
    uint32_t res; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Define helper neon_<name>(arg1, arg2); NEON_FN has no access to 'env'. */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP, but the helper also takes 'env' so NEON_FN may use it. */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
119 
/*
 * Pairwise operations.
 *
 * NEON_FN is applied to adjacent lanes of the same source: the low half of
 * the result comes from the pairs of vsrc1, the high half from the pairs
 * of vsrc2.
 *
 * For 32-bit elements each segment only contains a single element, so
 * the elementwise and pairwise operations are the same; no NEON_PDO1 is
 * defined here.
 */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
/* No line continuation after the last statement, so the macro ends here. */
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4);

/*
 * Define helper neon_<name>(arg1, arg2): unpack both operands into 'n'
 * lanes of type 'vtype', combine them pairwise with NEON_FN and return the
 * repacked result.
 */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}
145 
/*
 * Unary operators.
 *
 * Define helper neon_<name>(arg): unpack 'arg' into 'n' lanes of type
 * 'vtype', apply NEON_FN(dest, src, dummy) to each lane and return the
 * repacked result.  NEON_DO<n> still names vsrc2 as third argument, so the
 * NEON_FN used here must ignore that argument.
 */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}
157 
158 
159 #define NEON_USAT(dest, src1, src2, type) do { \
160     uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
161     if (tmp != (type)tmp) { \
162         SET_QC(); \
163         dest = ~0; \
164     } else { \
165         dest = tmp; \
166     }} while(0)
167 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
168 NEON_VOP_ENV(qadd_u8, neon_u8, 4)
169 #undef NEON_FN
170 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
171 NEON_VOP_ENV(qadd_u16, neon_u16, 2)
172 #undef NEON_FN
173 #undef NEON_USAT
174 
175 uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
176 {
177     uint32_t res = a + b;
178     if (res < a) {
179         SET_QC();
180         res = ~0;
181     }
182     return res;
183 }
184 
185 uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
186 {
187     uint64_t res;
188 
189     res = src1 + src2;
190     if (res < src1) {
191         SET_QC();
192         res = ~(uint64_t)0;
193     }
194     return res;
195 }
196 
197 #define NEON_SSAT(dest, src1, src2, type) do { \
198     int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
199     if (tmp != (type)tmp) { \
200         SET_QC(); \
201         if (src2 > 0) { \
202             tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
203         } else { \
204             tmp = 1 << (sizeof(type) * 8 - 1); \
205         } \
206     } \
207     dest = tmp; \
208     } while(0)
209 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
210 NEON_VOP_ENV(qadd_s8, neon_s8, 4)
211 #undef NEON_FN
212 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
213 NEON_VOP_ENV(qadd_s16, neon_s16, 2)
214 #undef NEON_FN
215 #undef NEON_SSAT
216 
217 uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
218 {
219     uint32_t res = a + b;
220     if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
221         SET_QC();
222         res = ~(((int32_t)a >> 31) ^ SIGNBIT);
223     }
224     return res;
225 }
226 
227 uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
228 {
229     uint64_t res;
230 
231     res = src1 + src2;
232     if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
233         SET_QC();
234         res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
235     }
236     return res;
237 }
238 
/* Unsigned saturating accumulate of signed value
 *
 * Op1/Rn is treated as signed
 * Op2/Rd is treated as unsigned
 *
 * Explicit casting is used to ensure the correct sign extension of
 * inputs. The result is treated as an unsigned value and saturated as such.
 *
 * The macro below handles one 8/16 bit lane.  It expects the caller to
 * provide signed integers va, vb and vr (wide enough for the interim sum)
 * and an unsigned 32 bit result value r; the lane is read from bit
 * position 'shift' of a and b and deposited at the same position of r.
 */

#define USATACC(bits, shift) \
    do { \
        va = sextract32(a, shift, bits);                                \
        vb = extract32(b, shift, bits);                                 \
        vr = va + vb;                                                   \
        if (vr < 0) {                                                   \
            SET_QC();                                                   \
            vr = 0;                                                     \
        } else if (vr > UINT##bits##_MAX) {                             \
            SET_QC();                                                   \
            vr = UINT##bits##_MAX;                                      \
        }                                                               \
        r = deposit32(r, shift, bits, vr);                              \
    } while (0)
265 
266 uint32_t HELPER(neon_uqadd_s8)(CPUARMState *env, uint32_t a, uint32_t b)
267 {
268     int16_t va, vb, vr;
269     uint32_t r = 0;
270 
271     USATACC(8, 0);
272     USATACC(8, 8);
273     USATACC(8, 16);
274     USATACC(8, 24);
275     return r;
276 }
277 
278 uint32_t HELPER(neon_uqadd_s16)(CPUARMState *env, uint32_t a, uint32_t b)
279 {
280     int32_t va, vb, vr;
281     uint64_t r = 0;
282 
283     USATACC(16, 0);
284     USATACC(16, 16);
285     return r;
286 }
287 
288 #undef USATACC
289 
290 uint32_t HELPER(neon_uqadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
291 {
292     int64_t va = (int32_t)a;
293     int64_t vb = (uint32_t)b;
294     int64_t vr = va + vb;
295     if (vr > UINT32_MAX) {
296         SET_QC();
297         vr = UINT32_MAX;
298     } else if (vr < 0) {
299         SET_QC();
300         vr = 0;
301     }
302     return vr;
303 }
304 
305 uint64_t HELPER(neon_uqadd_s64)(CPUARMState *env, uint64_t a, uint64_t b)
306 {
307     uint64_t res;
308     res = a + b;
309     /* We only need to look at the pattern of SIGN bits to detect
310      * +ve/-ve saturation
311      */
312     if (~a & b & ~res & SIGNBIT64) {
313         SET_QC();
314         res = UINT64_MAX;
315     } else if (a & ~b & res & SIGNBIT64) {
316         SET_QC();
317         res = 0;
318     }
319     return res;
320 }
321 
/* Signed saturating accumulate of unsigned value
 *
 * Op1/Rn is treated as unsigned
 * Op2/Rd is treated as signed
 *
 * The result is treated as a signed value and saturated as such
 *
 * The macro below handles one 8/16 bit lane.  It expects the caller to
 * provide signed integers va, vb and vr (wide enough for the interim sum)
 * and an unsigned 32 bit result value r; the lane is read from bit
 * position 'shift' of a and b and deposited at the same position of r.
 */

#define SSATACC(bits, shift) \
    do { \
        va = extract32(a, shift, bits);                                 \
        vb = sextract32(b, shift, bits);                                \
        vr = va + vb;                                                   \
        if (vr < INT##bits##_MIN) {                                     \
            SET_QC();                                                   \
            vr = INT##bits##_MIN;                                       \
        } else if (vr > INT##bits##_MAX) {                              \
            SET_QC();                                                   \
            vr = INT##bits##_MAX;                                       \
        }                                                               \
        r = deposit32(r, shift, bits, vr);                              \
    } while (0)
347 
348 uint32_t HELPER(neon_sqadd_u8)(CPUARMState *env, uint32_t a, uint32_t b)
349 {
350     int16_t va, vb, vr;
351     uint32_t r = 0;
352 
353     SSATACC(8, 0);
354     SSATACC(8, 8);
355     SSATACC(8, 16);
356     SSATACC(8, 24);
357     return r;
358 }
359 
360 uint32_t HELPER(neon_sqadd_u16)(CPUARMState *env, uint32_t a, uint32_t b)
361 {
362     int32_t va, vb, vr;
363     uint32_t r = 0;
364 
365     SSATACC(16, 0);
366     SSATACC(16, 16);
367 
368     return r;
369 }
370 
371 #undef SSATACC
372 
373 uint32_t HELPER(neon_sqadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
374 {
375     int64_t res;
376     int64_t op1 = (uint32_t)a;
377     int64_t op2 = (int32_t)b;
378     res = op1 + op2;
379     if (res > INT32_MAX) {
380         SET_QC();
381         res = INT32_MAX;
382     } else if (res < INT32_MIN) {
383         SET_QC();
384         res = INT32_MIN;
385     }
386     return res;
387 }
388 
389 uint64_t HELPER(neon_sqadd_u64)(CPUARMState *env, uint64_t a, uint64_t b)
390 {
391     uint64_t res;
392     res = a + b;
393     /* We only need to look at the pattern of SIGN bits to detect an overflow */
394     if (((a & res)
395          | (~b & res)
396          | (a & ~b)) & SIGNBIT64) {
397         SET_QC();
398         res = INT64_MAX;
399     }
400     return res;
401 }
402 
403 
404 #define NEON_USAT(dest, src1, src2, type) do { \
405     uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
406     if (tmp != (type)tmp) { \
407         SET_QC(); \
408         dest = 0; \
409     } else { \
410         dest = tmp; \
411     }} while(0)
412 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
413 NEON_VOP_ENV(qsub_u8, neon_u8, 4)
414 #undef NEON_FN
415 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
416 NEON_VOP_ENV(qsub_u16, neon_u16, 2)
417 #undef NEON_FN
418 #undef NEON_USAT
419 
420 uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b)
421 {
422     uint32_t res = a - b;
423     if (res > a) {
424         SET_QC();
425         res = 0;
426     }
427     return res;
428 }
429 
430 uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
431 {
432     uint64_t res;
433 
434     if (src1 < src2) {
435         SET_QC();
436         res = 0;
437     } else {
438         res = src1 - src2;
439     }
440     return res;
441 }
442 
443 #define NEON_SSAT(dest, src1, src2, type) do { \
444     int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
445     if (tmp != (type)tmp) { \
446         SET_QC(); \
447         if (src2 < 0) { \
448             tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
449         } else { \
450             tmp = 1 << (sizeof(type) * 8 - 1); \
451         } \
452     } \
453     dest = tmp; \
454     } while(0)
455 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
456 NEON_VOP_ENV(qsub_s8, neon_s8, 4)
457 #undef NEON_FN
458 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
459 NEON_VOP_ENV(qsub_s16, neon_s16, 2)
460 #undef NEON_FN
461 #undef NEON_SSAT
462 
463 uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b)
464 {
465     uint32_t res = a - b;
466     if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
467         SET_QC();
468         res = ~(((int32_t)a >> 31) ^ SIGNBIT);
469     }
470     return res;
471 }
472 
473 uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
474 {
475     uint64_t res;
476 
477     res = src1 - src2;
478     if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
479         SET_QC();
480         res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
481     }
482     return res;
483 }
484 
485 #define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
486 NEON_VOP(hadd_s8, neon_s8, 4)
487 NEON_VOP(hadd_u8, neon_u8, 4)
488 NEON_VOP(hadd_s16, neon_s16, 2)
489 NEON_VOP(hadd_u16, neon_u16, 2)
490 #undef NEON_FN
491 
492 int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
493 {
494     int32_t dest;
495 
496     dest = (src1 >> 1) + (src2 >> 1);
497     if (src1 & src2 & 1)
498         dest++;
499     return dest;
500 }
501 
502 uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
503 {
504     uint32_t dest;
505 
506     dest = (src1 >> 1) + (src2 >> 1);
507     if (src1 & src2 & 1)
508         dest++;
509     return dest;
510 }
511 
512 #define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
513 NEON_VOP(rhadd_s8, neon_s8, 4)
514 NEON_VOP(rhadd_u8, neon_u8, 4)
515 NEON_VOP(rhadd_s16, neon_s16, 2)
516 NEON_VOP(rhadd_u16, neon_u16, 2)
517 #undef NEON_FN
518 
519 int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
520 {
521     int32_t dest;
522 
523     dest = (src1 >> 1) + (src2 >> 1);
524     if ((src1 | src2) & 1)
525         dest++;
526     return dest;
527 }
528 
529 uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
530 {
531     uint32_t dest;
532 
533     dest = (src1 >> 1) + (src2 >> 1);
534     if ((src1 | src2) & 1)
535         dest++;
536     return dest;
537 }
538 
539 #define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
540 NEON_VOP(hsub_s8, neon_s8, 4)
541 NEON_VOP(hsub_u8, neon_u8, 4)
542 NEON_VOP(hsub_s16, neon_s16, 2)
543 NEON_VOP(hsub_u16, neon_u16, 2)
544 #undef NEON_FN
545 
546 int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
547 {
548     int32_t dest;
549 
550     dest = (src1 >> 1) - (src2 >> 1);
551     if ((~src1) & src2 & 1)
552         dest--;
553     return dest;
554 }
555 
556 uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
557 {
558     uint32_t dest;
559 
560     dest = (src1 >> 1) - (src2 >> 1);
561     if ((~src1) & src2 & 1)
562         dest--;
563     return dest;
564 }
565 
566 #define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
567 NEON_POP(pmin_s8, neon_s8, 4)
568 NEON_POP(pmin_u8, neon_u8, 4)
569 NEON_POP(pmin_s16, neon_s16, 2)
570 NEON_POP(pmin_u16, neon_u16, 2)
571 #undef NEON_FN
572 
573 #define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
574 NEON_POP(pmax_s8, neon_s8, 4)
575 NEON_POP(pmax_u8, neon_u8, 4)
576 NEON_POP(pmax_s16, neon_s16, 2)
577 NEON_POP(pmax_u16, neon_u16, 2)
578 #undef NEON_FN
579 
580 #define NEON_FN(dest, src1, src2) \
581     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
582 NEON_VOP(shl_u16, neon_u16, 2)
583 #undef NEON_FN
584 
585 #define NEON_FN(dest, src1, src2) \
586     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
587 NEON_VOP(shl_s16, neon_s16, 2)
588 #undef NEON_FN
589 
590 #define NEON_FN(dest, src1, src2) \
591     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
592 NEON_VOP(rshl_s8, neon_s8, 4)
593 #undef NEON_FN
594 
595 #define NEON_FN(dest, src1, src2) \
596     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
597 NEON_VOP(rshl_s16, neon_s16, 2)
598 #undef NEON_FN
599 
600 uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
601 {
602     return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
603 }
604 
605 uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
606 {
607     return do_sqrshl_d(val, (int8_t)shift, true, NULL);
608 }
609 
610 #define NEON_FN(dest, src1, src2) \
611     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
612 NEON_VOP(rshl_u8, neon_u8, 4)
613 #undef NEON_FN
614 
615 #define NEON_FN(dest, src1, src2) \
616     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
617 NEON_VOP(rshl_u16, neon_u16, 2)
618 #undef NEON_FN
619 
620 uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
621 {
622     return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
623 }
624 
625 uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
626 {
627     return do_uqrshl_d(val, (int8_t)shift, true, NULL);
628 }
629 
630 #define NEON_FN(dest, src1, src2) \
631     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
632 NEON_VOP_ENV(qshl_u8, neon_u8, 4)
633 #undef NEON_FN
634 
635 #define NEON_FN(dest, src1, src2) \
636     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
637 NEON_VOP_ENV(qshl_u16, neon_u16, 2)
638 #undef NEON_FN
639 
640 uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
641 {
642     return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
643 }
644 
645 uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
646 {
647     return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
648 }
649 
650 #define NEON_FN(dest, src1, src2) \
651     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
652 NEON_VOP_ENV(qshl_s8, neon_s8, 4)
653 #undef NEON_FN
654 
655 #define NEON_FN(dest, src1, src2) \
656     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
657 NEON_VOP_ENV(qshl_s16, neon_s16, 2)
658 #undef NEON_FN
659 
660 uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
661 {
662     return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
663 }
664 
665 uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
666 {
667     return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
668 }
669 
670 #define NEON_FN(dest, src1, src2) \
671     (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
672 NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
673 #undef NEON_FN
674 
675 #define NEON_FN(dest, src1, src2) \
676     (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
677 NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
678 #undef NEON_FN
679 
680 uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
681 {
682     return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
683 }
684 
685 uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
686 {
687     return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
688 }
689 
690 #define NEON_FN(dest, src1, src2) \
691     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
692 NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
693 #undef NEON_FN
694 
695 #define NEON_FN(dest, src1, src2) \
696     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
697 NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
698 #undef NEON_FN
699 
700 uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
701 {
702     return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
703 }
704 
705 uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
706 {
707     return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
708 }
709 
710 #define NEON_FN(dest, src1, src2) \
711     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
712 NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
713 #undef NEON_FN
714 
715 #define NEON_FN(dest, src1, src2) \
716     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
717 NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
718 #undef NEON_FN
719 
720 uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
721 {
722     return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
723 }
724 
725 uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
726 {
727     return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
728 }
729 
730 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
731 {
732     uint32_t mask;
733     mask = (a ^ b) & 0x80808080u;
734     a &= ~0x80808080u;
735     b &= ~0x80808080u;
736     return (a + b) ^ mask;
737 }
738 
739 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
740 {
741     uint32_t mask;
742     mask = (a ^ b) & 0x80008000u;
743     a &= ~0x80008000u;
744     b &= ~0x80008000u;
745     return (a + b) ^ mask;
746 }
747 
748 #define NEON_FN(dest, src1, src2) dest = src1 + src2
749 NEON_POP(padd_u8, neon_u8, 4)
750 NEON_POP(padd_u16, neon_u16, 2)
751 #undef NEON_FN
752 
753 #define NEON_FN(dest, src1, src2) dest = src1 - src2
754 NEON_VOP(sub_u8, neon_u8, 4)
755 NEON_VOP(sub_u16, neon_u16, 2)
756 #undef NEON_FN
757 
758 #define NEON_FN(dest, src1, src2) dest = src1 * src2
759 NEON_VOP(mul_u8, neon_u8, 4)
760 NEON_VOP(mul_u16, neon_u16, 2)
761 #undef NEON_FN
762 
763 #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
764 NEON_VOP(tst_u8, neon_u8, 4)
765 NEON_VOP(tst_u16, neon_u16, 2)
766 NEON_VOP(tst_u32, neon_u32, 1)
767 #undef NEON_FN
768 
/* Count Leading Sign/Zero Bits.  */

/* Number of leading zero bits in the 8-bit value x (8 when x is zero). */
static inline int do_clz8(uint8_t x)
{
    int zeros = 8;

    /* Every shift needed to empty x is one bit that was not a leading zero. */
    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
777 
/* Number of leading zero bits in the 16-bit value x (16 when x is zero). */
static inline int do_clz16(uint16_t x)
{
    int zeros = 16;

    /* Every shift needed to empty x is one bit that was not a leading zero. */
    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
785 
786 #define NEON_FN(dest, src, dummy) dest = do_clz8(src)
787 NEON_VOP1(clz_u8, neon_u8, 4)
788 #undef NEON_FN
789 
790 #define NEON_FN(dest, src, dummy) dest = do_clz16(src)
791 NEON_VOP1(clz_u16, neon_u16, 2)
792 #undef NEON_FN
793 
794 #define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
795 NEON_VOP1(cls_s8, neon_s8, 4)
796 #undef NEON_FN
797 
798 #define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
799 NEON_VOP1(cls_s16, neon_s16, 2)
800 #undef NEON_FN
801 
802 uint32_t HELPER(neon_cls_s32)(uint32_t x)
803 {
804     int count;
805     if ((int32_t)x < 0)
806         x = ~x;
807     for (count = 32; x; count--)
808         x = x >> 1;
809     return count - 1;
810 }
811 
812 /* Bit count.  */
813 uint32_t HELPER(neon_cnt_u8)(uint32_t x)
814 {
815     x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
816     x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
817     x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
818     return x;
819 }
820 
821 /* Reverse bits in each 8 bit word */
822 uint32_t HELPER(neon_rbit_u8)(uint32_t x)
823 {
824     x =  ((x & 0xf0f0f0f0) >> 4)
825        | ((x & 0x0f0f0f0f) << 4);
826     x =  ((x & 0x88888888) >> 3)
827        | ((x & 0x44444444) >> 1)
828        | ((x & 0x22222222) << 1)
829        | ((x & 0x11111111) << 3);
830     return x;
831 }
832 
833 #define NEON_QDMULH16(dest, src1, src2, round) do { \
834     uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
835     if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
836         SET_QC(); \
837         tmp = (tmp >> 31) ^ ~SIGNBIT; \
838     } else { \
839         tmp <<= 1; \
840     } \
841     if (round) { \
842         int32_t old = tmp; \
843         tmp += 1 << 15; \
844         if ((int32_t)tmp < old) { \
845             SET_QC(); \
846             tmp = SIGNBIT - 1; \
847         } \
848     } \
849     dest = tmp >> 16; \
850     } while(0)
851 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
852 NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
853 #undef NEON_FN
854 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
855 NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
856 #undef NEON_FN
857 #undef NEON_QDMULH16
858 
859 #define NEON_QDMULH32(dest, src1, src2, round) do { \
860     uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
861     if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
862         SET_QC(); \
863         tmp = (tmp >> 63) ^ ~SIGNBIT64; \
864     } else { \
865         tmp <<= 1; \
866     } \
867     if (round) { \
868         int64_t old = tmp; \
869         tmp += (int64_t)1 << 31; \
870         if ((int64_t)tmp < old) { \
871             SET_QC(); \
872             tmp = SIGNBIT64 - 1; \
873         } \
874     } \
875     dest = tmp >> 32; \
876     } while(0)
877 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
878 NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
879 #undef NEON_FN
880 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
881 NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
882 #undef NEON_FN
883 #undef NEON_QDMULH32
884 
885 uint32_t HELPER(neon_narrow_u8)(uint64_t x)
886 {
887     return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
888            | ((x >> 24) & 0xff000000u);
889 }
890 
891 uint32_t HELPER(neon_narrow_u16)(uint64_t x)
892 {
893     return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
894 }
895 
896 uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
897 {
898     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
899             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
900 }
901 
902 uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
903 {
904     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
905 }
906 
907 uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
908 {
909     x &= 0xff80ff80ff80ff80ull;
910     x += 0x0080008000800080ull;
911     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
912             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
913 }
914 
915 uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
916 {
917     x &= 0xffff8000ffff8000ull;
918     x += 0x0000800000008000ull;
919     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
920 }
921 
922 uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
923 {
924     uint16_t s;
925     uint8_t d;
926     uint32_t res = 0;
927 #define SAT8(n) \
928     s = x >> n; \
929     if (s & 0x8000) { \
930         SET_QC(); \
931     } else { \
932         if (s > 0xff) { \
933             d = 0xff; \
934             SET_QC(); \
935         } else  { \
936             d = s; \
937         } \
938         res |= (uint32_t)d << (n / 2); \
939     }
940 
941     SAT8(0);
942     SAT8(16);
943     SAT8(32);
944     SAT8(48);
945 #undef SAT8
946     return res;
947 }
948 
949 uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
950 {
951     uint16_t s;
952     uint8_t d;
953     uint32_t res = 0;
954 #define SAT8(n) \
955     s = x >> n; \
956     if (s > 0xff) { \
957         d = 0xff; \
958         SET_QC(); \
959     } else  { \
960         d = s; \
961     } \
962     res |= (uint32_t)d << (n / 2);
963 
964     SAT8(0);
965     SAT8(16);
966     SAT8(32);
967     SAT8(48);
968 #undef SAT8
969     return res;
970 }
971 
972 uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
973 {
974     int16_t s;
975     uint8_t d;
976     uint32_t res = 0;
977 #define SAT8(n) \
978     s = x >> n; \
979     if (s != (int8_t)s) { \
980         d = (s >> 15) ^ 0x7f; \
981         SET_QC(); \
982     } else  { \
983         d = s; \
984     } \
985     res |= (uint32_t)d << (n / 2);
986 
987     SAT8(0);
988     SAT8(16);
989     SAT8(32);
990     SAT8(48);
991 #undef SAT8
992     return res;
993 }
994 
995 uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
996 {
997     uint32_t high;
998     uint32_t low;
999     low = x;
1000     if (low & 0x80000000) {
1001         low = 0;
1002         SET_QC();
1003     } else if (low > 0xffff) {
1004         low = 0xffff;
1005         SET_QC();
1006     }
1007     high = x >> 32;
1008     if (high & 0x80000000) {
1009         high = 0;
1010         SET_QC();
1011     } else if (high > 0xffff) {
1012         high = 0xffff;
1013         SET_QC();
1014     }
1015     return low | (high << 16);
1016 }
1017 
1018 uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
1019 {
1020     uint32_t high;
1021     uint32_t low;
1022     low = x;
1023     if (low > 0xffff) {
1024         low = 0xffff;
1025         SET_QC();
1026     }
1027     high = x >> 32;
1028     if (high > 0xffff) {
1029         high = 0xffff;
1030         SET_QC();
1031     }
1032     return low | (high << 16);
1033 }
1034 
1035 uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
1036 {
1037     int32_t low;
1038     int32_t high;
1039     low = x;
1040     if (low != (int16_t)low) {
1041         low = (low >> 31) ^ 0x7fff;
1042         SET_QC();
1043     }
1044     high = x >> 32;
1045     if (high != (int16_t)high) {
1046         high = (high >> 31) ^ 0x7fff;
1047         SET_QC();
1048     }
1049     return (uint16_t)low | (high << 16);
1050 }
1051 
1052 uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
1053 {
1054     if (x & 0x8000000000000000ull) {
1055         SET_QC();
1056         return 0;
1057     }
1058     if (x > 0xffffffffu) {
1059         SET_QC();
1060         return 0xffffffffu;
1061     }
1062     return x;
1063 }
1064 
1065 uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
1066 {
1067     if (x > 0xffffffffu) {
1068         SET_QC();
1069         return 0xffffffffu;
1070     }
1071     return x;
1072 }
1073 
1074 uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
1075 {
1076     if ((int64_t)x != (int32_t)x) {
1077         SET_QC();
1078         return ((int64_t)x >> 63) ^ 0x7fffffff;
1079     }
1080     return x;
1081 }
1082 
1083 uint64_t HELPER(neon_widen_u8)(uint32_t x)
1084 {
1085     uint64_t tmp;
1086     uint64_t ret;
1087     ret = (uint8_t)x;
1088     tmp = (uint8_t)(x >> 8);
1089     ret |= tmp << 16;
1090     tmp = (uint8_t)(x >> 16);
1091     ret |= tmp << 32;
1092     tmp = (uint8_t)(x >> 24);
1093     ret |= tmp << 48;
1094     return ret;
1095 }
1096 
1097 uint64_t HELPER(neon_widen_s8)(uint32_t x)
1098 {
1099     uint64_t tmp;
1100     uint64_t ret;
1101     ret = (uint16_t)(int8_t)x;
1102     tmp = (uint16_t)(int8_t)(x >> 8);
1103     ret |= tmp << 16;
1104     tmp = (uint16_t)(int8_t)(x >> 16);
1105     ret |= tmp << 32;
1106     tmp = (uint16_t)(int8_t)(x >> 24);
1107     ret |= tmp << 48;
1108     return ret;
1109 }
1110 
1111 uint64_t HELPER(neon_widen_u16)(uint32_t x)
1112 {
1113     uint64_t high = (uint16_t)(x >> 16);
1114     return ((uint16_t)x) | (high << 32);
1115 }
1116 
1117 uint64_t HELPER(neon_widen_s16)(uint32_t x)
1118 {
1119     uint64_t high = (int16_t)(x >> 16);
1120     return ((uint32_t)(int16_t)x) | (high << 32);
1121 }
1122 
1123 uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
1124 {
1125     uint64_t mask;
1126     mask = (a ^ b) & 0x8000800080008000ull;
1127     a &= ~0x8000800080008000ull;
1128     b &= ~0x8000800080008000ull;
1129     return (a + b) ^ mask;
1130 }
1131 
1132 uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
1133 {
1134     uint64_t mask;
1135     mask = (a ^ b) & 0x8000000080000000ull;
1136     a &= ~0x8000000080000000ull;
1137     b &= ~0x8000000080000000ull;
1138     return (a + b) ^ mask;
1139 }
1140 
1141 uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
1142 {
1143     uint64_t tmp;
1144     uint64_t tmp2;
1145 
1146     tmp = a & 0x0000ffff0000ffffull;
1147     tmp += (a >> 16) & 0x0000ffff0000ffffull;
1148     tmp2 = b & 0xffff0000ffff0000ull;
1149     tmp2 += (b << 16) & 0xffff0000ffff0000ull;
1150     return    ( tmp         & 0xffff)
1151             | ((tmp  >> 16) & 0xffff0000ull)
1152             | ((tmp2 << 16) & 0xffff00000000ull)
1153             | ( tmp2        & 0xffff000000000000ull);
1154 }
1155 
1156 uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
1157 {
1158     uint32_t low = a + (a >> 32);
1159     uint32_t high = b + (b >> 32);
1160     return low + ((uint64_t)high << 32);
1161 }
1162 
1163 uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
1164 {
1165     uint64_t mask;
1166     mask = (a ^ ~b) & 0x8000800080008000ull;
1167     a |= 0x8000800080008000ull;
1168     b &= ~0x8000800080008000ull;
1169     return (a - b) ^ mask;
1170 }
1171 
1172 uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
1173 {
1174     uint64_t mask;
1175     mask = (a ^ ~b) & 0x8000000080000000ull;
1176     a |= 0x8000000080000000ull;
1177     b &= ~0x8000000080000000ull;
1178     return (a - b) ^ mask;
1179 }
1180 
1181 uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
1182 {
1183     uint32_t x, y;
1184     uint32_t low, high;
1185 
1186     x = a;
1187     y = b;
1188     low = x + y;
1189     if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
1190         SET_QC();
1191         low = ((int32_t)x >> 31) ^ ~SIGNBIT;
1192     }
1193     x = a >> 32;
1194     y = b >> 32;
1195     high = x + y;
1196     if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
1197         SET_QC();
1198         high = ((int32_t)x >> 31) ^ ~SIGNBIT;
1199     }
1200     return low | ((uint64_t)high << 32);
1201 }
1202 
1203 uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
1204 {
1205     uint64_t result;
1206 
1207     result = a + b;
1208     if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
1209         SET_QC();
1210         result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
1211     }
1212     return result;
1213 }
1214 
/*
 * Absolute difference: dest = |x - y|, where x and y are first
 * truncated to intype.  The subtraction is carried out in arithtype,
 * which must be wider than intype: with a signed 32 bit input, for
 * instance, the absolute difference does not fit in a signed 32 bit
 * value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype abd_lhs = (intype)(x);                          \
    arithtype abd_rhs = (intype)(y);                          \
    if (abd_lhs > abd_rhs) {                                  \
        dest = abd_lhs - abd_rhs;                             \
    } else {                                                  \
        dest = abd_rhs - abd_lhs;                             \
    }                                                         \
    } while(0)
1224 
1225 uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
1226 {
1227     uint64_t tmp;
1228     uint64_t result;
1229     DO_ABD(result, a, b, uint8_t, uint32_t);
1230     DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
1231     result |= tmp << 16;
1232     DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
1233     result |= tmp << 32;
1234     DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
1235     result |= tmp << 48;
1236     return result;
1237 }
1238 
1239 uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
1240 {
1241     uint64_t tmp;
1242     uint64_t result;
1243     DO_ABD(result, a, b, int8_t, int32_t);
1244     DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
1245     result |= tmp << 16;
1246     DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
1247     result |= tmp << 32;
1248     DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
1249     result |= tmp << 48;
1250     return result;
1251 }
1252 
1253 uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
1254 {
1255     uint64_t tmp;
1256     uint64_t result;
1257     DO_ABD(result, a, b, uint16_t, uint32_t);
1258     DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
1259     return result | (tmp << 32);
1260 }
1261 
1262 uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
1263 {
1264     uint64_t tmp;
1265     uint64_t result;
1266     DO_ABD(result, a, b, int16_t, int32_t);
1267     DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
1268     return result | (tmp << 32);
1269 }
1270 
1271 uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
1272 {
1273     uint64_t result;
1274     DO_ABD(result, a, b, uint32_t, uint64_t);
1275     return result;
1276 }
1277 
1278 uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
1279 {
1280     uint64_t result;
1281     DO_ABD(result, a, b, int32_t, int64_t);
1282     return result;
1283 }
1284 #undef DO_ABD
1285 
/*
 * Widening multiply. Named type is the source type.
 *
 * x and y are truncated to type1, converted to type2 and multiplied;
 * the product, truncated to type2, is stored in dest.
 *
 * The leading "1u *" forces the multiplication into unsigned arithmetic.
 * Without it, a type2 narrower than int (uint16_t) is promoted to signed
 * int, and the product of two sign-extended int8_t values such as
 * 0xff80 * 0xff80 overflows int, which is undefined behaviour.
 */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = (x); \
    type1 tmp_y = (y); \
    dest = (type2)(1u * (type2)tmp_x * (type2)tmp_y); \
    } while(0)
1292 
1293 uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
1294 {
1295     uint64_t tmp;
1296     uint64_t result;
1297 
1298     DO_MULL(result, a, b, uint8_t, uint16_t);
1299     DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
1300     result |= tmp << 16;
1301     DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
1302     result |= tmp << 32;
1303     DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
1304     result |= tmp << 48;
1305     return result;
1306 }
1307 
1308 uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
1309 {
1310     uint64_t tmp;
1311     uint64_t result;
1312 
1313     DO_MULL(result, a, b, int8_t, uint16_t);
1314     DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
1315     result |= tmp << 16;
1316     DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
1317     result |= tmp << 32;
1318     DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
1319     result |= tmp << 48;
1320     return result;
1321 }
1322 
1323 uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
1324 {
1325     uint64_t tmp;
1326     uint64_t result;
1327 
1328     DO_MULL(result, a, b, uint16_t, uint32_t);
1329     DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
1330     return result | (tmp << 32);
1331 }
1332 
1333 uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
1334 {
1335     uint64_t tmp;
1336     uint64_t result;
1337 
1338     DO_MULL(result, a, b, int16_t, uint32_t);
1339     DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
1340     return result | (tmp << 32);
1341 }
1342 
1343 uint64_t HELPER(neon_negl_u16)(uint64_t x)
1344 {
1345     uint16_t tmp;
1346     uint64_t result;
1347     result = (uint16_t)-x;
1348     tmp = -(x >> 16);
1349     result |= (uint64_t)tmp << 16;
1350     tmp = -(x >> 32);
1351     result |= (uint64_t)tmp << 32;
1352     tmp = -(x >> 48);
1353     result |= (uint64_t)tmp << 48;
1354     return result;
1355 }
1356 
1357 uint64_t HELPER(neon_negl_u32)(uint64_t x)
1358 {
1359     uint32_t low = -x;
1360     uint32_t high = -(x >> 32);
1361     return low | ((uint64_t)high << 32);
1362 }
1363 
1364 /* Saturating sign manipulation.  */
1365 /* ??? Make these use NEON_VOP1 */
1366 #define DO_QABS8(x) do { \
1367     if (x == (int8_t)0x80) { \
1368         x = 0x7f; \
1369         SET_QC(); \
1370     } else if (x < 0) { \
1371         x = -x; \
1372     }} while (0)
1373 uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
1374 {
1375     neon_s8 vec;
1376     NEON_UNPACK(neon_s8, vec, x);
1377     DO_QABS8(vec.v1);
1378     DO_QABS8(vec.v2);
1379     DO_QABS8(vec.v3);
1380     DO_QABS8(vec.v4);
1381     NEON_PACK(neon_s8, x, vec);
1382     return x;
1383 }
1384 #undef DO_QABS8
1385 
1386 #define DO_QNEG8(x) do { \
1387     if (x == (int8_t)0x80) { \
1388         x = 0x7f; \
1389         SET_QC(); \
1390     } else { \
1391         x = -x; \
1392     }} while (0)
1393 uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
1394 {
1395     neon_s8 vec;
1396     NEON_UNPACK(neon_s8, vec, x);
1397     DO_QNEG8(vec.v1);
1398     DO_QNEG8(vec.v2);
1399     DO_QNEG8(vec.v3);
1400     DO_QNEG8(vec.v4);
1401     NEON_PACK(neon_s8, x, vec);
1402     return x;
1403 }
1404 #undef DO_QNEG8
1405 
1406 #define DO_QABS16(x) do { \
1407     if (x == (int16_t)0x8000) { \
1408         x = 0x7fff; \
1409         SET_QC(); \
1410     } else if (x < 0) { \
1411         x = -x; \
1412     }} while (0)
1413 uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
1414 {
1415     neon_s16 vec;
1416     NEON_UNPACK(neon_s16, vec, x);
1417     DO_QABS16(vec.v1);
1418     DO_QABS16(vec.v2);
1419     NEON_PACK(neon_s16, x, vec);
1420     return x;
1421 }
1422 #undef DO_QABS16
1423 
1424 #define DO_QNEG16(x) do { \
1425     if (x == (int16_t)0x8000) { \
1426         x = 0x7fff; \
1427         SET_QC(); \
1428     } else { \
1429         x = -x; \
1430     }} while (0)
1431 uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
1432 {
1433     neon_s16 vec;
1434     NEON_UNPACK(neon_s16, vec, x);
1435     DO_QNEG16(vec.v1);
1436     DO_QNEG16(vec.v2);
1437     NEON_PACK(neon_s16, x, vec);
1438     return x;
1439 }
1440 #undef DO_QNEG16
1441 
1442 uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
1443 {
1444     if (x == SIGNBIT) {
1445         SET_QC();
1446         x = ~SIGNBIT;
1447     } else if ((int32_t)x < 0) {
1448         x = -x;
1449     }
1450     return x;
1451 }
1452 
1453 uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
1454 {
1455     if (x == SIGNBIT) {
1456         SET_QC();
1457         x = ~SIGNBIT;
1458     } else {
1459         x = -x;
1460     }
1461     return x;
1462 }
1463 
1464 uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
1465 {
1466     if (x == SIGNBIT64) {
1467         SET_QC();
1468         x = ~SIGNBIT64;
1469     } else if ((int64_t)x < 0) {
1470         x = -x;
1471     }
1472     return x;
1473 }
1474 
1475 uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
1476 {
1477     if (x == SIGNBIT64) {
1478         SET_QC();
1479         x = ~SIGNBIT64;
1480     } else {
1481         x = -x;
1482     }
1483     return x;
1484 }
1485 
1486 /* NEON Float helpers.  */
1487 
1488 /* Floating point comparisons produce an integer result.
1489  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1490  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1491  */
1492 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
1493 {
1494     float_status *fpst = fpstp;
1495     return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
1496 }
1497 
1498 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
1499 {
1500     float_status *fpst = fpstp;
1501     return -float32_le(make_float32(b), make_float32(a), fpst);
1502 }
1503 
1504 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1505 {
1506     float_status *fpst = fpstp;
1507     return -float32_lt(make_float32(b), make_float32(a), fpst);
1508 }
1509 
1510 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
1511 {
1512     float_status *fpst = fpstp;
1513     float32 f0 = float32_abs(make_float32(a));
1514     float32 f1 = float32_abs(make_float32(b));
1515     return -float32_le(f1, f0, fpst);
1516 }
1517 
1518 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1519 {
1520     float_status *fpst = fpstp;
1521     float32 f0 = float32_abs(make_float32(a));
1522     float32 f1 = float32_abs(make_float32(b));
1523     return -float32_lt(f1, f0, fpst);
1524 }
1525 
1526 uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
1527 {
1528     float_status *fpst = fpstp;
1529     float64 f0 = float64_abs(make_float64(a));
1530     float64 f1 = float64_abs(make_float64(b));
1531     return -float64_le(f1, f0, fpst);
1532 }
1533 
1534 uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
1535 {
1536     float_status *fpst = fpstp;
1537     float64 f0 = float64_abs(make_float64(a));
1538     float64 f1 = float64_abs(make_float64(b));
1539     return -float64_lt(f1, f0, fpst);
1540 }
1541 
/*
 * Extract element N (element 0 is the least significant) of width SIZE
 * bits from the integer V.
 */
#define ELEM(V, N, SIZE) \
    (((V) >> ((SIZE) * (N))) & ((1ull << (SIZE)) - 1))
1543 
1544 void HELPER(neon_qunzip8)(void *vd, void *vm)
1545 {
1546     uint64_t *rd = vd, *rm = vm;
1547     uint64_t zd0 = rd[0], zd1 = rd[1];
1548     uint64_t zm0 = rm[0], zm1 = rm[1];
1549 
1550     uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
1551         | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
1552         | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
1553         | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
1554     uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
1555         | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
1556         | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
1557         | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
1558     uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
1559         | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
1560         | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
1561         | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
1562     uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
1563         | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
1564         | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
1565         | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
1566 
1567     rm[0] = m0;
1568     rm[1] = m1;
1569     rd[0] = d0;
1570     rd[1] = d1;
1571 }
1572 
1573 void HELPER(neon_qunzip16)(void *vd, void *vm)
1574 {
1575     uint64_t *rd = vd, *rm = vm;
1576     uint64_t zd0 = rd[0], zd1 = rd[1];
1577     uint64_t zm0 = rm[0], zm1 = rm[1];
1578 
1579     uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
1580         | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
1581     uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
1582         | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
1583     uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
1584         | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
1585     uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
1586         | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
1587 
1588     rm[0] = m0;
1589     rm[1] = m1;
1590     rd[0] = d0;
1591     rd[1] = d1;
1592 }
1593 
1594 void HELPER(neon_qunzip32)(void *vd, void *vm)
1595 {
1596     uint64_t *rd = vd, *rm = vm;
1597     uint64_t zd0 = rd[0], zd1 = rd[1];
1598     uint64_t zm0 = rm[0], zm1 = rm[1];
1599 
1600     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
1601     uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1602     uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
1603     uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1604 
1605     rm[0] = m0;
1606     rm[1] = m1;
1607     rd[0] = d0;
1608     rd[1] = d1;
1609 }
1610 
1611 void HELPER(neon_unzip8)(void *vd, void *vm)
1612 {
1613     uint64_t *rd = vd, *rm = vm;
1614     uint64_t zd = rd[0], zm = rm[0];
1615 
1616     uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
1617         | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
1618         | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
1619         | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
1620     uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
1621         | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
1622         | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
1623         | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
1624 
1625     rm[0] = m0;
1626     rd[0] = d0;
1627 }
1628 
1629 void HELPER(neon_unzip16)(void *vd, void *vm)
1630 {
1631     uint64_t *rd = vd, *rm = vm;
1632     uint64_t zd = rd[0], zm = rm[0];
1633 
1634     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
1635         | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
1636     uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
1637         | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
1638 
1639     rm[0] = m0;
1640     rd[0] = d0;
1641 }
1642 
1643 void HELPER(neon_qzip8)(void *vd, void *vm)
1644 {
1645     uint64_t *rd = vd, *rm = vm;
1646     uint64_t zd0 = rd[0], zd1 = rd[1];
1647     uint64_t zm0 = rm[0], zm1 = rm[1];
1648 
1649     uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
1650         | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
1651         | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
1652         | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
1653     uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
1654         | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
1655         | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
1656         | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
1657     uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
1658         | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
1659         | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
1660         | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
1661     uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
1662         | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
1663         | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
1664         | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
1665 
1666     rm[0] = m0;
1667     rm[1] = m1;
1668     rd[0] = d0;
1669     rd[1] = d1;
1670 }
1671 
1672 void HELPER(neon_qzip16)(void *vd, void *vm)
1673 {
1674     uint64_t *rd = vd, *rm = vm;
1675     uint64_t zd0 = rd[0], zd1 = rd[1];
1676     uint64_t zm0 = rm[0], zm1 = rm[1];
1677 
1678     uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
1679         | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
1680     uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
1681         | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
1682     uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
1683         | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
1684     uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
1685         | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
1686 
1687     rm[0] = m0;
1688     rm[1] = m1;
1689     rd[0] = d0;
1690     rd[1] = d1;
1691 }
1692 
1693 void HELPER(neon_qzip32)(void *vd, void *vm)
1694 {
1695     uint64_t *rd = vd, *rm = vm;
1696     uint64_t zd0 = rd[0], zd1 = rd[1];
1697     uint64_t zm0 = rm[0], zm1 = rm[1];
1698 
1699     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
1700     uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
1701     uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1702     uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1703 
1704     rm[0] = m0;
1705     rm[1] = m1;
1706     rd[0] = d0;
1707     rd[1] = d1;
1708 }
1709 
1710 void HELPER(neon_zip8)(void *vd, void *vm)
1711 {
1712     uint64_t *rd = vd, *rm = vm;
1713     uint64_t zd = rd[0], zm = rm[0];
1714 
1715     uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
1716         | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
1717         | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
1718         | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
1719     uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
1720         | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
1721         | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
1722         | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
1723 
1724     rm[0] = m0;
1725     rd[0] = d0;
1726 }
1727 
1728 void HELPER(neon_zip16)(void *vd, void *vm)
1729 {
1730     uint64_t *rd = vd, *rm = vm;
1731     uint64_t zd = rd[0], zm = rm[0];
1732 
1733     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
1734         | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
1735     uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
1736         | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
1737 
1738     rm[0] = m0;
1739     rd[0] = d0;
1740 }
1741