xref: /openbmc/qemu/target/arm/tcg/neon_helper.c (revision 01dc65a3bc262ab1bec8fe89775e9bbfa627becb)
1a3ef070eSClaudio Fontana /*
2a3ef070eSClaudio Fontana  * ARM NEON vector operations.
3a3ef070eSClaudio Fontana  *
4a3ef070eSClaudio Fontana  * Copyright (c) 2007, 2008 CodeSourcery.
5a3ef070eSClaudio Fontana  * Written by Paul Brook
6a3ef070eSClaudio Fontana  *
7a3ef070eSClaudio Fontana  * This code is licensed under the GNU GPL v2.
8a3ef070eSClaudio Fontana  */
9a3ef070eSClaudio Fontana 
10940392c8SRichard Henderson #include "qemu/osdep.h"
11a3ef070eSClaudio Fontana #include "cpu.h"
12a3ef070eSClaudio Fontana #include "exec/helper-proto.h"
13940392c8SRichard Henderson #include "tcg/tcg-gvec-desc.h"
14a3ef070eSClaudio Fontana #include "fpu/softfloat.h"
15a3ef070eSClaudio Fontana #include "vec_internal.h"
16a3ef070eSClaudio Fontana 
/* Masks selecting the most significant bit of a 32-bit / 64-bit value. */
#define SIGNBIT ((uint32_t)0x80000000)
#define SIGNBIT64 ((uint64_t)1 << 63)

/*
 * Set element 0 of the qc array.  Relies on a variable named "env" being
 * in scope at the point of use, with an assignable vfp.qc[0] member.
 */
#define SET_QC() (env->vfp.qc[0] = 1)
21a3ef070eSClaudio Fontana 
/*
 * Structure views of a 32-bit value split into one, two or four equally
 * sized lanes.  The member order is reversed on big-endian hosts so that
 * v1 always names the lane overlaying the least significant bits when the
 * structure is overlaid on a uint32_t.
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

/* Instantiate the lane views used by the helpers in this file. */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
/* The generator macros are not needed past this point. */
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
68a3ef070eSClaudio Fontana 
/*
 * Copy from a uint32_t to a vector structure type.
 *
 * vtype: structure type to produce
 * dest:  lvalue of type vtype that receives the lanes
 * val:   32-bit value to reinterpret (evaluated once)
 *
 * The conversion goes through a union, so the bits are reinterpreted
 * rather than value-converted.
 */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while (0)
78a3ef070eSClaudio Fontana 
/*
 * Copy from a vector structure type to a uint32_t.
 *
 * vtype: structure type of val
 * dest:  lvalue receiving the packed 32-bit value
 * val:   structure to reinterpret (evaluated once)
 *
 * The conversion goes through a union, so the bits are reinterpreted
 * rather than value-converted.
 */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while (0)
88a3ef070eSClaudio Fontana 
/*
 * Apply the currently defined NEON_FN lane by lane for 1, 2 or 4 lanes.
 * The expansion refers to variables named vdest, vsrc1 and vsrc2, which
 * must be in scope wherever NEON_FN actually uses the matching argument.
 * Each expansion already ends with a semicolon.
 */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
99a3ef070eSClaudio Fontana 
/*
 * Function body shared by the two-operand scalar helpers.
 *
 * Expects uint32_t values named arg1 and arg2 in scope.  Both are
 * reinterpreted as vtype, NEON_DO<n> applies NEON_FN to each of the n
 * lanes, and the resulting lanes are packed back into the returned
 * uint32_t.
 */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}
112a3ef070eSClaudio Fontana 
/*
 * Define the helper HELPER(neon_<name>) taking two packed 32-bit operands
 * and applying the current NEON_FN to each of the n lanes of vtype.
 */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
116a3ef070eSClaudio Fontana 
/*
 * Same as NEON_VOP, but the generated helper also receives the CPU state
 * as "env", so NEON_FN may refer to it (for example to env->vfp.qc).
 */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
120a3ef070eSClaudio Fontana 
/*
 * Define a vector helper HELPER(name) that applies the current NEON_FN to
 * every vtype-sized element of two source vectors.
 *
 * vd:   destination vector
 * vn:   first source vector
 * vm:   second source vector
 * desc: descriptor; simd_oprsz() yields the number of bytes to process and
 *       simd_maxsz() the size handed to clear_tail() (presumably the
 *       bytes after opr_sz are cleared -- confirm against clear_tail()).
 */
#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    vtype *d = vd, *n = vn, *m = vm;                            \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], m[i]);                              \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}
131940392c8SRichard Henderson 
/*
 * Variant of the two-source vector helper generator whose helpers take an
 * extra venv pointer, exposed to NEON_FN as "CPUARMState *env".
 *
 * vd:   destination vector
 * vn:   first source vector
 * vm:   second source vector
 * venv: CPU state pointer
 * desc: descriptor; simd_oprsz() yields the number of bytes to process and
 *       simd_maxsz() the size handed to clear_tail() (presumably the
 *       bytes after opr_sz are cleared -- confirm against clear_tail()).
 */
#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, void *venv, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    vtype *d = vd, *n = vn, *m = vm;                            \
    CPUARMState *env = venv;                                    \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], m[i]);                              \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}
143e72a6878SRichard Henderson 
/*
 * Variant of the vector helper generator for one source vector plus an
 * immediate.  The immediate is read from the descriptor with simd_data()
 * and passed to NEON_FN as the third argument for every element.
 *
 * vd:   destination vector
 * vn:   source vector
 * venv: CPU state pointer, exposed to NEON_FN as "env"
 * desc: descriptor; simd_oprsz() yields the number of bytes to process and
 *       simd_maxsz() the size handed to clear_tail() (presumably the
 *       bytes after opr_sz are cleared -- confirm against clear_tail()).
 */
#define NEON_GVEC_VOP2i_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *venv, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    int imm = simd_data(desc);                                  \
    vtype *d = vd, *n = vn;                                     \
    CPUARMState *env = venv;                                    \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], imm);                               \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}
156ef2b80ebSRichard Henderson 
/* Pairwise operations.  */
/*
 * For 32-bit elements each segment only contains a single element, so
 * the elementwise and pairwise operations are the same.
 *
 * NEON_PDO<n> combines adjacent lanes: the low half of vdest is built from
 * neighbouring lanes of vsrc1 and the high half from neighbouring lanes of
 * vsrc2.  Each expansion already ends with a semicolon.
 */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4);
168a3ef070eSClaudio Fontana 
/*
 * Define the pairwise helper HELPER(neon_<name>): both packed 32-bit
 * operands are reinterpreted as vtype, NEON_PDO<n> applies the current
 * NEON_FN to adjacent lane pairs, and the result lanes are packed into the
 * returned uint32_t.
 */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}
182a3ef070eSClaudio Fontana 
/*
 * Unary operators.
 *
 * Define the helper HELPER(neon_<name>) taking one packed 32-bit operand.
 * NEON_DO<n> still spells out vsrc2.vN as the third NEON_FN argument even
 * though no vsrc2 is declared here, so this generator only works with
 * NEON_FN definitions that discard their third parameter.
 */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}
194a3ef070eSClaudio Fontana 
195a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
196a3ef070eSClaudio Fontana NEON_POP(pmin_s8, neon_s8, 4)
197a3ef070eSClaudio Fontana NEON_POP(pmin_u8, neon_u8, 4)
198a3ef070eSClaudio Fontana NEON_POP(pmin_s16, neon_s16, 2)
199a3ef070eSClaudio Fontana NEON_POP(pmin_u16, neon_u16, 2)
200a3ef070eSClaudio Fontana #undef NEON_FN
201a3ef070eSClaudio Fontana 
202a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
203a3ef070eSClaudio Fontana NEON_POP(pmax_s8, neon_s8, 4)
204a3ef070eSClaudio Fontana NEON_POP(pmax_u8, neon_u8, 4)
205a3ef070eSClaudio Fontana NEON_POP(pmax_s16, neon_s16, 2)
206a3ef070eSClaudio Fontana NEON_POP(pmax_u16, neon_u16, 2)
207a3ef070eSClaudio Fontana #undef NEON_FN
208a3ef070eSClaudio Fontana 
209a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) \
210a3ef070eSClaudio Fontana     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
211a3ef070eSClaudio Fontana NEON_VOP(shl_u16, neon_u16, 2)
212a3ef070eSClaudio Fontana #undef NEON_FN
213a3ef070eSClaudio Fontana 
214a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) \
215a3ef070eSClaudio Fontana     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
216a3ef070eSClaudio Fontana NEON_VOP(shl_s16, neon_s16, 2)
217a3ef070eSClaudio Fontana #undef NEON_FN
218a3ef070eSClaudio Fontana 
219a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) \
220a3ef070eSClaudio Fontana     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
221a3ef070eSClaudio Fontana NEON_VOP(rshl_s8, neon_s8, 4)
222940392c8SRichard Henderson NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
223a3ef070eSClaudio Fontana #undef NEON_FN
224a3ef070eSClaudio Fontana 
225a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) \
226a3ef070eSClaudio Fontana     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
227a3ef070eSClaudio Fontana NEON_VOP(rshl_s16, neon_s16, 2)
228940392c8SRichard Henderson NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
229940392c8SRichard Henderson #undef NEON_FN
230940392c8SRichard Henderson 
231940392c8SRichard Henderson #define NEON_FN(dest, src1, src2) \
232940392c8SRichard Henderson     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
233940392c8SRichard Henderson NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
234940392c8SRichard Henderson #undef NEON_FN
235940392c8SRichard Henderson 
236940392c8SRichard Henderson #define NEON_FN(dest, src1, src2) \
237940392c8SRichard Henderson     (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
238940392c8SRichard Henderson NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
239a3ef070eSClaudio Fontana #undef NEON_FN
240a3ef070eSClaudio Fontana 
241a3ef070eSClaudio Fontana uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
242a3ef070eSClaudio Fontana {
243a3ef070eSClaudio Fontana     return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
244a3ef070eSClaudio Fontana }
245a3ef070eSClaudio Fontana 
HELPER(neon_rshl_s64)246a3ef070eSClaudio Fontana uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
247a3ef070eSClaudio Fontana {
248a3ef070eSClaudio Fontana     return do_sqrshl_d(val, (int8_t)shift, true, NULL);
249a3ef070eSClaudio Fontana }
250a3ef070eSClaudio Fontana 
251a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) \
252a3ef070eSClaudio Fontana     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
253a3ef070eSClaudio Fontana NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b,uint8_t)254940392c8SRichard Henderson NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
255a3ef070eSClaudio Fontana #undef NEON_FN
256a3ef070eSClaudio Fontana 
257a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) \
258a3ef070eSClaudio Fontana     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
259a3ef070eSClaudio Fontana NEON_VOP(rshl_u16, neon_u16, 2)
260940392c8SRichard Henderson NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
261940392c8SRichard Henderson #undef NEON_FN
262940392c8SRichard Henderson 
263940392c8SRichard Henderson #define NEON_FN(dest, src1, src2) \
264940392c8SRichard Henderson     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
265940392c8SRichard Henderson NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
266940392c8SRichard Henderson #undef NEON_FN
267940392c8SRichard Henderson 
268940392c8SRichard Henderson #define NEON_FN(dest, src1, src2) \
269940392c8SRichard Henderson     (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
270940392c8SRichard Henderson NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
271a3ef070eSClaudio Fontana #undef NEON_FN
272a3ef070eSClaudio Fontana 
273a3ef070eSClaudio Fontana uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
274a3ef070eSClaudio Fontana {
275a3ef070eSClaudio Fontana     return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
276a3ef070eSClaudio Fontana }
277a3ef070eSClaudio Fontana 
HELPER(neon_rshl_u64)278a3ef070eSClaudio Fontana uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
279a3ef070eSClaudio Fontana {
280a3ef070eSClaudio Fontana     return do_uqrshl_d(val, (int8_t)shift, true, NULL);
281a3ef070eSClaudio Fontana }
282a3ef070eSClaudio Fontana 
283a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) \
284a3ef070eSClaudio Fontana     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
285a3ef070eSClaudio Fontana NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b,uint8_t)286e72a6878SRichard Henderson NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
287ef2b80ebSRichard Henderson NEON_GVEC_VOP2i_ENV(neon_uqshli_b, uint8_t)
288a3ef070eSClaudio Fontana #undef NEON_FN
289a3ef070eSClaudio Fontana 
290a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) \
291a3ef070eSClaudio Fontana     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
292a3ef070eSClaudio Fontana NEON_VOP_ENV(qshl_u16, neon_u16, 2)
293e72a6878SRichard Henderson NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
294ef2b80ebSRichard Henderson NEON_GVEC_VOP2i_ENV(neon_uqshli_h, uint16_t)
295e72a6878SRichard Henderson #undef NEON_FN
296e72a6878SRichard Henderson 
297e72a6878SRichard Henderson #define NEON_FN(dest, src1, src2) \
298e72a6878SRichard Henderson     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
299e72a6878SRichard Henderson NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
300ef2b80ebSRichard Henderson NEON_GVEC_VOP2i_ENV(neon_uqshli_s, uint32_t)
301e72a6878SRichard Henderson #undef NEON_FN
302e72a6878SRichard Henderson 
303e72a6878SRichard Henderson #define NEON_FN(dest, src1, src2) \
304e72a6878SRichard Henderson     (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
305e72a6878SRichard Henderson NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
306ef2b80ebSRichard Henderson NEON_GVEC_VOP2i_ENV(neon_uqshli_d, uint64_t)
307a3ef070eSClaudio Fontana #undef NEON_FN
308a3ef070eSClaudio Fontana 
309a3ef070eSClaudio Fontana uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
310a3ef070eSClaudio Fontana {
311a3ef070eSClaudio Fontana     return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
312a3ef070eSClaudio Fontana }
313a3ef070eSClaudio Fontana 
HELPER(neon_qshl_u64)314a3ef070eSClaudio Fontana uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
315a3ef070eSClaudio Fontana {
316a3ef070eSClaudio Fontana     return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
317a3ef070eSClaudio Fontana }
318a3ef070eSClaudio Fontana 
319a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) \
320a3ef070eSClaudio Fontana     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
321a3ef070eSClaudio Fontana NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b,int8_t)322e72a6878SRichard Henderson NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
323ef2b80ebSRichard Henderson NEON_GVEC_VOP2i_ENV(neon_sqshli_b, int8_t)
324a3ef070eSClaudio Fontana #undef NEON_FN
325a3ef070eSClaudio Fontana 
326a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) \
327a3ef070eSClaudio Fontana     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
328a3ef070eSClaudio Fontana NEON_VOP_ENV(qshl_s16, neon_s16, 2)
329e72a6878SRichard Henderson NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
330ef2b80ebSRichard Henderson NEON_GVEC_VOP2i_ENV(neon_sqshli_h, int16_t)
331e72a6878SRichard Henderson #undef NEON_FN
332e72a6878SRichard Henderson 
333e72a6878SRichard Henderson #define NEON_FN(dest, src1, src2) \
334e72a6878SRichard Henderson     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
335e72a6878SRichard Henderson NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
336ef2b80ebSRichard Henderson NEON_GVEC_VOP2i_ENV(neon_sqshli_s, int32_t)
337e72a6878SRichard Henderson #undef NEON_FN
338e72a6878SRichard Henderson 
339e72a6878SRichard Henderson #define NEON_FN(dest, src1, src2) \
340e72a6878SRichard Henderson     (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
341e72a6878SRichard Henderson NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
342ef2b80ebSRichard Henderson NEON_GVEC_VOP2i_ENV(neon_sqshli_d, int64_t)
343a3ef070eSClaudio Fontana #undef NEON_FN
344a3ef070eSClaudio Fontana 
345a3ef070eSClaudio Fontana uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
346a3ef070eSClaudio Fontana {
347a3ef070eSClaudio Fontana     return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
348a3ef070eSClaudio Fontana }
349a3ef070eSClaudio Fontana 
HELPER(neon_qshl_s64)350a3ef070eSClaudio Fontana uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
351a3ef070eSClaudio Fontana {
352a3ef070eSClaudio Fontana     return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
353a3ef070eSClaudio Fontana }
354a3ef070eSClaudio Fontana 
355a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) \
356a3ef070eSClaudio Fontana     (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
357a3ef070eSClaudio Fontana NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
NEON_GVEC_VOP2i_ENV(neon_sqshlui_b,int8_t)358ef2b80ebSRichard Henderson NEON_GVEC_VOP2i_ENV(neon_sqshlui_b, int8_t)
359a3ef070eSClaudio Fontana #undef NEON_FN
360a3ef070eSClaudio Fontana 
361a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) \
362a3ef070eSClaudio Fontana     (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
363a3ef070eSClaudio Fontana NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
364ef2b80ebSRichard Henderson NEON_GVEC_VOP2i_ENV(neon_sqshlui_h, int16_t)
365a3ef070eSClaudio Fontana #undef NEON_FN
366a3ef070eSClaudio Fontana 
367a3ef070eSClaudio Fontana uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
368a3ef070eSClaudio Fontana {
369a3ef070eSClaudio Fontana     return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
370a3ef070eSClaudio Fontana }
371a3ef070eSClaudio Fontana 
HELPER(neon_qshlu_s64)372a3ef070eSClaudio Fontana uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
373a3ef070eSClaudio Fontana {
374a3ef070eSClaudio Fontana     return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
375a3ef070eSClaudio Fontana }
376a3ef070eSClaudio Fontana 
377a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) \
378ef2b80ebSRichard Henderson     (dest = do_suqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2i_ENV(neon_sqshlui_s,int32_t)379ef2b80ebSRichard Henderson NEON_GVEC_VOP2i_ENV(neon_sqshlui_s, int32_t)
380ef2b80ebSRichard Henderson #undef NEON_FN
381ef2b80ebSRichard Henderson 
382ef2b80ebSRichard Henderson #define NEON_FN(dest, src1, src2) \
383ef2b80ebSRichard Henderson     (dest = do_suqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
384ef2b80ebSRichard Henderson NEON_GVEC_VOP2i_ENV(neon_sqshlui_d, int64_t)
385ef2b80ebSRichard Henderson #undef NEON_FN
386ef2b80ebSRichard Henderson 
387ef2b80ebSRichard Henderson #define NEON_FN(dest, src1, src2) \
388a3ef070eSClaudio Fontana     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
389a3ef070eSClaudio Fontana NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
390cef9d54fSRichard Henderson NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t)
391a3ef070eSClaudio Fontana #undef NEON_FN
392a3ef070eSClaudio Fontana 
393a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) \
394a3ef070eSClaudio Fontana     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
395a3ef070eSClaudio Fontana NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
396cef9d54fSRichard Henderson NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t)
397cef9d54fSRichard Henderson #undef NEON_FN
398cef9d54fSRichard Henderson 
399cef9d54fSRichard Henderson #define NEON_FN(dest, src1, src2) \
400cef9d54fSRichard Henderson     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
401cef9d54fSRichard Henderson NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t)
402cef9d54fSRichard Henderson #undef NEON_FN
403cef9d54fSRichard Henderson 
404cef9d54fSRichard Henderson #define NEON_FN(dest, src1, src2) \
405cef9d54fSRichard Henderson     (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
406cef9d54fSRichard Henderson NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t)
407a3ef070eSClaudio Fontana #undef NEON_FN
408a3ef070eSClaudio Fontana 
409a3ef070eSClaudio Fontana uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
410a3ef070eSClaudio Fontana {
411a3ef070eSClaudio Fontana     return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
412a3ef070eSClaudio Fontana }
413a3ef070eSClaudio Fontana 
HELPER(neon_qrshl_u64)414a3ef070eSClaudio Fontana uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
415a3ef070eSClaudio Fontana {
416a3ef070eSClaudio Fontana     return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
417a3ef070eSClaudio Fontana }
418a3ef070eSClaudio Fontana 
419a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) \
420a3ef070eSClaudio Fontana     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
421a3ef070eSClaudio Fontana NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqrshl_b,int8_t)422cef9d54fSRichard Henderson NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t)
423a3ef070eSClaudio Fontana #undef NEON_FN
424a3ef070eSClaudio Fontana 
425a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) \
426a3ef070eSClaudio Fontana     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
427a3ef070eSClaudio Fontana NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
428cef9d54fSRichard Henderson NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t)
429cef9d54fSRichard Henderson #undef NEON_FN
430cef9d54fSRichard Henderson 
431cef9d54fSRichard Henderson #define NEON_FN(dest, src1, src2) \
432cef9d54fSRichard Henderson     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
433cef9d54fSRichard Henderson NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t)
434cef9d54fSRichard Henderson #undef NEON_FN
435cef9d54fSRichard Henderson 
436cef9d54fSRichard Henderson #define NEON_FN(dest, src1, src2) \
437cef9d54fSRichard Henderson     (dest = do_sqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
438cef9d54fSRichard Henderson NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t)
439a3ef070eSClaudio Fontana #undef NEON_FN
440a3ef070eSClaudio Fontana 
441a3ef070eSClaudio Fontana uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
442a3ef070eSClaudio Fontana {
443a3ef070eSClaudio Fontana     return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
444a3ef070eSClaudio Fontana }
445a3ef070eSClaudio Fontana 
HELPER(neon_qrshl_s64)446a3ef070eSClaudio Fontana uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
447a3ef070eSClaudio Fontana {
448a3ef070eSClaudio Fontana     return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
449a3ef070eSClaudio Fontana }
450a3ef070eSClaudio Fontana 
HELPER(neon_add_u8)451a3ef070eSClaudio Fontana uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
452a3ef070eSClaudio Fontana {
453a3ef070eSClaudio Fontana     uint32_t mask;
454a3ef070eSClaudio Fontana     mask = (a ^ b) & 0x80808080u;
455a3ef070eSClaudio Fontana     a &= ~0x80808080u;
456a3ef070eSClaudio Fontana     b &= ~0x80808080u;
457a3ef070eSClaudio Fontana     return (a + b) ^ mask;
458a3ef070eSClaudio Fontana }
459a3ef070eSClaudio Fontana 
HELPER(neon_add_u16)460a3ef070eSClaudio Fontana uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
461a3ef070eSClaudio Fontana {
462a3ef070eSClaudio Fontana     uint32_t mask;
463a3ef070eSClaudio Fontana     mask = (a ^ b) & 0x80008000u;
464a3ef070eSClaudio Fontana     a &= ~0x80008000u;
465a3ef070eSClaudio Fontana     b &= ~0x80008000u;
466a3ef070eSClaudio Fontana     return (a + b) ^ mask;
467a3ef070eSClaudio Fontana }
468a3ef070eSClaudio Fontana 
469a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) dest = src1 - src2
470a3ef070eSClaudio Fontana NEON_VOP(sub_u8, neon_u8, 4)
471a3ef070eSClaudio Fontana NEON_VOP(sub_u16, neon_u16, 2)
472a3ef070eSClaudio Fontana #undef NEON_FN
473a3ef070eSClaudio Fontana 
474a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) dest = src1 * src2
475a3ef070eSClaudio Fontana NEON_VOP(mul_u8, neon_u8, 4)
476a3ef070eSClaudio Fontana NEON_VOP(mul_u16, neon_u16, 2)
477a3ef070eSClaudio Fontana #undef NEON_FN
478a3ef070eSClaudio Fontana 
479a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
480a3ef070eSClaudio Fontana NEON_VOP(tst_u8, neon_u8, 4)
481a3ef070eSClaudio Fontana NEON_VOP(tst_u16, neon_u16, 2)
482a3ef070eSClaudio Fontana NEON_VOP(tst_u32, neon_u32, 1)
483a3ef070eSClaudio Fontana #undef NEON_FN
484a3ef070eSClaudio Fontana 
/* Count Leading Sign/Zero Bits.  */

/*
 * Return the number of leading zero bits in the 8-bit value x.
 * Yields 8 when x is zero.
 */
static inline int do_clz8(uint8_t x)
{
    int n;

    /* Start from the full width and drop one per significant bit. */
    for (n = 8; x; n--) {
        x >>= 1;
    }
    return n;
}
493a3ef070eSClaudio Fontana 
/*
 * Return the number of leading zero bits in the 16-bit value x.
 * Yields 16 when x is zero.
 */
static inline int do_clz16(uint16_t x)
{
    int n;

    /* Start from the full width and drop one per significant bit. */
    for (n = 16; x; n--) {
        x >>= 1;
    }
    return n;
}
501a3ef070eSClaudio Fontana 
502a3ef070eSClaudio Fontana #define NEON_FN(dest, src, dummy) dest = do_clz8(src)
503a3ef070eSClaudio Fontana NEON_VOP1(clz_u8, neon_u8, 4)
504a3ef070eSClaudio Fontana #undef NEON_FN
505a3ef070eSClaudio Fontana 
506a3ef070eSClaudio Fontana #define NEON_FN(dest, src, dummy) dest = do_clz16(src)
507a3ef070eSClaudio Fontana NEON_VOP1(clz_u16, neon_u16, 2)
508a3ef070eSClaudio Fontana #undef NEON_FN
509a3ef070eSClaudio Fontana 
510a3ef070eSClaudio Fontana #define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
511a3ef070eSClaudio Fontana NEON_VOP1(cls_s8, neon_s8, 4)
512a3ef070eSClaudio Fontana #undef NEON_FN
513a3ef070eSClaudio Fontana 
514a3ef070eSClaudio Fontana #define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
515a3ef070eSClaudio Fontana NEON_VOP1(cls_s16, neon_s16, 2)
516a3ef070eSClaudio Fontana #undef NEON_FN
517a3ef070eSClaudio Fontana 
HELPER(neon_cls_s32)518a3ef070eSClaudio Fontana uint32_t HELPER(neon_cls_s32)(uint32_t x)
519a3ef070eSClaudio Fontana {
520a3ef070eSClaudio Fontana     int count;
521a3ef070eSClaudio Fontana     if ((int32_t)x < 0)
522a3ef070eSClaudio Fontana         x = ~x;
523a3ef070eSClaudio Fontana     for (count = 32; x; count--)
524a3ef070eSClaudio Fontana         x = x >> 1;
525a3ef070eSClaudio Fontana     return count - 1;
526a3ef070eSClaudio Fontana }
527a3ef070eSClaudio Fontana 
528a3ef070eSClaudio Fontana /* Bit count.  */
HELPER(neon_cnt_u8)529a3ef070eSClaudio Fontana uint32_t HELPER(neon_cnt_u8)(uint32_t x)
530a3ef070eSClaudio Fontana {
531a3ef070eSClaudio Fontana     x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
532a3ef070eSClaudio Fontana     x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
533a3ef070eSClaudio Fontana     x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
534a3ef070eSClaudio Fontana     return x;
535a3ef070eSClaudio Fontana }
536a3ef070eSClaudio Fontana 
537a3ef070eSClaudio Fontana /* Reverse bits in each 8 bit word */
HELPER(neon_rbit_u8)538a3ef070eSClaudio Fontana uint32_t HELPER(neon_rbit_u8)(uint32_t x)
539a3ef070eSClaudio Fontana {
540a3ef070eSClaudio Fontana     x =  ((x & 0xf0f0f0f0) >> 4)
541a3ef070eSClaudio Fontana        | ((x & 0x0f0f0f0f) << 4);
542a3ef070eSClaudio Fontana     x =  ((x & 0x88888888) >> 3)
543a3ef070eSClaudio Fontana        | ((x & 0x44444444) >> 1)
544a3ef070eSClaudio Fontana        | ((x & 0x22222222) << 1)
545a3ef070eSClaudio Fontana        | ((x & 0x11111111) << 3);
546a3ef070eSClaudio Fontana     return x;
547a3ef070eSClaudio Fontana }
548a3ef070eSClaudio Fontana 
/*
 * 16-bit doubling multiply returning the high half, with saturation.
 *
 * dest:  lvalue receiving the upper 16 bits of the doubled product
 * src1:  first operand, converted to int16_t
 * src2:  second operand, converted to int16_t
 * round: when non-zero, add 1 << 15 before taking the upper half
 *
 * If doubling the product would overflow 32 bits, or the rounding addition
 * overflows, SET_QC() is invoked and the intermediate value saturates.
 * Uses SIGNBIT and SET_QC() from the point of expansion.
 */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t)(src1) * (int16_t)(src2); \
    /* The top two bits differ exactly when doubling would overflow. */ \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while (0)
567a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
568a3ef070eSClaudio Fontana NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
569a3ef070eSClaudio Fontana #undef NEON_FN
570a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
571a3ef070eSClaudio Fontana NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
572a3ef070eSClaudio Fontana #undef NEON_FN
573a3ef070eSClaudio Fontana #undef NEON_QDMULH16
574a3ef070eSClaudio Fontana 
575a3ef070eSClaudio Fontana #define NEON_QDMULH32(dest, src1, src2, round) do { \
576a3ef070eSClaudio Fontana     uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
577a3ef070eSClaudio Fontana     if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
578a3ef070eSClaudio Fontana         SET_QC(); \
579a3ef070eSClaudio Fontana         tmp = (tmp >> 63) ^ ~SIGNBIT64; \
580a3ef070eSClaudio Fontana     } else { \
581a3ef070eSClaudio Fontana         tmp <<= 1; \
582a3ef070eSClaudio Fontana     } \
583a3ef070eSClaudio Fontana     if (round) { \
584a3ef070eSClaudio Fontana         int64_t old = tmp; \
585a3ef070eSClaudio Fontana         tmp += (int64_t)1 << 31; \
586a3ef070eSClaudio Fontana         if ((int64_t)tmp < old) { \
587a3ef070eSClaudio Fontana             SET_QC(); \
588a3ef070eSClaudio Fontana             tmp = SIGNBIT64 - 1; \
589a3ef070eSClaudio Fontana         } \
590a3ef070eSClaudio Fontana     } \
591a3ef070eSClaudio Fontana     dest = tmp >> 32; \
592a3ef070eSClaudio Fontana     } while(0)
593a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
594a3ef070eSClaudio Fontana NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
595a3ef070eSClaudio Fontana #undef NEON_FN
596a3ef070eSClaudio Fontana #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
597a3ef070eSClaudio Fontana NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
598a3ef070eSClaudio Fontana #undef NEON_FN
599a3ef070eSClaudio Fontana #undef NEON_QDMULH32
600a3ef070eSClaudio Fontana 
601*3e683f0aSRichard Henderson /* Only the low 32-bits of output are significant. */
HELPER(neon_narrow_u8)602*3e683f0aSRichard Henderson uint64_t HELPER(neon_narrow_u8)(uint64_t x)
603a3ef070eSClaudio Fontana {
604a3ef070eSClaudio Fontana     return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
605a3ef070eSClaudio Fontana            | ((x >> 24) & 0xff000000u);
606a3ef070eSClaudio Fontana }
607a3ef070eSClaudio Fontana 
608*3e683f0aSRichard Henderson /* Only the low 32-bits of output are significant. */
HELPER(neon_narrow_u16)609*3e683f0aSRichard Henderson uint64_t HELPER(neon_narrow_u16)(uint64_t x)
610a3ef070eSClaudio Fontana {
611a3ef070eSClaudio Fontana     return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
612a3ef070eSClaudio Fontana }
613a3ef070eSClaudio Fontana 
HELPER(neon_narrow_high_u8)614a3ef070eSClaudio Fontana uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
615a3ef070eSClaudio Fontana {
616a3ef070eSClaudio Fontana     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
617a3ef070eSClaudio Fontana             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
618a3ef070eSClaudio Fontana }
619a3ef070eSClaudio Fontana 
HELPER(neon_narrow_high_u16)620a3ef070eSClaudio Fontana uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
621a3ef070eSClaudio Fontana {
622a3ef070eSClaudio Fontana     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
623a3ef070eSClaudio Fontana }
624a3ef070eSClaudio Fontana 
HELPER(neon_narrow_round_high_u8)625a3ef070eSClaudio Fontana uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
626a3ef070eSClaudio Fontana {
627a3ef070eSClaudio Fontana     x &= 0xff80ff80ff80ff80ull;
628a3ef070eSClaudio Fontana     x += 0x0080008000800080ull;
629a3ef070eSClaudio Fontana     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
630a3ef070eSClaudio Fontana             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
631a3ef070eSClaudio Fontana }
632a3ef070eSClaudio Fontana 
HELPER(neon_narrow_round_high_u16)633a3ef070eSClaudio Fontana uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
634a3ef070eSClaudio Fontana {
635a3ef070eSClaudio Fontana     x &= 0xffff8000ffff8000ull;
636a3ef070eSClaudio Fontana     x += 0x0000800000008000ull;
637a3ef070eSClaudio Fontana     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
638a3ef070eSClaudio Fontana }
639a3ef070eSClaudio Fontana 
640*3e683f0aSRichard Henderson /* Only the low 32-bits of output are significant. */
HELPER(neon_unarrow_sat8)641*3e683f0aSRichard Henderson uint64_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
642a3ef070eSClaudio Fontana {
643a3ef070eSClaudio Fontana     uint16_t s;
644a3ef070eSClaudio Fontana     uint8_t d;
645a3ef070eSClaudio Fontana     uint32_t res = 0;
646a3ef070eSClaudio Fontana #define SAT8(n) \
647a3ef070eSClaudio Fontana     s = x >> n; \
648a3ef070eSClaudio Fontana     if (s & 0x8000) { \
649a3ef070eSClaudio Fontana         SET_QC(); \
650a3ef070eSClaudio Fontana     } else { \
651a3ef070eSClaudio Fontana         if (s > 0xff) { \
652a3ef070eSClaudio Fontana             d = 0xff; \
653a3ef070eSClaudio Fontana             SET_QC(); \
654a3ef070eSClaudio Fontana         } else  { \
655a3ef070eSClaudio Fontana             d = s; \
656a3ef070eSClaudio Fontana         } \
657a3ef070eSClaudio Fontana         res |= (uint32_t)d << (n / 2); \
658a3ef070eSClaudio Fontana     }
659a3ef070eSClaudio Fontana 
660a3ef070eSClaudio Fontana     SAT8(0);
661a3ef070eSClaudio Fontana     SAT8(16);
662a3ef070eSClaudio Fontana     SAT8(32);
663a3ef070eSClaudio Fontana     SAT8(48);
664a3ef070eSClaudio Fontana #undef SAT8
665a3ef070eSClaudio Fontana     return res;
666a3ef070eSClaudio Fontana }
667a3ef070eSClaudio Fontana 
668*3e683f0aSRichard Henderson /* Only the low 32-bits of output are significant. */
HELPER(neon_narrow_sat_u8)669*3e683f0aSRichard Henderson uint64_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
670a3ef070eSClaudio Fontana {
671a3ef070eSClaudio Fontana     uint16_t s;
672a3ef070eSClaudio Fontana     uint8_t d;
673a3ef070eSClaudio Fontana     uint32_t res = 0;
674a3ef070eSClaudio Fontana #define SAT8(n) \
675a3ef070eSClaudio Fontana     s = x >> n; \
676a3ef070eSClaudio Fontana     if (s > 0xff) { \
677a3ef070eSClaudio Fontana         d = 0xff; \
678a3ef070eSClaudio Fontana         SET_QC(); \
679a3ef070eSClaudio Fontana     } else  { \
680a3ef070eSClaudio Fontana         d = s; \
681a3ef070eSClaudio Fontana     } \
682a3ef070eSClaudio Fontana     res |= (uint32_t)d << (n / 2);
683a3ef070eSClaudio Fontana 
684a3ef070eSClaudio Fontana     SAT8(0);
685a3ef070eSClaudio Fontana     SAT8(16);
686a3ef070eSClaudio Fontana     SAT8(32);
687a3ef070eSClaudio Fontana     SAT8(48);
688a3ef070eSClaudio Fontana #undef SAT8
689a3ef070eSClaudio Fontana     return res;
690a3ef070eSClaudio Fontana }
691a3ef070eSClaudio Fontana 
692*3e683f0aSRichard Henderson /* Only the low 32-bits of output are significant. */
HELPER(neon_narrow_sat_s8)693*3e683f0aSRichard Henderson uint64_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
694a3ef070eSClaudio Fontana {
695a3ef070eSClaudio Fontana     int16_t s;
696a3ef070eSClaudio Fontana     uint8_t d;
697a3ef070eSClaudio Fontana     uint32_t res = 0;
698a3ef070eSClaudio Fontana #define SAT8(n) \
699a3ef070eSClaudio Fontana     s = x >> n; \
700a3ef070eSClaudio Fontana     if (s != (int8_t)s) { \
701a3ef070eSClaudio Fontana         d = (s >> 15) ^ 0x7f; \
702a3ef070eSClaudio Fontana         SET_QC(); \
703a3ef070eSClaudio Fontana     } else  { \
704a3ef070eSClaudio Fontana         d = s; \
705a3ef070eSClaudio Fontana     } \
706a3ef070eSClaudio Fontana     res |= (uint32_t)d << (n / 2);
707a3ef070eSClaudio Fontana 
708a3ef070eSClaudio Fontana     SAT8(0);
709a3ef070eSClaudio Fontana     SAT8(16);
710a3ef070eSClaudio Fontana     SAT8(32);
711a3ef070eSClaudio Fontana     SAT8(48);
712a3ef070eSClaudio Fontana #undef SAT8
713a3ef070eSClaudio Fontana     return res;
714a3ef070eSClaudio Fontana }
715a3ef070eSClaudio Fontana 
716*3e683f0aSRichard Henderson /* Only the low 32-bits of output are significant. */
HELPER(neon_unarrow_sat16)717*3e683f0aSRichard Henderson uint64_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
718a3ef070eSClaudio Fontana {
719a3ef070eSClaudio Fontana     uint32_t high;
720a3ef070eSClaudio Fontana     uint32_t low;
721a3ef070eSClaudio Fontana     low = x;
722a3ef070eSClaudio Fontana     if (low & 0x80000000) {
723a3ef070eSClaudio Fontana         low = 0;
724a3ef070eSClaudio Fontana         SET_QC();
725a3ef070eSClaudio Fontana     } else if (low > 0xffff) {
726a3ef070eSClaudio Fontana         low = 0xffff;
727a3ef070eSClaudio Fontana         SET_QC();
728a3ef070eSClaudio Fontana     }
729a3ef070eSClaudio Fontana     high = x >> 32;
730a3ef070eSClaudio Fontana     if (high & 0x80000000) {
731a3ef070eSClaudio Fontana         high = 0;
732a3ef070eSClaudio Fontana         SET_QC();
733a3ef070eSClaudio Fontana     } else if (high > 0xffff) {
734a3ef070eSClaudio Fontana         high = 0xffff;
735a3ef070eSClaudio Fontana         SET_QC();
736a3ef070eSClaudio Fontana     }
737*3e683f0aSRichard Henderson     return deposit32(low, 16, 16, high);
738a3ef070eSClaudio Fontana }
739a3ef070eSClaudio Fontana 
740*3e683f0aSRichard Henderson /* Only the low 32-bits of output are significant. */
HELPER(neon_narrow_sat_u16)741*3e683f0aSRichard Henderson uint64_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
742a3ef070eSClaudio Fontana {
743a3ef070eSClaudio Fontana     uint32_t high;
744a3ef070eSClaudio Fontana     uint32_t low;
745a3ef070eSClaudio Fontana     low = x;
746a3ef070eSClaudio Fontana     if (low > 0xffff) {
747a3ef070eSClaudio Fontana         low = 0xffff;
748a3ef070eSClaudio Fontana         SET_QC();
749a3ef070eSClaudio Fontana     }
750a3ef070eSClaudio Fontana     high = x >> 32;
751a3ef070eSClaudio Fontana     if (high > 0xffff) {
752a3ef070eSClaudio Fontana         high = 0xffff;
753a3ef070eSClaudio Fontana         SET_QC();
754a3ef070eSClaudio Fontana     }
755*3e683f0aSRichard Henderson     return deposit32(low, 16, 16, high);
756a3ef070eSClaudio Fontana }
757a3ef070eSClaudio Fontana 
758*3e683f0aSRichard Henderson /* Only the low 32-bits of output are significant. */
HELPER(neon_narrow_sat_s16)759*3e683f0aSRichard Henderson uint64_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
760a3ef070eSClaudio Fontana {
761a3ef070eSClaudio Fontana     int32_t low;
762a3ef070eSClaudio Fontana     int32_t high;
763a3ef070eSClaudio Fontana     low = x;
764a3ef070eSClaudio Fontana     if (low != (int16_t)low) {
765a3ef070eSClaudio Fontana         low = (low >> 31) ^ 0x7fff;
766a3ef070eSClaudio Fontana         SET_QC();
767a3ef070eSClaudio Fontana     }
768a3ef070eSClaudio Fontana     high = x >> 32;
769a3ef070eSClaudio Fontana     if (high != (int16_t)high) {
770a3ef070eSClaudio Fontana         high = (high >> 31) ^ 0x7fff;
771a3ef070eSClaudio Fontana         SET_QC();
772a3ef070eSClaudio Fontana     }
773*3e683f0aSRichard Henderson     return deposit32(low, 16, 16, high);
774a3ef070eSClaudio Fontana }
775a3ef070eSClaudio Fontana 
776*3e683f0aSRichard Henderson /* Only the low 32-bits of output are significant. */
HELPER(neon_unarrow_sat32)777*3e683f0aSRichard Henderson uint64_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
778a3ef070eSClaudio Fontana {
779a3ef070eSClaudio Fontana     if (x & 0x8000000000000000ull) {
780a3ef070eSClaudio Fontana         SET_QC();
781a3ef070eSClaudio Fontana         return 0;
782a3ef070eSClaudio Fontana     }
783a3ef070eSClaudio Fontana     if (x > 0xffffffffu) {
784a3ef070eSClaudio Fontana         SET_QC();
785a3ef070eSClaudio Fontana         return 0xffffffffu;
786a3ef070eSClaudio Fontana     }
787a3ef070eSClaudio Fontana     return x;
788a3ef070eSClaudio Fontana }
789a3ef070eSClaudio Fontana 
790*3e683f0aSRichard Henderson /* Only the low 32-bits of output are significant. */
HELPER(neon_narrow_sat_u32)791*3e683f0aSRichard Henderson uint64_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
792a3ef070eSClaudio Fontana {
793a3ef070eSClaudio Fontana     if (x > 0xffffffffu) {
794a3ef070eSClaudio Fontana         SET_QC();
795a3ef070eSClaudio Fontana         return 0xffffffffu;
796a3ef070eSClaudio Fontana     }
797a3ef070eSClaudio Fontana     return x;
798a3ef070eSClaudio Fontana }
799a3ef070eSClaudio Fontana 
800*3e683f0aSRichard Henderson /* Only the low 32-bits of output are significant. */
HELPER(neon_narrow_sat_s32)801*3e683f0aSRichard Henderson uint64_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
802a3ef070eSClaudio Fontana {
803a3ef070eSClaudio Fontana     if ((int64_t)x != (int32_t)x) {
804a3ef070eSClaudio Fontana         SET_QC();
805*3e683f0aSRichard Henderson         return (uint32_t)((int64_t)x >> 63) ^ 0x7fffffff;
806a3ef070eSClaudio Fontana     }
807*3e683f0aSRichard Henderson     return (uint32_t)x;
808a3ef070eSClaudio Fontana }
809a3ef070eSClaudio Fontana 
HELPER(neon_widen_u8)810a3ef070eSClaudio Fontana uint64_t HELPER(neon_widen_u8)(uint32_t x)
811a3ef070eSClaudio Fontana {
812a3ef070eSClaudio Fontana     uint64_t tmp;
813a3ef070eSClaudio Fontana     uint64_t ret;
814a3ef070eSClaudio Fontana     ret = (uint8_t)x;
815a3ef070eSClaudio Fontana     tmp = (uint8_t)(x >> 8);
816a3ef070eSClaudio Fontana     ret |= tmp << 16;
817a3ef070eSClaudio Fontana     tmp = (uint8_t)(x >> 16);
818a3ef070eSClaudio Fontana     ret |= tmp << 32;
819a3ef070eSClaudio Fontana     tmp = (uint8_t)(x >> 24);
820a3ef070eSClaudio Fontana     ret |= tmp << 48;
821a3ef070eSClaudio Fontana     return ret;
822a3ef070eSClaudio Fontana }
823a3ef070eSClaudio Fontana 
HELPER(neon_widen_s8)824a3ef070eSClaudio Fontana uint64_t HELPER(neon_widen_s8)(uint32_t x)
825a3ef070eSClaudio Fontana {
826a3ef070eSClaudio Fontana     uint64_t tmp;
827a3ef070eSClaudio Fontana     uint64_t ret;
828a3ef070eSClaudio Fontana     ret = (uint16_t)(int8_t)x;
829a3ef070eSClaudio Fontana     tmp = (uint16_t)(int8_t)(x >> 8);
830a3ef070eSClaudio Fontana     ret |= tmp << 16;
831a3ef070eSClaudio Fontana     tmp = (uint16_t)(int8_t)(x >> 16);
832a3ef070eSClaudio Fontana     ret |= tmp << 32;
833a3ef070eSClaudio Fontana     tmp = (uint16_t)(int8_t)(x >> 24);
834a3ef070eSClaudio Fontana     ret |= tmp << 48;
835a3ef070eSClaudio Fontana     return ret;
836a3ef070eSClaudio Fontana }
837a3ef070eSClaudio Fontana 
HELPER(neon_widen_u16)838a3ef070eSClaudio Fontana uint64_t HELPER(neon_widen_u16)(uint32_t x)
839a3ef070eSClaudio Fontana {
840a3ef070eSClaudio Fontana     uint64_t high = (uint16_t)(x >> 16);
841a3ef070eSClaudio Fontana     return ((uint16_t)x) | (high << 32);
842a3ef070eSClaudio Fontana }
843a3ef070eSClaudio Fontana 
HELPER(neon_widen_s16)844a3ef070eSClaudio Fontana uint64_t HELPER(neon_widen_s16)(uint32_t x)
845a3ef070eSClaudio Fontana {
846a3ef070eSClaudio Fontana     uint64_t high = (int16_t)(x >> 16);
847a3ef070eSClaudio Fontana     return ((uint32_t)(int16_t)x) | (high << 32);
848a3ef070eSClaudio Fontana }
849a3ef070eSClaudio Fontana 
HELPER(neon_addl_u16)850a3ef070eSClaudio Fontana uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
851a3ef070eSClaudio Fontana {
852a3ef070eSClaudio Fontana     uint64_t mask;
853a3ef070eSClaudio Fontana     mask = (a ^ b) & 0x8000800080008000ull;
854a3ef070eSClaudio Fontana     a &= ~0x8000800080008000ull;
855a3ef070eSClaudio Fontana     b &= ~0x8000800080008000ull;
856a3ef070eSClaudio Fontana     return (a + b) ^ mask;
857a3ef070eSClaudio Fontana }
858a3ef070eSClaudio Fontana 
HELPER(neon_addl_u32)859a3ef070eSClaudio Fontana uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
860a3ef070eSClaudio Fontana {
861a3ef070eSClaudio Fontana     uint64_t mask;
862a3ef070eSClaudio Fontana     mask = (a ^ b) & 0x8000000080000000ull;
863a3ef070eSClaudio Fontana     a &= ~0x8000000080000000ull;
864a3ef070eSClaudio Fontana     b &= ~0x8000000080000000ull;
865a3ef070eSClaudio Fontana     return (a + b) ^ mask;
866a3ef070eSClaudio Fontana }
867a3ef070eSClaudio Fontana 
HELPER(neon_paddl_u16)868a3ef070eSClaudio Fontana uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
869a3ef070eSClaudio Fontana {
870a3ef070eSClaudio Fontana     uint64_t tmp;
871a3ef070eSClaudio Fontana     uint64_t tmp2;
872a3ef070eSClaudio Fontana 
873a3ef070eSClaudio Fontana     tmp = a & 0x0000ffff0000ffffull;
874a3ef070eSClaudio Fontana     tmp += (a >> 16) & 0x0000ffff0000ffffull;
875a3ef070eSClaudio Fontana     tmp2 = b & 0xffff0000ffff0000ull;
876a3ef070eSClaudio Fontana     tmp2 += (b << 16) & 0xffff0000ffff0000ull;
877a3ef070eSClaudio Fontana     return    ( tmp         & 0xffff)
878a3ef070eSClaudio Fontana             | ((tmp  >> 16) & 0xffff0000ull)
879a3ef070eSClaudio Fontana             | ((tmp2 << 16) & 0xffff00000000ull)
880a3ef070eSClaudio Fontana             | ( tmp2        & 0xffff000000000000ull);
881a3ef070eSClaudio Fontana }
882a3ef070eSClaudio Fontana 
HELPER(neon_paddl_u32)883a3ef070eSClaudio Fontana uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
884a3ef070eSClaudio Fontana {
885a3ef070eSClaudio Fontana     uint32_t low = a + (a >> 32);
886a3ef070eSClaudio Fontana     uint32_t high = b + (b >> 32);
887a3ef070eSClaudio Fontana     return low + ((uint64_t)high << 32);
888a3ef070eSClaudio Fontana }
889a3ef070eSClaudio Fontana 
HELPER(neon_subl_u16)890a3ef070eSClaudio Fontana uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
891a3ef070eSClaudio Fontana {
892a3ef070eSClaudio Fontana     uint64_t mask;
893a3ef070eSClaudio Fontana     mask = (a ^ ~b) & 0x8000800080008000ull;
894a3ef070eSClaudio Fontana     a |= 0x8000800080008000ull;
895a3ef070eSClaudio Fontana     b &= ~0x8000800080008000ull;
896a3ef070eSClaudio Fontana     return (a - b) ^ mask;
897a3ef070eSClaudio Fontana }
898a3ef070eSClaudio Fontana 
HELPER(neon_subl_u32)899a3ef070eSClaudio Fontana uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
900a3ef070eSClaudio Fontana {
901a3ef070eSClaudio Fontana     uint64_t mask;
902a3ef070eSClaudio Fontana     mask = (a ^ ~b) & 0x8000000080000000ull;
903a3ef070eSClaudio Fontana     a |= 0x8000000080000000ull;
904a3ef070eSClaudio Fontana     b &= ~0x8000000080000000ull;
905a3ef070eSClaudio Fontana     return (a - b) ^ mask;
906a3ef070eSClaudio Fontana }
907a3ef070eSClaudio Fontana 
HELPER(neon_addl_saturate_s32)908a3ef070eSClaudio Fontana uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
909a3ef070eSClaudio Fontana {
910a3ef070eSClaudio Fontana     uint32_t x, y;
911a3ef070eSClaudio Fontana     uint32_t low, high;
912a3ef070eSClaudio Fontana 
913a3ef070eSClaudio Fontana     x = a;
914a3ef070eSClaudio Fontana     y = b;
915a3ef070eSClaudio Fontana     low = x + y;
916a3ef070eSClaudio Fontana     if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
917a3ef070eSClaudio Fontana         SET_QC();
918a3ef070eSClaudio Fontana         low = ((int32_t)x >> 31) ^ ~SIGNBIT;
919a3ef070eSClaudio Fontana     }
920a3ef070eSClaudio Fontana     x = a >> 32;
921a3ef070eSClaudio Fontana     y = b >> 32;
922a3ef070eSClaudio Fontana     high = x + y;
923a3ef070eSClaudio Fontana     if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
924a3ef070eSClaudio Fontana         SET_QC();
925a3ef070eSClaudio Fontana         high = ((int32_t)x >> 31) ^ ~SIGNBIT;
926a3ef070eSClaudio Fontana     }
927a3ef070eSClaudio Fontana     return low | ((uint64_t)high << 32);
928a3ef070eSClaudio Fontana }
929a3ef070eSClaudio Fontana 
HELPER(neon_addl_saturate_s64)930a3ef070eSClaudio Fontana uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
931a3ef070eSClaudio Fontana {
932a3ef070eSClaudio Fontana     uint64_t result;
933a3ef070eSClaudio Fontana 
934a3ef070eSClaudio Fontana     result = a + b;
935a3ef070eSClaudio Fontana     if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
936a3ef070eSClaudio Fontana         SET_QC();
937a3ef070eSClaudio Fontana         result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
938a3ef070eSClaudio Fontana     }
939a3ef070eSClaudio Fontana     return result;
940a3ef070eSClaudio Fontana }
941a3ef070eSClaudio Fontana 
/*
 * Absolute difference of x and y, each first truncated to 'intype'.
 * The arithmetic is done in the larger 'arithtype' because, for
 * example, the absolute difference of two signed 32 bit values can
 * overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype abd_x = (intype)(x);                            \
    arithtype abd_y = (intype)(y);                            \
    if (abd_x > abd_y) {                                      \
        dest = abd_x - abd_y;                                 \
    } else {                                                  \
        dest = abd_y - abd_x;                                 \
    }                                                         \
    } while (0)
951a3ef070eSClaudio Fontana 
HELPER(neon_abdl_u16)952a3ef070eSClaudio Fontana uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
953a3ef070eSClaudio Fontana {
954a3ef070eSClaudio Fontana     uint64_t tmp;
955a3ef070eSClaudio Fontana     uint64_t result;
956a3ef070eSClaudio Fontana     DO_ABD(result, a, b, uint8_t, uint32_t);
957a3ef070eSClaudio Fontana     DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
958a3ef070eSClaudio Fontana     result |= tmp << 16;
959a3ef070eSClaudio Fontana     DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
960a3ef070eSClaudio Fontana     result |= tmp << 32;
961a3ef070eSClaudio Fontana     DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
962a3ef070eSClaudio Fontana     result |= tmp << 48;
963a3ef070eSClaudio Fontana     return result;
964a3ef070eSClaudio Fontana }
965a3ef070eSClaudio Fontana 
HELPER(neon_abdl_s16)966a3ef070eSClaudio Fontana uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
967a3ef070eSClaudio Fontana {
968a3ef070eSClaudio Fontana     uint64_t tmp;
969a3ef070eSClaudio Fontana     uint64_t result;
970a3ef070eSClaudio Fontana     DO_ABD(result, a, b, int8_t, int32_t);
971a3ef070eSClaudio Fontana     DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
972a3ef070eSClaudio Fontana     result |= tmp << 16;
973a3ef070eSClaudio Fontana     DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
974a3ef070eSClaudio Fontana     result |= tmp << 32;
975a3ef070eSClaudio Fontana     DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
976a3ef070eSClaudio Fontana     result |= tmp << 48;
977a3ef070eSClaudio Fontana     return result;
978a3ef070eSClaudio Fontana }
979a3ef070eSClaudio Fontana 
HELPER(neon_abdl_u32)980a3ef070eSClaudio Fontana uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
981a3ef070eSClaudio Fontana {
982a3ef070eSClaudio Fontana     uint64_t tmp;
983a3ef070eSClaudio Fontana     uint64_t result;
984a3ef070eSClaudio Fontana     DO_ABD(result, a, b, uint16_t, uint32_t);
985a3ef070eSClaudio Fontana     DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
986a3ef070eSClaudio Fontana     return result | (tmp << 32);
987a3ef070eSClaudio Fontana }
988a3ef070eSClaudio Fontana 
HELPER(neon_abdl_s32)989a3ef070eSClaudio Fontana uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
990a3ef070eSClaudio Fontana {
991a3ef070eSClaudio Fontana     uint64_t tmp;
992a3ef070eSClaudio Fontana     uint64_t result;
993a3ef070eSClaudio Fontana     DO_ABD(result, a, b, int16_t, int32_t);
994a3ef070eSClaudio Fontana     DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
995a3ef070eSClaudio Fontana     return result | (tmp << 32);
996a3ef070eSClaudio Fontana }
997a3ef070eSClaudio Fontana 
HELPER(neon_abdl_u64)998a3ef070eSClaudio Fontana uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
999a3ef070eSClaudio Fontana {
1000a3ef070eSClaudio Fontana     uint64_t result;
1001a3ef070eSClaudio Fontana     DO_ABD(result, a, b, uint32_t, uint64_t);
1002a3ef070eSClaudio Fontana     return result;
1003a3ef070eSClaudio Fontana }
1004a3ef070eSClaudio Fontana 
HELPER(neon_abdl_s64)1005a3ef070eSClaudio Fontana uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
1006a3ef070eSClaudio Fontana {
1007a3ef070eSClaudio Fontana     uint64_t result;
1008a3ef070eSClaudio Fontana     DO_ABD(result, a, b, int32_t, int64_t);
1009a3ef070eSClaudio Fontana     return result;
1010a3ef070eSClaudio Fontana }
1011a3ef070eSClaudio Fontana #undef DO_ABD
1012a3ef070eSClaudio Fontana 
/*
 * Widening multiply. Named type is the source type.
 *
 * x and y are truncated to type1 (the narrow element type), converted
 * to type2 (the unsigned result type), multiplied, and the product is
 * truncated to type2 before being stored in dest.
 *
 * The multiplication itself is carried out in uint64_t.  When type2 is
 * narrower than int (uint16_t), multiplying two type2 values promotes
 * them to signed int, and a sign-extended input such as 0xffff * 0xffff
 * then overflows int.  Unsigned 64-bit arithmetic wraps instead, and
 * the low bits kept by the final cast are the same.
 */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((uint64_t)(type2)tmp_x * (uint64_t)(type2)tmp_y); \
    } while(0)
1019a3ef070eSClaudio Fontana 
HELPER(neon_mull_u8)1020a3ef070eSClaudio Fontana uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
1021a3ef070eSClaudio Fontana {
1022a3ef070eSClaudio Fontana     uint64_t tmp;
1023a3ef070eSClaudio Fontana     uint64_t result;
1024a3ef070eSClaudio Fontana 
1025a3ef070eSClaudio Fontana     DO_MULL(result, a, b, uint8_t, uint16_t);
1026a3ef070eSClaudio Fontana     DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
1027a3ef070eSClaudio Fontana     result |= tmp << 16;
1028a3ef070eSClaudio Fontana     DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
1029a3ef070eSClaudio Fontana     result |= tmp << 32;
1030a3ef070eSClaudio Fontana     DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
1031a3ef070eSClaudio Fontana     result |= tmp << 48;
1032a3ef070eSClaudio Fontana     return result;
1033a3ef070eSClaudio Fontana }
1034a3ef070eSClaudio Fontana 
HELPER(neon_mull_s8)1035a3ef070eSClaudio Fontana uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
1036a3ef070eSClaudio Fontana {
1037a3ef070eSClaudio Fontana     uint64_t tmp;
1038a3ef070eSClaudio Fontana     uint64_t result;
1039a3ef070eSClaudio Fontana 
1040a3ef070eSClaudio Fontana     DO_MULL(result, a, b, int8_t, uint16_t);
1041a3ef070eSClaudio Fontana     DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
1042a3ef070eSClaudio Fontana     result |= tmp << 16;
1043a3ef070eSClaudio Fontana     DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
1044a3ef070eSClaudio Fontana     result |= tmp << 32;
1045a3ef070eSClaudio Fontana     DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
1046a3ef070eSClaudio Fontana     result |= tmp << 48;
1047a3ef070eSClaudio Fontana     return result;
1048a3ef070eSClaudio Fontana }
1049a3ef070eSClaudio Fontana 
HELPER(neon_mull_u16)1050a3ef070eSClaudio Fontana uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
1051a3ef070eSClaudio Fontana {
1052a3ef070eSClaudio Fontana     uint64_t tmp;
1053a3ef070eSClaudio Fontana     uint64_t result;
1054a3ef070eSClaudio Fontana 
1055a3ef070eSClaudio Fontana     DO_MULL(result, a, b, uint16_t, uint32_t);
1056a3ef070eSClaudio Fontana     DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
1057a3ef070eSClaudio Fontana     return result | (tmp << 32);
1058a3ef070eSClaudio Fontana }
1059a3ef070eSClaudio Fontana 
HELPER(neon_mull_s16)1060a3ef070eSClaudio Fontana uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
1061a3ef070eSClaudio Fontana {
1062a3ef070eSClaudio Fontana     uint64_t tmp;
1063a3ef070eSClaudio Fontana     uint64_t result;
1064a3ef070eSClaudio Fontana 
1065a3ef070eSClaudio Fontana     DO_MULL(result, a, b, int16_t, uint32_t);
1066a3ef070eSClaudio Fontana     DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
1067a3ef070eSClaudio Fontana     return result | (tmp << 32);
1068a3ef070eSClaudio Fontana }
1069a3ef070eSClaudio Fontana 
HELPER(neon_negl_u16)1070a3ef070eSClaudio Fontana uint64_t HELPER(neon_negl_u16)(uint64_t x)
1071a3ef070eSClaudio Fontana {
1072a3ef070eSClaudio Fontana     uint16_t tmp;
1073a3ef070eSClaudio Fontana     uint64_t result;
1074a3ef070eSClaudio Fontana     result = (uint16_t)-x;
1075a3ef070eSClaudio Fontana     tmp = -(x >> 16);
1076a3ef070eSClaudio Fontana     result |= (uint64_t)tmp << 16;
1077a3ef070eSClaudio Fontana     tmp = -(x >> 32);
1078a3ef070eSClaudio Fontana     result |= (uint64_t)tmp << 32;
1079a3ef070eSClaudio Fontana     tmp = -(x >> 48);
1080a3ef070eSClaudio Fontana     result |= (uint64_t)tmp << 48;
1081a3ef070eSClaudio Fontana     return result;
1082a3ef070eSClaudio Fontana }
1083a3ef070eSClaudio Fontana 
HELPER(neon_negl_u32)1084a3ef070eSClaudio Fontana uint64_t HELPER(neon_negl_u32)(uint64_t x)
1085a3ef070eSClaudio Fontana {
1086a3ef070eSClaudio Fontana     uint32_t low = -x;
1087a3ef070eSClaudio Fontana     uint32_t high = -(x >> 32);
1088a3ef070eSClaudio Fontana     return low | ((uint64_t)high << 32);
1089a3ef070eSClaudio Fontana }
1090a3ef070eSClaudio Fontana 
1091a3ef070eSClaudio Fontana /* Saturating sign manipulation.  */
1092a3ef070eSClaudio Fontana /* ??? Make these use NEON_VOP1 */
1093a3ef070eSClaudio Fontana #define DO_QABS8(x) do { \
1094a3ef070eSClaudio Fontana     if (x == (int8_t)0x80) { \
1095a3ef070eSClaudio Fontana         x = 0x7f; \
1096a3ef070eSClaudio Fontana         SET_QC(); \
1097a3ef070eSClaudio Fontana     } else if (x < 0) { \
1098a3ef070eSClaudio Fontana         x = -x; \
1099a3ef070eSClaudio Fontana     }} while (0)
HELPER(neon_qabs_s8)1100a3ef070eSClaudio Fontana uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
1101a3ef070eSClaudio Fontana {
1102a3ef070eSClaudio Fontana     neon_s8 vec;
1103a3ef070eSClaudio Fontana     NEON_UNPACK(neon_s8, vec, x);
1104a3ef070eSClaudio Fontana     DO_QABS8(vec.v1);
1105a3ef070eSClaudio Fontana     DO_QABS8(vec.v2);
1106a3ef070eSClaudio Fontana     DO_QABS8(vec.v3);
1107a3ef070eSClaudio Fontana     DO_QABS8(vec.v4);
1108a3ef070eSClaudio Fontana     NEON_PACK(neon_s8, x, vec);
1109a3ef070eSClaudio Fontana     return x;
1110a3ef070eSClaudio Fontana }
1111a3ef070eSClaudio Fontana #undef DO_QABS8
1112a3ef070eSClaudio Fontana 
1113a3ef070eSClaudio Fontana #define DO_QNEG8(x) do { \
1114a3ef070eSClaudio Fontana     if (x == (int8_t)0x80) { \
1115a3ef070eSClaudio Fontana         x = 0x7f; \
1116a3ef070eSClaudio Fontana         SET_QC(); \
1117a3ef070eSClaudio Fontana     } else { \
1118a3ef070eSClaudio Fontana         x = -x; \
1119a3ef070eSClaudio Fontana     }} while (0)
HELPER(neon_qneg_s8)1120a3ef070eSClaudio Fontana uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
1121a3ef070eSClaudio Fontana {
1122a3ef070eSClaudio Fontana     neon_s8 vec;
1123a3ef070eSClaudio Fontana     NEON_UNPACK(neon_s8, vec, x);
1124a3ef070eSClaudio Fontana     DO_QNEG8(vec.v1);
1125a3ef070eSClaudio Fontana     DO_QNEG8(vec.v2);
1126a3ef070eSClaudio Fontana     DO_QNEG8(vec.v3);
1127a3ef070eSClaudio Fontana     DO_QNEG8(vec.v4);
1128a3ef070eSClaudio Fontana     NEON_PACK(neon_s8, x, vec);
1129a3ef070eSClaudio Fontana     return x;
1130a3ef070eSClaudio Fontana }
1131a3ef070eSClaudio Fontana #undef DO_QNEG8
1132a3ef070eSClaudio Fontana 
1133a3ef070eSClaudio Fontana #define DO_QABS16(x) do { \
1134a3ef070eSClaudio Fontana     if (x == (int16_t)0x8000) { \
1135a3ef070eSClaudio Fontana         x = 0x7fff; \
1136a3ef070eSClaudio Fontana         SET_QC(); \
1137a3ef070eSClaudio Fontana     } else if (x < 0) { \
1138a3ef070eSClaudio Fontana         x = -x; \
1139a3ef070eSClaudio Fontana     }} while (0)
HELPER(neon_qabs_s16)1140a3ef070eSClaudio Fontana uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
1141a3ef070eSClaudio Fontana {
1142a3ef070eSClaudio Fontana     neon_s16 vec;
1143a3ef070eSClaudio Fontana     NEON_UNPACK(neon_s16, vec, x);
1144a3ef070eSClaudio Fontana     DO_QABS16(vec.v1);
1145a3ef070eSClaudio Fontana     DO_QABS16(vec.v2);
1146a3ef070eSClaudio Fontana     NEON_PACK(neon_s16, x, vec);
1147a3ef070eSClaudio Fontana     return x;
1148a3ef070eSClaudio Fontana }
1149a3ef070eSClaudio Fontana #undef DO_QABS16
1150a3ef070eSClaudio Fontana 
1151a3ef070eSClaudio Fontana #define DO_QNEG16(x) do { \
1152a3ef070eSClaudio Fontana     if (x == (int16_t)0x8000) { \
1153a3ef070eSClaudio Fontana         x = 0x7fff; \
1154a3ef070eSClaudio Fontana         SET_QC(); \
1155a3ef070eSClaudio Fontana     } else { \
1156a3ef070eSClaudio Fontana         x = -x; \
1157a3ef070eSClaudio Fontana     }} while (0)
HELPER(neon_qneg_s16)1158a3ef070eSClaudio Fontana uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
1159a3ef070eSClaudio Fontana {
1160a3ef070eSClaudio Fontana     neon_s16 vec;
1161a3ef070eSClaudio Fontana     NEON_UNPACK(neon_s16, vec, x);
1162a3ef070eSClaudio Fontana     DO_QNEG16(vec.v1);
1163a3ef070eSClaudio Fontana     DO_QNEG16(vec.v2);
1164a3ef070eSClaudio Fontana     NEON_PACK(neon_s16, x, vec);
1165a3ef070eSClaudio Fontana     return x;
1166a3ef070eSClaudio Fontana }
1167a3ef070eSClaudio Fontana #undef DO_QNEG16
1168a3ef070eSClaudio Fontana 
HELPER(neon_qabs_s32)1169a3ef070eSClaudio Fontana uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
1170a3ef070eSClaudio Fontana {
1171a3ef070eSClaudio Fontana     if (x == SIGNBIT) {
1172a3ef070eSClaudio Fontana         SET_QC();
1173a3ef070eSClaudio Fontana         x = ~SIGNBIT;
1174a3ef070eSClaudio Fontana     } else if ((int32_t)x < 0) {
1175a3ef070eSClaudio Fontana         x = -x;
1176a3ef070eSClaudio Fontana     }
1177a3ef070eSClaudio Fontana     return x;
1178a3ef070eSClaudio Fontana }
1179a3ef070eSClaudio Fontana 
HELPER(neon_qneg_s32)1180a3ef070eSClaudio Fontana uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
1181a3ef070eSClaudio Fontana {
1182a3ef070eSClaudio Fontana     if (x == SIGNBIT) {
1183a3ef070eSClaudio Fontana         SET_QC();
1184a3ef070eSClaudio Fontana         x = ~SIGNBIT;
1185a3ef070eSClaudio Fontana     } else {
1186a3ef070eSClaudio Fontana         x = -x;
1187a3ef070eSClaudio Fontana     }
1188a3ef070eSClaudio Fontana     return x;
1189a3ef070eSClaudio Fontana }
1190a3ef070eSClaudio Fontana 
HELPER(neon_qabs_s64)1191a3ef070eSClaudio Fontana uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
1192a3ef070eSClaudio Fontana {
1193a3ef070eSClaudio Fontana     if (x == SIGNBIT64) {
1194a3ef070eSClaudio Fontana         SET_QC();
1195a3ef070eSClaudio Fontana         x = ~SIGNBIT64;
1196a3ef070eSClaudio Fontana     } else if ((int64_t)x < 0) {
1197a3ef070eSClaudio Fontana         x = -x;
1198a3ef070eSClaudio Fontana     }
1199a3ef070eSClaudio Fontana     return x;
1200a3ef070eSClaudio Fontana }
1201a3ef070eSClaudio Fontana 
HELPER(neon_qneg_s64)1202a3ef070eSClaudio Fontana uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
1203a3ef070eSClaudio Fontana {
1204a3ef070eSClaudio Fontana     if (x == SIGNBIT64) {
1205a3ef070eSClaudio Fontana         SET_QC();
1206a3ef070eSClaudio Fontana         x = ~SIGNBIT64;
1207a3ef070eSClaudio Fontana     } else {
1208a3ef070eSClaudio Fontana         x = -x;
1209a3ef070eSClaudio Fontana     }
1210a3ef070eSClaudio Fontana     return x;
1211a3ef070eSClaudio Fontana }
1212a3ef070eSClaudio Fontana 
/* NEON Float helpers.  */

/* Floating point comparisons produce an integer result.
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
HELPER(neon_ceq_f32)1219a3ef070eSClaudio Fontana uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
1220a3ef070eSClaudio Fontana {
1221a3ef070eSClaudio Fontana     float_status *fpst = fpstp;
1222a3ef070eSClaudio Fontana     return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
1223a3ef070eSClaudio Fontana }
1224a3ef070eSClaudio Fontana 
HELPER(neon_cge_f32)1225a3ef070eSClaudio Fontana uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
1226a3ef070eSClaudio Fontana {
1227a3ef070eSClaudio Fontana     float_status *fpst = fpstp;
1228a3ef070eSClaudio Fontana     return -float32_le(make_float32(b), make_float32(a), fpst);
1229a3ef070eSClaudio Fontana }
1230a3ef070eSClaudio Fontana 
HELPER(neon_cgt_f32)1231a3ef070eSClaudio Fontana uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1232a3ef070eSClaudio Fontana {
1233a3ef070eSClaudio Fontana     float_status *fpst = fpstp;
1234a3ef070eSClaudio Fontana     return -float32_lt(make_float32(b), make_float32(a), fpst);
1235a3ef070eSClaudio Fontana }
1236a3ef070eSClaudio Fontana 
HELPER(neon_acge_f32)1237a3ef070eSClaudio Fontana uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
1238a3ef070eSClaudio Fontana {
1239a3ef070eSClaudio Fontana     float_status *fpst = fpstp;
1240a3ef070eSClaudio Fontana     float32 f0 = float32_abs(make_float32(a));
1241a3ef070eSClaudio Fontana     float32 f1 = float32_abs(make_float32(b));
1242a3ef070eSClaudio Fontana     return -float32_le(f1, f0, fpst);
1243a3ef070eSClaudio Fontana }
1244a3ef070eSClaudio Fontana 
HELPER(neon_acgt_f32)1245a3ef070eSClaudio Fontana uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1246a3ef070eSClaudio Fontana {
1247a3ef070eSClaudio Fontana     float_status *fpst = fpstp;
1248a3ef070eSClaudio Fontana     float32 f0 = float32_abs(make_float32(a));
1249a3ef070eSClaudio Fontana     float32 f1 = float32_abs(make_float32(b));
1250a3ef070eSClaudio Fontana     return -float32_lt(f1, f0, fpst);
1251a3ef070eSClaudio Fontana }
1252a3ef070eSClaudio Fontana 
HELPER(neon_acge_f64)1253a3ef070eSClaudio Fontana uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
1254a3ef070eSClaudio Fontana {
1255a3ef070eSClaudio Fontana     float_status *fpst = fpstp;
1256a3ef070eSClaudio Fontana     float64 f0 = float64_abs(make_float64(a));
1257a3ef070eSClaudio Fontana     float64 f1 = float64_abs(make_float64(b));
1258a3ef070eSClaudio Fontana     return -float64_le(f1, f0, fpst);
1259a3ef070eSClaudio Fontana }
1260a3ef070eSClaudio Fontana 
HELPER(neon_acgt_f64)1261a3ef070eSClaudio Fontana uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
1262a3ef070eSClaudio Fontana {
1263a3ef070eSClaudio Fontana     float_status *fpst = fpstp;
1264a3ef070eSClaudio Fontana     float64 f0 = float64_abs(make_float64(a));
1265a3ef070eSClaudio Fontana     float64 f1 = float64_abs(make_float64(b));
1266a3ef070eSClaudio Fontana     return -float64_lt(f1, f0, fpst);
1267a3ef070eSClaudio Fontana }
1268a3ef070eSClaudio Fontana 
/*
 * Extract element N of V, where elements are SIZE bits wide and element 0
 * occupies the least significant bits.  SIZE must be less than 64.
 */
#define ELEM(V, N, SIZE) \
    (((1ull << (SIZE)) - 1) & ((V) >> ((SIZE) * (N))))
1270a3ef070eSClaudio Fontana 
HELPER(neon_qunzip8)1271a3ef070eSClaudio Fontana void HELPER(neon_qunzip8)(void *vd, void *vm)
1272a3ef070eSClaudio Fontana {
1273a3ef070eSClaudio Fontana     uint64_t *rd = vd, *rm = vm;
1274a3ef070eSClaudio Fontana     uint64_t zd0 = rd[0], zd1 = rd[1];
1275a3ef070eSClaudio Fontana     uint64_t zm0 = rm[0], zm1 = rm[1];
1276a3ef070eSClaudio Fontana 
1277a3ef070eSClaudio Fontana     uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
1278a3ef070eSClaudio Fontana         | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
1279a3ef070eSClaudio Fontana         | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
1280a3ef070eSClaudio Fontana         | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
1281a3ef070eSClaudio Fontana     uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
1282a3ef070eSClaudio Fontana         | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
1283a3ef070eSClaudio Fontana         | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
1284a3ef070eSClaudio Fontana         | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
1285a3ef070eSClaudio Fontana     uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
1286a3ef070eSClaudio Fontana         | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
1287a3ef070eSClaudio Fontana         | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
1288a3ef070eSClaudio Fontana         | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
1289a3ef070eSClaudio Fontana     uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
1290a3ef070eSClaudio Fontana         | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
1291a3ef070eSClaudio Fontana         | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
1292a3ef070eSClaudio Fontana         | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
1293a3ef070eSClaudio Fontana 
1294a3ef070eSClaudio Fontana     rm[0] = m0;
1295a3ef070eSClaudio Fontana     rm[1] = m1;
1296a3ef070eSClaudio Fontana     rd[0] = d0;
1297a3ef070eSClaudio Fontana     rd[1] = d1;
1298a3ef070eSClaudio Fontana }
1299a3ef070eSClaudio Fontana 
HELPER(neon_qunzip16)1300a3ef070eSClaudio Fontana void HELPER(neon_qunzip16)(void *vd, void *vm)
1301a3ef070eSClaudio Fontana {
1302a3ef070eSClaudio Fontana     uint64_t *rd = vd, *rm = vm;
1303a3ef070eSClaudio Fontana     uint64_t zd0 = rd[0], zd1 = rd[1];
1304a3ef070eSClaudio Fontana     uint64_t zm0 = rm[0], zm1 = rm[1];
1305a3ef070eSClaudio Fontana 
1306a3ef070eSClaudio Fontana     uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
1307a3ef070eSClaudio Fontana         | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
1308a3ef070eSClaudio Fontana     uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
1309a3ef070eSClaudio Fontana         | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
1310a3ef070eSClaudio Fontana     uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
1311a3ef070eSClaudio Fontana         | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
1312a3ef070eSClaudio Fontana     uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
1313a3ef070eSClaudio Fontana         | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
1314a3ef070eSClaudio Fontana 
1315a3ef070eSClaudio Fontana     rm[0] = m0;
1316a3ef070eSClaudio Fontana     rm[1] = m1;
1317a3ef070eSClaudio Fontana     rd[0] = d0;
1318a3ef070eSClaudio Fontana     rd[1] = d1;
1319a3ef070eSClaudio Fontana }
1320a3ef070eSClaudio Fontana 
HELPER(neon_qunzip32)1321a3ef070eSClaudio Fontana void HELPER(neon_qunzip32)(void *vd, void *vm)
1322a3ef070eSClaudio Fontana {
1323a3ef070eSClaudio Fontana     uint64_t *rd = vd, *rm = vm;
1324a3ef070eSClaudio Fontana     uint64_t zd0 = rd[0], zd1 = rd[1];
1325a3ef070eSClaudio Fontana     uint64_t zm0 = rm[0], zm1 = rm[1];
1326a3ef070eSClaudio Fontana 
1327a3ef070eSClaudio Fontana     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
1328a3ef070eSClaudio Fontana     uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1329a3ef070eSClaudio Fontana     uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
1330a3ef070eSClaudio Fontana     uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1331a3ef070eSClaudio Fontana 
1332a3ef070eSClaudio Fontana     rm[0] = m0;
1333a3ef070eSClaudio Fontana     rm[1] = m1;
1334a3ef070eSClaudio Fontana     rd[0] = d0;
1335a3ef070eSClaudio Fontana     rd[1] = d1;
1336a3ef070eSClaudio Fontana }
1337a3ef070eSClaudio Fontana 
HELPER(neon_unzip8)1338a3ef070eSClaudio Fontana void HELPER(neon_unzip8)(void *vd, void *vm)
1339a3ef070eSClaudio Fontana {
1340a3ef070eSClaudio Fontana     uint64_t *rd = vd, *rm = vm;
1341a3ef070eSClaudio Fontana     uint64_t zd = rd[0], zm = rm[0];
1342a3ef070eSClaudio Fontana 
1343a3ef070eSClaudio Fontana     uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
1344a3ef070eSClaudio Fontana         | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
1345a3ef070eSClaudio Fontana         | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
1346a3ef070eSClaudio Fontana         | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
1347a3ef070eSClaudio Fontana     uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
1348a3ef070eSClaudio Fontana         | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
1349a3ef070eSClaudio Fontana         | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
1350a3ef070eSClaudio Fontana         | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
1351a3ef070eSClaudio Fontana 
1352a3ef070eSClaudio Fontana     rm[0] = m0;
1353a3ef070eSClaudio Fontana     rd[0] = d0;
1354a3ef070eSClaudio Fontana }
1355a3ef070eSClaudio Fontana 
HELPER(neon_unzip16)1356a3ef070eSClaudio Fontana void HELPER(neon_unzip16)(void *vd, void *vm)
1357a3ef070eSClaudio Fontana {
1358a3ef070eSClaudio Fontana     uint64_t *rd = vd, *rm = vm;
1359a3ef070eSClaudio Fontana     uint64_t zd = rd[0], zm = rm[0];
1360a3ef070eSClaudio Fontana 
1361a3ef070eSClaudio Fontana     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
1362a3ef070eSClaudio Fontana         | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
1363a3ef070eSClaudio Fontana     uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
1364a3ef070eSClaudio Fontana         | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
1365a3ef070eSClaudio Fontana 
1366a3ef070eSClaudio Fontana     rm[0] = m0;
1367a3ef070eSClaudio Fontana     rd[0] = d0;
1368a3ef070eSClaudio Fontana }
1369a3ef070eSClaudio Fontana 
HELPER(neon_qzip8)1370a3ef070eSClaudio Fontana void HELPER(neon_qzip8)(void *vd, void *vm)
1371a3ef070eSClaudio Fontana {
1372a3ef070eSClaudio Fontana     uint64_t *rd = vd, *rm = vm;
1373a3ef070eSClaudio Fontana     uint64_t zd0 = rd[0], zd1 = rd[1];
1374a3ef070eSClaudio Fontana     uint64_t zm0 = rm[0], zm1 = rm[1];
1375a3ef070eSClaudio Fontana 
1376a3ef070eSClaudio Fontana     uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
1377a3ef070eSClaudio Fontana         | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
1378a3ef070eSClaudio Fontana         | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
1379a3ef070eSClaudio Fontana         | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
1380a3ef070eSClaudio Fontana     uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
1381a3ef070eSClaudio Fontana         | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
1382a3ef070eSClaudio Fontana         | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
1383a3ef070eSClaudio Fontana         | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
1384a3ef070eSClaudio Fontana     uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
1385a3ef070eSClaudio Fontana         | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
1386a3ef070eSClaudio Fontana         | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
1387a3ef070eSClaudio Fontana         | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
1388a3ef070eSClaudio Fontana     uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
1389a3ef070eSClaudio Fontana         | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
1390a3ef070eSClaudio Fontana         | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
1391a3ef070eSClaudio Fontana         | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
1392a3ef070eSClaudio Fontana 
1393a3ef070eSClaudio Fontana     rm[0] = m0;
1394a3ef070eSClaudio Fontana     rm[1] = m1;
1395a3ef070eSClaudio Fontana     rd[0] = d0;
1396a3ef070eSClaudio Fontana     rd[1] = d1;
1397a3ef070eSClaudio Fontana }
1398a3ef070eSClaudio Fontana 
HELPER(neon_qzip16)1399a3ef070eSClaudio Fontana void HELPER(neon_qzip16)(void *vd, void *vm)
1400a3ef070eSClaudio Fontana {
1401a3ef070eSClaudio Fontana     uint64_t *rd = vd, *rm = vm;
1402a3ef070eSClaudio Fontana     uint64_t zd0 = rd[0], zd1 = rd[1];
1403a3ef070eSClaudio Fontana     uint64_t zm0 = rm[0], zm1 = rm[1];
1404a3ef070eSClaudio Fontana 
1405a3ef070eSClaudio Fontana     uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
1406a3ef070eSClaudio Fontana         | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
1407a3ef070eSClaudio Fontana     uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
1408a3ef070eSClaudio Fontana         | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
1409a3ef070eSClaudio Fontana     uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
1410a3ef070eSClaudio Fontana         | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
1411a3ef070eSClaudio Fontana     uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
1412a3ef070eSClaudio Fontana         | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
1413a3ef070eSClaudio Fontana 
1414a3ef070eSClaudio Fontana     rm[0] = m0;
1415a3ef070eSClaudio Fontana     rm[1] = m1;
1416a3ef070eSClaudio Fontana     rd[0] = d0;
1417a3ef070eSClaudio Fontana     rd[1] = d1;
1418a3ef070eSClaudio Fontana }
1419a3ef070eSClaudio Fontana 
HELPER(neon_qzip32)1420a3ef070eSClaudio Fontana void HELPER(neon_qzip32)(void *vd, void *vm)
1421a3ef070eSClaudio Fontana {
1422a3ef070eSClaudio Fontana     uint64_t *rd = vd, *rm = vm;
1423a3ef070eSClaudio Fontana     uint64_t zd0 = rd[0], zd1 = rd[1];
1424a3ef070eSClaudio Fontana     uint64_t zm0 = rm[0], zm1 = rm[1];
1425a3ef070eSClaudio Fontana 
1426a3ef070eSClaudio Fontana     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
1427a3ef070eSClaudio Fontana     uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
1428a3ef070eSClaudio Fontana     uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1429a3ef070eSClaudio Fontana     uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1430a3ef070eSClaudio Fontana 
1431a3ef070eSClaudio Fontana     rm[0] = m0;
1432a3ef070eSClaudio Fontana     rm[1] = m1;
1433a3ef070eSClaudio Fontana     rd[0] = d0;
1434a3ef070eSClaudio Fontana     rd[1] = d1;
1435a3ef070eSClaudio Fontana }
1436a3ef070eSClaudio Fontana 
HELPER(neon_zip8)1437a3ef070eSClaudio Fontana void HELPER(neon_zip8)(void *vd, void *vm)
1438a3ef070eSClaudio Fontana {
1439a3ef070eSClaudio Fontana     uint64_t *rd = vd, *rm = vm;
1440a3ef070eSClaudio Fontana     uint64_t zd = rd[0], zm = rm[0];
1441a3ef070eSClaudio Fontana 
1442a3ef070eSClaudio Fontana     uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
1443a3ef070eSClaudio Fontana         | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
1444a3ef070eSClaudio Fontana         | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
1445a3ef070eSClaudio Fontana         | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
1446a3ef070eSClaudio Fontana     uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
1447a3ef070eSClaudio Fontana         | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
1448a3ef070eSClaudio Fontana         | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
1449a3ef070eSClaudio Fontana         | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
1450a3ef070eSClaudio Fontana 
1451a3ef070eSClaudio Fontana     rm[0] = m0;
1452a3ef070eSClaudio Fontana     rd[0] = d0;
1453a3ef070eSClaudio Fontana }
1454a3ef070eSClaudio Fontana 
HELPER(neon_zip16)1455a3ef070eSClaudio Fontana void HELPER(neon_zip16)(void *vd, void *vm)
1456a3ef070eSClaudio Fontana {
1457a3ef070eSClaudio Fontana     uint64_t *rd = vd, *rm = vm;
1458a3ef070eSClaudio Fontana     uint64_t zd = rd[0], zm = rm[0];
1459a3ef070eSClaudio Fontana 
1460a3ef070eSClaudio Fontana     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
1461a3ef070eSClaudio Fontana         | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
1462a3ef070eSClaudio Fontana     uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
1463a3ef070eSClaudio Fontana         | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
1464a3ef070eSClaudio Fontana 
1465a3ef070eSClaudio Fontana     rm[0] = m0;
1466a3ef070eSClaudio Fontana     rd[0] = d0;
1467a3ef070eSClaudio Fontana }
1468