xref: /openbmc/qemu/target/arm/tcg/neon_helper.c (revision 203aca91252c4d74742f89b761bea801b89ca803)
1 /*
2  * ARM NEON vector operations.
3  *
4  * Copyright (c) 2007, 2008 CodeSourcery.
5  * Written by Paul Brook
6  *
7  * This code is licensed under the GNU GPL v2.
8  */
9 
10 #include "qemu/osdep.h"
11 #include "cpu.h"
12 #include "exec/helper-proto.h"
13 #include "tcg/tcg-gvec-desc.h"
14 #include "fpu/softfloat.h"
15 #include "vec_internal.h"
16 
/* Sign bit of a 32-bit / 64-bit value; used by the saturation checks below. */
#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

/* Record saturation by setting the sticky QC flag (env->vfp.qc[0]). */
#define SET_QC() env->vfp.qc[0] = 1
21 
/* Lane-container types: a 32-bit register slice viewed as 1, 2 or 4 lanes.
 * v1 is always the least significant lane, so the field order depends on
 * host endianness — NEON_UNPACK/NEON_PACK pun these structs against a
 * uint32_t through a union, relying on the in-memory layout.
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif
58 
/* Instantiate the lane-struct types used by the helpers below. */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
68 
/* Copy from a uint32_t to a vector structure type.  The union is the
 * supported C idiom for type punning; no pointer casts are involved.
 */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* Apply NEON_FN to each lane pair; one variant per lane count. */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
99 
/* Expand to the body of a 32-bit helper: unpack both operands into lane
 * structs, apply NEON_FN to each lane pair, and repack the result.
 */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Define helper neon_<name> operating on n lanes of type vtype. */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP, but the helper also receives the CPU state, for NEON_FNs
 * that report saturation through env->vfp.qc.
 */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* Define a gvec-style helper: NEON_FN applied elementwise over the whole
 * vector described by desc, then the tail is cleared.
 */
#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    vtype *d = vd, *n = vn, *m = vm;                            \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], m[i]);                              \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

/* As NEON_GVEC_VOP2, plus the CPU state pointer for SET_QC users. */
#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, void *venv, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    vtype *d = vd, *n = vn, *m = vm;                            \
    CPUARMState *env = venv;                                    \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], m[i]);                              \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}
143 
/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
/* Each destination lane combines two adjacent lanes of ONE source:
 * lanes of vsrc1 fill the low half of the result, lanes of vsrc2 the
 * high half.
 */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

/* Define a pairwise helper neon_<name> over n lanes of type vtype. */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators.  */
/* Unary NEON_FNs ignore their third argument, so the vsrc2 references
 * in NEON_DO##n are expanded away and no second operand is needed.
 */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}
181 
/* Rounding halving add: (a + b + 1) >> 1.  The lanes are promoted to int
 * by the usual arithmetic conversions, so the intermediate sum cannot
 * overflow the 8/16-bit element type.
 */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN
188 
189 int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
190 {
191     int32_t dest;
192 
193     dest = (src1 >> 1) + (src2 >> 1);
194     if ((src1 | src2) & 1)
195         dest++;
196     return dest;
197 }
198 
199 uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
200 {
201     uint32_t dest;
202 
203     dest = (src1 >> 1) + (src2 >> 1);
204     if ((src1 | src2) & 1)
205         dest++;
206     return dest;
207 }
208 
/* Halving subtract: (a - b) >> 1, evaluated in int so the 8/16-bit
 * difference cannot overflow the element type.
 */
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN
215 
216 int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
217 {
218     int32_t dest;
219 
220     dest = (src1 >> 1) - (src2 >> 1);
221     if ((~src1) & src2 & 1)
222         dest--;
223     return dest;
224 }
225 
226 uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
227 {
228     uint32_t dest;
229 
230     dest = (src1 >> 1) - (src2 >> 1);
231     if ((~src1) & src2 & 1)
232         dest--;
233     return dest;
234 }
235 
/* Pairwise minimum: each result lane is the smaller of two adjacent
 * source lanes (see NEON_PDO2/4 for the pairing).
 */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

/* Pairwise maximum. */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN
249 
/* Shifts.  The do_{s,u}qrshl_* helpers (vec_internal.h) take the value,
 * a signed per-element shift count (only the low byte of the shift
 * operand is used, hence the (int8_t) casts), the element width in bits,
 * a rounding flag, and a pointer for reporting saturation — NULL here,
 * since these variants do not saturate.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

/* Signed rounding shifts (the 'true' argument selects rounding). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}
291 
/* Unsigned rounding shifts; no saturation (NULL qc pointer). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}
323 
/* Unsigned saturating shifts; saturation is reported by the helper
 * through the env->vfp.qc pointer.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
355 
/* Signed saturating shifts; saturation sets QC via env->vfp.qc. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
387 
/* Saturating shift with signed input and unsigned result (SQSHLU-style;
 * see do_suqrshl_bhs/do_suqrshl_d in vec_internal).  Saturation sets QC.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
407 
/* Unsigned saturating rounding shifts (rounding flag true, QC reported). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
439 
/* Signed saturating rounding shifts (rounding flag true, QC reported). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
471 
472 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
473 {
474     uint32_t mask;
475     mask = (a ^ b) & 0x80808080u;
476     a &= ~0x80808080u;
477     b &= ~0x80808080u;
478     return (a + b) ^ mask;
479 }
480 
481 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
482 {
483     uint32_t mask;
484     mask = (a ^ b) & 0x80008000u;
485     a &= ~0x80008000u;
486     b &= ~0x80008000u;
487     return (a + b) ^ mask;
488 }
489 
/* Lanewise subtract. */
#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

/* Lanewise multiply (low half kept by the lane-type truncation). */
#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* Test bits: a lane becomes all-ones when the operands share a set bit,
 * otherwise zero.
 */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN
505 
/* Count Leading Sign/Zero Bits.  */
/* Count leading zeros of an 8-bit value; returns 8 for x == 0. */
static inline int do_clz8(uint8_t x)
{
    int count = 8;
    while (x != 0) {
        x >>= 1;
        count--;
    }
    return count;
}
514 
/* Count leading zeros of a 16-bit value; returns 16 for x == 0. */
static inline int do_clz16(uint16_t x)
{
    int count = 16;
    while (x != 0) {
        x >>= 1;
        count--;
    }
    return count;
}
522 
/* Count leading zeros per lane. */
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* Count leading sign bits: clz of (x < 0 ? ~x : x), minus one so the
 * sign bit itself is not counted.
 */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN
538 
539 uint32_t HELPER(neon_cls_s32)(uint32_t x)
540 {
541     int count;
542     if ((int32_t)x < 0)
543         x = ~x;
544     for (count = 32; x; count--)
545         x = x >> 1;
546     return count - 1;
547 }
548 
549 /* Bit count.  */
550 uint32_t HELPER(neon_cnt_u8)(uint32_t x)
551 {
552     x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
553     x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
554     x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
555     return x;
556 }
557 
558 /* Reverse bits in each 8 bit word */
559 uint32_t HELPER(neon_rbit_u8)(uint32_t x)
560 {
561     x =  ((x & 0xf0f0f0f0) >> 4)
562        | ((x & 0x0f0f0f0f) << 4);
563     x =  ((x & 0x88888888) >> 3)
564        | ((x & 0x44444444) >> 1)
565        | ((x & 0x22222222) << 1)
566        | ((x & 0x11111111) << 3);
567     return x;
568 }
569 
/* Saturating doubling multiply returning the high half, 16-bit lanes.
 * tmp holds 2*a*b; the (tmp ^ (tmp << 1)) test catches the single
 * overflow case (0x8000 * 0x8000) where doubling flips the sign, and
 * clamps to the extreme of the correct sign.  The rounding variant adds
 * 1 << 15 before taking the high half, saturating again if that add
 * overflows.  Saturation sets QC.
 */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

/* As NEON_QDMULH16 but for 32-bit lanes with a 64-bit intermediate. */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32
621 
622 uint32_t HELPER(neon_narrow_u8)(uint64_t x)
623 {
624     return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
625            | ((x >> 24) & 0xff000000u);
626 }
627 
628 uint32_t HELPER(neon_narrow_u16)(uint64_t x)
629 {
630     return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
631 }
632 
633 uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
634 {
635     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
636             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
637 }
638 
639 uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
640 {
641     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
642 }
643 
/* Rounded narrow to high half: add 1 << (half-width - 1) to each lane
 * before extracting its top half.  The pre-masking clears the bits below
 * the rounding bit, so a carry out of one lane can only land in the
 * (ignored) low bits of the lane above.
 */
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}
658 
/* Narrow four signed 16-bit lanes to unsigned 8-bit with saturation:
 * a lane with its sign bit set clamps to 0 (the res bits stay zero),
 * a lane above 0xff clamps to 0xff; both cases set QC.
 */
uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else  { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Narrow four unsigned 16-bit lanes to unsigned 8-bit; lanes above 0xff
 * clamp to 0xff and set QC.
 */
uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Narrow four signed 16-bit lanes to signed 8-bit; out-of-range lanes
 * clamp to 0x80/0x7f depending on sign ((s >> 15) ^ 0x7f) and set QC.
 */
uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}
731 
/* Narrow two signed 32-bit lanes to unsigned 16-bit with saturation:
 * negative lanes clamp to 0, lanes above 0xffff clamp to 0xffff, both
 * setting QC.
 */
uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

/* Narrow two unsigned 32-bit lanes to unsigned 16-bit; lanes above
 * 0xffff clamp to 0xffff and set QC.
 */
uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

/* Narrow two signed 32-bit lanes to signed 16-bit; out-of-range lanes
 * clamp to 0x8000/0x7fff depending on sign and set QC.
 */
uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}
788 
/* Narrow one signed 64-bit lane to unsigned 32-bit: negative values
 * clamp to 0, values above 0xffffffff clamp to 0xffffffff; QC is set
 * in both cases.
 */
uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Narrow one unsigned 64-bit lane to unsigned 32-bit with saturation. */
uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Narrow one signed 64-bit lane to signed 32-bit; out-of-range values
 * clamp to INT32_MIN/INT32_MAX depending on sign and set QC.
 */
uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}
819 
820 uint64_t HELPER(neon_widen_u8)(uint32_t x)
821 {
822     uint64_t tmp;
823     uint64_t ret;
824     ret = (uint8_t)x;
825     tmp = (uint8_t)(x >> 8);
826     ret |= tmp << 16;
827     tmp = (uint8_t)(x >> 16);
828     ret |= tmp << 32;
829     tmp = (uint8_t)(x >> 24);
830     ret |= tmp << 48;
831     return ret;
832 }
833 
834 uint64_t HELPER(neon_widen_s8)(uint32_t x)
835 {
836     uint64_t tmp;
837     uint64_t ret;
838     ret = (uint16_t)(int8_t)x;
839     tmp = (uint16_t)(int8_t)(x >> 8);
840     ret |= tmp << 16;
841     tmp = (uint16_t)(int8_t)(x >> 16);
842     ret |= tmp << 32;
843     tmp = (uint16_t)(int8_t)(x >> 24);
844     ret |= tmp << 48;
845     return ret;
846 }
847 
848 uint64_t HELPER(neon_widen_u16)(uint32_t x)
849 {
850     uint64_t high = (uint16_t)(x >> 16);
851     return ((uint16_t)x) | (high << 32);
852 }
853 
854 uint64_t HELPER(neon_widen_s16)(uint32_t x)
855 {
856     uint64_t high = (int16_t)(x >> 16);
857     return ((uint32_t)(int16_t)x) | (high << 32);
858 }
859 
860 uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
861 {
862     uint64_t mask;
863     mask = (a ^ b) & 0x8000800080008000ull;
864     a &= ~0x8000800080008000ull;
865     b &= ~0x8000800080008000ull;
866     return (a + b) ^ mask;
867 }
868 
869 uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
870 {
871     uint64_t mask;
872     mask = (a ^ b) & 0x8000000080000000ull;
873     a &= ~0x8000000080000000ull;
874     b &= ~0x8000000080000000ull;
875     return (a + b) ^ mask;
876 }
877 
/* Pairwise add of 16-bit lanes, each sum truncated back to 16 bits:
 * result lanes are { a0+a1, a2+a3, b0+b1, b2+b3 }.  tmp accumulates a's
 * pair sums in the low halves of its 32-bit fields, tmp2 accumulates
 * b's pair sums in the high halves; the pre-masking keeps carries out
 * of the extracted 16-bit fields.
 */
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return    ( tmp         & 0xffff)
            | ((tmp  >> 16) & 0xffff0000ull)
            | ((tmp2 << 16) & 0xffff00000000ull)
            | ( tmp2        & 0xffff000000000000ull);
}
892 
893 uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
894 {
895     uint32_t low = a + (a >> 32);
896     uint32_t high = b + (b >> 32);
897     return low + ((uint64_t)high << 32);
898 }
899 
/* Lanewise 16-bit subtract without cross-lane borrows: forcing the top
 * bit of every a-lane on guarantees the per-lane subtraction cannot
 * borrow from the lane above; the true top bits are then restored via
 * the (a ^ ~b) mask.
 */
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

/* As above, for two 32-bit lanes. */
uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}
917 
/* Saturating add of two signed 32-bit lanes.  A lane overflowed when the
 * operands had the same sign but the sum's sign differs; it is then
 * clamped to INT32_MIN/INT32_MAX (depending on the operands' sign) and
 * QC is set.
 */
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

/* Saturating add of one signed 64-bit lane, same overflow rule. */
uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}
951 
/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

/* Absolute difference of 8-bit lanes, widened to 16-bit result lanes. */
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

/* Absolute difference of 16-bit lanes, widened to 32-bit result lanes. */
uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

/* Absolute difference of one 32-bit lane, widened to 64 bits. */
uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD
1021 #undef DO_ABD
1022 
/* Widening multiply. Named type is the source type.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

/* Multiply pairs of 8-bit lanes, producing 16-bit result lanes. */
uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/* Signed 8x8->16 multiply: the lane result is kept modulo 2^16, so an
 * unsigned 16-bit result type produces the same lane bits.
 */
uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/* Multiply pairs of 16-bit lanes, producing 32-bit result lanes. */
uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}
1079 
1080 uint64_t HELPER(neon_negl_u16)(uint64_t x)
1081 {
1082     uint16_t tmp;
1083     uint64_t result;
1084     result = (uint16_t)-x;
1085     tmp = -(x >> 16);
1086     result |= (uint64_t)tmp << 16;
1087     tmp = -(x >> 32);
1088     result |= (uint64_t)tmp << 32;
1089     tmp = -(x >> 48);
1090     result |= (uint64_t)tmp << 48;
1091     return result;
1092 }
1093 
1094 uint64_t HELPER(neon_negl_u32)(uint64_t x)
1095 {
1096     uint32_t low = -x;
1097     uint32_t high = -(x >> 32);
1098     return low | ((uint64_t)high << 32);
1099 }
1100 
/* Saturating sign manipulation.  */
/* ??? Make these use NEON_VOP1 */
/*
 * Saturating absolute value of an int8_t lvalue: -128 (0x80) has no
 * positive counterpart in int8_t, so it saturates to 127 and sets QC.
 */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
1110 uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
1111 {
1112     neon_s8 vec;
1113     NEON_UNPACK(neon_s8, vec, x);
1114     DO_QABS8(vec.v1);
1115     DO_QABS8(vec.v2);
1116     DO_QABS8(vec.v3);
1117     DO_QABS8(vec.v4);
1118     NEON_PACK(neon_s8, x, vec);
1119     return x;
1120 }
1121 #undef DO_QABS8
1122 
/*
 * Saturating negation of an int8_t lvalue: negating -128 would
 * overflow int8_t, so it saturates to 127 and sets QC.
 */
#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
1130 uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
1131 {
1132     neon_s8 vec;
1133     NEON_UNPACK(neon_s8, vec, x);
1134     DO_QNEG8(vec.v1);
1135     DO_QNEG8(vec.v2);
1136     DO_QNEG8(vec.v3);
1137     DO_QNEG8(vec.v4);
1138     NEON_PACK(neon_s8, x, vec);
1139     return x;
1140 }
1141 #undef DO_QNEG8
1142 
/*
 * Saturating absolute value of an int16_t lvalue: -32768 (0x8000) has
 * no positive counterpart in int16_t, so it saturates to 32767 and
 * sets QC.
 */
#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
1150 uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
1151 {
1152     neon_s16 vec;
1153     NEON_UNPACK(neon_s16, vec, x);
1154     DO_QABS16(vec.v1);
1155     DO_QABS16(vec.v2);
1156     NEON_PACK(neon_s16, x, vec);
1157     return x;
1158 }
1159 #undef DO_QABS16
1160 
/*
 * Saturating negation of an int16_t lvalue: negating -32768 would
 * overflow int16_t, so it saturates to 32767 and sets QC.
 */
#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
1168 uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
1169 {
1170     neon_s16 vec;
1171     NEON_UNPACK(neon_s16, vec, x);
1172     DO_QNEG16(vec.v1);
1173     DO_QNEG16(vec.v2);
1174     NEON_PACK(neon_s16, x, vec);
1175     return x;
1176 }
1177 #undef DO_QNEG16
1178 
1179 uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
1180 {
1181     if (x == SIGNBIT) {
1182         SET_QC();
1183         x = ~SIGNBIT;
1184     } else if ((int32_t)x < 0) {
1185         x = -x;
1186     }
1187     return x;
1188 }
1189 
1190 uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
1191 {
1192     if (x == SIGNBIT) {
1193         SET_QC();
1194         x = ~SIGNBIT;
1195     } else {
1196         x = -x;
1197     }
1198     return x;
1199 }
1200 
1201 uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
1202 {
1203     if (x == SIGNBIT64) {
1204         SET_QC();
1205         x = ~SIGNBIT64;
1206     } else if ((int64_t)x < 0) {
1207         x = -x;
1208     }
1209     return x;
1210 }
1211 
1212 uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
1213 {
1214     if (x == SIGNBIT64) {
1215         SET_QC();
1216         x = ~SIGNBIT64;
1217     } else {
1218         x = -x;
1219     }
1220     return x;
1221 }
1222 
1223 /* NEON Float helpers.  */
1224 
1225 /* Floating point comparisons produce an integer result.
1226  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1227  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1228  */
1229 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
1230 {
1231     float_status *fpst = fpstp;
1232     return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
1233 }
1234 
1235 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
1236 {
1237     float_status *fpst = fpstp;
1238     return -float32_le(make_float32(b), make_float32(a), fpst);
1239 }
1240 
1241 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1242 {
1243     float_status *fpst = fpstp;
1244     return -float32_lt(make_float32(b), make_float32(a), fpst);
1245 }
1246 
1247 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
1248 {
1249     float_status *fpst = fpstp;
1250     float32 f0 = float32_abs(make_float32(a));
1251     float32 f1 = float32_abs(make_float32(b));
1252     return -float32_le(f1, f0, fpst);
1253 }
1254 
1255 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1256 {
1257     float_status *fpst = fpstp;
1258     float32 f0 = float32_abs(make_float32(a));
1259     float32 f1 = float32_abs(make_float32(b));
1260     return -float32_lt(f1, f0, fpst);
1261 }
1262 
1263 uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
1264 {
1265     float_status *fpst = fpstp;
1266     float64 f0 = float64_abs(make_float64(a));
1267     float64 f1 = float64_abs(make_float64(b));
1268     return -float64_le(f1, f0, fpst);
1269 }
1270 
1271 uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
1272 {
1273     float_status *fpst = fpstp;
1274     float64 f0 = float64_abs(make_float64(a));
1275     float64 f1 = float64_abs(make_float64(b));
1276     return -float64_lt(f1, f0, fpst);
1277 }
1278 
1279 #define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
1280 
/*
 * 128-bit unzip of byte elements: the even-indexed bytes of the
 * register pair (those of d first, then those of m) are gathered into
 * d, the odd-indexed bytes into m.  All four result words are computed
 * before any output is stored, so d and m are read consistently even
 * though the operation is in place.
 */
void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    /* d result: even bytes of d, then even bytes of m */
    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    /* m result: odd bytes of d, then odd bytes of m */
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1309 
/*
 * 128-bit unzip of halfword elements: even-indexed halfwords of d:m
 * go to d, odd-indexed halfwords to m.  Results are computed before
 * any store, so in-place operation is safe.
 */
void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    /* d result: even halfwords of d, then even halfwords of m */
    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    /* m result: odd halfwords of d, then odd halfwords of m */
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1330 
/*
 * 128-bit unzip of word elements: even-indexed words of d:m go to d,
 * odd-indexed words to m.  Results are computed before any store, so
 * in-place operation is safe.
 */
void HELPER(neon_qunzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    /* d result: even words of d, then even words of m */
    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    /* m result: odd words of d, then odd words of m */
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1347 
/*
 * 64-bit unzip of byte elements: even-indexed bytes of the pair d:m
 * are gathered into d, odd-indexed bytes into m.  Results are computed
 * before any store, so in-place operation is safe.
 */
void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    /* d result: even bytes of d in the low half, of m in the high */
    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    /* m result: odd bytes of d in the low half, of m in the high */
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}
1365 
/*
 * 64-bit unzip of halfword elements: even-indexed halfwords of d:m
 * go to d, odd-indexed halfwords to m.  Results are computed before
 * any store, so in-place operation is safe.
 */
void HELPER(neon_unzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}
1379 
/*
 * 128-bit zip (interleave) of byte elements: pairs (d[i], m[i]) from
 * the low eight lanes fill the d result, pairs from the high eight
 * lanes fill the m result.  All four result words are computed before
 * any store, so in-place operation is safe.
 */
void HELPER(neon_qzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    /* d result: bytes 0-7 of d and m, interleaved d-first */
    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    /* m result: bytes 8-15 of d and m, interleaved d-first */
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1408 
/*
 * 128-bit zip (interleave) of halfword elements: pairs (d[i], m[i])
 * from the low four lanes fill the d result, pairs from the high four
 * lanes fill the m result.  Results are computed before any store, so
 * in-place operation is safe.
 */
void HELPER(neon_qzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    /* d result: halfwords 0-3 of d and m, interleaved d-first */
    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    /* m result: halfwords 4-7 of d and m, interleaved d-first */
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1429 
/*
 * 128-bit zip (interleave) of word elements: pairs (d[i], m[i]) from
 * the low two lanes fill the d result, pairs from the high two lanes
 * fill the m result.  Results are computed before any store, so
 * in-place operation is safe.
 */
void HELPER(neon_qzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1446 
/*
 * 64-bit zip (interleave) of byte elements: pairs (d[i], m[i]) from
 * the low four lanes fill the d result, pairs from the high four
 * lanes fill the m result.  Results are computed before any store, so
 * in-place operation is safe.
 */
void HELPER(neon_zip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    /* d result: bytes 0-3 of d and m, interleaved d-first */
    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
    /* m result: bytes 4-7 of d and m, interleaved d-first */
    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}
1464 
/*
 * 64-bit zip (interleave) of halfword elements: pairs (d[i], m[i])
 * from the low two lanes fill the d result, pairs from the high two
 * lanes fill the m result.  Results are computed before any store, so
 * in-place operation is safe.
 */
void HELPER(neon_zip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}
1478