xref: /openbmc/qemu/target/arm/tcg/neon_helper.c (revision e72a68781572f31cbd6824681720ff936fba4707)
1 /*
2  * ARM NEON vector operations.
3  *
4  * Copyright (c) 2007, 2008 CodeSourcery.
5  * Written by Paul Brook
6  *
7  * This code is licensed under the GNU GPL v2.
8  */
9 
10 #include "qemu/osdep.h"
11 #include "cpu.h"
12 #include "exec/helper-proto.h"
13 #include "tcg/tcg-gvec-desc.h"
14 #include "fpu/softfloat.h"
15 #include "vec_internal.h"
16 
/* Sign-bit masks for 32-bit and 64-bit saturating arithmetic below. */
#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

/* Set the sticky QC (cumulative saturation) flag in the vfp state. */
#define SET_QC() env->vfp.qc[0] = 1
21 
/*
 * Define structs that pack 1, 2 or 4 lanes into one 32-bit register
 * image.  Lane order in memory is flipped on big-endian hosts so that
 * the NEON_UNPACK/NEON_PACK union trick below yields lane v1 as the
 * least significant lane of the uint32_t on either endianness.
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

/* Instantiate the lane-struct types actually used by the helpers. */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
68 
/* Copy from a uint32_t to a vector structure type (type-pun via union,
 * which is well-defined in the GNU C dialect QEMU compiles with).  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* Apply the element operation NEON_FN to each of 1, 2 or 4 lanes.
 * NEON_FN(dest, src1, src2) is defined just before each instantiation. */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
99 
/* Shared body for a two-operand helper: unpack both 32-bit register
 * images into lanes, apply NEON_FN per lane, repack the result. */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Emit helper_neon_<name>(arg1, arg2). */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP but with a CPUARMState * first argument, for ops that
 * need to update the QC saturation flag. */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* Whole-vector (gvec) form: apply NEON_FN across opr_sz bytes of the
 * operand vectors and zero the tail up to maxsz. */
#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    vtype *d = vd, *n = vn, *m = vm;                            \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], m[i]);                              \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

/* gvec form with CPUARMState, for saturating (QC-setting) ops. */
#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, void *venv, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    vtype *d = vd, *n = vn, *m = vm;                            \
    CPUARMState *env = venv;                                    \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], m[i]);                              \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}
143 
/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
/* Combine adjacent lanes: result low half comes from pairs within
 * vsrc1, high half from pairs within vsrc2. */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

/* Emit helper_neon_<name> performing the pairwise op NEON_FN. */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators.  */
/* NEON_FN's third (src2) argument is unused for unary ops. */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}
181 
/* Halving add: (src1 + src2) >> 1 per lane.  The 8/16-bit lane values
 * are promoted to int before the add, so the sum cannot overflow. */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN
188 
189 int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
190 {
191     int32_t dest;
192 
193     dest = (src1 >> 1) + (src2 >> 1);
194     if (src1 & src2 & 1)
195         dest++;
196     return dest;
197 }
198 
199 uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
200 {
201     uint32_t dest;
202 
203     dest = (src1 >> 1) + (src2 >> 1);
204     if (src1 & src2 & 1)
205         dest++;
206     return dest;
207 }
208 
/* Rounding halving add: (src1 + src2 + 1) >> 1 per lane, again safe
 * from overflow because lanes are promoted to int first. */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN
215 
216 int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
217 {
218     int32_t dest;
219 
220     dest = (src1 >> 1) + (src2 >> 1);
221     if ((src1 | src2) & 1)
222         dest++;
223     return dest;
224 }
225 
226 uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
227 {
228     uint32_t dest;
229 
230     dest = (src1 >> 1) + (src2 >> 1);
231     if ((src1 | src2) & 1)
232         dest++;
233     return dest;
234 }
235 
/* Halving subtract: (src1 - src2) >> 1 per lane (int promotion keeps
 * the intermediate difference exact). */
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN
242 
243 int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
244 {
245     int32_t dest;
246 
247     dest = (src1 >> 1) - (src2 >> 1);
248     if ((~src1) & src2 & 1)
249         dest--;
250     return dest;
251 }
252 
253 uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
254 {
255     uint32_t dest;
256 
257     dest = (src1 >> 1) - (src2 >> 1);
258     if ((~src1) & src2 & 1)
259         dest--;
260     return dest;
261 }
262 
/* Pairwise minimum of adjacent lanes. */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

/* Pairwise maximum of adjacent lanes. */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN
276 
/*
 * Shift-by-register helpers.  The effective shift count is the low
 * signed byte of src2: positive counts shift left, negative right.
 * The heavy lifting is done by do_{s,u}q{r}shl_* (see vec_internal.h);
 * the bool argument enables rounding and the final pointer is where
 * saturation is recorded (NULL: these forms do not saturate).
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

/* Signed rounding shifts (rounding enabled: true). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
#undef NEON_FN

/* 32- and 64-bit signed rounding shift, single-element forms. */
uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}
318 
/* Unsigned rounding shifts by register (no saturation: NULL). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN

/* 32- and 64-bit unsigned rounding shift, single-element forms. */
uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}
350 
/* Unsigned saturating shifts: saturation is recorded in env->vfp.qc. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
#undef NEON_FN

/* 32- and 64-bit unsigned saturating shift, single-element forms. */
uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
382 
/* Signed saturating shifts: saturation is recorded in env->vfp.qc. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
#undef NEON_FN

/* 32- and 64-bit signed saturating shift, single-element forms. */
uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
414 
/* Signed-input, unsigned-result saturating shifts (do_suqrshl_*),
 * saturation recorded in env->vfp.qc. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
434 
/* Unsigned saturating rounding shifts (rounding: true, QC on saturate). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
454 
/* Signed saturating rounding shifts (rounding: true, QC on saturate). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
474 
475 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
476 {
477     uint32_t mask;
478     mask = (a ^ b) & 0x80808080u;
479     a &= ~0x80808080u;
480     b &= ~0x80808080u;
481     return (a + b) ^ mask;
482 }
483 
484 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
485 {
486     uint32_t mask;
487     mask = (a ^ b) & 0x80008000u;
488     a &= ~0x80008000u;
489     b &= ~0x80008000u;
490     return (a + b) ^ mask;
491 }
492 
/* Lanewise subtract (wraps modulo the lane width). */
#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

/* Lanewise multiply (low half of the product). */
#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* Lanewise test: all-ones if the lanes share any set bit, else zero. */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN
508 
509 /* Count Leading Sign/Zero Bits.  */
/* Count leading zero bits of an 8-bit value; returns 8 for x == 0. */
static inline int do_clz8(uint8_t x)
{
    int n = 8;

    while (x) {
        x >>= 1;
        n--;
    }
    return n;
}
517 
/* Count leading zero bits of a 16-bit value; returns 16 for x == 0. */
static inline int do_clz16(uint16_t x)
{
    int n = 16;

    while (x) {
        x >>= 1;
        n--;
    }
    return n;
}
525 
/* Lanewise count-leading-zeros. */
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* Lanewise count-leading-sign-bits: clz of the complemented value for
 * negative lanes, minus one so the sign bit itself is not counted. */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN
541 
542 uint32_t HELPER(neon_cls_s32)(uint32_t x)
543 {
544     int count;
545     if ((int32_t)x < 0)
546         x = ~x;
547     for (count = 32; x; count--)
548         x = x >> 1;
549     return count - 1;
550 }
551 
552 /* Bit count.  */
553 uint32_t HELPER(neon_cnt_u8)(uint32_t x)
554 {
555     x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
556     x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
557     x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
558     return x;
559 }
560 
561 /* Reverse bits in each 8 bit word */
562 uint32_t HELPER(neon_rbit_u8)(uint32_t x)
563 {
564     x =  ((x & 0xf0f0f0f0) >> 4)
565        | ((x & 0x0f0f0f0f) << 4);
566     x =  ((x & 0x88888888) >> 3)
567        | ((x & 0x44444444) >> 1)
568        | ((x & 0x22222222) << 1)
569        | ((x & 0x11111111) << 3);
570     return x;
571 }
572 
/*
 * Saturating doubling multiply returning high half, 16-bit lanes.
 * The 32-bit product is doubled; doubling overflows only for
 * 0x8000 * 0x8000, which is clamped (and sets QC).  With round set,
 * 1 << 15 is added before taking the high half, clamping again on
 * overflow.
 */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16
598 
/* 32-bit lane variant of NEON_QDMULH16; see the comment there. */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32
624 
625 uint32_t HELPER(neon_narrow_u8)(uint64_t x)
626 {
627     return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
628            | ((x >> 24) & 0xff000000u);
629 }
630 
631 uint32_t HELPER(neon_narrow_u16)(uint64_t x)
632 {
633     return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
634 }
635 
636 uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
637 {
638     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
639             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
640 }
641 
642 uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
643 {
644     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
645 }
646 
647 uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
648 {
649     x &= 0xff80ff80ff80ff80ull;
650     x += 0x0080008000800080ull;
651     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
652             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
653 }
654 
655 uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
656 {
657     x &= 0xffff8000ffff8000ull;
658     x += 0x0000800000008000ull;
659     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
660 }
661 
/*
 * Narrow each signed 16-bit lane to an unsigned byte with saturation:
 * negative lanes produce 0, lanes above 0xff produce 0xff; either kind
 * of clamping sets the sticky QC flag.
 */
uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
    /* Saturate the 16-bit lane at bit offset n into the byte at n/2.
     * For negative lanes nothing is OR'd in, leaving the byte 0. */
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else  { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}
688 
/*
 * Narrow each unsigned 16-bit lane to an unsigned byte with saturation
 * (values above 0xff clamp to 0xff and set the QC flag).
 */
uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
    /* Saturate the 16-bit lane at bit offset n into the byte at n/2. */
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}
711 
/*
 * Narrow each signed 16-bit lane to a signed byte with saturation:
 * out-of-range lanes clamp to 0x7f/0x80 (via the sign-select idiom
 * (s >> 15) ^ 0x7f) and set the QC flag.
 */
uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
    /* Saturate the 16-bit lane at bit offset n into the byte at n/2. */
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}
734 
735 uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
736 {
737     uint32_t high;
738     uint32_t low;
739     low = x;
740     if (low & 0x80000000) {
741         low = 0;
742         SET_QC();
743     } else if (low > 0xffff) {
744         low = 0xffff;
745         SET_QC();
746     }
747     high = x >> 32;
748     if (high & 0x80000000) {
749         high = 0;
750         SET_QC();
751     } else if (high > 0xffff) {
752         high = 0xffff;
753         SET_QC();
754     }
755     return low | (high << 16);
756 }
757 
758 uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
759 {
760     uint32_t high;
761     uint32_t low;
762     low = x;
763     if (low > 0xffff) {
764         low = 0xffff;
765         SET_QC();
766     }
767     high = x >> 32;
768     if (high > 0xffff) {
769         high = 0xffff;
770         SET_QC();
771     }
772     return low | (high << 16);
773 }
774 
775 uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
776 {
777     int32_t low;
778     int32_t high;
779     low = x;
780     if (low != (int16_t)low) {
781         low = (low >> 31) ^ 0x7fff;
782         SET_QC();
783     }
784     high = x >> 32;
785     if (high != (int16_t)high) {
786         high = (high >> 31) ^ 0x7fff;
787         SET_QC();
788     }
789     return (uint16_t)low | (high << 16);
790 }
791 
792 uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
793 {
794     if (x & 0x8000000000000000ull) {
795         SET_QC();
796         return 0;
797     }
798     if (x > 0xffffffffu) {
799         SET_QC();
800         return 0xffffffffu;
801     }
802     return x;
803 }
804 
805 uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
806 {
807     if (x > 0xffffffffu) {
808         SET_QC();
809         return 0xffffffffu;
810     }
811     return x;
812 }
813 
814 uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
815 {
816     if ((int64_t)x != (int32_t)x) {
817         SET_QC();
818         return ((int64_t)x >> 63) ^ 0x7fffffff;
819     }
820     return x;
821 }
822 
823 uint64_t HELPER(neon_widen_u8)(uint32_t x)
824 {
825     uint64_t tmp;
826     uint64_t ret;
827     ret = (uint8_t)x;
828     tmp = (uint8_t)(x >> 8);
829     ret |= tmp << 16;
830     tmp = (uint8_t)(x >> 16);
831     ret |= tmp << 32;
832     tmp = (uint8_t)(x >> 24);
833     ret |= tmp << 48;
834     return ret;
835 }
836 
837 uint64_t HELPER(neon_widen_s8)(uint32_t x)
838 {
839     uint64_t tmp;
840     uint64_t ret;
841     ret = (uint16_t)(int8_t)x;
842     tmp = (uint16_t)(int8_t)(x >> 8);
843     ret |= tmp << 16;
844     tmp = (uint16_t)(int8_t)(x >> 16);
845     ret |= tmp << 32;
846     tmp = (uint16_t)(int8_t)(x >> 24);
847     ret |= tmp << 48;
848     return ret;
849 }
850 
851 uint64_t HELPER(neon_widen_u16)(uint32_t x)
852 {
853     uint64_t high = (uint16_t)(x >> 16);
854     return ((uint16_t)x) | (high << 32);
855 }
856 
857 uint64_t HELPER(neon_widen_s16)(uint32_t x)
858 {
859     uint64_t high = (int16_t)(x >> 16);
860     return ((uint32_t)(int16_t)x) | (high << 32);
861 }
862 
863 uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
864 {
865     uint64_t mask;
866     mask = (a ^ b) & 0x8000800080008000ull;
867     a &= ~0x8000800080008000ull;
868     b &= ~0x8000800080008000ull;
869     return (a + b) ^ mask;
870 }
871 
872 uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
873 {
874     uint64_t mask;
875     mask = (a ^ b) & 0x8000000080000000ull;
876     a &= ~0x8000000080000000ull;
877     b &= ~0x8000000080000000ull;
878     return (a + b) ^ mask;
879 }
880 
881 uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
882 {
883     uint64_t tmp;
884     uint64_t tmp2;
885 
886     tmp = a & 0x0000ffff0000ffffull;
887     tmp += (a >> 16) & 0x0000ffff0000ffffull;
888     tmp2 = b & 0xffff0000ffff0000ull;
889     tmp2 += (b << 16) & 0xffff0000ffff0000ull;
890     return    ( tmp         & 0xffff)
891             | ((tmp  >> 16) & 0xffff0000ull)
892             | ((tmp2 << 16) & 0xffff00000000ull)
893             | ( tmp2        & 0xffff000000000000ull);
894 }
895 
896 uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
897 {
898     uint32_t low = a + (a >> 32);
899     uint32_t high = b + (b >> 32);
900     return low + ((uint64_t)high << 32);
901 }
902 
903 uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
904 {
905     uint64_t mask;
906     mask = (a ^ ~b) & 0x8000800080008000ull;
907     a |= 0x8000800080008000ull;
908     b &= ~0x8000800080008000ull;
909     return (a - b) ^ mask;
910 }
911 
912 uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
913 {
914     uint64_t mask;
915     mask = (a ^ ~b) & 0x8000000080000000ull;
916     a |= 0x8000000080000000ull;
917     b &= ~0x8000000080000000ull;
918     return (a - b) ^ mask;
919 }
920 
921 uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
922 {
923     uint32_t x, y;
924     uint32_t low, high;
925 
926     x = a;
927     y = b;
928     low = x + y;
929     if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
930         SET_QC();
931         low = ((int32_t)x >> 31) ^ ~SIGNBIT;
932     }
933     x = a >> 32;
934     y = b >> 32;
935     high = x + y;
936     if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
937         SET_QC();
938         high = ((int32_t)x >> 31) ^ ~SIGNBIT;
939     }
940     return low | ((uint64_t)high << 32);
941 }
942 
943 uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
944 {
945     uint64_t result;
946 
947     result = a + b;
948     if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
949         SET_QC();
950         result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
951     }
952     return result;
953 }
954 
/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
/* dest = |x - y| with x, y first narrowed to intype then widened to
 * arithtype for the subtraction. */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)
964 
965 uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
966 {
967     uint64_t tmp;
968     uint64_t result;
969     DO_ABD(result, a, b, uint8_t, uint32_t);
970     DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
971     result |= tmp << 16;
972     DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
973     result |= tmp << 32;
974     DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
975     result |= tmp << 48;
976     return result;
977 }
978 
979 uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
980 {
981     uint64_t tmp;
982     uint64_t result;
983     DO_ABD(result, a, b, int8_t, int32_t);
984     DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
985     result |= tmp << 16;
986     DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
987     result |= tmp << 32;
988     DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
989     result |= tmp << 48;
990     return result;
991 }
992 
993 uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
994 {
995     uint64_t tmp;
996     uint64_t result;
997     DO_ABD(result, a, b, uint16_t, uint32_t);
998     DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
999     return result | (tmp << 32);
1000 }
1001 
1002 uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
1003 {
1004     uint64_t tmp;
1005     uint64_t result;
1006     DO_ABD(result, a, b, int16_t, int32_t);
1007     DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
1008     return result | (tmp << 32);
1009 }
1010 
1011 uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
1012 {
1013     uint64_t result;
1014     DO_ABD(result, a, b, uint32_t, uint64_t);
1015     return result;
1016 }
1017 
1018 uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
1019 {
1020     uint64_t result;
1021     DO_ABD(result, a, b, int32_t, int64_t);
1022     return result;
1023 }
1024 #undef DO_ABD
1025 
/* Widening multiply. Named type is the source type.  */
/* dest = (type2)x * (type2)y with x, y first narrowed to type1; the
 * casts to type2 avoid overflow in the implicit int promotion. */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)
1032 
1033 uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
1034 {
1035     uint64_t tmp;
1036     uint64_t result;
1037 
1038     DO_MULL(result, a, b, uint8_t, uint16_t);
1039     DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
1040     result |= tmp << 16;
1041     DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
1042     result |= tmp << 32;
1043     DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
1044     result |= tmp << 48;
1045     return result;
1046 }
1047 
1048 uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
1049 {
1050     uint64_t tmp;
1051     uint64_t result;
1052 
1053     DO_MULL(result, a, b, int8_t, uint16_t);
1054     DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
1055     result |= tmp << 16;
1056     DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
1057     result |= tmp << 32;
1058     DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
1059     result |= tmp << 48;
1060     return result;
1061 }
1062 
1063 uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
1064 {
1065     uint64_t tmp;
1066     uint64_t result;
1067 
1068     DO_MULL(result, a, b, uint16_t, uint32_t);
1069     DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
1070     return result | (tmp << 32);
1071 }
1072 
1073 uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
1074 {
1075     uint64_t tmp;
1076     uint64_t result;
1077 
1078     DO_MULL(result, a, b, int16_t, uint32_t);
1079     DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
1080     return result | (tmp << 32);
1081 }
1082 
1083 uint64_t HELPER(neon_negl_u16)(uint64_t x)
1084 {
1085     uint16_t tmp;
1086     uint64_t result;
1087     result = (uint16_t)-x;
1088     tmp = -(x >> 16);
1089     result |= (uint64_t)tmp << 16;
1090     tmp = -(x >> 32);
1091     result |= (uint64_t)tmp << 32;
1092     tmp = -(x >> 48);
1093     result |= (uint64_t)tmp << 48;
1094     return result;
1095 }
1096 
1097 uint64_t HELPER(neon_negl_u32)(uint64_t x)
1098 {
1099     uint32_t low = -x;
1100     uint32_t high = -(x >> 32);
1101     return low | ((uint64_t)high << 32);
1102 }
1103 
/* Saturating sign manipulation.  */
/* ??? Make these use NEON_VOP1 */
/*
 * Saturating absolute value of one signed 8-bit lane: negating
 * (int8_t)0x80 (INT8_MIN) would overflow, so it saturates to 0x7f and
 * sets the QC flag via SET_QC(); other negative values are negated.
 */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
1113 uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
1114 {
1115     neon_s8 vec;
1116     NEON_UNPACK(neon_s8, vec, x);
1117     DO_QABS8(vec.v1);
1118     DO_QABS8(vec.v2);
1119     DO_QABS8(vec.v3);
1120     DO_QABS8(vec.v4);
1121     NEON_PACK(neon_s8, x, vec);
1122     return x;
1123 }
1124 #undef DO_QABS8
1125 
/*
 * Saturating negation of one signed 8-bit lane: negating (int8_t)0x80
 * (INT8_MIN) would overflow, so it saturates to 0x7f and sets the QC
 * flag via SET_QC(); all other values are negated (zero stays zero).
 */
#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
1133 uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
1134 {
1135     neon_s8 vec;
1136     NEON_UNPACK(neon_s8, vec, x);
1137     DO_QNEG8(vec.v1);
1138     DO_QNEG8(vec.v2);
1139     DO_QNEG8(vec.v3);
1140     DO_QNEG8(vec.v4);
1141     NEON_PACK(neon_s8, x, vec);
1142     return x;
1143 }
1144 #undef DO_QNEG8
1145 
/*
 * Saturating absolute value of one signed 16-bit lane: negating
 * (int16_t)0x8000 (INT16_MIN) would overflow, so it saturates to
 * 0x7fff and sets the QC flag; other negative values are negated.
 */
#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
1153 uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
1154 {
1155     neon_s16 vec;
1156     NEON_UNPACK(neon_s16, vec, x);
1157     DO_QABS16(vec.v1);
1158     DO_QABS16(vec.v2);
1159     NEON_PACK(neon_s16, x, vec);
1160     return x;
1161 }
1162 #undef DO_QABS16
1163 
/*
 * Saturating negation of one signed 16-bit lane: negating
 * (int16_t)0x8000 (INT16_MIN) would overflow, so it saturates to
 * 0x7fff and sets the QC flag; all other values are negated.
 */
#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
1171 uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
1172 {
1173     neon_s16 vec;
1174     NEON_UNPACK(neon_s16, vec, x);
1175     DO_QNEG16(vec.v1);
1176     DO_QNEG16(vec.v2);
1177     NEON_PACK(neon_s16, x, vec);
1178     return x;
1179 }
1180 #undef DO_QNEG16
1181 
1182 uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
1183 {
1184     if (x == SIGNBIT) {
1185         SET_QC();
1186         x = ~SIGNBIT;
1187     } else if ((int32_t)x < 0) {
1188         x = -x;
1189     }
1190     return x;
1191 }
1192 
1193 uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
1194 {
1195     if (x == SIGNBIT) {
1196         SET_QC();
1197         x = ~SIGNBIT;
1198     } else {
1199         x = -x;
1200     }
1201     return x;
1202 }
1203 
1204 uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
1205 {
1206     if (x == SIGNBIT64) {
1207         SET_QC();
1208         x = ~SIGNBIT64;
1209     } else if ((int64_t)x < 0) {
1210         x = -x;
1211     }
1212     return x;
1213 }
1214 
1215 uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
1216 {
1217     if (x == SIGNBIT64) {
1218         SET_QC();
1219         x = ~SIGNBIT64;
1220     } else {
1221         x = -x;
1222     }
1223     return x;
1224 }
1225 
1226 /* NEON Float helpers.  */
1227 
1228 /* Floating point comparisons produce an integer result.
1229  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1230  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1231  */
1232 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
1233 {
1234     float_status *fpst = fpstp;
1235     return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
1236 }
1237 
1238 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
1239 {
1240     float_status *fpst = fpstp;
1241     return -float32_le(make_float32(b), make_float32(a), fpst);
1242 }
1243 
1244 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1245 {
1246     float_status *fpst = fpstp;
1247     return -float32_lt(make_float32(b), make_float32(a), fpst);
1248 }
1249 
1250 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
1251 {
1252     float_status *fpst = fpstp;
1253     float32 f0 = float32_abs(make_float32(a));
1254     float32 f1 = float32_abs(make_float32(b));
1255     return -float32_le(f1, f0, fpst);
1256 }
1257 
1258 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1259 {
1260     float_status *fpst = fpstp;
1261     float32 f0 = float32_abs(make_float32(a));
1262     float32 f1 = float32_abs(make_float32(b));
1263     return -float32_lt(f1, f0, fpst);
1264 }
1265 
1266 uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
1267 {
1268     float_status *fpst = fpstp;
1269     float64 f0 = float64_abs(make_float64(a));
1270     float64 f1 = float64_abs(make_float64(b));
1271     return -float64_le(f1, f0, fpst);
1272 }
1273 
1274 uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
1275 {
1276     float_status *fpst = fpstp;
1277     float64 f0 = float64_abs(make_float64(a));
1278     float64 f1 = float64_abs(make_float64(b));
1279     return -float64_lt(f1, f0, fpst);
1280 }
1281 
/* Extract element N (SIZE bits wide) from the 64-bit vector word V. */
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
1283 
1284 void HELPER(neon_qunzip8)(void *vd, void *vm)
1285 {
1286     uint64_t *rd = vd, *rm = vm;
1287     uint64_t zd0 = rd[0], zd1 = rd[1];
1288     uint64_t zm0 = rm[0], zm1 = rm[1];
1289 
1290     uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
1291         | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
1292         | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
1293         | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
1294     uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
1295         | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
1296         | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
1297         | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
1298     uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
1299         | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
1300         | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
1301         | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
1302     uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
1303         | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
1304         | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
1305         | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
1306 
1307     rm[0] = m0;
1308     rm[1] = m1;
1309     rd[0] = d0;
1310     rd[1] = d1;
1311 }
1312 
1313 void HELPER(neon_qunzip16)(void *vd, void *vm)
1314 {
1315     uint64_t *rd = vd, *rm = vm;
1316     uint64_t zd0 = rd[0], zd1 = rd[1];
1317     uint64_t zm0 = rm[0], zm1 = rm[1];
1318 
1319     uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
1320         | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
1321     uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
1322         | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
1323     uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
1324         | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
1325     uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
1326         | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
1327 
1328     rm[0] = m0;
1329     rm[1] = m1;
1330     rd[0] = d0;
1331     rd[1] = d1;
1332 }
1333 
1334 void HELPER(neon_qunzip32)(void *vd, void *vm)
1335 {
1336     uint64_t *rd = vd, *rm = vm;
1337     uint64_t zd0 = rd[0], zd1 = rd[1];
1338     uint64_t zm0 = rm[0], zm1 = rm[1];
1339 
1340     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
1341     uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1342     uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
1343     uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1344 
1345     rm[0] = m0;
1346     rm[1] = m1;
1347     rd[0] = d0;
1348     rd[1] = d1;
1349 }
1350 
1351 void HELPER(neon_unzip8)(void *vd, void *vm)
1352 {
1353     uint64_t *rd = vd, *rm = vm;
1354     uint64_t zd = rd[0], zm = rm[0];
1355 
1356     uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
1357         | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
1358         | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
1359         | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
1360     uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
1361         | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
1362         | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
1363         | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
1364 
1365     rm[0] = m0;
1366     rd[0] = d0;
1367 }
1368 
1369 void HELPER(neon_unzip16)(void *vd, void *vm)
1370 {
1371     uint64_t *rd = vd, *rm = vm;
1372     uint64_t zd = rd[0], zm = rm[0];
1373 
1374     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
1375         | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
1376     uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
1377         | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
1378 
1379     rm[0] = m0;
1380     rd[0] = d0;
1381 }
1382 
1383 void HELPER(neon_qzip8)(void *vd, void *vm)
1384 {
1385     uint64_t *rd = vd, *rm = vm;
1386     uint64_t zd0 = rd[0], zd1 = rd[1];
1387     uint64_t zm0 = rm[0], zm1 = rm[1];
1388 
1389     uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
1390         | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
1391         | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
1392         | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
1393     uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
1394         | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
1395         | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
1396         | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
1397     uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
1398         | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
1399         | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
1400         | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
1401     uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
1402         | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
1403         | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
1404         | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
1405 
1406     rm[0] = m0;
1407     rm[1] = m1;
1408     rd[0] = d0;
1409     rd[1] = d1;
1410 }
1411 
1412 void HELPER(neon_qzip16)(void *vd, void *vm)
1413 {
1414     uint64_t *rd = vd, *rm = vm;
1415     uint64_t zd0 = rd[0], zd1 = rd[1];
1416     uint64_t zm0 = rm[0], zm1 = rm[1];
1417 
1418     uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
1419         | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
1420     uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
1421         | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
1422     uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
1423         | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
1424     uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
1425         | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
1426 
1427     rm[0] = m0;
1428     rm[1] = m1;
1429     rd[0] = d0;
1430     rd[1] = d1;
1431 }
1432 
1433 void HELPER(neon_qzip32)(void *vd, void *vm)
1434 {
1435     uint64_t *rd = vd, *rm = vm;
1436     uint64_t zd0 = rd[0], zd1 = rd[1];
1437     uint64_t zm0 = rm[0], zm1 = rm[1];
1438 
1439     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
1440     uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
1441     uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1442     uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1443 
1444     rm[0] = m0;
1445     rm[1] = m1;
1446     rd[0] = d0;
1447     rd[1] = d1;
1448 }
1449 
1450 void HELPER(neon_zip8)(void *vd, void *vm)
1451 {
1452     uint64_t *rd = vd, *rm = vm;
1453     uint64_t zd = rd[0], zm = rm[0];
1454 
1455     uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
1456         | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
1457         | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
1458         | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
1459     uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
1460         | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
1461         | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
1462         | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
1463 
1464     rm[0] = m0;
1465     rd[0] = d0;
1466 }
1467 
1468 void HELPER(neon_zip16)(void *vd, void *vm)
1469 {
1470     uint64_t *rd = vd, *rm = vm;
1471     uint64_t zd = rd[0], zm = rm[0];
1472 
1473     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
1474         | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
1475     uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
1476         | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
1477 
1478     rm[0] = m0;
1479     rd[0] = d0;
1480 }
1481