xref: /openbmc/qemu/target/arm/tcg/neon_helper.c (revision 34c0d865a3a29a160f3e572bd49f606cddc56c85)
1 /*
2  * ARM NEON vector operations.
3  *
4  * Copyright (c) 2007, 2008 CodeSourcery.
5  * Written by Paul Brook
6  *
7  * This code is licensed under the GNU GPL v2.
8  */
9 
10 #include "qemu/osdep.h"
11 #include "cpu.h"
12 #include "exec/helper-proto.h"
13 #include "tcg/tcg-gvec-desc.h"
14 #include "fpu/softfloat.h"
15 #include "vec_internal.h"
16 
/* Most-significant-bit masks used by the saturating arithmetic below. */
#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

/* Record saturation by setting the cumulative QC flag in the vfp state. */
#define SET_QC() env->vfp.qc[0] = 1
21 
/*
 * Lane-access structures: a neon_<name> struct overlays a 32-bit packed
 * value with 1, 2 or 4 elements.  On big-endian hosts the members are
 * declared in reverse order so that v1 always names the element stored
 * in the least-significant bits of the uint32_t (see NEON_UNPACK/PACK,
 * which convert through a union).
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

/* Instantiate the lane structs for each supported element type/count. */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
68 
/* Copy from a uint32_t to a vector structure type (via a type-punning
   union, which is well-defined in C).  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* Apply the (separately #defined) per-element operation NEON_FN to
   each lane of vsrc1/vsrc2, writing vdest.  One macro per lane count. */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/* Shared body for a 32-bit packed binary helper: unpack both operands,
   apply NEON_FN lane-wise, repack the result. */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Define helper_neon_<name>(arg1, arg2) operating on n lanes of vtype. */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP, but the helper also takes the CPU state (e.g. so that
   NEON_FN can set the QC saturation flag). */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* Define a gvec-style helper applying NEON_FN element-wise over a whole
   vector register; the tail beyond the operation size is zeroed. */
#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    vtype *d = vd, *n = vn, *m = vm;                            \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], m[i]);                              \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

/* As NEON_GVEC_VOP2, with CPU state available to NEON_FN. */
#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, void *venv, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    vtype *d = vd, *n = vn, *m = vm;                            \
    CPUARMState *env = venv;                                    \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], m[i]);                              \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}
143 
/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
/* Each output lane combines two adjacent input lanes; the low half of
   the result comes from arg1's pairs, the high half from arg2's. */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

/* Define helper_neon_<name> as a pairwise op over n lanes of vtype. */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators.  */
/* Reuses NEON_DO##n; the NEON_FN supplied for unary ops ignores its
   second argument, so the nonexistent vsrc2 is never expanded. */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}
181 
/* Rounding halving add: (a + b + 1) >> 1.  The lane operands promote to
   int before the add, so the 8/16-bit intermediate cannot overflow. */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN
188 
189 int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
190 {
191     int32_t dest;
192 
193     dest = (src1 >> 1) + (src2 >> 1);
194     if ((src1 | src2) & 1)
195         dest++;
196     return dest;
197 }
198 
199 uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
200 {
201     uint32_t dest;
202 
203     dest = (src1 >> 1) + (src2 >> 1);
204     if ((src1 | src2) & 1)
205         dest++;
206     return dest;
207 }
208 
/* Pairwise minimum of adjacent lanes. */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

/* Pairwise maximum of adjacent lanes. */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN
222 
/*
 * Variable shifts.  The shift count is the signed low byte of the
 * second operand (negative means shift right).  do_{s,u}qrshl_* live
 * in vec_internal.h; the bool argument selects rounding, and a NULL
 * final argument means "no saturation flag to set".
 */

/* Plain (non-rounding, non-saturating) shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

/* Signed rounding shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

/* Unsigned rounding shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}
296 
/*
 * Saturating (non-rounding) shifts.  Passing env->vfp.qc makes the
 * do_*qrshl_* helpers set the cumulative QC flag on saturation.
 */

/* Unsigned saturating shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Signed saturating shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
360 
/* Signed-input, unsigned-result saturating shifts (SQSHLU). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Unsigned saturating rounding shifts (round and saturate, QC on sat). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

/* Signed saturating rounding shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
444 
445 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
446 {
447     uint32_t mask;
448     mask = (a ^ b) & 0x80808080u;
449     a &= ~0x80808080u;
450     b &= ~0x80808080u;
451     return (a + b) ^ mask;
452 }
453 
454 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
455 {
456     uint32_t mask;
457     mask = (a ^ b) & 0x80008000u;
458     a &= ~0x80008000u;
459     b &= ~0x80008000u;
460     return (a + b) ^ mask;
461 }
462 
/* Lane-wise subtract (modulo arithmetic, no saturation). */
#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise multiply, keeping the low half of each product. */
#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise test: all-ones if the lanes share any set bit, else zero. */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN
478 
479 /* Count Leading Sign/Zero Bits.  */
/* Count leading zero bits in an 8-bit value; returns 8 for x == 0. */
static inline int do_clz8(uint8_t x)
{
    int zeros = 8;

    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
487 
/* Count leading zero bits in a 16-bit value; returns 16 for x == 0. */
static inline int do_clz16(uint16_t x)
{
    int zeros = 16;

    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
495 
/* Per-lane count-leading-zeros. */
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* Per-lane count-leading-sign-bits: complement negatives so we count
   copies of the sign bit, minus one for the sign bit itself. */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN
511 
512 uint32_t HELPER(neon_cls_s32)(uint32_t x)
513 {
514     int count;
515     if ((int32_t)x < 0)
516         x = ~x;
517     for (count = 32; x; count--)
518         x = x >> 1;
519     return count - 1;
520 }
521 
522 /* Bit count.  */
523 uint32_t HELPER(neon_cnt_u8)(uint32_t x)
524 {
525     x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
526     x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
527     x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
528     return x;
529 }
530 
/* Reverse bits in each 8 bit word */
uint32_t HELPER(neon_rbit_u8)(uint32_t x)
{
    /* Swap the nibbles of each byte... */
    x =  ((x & 0xf0f0f0f0) >> 4)
       | ((x & 0x0f0f0f0f) << 4);
    /* ...then reverse the four bits within each nibble: bit 3 <-> 0,
       bit 2 <-> 1 (patterns 8/4/2/1 move by 3, 1, 1, 3 places). */
    x =  ((x & 0x88888888) >> 3)
       | ((x & 0x44444444) >> 1)
       | ((x & 0x22222222) << 1)
       | ((x & 0x11111111) << 3);
    return x;
}
542 
/*
 * Saturating doubling multiply returning high half, 16-bit lanes.
 * Doubling the full product overflows only when the top two bits of
 * the 32-bit product differ (i.e. INT16_MIN * INT16_MIN); in that case
 * saturate and set QC.  The optional rounding step adds 1 << 15 and
 * saturates again if that carries past INT32_MAX.
 */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16
568 
/*
 * Saturating doubling multiply returning high half, 32-bit lanes.
 * Same structure as NEON_QDMULH16 with 64-bit intermediates.
 */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32
594 
595 uint32_t HELPER(neon_narrow_u8)(uint64_t x)
596 {
597     return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
598            | ((x >> 24) & 0xff000000u);
599 }
600 
601 uint32_t HELPER(neon_narrow_u16)(uint64_t x)
602 {
603     return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
604 }
605 
/* Narrow by taking the high half of each 16-bit element. */
uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

/* Narrow by taking the high half of each 32-bit element. */
uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* Rounding narrow-high: add half an LSB of the result (0x80 per lane)
   before taking the high byte.  Masking to the top 9 bits first keeps
   the carry from one lane's rounding out of the next lane. */
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

/* As above, for 32-bit elements narrowing to 16 bits. */
uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}
631 
/* Signed-to-unsigned saturating narrow: each signed 16-bit element is
   clamped to [0, 0xff].  Negative lanes produce 0 (the lane's byte in
   res is simply never set) and set QC. */
uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else  { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Unsigned saturating narrow: clamp each 16-bit element to 0xff. */
uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Signed saturating narrow: clamp each 16-bit element to
   [-0x80, 0x7f]; (s >> 15) ^ 0x7f picks 0x7f or 0x80 by sign. */
uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}
704 
/* Signed-to-unsigned saturating narrow of two 32-bit elements:
   negative lanes clamp to 0, lanes above 0xffff clamp to 0xffff. */
uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

/* Unsigned saturating narrow of two 32-bit elements to 16 bits. */
uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

/* Signed saturating narrow of two 32-bit elements to [-0x8000, 0x7fff];
   (v >> 31) ^ 0x7fff selects 0x7fff or 0x8000 by sign. */
uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

/* Signed-to-unsigned saturating narrow of one 64-bit element. */
uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Unsigned saturating narrow of one 64-bit element. */
uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Signed saturating narrow of one 64-bit element. */
uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}
792 
793 uint64_t HELPER(neon_widen_u8)(uint32_t x)
794 {
795     uint64_t tmp;
796     uint64_t ret;
797     ret = (uint8_t)x;
798     tmp = (uint8_t)(x >> 8);
799     ret |= tmp << 16;
800     tmp = (uint8_t)(x >> 16);
801     ret |= tmp << 32;
802     tmp = (uint8_t)(x >> 24);
803     ret |= tmp << 48;
804     return ret;
805 }
806 
/* Widen: sign-extend each byte into a 16-bit result element.  The
   (uint16_t)(int8_t) double cast sign-extends to exactly 16 bits so
   the OR never smears into neighbouring lanes. */
uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

/* Widen: zero-extend each 16-bit element to 32 bits. */
uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

/* Widen: sign-extend each 16-bit element to 32 bits. */
uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}
832 
/* Lane-wise 16-bit add on a 64-bit vector (SWAR: add low 15 bits of
   each lane, restore the lane top bits via xor so carries cannot
   cross lane boundaries). */
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

/* Lane-wise 32-bit add on a 64-bit vector; same SWAR scheme. */
uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}

/* Pairwise add of 16-bit lanes into 32-bit results: the two low result
   elements come from a's lane pairs (computed in tmp at even
   positions), the two high ones from b's (tmp2 at odd positions). */
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return    ( tmp         & 0xffff)
            | ((tmp  >> 16) & 0xffff0000ull)
            | ((tmp2 << 16) & 0xffff00000000ull)
            | ( tmp2        & 0xffff000000000000ull);
}

/* Pairwise add of 32-bit lanes: low result from a, high from b. */
uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}

/* Lane-wise 16-bit subtract (SWAR: force a's lane top bits high so
   borrows cannot cross lanes, then fix the top bits up via mask). */
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

/* Lane-wise 32-bit subtract; same SWAR scheme. */
uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}

/* Per-lane saturating signed 32-bit add: overflow occurred when the
   operands had equal signs but the sum's sign differs; saturate to
   INT32_MAX/INT32_MIN by the operand's sign and set QC. */
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

/* Saturating signed 64-bit add; same overflow test as above. */
uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}
924 
/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
/* dest = |x - y| with x, y first truncated to intype and the
   difference computed in arithtype. */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

/* Widening absolute difference: 8-bit lanes to 16-bit results. */
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

/* Widening absolute difference: 16-bit lanes to 32-bit results. */
uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

/* Widening absolute difference: one 32-bit lane to a 64-bit result. */
uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD
995 
/* Widening multiply. Named type is the source type.  */
/* type1 truncates/sign-extends the operands; type2 (unsigned for the
   signed cases, to keep the multiply overflow-free) is the product
   width.  The result is then OR'ed into the 64-bit accumulator. */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

/* 8-bit lanes to 16-bit products. */
uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/* 16-bit lanes to 32-bit products. */
uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}
1052 
/* Negate each 16-bit element of a 64-bit vector (modulo 2^16). */
uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

/* Negate each 32-bit element of a 64-bit vector (modulo 2^32). */
uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}
1073 
/* Saturating sign manipulation.  */
/* ??? Make these use NEON_VOP1 */
/* Saturating absolute value: INT8_MIN has no positive counterpart, so
   it clamps to INT8_MAX and sets QC. */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

/* Saturating negation: -INT8_MIN clamps to INT8_MAX and sets QC. */
#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

/* 16-bit saturating absolute value; INT16_MIN clamps to INT16_MAX. */
#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

/* 16-bit saturating negation; -INT16_MIN clamps to INT16_MAX. */
#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16
1151 
1152 uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
1153 {
1154     if (x == SIGNBIT) {
1155         SET_QC();
1156         x = ~SIGNBIT;
1157     } else if ((int32_t)x < 0) {
1158         x = -x;
1159     }
1160     return x;
1161 }
1162 
1163 uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
1164 {
1165     if (x == SIGNBIT) {
1166         SET_QC();
1167         x = ~SIGNBIT;
1168     } else {
1169         x = -x;
1170     }
1171     return x;
1172 }
1173 
1174 uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
1175 {
1176     if (x == SIGNBIT64) {
1177         SET_QC();
1178         x = ~SIGNBIT64;
1179     } else if ((int64_t)x < 0) {
1180         x = -x;
1181     }
1182     return x;
1183 }
1184 
1185 uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
1186 {
1187     if (x == SIGNBIT64) {
1188         SET_QC();
1189         x = ~SIGNBIT64;
1190     } else {
1191         x = -x;
1192     }
1193     return x;
1194 }
1195 
1196 /* NEON Float helpers.  */
1197 
1198 /* Floating point comparisons produce an integer result.
1199  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1200  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1201  */
1202 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
1203 {
1204     float_status *fpst = fpstp;
1205     return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
1206 }
1207 
1208 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
1209 {
1210     float_status *fpst = fpstp;
1211     return -float32_le(make_float32(b), make_float32(a), fpst);
1212 }
1213 
1214 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1215 {
1216     float_status *fpst = fpstp;
1217     return -float32_lt(make_float32(b), make_float32(a), fpst);
1218 }
1219 
1220 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
1221 {
1222     float_status *fpst = fpstp;
1223     float32 f0 = float32_abs(make_float32(a));
1224     float32 f1 = float32_abs(make_float32(b));
1225     return -float32_le(f1, f0, fpst);
1226 }
1227 
1228 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1229 {
1230     float_status *fpst = fpstp;
1231     float32 f0 = float32_abs(make_float32(a));
1232     float32 f1 = float32_abs(make_float32(b));
1233     return -float32_lt(f1, f0, fpst);
1234 }
1235 
1236 uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
1237 {
1238     float_status *fpst = fpstp;
1239     float64 f0 = float64_abs(make_float64(a));
1240     float64 f1 = float64_abs(make_float64(b));
1241     return -float64_le(f1, f0, fpst);
1242 }
1243 
1244 uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
1245 {
1246     float_status *fpst = fpstp;
1247     float64 f0 = float64_abs(make_float64(a));
1248     float64 f1 = float64_abs(make_float64(b));
1249     return -float64_lt(f1, f0, fpst);
1250 }
1251 
1252 #define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
1253 
/*
 * Quad-register byte "unzip" (VUZP-style): de-interleave the two
 * 128-bit vectors held in *vd and *vm (each a pair of uint64_t).
 * Even-numbered byte elements of the concatenated input stream end
 * up in Vd, odd-numbered ones in Vm; both are updated in place.
 */
void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    /* Snapshot all inputs first: the outputs overwrite the same slots. */
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    /* d0/d1: even-numbered byte elements of the old Vd resp. Vm. */
    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    /* m0/m1: odd-numbered byte elements of the old Vd resp. Vm. */
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1282 
/*
 * Quad-register 16-bit "unzip" (VUZP-style): de-interleave the two
 * 128-bit vectors in *vd and *vm.  Even-numbered halfword elements
 * go to Vd, odd-numbered ones to Vm; both are updated in place.
 */
void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    /* Snapshot all inputs first: the outputs overwrite the same slots. */
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    /* d0/d1: even-numbered elements of the old Vd resp. Vm. */
    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    /* m0/m1: odd-numbered elements of the old Vd resp. Vm. */
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1303 
1304 void HELPER(neon_qunzip32)(void *vd, void *vm)
1305 {
1306     uint64_t *rd = vd, *rm = vm;
1307     uint64_t zd0 = rd[0], zd1 = rd[1];
1308     uint64_t zm0 = rm[0], zm1 = rm[1];
1309 
1310     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
1311     uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1312     uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
1313     uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1314 
1315     rm[0] = m0;
1316     rm[1] = m1;
1317     rd[0] = d0;
1318     rd[1] = d1;
1319 }
1320 
/*
 * Double-register byte "unzip": de-interleave the two 64-bit vectors
 * in *vd and *vm.  Even-numbered byte elements of the concatenated
 * input go to Vd, odd-numbered ones to Vm, both in place.
 */
void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    /* Snapshot both inputs before overwriting the outputs. */
    uint64_t zd = rd[0], zm = rm[0];

    /* d0: even-numbered byte elements of zd then zm. */
    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    /* m0: odd-numbered byte elements of zd then zm. */
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}
1338 
1339 void HELPER(neon_unzip16)(void *vd, void *vm)
1340 {
1341     uint64_t *rd = vd, *rm = vm;
1342     uint64_t zd = rd[0], zm = rm[0];
1343 
1344     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
1345         | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
1346     uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
1347         | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
1348 
1349     rm[0] = m0;
1350     rd[0] = d0;
1351 }
1352 
/*
 * Quad-register byte "zip" (VZIP-style): interleave the byte elements
 * of *vd and *vm.  The result Vd holds elements 0..7 of the old Vd
 * interleaved with elements 0..7 of the old Vm; the result Vm holds
 * elements 8..15 of each, similarly interleaved.  Updated in place.
 */
void HELPER(neon_qzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    /* Snapshot all inputs first: the outputs overwrite the same slots. */
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    /* d0/d1: low-half elements of zd0 interleaved with those of zm0. */
    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    /* m0/m1: high-half elements of zd1 interleaved with those of zm1. */
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1381 
/*
 * Quad-register 16-bit "zip" (VZIP-style): interleave the halfword
 * elements of *vd and *vm.  The result Vd interleaves the low halves
 * of the old Vd and Vm; the result Vm interleaves the high halves.
 * Updated in place.
 */
void HELPER(neon_qzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    /* Snapshot all inputs first: the outputs overwrite the same slots. */
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    /* d0/d1: elements of zd0 interleaved with elements of zm0. */
    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    /* m0/m1: elements of zd1 interleaved with elements of zm1. */
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1402 
1403 void HELPER(neon_qzip32)(void *vd, void *vm)
1404 {
1405     uint64_t *rd = vd, *rm = vm;
1406     uint64_t zd0 = rd[0], zd1 = rd[1];
1407     uint64_t zm0 = rm[0], zm1 = rm[1];
1408 
1409     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
1410     uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
1411     uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1412     uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1413 
1414     rm[0] = m0;
1415     rm[1] = m1;
1416     rd[0] = d0;
1417     rd[1] = d1;
1418 }
1419 
/*
 * Double-register byte "zip": interleave the byte elements of the
 * 64-bit vectors in *vd and *vm.  Result Vd interleaves elements
 * 0..3 of each input; result Vm interleaves elements 4..7.
 * Updated in place.
 */
void HELPER(neon_zip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    /* Snapshot both inputs before overwriting the outputs. */
    uint64_t zd = rd[0], zm = rm[0];

    /* d0: low-half elements of zd interleaved with those of zm. */
    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
    /* m0: high-half elements of zd interleaved with those of zm. */
    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}
1437 
1438 void HELPER(neon_zip16)(void *vd, void *vm)
1439 {
1440     uint64_t *rd = vd, *rm = vm;
1441     uint64_t zd = rd[0], zm = rm[0];
1442 
1443     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
1444         | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
1445     uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
1446         | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
1447 
1448     rm[0] = m0;
1449     rd[0] = d0;
1450 }
1451