xref: /openbmc/qemu/target/arm/tcg/neon_helper.c (revision 50a92d9b)
1 /*
2  * ARM NEON vector operations.
3  *
4  * Copyright (c) 2007, 2008 CodeSourcery.
5  * Written by Paul Brook
6  *
7  * This code is licensed under the GNU GPL v2.
8  */
9 
10 #include "qemu/osdep.h"
11 #include "cpu.h"
12 #include "exec/helper-proto.h"
13 #include "tcg/tcg-gvec-desc.h"
14 #include "fpu/softfloat.h"
15 #include "vec_internal.h"
16 
/*
 * Masks selecting the most significant bit of a 32-bit / 64-bit value.
 * Both replacement lists are fully parenthesized so they behave as a
 * single operand next to any surrounding operator.
 */
#define SIGNBIT ((uint32_t)0x80000000)
#define SIGNBIT64 ((uint64_t)1 << 63)

/*
 * Store 1 into element 0 of the qc array in the CPU state.  Requires a
 * pointer named 'env' to be in scope at the point of use.
 */
#define SET_QC() (env->vfp.qc[0] = 1)
21 
/*
 * Struct views of one 32-bit word: neon_{s,u}8 has four members,
 * neon_{s,u}16 two and neon_{s,u}32 one.  When HOST_BIG_ENDIAN is set the
 * members are declared in reverse order (v4 ... v1) instead of v1 ... v4.
 * The generator macros are only needed for the six instantiations below
 * and are removed again afterwards.
 */
#define NEON_TYPE1(name, type) \
typedef struct { type v1; } neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct { type v2, v1; } neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct { type v4, v3, v2, v1; } neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct { type v1, v2; } neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct { type v1, v2, v3, v4; } neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
68 
/*
 * Copy from a uint32_t to a vector structure type: 'val' is written to the
 * integer member of a temporary union and 'dest' receives the 'vtype'
 * member that overlays it.
 */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        uint32_t word; \
        vtype vec; \
    } pun; \
    pun.word = (val); \
    dest = pun.vec; \
    } while (0)
78 
/*
 * Copy from a vector structure type to a uint32_t: 'val' is written to the
 * 'vtype' member of a temporary union and 'dest' receives the integer
 * member that overlays it.
 */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        uint32_t word; \
        vtype vec; \
    } pun; \
    pun.vec = (val); \
    dest = pun.word; \
    } while (0)
88 
/*
 * Apply NEON_FN lane by lane to the locals vdest, vsrc1 and vsrc2 for a
 * struct with 1, 2 or 4 members.  Each wider variant is the narrower one
 * followed by the additional lanes.
 */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_DO1 \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_DO2 \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
99 
/*
 * Function body shared by NEON_VOP and NEON_VOP_ENV: unpack arg1/arg2 into
 * 'vtype' structs, run NEON_DO<n> (which expands the current NEON_FN once
 * per lane) and return the packed result.
 */
#define NEON_VOP_BODY(vtype, n) \
{ \
    vtype vsrc1, vsrc2, vdest; \
    uint32_t packed; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, packed, vdest); \
    return packed; \
}

/* Define helper neon_<name> taking two packed 32-bit operands. */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP, with an additional leading 'env' parameter for NEON_FN. */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
120 
/*
 * Define helper <name>(vd, vn, vm, desc): view the three buffers as arrays
 * of 'vtype', apply NEON_FN to each of the simd_oprsz(desc)/sizeof(vtype)
 * elements, then call clear_tail() with the operation size and
 * simd_maxsz(desc).
 */
#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                               \
    const intptr_t opr_sz = simd_oprsz(desc);                   \
    vtype *d = vd;                                              \
    vtype *n = vn;                                              \
    vtype *m = vm;                                              \
    intptr_t i;                                                 \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], m[i]);                              \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}
131 
/*
 * As NEON_GVEC_VOP2, but the helper takes an extra 'venv' pointer that is
 * exposed to NEON_FN as 'CPUARMState *env'.
 */
#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, void *venv, uint32_t desc) \
{                                                               \
    const intptr_t opr_sz = simd_oprsz(desc);                   \
    CPUARMState *env = venv;                                    \
    vtype *d = vd;                                              \
    vtype *n = vn;                                              \
    vtype *m = vm;                                              \
    intptr_t i;                                                 \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], m[i]);                              \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}
143 
/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
/*
 * NEON_PDO2/NEON_PDO4 apply NEON_FN to adjacent member pairs: the first
 * half of vdest is filled from pairs of vsrc1, the second half from pairs
 * of vsrc2.  The last line of NEON_PDO4 deliberately has no continuation
 * backslash, so the macro ends there regardless of what follows it.
 */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4);
155 
/*
 * Define pairwise helper neon_<name>: unpack both 32-bit operands, run
 * NEON_PDO<n> with the current NEON_FN and return the packed result.
 */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    vtype vsrc1, vsrc2, vdest; \
    uint32_t packed; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, packed, vdest); \
    return packed; \
}
169 
/* Unary operators.  */
/*
 * Define helper neon_<name> with a single packed operand.  NEON_DO<n> still
 * passes a third argument naming vsrc2, which is not declared here, so the
 * NEON_FN in effect must not use its third parameter.
 */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1, vdest; \
    uint32_t packed; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, packed, vdest); \
    return packed; \
}
181 
182 #define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
183 NEON_POP(pmin_s8, neon_s8, 4)
184 NEON_POP(pmin_u8, neon_u8, 4)
185 NEON_POP(pmin_s16, neon_s16, 2)
186 NEON_POP(pmin_u16, neon_u16, 2)
187 #undef NEON_FN
188 
189 #define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
190 NEON_POP(pmax_s8, neon_s8, 4)
191 NEON_POP(pmax_u8, neon_u8, 4)
192 NEON_POP(pmax_s16, neon_s16, 2)
193 NEON_POP(pmax_u16, neon_u16, 2)
194 #undef NEON_FN
195 
196 #define NEON_FN(dest, src1, src2) \
197     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
198 NEON_VOP(shl_u16, neon_u16, 2)
199 #undef NEON_FN
200 
201 #define NEON_FN(dest, src1, src2) \
202     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
203 NEON_VOP(shl_s16, neon_s16, 2)
204 #undef NEON_FN
205 
206 #define NEON_FN(dest, src1, src2) \
207     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
208 NEON_VOP(rshl_s8, neon_s8, 4)
209 NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
210 #undef NEON_FN
211 
212 #define NEON_FN(dest, src1, src2) \
213     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
214 NEON_VOP(rshl_s16, neon_s16, 2)
215 NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
216 #undef NEON_FN
217 
218 #define NEON_FN(dest, src1, src2) \
219     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
220 NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
221 #undef NEON_FN
222 
223 #define NEON_FN(dest, src1, src2) \
224     (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
225 NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
226 #undef NEON_FN
227 
228 uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
229 {
230     return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
231 }
232 
233 uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
234 {
235     return do_sqrshl_d(val, (int8_t)shift, true, NULL);
236 }
237 
238 #define NEON_FN(dest, src1, src2) \
239     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
240 NEON_VOP(rshl_u8, neon_u8, 4)
241 NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
242 #undef NEON_FN
243 
244 #define NEON_FN(dest, src1, src2) \
245     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
246 NEON_VOP(rshl_u16, neon_u16, 2)
247 NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
248 #undef NEON_FN
249 
250 #define NEON_FN(dest, src1, src2) \
251     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
252 NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
253 #undef NEON_FN
254 
255 #define NEON_FN(dest, src1, src2) \
256     (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
257 NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
258 #undef NEON_FN
259 
260 uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
261 {
262     return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
263 }
264 
265 uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
266 {
267     return do_uqrshl_d(val, (int8_t)shift, true, NULL);
268 }
269 
270 #define NEON_FN(dest, src1, src2) \
271     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
272 NEON_VOP_ENV(qshl_u8, neon_u8, 4)
273 NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
274 #undef NEON_FN
275 
276 #define NEON_FN(dest, src1, src2) \
277     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
278 NEON_VOP_ENV(qshl_u16, neon_u16, 2)
279 NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
280 #undef NEON_FN
281 
282 #define NEON_FN(dest, src1, src2) \
283     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
284 NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
285 #undef NEON_FN
286 
287 #define NEON_FN(dest, src1, src2) \
288     (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
289 NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
290 #undef NEON_FN
291 
292 uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
293 {
294     return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
295 }
296 
297 uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
298 {
299     return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
300 }
301 
302 #define NEON_FN(dest, src1, src2) \
303     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
304 NEON_VOP_ENV(qshl_s8, neon_s8, 4)
305 NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
306 #undef NEON_FN
307 
308 #define NEON_FN(dest, src1, src2) \
309     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
310 NEON_VOP_ENV(qshl_s16, neon_s16, 2)
311 NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
312 #undef NEON_FN
313 
314 #define NEON_FN(dest, src1, src2) \
315     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
316 NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
317 #undef NEON_FN
318 
319 #define NEON_FN(dest, src1, src2) \
320     (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
321 NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
322 #undef NEON_FN
323 
324 uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
325 {
326     return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
327 }
328 
329 uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
330 {
331     return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
332 }
333 
334 #define NEON_FN(dest, src1, src2) \
335     (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
336 NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
337 #undef NEON_FN
338 
339 #define NEON_FN(dest, src1, src2) \
340     (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
341 NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
342 #undef NEON_FN
343 
344 uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
345 {
346     return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
347 }
348 
349 uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
350 {
351     return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
352 }
353 
354 #define NEON_FN(dest, src1, src2) \
355     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
356 NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
357 NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t)
358 #undef NEON_FN
359 
360 #define NEON_FN(dest, src1, src2) \
361     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
362 NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
363 NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t)
364 #undef NEON_FN
365 
366 #define NEON_FN(dest, src1, src2) \
367     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
368 NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t)
369 #undef NEON_FN
370 
371 #define NEON_FN(dest, src1, src2) \
372     (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
373 NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t)
374 #undef NEON_FN
375 
376 uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
377 {
378     return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
379 }
380 
381 uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
382 {
383     return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
384 }
385 
386 #define NEON_FN(dest, src1, src2) \
387     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
388 NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
389 NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t)
390 #undef NEON_FN
391 
392 #define NEON_FN(dest, src1, src2) \
393     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
394 NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
395 NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t)
396 #undef NEON_FN
397 
398 #define NEON_FN(dest, src1, src2) \
399     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
400 NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t)
401 #undef NEON_FN
402 
403 #define NEON_FN(dest, src1, src2) \
404     (dest = do_sqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
405 NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t)
406 #undef NEON_FN
407 
408 uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
409 {
410     return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
411 }
412 
413 uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
414 {
415     return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
416 }
417 
418 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
419 {
420     uint32_t mask;
421     mask = (a ^ b) & 0x80808080u;
422     a &= ~0x80808080u;
423     b &= ~0x80808080u;
424     return (a + b) ^ mask;
425 }
426 
427 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
428 {
429     uint32_t mask;
430     mask = (a ^ b) & 0x80008000u;
431     a &= ~0x80008000u;
432     b &= ~0x80008000u;
433     return (a + b) ^ mask;
434 }
435 
436 #define NEON_FN(dest, src1, src2) dest = src1 - src2
437 NEON_VOP(sub_u8, neon_u8, 4)
438 NEON_VOP(sub_u16, neon_u16, 2)
439 #undef NEON_FN
440 
441 #define NEON_FN(dest, src1, src2) dest = src1 * src2
442 NEON_VOP(mul_u8, neon_u8, 4)
443 NEON_VOP(mul_u16, neon_u16, 2)
444 #undef NEON_FN
445 
446 #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
447 NEON_VOP(tst_u8, neon_u8, 4)
448 NEON_VOP(tst_u16, neon_u16, 2)
449 NEON_VOP(tst_u32, neon_u32, 1)
450 #undef NEON_FN
451 
/* Count Leading Sign/Zero Bits.  */
/* Return the number of leading zero bits in the 8-bit value x (8 for 0). */
static inline int do_clz8(uint8_t x)
{
    int zeros = 8;

    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
460 
/* Return the number of leading zero bits in the 16-bit value x (16 for 0). */
static inline int do_clz16(uint16_t x)
{
    int zeros = 16;

    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
468 
469 #define NEON_FN(dest, src, dummy) dest = do_clz8(src)
470 NEON_VOP1(clz_u8, neon_u8, 4)
471 #undef NEON_FN
472 
473 #define NEON_FN(dest, src, dummy) dest = do_clz16(src)
474 NEON_VOP1(clz_u16, neon_u16, 2)
475 #undef NEON_FN
476 
477 #define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
478 NEON_VOP1(cls_s8, neon_s8, 4)
479 #undef NEON_FN
480 
481 #define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
482 NEON_VOP1(cls_s16, neon_s16, 2)
483 #undef NEON_FN
484 
485 uint32_t HELPER(neon_cls_s32)(uint32_t x)
486 {
487     int count;
488     if ((int32_t)x < 0)
489         x = ~x;
490     for (count = 32; x; count--)
491         x = x >> 1;
492     return count - 1;
493 }
494 
495 /* Bit count.  */
496 uint32_t HELPER(neon_cnt_u8)(uint32_t x)
497 {
498     x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
499     x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
500     x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
501     return x;
502 }
503 
504 /* Reverse bits in each 8 bit word */
505 uint32_t HELPER(neon_rbit_u8)(uint32_t x)
506 {
507     x =  ((x & 0xf0f0f0f0) >> 4)
508        | ((x & 0x0f0f0f0f) << 4);
509     x =  ((x & 0x88888888) >> 3)
510        | ((x & 0x44444444) >> 1)
511        | ((x & 0x22222222) << 1)
512        | ((x & 0x11111111) << 3);
513     return x;
514 }
515 
516 #define NEON_QDMULH16(dest, src1, src2, round) do { \
517     uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
518     if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
519         SET_QC(); \
520         tmp = (tmp >> 31) ^ ~SIGNBIT; \
521     } else { \
522         tmp <<= 1; \
523     } \
524     if (round) { \
525         int32_t old = tmp; \
526         tmp += 1 << 15; \
527         if ((int32_t)tmp < old) { \
528             SET_QC(); \
529             tmp = SIGNBIT - 1; \
530         } \
531     } \
532     dest = tmp >> 16; \
533     } while(0)
534 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
535 NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
536 #undef NEON_FN
537 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
538 NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
539 #undef NEON_FN
540 #undef NEON_QDMULH16
541 
542 #define NEON_QDMULH32(dest, src1, src2, round) do { \
543     uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
544     if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
545         SET_QC(); \
546         tmp = (tmp >> 63) ^ ~SIGNBIT64; \
547     } else { \
548         tmp <<= 1; \
549     } \
550     if (round) { \
551         int64_t old = tmp; \
552         tmp += (int64_t)1 << 31; \
553         if ((int64_t)tmp < old) { \
554             SET_QC(); \
555             tmp = SIGNBIT64 - 1; \
556         } \
557     } \
558     dest = tmp >> 32; \
559     } while(0)
560 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
561 NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
562 #undef NEON_FN
563 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
564 NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
565 #undef NEON_FN
566 #undef NEON_QDMULH32
567 
568 uint32_t HELPER(neon_narrow_u8)(uint64_t x)
569 {
570     return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
571            | ((x >> 24) & 0xff000000u);
572 }
573 
574 uint32_t HELPER(neon_narrow_u16)(uint64_t x)
575 {
576     return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
577 }
578 
579 uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
580 {
581     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
582             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
583 }
584 
585 uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
586 {
587     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
588 }
589 
590 uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
591 {
592     x &= 0xff80ff80ff80ff80ull;
593     x += 0x0080008000800080ull;
594     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
595             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
596 }
597 
598 uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
599 {
600     x &= 0xffff8000ffff8000ull;
601     x += 0x0000800000008000ull;
602     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
603 }
604 
605 uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
606 {
607     uint16_t s;
608     uint8_t d;
609     uint32_t res = 0;
610 #define SAT8(n) \
611     s = x >> n; \
612     if (s & 0x8000) { \
613         SET_QC(); \
614     } else { \
615         if (s > 0xff) { \
616             d = 0xff; \
617             SET_QC(); \
618         } else  { \
619             d = s; \
620         } \
621         res |= (uint32_t)d << (n / 2); \
622     }
623 
624     SAT8(0);
625     SAT8(16);
626     SAT8(32);
627     SAT8(48);
628 #undef SAT8
629     return res;
630 }
631 
632 uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
633 {
634     uint16_t s;
635     uint8_t d;
636     uint32_t res = 0;
637 #define SAT8(n) \
638     s = x >> n; \
639     if (s > 0xff) { \
640         d = 0xff; \
641         SET_QC(); \
642     } else  { \
643         d = s; \
644     } \
645     res |= (uint32_t)d << (n / 2);
646 
647     SAT8(0);
648     SAT8(16);
649     SAT8(32);
650     SAT8(48);
651 #undef SAT8
652     return res;
653 }
654 
655 uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
656 {
657     int16_t s;
658     uint8_t d;
659     uint32_t res = 0;
660 #define SAT8(n) \
661     s = x >> n; \
662     if (s != (int8_t)s) { \
663         d = (s >> 15) ^ 0x7f; \
664         SET_QC(); \
665     } else  { \
666         d = s; \
667     } \
668     res |= (uint32_t)d << (n / 2);
669 
670     SAT8(0);
671     SAT8(16);
672     SAT8(32);
673     SAT8(48);
674 #undef SAT8
675     return res;
676 }
677 
678 uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
679 {
680     uint32_t high;
681     uint32_t low;
682     low = x;
683     if (low & 0x80000000) {
684         low = 0;
685         SET_QC();
686     } else if (low > 0xffff) {
687         low = 0xffff;
688         SET_QC();
689     }
690     high = x >> 32;
691     if (high & 0x80000000) {
692         high = 0;
693         SET_QC();
694     } else if (high > 0xffff) {
695         high = 0xffff;
696         SET_QC();
697     }
698     return low | (high << 16);
699 }
700 
701 uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
702 {
703     uint32_t high;
704     uint32_t low;
705     low = x;
706     if (low > 0xffff) {
707         low = 0xffff;
708         SET_QC();
709     }
710     high = x >> 32;
711     if (high > 0xffff) {
712         high = 0xffff;
713         SET_QC();
714     }
715     return low | (high << 16);
716 }
717 
718 uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
719 {
720     int32_t low;
721     int32_t high;
722     low = x;
723     if (low != (int16_t)low) {
724         low = (low >> 31) ^ 0x7fff;
725         SET_QC();
726     }
727     high = x >> 32;
728     if (high != (int16_t)high) {
729         high = (high >> 31) ^ 0x7fff;
730         SET_QC();
731     }
732     return (uint16_t)low | (high << 16);
733 }
734 
735 uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
736 {
737     if (x & 0x8000000000000000ull) {
738         SET_QC();
739         return 0;
740     }
741     if (x > 0xffffffffu) {
742         SET_QC();
743         return 0xffffffffu;
744     }
745     return x;
746 }
747 
748 uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
749 {
750     if (x > 0xffffffffu) {
751         SET_QC();
752         return 0xffffffffu;
753     }
754     return x;
755 }
756 
757 uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
758 {
759     if ((int64_t)x != (int32_t)x) {
760         SET_QC();
761         return ((int64_t)x >> 63) ^ 0x7fffffff;
762     }
763     return x;
764 }
765 
766 uint64_t HELPER(neon_widen_u8)(uint32_t x)
767 {
768     uint64_t tmp;
769     uint64_t ret;
770     ret = (uint8_t)x;
771     tmp = (uint8_t)(x >> 8);
772     ret |= tmp << 16;
773     tmp = (uint8_t)(x >> 16);
774     ret |= tmp << 32;
775     tmp = (uint8_t)(x >> 24);
776     ret |= tmp << 48;
777     return ret;
778 }
779 
780 uint64_t HELPER(neon_widen_s8)(uint32_t x)
781 {
782     uint64_t tmp;
783     uint64_t ret;
784     ret = (uint16_t)(int8_t)x;
785     tmp = (uint16_t)(int8_t)(x >> 8);
786     ret |= tmp << 16;
787     tmp = (uint16_t)(int8_t)(x >> 16);
788     ret |= tmp << 32;
789     tmp = (uint16_t)(int8_t)(x >> 24);
790     ret |= tmp << 48;
791     return ret;
792 }
793 
794 uint64_t HELPER(neon_widen_u16)(uint32_t x)
795 {
796     uint64_t high = (uint16_t)(x >> 16);
797     return ((uint16_t)x) | (high << 32);
798 }
799 
800 uint64_t HELPER(neon_widen_s16)(uint32_t x)
801 {
802     uint64_t high = (int16_t)(x >> 16);
803     return ((uint32_t)(int16_t)x) | (high << 32);
804 }
805 
806 uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
807 {
808     uint64_t mask;
809     mask = (a ^ b) & 0x8000800080008000ull;
810     a &= ~0x8000800080008000ull;
811     b &= ~0x8000800080008000ull;
812     return (a + b) ^ mask;
813 }
814 
815 uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
816 {
817     uint64_t mask;
818     mask = (a ^ b) & 0x8000000080000000ull;
819     a &= ~0x8000000080000000ull;
820     b &= ~0x8000000080000000ull;
821     return (a + b) ^ mask;
822 }
823 
824 uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
825 {
826     uint64_t tmp;
827     uint64_t tmp2;
828 
829     tmp = a & 0x0000ffff0000ffffull;
830     tmp += (a >> 16) & 0x0000ffff0000ffffull;
831     tmp2 = b & 0xffff0000ffff0000ull;
832     tmp2 += (b << 16) & 0xffff0000ffff0000ull;
833     return    ( tmp         & 0xffff)
834             | ((tmp  >> 16) & 0xffff0000ull)
835             | ((tmp2 << 16) & 0xffff00000000ull)
836             | ( tmp2        & 0xffff000000000000ull);
837 }
838 
839 uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
840 {
841     uint32_t low = a + (a >> 32);
842     uint32_t high = b + (b >> 32);
843     return low + ((uint64_t)high << 32);
844 }
845 
846 uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
847 {
848     uint64_t mask;
849     mask = (a ^ ~b) & 0x8000800080008000ull;
850     a |= 0x8000800080008000ull;
851     b &= ~0x8000800080008000ull;
852     return (a - b) ^ mask;
853 }
854 
855 uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
856 {
857     uint64_t mask;
858     mask = (a ^ ~b) & 0x8000000080000000ull;
859     a |= 0x8000000080000000ull;
860     b &= ~0x8000000080000000ull;
861     return (a - b) ^ mask;
862 }
863 
864 uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
865 {
866     uint32_t x, y;
867     uint32_t low, high;
868 
869     x = a;
870     y = b;
871     low = x + y;
872     if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
873         SET_QC();
874         low = ((int32_t)x >> 31) ^ ~SIGNBIT;
875     }
876     x = a >> 32;
877     y = b >> 32;
878     high = x + y;
879     if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
880         SET_QC();
881         high = ((int32_t)x >> 31) ^ ~SIGNBIT;
882     }
883     return low | ((uint64_t)high << 32);
884 }
885 
886 uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
887 {
888     uint64_t result;
889 
890     result = a + b;
891     if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
892         SET_QC();
893         result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
894     }
895     return result;
896 }
897 
/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 *
 * DO_ABD converts x and y to 'intype', widens them to 'arithtype' and
 * stores the absolute difference in 'dest'.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    const arithtype abd_lhs = (intype)(x);                    \
    const arithtype abd_rhs = (intype)(y);                    \
    if (abd_lhs > abd_rhs) {                                  \
        dest = abd_lhs - abd_rhs;                             \
    } else {                                                  \
        dest = abd_rhs - abd_lhs;                             \
    }                                                         \
    } while (0)
907 
908 uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
909 {
910     uint64_t tmp;
911     uint64_t result;
912     DO_ABD(result, a, b, uint8_t, uint32_t);
913     DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
914     result |= tmp << 16;
915     DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
916     result |= tmp << 32;
917     DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
918     result |= tmp << 48;
919     return result;
920 }
921 
922 uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
923 {
924     uint64_t tmp;
925     uint64_t result;
926     DO_ABD(result, a, b, int8_t, int32_t);
927     DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
928     result |= tmp << 16;
929     DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
930     result |= tmp << 32;
931     DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
932     result |= tmp << 48;
933     return result;
934 }
935 
936 uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
937 {
938     uint64_t tmp;
939     uint64_t result;
940     DO_ABD(result, a, b, uint16_t, uint32_t);
941     DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
942     return result | (tmp << 32);
943 }
944 
945 uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
946 {
947     uint64_t tmp;
948     uint64_t result;
949     DO_ABD(result, a, b, int16_t, int32_t);
950     DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
951     return result | (tmp << 32);
952 }
953 
954 uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
955 {
956     uint64_t result;
957     DO_ABD(result, a, b, uint32_t, uint64_t);
958     return result;
959 }
960 
961 uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
962 {
963     uint64_t result;
964     DO_ABD(result, a, b, int32_t, int64_t);
965     return result;
966 }
967 #undef DO_ABD
968 
/* Widening multiply. Named type is the source type.  */
/*
 * x and y are converted to type1, then to type2, multiplied, and the
 * product truncated to type2 is stored in 'dest'.
 *
 * The leading "1u *" forces the multiplication into an unsigned type.
 * Without it, a uint16_t type2 promotes both operands to int, and
 * products such as 0xff80 * 0xff80 (neon_mull_s8 with -128 * -128)
 * overflow signed int, which is undefined behaviour.  Unsigned arithmetic
 * wraps and yields the same low-order bits.
 */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)(1u * (type2)tmp_x * (type2)tmp_y); \
    } while(0)
975 
976 uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
977 {
978     uint64_t tmp;
979     uint64_t result;
980 
981     DO_MULL(result, a, b, uint8_t, uint16_t);
982     DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
983     result |= tmp << 16;
984     DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
985     result |= tmp << 32;
986     DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
987     result |= tmp << 48;
988     return result;
989 }
990 
991 uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
992 {
993     uint64_t tmp;
994     uint64_t result;
995 
996     DO_MULL(result, a, b, int8_t, uint16_t);
997     DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
998     result |= tmp << 16;
999     DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
1000     result |= tmp << 32;
1001     DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
1002     result |= tmp << 48;
1003     return result;
1004 }
1005 
1006 uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
1007 {
1008     uint64_t tmp;
1009     uint64_t result;
1010 
1011     DO_MULL(result, a, b, uint16_t, uint32_t);
1012     DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
1013     return result | (tmp << 32);
1014 }
1015 
1016 uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
1017 {
1018     uint64_t tmp;
1019     uint64_t result;
1020 
1021     DO_MULL(result, a, b, int16_t, uint32_t);
1022     DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
1023     return result | (tmp << 32);
1024 }
1025 
1026 uint64_t HELPER(neon_negl_u16)(uint64_t x)
1027 {
1028     uint16_t tmp;
1029     uint64_t result;
1030     result = (uint16_t)-x;
1031     tmp = -(x >> 16);
1032     result |= (uint64_t)tmp << 16;
1033     tmp = -(x >> 32);
1034     result |= (uint64_t)tmp << 32;
1035     tmp = -(x >> 48);
1036     result |= (uint64_t)tmp << 48;
1037     return result;
1038 }
1039 
1040 uint64_t HELPER(neon_negl_u32)(uint64_t x)
1041 {
1042     uint32_t low = -x;
1043     uint32_t high = -(x >> 32);
1044     return low | ((uint64_t)high << 32);
1045 }
1046 
1047 /* Saturating sign manipulation.  */
1048 /* ??? Make these use NEON_VOP1 */
1049 #define DO_QABS8(x) do { \
1050     if (x == (int8_t)0x80) { \
1051         x = 0x7f; \
1052         SET_QC(); \
1053     } else if (x < 0) { \
1054         x = -x; \
1055     }} while (0)
1056 uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
1057 {
1058     neon_s8 vec;
1059     NEON_UNPACK(neon_s8, vec, x);
1060     DO_QABS8(vec.v1);
1061     DO_QABS8(vec.v2);
1062     DO_QABS8(vec.v3);
1063     DO_QABS8(vec.v4);
1064     NEON_PACK(neon_s8, x, vec);
1065     return x;
1066 }
1067 #undef DO_QABS8
1068 
1069 #define DO_QNEG8(x) do { \
1070     if (x == (int8_t)0x80) { \
1071         x = 0x7f; \
1072         SET_QC(); \
1073     } else { \
1074         x = -x; \
1075     }} while (0)
1076 uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
1077 {
1078     neon_s8 vec;
1079     NEON_UNPACK(neon_s8, vec, x);
1080     DO_QNEG8(vec.v1);
1081     DO_QNEG8(vec.v2);
1082     DO_QNEG8(vec.v3);
1083     DO_QNEG8(vec.v4);
1084     NEON_PACK(neon_s8, x, vec);
1085     return x;
1086 }
1087 #undef DO_QNEG8
1088 
1089 #define DO_QABS16(x) do { \
1090     if (x == (int16_t)0x8000) { \
1091         x = 0x7fff; \
1092         SET_QC(); \
1093     } else if (x < 0) { \
1094         x = -x; \
1095     }} while (0)
1096 uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
1097 {
1098     neon_s16 vec;
1099     NEON_UNPACK(neon_s16, vec, x);
1100     DO_QABS16(vec.v1);
1101     DO_QABS16(vec.v2);
1102     NEON_PACK(neon_s16, x, vec);
1103     return x;
1104 }
1105 #undef DO_QABS16
1106 
1107 #define DO_QNEG16(x) do { \
1108     if (x == (int16_t)0x8000) { \
1109         x = 0x7fff; \
1110         SET_QC(); \
1111     } else { \
1112         x = -x; \
1113     }} while (0)
1114 uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
1115 {
1116     neon_s16 vec;
1117     NEON_UNPACK(neon_s16, vec, x);
1118     DO_QNEG16(vec.v1);
1119     DO_QNEG16(vec.v2);
1120     NEON_PACK(neon_s16, x, vec);
1121     return x;
1122 }
1123 #undef DO_QNEG16
1124 
1125 uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
1126 {
1127     if (x == SIGNBIT) {
1128         SET_QC();
1129         x = ~SIGNBIT;
1130     } else if ((int32_t)x < 0) {
1131         x = -x;
1132     }
1133     return x;
1134 }
1135 
1136 uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
1137 {
1138     if (x == SIGNBIT) {
1139         SET_QC();
1140         x = ~SIGNBIT;
1141     } else {
1142         x = -x;
1143     }
1144     return x;
1145 }
1146 
1147 uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
1148 {
1149     if (x == SIGNBIT64) {
1150         SET_QC();
1151         x = ~SIGNBIT64;
1152     } else if ((int64_t)x < 0) {
1153         x = -x;
1154     }
1155     return x;
1156 }
1157 
1158 uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
1159 {
1160     if (x == SIGNBIT64) {
1161         SET_QC();
1162         x = ~SIGNBIT64;
1163     } else {
1164         x = -x;
1165     }
1166     return x;
1167 }
1168 
1169 /* NEON Float helpers.  */
1170 
1171 /* Floating point comparisons produce an integer result.
1172  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1173  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1174  */
1175 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
1176 {
1177     float_status *fpst = fpstp;
1178     return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
1179 }
1180 
1181 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
1182 {
1183     float_status *fpst = fpstp;
1184     return -float32_le(make_float32(b), make_float32(a), fpst);
1185 }
1186 
1187 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1188 {
1189     float_status *fpst = fpstp;
1190     return -float32_lt(make_float32(b), make_float32(a), fpst);
1191 }
1192 
1193 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
1194 {
1195     float_status *fpst = fpstp;
1196     float32 f0 = float32_abs(make_float32(a));
1197     float32 f1 = float32_abs(make_float32(b));
1198     return -float32_le(f1, f0, fpst);
1199 }
1200 
1201 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1202 {
1203     float_status *fpst = fpstp;
1204     float32 f0 = float32_abs(make_float32(a));
1205     float32 f1 = float32_abs(make_float32(b));
1206     return -float32_lt(f1, f0, fpst);
1207 }
1208 
1209 uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
1210 {
1211     float_status *fpst = fpstp;
1212     float64 f0 = float64_abs(make_float64(a));
1213     float64 f1 = float64_abs(make_float64(b));
1214     return -float64_le(f1, f0, fpst);
1215 }
1216 
1217 uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
1218 {
1219     float_status *fpst = fpstp;
1220     float64 f0 = float64_abs(make_float64(a));
1221     float64 f1 = float64_abs(make_float64(b));
1222     return -float64_lt(f1, f0, fpst);
1223 }
1224 
/*
 * Extract element N of the integer V, where elements are SIZE bits wide
 * and element 0 occupies the least significant bits.  SIZE must be less
 * than 64 so that the mask shift stays in range.
 */
#define ELEM(V, N, SIZE) \
    (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
1226 
1227 void HELPER(neon_qunzip8)(void *vd, void *vm)
1228 {
1229     uint64_t *rd = vd, *rm = vm;
1230     uint64_t zd0 = rd[0], zd1 = rd[1];
1231     uint64_t zm0 = rm[0], zm1 = rm[1];
1232 
1233     uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
1234         | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
1235         | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
1236         | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
1237     uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
1238         | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
1239         | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
1240         | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
1241     uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
1242         | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
1243         | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
1244         | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
1245     uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
1246         | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
1247         | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
1248         | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
1249 
1250     rm[0] = m0;
1251     rm[1] = m1;
1252     rd[0] = d0;
1253     rd[1] = d1;
1254 }
1255 
1256 void HELPER(neon_qunzip16)(void *vd, void *vm)
1257 {
1258     uint64_t *rd = vd, *rm = vm;
1259     uint64_t zd0 = rd[0], zd1 = rd[1];
1260     uint64_t zm0 = rm[0], zm1 = rm[1];
1261 
1262     uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
1263         | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
1264     uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
1265         | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
1266     uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
1267         | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
1268     uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
1269         | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
1270 
1271     rm[0] = m0;
1272     rm[1] = m1;
1273     rd[0] = d0;
1274     rd[1] = d1;
1275 }
1276 
1277 void HELPER(neon_qunzip32)(void *vd, void *vm)
1278 {
1279     uint64_t *rd = vd, *rm = vm;
1280     uint64_t zd0 = rd[0], zd1 = rd[1];
1281     uint64_t zm0 = rm[0], zm1 = rm[1];
1282 
1283     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
1284     uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1285     uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
1286     uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1287 
1288     rm[0] = m0;
1289     rm[1] = m1;
1290     rd[0] = d0;
1291     rd[1] = d1;
1292 }
1293 
1294 void HELPER(neon_unzip8)(void *vd, void *vm)
1295 {
1296     uint64_t *rd = vd, *rm = vm;
1297     uint64_t zd = rd[0], zm = rm[0];
1298 
1299     uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
1300         | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
1301         | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
1302         | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
1303     uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
1304         | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
1305         | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
1306         | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
1307 
1308     rm[0] = m0;
1309     rd[0] = d0;
1310 }
1311 
1312 void HELPER(neon_unzip16)(void *vd, void *vm)
1313 {
1314     uint64_t *rd = vd, *rm = vm;
1315     uint64_t zd = rd[0], zm = rm[0];
1316 
1317     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
1318         | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
1319     uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
1320         | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
1321 
1322     rm[0] = m0;
1323     rd[0] = d0;
1324 }
1325 
1326 void HELPER(neon_qzip8)(void *vd, void *vm)
1327 {
1328     uint64_t *rd = vd, *rm = vm;
1329     uint64_t zd0 = rd[0], zd1 = rd[1];
1330     uint64_t zm0 = rm[0], zm1 = rm[1];
1331 
1332     uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
1333         | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
1334         | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
1335         | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
1336     uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
1337         | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
1338         | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
1339         | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
1340     uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
1341         | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
1342         | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
1343         | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
1344     uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
1345         | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
1346         | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
1347         | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
1348 
1349     rm[0] = m0;
1350     rm[1] = m1;
1351     rd[0] = d0;
1352     rd[1] = d1;
1353 }
1354 
1355 void HELPER(neon_qzip16)(void *vd, void *vm)
1356 {
1357     uint64_t *rd = vd, *rm = vm;
1358     uint64_t zd0 = rd[0], zd1 = rd[1];
1359     uint64_t zm0 = rm[0], zm1 = rm[1];
1360 
1361     uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
1362         | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
1363     uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
1364         | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
1365     uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
1366         | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
1367     uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
1368         | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
1369 
1370     rm[0] = m0;
1371     rm[1] = m1;
1372     rd[0] = d0;
1373     rd[1] = d1;
1374 }
1375 
1376 void HELPER(neon_qzip32)(void *vd, void *vm)
1377 {
1378     uint64_t *rd = vd, *rm = vm;
1379     uint64_t zd0 = rd[0], zd1 = rd[1];
1380     uint64_t zm0 = rm[0], zm1 = rm[1];
1381 
1382     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
1383     uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
1384     uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1385     uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1386 
1387     rm[0] = m0;
1388     rm[1] = m1;
1389     rd[0] = d0;
1390     rd[1] = d1;
1391 }
1392 
1393 void HELPER(neon_zip8)(void *vd, void *vm)
1394 {
1395     uint64_t *rd = vd, *rm = vm;
1396     uint64_t zd = rd[0], zm = rm[0];
1397 
1398     uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
1399         | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
1400         | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
1401         | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
1402     uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
1403         | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
1404         | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
1405         | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
1406 
1407     rm[0] = m0;
1408     rd[0] = d0;
1409 }
1410 
1411 void HELPER(neon_zip16)(void *vd, void *vm)
1412 {
1413     uint64_t *rd = vd, *rm = vm;
1414     uint64_t zd = rd[0], zm = rm[0];
1415 
1416     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
1417         | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
1418     uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
1419         | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
1420 
1421     rm[0] = m0;
1422     rd[0] = d0;
1423 }
1424