/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */
#include "qemu/osdep.h"

#include "cpu.h"
#include "exec/helper-proto.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

#define SET_QC() env->vfp.qc[0] = 1

#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type.  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

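/*
 * Expand to a helper taking two packed 32-bit arguments: unpack both
 * into per-lane vector structs, apply NEON_FN to each lane, and pack
 * the per-lane results back into a single 32-bit return value.
 */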
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators.  */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}

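/*
 * Halving add/subtract (and the rounding variants): for the 8 and 16
 * bit lanes the arithmetic happens in a wider C int, so
 * (src1 + src2) >> 1 cannot overflow.  The 32-bit helpers below
 * instead combine the halved operands and fix up the bit lost from
 * each low end.
 */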
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

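/*
 * Shift helpers.  The shl/rshl/qshl/qshlu/qrshl variants all defer to
 * the common do_{u,s,su}qrshl_{bhs,d} routines (see vec_internal.h):
 * the element width is passed in bits, the bool selects rounding, and
 * a non-NULL saturation pointer (env->vfp.qc) makes the operation
 * saturating.
 */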
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

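/*
 * Lane-wise add on packed 8/16-bit elements: clear each lane's top bit
 * so that a plain 32-bit add cannot carry across lane boundaries, then
 * xor the saved top bits (a ^ b at the lane MSB) back in, which
 * restores the MSB including any carry into it while discarding the
 * carry out of the lane.
 */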
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}

uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}

#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

/* Count Leading Sign/Zero Bits.  */
static inline int do_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}

static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}

#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}

/* Bit count.  */
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
{
    x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
    x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
    x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
    return x;
}

/* Reverse bits in each 8 bit word */
uint32_t HELPER(neon_rbit_u8)(uint32_t x)
{
    x =  ((x & 0xf0f0f0f0) >> 4)
       | ((x & 0x0f0f0f0f) << 4);
    x =  ((x & 0x88888888) >> 3)
       | ((x & 0x44444444) >> 1)
       | ((x & 0x22222222) << 1)
       | ((x & 0x11111111) << 3);
    return x;
}

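/*
 * Signed saturating doubling multiply returning the high half, with
 * optional rounding.  Doubling the product overflows only when both
 * inputs are the most negative value; the sign-flip test on tmp
 * catches that case and sets QC.  Rounding adds 1 in the bit just
 * below the half that is kept.
 */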
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32

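/*
 * Narrowing: take a 64-bit vector of 16 or 32 bit elements and return
 * the packed 32-bit result.  The plain variants keep the low half of
 * each element, the "high" variants keep the top half, and the
 * "round_high" variants first add half an LSB of the half being kept.
 */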
uint32_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

uint32_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

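/*
 * Saturating narrow: "narrow_sat_u" clamps an unsigned input to the
 * unsigned result range, "narrow_sat_s" a signed input to the signed
 * range, and "unarrow_sat" a signed input to the unsigned range.  Any
 * lane that has to be clamped also sets the QC (saturation) flag.
 */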
uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else  { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}

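/*
 * Widening: expand a packed 32-bit vector of 8 or 16 bit elements into
 * a 64-bit vector of elements twice as wide, zero or sign extending
 * each element as appropriate.
 */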
uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}

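/*
 * Lane-wise add/subtract of 64-bit vectors of 16 or 32 bit elements,
 * using the same trick as neon_add_u8 above: keep the per-lane top
 * bits out of the plain 64-bit add/subtract so carries (or borrows)
 * cannot cross lane boundaries, then patch the top bits back in with
 * an xor of the saved mask.
 */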
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}

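/*
 * Pairwise add: sum adjacent pairs of elements from each input.  The
 * low lanes of the result hold the pair sums from 'a', the high lanes
 * those from 'b'.
 */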
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return    ( tmp         & 0xffff)
            | ((tmp  >> 16) & 0xffff0000ull)
            | ((tmp2 << 16) & 0xffff00000000ull)
            | ( tmp2        & 0xffff000000000000ull);
}

uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}

uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}

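/*
 * Saturating add of 32 or 64 bit lanes.  Signed overflow has occurred
 * when the operands have the same sign but the sum's sign differs; in
 * that case the lane is clamped to the most positive or most negative
 * value as appropriate and QC is set.
 */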
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}

/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD

/* Widening multiply. Named type is the source type.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}

/* Saturating sign manipulation.  */
/* ??? Make these use NEON_VOP1 */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16

uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if ((int32_t)x < 0) {
        x = -x;
    }
    return x;
}

uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else if ((int64_t)x < 0) {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else {
        x = -x;
    }
    return x;
}

/* NEON Float helpers.  */

/* Floating point comparisons produce an integer result.
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
}

uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_le(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_lt(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_le(f1, f0, fpst);
}

uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_lt(f1, f0, fpst);
}

uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_le(f1, f0, fpst);
}

uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_lt(f1, f0, fpst);
}

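/*
 * ELEM extracts element N of SIZE bits from the 64-bit value V.  The
 * (q)zip/(q)unzip helpers below interleave or de-interleave the
 * elements of the two registers in place: the 'q' variants operate on
 * 128-bit (two uint64_t) registers, the others on 64-bit ones.
 */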
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))

void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_unzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_qzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_zip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_zip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}