/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

#define SET_QC() env->vfp.qc[0] = 1

#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type.  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)
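
/*
 * Thanks to the HOST_BIG_ENDIAN field ordering above, lane v1 always maps to
 * the least significant element of the uint32_t: e.g.
 * NEON_UNPACK(neon_u8, v, 0x04030201) gives v.v1 == 0x01 ... v.v4 == 0x04,
 * and NEON_PACK inverts the conversion.
 */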

#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    vtype *d = vd, *n = vn, *m = vm;                            \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], m[i]);                              \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}
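
/*
 * Defining NEON_FN and then instantiating NEON_VOP(name, vtype, n) generates
 * helper_neon_<name>, which applies NEON_FN to each of the n lanes packed in
 * a uint32_t.  NEON_VOP_ENV does the same but also takes CPUARMState so the
 * operation can update the QC (saturation) flag.
 */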

/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}
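
/*
 * Pairwise ops combine adjacent elements within each source operand rather
 * than corresponding lanes: e.g. for pmin_u8 the result lanes are
 * min(a0,a1), min(a2,a3), min(b0,b1), min(b2,b3).
 */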

/* Unary operators.  */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}

#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}
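
/*
 * The 32-bit halving adds avoid overflow by shifting each operand first and
 * then adding back the bit lost from the discarded low bits: both operands
 * being odd contributes exactly one extra unit.  E.g. 3 and 5:
 * (3 >> 1) + (5 >> 1) = 3, plus 1 because both are odd, giving 4 == (3 + 5) >> 1.
 */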

#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}
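
/*
 * For the rounding variant the correction is applied whenever either low bit
 * is set, which is equivalent to adding 1 before the shift: e.g. 3 and 4 give
 * (3 >> 1) + (4 >> 1) = 3, plus 1, i.e. 4 == (3 + 4 + 1) >> 1.
 */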

#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN
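
/*
 * The shift helpers below are all built on the do_{s,u}qrshl_{bhs,d} routines
 * declared in vec_internal.h.  As used here, the arguments are the element
 * width in bits (for the bhs forms), whether the shift rounds, and a pointer
 * to the QC flag for the saturating variants (NULL when saturation is not
 * tracked).
 */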

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}

uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}
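
/*
 * The add_u8/add_u16 trick performs per-lane addition in a single 32-bit add:
 * clearing each lane's top bit stops a carry from propagating into the next
 * lane, and the missing top-bit sum (without its carry-out) is reconstructed
 * by xoring (a ^ b) back in at those bit positions.
 */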

#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

/* Count Leading Sign/Zero Bits.  */
static inline int do_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}

static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}

#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}
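
/*
 * CLS counts the leading bits that match the sign bit, not including the
 * sign bit itself; negative inputs are complemented first so the count
 * reduces to CLZ - 1.  E.g. cls_s8(0xe0) == do_clz8(0x1f) - 1 == 2.
 */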

/* Bit count.  */
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
{
    x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
    x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
    x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
    return x;
}

/* Reverse bits in each 8 bit word */
uint32_t HELPER(neon_rbit_u8)(uint32_t x)
{
    x =  ((x & 0xf0f0f0f0) >> 4)
       | ((x & 0x0f0f0f0f) << 4);
    x =  ((x & 0x88888888) >> 3)
       | ((x & 0x44444444) >> 1)
       | ((x & 0x22222222) << 1)
       | ((x & 0x11111111) << 3);
    return x;
}

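/*
 * Signed saturating doubling multiply returning high half (VQDMULH/VQRDMULH):
 * the double-width product is doubled, optionally rounded, saturated, and the
 * top half is kept.  E.g. for 16-bit Q15 inputs, 0x4000 * 0x4000 (0.5 * 0.5)
 * yields 0x2000 (0.25), while 0x8000 * 0x8000 saturates to 0x7fff and sets QC.
 */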
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32

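/*
 * Narrowing helpers: the 64-bit input holds two or four wide elements and
 * the 32-bit result holds their low halves (narrow_*), their high halves
 * (narrow_high_*), or their rounded high halves (narrow_round_high_*).
 */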
uint32_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

uint32_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

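/*
 * Saturating narrows: narrow_sat_{u,s}* saturate to the narrower type's range
 * and set QC on overflow; unarrow_sat* take a signed input, clamp negative
 * values to zero and saturate to the unsigned maximum.
 */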
uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else  { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}

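/*
 * Widening helpers: each 8- or 16-bit lane of the 32-bit input is zero- or
 * sign-extended into a 16- or 32-bit lane of the 64-bit result.
 */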
uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}

uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return    ( tmp         & 0xffff)
            | ((tmp  >> 16) & 0xffff0000ull)
            | ((tmp2 << 16) & 0xffff00000000ull)
            | ( tmp2        & 0xffff000000000000ull);
}

uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}

uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}

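/*
 * Signed saturating add: overflow has occurred when the operands share a
 * sign but the sum's sign differs; the saturated result is then INT_MIN or
 * INT_MAX depending on the operands' sign, and QC is set.
 */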
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}

/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

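/*
 * For example, the absolute difference of INT32_MIN and INT32_MAX is
 * 2^32 - 1, which only fits in the wider int64_t arithtype used by
 * neon_abdl_s64 below.
 */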
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD

/* Widening multiply. Named type is the source type.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

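/*
 * For the signed variants type1 sign-extends each lane while type2 is the
 * unsigned double-width type: an 8x8 or 16x16 signed product always fits in
 * 16 or 32 bits, so computing it modulo 2^16 or 2^32 still produces the
 * correct two's-complement result.
 */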
uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}

/* Saturating sign manipulation.  */
/* ??? Make these use NEON_VOP1 */
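/*
 * QABS/QNEG: the one value with no representable absolute value or negation
 * is the most negative integer, which saturates to the maximum positive
 * value and sets QC (e.g. qneg_s8(0x80) == 0x7f).
 */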
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16

uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if ((int32_t)x < 0) {
        x = -x;
    }
    return x;
}

uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else if ((int64_t)x < 0) {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else {
        x = -x;
    }
    return x;
}

/* NEON Float helpers.  */

/* Floating point comparisons produce an integer result.
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
}

uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_le(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_lt(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_le(f1, f0, fpst);
}

uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_lt(f1, f0, fpst);
}

uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_le(f1, f0, fpst);
}

uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_lt(f1, f0, fpst);
}

#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))

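/*
 * ELEM(V, N, SIZE) extracts element N of width SIZE bits from V, e.g.
 * ELEM(0x1122334455667788ull, 1, 16) == 0x5566.  The zip helpers below
 * interleave the elements of the d and m operands; the unzip helpers
 * separate even- and odd-numbered elements back out.
 */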
void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_unzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_qzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_zip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_zip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}