/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

/* Sign-bit masks for 32-bit and 64-bit values. */
#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

/* Set the cumulative saturation (QC) flag in the CPU state. */
#define SET_QC() env->vfp.qc[0] = 1

/*
 * Vector structure types holding 1, 2 or 4 lanes packed into a
 * 32-bit value.  Lane v1 is the least significant, so the field
 * order is reversed on big-endian hosts.
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type. */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* Apply NEON_FN to each lane of the unpacked operands. */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/* Unpack both 32-bit operands, apply NEON_FN per lane, repack the result. */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP, but the helper also receives the CPU state (for SET_QC). */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* Whole-vector (gvec) helper: apply NEON_FN to each element of vn/vm. */
#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

/* Pairwise operations. */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
/* Pairwise variants of NEON_DO2/NEON_DO4: adjacent lanes of each source. */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators. */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}

/* Halving add: (a + b) >> 1 per lane, truncating. */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN

/* 32-bit halving add: shift before adding to avoid intermediate
 * overflow; the carry term restores the bit lost by pre-shifting.
 */
int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

/* Rounding halving add: (a + b + 1) >> 1 per lane. */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN

/* 32-bit rounding halving add, overflow-safe as for hadd above. */
int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

/* Halving subtract: (a - b) >> 1 per lane. */
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN

/* 32-bit halving subtract; borrow correction when src1 is even
 * and src2 is odd.
 */
int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

/* Pairwise minimum. */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

/* Pairwise maximum. */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

/*
 * Shift helpers.  All variable shifts take the shift count from the
 * low byte of the second operand (it may be negative, meaning shift
 * right).  The shared do_{s,u,su}qrshl_{bhs,d} helpers take the
 * element width, a "round" flag, and a QC pointer (NULL when the
 * operation does not saturate).
 */

/* Plain (non-saturating, non-rounding) shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

/* Signed rounding shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

/* Unsigned rounding shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}

/* Unsigned saturating shifts; saturation is recorded in env->vfp.qc. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Signed saturating shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Signed-input, unsigned-result saturating shifts (VQSHLU). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Unsigned saturating rounding shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

/* Signed saturating rounding shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
/* Packed lane-wise add: mask out the lane sign bits before the 32-bit
 * add so carries cannot propagate between lanes, then restore them
 * with xor.
 */
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}

uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}

#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* Per-lane test: all-ones if any common bit is set, else zero. */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

/* Count Leading Sign/Zero Bits. */
static inline int do_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}

static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}

#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* CLS: count leading zeros of the complement for negative inputs,
 * minus one to exclude the sign bit itself.
 */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}

/* Bit count.  */
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
{
    /* Classic parallel popcount, computed per byte lane. */
    x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
    return x;
}

/* Reverse bits in each 8 bit word */
uint32_t HELPER(neon_rbit_u8)(uint32_t x)
{
    x =  ((x & 0xf0f0f0f0) >> 4)
       | ((x & 0x0f0f0f0f) << 4);
    x =  ((x & 0x88888888) >> 3)
       | ((x & 0x44444444) >> 1)
       | ((x & 0x22222222) << 1)
       | ((x & 0x11111111) << 3);
    return x;
}

/* Saturating doubling multiply returning high half.  The doubled
 * product saturates when both inputs are the most negative value;
 * when rounding, adding 1 << 15 may also saturate.  Both cases set QC.
 */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

/* 32-bit variant of the above, using a 64-bit intermediate. */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32

/* Narrowing: keep the low half of each element of a 64-bit vector,
 * packing the results into a 32-bit value.
 */
uint32_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

uint32_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

/* Narrowing, keeping the high half of each element. */
uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* Rounded narrowing to the high half: add half of the discarded low
 * half (after masking, so carries stay within each lane's doubled
 * width) before extracting.
 */
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* Signed-to-unsigned saturating narrow: negative lanes clamp to 0,
 * lanes above 0xff clamp to 0xff; both set QC.
 */
uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else  { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Unsigned saturating narrow: clamp each lane to 0xff, setting QC. */
uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Signed saturating narrow: clamp to [-0x80, 0x7f], setting QC. */
uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* As above for 32-bit to 16-bit lanes. */
uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

/* As above for 64-bit to 32-bit. */
uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}

/* Widening: expand each element of a 32-bit input to double width,
 * zero- or sign-extending, producing a 64-bit vector.
 */
uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}

/* Packed lane-wise 64-bit add, using the same carry-isolation trick
 * as neon_add_u8/u16 above.
 */
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}

/* Pairwise add-long: sum adjacent 16-bit lanes of each operand into
 * 32-bit results; a supplies the low results, b the high.
 */
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return    ( tmp         & 0xffff)
            | ((tmp  >> 16) & 0xffff0000ull)
            | ((tmp2 << 16) & 0xffff00000000ull)
            | ( tmp2        & 0xffff000000000000ull);
}
/* Pairwise add-long for 32-bit lanes. */
uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}

/* Packed lane-wise subtract: force the lane sign bits of a high and
 * clear them in b so borrows cannot cross lanes, then fix up with the
 * xor mask.
 */
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}

/* Saturating add of two 32-bit lanes; signed overflow (operands with
 * equal signs producing a different-signed result) clamps to
 * INT32_MIN/INT32_MAX and sets QC.
 */
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}

/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

/* Absolute difference, widening: per-lane |a - b| expanded to double
 * the input element width.
 */
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD

/* Widening multiply. Named type is the source type.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}

/* Per-lane negate of a 64-bit vector of 16-bit lanes. */
uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}
/* Per-lane negate of a 64-bit vector of 32-bit lanes. */
uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}

/* Saturating sign manipulation. */
/* ??? Make these use NEON_VOP1 */
/* Saturating absolute value: the most negative value clamps to the
 * most positive and sets QC.
 */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

/* Saturating negate, with the same INT_MIN clamp as DO_QABS8. */
#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16

uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if ((int32_t)x < 0) {
        x = -x;
    }
    return x;
}

uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else if ((int64_t)x < 0) {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else {
        x = -x;
    }
    return x;
}

/* NEON Float helpers.  */

/* Floating point comparisons produce an integer result.
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
}

uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_le(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_lt(make_float32(b), make_float32(a), fpst);
}

/* Absolute compare: |a| >= |b|, as a 0/-1 mask. */
uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_le(f1, f0, fpst);
}
/* Absolute compare: |a| > |b|, as a 0/-1 mask. */
uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_lt(f1, f0, fpst);
}

uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_le(f1, f0, fpst);
}

uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_lt(f1, f0, fpst);
}

/* Extract element N of SIZE bits from packed value V. */
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))

/* VUZP (unzip) on a 128-bit register pair: d collects the
 * even-numbered elements, m the odd-numbered ones.
 */
void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

/* 64-bit (D-register) unzip variants. */
void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_unzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm,
0, 16) << 32) | (ELEM(zm, 2, 16) << 48); 1340 uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16) 1341 | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1342 1343 rm[0] = m0; 1344 rd[0] = d0; 1345 } 1346 1347 void HELPER(neon_qzip8)(void *vd, void *vm) 1348 { 1349 uint64_t *rd = vd, *rm = vm; 1350 uint64_t zd0 = rd[0], zd1 = rd[1]; 1351 uint64_t zm0 = rm[0], zm1 = rm[1]; 1352 1353 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8) 1354 | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24) 1355 | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40) 1356 | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56); 1357 uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8) 1358 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24) 1359 | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40) 1360 | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56); 1361 uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8) 1362 | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24) 1363 | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40) 1364 | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56); 1365 uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8) 1366 | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24) 1367 | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40) 1368 | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56); 1369 1370 rm[0] = m0; 1371 rm[1] = m1; 1372 rd[0] = d0; 1373 rd[1] = d1; 1374 } 1375 1376 void HELPER(neon_qzip16)(void *vd, void *vm) 1377 { 1378 uint64_t *rd = vd, *rm = vm; 1379 uint64_t zd0 = rd[0], zd1 = rd[1]; 1380 uint64_t zm0 = rm[0], zm1 = rm[1]; 1381 1382 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16) 1383 | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48); 1384 uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16) 1385 | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48); 1386 uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16) 1387 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48); 1388 uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) 
<< 16) 1389 | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48); 1390 1391 rm[0] = m0; 1392 rm[1] = m1; 1393 rd[0] = d0; 1394 rd[1] = d1; 1395 } 1396 1397 void HELPER(neon_qzip32)(void *vd, void *vm) 1398 { 1399 uint64_t *rd = vd, *rm = vm; 1400 uint64_t zd0 = rd[0], zd1 = rd[1]; 1401 uint64_t zm0 = rm[0], zm1 = rm[1]; 1402 1403 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32); 1404 uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32); 1405 uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1406 uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1407 1408 rm[0] = m0; 1409 rm[1] = m1; 1410 rd[0] = d0; 1411 rd[1] = d1; 1412 } 1413 1414 void HELPER(neon_zip8)(void *vd, void *vm) 1415 { 1416 uint64_t *rd = vd, *rm = vm; 1417 uint64_t zd = rd[0], zm = rm[0]; 1418 1419 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8) 1420 | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24) 1421 | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1422 | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56); 1423 uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8) 1424 | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24) 1425 | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40) 1426 | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1427 1428 rm[0] = m0; 1429 rd[0] = d0; 1430 } 1431 1432 void HELPER(neon_zip16)(void *vd, void *vm) 1433 { 1434 uint64_t *rd = vd, *rm = vm; 1435 uint64_t zd = rd[0], zm = rm[0]; 1436 1437 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16) 1438 | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48); 1439 uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16) 1440 | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1441 1442 rm[0] = m0; 1443 rd[0] = d0; 1444 } 1445