/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */
#include "qemu/osdep.h"

#include "cpu.h"
#include "exec/helper-proto.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

/* Sign bit of a 32-bit / 64-bit lane. */
#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

/*
 * Set the cumulative saturation (QC) flag.  Requires a CPUARMState *env
 * in scope, so only helpers generated via the _ENV variants (or taking
 * env explicitly) may use it.
 */
#define SET_QC() env->vfp.qc[0] = 1

/*
 * "Vector in a scalar" container types: one uint32_t holds 4 x 8-bit,
 * 2 x 16-bit or 1 x 32-bit lane(s).  The member order is swapped on
 * big-endian hosts so that lane v1 always occupies the least
 * significant bits of the packed uint32_t.
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type. */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.
 */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* Apply NEON_FN lane-by-lane; the caller defines NEON_FN before use. */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/*
 * Body of a generated binary helper: unpack both packed arguments,
 * apply NEON_FN to each of the n lanes, repack and return.
 */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP but the helper also receives env (needed for SET_QC()). */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* Pairwise operations. */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same. */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

/* Generate a pairwise helper: lanes come from adjacent pairs within
 * each source operand rather than from corresponding lanes. */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators.
 */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}

/* Halving add: result is (src1 + src2) >> 1.  For 8/16-bit lanes the
 * addition cannot overflow the int used for the lane arithmetic. */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN

/* 32-bit halving add computed without a wider intermediate: shift each
 * operand first, then add back the carry of the two dropped low bits. */
int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

/* Rounding halving add: (src1 + src2 + 1) >> 1. */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN

/* 32-bit rounding halving add; the round-up happens whenever either
 * dropped low bit was set. */
int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

/* Halving subtract: (src1 - src2) >> 1. */
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN

/* 32-bit halving subtract without a wider intermediate; borrow occurs
 * when src2's low bit is set and src1's is clear. */
int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

/* Pairwise min/max (VPMIN/VPMAX). */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

/*
 * Shift helpers.  do_{u,s}qrshl_bhs/_d come from vec_internal.h; the
 * bool selects rounding, the final argument is the saturation flag
 * pointer (NULL here: these variants do not saturate).  The shift
 * count is the signed low byte of src2; negative counts shift right.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}

/*
 * Saturating shifts (VQSHL): same vec_internal.h helpers as above but
 * with env->vfp.qc as the saturation flag pointer, so saturation sets
 * the sticky QC bit.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* VQSHLU: signed input, unsigned saturating result. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* VQRSHL: saturating and rounding shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

/* Lane-wise modular add on packed 8-bit lanes: mask off each lane's
 * top bit so carries cannot propagate across lane boundaries, then
 * restore the top bits with xor. */
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}

/* Same trick for two packed 16-bit lanes. */
uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}

#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* VTST: all-ones lane if the operands share any set bit, else zero. */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

/* Count Leading Sign/Zero Bits. */
static inline int do_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}

static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}

#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* CLS: complement negative inputs so leading sign bits become leading
 * zeros; subtract one because the sign bit itself is not counted. */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}

/* Bit count.
 */
/* Per-byte population count via parallel bit-group sums. */
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
{
    x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
    return x;
}

/* Reverse bits in each 8 bit word */
uint32_t HELPER(neon_rbit_u8)(uint32_t x)
{
    x =  ((x & 0xf0f0f0f0) >> 4)
       | ((x & 0x0f0f0f0f) << 4);
    x =  ((x & 0x88888888) >> 3)
       | ((x & 0x44444444) >> 1)
       | ((x & 0x22222222) << 1)
       | ((x & 0x11111111) << 3);
    return x;
}

/*
 * Saturating doubling multiply returning high half (VQDMULH/VQRDMULH),
 * 16-bit lanes.  The double saturates only for 0x8000 * 0x8000; the
 * optional rounding step can itself overflow and also saturates.
 */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

/* Same as above for 32-bit lanes, using a 64-bit intermediate. */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32

/* Narrowing: keep the low half of each wide lane. */
uint32_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

uint32_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

/* Narrowing: keep the high half of each wide lane. */
uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* Rounding narrow-high: add half an LSB of the result (with wraparound,
 * carries masked off first) before taking the high half. */
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* Signed-to-unsigned saturating narrow (VQMOVUN): negative lanes clamp
 * to 0 (res bits left clear), values above 0xff clamp to 0xff. */
uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Signed saturating narrow: out-of-range lanes clamp to 0x80/0x7f
 * ((s >> 15) ^ 0x7f selects the correctly-signed extreme). */
uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}

/* Widening moves: each 8/16-bit lane expands into a 16/32-bit lane
 * of the 64-bit result, zero- or sign-extended as appropriate. */
uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}

/* Lane-wise modular add on 64-bit vectors of 16/32-bit lanes, using
 * the same carry-isolation trick as neon_add_u8. */
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}

/* Pairwise add-long: result lanes 0,1 come from a's pairs, lanes 2,3
 * from b's pairs. */
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return    ( tmp         & 0xffff)
            | ((tmp  >> 16) & 0xffff0000ull)
            | ((tmp2 << 16) & 0xffff00000000ull)
            | ( tmp2        & 0xffff000000000000ull);
}

uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}

/* Lane-wise modular subtract; borrows are kept from crossing lanes by
 * forcing each a-lane's top bit set before subtracting. */
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}

/* Saturating add of two signed 32-bit lanes packed in 64 bits:
 * overflow occurs when the operands share a sign that the sum lost. */
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}

/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD

/* Widening multiply. Named type is the source type.
 */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/* Signed variant: products are computed on int8_t lanes but stored via
 * uint16_t so the assembled 64-bit result is a plain bit pattern. */
uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}

/* Lane-wise two's-complement negation of a 64-bit vector. */
uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}

/* Saturating sign manipulation. */
/* ??? Make these use NEON_VOP1 */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16

uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if ((int32_t)x < 0) {
        x = -x;
    }
    return x;
}

uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else if ((int64_t)x < 0) {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else {
        x = -x;
    }
    return x;
}

/* NEON Float helpers. */

/* Floating point comparisons produce an integer result.
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
}

uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_le(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_lt(make_float32(b), make_float32(a), fpst);
}

/* Absolute compares: |a| >= |b| and |a| > |b|. */
uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_le(f1, f0, fpst);
}

uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_lt(f1, f0, fpst);
}

uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_le(f1, f0, fpst);
}

uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_lt(f1, f0, fpst);
}

/* Extract element N of size SIZE bits from packed vector V. */
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))

/* VUZP on a 128-bit register pair: even elements to d, odd to m. */
void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

/* VUZP on a 64-bit register pair. */
void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_unzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}

/* VZIP on a 128-bit register pair: interleave d and m elements. */
void HELPER(neon_qzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

/* NOTE(review): definition truncated at the end of the visible chunk;
 * remainder of this function is outside this view. */
void HELPER(neon_qzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
        | (ELEM(zd1, 3, 16) << 32) |
(ELEM(zm1, 3, 16) << 48); 1354 1355 rm[0] = m0; 1356 rm[1] = m1; 1357 rd[0] = d0; 1358 rd[1] = d1; 1359 } 1360 1361 void HELPER(neon_qzip32)(void *vd, void *vm) 1362 { 1363 uint64_t *rd = vd, *rm = vm; 1364 uint64_t zd0 = rd[0], zd1 = rd[1]; 1365 uint64_t zm0 = rm[0], zm1 = rm[1]; 1366 1367 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32); 1368 uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32); 1369 uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1370 uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1371 1372 rm[0] = m0; 1373 rm[1] = m1; 1374 rd[0] = d0; 1375 rd[1] = d1; 1376 } 1377 1378 void HELPER(neon_zip8)(void *vd, void *vm) 1379 { 1380 uint64_t *rd = vd, *rm = vm; 1381 uint64_t zd = rd[0], zm = rm[0]; 1382 1383 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8) 1384 | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24) 1385 | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1386 | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56); 1387 uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8) 1388 | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24) 1389 | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40) 1390 | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1391 1392 rm[0] = m0; 1393 rd[0] = d0; 1394 } 1395 1396 void HELPER(neon_zip16)(void *vd, void *vm) 1397 { 1398 uint64_t *rd = vd, *rm = vm; 1399 uint64_t zd = rd[0], zm = rm[0]; 1400 1401 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16) 1402 | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48); 1403 uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16) 1404 | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1405 1406 rm[0] = m0; 1407 rd[0] = d0; 1408 } 1409