/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

/* Set the cumulative saturation (QC) flag in the CPU state. */
#define SET_QC() env->vfp.qc[0] = 1

/*
 * Define neon_<name>, a struct-of-lanes view of a 32-bit value holding
 * 1, 2 or 4 elements of the given type.  Lane v1 is always the least
 * significant, so the field order depends on host endianness.
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type. */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t. */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* Apply the (externally defined) NEON_FN to each lane in turn. */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/*
 * Body of a 32-bit-packed binary helper: unpack both operands into lane
 * structs, apply NEON_FN lane by lane, repack the result.
 */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP but the helper also receives the CPU env (for SET_QC). */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/*
 * Generic-vector (gvec) form: apply NEON_FN elementwise over the whole
 * operand region described by DESC, then clear any tail bytes.
 */
#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, void *venv, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    CPUARMState *env = venv; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

/* Pairwise operations. */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same. */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators. */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}

/* Pairwise minimum: lanes are paired within each source operand. */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

/* Pairwise maximum. */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

/*
 * Shifts: the count is the signed low byte of src2 (negative counts
 * shift right).  The do_[su]qrshl_* helpers from vec_internal.h handle
 * the plain, rounding (round=true) and saturating (non-NULL qc pointer)
 * variants in one place.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

/* Rounding shifts, signed. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

/* Rounding shifts, unsigned. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}

/* Saturating shifts, unsigned; saturation is recorded in env->vfp.qc. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Saturating shifts, signed. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Saturating shifts with signed input and unsigned result (SQSHLU). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Rounding saturating shifts, unsigned. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

/* Rounding saturating shifts, signed. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

/*
 * Lanewise add with no carry between lanes: strip the lane top bits,
 * add, then fold the (carry-free) top-bit sum back in with xor.
 */
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}

uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}

#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* VTST: all-ones lane when (src1 & src2) != 0, else zero. */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

/* Count Leading Sign/Zero Bits. */
static inline int do_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}

static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}

#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* CLS: complement negative inputs so leading sign bits become leading
 * zeroes; the -1 excludes the sign bit itself from the count. */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}

/* Bit count. */
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
{
    /* Parallel popcount; sums never cross a byte-lane boundary. */
    x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
    return x;
}

/* Reverse bits in each 8 bit word */
uint32_t HELPER(neon_rbit_u8)(uint32_t x)
{
    x = ((x & 0xf0f0f0f0) >> 4)
      | ((x & 0x0f0f0f0f) << 4);
    x = ((x & 0x88888888) >> 3)
      | ((x & 0x44444444) >> 1)
      | ((x & 0x22222222) << 1)
      | ((x & 0x11111111) << 3);
    return x;
}

/*
 * Saturating doubling multiply returning high half: tmp = 2*src1*src2
 * saturated to 32 bits (the xor test catches doubling overflow), with
 * optional round-to-nearest (+1<<15) before taking the top 16 bits.
 * Any saturation sets QC.
 */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

/* 32-bit element variant of the above, using 64-bit intermediates. */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
565 #undef NEON_FN 566 #undef NEON_QDMULH32 567 568 uint32_t HELPER(neon_narrow_u8)(uint64_t x) 569 { 570 return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u) 571 | ((x >> 24) & 0xff000000u); 572 } 573 574 uint32_t HELPER(neon_narrow_u16)(uint64_t x) 575 { 576 return (x & 0xffffu) | ((x >> 16) & 0xffff0000u); 577 } 578 579 uint32_t HELPER(neon_narrow_high_u8)(uint64_t x) 580 { 581 return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) 582 | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); 583 } 584 585 uint32_t HELPER(neon_narrow_high_u16)(uint64_t x) 586 { 587 return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); 588 } 589 590 uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x) 591 { 592 x &= 0xff80ff80ff80ff80ull; 593 x += 0x0080008000800080ull; 594 return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) 595 | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); 596 } 597 598 uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x) 599 { 600 x &= 0xffff8000ffff8000ull; 601 x += 0x0000800000008000ull; 602 return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); 603 } 604 605 uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x) 606 { 607 uint16_t s; 608 uint8_t d; 609 uint32_t res = 0; 610 #define SAT8(n) \ 611 s = x >> n; \ 612 if (s & 0x8000) { \ 613 SET_QC(); \ 614 } else { \ 615 if (s > 0xff) { \ 616 d = 0xff; \ 617 SET_QC(); \ 618 } else { \ 619 d = s; \ 620 } \ 621 res |= (uint32_t)d << (n / 2); \ 622 } 623 624 SAT8(0); 625 SAT8(16); 626 SAT8(32); 627 SAT8(48); 628 #undef SAT8 629 return res; 630 } 631 632 uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x) 633 { 634 uint16_t s; 635 uint8_t d; 636 uint32_t res = 0; 637 #define SAT8(n) \ 638 s = x >> n; \ 639 if (s > 0xff) { \ 640 d = 0xff; \ 641 SET_QC(); \ 642 } else { \ 643 d = s; \ 644 } \ 645 res |= (uint32_t)d << (n / 2); 646 647 SAT8(0); 648 SAT8(16); 649 SAT8(32); 650 SAT8(48); 651 #undef SAT8 652 return res; 653 } 654 655 uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState 
*env, uint64_t x) 656 { 657 int16_t s; 658 uint8_t d; 659 uint32_t res = 0; 660 #define SAT8(n) \ 661 s = x >> n; \ 662 if (s != (int8_t)s) { \ 663 d = (s >> 15) ^ 0x7f; \ 664 SET_QC(); \ 665 } else { \ 666 d = s; \ 667 } \ 668 res |= (uint32_t)d << (n / 2); 669 670 SAT8(0); 671 SAT8(16); 672 SAT8(32); 673 SAT8(48); 674 #undef SAT8 675 return res; 676 } 677 678 uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x) 679 { 680 uint32_t high; 681 uint32_t low; 682 low = x; 683 if (low & 0x80000000) { 684 low = 0; 685 SET_QC(); 686 } else if (low > 0xffff) { 687 low = 0xffff; 688 SET_QC(); 689 } 690 high = x >> 32; 691 if (high & 0x80000000) { 692 high = 0; 693 SET_QC(); 694 } else if (high > 0xffff) { 695 high = 0xffff; 696 SET_QC(); 697 } 698 return low | (high << 16); 699 } 700 701 uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x) 702 { 703 uint32_t high; 704 uint32_t low; 705 low = x; 706 if (low > 0xffff) { 707 low = 0xffff; 708 SET_QC(); 709 } 710 high = x >> 32; 711 if (high > 0xffff) { 712 high = 0xffff; 713 SET_QC(); 714 } 715 return low | (high << 16); 716 } 717 718 uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x) 719 { 720 int32_t low; 721 int32_t high; 722 low = x; 723 if (low != (int16_t)low) { 724 low = (low >> 31) ^ 0x7fff; 725 SET_QC(); 726 } 727 high = x >> 32; 728 if (high != (int16_t)high) { 729 high = (high >> 31) ^ 0x7fff; 730 SET_QC(); 731 } 732 return (uint16_t)low | (high << 16); 733 } 734 735 uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x) 736 { 737 if (x & 0x8000000000000000ull) { 738 SET_QC(); 739 return 0; 740 } 741 if (x > 0xffffffffu) { 742 SET_QC(); 743 return 0xffffffffu; 744 } 745 return x; 746 } 747 748 uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x) 749 { 750 if (x > 0xffffffffu) { 751 SET_QC(); 752 return 0xffffffffu; 753 } 754 return x; 755 } 756 757 uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x) 758 { 759 if ((int64_t)x != 
(int32_t)x) { 760 SET_QC(); 761 return ((int64_t)x >> 63) ^ 0x7fffffff; 762 } 763 return x; 764 } 765 766 uint64_t HELPER(neon_widen_u8)(uint32_t x) 767 { 768 uint64_t tmp; 769 uint64_t ret; 770 ret = (uint8_t)x; 771 tmp = (uint8_t)(x >> 8); 772 ret |= tmp << 16; 773 tmp = (uint8_t)(x >> 16); 774 ret |= tmp << 32; 775 tmp = (uint8_t)(x >> 24); 776 ret |= tmp << 48; 777 return ret; 778 } 779 780 uint64_t HELPER(neon_widen_s8)(uint32_t x) 781 { 782 uint64_t tmp; 783 uint64_t ret; 784 ret = (uint16_t)(int8_t)x; 785 tmp = (uint16_t)(int8_t)(x >> 8); 786 ret |= tmp << 16; 787 tmp = (uint16_t)(int8_t)(x >> 16); 788 ret |= tmp << 32; 789 tmp = (uint16_t)(int8_t)(x >> 24); 790 ret |= tmp << 48; 791 return ret; 792 } 793 794 uint64_t HELPER(neon_widen_u16)(uint32_t x) 795 { 796 uint64_t high = (uint16_t)(x >> 16); 797 return ((uint16_t)x) | (high << 32); 798 } 799 800 uint64_t HELPER(neon_widen_s16)(uint32_t x) 801 { 802 uint64_t high = (int16_t)(x >> 16); 803 return ((uint32_t)(int16_t)x) | (high << 32); 804 } 805 806 uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b) 807 { 808 uint64_t mask; 809 mask = (a ^ b) & 0x8000800080008000ull; 810 a &= ~0x8000800080008000ull; 811 b &= ~0x8000800080008000ull; 812 return (a + b) ^ mask; 813 } 814 815 uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b) 816 { 817 uint64_t mask; 818 mask = (a ^ b) & 0x8000000080000000ull; 819 a &= ~0x8000000080000000ull; 820 b &= ~0x8000000080000000ull; 821 return (a + b) ^ mask; 822 } 823 824 uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b) 825 { 826 uint64_t tmp; 827 uint64_t tmp2; 828 829 tmp = a & 0x0000ffff0000ffffull; 830 tmp += (a >> 16) & 0x0000ffff0000ffffull; 831 tmp2 = b & 0xffff0000ffff0000ull; 832 tmp2 += (b << 16) & 0xffff0000ffff0000ull; 833 return ( tmp & 0xffff) 834 | ((tmp >> 16) & 0xffff0000ull) 835 | ((tmp2 << 16) & 0xffff00000000ull) 836 | ( tmp2 & 0xffff000000000000ull); 837 } 838 839 uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b) 840 { 841 uint32_t low = a 
+ (a >> 32); 842 uint32_t high = b + (b >> 32); 843 return low + ((uint64_t)high << 32); 844 } 845 846 uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b) 847 { 848 uint64_t mask; 849 mask = (a ^ ~b) & 0x8000800080008000ull; 850 a |= 0x8000800080008000ull; 851 b &= ~0x8000800080008000ull; 852 return (a - b) ^ mask; 853 } 854 855 uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b) 856 { 857 uint64_t mask; 858 mask = (a ^ ~b) & 0x8000000080000000ull; 859 a |= 0x8000000080000000ull; 860 b &= ~0x8000000080000000ull; 861 return (a - b) ^ mask; 862 } 863 864 uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b) 865 { 866 uint32_t x, y; 867 uint32_t low, high; 868 869 x = a; 870 y = b; 871 low = x + y; 872 if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { 873 SET_QC(); 874 low = ((int32_t)x >> 31) ^ ~SIGNBIT; 875 } 876 x = a >> 32; 877 y = b >> 32; 878 high = x + y; 879 if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { 880 SET_QC(); 881 high = ((int32_t)x >> 31) ^ ~SIGNBIT; 882 } 883 return low | ((uint64_t)high << 32); 884 } 885 886 uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b) 887 { 888 uint64_t result; 889 890 result = a + b; 891 if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) { 892 SET_QC(); 893 result = ((int64_t)a >> 63) ^ ~SIGNBIT64; 894 } 895 return result; 896 } 897 898 /* We have to do the arithmetic in a larger type than 899 * the input type, because for example with a signed 32 bit 900 * op the absolute difference can overflow a signed 32 bit value. 901 */ 902 #define DO_ABD(dest, x, y, intype, arithtype) do { \ 903 arithtype tmp_x = (intype)(x); \ 904 arithtype tmp_y = (intype)(y); \ 905 dest = ((tmp_x > tmp_y) ? 
tmp_x - tmp_y : tmp_y - tmp_x); \ 906 } while(0) 907 908 uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b) 909 { 910 uint64_t tmp; 911 uint64_t result; 912 DO_ABD(result, a, b, uint8_t, uint32_t); 913 DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t); 914 result |= tmp << 16; 915 DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t); 916 result |= tmp << 32; 917 DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t); 918 result |= tmp << 48; 919 return result; 920 } 921 922 uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b) 923 { 924 uint64_t tmp; 925 uint64_t result; 926 DO_ABD(result, a, b, int8_t, int32_t); 927 DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t); 928 result |= tmp << 16; 929 DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t); 930 result |= tmp << 32; 931 DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t); 932 result |= tmp << 48; 933 return result; 934 } 935 936 uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b) 937 { 938 uint64_t tmp; 939 uint64_t result; 940 DO_ABD(result, a, b, uint16_t, uint32_t); 941 DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t); 942 return result | (tmp << 32); 943 } 944 945 uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b) 946 { 947 uint64_t tmp; 948 uint64_t result; 949 DO_ABD(result, a, b, int16_t, int32_t); 950 DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t); 951 return result | (tmp << 32); 952 } 953 954 uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b) 955 { 956 uint64_t result; 957 DO_ABD(result, a, b, uint32_t, uint64_t); 958 return result; 959 } 960 961 uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b) 962 { 963 uint64_t result; 964 DO_ABD(result, a, b, int32_t, int64_t); 965 return result; 966 } 967 #undef DO_ABD 968 969 /* Widening multiply. Named type is the source type. 
*/ 970 #define DO_MULL(dest, x, y, type1, type2) do { \ 971 type1 tmp_x = x; \ 972 type1 tmp_y = y; \ 973 dest = (type2)((type2)tmp_x * (type2)tmp_y); \ 974 } while(0) 975 976 uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b) 977 { 978 uint64_t tmp; 979 uint64_t result; 980 981 DO_MULL(result, a, b, uint8_t, uint16_t); 982 DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t); 983 result |= tmp << 16; 984 DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t); 985 result |= tmp << 32; 986 DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t); 987 result |= tmp << 48; 988 return result; 989 } 990 991 uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b) 992 { 993 uint64_t tmp; 994 uint64_t result; 995 996 DO_MULL(result, a, b, int8_t, uint16_t); 997 DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t); 998 result |= tmp << 16; 999 DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t); 1000 result |= tmp << 32; 1001 DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t); 1002 result |= tmp << 48; 1003 return result; 1004 } 1005 1006 uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b) 1007 { 1008 uint64_t tmp; 1009 uint64_t result; 1010 1011 DO_MULL(result, a, b, uint16_t, uint32_t); 1012 DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t); 1013 return result | (tmp << 32); 1014 } 1015 1016 uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b) 1017 { 1018 uint64_t tmp; 1019 uint64_t result; 1020 1021 DO_MULL(result, a, b, int16_t, uint32_t); 1022 DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t); 1023 return result | (tmp << 32); 1024 } 1025 1026 uint64_t HELPER(neon_negl_u16)(uint64_t x) 1027 { 1028 uint16_t tmp; 1029 uint64_t result; 1030 result = (uint16_t)-x; 1031 tmp = -(x >> 16); 1032 result |= (uint64_t)tmp << 16; 1033 tmp = -(x >> 32); 1034 result |= (uint64_t)tmp << 32; 1035 tmp = -(x >> 48); 1036 result |= (uint64_t)tmp << 48; 1037 return result; 1038 } 1039 1040 uint64_t HELPER(neon_negl_u32)(uint64_t x) 1041 { 1042 uint32_t low = -x; 1043 uint32_t high = -(x >> 32); 1044 return 
low | ((uint64_t)high << 32); 1045 } 1046 1047 /* Saturating sign manipulation. */ 1048 /* ??? Make these use NEON_VOP1 */ 1049 #define DO_QABS8(x) do { \ 1050 if (x == (int8_t)0x80) { \ 1051 x = 0x7f; \ 1052 SET_QC(); \ 1053 } else if (x < 0) { \ 1054 x = -x; \ 1055 }} while (0) 1056 uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x) 1057 { 1058 neon_s8 vec; 1059 NEON_UNPACK(neon_s8, vec, x); 1060 DO_QABS8(vec.v1); 1061 DO_QABS8(vec.v2); 1062 DO_QABS8(vec.v3); 1063 DO_QABS8(vec.v4); 1064 NEON_PACK(neon_s8, x, vec); 1065 return x; 1066 } 1067 #undef DO_QABS8 1068 1069 #define DO_QNEG8(x) do { \ 1070 if (x == (int8_t)0x80) { \ 1071 x = 0x7f; \ 1072 SET_QC(); \ 1073 } else { \ 1074 x = -x; \ 1075 }} while (0) 1076 uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x) 1077 { 1078 neon_s8 vec; 1079 NEON_UNPACK(neon_s8, vec, x); 1080 DO_QNEG8(vec.v1); 1081 DO_QNEG8(vec.v2); 1082 DO_QNEG8(vec.v3); 1083 DO_QNEG8(vec.v4); 1084 NEON_PACK(neon_s8, x, vec); 1085 return x; 1086 } 1087 #undef DO_QNEG8 1088 1089 #define DO_QABS16(x) do { \ 1090 if (x == (int16_t)0x8000) { \ 1091 x = 0x7fff; \ 1092 SET_QC(); \ 1093 } else if (x < 0) { \ 1094 x = -x; \ 1095 }} while (0) 1096 uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x) 1097 { 1098 neon_s16 vec; 1099 NEON_UNPACK(neon_s16, vec, x); 1100 DO_QABS16(vec.v1); 1101 DO_QABS16(vec.v2); 1102 NEON_PACK(neon_s16, x, vec); 1103 return x; 1104 } 1105 #undef DO_QABS16 1106 1107 #define DO_QNEG16(x) do { \ 1108 if (x == (int16_t)0x8000) { \ 1109 x = 0x7fff; \ 1110 SET_QC(); \ 1111 } else { \ 1112 x = -x; \ 1113 }} while (0) 1114 uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x) 1115 { 1116 neon_s16 vec; 1117 NEON_UNPACK(neon_s16, vec, x); 1118 DO_QNEG16(vec.v1); 1119 DO_QNEG16(vec.v2); 1120 NEON_PACK(neon_s16, x, vec); 1121 return x; 1122 } 1123 #undef DO_QNEG16 1124 1125 uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x) 1126 { 1127 if (x == SIGNBIT) { 1128 SET_QC(); 1129 x = ~SIGNBIT; 1130 } 
else if ((int32_t)x < 0) { 1131 x = -x; 1132 } 1133 return x; 1134 } 1135 1136 uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x) 1137 { 1138 if (x == SIGNBIT) { 1139 SET_QC(); 1140 x = ~SIGNBIT; 1141 } else { 1142 x = -x; 1143 } 1144 return x; 1145 } 1146 1147 uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x) 1148 { 1149 if (x == SIGNBIT64) { 1150 SET_QC(); 1151 x = ~SIGNBIT64; 1152 } else if ((int64_t)x < 0) { 1153 x = -x; 1154 } 1155 return x; 1156 } 1157 1158 uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x) 1159 { 1160 if (x == SIGNBIT64) { 1161 SET_QC(); 1162 x = ~SIGNBIT64; 1163 } else { 1164 x = -x; 1165 } 1166 return x; 1167 } 1168 1169 /* NEON Float helpers. */ 1170 1171 /* Floating point comparisons produce an integer result. 1172 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. 1173 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 1174 */ 1175 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp) 1176 { 1177 float_status *fpst = fpstp; 1178 return -float32_eq_quiet(make_float32(a), make_float32(b), fpst); 1179 } 1180 1181 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp) 1182 { 1183 float_status *fpst = fpstp; 1184 return -float32_le(make_float32(b), make_float32(a), fpst); 1185 } 1186 1187 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp) 1188 { 1189 float_status *fpst = fpstp; 1190 return -float32_lt(make_float32(b), make_float32(a), fpst); 1191 } 1192 1193 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp) 1194 { 1195 float_status *fpst = fpstp; 1196 float32 f0 = float32_abs(make_float32(a)); 1197 float32 f1 = float32_abs(make_float32(b)); 1198 return -float32_le(f1, f0, fpst); 1199 } 1200 1201 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp) 1202 { 1203 float_status *fpst = fpstp; 1204 float32 f0 = float32_abs(make_float32(a)); 1205 float32 f1 = float32_abs(make_float32(b)); 1206 return 
        -float32_lt(f1, f0, fpst);
}

/* 64-bit absolute compare: |a| >= |b|, as a 0/-1 mask. */
uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_le(f1, f0, fpst);
}

/* 64-bit absolute compare: |a| > |b|, as a 0/-1 mask. */
uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_lt(f1, f0, fpst);
}

/* Extract element N of width SIZE bits from 64-bit vector word V. */
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))

/*
 * Quad-register 8-bit unzip: de-interleave the 16 byte lanes held in
 * d (zd0:zd1) and m (zm0:zm1), sending even-indexed lanes to d and
 * odd-indexed lanes to m.  Results are built in temporaries so the
 * in-place update of rd/rm does not corrupt the inputs.
 */
void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

/* Quad-register 16-bit unzip: even halfword lanes to d, odd lanes to m. */
void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1
                          = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    /* Write-back after all reads: the operation is in place. */
    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

/* Quad-register 32-bit unzip: even word lanes to d, odd lanes to m. */
void HELPER(neon_qunzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

/*
 * Double-register 8-bit unzip (one 64-bit word per operand): even byte
 * lanes of d:m go to d, odd byte lanes go to m.
 */
void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

/* Double-register 16-bit unzip: even halfword lanes to d, odd to m. */
void HELPER(neon_unzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}

/*
 * Quad-register 8-bit zip: interleave byte lanes of d and m
 * (d0, m0, d1, m1, ...).  The low halves of d:m form the new d,
 * the high halves form the new m.  Results are built in temporaries
 * so the in-place update does not corrupt the inputs.
 */
void HELPER(neon_qzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

/* Quad-register 16-bit zip: interleave halfword lanes of d and m. */
void HELPER(neon_qzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
        | (ELEM(zd1, 3, 16) << 32) |
          (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

/* Quad-register 32-bit zip: interleave word lanes of d and m. */
void HELPER(neon_qzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    /* Write-back after all reads: the operation is in place. */
    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

/*
 * Double-register 8-bit zip (one 64-bit word per operand): interleave
 * the byte lanes of d and m; the low halves form the new d, the high
 * halves the new m.
 */
void HELPER(neon_zip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

/* Double-register 16-bit zip: interleave the halfword lanes of d and m. */
void HELPER(neon_zip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}