1 /* 2 * ARM NEON vector operations. 3 * 4 * Copyright (c) 2007, 2008 CodeSourcery. 5 * Written by Paul Brook 6 * 7 * This code is licensed under the GNU GPL v2. 8 */ 9 10 #include "qemu/osdep.h" 11 #include "cpu.h" 12 #include "exec/helper-proto.h" 13 #include "tcg/tcg-gvec-desc.h" 14 #include "fpu/softfloat.h" 15 #include "vec_internal.h" 16 17 #define SIGNBIT (uint32_t)0x80000000 18 #define SIGNBIT64 ((uint64_t)1 << 63) 19 20 #define SET_QC() env->vfp.qc[0] = 1 21 22 #define NEON_TYPE1(name, type) \ 23 typedef struct \ 24 { \ 25 type v1; \ 26 } neon_##name; 27 #if HOST_BIG_ENDIAN 28 #define NEON_TYPE2(name, type) \ 29 typedef struct \ 30 { \ 31 type v2; \ 32 type v1; \ 33 } neon_##name; 34 #define NEON_TYPE4(name, type) \ 35 typedef struct \ 36 { \ 37 type v4; \ 38 type v3; \ 39 type v2; \ 40 type v1; \ 41 } neon_##name; 42 #else 43 #define NEON_TYPE2(name, type) \ 44 typedef struct \ 45 { \ 46 type v1; \ 47 type v2; \ 48 } neon_##name; 49 #define NEON_TYPE4(name, type) \ 50 typedef struct \ 51 { \ 52 type v1; \ 53 type v2; \ 54 type v3; \ 55 type v4; \ 56 } neon_##name; 57 #endif 58 59 NEON_TYPE4(s8, int8_t) 60 NEON_TYPE4(u8, uint8_t) 61 NEON_TYPE2(s16, int16_t) 62 NEON_TYPE2(u16, uint16_t) 63 NEON_TYPE1(s32, int32_t) 64 NEON_TYPE1(u32, uint32_t) 65 #undef NEON_TYPE4 66 #undef NEON_TYPE2 67 #undef NEON_TYPE1 68 69 /* Copy from a uint32_t to a vector structure type. */ 70 #define NEON_UNPACK(vtype, dest, val) do { \ 71 union { \ 72 vtype v; \ 73 uint32_t i; \ 74 } conv_u; \ 75 conv_u.i = (val); \ 76 dest = conv_u.v; \ 77 } while(0) 78 79 /* Copy from a vector structure type to a uint32_t. */ 80 #define NEON_PACK(vtype, dest, val) do { \ 81 union { \ 82 vtype v; \ 83 uint32_t i; \ 84 } conv_u; \ 85 conv_u.v = (val); \ 86 dest = conv_u.i; \ 87 } while(0) 88 89 #define NEON_DO1 \ 90 NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); 91 #define NEON_DO2 \ 92 NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ 93 NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); 94 #define NEON_DO4 \ 95 NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ 96 NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \ 97 NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \ 98 NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4); 99 100 #define NEON_VOP_BODY(vtype, n) \ 101 { \ 102 uint32_t res; \ 103 vtype vsrc1; \ 104 vtype vsrc2; \ 105 vtype vdest; \ 106 NEON_UNPACK(vtype, vsrc1, arg1); \ 107 NEON_UNPACK(vtype, vsrc2, arg2); \ 108 NEON_DO##n; \ 109 NEON_PACK(vtype, res, vdest); \ 110 return res; \ 111 } 112 113 #define NEON_VOP(name, vtype, n) \ 114 uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ 115 NEON_VOP_BODY(vtype, n) 116 117 #define NEON_VOP_ENV(name, vtype, n) \ 118 uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \ 119 NEON_VOP_BODY(vtype, n) 120 121 #define NEON_GVEC_VOP2(name, vtype) \ 122 void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \ 123 { \ 124 intptr_t i, opr_sz = simd_oprsz(desc); \ 125 vtype *d = vd, *n = vn, *m = vm; \ 126 for (i = 0; i < opr_sz / sizeof(vtype); i++) { \ 127 NEON_FN(d[i], n[i], m[i]); \ 128 } \ 129 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 130 } 131 132 #define NEON_GVEC_VOP2_ENV(name, vtype) \ 133 void HELPER(name)(void *vd, void *vn, void *vm, void *venv, uint32_t desc) \ 134 { \ 135 intptr_t i, opr_sz = simd_oprsz(desc); \ 136 vtype *d = vd, *n = vn, *m = vm; \ 137 CPUARMState *env = venv; \ 138 for (i = 0; i < opr_sz / sizeof(vtype); i++) { \ 139 NEON_FN(d[i], n[i], m[i]); \ 140 } \ 141 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 142 } 143 144 /* Pairwise operations. */ 145 /* For 32-bit elements each segment only contains a single element, so 146 the elementwise and pairwise operations are the same. */ 147 #define NEON_PDO2 \ 148 NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ 149 NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2); 150 #define NEON_PDO4 \ 151 NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ 152 NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \ 153 NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \ 154 NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \ 155 156 #define NEON_POP(name, vtype, n) \ 157 uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ 158 { \ 159 uint32_t res; \ 160 vtype vsrc1; \ 161 vtype vsrc2; \ 162 vtype vdest; \ 163 NEON_UNPACK(vtype, vsrc1, arg1); \ 164 NEON_UNPACK(vtype, vsrc2, arg2); \ 165 NEON_PDO##n; \ 166 NEON_PACK(vtype, res, vdest); \ 167 return res; \ 168 } 169 170 /* Unary operators. */ 171 #define NEON_VOP1(name, vtype, n) \ 172 uint32_t HELPER(glue(neon_,name))(uint32_t arg) \ 173 { \ 174 vtype vsrc1; \ 175 vtype vdest; \ 176 NEON_UNPACK(vtype, vsrc1, arg); \ 177 NEON_DO##n; \ 178 NEON_PACK(vtype, arg, vdest); \ 179 return arg; \ 180 } 181 182 #define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1 183 NEON_VOP(rhadd_s8, neon_s8, 4) 184 NEON_VOP(rhadd_u8, neon_u8, 4) 185 NEON_VOP(rhadd_s16, neon_s16, 2) 186 NEON_VOP(rhadd_u16, neon_u16, 2) 187 #undef NEON_FN 188 189 int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2) 190 { 191 int32_t dest; 192 193 dest = (src1 >> 1) + (src2 >> 1); 194 if ((src1 | src2) & 1) 195 dest++; 196 return dest; 197 } 198 199 uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2) 200 { 201 uint32_t dest; 202 203 dest = (src1 >> 1) + (src2 >> 1); 204 if ((src1 | src2) & 1) 205 dest++; 206 return dest; 207 } 208 209 #define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2 210 NEON_POP(pmin_s8, neon_s8, 4) 211 NEON_POP(pmin_u8, neon_u8, 4) 212 NEON_POP(pmin_s16, neon_s16, 2) 213 NEON_POP(pmin_u16, neon_u16, 2) 214 #undef NEON_FN 215 216 #define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2 217 NEON_POP(pmax_s8, neon_s8, 4) 218 NEON_POP(pmax_u8, neon_u8, 4) 219 NEON_POP(pmax_s16, neon_s16, 2) 220 NEON_POP(pmax_u16, neon_u16, 2) 221 #undef NEON_FN 222 223 #define NEON_FN(dest, src1, src2) \ 224 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL)) 225 NEON_VOP(shl_u16, neon_u16, 2) 226 #undef NEON_FN 227 228 #define NEON_FN(dest, src1, src2) \ 229 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL)) 230 NEON_VOP(shl_s16, neon_s16, 2) 231 #undef NEON_FN 232 233 #define NEON_FN(dest, src1, src2) \ 234 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL)) 235 NEON_VOP(rshl_s8, neon_s8, 4) 236 NEON_GVEC_VOP2(gvec_srshl_b, int8_t) 237 #undef NEON_FN 238 239 #define NEON_FN(dest, src1, src2) \ 240 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL)) 241 NEON_VOP(rshl_s16, neon_s16, 2) 242 NEON_GVEC_VOP2(gvec_srshl_h, int16_t) 243 #undef NEON_FN 244 245 #define NEON_FN(dest, src1, src2) \ 246 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL)) 247 NEON_GVEC_VOP2(gvec_srshl_s, int32_t) 248 #undef NEON_FN 249 250 #define NEON_FN(dest, src1, src2) \ 251 (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL)) 252 NEON_GVEC_VOP2(gvec_srshl_d, int64_t) 253 #undef NEON_FN 254 255 uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift) 256 { 257 return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL); 258 } 259 260 uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift) 261 { 262 return do_sqrshl_d(val, (int8_t)shift, true, NULL); 263 } 264 265 #define NEON_FN(dest, src1, src2) \ 266 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL)) 267 NEON_VOP(rshl_u8, neon_u8, 4) 268 NEON_GVEC_VOP2(gvec_urshl_b, uint8_t) 269 #undef NEON_FN 270 271 #define NEON_FN(dest, src1, src2) \ 272 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL)) 273 NEON_VOP(rshl_u16, neon_u16, 2) 274 NEON_GVEC_VOP2(gvec_urshl_h, uint16_t) 275 #undef NEON_FN 276 277 #define NEON_FN(dest, src1, src2) \ 278 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL)) 279 NEON_GVEC_VOP2(gvec_urshl_s, int32_t) 280 #undef NEON_FN 281 282 #define NEON_FN(dest, src1, src2) \ 283 (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL)) 284 NEON_GVEC_VOP2(gvec_urshl_d, int64_t) 285 #undef NEON_FN 286 287 uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift) 288 { 289 return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL); 290 } 291 292 uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift) 293 { 294 return do_uqrshl_d(val, (int8_t)shift, true, NULL); 295 } 296 297 #define NEON_FN(dest, src1, src2) \ 298 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) 299 NEON_VOP_ENV(qshl_u8, neon_u8, 4) 300 NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t) 301 #undef NEON_FN 302 303 #define NEON_FN(dest, src1, src2) \ 304 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) 305 NEON_VOP_ENV(qshl_u16, neon_u16, 2) 306 NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t) 307 #undef NEON_FN 308 309 #define NEON_FN(dest, src1, src2) \ 310 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc)) 311 NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t) 312 #undef NEON_FN 313 314 #define NEON_FN(dest, src1, src2) \ 315 (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc)) 316 NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t) 317 #undef NEON_FN 318 319 uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift) 320 { 321 return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); 322 } 323 324 uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift) 325 { 326 return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc); 327 } 328 329 #define NEON_FN(dest, src1, src2) \ 330 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) 331 NEON_VOP_ENV(qshl_s8, neon_s8, 4) 332 NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t) 333 #undef NEON_FN 334 335 #define NEON_FN(dest, src1, src2) \ 336 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) 337 NEON_VOP_ENV(qshl_s16, neon_s16, 2) 338 NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t) 339 #undef NEON_FN 340 341 #define NEON_FN(dest, src1, src2) \ 342 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc)) 343 NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t) 344 #undef NEON_FN 345 346 #define NEON_FN(dest, src1, src2) \ 347 (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc)) 348 NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t) 349 #undef NEON_FN 350 351 uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift) 352 { 353 return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); 354 } 355 356 uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift) 357 { 358 return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc); 359 } 360 361 #define NEON_FN(dest, src1, src2) \ 362 (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) 363 NEON_VOP_ENV(qshlu_s8, neon_s8, 4) 364 #undef NEON_FN 365 366 #define NEON_FN(dest, src1, src2) \ 367 (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) 368 NEON_VOP_ENV(qshlu_s16, neon_s16, 2) 369 #undef NEON_FN 370 371 uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift) 372 { 373 return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); 374 } 375 376 uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift) 377 { 378 return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc); 379 } 380 381 #define NEON_FN(dest, src1, src2) \ 382 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc)) 383 NEON_VOP_ENV(qrshl_u8, neon_u8, 4) 384 NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t) 385 #undef NEON_FN 386 387 #define NEON_FN(dest, src1, src2) \ 388 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc)) 389 NEON_VOP_ENV(qrshl_u16, neon_u16, 2) 390 NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t) 391 #undef NEON_FN 392 393 #define NEON_FN(dest, src1, src2) \ 394 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc)) 395 NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t) 396 #undef NEON_FN 397 398 #define NEON_FN(dest, src1, src2) \ 399 (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc)) 400 NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t) 401 #undef NEON_FN 402 403 uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift) 404 { 405 return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc); 406 } 407 408 uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift) 409 { 410 return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc); 411 } 412 413 #define NEON_FN(dest, src1, src2) \ 414 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc)) 415 NEON_VOP_ENV(qrshl_s8, neon_s8, 4) 416 NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t) 417 #undef NEON_FN 418 419 #define NEON_FN(dest, src1, src2) \ 420 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc)) 421 NEON_VOP_ENV(qrshl_s16, neon_s16, 2) 422 NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t) 423 #undef NEON_FN 424 425 #define NEON_FN(dest, src1, src2) \ 426 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc)) 427 NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t) 428 #undef NEON_FN 429 430 #define NEON_FN(dest, src1, src2) \ 431 (dest = do_sqrshl_d(src1, (int8_t)src2, true, env->vfp.qc)) 432 NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t) 433 #undef NEON_FN 434 435 uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift) 436 { 437 return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc); 438 } 439 440 uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift) 441 { 442 return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc); 443 } 444 445 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b) 446 { 447 uint32_t mask; 448 mask = (a ^ b) & 0x80808080u; 449 a &= ~0x80808080u; 450 b &= ~0x80808080u; 451 return (a + b) ^ mask; 452 } 453 454 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b) 455 { 456 uint32_t mask; 457 mask = (a ^ b) & 0x80008000u; 458 a &= ~0x80008000u; 459 b &= ~0x80008000u; 460 return (a + b) ^ mask; 461 } 462 463 #define NEON_FN(dest, src1, src2) dest = src1 - src2 464 NEON_VOP(sub_u8, neon_u8, 4) 465 NEON_VOP(sub_u16, neon_u16, 2) 466 #undef NEON_FN 467 468 #define NEON_FN(dest, src1, src2) dest = src1 * src2 469 NEON_VOP(mul_u8, neon_u8, 4) 470 NEON_VOP(mul_u16, neon_u16, 2) 471 #undef NEON_FN 472 473 #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0 474 NEON_VOP(tst_u8, neon_u8, 4) 475 NEON_VOP(tst_u16, neon_u16, 2) 476 NEON_VOP(tst_u32, neon_u32, 1) 477 #undef NEON_FN 478 479 /* Count Leading Sign/Zero Bits. */ 480 static inline int do_clz8(uint8_t x) 481 { 482 int n; 483 for (n = 8; x; n--) 484 x >>= 1; 485 return n; 486 } 487 488 static inline int do_clz16(uint16_t x) 489 { 490 int n; 491 for (n = 16; x; n--) 492 x >>= 1; 493 return n; 494 } 495 496 #define NEON_FN(dest, src, dummy) dest = do_clz8(src) 497 NEON_VOP1(clz_u8, neon_u8, 4) 498 #undef NEON_FN 499 500 #define NEON_FN(dest, src, dummy) dest = do_clz16(src) 501 NEON_VOP1(clz_u16, neon_u16, 2) 502 #undef NEON_FN 503 504 #define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1 505 NEON_VOP1(cls_s8, neon_s8, 4) 506 #undef NEON_FN 507 508 #define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1 509 NEON_VOP1(cls_s16, neon_s16, 2) 510 #undef NEON_FN 511 512 uint32_t HELPER(neon_cls_s32)(uint32_t x) 513 { 514 int count; 515 if ((int32_t)x < 0) 516 x = ~x; 517 for (count = 32; x; count--) 518 x = x >> 1; 519 return count - 1; 520 } 521 522 /* Bit count. */ 523 uint32_t HELPER(neon_cnt_u8)(uint32_t x) 524 { 525 x = (x & 0x55555555) + ((x >> 1) & 0x55555555); 526 x = (x & 0x33333333) + ((x >> 2) & 0x33333333); 527 x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f); 528 return x; 529 } 530 531 /* Reverse bits in each 8 bit word */ 532 uint32_t HELPER(neon_rbit_u8)(uint32_t x) 533 { 534 x = ((x & 0xf0f0f0f0) >> 4) 535 | ((x & 0x0f0f0f0f) << 4); 536 x = ((x & 0x88888888) >> 3) 537 | ((x & 0x44444444) >> 1) 538 | ((x & 0x22222222) << 1) 539 | ((x & 0x11111111) << 3); 540 return x; 541 } 542 543 #define NEON_QDMULH16(dest, src1, src2, round) do { \ 544 uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \ 545 if ((tmp ^ (tmp << 1)) & SIGNBIT) { \ 546 SET_QC(); \ 547 tmp = (tmp >> 31) ^ ~SIGNBIT; \ 548 } else { \ 549 tmp <<= 1; \ 550 } \ 551 if (round) { \ 552 int32_t old = tmp; \ 553 tmp += 1 << 15; \ 554 if ((int32_t)tmp < old) { \ 555 SET_QC(); \ 556 tmp = SIGNBIT - 1; \ 557 } \ 558 } \ 559 dest = tmp >> 16; \ 560 } while(0) 561 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0) 562 NEON_VOP_ENV(qdmulh_s16, neon_s16, 2) 563 #undef NEON_FN 564 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1) 565 NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2) 566 #undef NEON_FN 567 #undef NEON_QDMULH16 568 569 #define NEON_QDMULH32(dest, src1, src2, round) do { \ 570 uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \ 571 if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \ 572 SET_QC(); \ 573 tmp = (tmp >> 63) ^ ~SIGNBIT64; \ 574 } else { \ 575 tmp <<= 1; \ 576 } \ 577 if (round) { \ 578 int64_t old = tmp; \ 579 tmp += (int64_t)1 << 31; \ 580 if ((int64_t)tmp < old) { \ 581 SET_QC(); \ 582 tmp = SIGNBIT64 - 1; \ 583 } \ 584 } \ 585 dest = tmp >> 32; \ 586 } while(0) 587 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0) 588 NEON_VOP_ENV(qdmulh_s32, neon_s32, 1) 589 #undef NEON_FN 590 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1) 591 NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1) 592 #undef NEON_FN 593 #undef NEON_QDMULH32 594 595 uint32_t HELPER(neon_narrow_u8)(uint64_t x) 596 { 597 return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u) 598 | ((x >> 24) & 0xff000000u); 599 } 600 601 uint32_t HELPER(neon_narrow_u16)(uint64_t x) 602 { 603 return (x & 0xffffu) | ((x >> 16) & 0xffff0000u); 604 } 605 606 uint32_t HELPER(neon_narrow_high_u8)(uint64_t x) 607 { 608 return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) 609 | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); 610 } 611 612 uint32_t HELPER(neon_narrow_high_u16)(uint64_t x) 613 { 614 return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); 615 } 616 617 uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x) 618 { 619 x &= 0xff80ff80ff80ff80ull; 620 x += 0x0080008000800080ull; 621 return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) 622 | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); 623 } 624 625 uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x) 626 { 627 x &= 0xffff8000ffff8000ull; 628 x += 0x0000800000008000ull; 629 return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); 630 } 631 632 uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x) 633 { 634 uint16_t s; 635 uint8_t d; 636 uint32_t res = 0; 637 #define SAT8(n) \ 638 s = x >> n; \ 639 if (s & 0x8000) { \ 640 SET_QC(); \ 641 } else { \ 642 if (s > 0xff) { \ 643 d = 0xff; \ 644 SET_QC(); \ 645 } else { \ 646 d = s; \ 647 } \ 648 res |= (uint32_t)d << (n / 2); \ 649 } 650 651 SAT8(0); 652 SAT8(16); 653 SAT8(32); 654 SAT8(48); 655 #undef SAT8 656 return res; 657 } 658 659 uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x) 660 { 661 uint16_t s; 662 uint8_t d; 663 uint32_t res = 0; 664 #define SAT8(n) \ 665 s = x >> n; \ 666 if (s > 0xff) { \ 667 d = 0xff; \ 668 SET_QC(); \ 669 } else { \ 670 d = s; \ 671 } \ 672 res |= (uint32_t)d << (n / 2); 673 674 SAT8(0); 675 SAT8(16); 676 SAT8(32); 677 SAT8(48); 678 #undef SAT8 679 return res; 680 } 681 682 uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x) 683 { 684 int16_t s; 685 uint8_t d; 686 uint32_t res = 0; 687 #define SAT8(n) \ 688 s = x >> n; \ 689 if (s != (int8_t)s) { \ 690 d = (s >> 15) ^ 0x7f; \ 691 SET_QC(); \ 692 } else { \ 693 d = s; \ 694 } \ 695 res |= (uint32_t)d << (n / 2); 696 697 SAT8(0); 698 SAT8(16); 699 SAT8(32); 700 SAT8(48); 701 #undef SAT8 702 return res; 703 } 704 705 uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x) 706 { 707 uint32_t high; 708 uint32_t low; 709 low = x; 710 if (low & 0x80000000) { 711 low = 0; 712 SET_QC(); 713 } else if (low > 0xffff) { 714 low = 0xffff; 715 SET_QC(); 716 } 717 high = x >> 32; 718 if (high & 0x80000000) { 719 high = 0; 720 SET_QC(); 721 } else if (high > 0xffff) { 722 high = 0xffff; 723 SET_QC(); 724 } 725 return low | (high << 16); 726 } 727 728 uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x) 729 { 730 uint32_t high; 731 uint32_t low; 732 low = x; 733 if (low > 0xffff) { 734 low = 0xffff; 735 SET_QC(); 736 } 737 high = x >> 32; 738 if (high > 0xffff) { 739 high = 0xffff; 740 SET_QC(); 741 } 742 return low | (high << 16); 743 } 744 745 uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x) 746 { 747 int32_t low; 748 int32_t high; 749 low = x; 750 if (low != (int16_t)low) { 751 low = (low >> 31) ^ 0x7fff; 752 SET_QC(); 753 } 754 high = x >> 32; 755 if (high != (int16_t)high) { 756 high = (high >> 31) ^ 0x7fff; 757 SET_QC(); 758 } 759 return (uint16_t)low | (high << 16); 760 } 761 762 uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x) 763 { 764 if (x & 0x8000000000000000ull) { 765 SET_QC(); 766 return 0; 767 } 768 if (x > 0xffffffffu) { 769 SET_QC(); 770 return 0xffffffffu; 771 } 772 return x; 773 } 774 775 uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x) 776 { 777 if (x > 0xffffffffu) { 778 SET_QC(); 779 return 0xffffffffu; 780 } 781 return x; 782 } 783 784 uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x) 785 { 786 if ((int64_t)x != (int32_t)x) { 787 SET_QC(); 788 return ((int64_t)x >> 63) ^ 0x7fffffff; 789 } 790 return x; 791 } 792 793 uint64_t HELPER(neon_widen_u8)(uint32_t x) 794 { 795 uint64_t tmp; 796 uint64_t ret; 797 ret = (uint8_t)x; 798 tmp = (uint8_t)(x >> 8); 799 ret |= tmp << 16; 800 tmp = (uint8_t)(x >> 16); 801 ret |= tmp << 32; 802 tmp = (uint8_t)(x >> 24); 803 ret |= tmp << 48; 804 return ret; 805 } 806 807 uint64_t HELPER(neon_widen_s8)(uint32_t x) 808 { 809 uint64_t tmp; 810 uint64_t ret; 811 ret = (uint16_t)(int8_t)x; 812 tmp = (uint16_t)(int8_t)(x >> 8); 813 ret |= tmp << 16; 814 tmp = (uint16_t)(int8_t)(x >> 16); 815 ret |= tmp << 32; 816 tmp = (uint16_t)(int8_t)(x >> 24); 817 ret |= tmp << 48; 818 return ret; 819 } 820 821 uint64_t HELPER(neon_widen_u16)(uint32_t x) 822 { 823 uint64_t high = (uint16_t)(x >> 16); 824 return ((uint16_t)x) | (high << 32); 825 } 826 827 uint64_t HELPER(neon_widen_s16)(uint32_t x) 828 { 829 uint64_t high = (int16_t)(x >> 16); 830 return ((uint32_t)(int16_t)x) | (high << 32); 831 } 832 833 uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b) 834 { 835 uint64_t mask; 836 mask = (a ^ b) & 0x8000800080008000ull; 837 a &= ~0x8000800080008000ull; 838 b &= ~0x8000800080008000ull; 839 return (a + b) ^ mask; 840 } 841 842 uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b) 843 { 844 uint64_t mask; 845 mask = (a ^ b) & 0x8000000080000000ull; 846 a &= ~0x8000000080000000ull; 847 b &= ~0x8000000080000000ull; 848 return (a + b) ^ mask; 849 } 850 851 uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b) 852 { 853 uint64_t tmp; 854 uint64_t tmp2; 855 856 tmp = a & 0x0000ffff0000ffffull; 857 tmp += (a >> 16) & 0x0000ffff0000ffffull; 858 tmp2 = b & 0xffff0000ffff0000ull; 859 tmp2 += (b << 16) & 0xffff0000ffff0000ull; 860 return ( tmp & 0xffff) 861 | ((tmp >> 16) & 0xffff0000ull) 862 | ((tmp2 << 16) & 0xffff00000000ull) 863 | ( tmp2 & 0xffff000000000000ull); 864 } 865 866 uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b) 867 { 868 uint32_t low = a + (a >> 32); 869 uint32_t high = b + (b >> 32); 870 return low + ((uint64_t)high << 32); 871 } 872 873 uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b) 874 { 875 uint64_t mask; 876 mask = (a ^ ~b) & 0x8000800080008000ull; 877 a |= 0x8000800080008000ull; 878 b &= ~0x8000800080008000ull; 879 return (a - b) ^ mask; 880 } 881 882 uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b) 883 { 884 uint64_t mask; 885 mask = (a ^ ~b) & 0x8000000080000000ull; 886 a |= 0x8000000080000000ull; 887 b &= ~0x8000000080000000ull; 888 return (a - b) ^ mask; 889 } 890 891 uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b) 892 { 893 uint32_t x, y; 894 uint32_t low, high; 895 896 x = a; 897 y = b; 898 low = x + y; 899 if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { 900 SET_QC(); 901 low = ((int32_t)x >> 31) ^ ~SIGNBIT; 902 } 903 x = a >> 32; 904 y = b >> 32; 905 high = x + y; 906 if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { 907 SET_QC(); 908 high = ((int32_t)x >> 31) ^ ~SIGNBIT; 909 } 910 return low | ((uint64_t)high << 32); 911 } 912 913 uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b) 914 { 915 uint64_t result; 916 917 result = a + b; 918 if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) { 919 SET_QC(); 920 result = ((int64_t)a >> 63) ^ ~SIGNBIT64; 921 } 922 return result; 923 } 924 925 /* We have to do the arithmetic in a larger type than 926 * the input type, because for example with a signed 32 bit 927 * op the absolute difference can overflow a signed 32 bit value. 928 */ 929 #define DO_ABD(dest, x, y, intype, arithtype) do { \ 930 arithtype tmp_x = (intype)(x); \ 931 arithtype tmp_y = (intype)(y); \ 932 dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \ 933 } while(0) 934 935 uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b) 936 { 937 uint64_t tmp; 938 uint64_t result; 939 DO_ABD(result, a, b, uint8_t, uint32_t); 940 DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t); 941 result |= tmp << 16; 942 DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t); 943 result |= tmp << 32; 944 DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t); 945 result |= tmp << 48; 946 return result; 947 } 948 949 uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b) 950 { 951 uint64_t tmp; 952 uint64_t result; 953 DO_ABD(result, a, b, int8_t, int32_t); 954 DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t); 955 result |= tmp << 16; 956 DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t); 957 result |= tmp << 32; 958 DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t); 959 result |= tmp << 48; 960 return result; 961 } 962 963 uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b) 964 { 965 uint64_t tmp; 966 uint64_t result; 967 DO_ABD(result, a, b, uint16_t, uint32_t); 968 DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t); 969 return result | (tmp << 32); 970 } 971 972 uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b) 973 { 974 uint64_t tmp; 975 uint64_t result; 976 DO_ABD(result, a, b, int16_t, int32_t); 977 DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t); 978 return result | (tmp << 32); 979 } 980 981 uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b) 982 { 983 uint64_t result; 984 DO_ABD(result, a, b, uint32_t, uint64_t); 985 return result; 986 } 987 988 uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b) 989 { 990 uint64_t result; 991 DO_ABD(result, a, b, int32_t, int64_t); 992 return result; 993 } 994 #undef DO_ABD 995 996 /* Widening multiply. Named type is the source type. */ 997 #define DO_MULL(dest, x, y, type1, type2) do { \ 998 type1 tmp_x = x; \ 999 type1 tmp_y = y; \ 1000 dest = (type2)((type2)tmp_x * (type2)tmp_y); \ 1001 } while(0) 1002 1003 uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b) 1004 { 1005 uint64_t tmp; 1006 uint64_t result; 1007 1008 DO_MULL(result, a, b, uint8_t, uint16_t); 1009 DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t); 1010 result |= tmp << 16; 1011 DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t); 1012 result |= tmp << 32; 1013 DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t); 1014 result |= tmp << 48; 1015 return result; 1016 } 1017 1018 uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b) 1019 { 1020 uint64_t tmp; 1021 uint64_t result; 1022 1023 DO_MULL(result, a, b, int8_t, uint16_t); 1024 DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t); 1025 result |= tmp << 16; 1026 DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t); 1027 result |= tmp << 32; 1028 DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t); 1029 result |= tmp << 48; 1030 return result; 1031 } 1032 1033 uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b) 1034 { 1035 uint64_t tmp; 1036 uint64_t result; 1037 1038 DO_MULL(result, a, b, uint16_t, uint32_t); 1039 DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t); 1040 return result | (tmp << 32); 1041 } 1042 1043 uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b) 1044 { 1045 uint64_t tmp; 1046 uint64_t result; 1047 1048 DO_MULL(result, a, b, int16_t, uint32_t); 1049 DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t); 1050 return result | (tmp << 32); 1051 } 1052 1053 uint64_t HELPER(neon_negl_u16)(uint64_t x) 1054 { 1055 uint16_t tmp; 1056 uint64_t result; 1057 result = (uint16_t)-x; 1058 tmp = -(x >> 16); 1059 result |= (uint64_t)tmp << 16; 1060 tmp = -(x >> 32); 1061 result |= (uint64_t)tmp << 32; 1062 tmp = -(x >> 48); 1063 result |= (uint64_t)tmp << 48; 1064 return result; 1065 } 1066 1067 uint64_t HELPER(neon_negl_u32)(uint64_t x) 1068 { 1069 uint32_t low = -x; 1070 uint32_t high = -(x >> 32); 1071 return low | ((uint64_t)high << 32); 1072 } 1073 1074 /* Saturating sign manipulation. */ 1075 /* ??? Make these use NEON_VOP1 */ 1076 #define DO_QABS8(x) do { \ 1077 if (x == (int8_t)0x80) { \ 1078 x = 0x7f; \ 1079 SET_QC(); \ 1080 } else if (x < 0) { \ 1081 x = -x; \ 1082 }} while (0) 1083 uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x) 1084 { 1085 neon_s8 vec; 1086 NEON_UNPACK(neon_s8, vec, x); 1087 DO_QABS8(vec.v1); 1088 DO_QABS8(vec.v2); 1089 DO_QABS8(vec.v3); 1090 DO_QABS8(vec.v4); 1091 NEON_PACK(neon_s8, x, vec); 1092 return x; 1093 } 1094 #undef DO_QABS8 1095 1096 #define DO_QNEG8(x) do { \ 1097 if (x == (int8_t)0x80) { \ 1098 x = 0x7f; \ 1099 SET_QC(); \ 1100 } else { \ 1101 x = -x; \ 1102 }} while (0) 1103 uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x) 1104 { 1105 neon_s8 vec; 1106 NEON_UNPACK(neon_s8, vec, x); 1107 DO_QNEG8(vec.v1); 1108 DO_QNEG8(vec.v2); 1109 DO_QNEG8(vec.v3); 1110 DO_QNEG8(vec.v4); 1111 NEON_PACK(neon_s8, x, vec); 1112 return x; 1113 } 1114 #undef DO_QNEG8 1115 1116 #define DO_QABS16(x) do { \ 1117 if (x == (int16_t)0x8000) { \ 1118 x = 0x7fff; \ 1119 SET_QC(); \ 1120 } else if (x < 0) { \ 1121 x = -x; \ 1122 }} while (0) 1123 uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x) 1124 { 1125 neon_s16 vec; 1126 NEON_UNPACK(neon_s16, vec, x); 1127 DO_QABS16(vec.v1); 1128 DO_QABS16(vec.v2); 1129 NEON_PACK(neon_s16, x, vec); 1130 return x; 1131 } 1132 #undef DO_QABS16 1133 1134 #define DO_QNEG16(x) do { \ 1135 if (x == (int16_t)0x8000) { \ 1136 x = 0x7fff; \ 1137 SET_QC(); \ 1138 } else { \ 1139 x = -x; \ 1140 }} while (0) 1141 uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x) 1142 { 1143 neon_s16 vec; 1144 NEON_UNPACK(neon_s16, vec, x); 1145 DO_QNEG16(vec.v1); 1146 DO_QNEG16(vec.v2); 1147 NEON_PACK(neon_s16, x, vec); 1148 return x; 1149 } 1150 #undef DO_QNEG16 1151 1152 uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x) 1153 { 1154 if (x == SIGNBIT) { 1155 SET_QC(); 1156 x = ~SIGNBIT; 1157 } else if ((int32_t)x < 0) { 1158 x = -x; 1159 } 1160 return x; 1161 } 1162 1163 uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x) 1164 { 1165 if (x == SIGNBIT) { 1166 SET_QC(); 1167 x = ~SIGNBIT; 1168 } else { 1169 x = -x; 1170 } 1171 return x; 1172 } 1173 1174 uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x) 1175 { 1176 if (x == SIGNBIT64) { 1177 SET_QC(); 1178 x = ~SIGNBIT64; 1179 } else if ((int64_t)x < 0) { 1180 x = -x; 1181 } 1182 return x; 1183 } 1184 1185 uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x) 1186 { 1187 if (x == SIGNBIT64) { 1188 SET_QC(); 1189 x = ~SIGNBIT64; 1190 } else { 1191 x = -x; 1192 } 1193 return x; 1194 } 1195 1196 /* NEON Float helpers. */ 1197 1198 /* Floating point comparisons produce an integer result. 1199 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. 1200 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 1201 */ 1202 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp) 1203 { 1204 float_status *fpst = fpstp; 1205 return -float32_eq_quiet(make_float32(a), make_float32(b), fpst); 1206 } 1207 1208 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp) 1209 { 1210 float_status *fpst = fpstp; 1211 return -float32_le(make_float32(b), make_float32(a), fpst); 1212 } 1213 1214 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp) 1215 { 1216 float_status *fpst = fpstp; 1217 return -float32_lt(make_float32(b), make_float32(a), fpst); 1218 } 1219 1220 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp) 1221 { 1222 float_status *fpst = fpstp; 1223 float32 f0 = float32_abs(make_float32(a)); 1224 float32 f1 = float32_abs(make_float32(b)); 1225 return -float32_le(f1, f0, fpst); 1226 } 1227 1228 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp) 1229 { 1230 float_status *fpst = fpstp; 1231 float32 f0 = float32_abs(make_float32(a)); 1232 float32 f1 = float32_abs(make_float32(b)); 1233 return -float32_lt(f1, f0, fpst); 1234 } 1235 1236 uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp) 1237 { 1238 float_status *fpst = fpstp; 1239 float64 f0 = float64_abs(make_float64(a)); 1240 float64 f1 = float64_abs(make_float64(b)); 1241 return -float64_le(f1, f0, fpst); 1242 } 1243 1244 uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp) 1245 { 1246 float_status *fpst = fpstp; 1247 float64 f0 = float64_abs(make_float64(a)); 1248 float64 f1 = float64_abs(make_float64(b)); 1249 return -float64_lt(f1, f0, fpst); 1250 } 1251 1252 #define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1)) 1253 1254 void HELPER(neon_qunzip8)(void *vd, void *vm) 1255 { 1256 uint64_t *rd = vd, *rm = vm; 1257 uint64_t zd0 = rd[0], zd1 = rd[1]; 1258 uint64_t zm0 = rm[0], zm1 = rm[1]; 1259 1260 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8) 1261 | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24) 1262 | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40) 1263 | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56); 1264 uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8) 1265 | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24) 1266 | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40) 1267 | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56); 1268 uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8) 1269 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24) 1270 | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40) 1271 | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56); 1272 uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8) 1273 | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24) 1274 | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40) 1275 | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56); 1276 1277 rm[0] = m0; 1278 rm[1] = m1; 1279 rd[0] = d0; 1280 rd[1] = d1; 1281 } 1282 1283 void HELPER(neon_qunzip16)(void *vd, void *vm) 1284 { 1285 uint64_t *rd = vd, *rm = vm; 1286 uint64_t zd0 = rd[0], zd1 = rd[1]; 1287 uint64_t zm0 = rm[0], zm1 = rm[1]; 1288 1289 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16) 1290 | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48); 1291 uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16) 1292 | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48); 1293 uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16) 1294 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48); 1295 uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16) 1296 | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48); 1297 1298 rm[0] = m0; 1299 rm[1] = m1; 1300 rd[0] = d0; 1301 rd[1] = d1; 1302 } 1303 1304 void HELPER(neon_qunzip32)(void *vd, void *vm) 1305 { 1306 uint64_t *rd = vd, *rm = vm; 1307 uint64_t zd0 = rd[0], zd1 = rd[1]; 1308 uint64_t zm0 = rm[0], zm1 = rm[1]; 1309 1310 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32); 1311 uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1312 uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32); 1313 uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1314 1315 rm[0] = m0; 1316 rm[1] = m1; 1317 rd[0] = d0; 1318 rd[1] = d1; 1319 } 1320 1321 void HELPER(neon_unzip8)(void *vd, void *vm) 1322 { 1323 uint64_t *rd = vd, *rm = vm; 1324 uint64_t zd = rd[0], zm = rm[0]; 1325 1326 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8) 1327 | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24) 1328 | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1329 | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56); 1330 uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8) 1331 | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24) 1332 | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40) 1333 | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1334 1335 rm[0] = m0; 1336 rd[0] = d0; 1337 } 1338 1339 void HELPER(neon_unzip16)(void *vd, void *vm) 1340 { 1341 uint64_t *rd = vd, *rm = vm; 1342 uint64_t zd = rd[0], zm = rm[0]; 1343 1344 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16) 1345 | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48); 1346 uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16) 1347 | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1348 1349 rm[0] = m0; 1350 rd[0] = d0; 1351 } 1352 1353 void HELPER(neon_qzip8)(void *vd, void *vm) 1354 { 1355 uint64_t *rd = vd, *rm = vm; 1356 uint64_t zd0 = rd[0], zd1 = rd[1]; 1357 uint64_t zm0 = rm[0], zm1 = rm[1]; 1358 1359 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8) 1360 | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24) 1361 | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40) 1362 | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56); 1363 uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8) 1364 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24) 1365 | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40) 1366 | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56); 1367 uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8) 1368 | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24) 1369 | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40) 1370 | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56); 1371 uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8) 1372 | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24) 1373 | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40) 1374 | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56); 1375 1376 rm[0] = m0; 1377 rm[1] = m1; 1378 rd[0] = d0; 1379 rd[1] = d1; 1380 } 1381 1382 void HELPER(neon_qzip16)(void *vd, void *vm) 1383 { 1384 uint64_t *rd = vd, *rm = vm; 1385 uint64_t zd0 = rd[0], zd1 = rd[1]; 1386 uint64_t zm0 = rm[0], zm1 = rm[1]; 1387 1388 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16) 1389 | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48); 1390 uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16) 1391 | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48); 1392 uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16) 1393 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48); 1394 uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16) 1395 | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48); 1396 1397 rm[0] = m0; 1398 rm[1] = m1; 1399 rd[0] = d0; 1400 rd[1] = d1; 1401 } 1402 1403 void HELPER(neon_qzip32)(void *vd, void *vm) 1404 { 1405 uint64_t *rd = vd, *rm = vm; 1406 uint64_t zd0 = rd[0], zd1 = rd[1]; 1407 uint64_t zm0 = rm[0], zm1 = rm[1]; 1408 1409 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32); 1410 uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32); 1411 uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1412 uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1413 1414 rm[0] = m0; 1415 rm[1] = m1; 1416 rd[0] = d0; 1417 rd[1] = d1; 1418 } 1419 1420 void HELPER(neon_zip8)(void *vd, void *vm) 1421 { 1422 uint64_t *rd = vd, *rm = vm; 1423 uint64_t zd = rd[0], zm = rm[0]; 1424 1425 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8) 1426 | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24) 1427 | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1428 | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56); 1429 uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8) 1430 | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24) 1431 | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40) 1432 | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1433 1434 rm[0] = m0; 1435 rd[0] = d0; 1436 } 1437 1438 void HELPER(neon_zip16)(void *vd, void *vm) 1439 { 1440 uint64_t *rd = vd, *rm = vm; 1441 uint64_t zd = rd[0], zm = rm[0]; 1442 1443 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16) 1444 | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48); 1445 uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16) 1446 | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1447 1448 rm[0] = m0; 1449 rd[0] = d0; 1450 } 1451