/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

/* Sign bit of a 32-bit / 64-bit value, used by the saturation helpers. */
#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

/* Record cumulative saturation in the QC flag. */
#define SET_QC() env->vfp.qc[0] = 1

/*
 * NEON_TYPEn expands to a struct holding n lanes of the given element
 * type, laid out so the whole struct can be type-punned with a uint32_t
 * (see NEON_UNPACK/NEON_PACK below).  Lane v1 is always the least
 * significant lane, which is why the field order flips on big-endian
 * hosts.
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type. */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.
 */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* Apply NEON_FN to every lane of a 1/2/4-lane vector struct. */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/*
 * Body of a 32-bit-at-a-time binary helper: unpack both operands,
 * apply NEON_FN lane by lane, repack the result.
 */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP but the helper also receives the CPU state (for SET_QC). */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* Whole-vector (gvec) binary helper: apply NEON_FN elementwise over
 * the operation size from DESC, then clear any tail bytes. */
#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

/* As NEON_GVEC_VOP2, with the CPU state appended (for SET_QC). */
#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, void *venv, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    CPUARMState *env = venv; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

/* Pairwise operations.
*/ 145 /* For 32-bit elements each segment only contains a single element, so 146 the elementwise and pairwise operations are the same. */ 147 #define NEON_PDO2 \ 148 NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ 149 NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2); 150 #define NEON_PDO4 \ 151 NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ 152 NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \ 153 NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \ 154 NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \ 155 156 #define NEON_POP(name, vtype, n) \ 157 uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ 158 { \ 159 uint32_t res; \ 160 vtype vsrc1; \ 161 vtype vsrc2; \ 162 vtype vdest; \ 163 NEON_UNPACK(vtype, vsrc1, arg1); \ 164 NEON_UNPACK(vtype, vsrc2, arg2); \ 165 NEON_PDO##n; \ 166 NEON_PACK(vtype, res, vdest); \ 167 return res; \ 168 } 169 170 /* Unary operators. */ 171 #define NEON_VOP1(name, vtype, n) \ 172 uint32_t HELPER(glue(neon_,name))(uint32_t arg) \ 173 { \ 174 vtype vsrc1; \ 175 vtype vdest; \ 176 NEON_UNPACK(vtype, vsrc1, arg); \ 177 NEON_DO##n; \ 178 NEON_PACK(vtype, arg, vdest); \ 179 return arg; \ 180 } 181 182 #define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1 183 NEON_VOP(hadd_s8, neon_s8, 4) 184 NEON_VOP(hadd_u8, neon_u8, 4) 185 NEON_VOP(hadd_s16, neon_s16, 2) 186 NEON_VOP(hadd_u16, neon_u16, 2) 187 #undef NEON_FN 188 189 int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2) 190 { 191 int32_t dest; 192 193 dest = (src1 >> 1) + (src2 >> 1); 194 if (src1 & src2 & 1) 195 dest++; 196 return dest; 197 } 198 199 uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2) 200 { 201 uint32_t dest; 202 203 dest = (src1 >> 1) + (src2 >> 1); 204 if (src1 & src2 & 1) 205 dest++; 206 return dest; 207 } 208 209 #define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1 210 NEON_VOP(rhadd_s8, neon_s8, 4) 211 NEON_VOP(rhadd_u8, neon_u8, 4) 212 NEON_VOP(rhadd_s16, neon_s16, 2) 213 NEON_VOP(rhadd_u16, neon_u16, 2) 214 #undef NEON_FN 215 216 int32_t HELPER(neon_rhadd_s32)(int32_t src1, 
int32_t src2) 217 { 218 int32_t dest; 219 220 dest = (src1 >> 1) + (src2 >> 1); 221 if ((src1 | src2) & 1) 222 dest++; 223 return dest; 224 } 225 226 uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2) 227 { 228 uint32_t dest; 229 230 dest = (src1 >> 1) + (src2 >> 1); 231 if ((src1 | src2) & 1) 232 dest++; 233 return dest; 234 } 235 236 #define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1 237 NEON_VOP(hsub_s8, neon_s8, 4) 238 NEON_VOP(hsub_u8, neon_u8, 4) 239 NEON_VOP(hsub_s16, neon_s16, 2) 240 NEON_VOP(hsub_u16, neon_u16, 2) 241 #undef NEON_FN 242 243 int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2) 244 { 245 int32_t dest; 246 247 dest = (src1 >> 1) - (src2 >> 1); 248 if ((~src1) & src2 & 1) 249 dest--; 250 return dest; 251 } 252 253 uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2) 254 { 255 uint32_t dest; 256 257 dest = (src1 >> 1) - (src2 >> 1); 258 if ((~src1) & src2 & 1) 259 dest--; 260 return dest; 261 } 262 263 #define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2 264 NEON_POP(pmin_s8, neon_s8, 4) 265 NEON_POP(pmin_u8, neon_u8, 4) 266 NEON_POP(pmin_s16, neon_s16, 2) 267 NEON_POP(pmin_u16, neon_u16, 2) 268 #undef NEON_FN 269 270 #define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? 
    src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

/*
 * Shifts: the shift count is the signed low byte of the second
 * operand; the do_[su]qrshl_* helpers take the element width, a
 * rounding flag and a saturation-status pointer (NULL when the
 * operation cannot saturate).
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

/* Rounding shifts, signed. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

/* Rounding shifts, unsigned. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}

/* Saturating (non-rounding) shifts; saturation is recorded in
 * env->vfp.qc. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Saturating shifts, signed. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Signed-input, unsigned-result saturating shifts (via do_suqrshl_*). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Saturating rounding shifts, unsigned. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

/* Saturating rounding shifts, signed. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

/* Lanewise 8-bit add on a packed uint32_t: clear the lane sign bits so
 * carries cannot cross lane boundaries, then restore the top bit of
 * each lane with xor (addition mod 2 in that bit). */
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}

/* Same trick for two packed 16-bit lanes. */
uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}

#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* VTST: all-ones lane if the operands share any set bit, else zero. */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

/* Count Leading Sign/Zero Bits.
 */
static inline int do_clz8(uint8_t x)
{
    int n;
    /* Shift until x is zero; each set bit shrinks the zero count. */
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}

static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}

#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* CLS: count bits equal to the sign bit, excluding the sign bit itself;
 * complement negative inputs so it reduces to count-leading-zeros - 1. */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}

/* Bit count. */
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
{
    /* SWAR popcount, computed independently for each byte lane. */
    x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
    return x;
}

/* Reverse bits in each 8 bit word */
uint32_t HELPER(neon_rbit_u8)(uint32_t x)
{
    /* Swap nibbles, then permute the bits within each nibble pair. */
    x = ((x & 0xf0f0f0f0) >> 4)
      | ((x & 0x0f0f0f0f) << 4);
    x = ((x & 0x88888888) >> 3)
      | ((x & 0x44444444) >> 1)
      | ((x & 0x22222222) << 1)
      | ((x & 0x11111111) << 3);
    return x;
}

/*
 * Saturating doubling multiply-high: tmp = 2 * src1 * src2, saturated
 * to 32 bits, optionally rounded by adding 1 << 15, returning the top
 * 16 bits.  Both the doubling and the rounding step can overflow; each
 * saturates and sets QC.
 */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) \
    NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

/* As NEON_QDMULH16 but for 32-bit lanes, using 64-bit intermediates. */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32

/* Narrowing: pack the low half of each 16/32-bit lane of a 64-bit
 * vector into a 32-bit result. */
uint32_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

uint32_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

/* Narrow-high: pack the high half of each lane instead. */
uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* Rounded narrow-high: add half an LSB of the result (1 << 7) to each
 * 16-bit lane before taking its high byte.  The pre-mask clears bits
 * that cannot affect the result and ensures any carry crossing a lane
 * boundary lands only in bits the next lane's result does not use. */
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* Narrow each signed 16-bit lane to unsigned 8 bits, saturating:
 * negative lanes clamp to 0 (their result bits are simply left clear),
 * lanes above 0xff clamp to 0xff; QC is set when any lane saturates. */
uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Unsigned 16 -> unsigned 8 saturating narrow. */
uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Signed 16 -> signed 8 saturating narrow: out-of-range lanes clamp to
 * 0x7f or 0x80 depending on sign ((s >> 15) ^ 0x7f). */
uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Signed 32 -> unsigned 16 saturating narrow (two lanes). */
uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

/* Unsigned 32 -> unsigned 16 saturating narrow. */
uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high >
        0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

/* Signed 32 -> signed 16 saturating narrow; (v >> 31) ^ 0x7fff yields
 * 0x7fff for positive overflow and 0xffff8000 for negative. */
uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

/* Signed 64 -> unsigned 32 saturating narrow. */
uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}

/* Widening: expand each 8/16-bit lane of a packed uint32_t into a
 * 16/32-bit lane of the 64-bit result. */
uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    /* Sign-extend to 16 bits only, so lanes remain independent. */
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}

/* Lanewise adds on packed 16/32-bit lanes of a 64-bit value: clear the
 * lane sign bits so carries cannot cross lanes, then restore the top
 * bit of each lane with xor. */
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}

/* Pairwise add-long: sum adjacent 16-bit lanes into 32-bit lanes;
 * a supplies the low result lanes, b the high ones.  tmp holds the
 * sums aligned low in each 32-bit half, tmp2 aligned high. */
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return ( tmp & 0xffff)
         | ((tmp >> 16) & 0xffff0000ull)
         | ((tmp2 << 16) & 0xffff00000000ull)
         | ( tmp2 & 0xffff000000000000ull);
}

uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}

/* Lanewise subtract: force a's lane sign bits high so borrows cannot
 * cross lane boundaries, then fix each lane's top bit via the mask. */
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}

/* Saturating add of two packed 32-bit lanes: overflow occurs when both
 * operands have the same sign and the sum's sign differs; the lane then
 * clamps to INT32_MAX/INT32_MIN and QC is set. */
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

/* 64-bit saturating add, same overflow rule as the 32-bit version. */
uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}

/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

/* Absolute-difference-long: |a - b| per lane, widened into the 64-bit
 * result. */
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a
           >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD

/* Widening multiply. Named type is the source type. */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/* Products are assembled as unsigned 16-bit lanes; signedness only
 * affects the value of each individual product. */
uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16,
            int16_t, uint32_t);
    return result | (tmp << 32);
}

/* Negate each 16/32-bit lane of a 64-bit vector. */
uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}

/* Saturating sign manipulation. */
/* ??? Make these use NEON_VOP1 */
/* Saturating absolute value: INT8_MIN clamps to INT8_MAX and sets QC. */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

/* Saturating negate: INT8_MIN clamps to INT8_MAX and sets QC. */
#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

/* 16-bit variants of the same saturating abs/negate. */
#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

#define DO_QNEG16(x) do \
    { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16

/* Scalar saturating abs/negate: the INT32_MIN / INT64_MIN input clamps
 * to the maximum positive value and sets QC. */
uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if ((int32_t)x < 0) {
        x = -x;
    }
    return x;
}

uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else if ((int64_t)x < 0) {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else {
        x = -x;
    }
    return x;
}

/* NEON Float helpers. */

/* Floating point comparisons produce an integer result.
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
}

/* a >= b is computed as b <= a (and a > b as b < a below). */
uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_le(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_lt(make_float32(b), make_float32(a), fpst);
}

/* Absolute compares: |a| >= |b| and |a| > |b|. */
uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_le(f1, f0, fpst);
}

uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_lt(f1, f0, fpst);
}

uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_le(f1, f0, fpst);
}

uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_lt(f1, f0, fpst);
}

/* Extract element N of SIZE bits from vector word V. */
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))

/* VUZP (de-interleave): d receives the even elements of d:m, m the odd
 * ones. */
void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) <<
24) 1292 | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40) 1293 | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56); 1294 uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8) 1295 | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24) 1296 | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40) 1297 | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56); 1298 uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8) 1299 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24) 1300 | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40) 1301 | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56); 1302 uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8) 1303 | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24) 1304 | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40) 1305 | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56); 1306 1307 rm[0] = m0; 1308 rm[1] = m1; 1309 rd[0] = d0; 1310 rd[1] = d1; 1311 } 1312 1313 void HELPER(neon_qunzip16)(void *vd, void *vm) 1314 { 1315 uint64_t *rd = vd, *rm = vm; 1316 uint64_t zd0 = rd[0], zd1 = rd[1]; 1317 uint64_t zm0 = rm[0], zm1 = rm[1]; 1318 1319 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16) 1320 | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48); 1321 uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16) 1322 | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48); 1323 uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16) 1324 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48); 1325 uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16) 1326 | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48); 1327 1328 rm[0] = m0; 1329 rm[1] = m1; 1330 rd[0] = d0; 1331 rd[1] = d1; 1332 } 1333 1334 void HELPER(neon_qunzip32)(void *vd, void *vm) 1335 { 1336 uint64_t *rd = vd, *rm = vm; 1337 uint64_t zd0 = rd[0], zd1 = rd[1]; 1338 uint64_t zm0 = rm[0], zm1 = rm[1]; 1339 1340 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32); 1341 uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1342 uint64_t m0 = ELEM(zd0, 1, 32) | 
(ELEM(zd1, 1, 32) << 32); 1343 uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1344 1345 rm[0] = m0; 1346 rm[1] = m1; 1347 rd[0] = d0; 1348 rd[1] = d1; 1349 } 1350 1351 void HELPER(neon_unzip8)(void *vd, void *vm) 1352 { 1353 uint64_t *rd = vd, *rm = vm; 1354 uint64_t zd = rd[0], zm = rm[0]; 1355 1356 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8) 1357 | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24) 1358 | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1359 | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56); 1360 uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8) 1361 | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24) 1362 | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40) 1363 | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1364 1365 rm[0] = m0; 1366 rd[0] = d0; 1367 } 1368 1369 void HELPER(neon_unzip16)(void *vd, void *vm) 1370 { 1371 uint64_t *rd = vd, *rm = vm; 1372 uint64_t zd = rd[0], zm = rm[0]; 1373 1374 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16) 1375 | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48); 1376 uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16) 1377 | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1378 1379 rm[0] = m0; 1380 rd[0] = d0; 1381 } 1382 1383 void HELPER(neon_qzip8)(void *vd, void *vm) 1384 { 1385 uint64_t *rd = vd, *rm = vm; 1386 uint64_t zd0 = rd[0], zd1 = rd[1]; 1387 uint64_t zm0 = rm[0], zm1 = rm[1]; 1388 1389 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8) 1390 | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24) 1391 | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40) 1392 | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56); 1393 uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8) 1394 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24) 1395 | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40) 1396 | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56); 1397 uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8) 1398 | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24) 1399 | 
(ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40) 1400 | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56); 1401 uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8) 1402 | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24) 1403 | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40) 1404 | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56); 1405 1406 rm[0] = m0; 1407 rm[1] = m1; 1408 rd[0] = d0; 1409 rd[1] = d1; 1410 } 1411 1412 void HELPER(neon_qzip16)(void *vd, void *vm) 1413 { 1414 uint64_t *rd = vd, *rm = vm; 1415 uint64_t zd0 = rd[0], zd1 = rd[1]; 1416 uint64_t zm0 = rm[0], zm1 = rm[1]; 1417 1418 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16) 1419 | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48); 1420 uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16) 1421 | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48); 1422 uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16) 1423 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48); 1424 uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16) 1425 | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48); 1426 1427 rm[0] = m0; 1428 rm[1] = m1; 1429 rd[0] = d0; 1430 rd[1] = d1; 1431 } 1432 1433 void HELPER(neon_qzip32)(void *vd, void *vm) 1434 { 1435 uint64_t *rd = vd, *rm = vm; 1436 uint64_t zd0 = rd[0], zd1 = rd[1]; 1437 uint64_t zm0 = rm[0], zm1 = rm[1]; 1438 1439 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32); 1440 uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32); 1441 uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1442 uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1443 1444 rm[0] = m0; 1445 rm[1] = m1; 1446 rd[0] = d0; 1447 rd[1] = d1; 1448 } 1449 1450 void HELPER(neon_zip8)(void *vd, void *vm) 1451 { 1452 uint64_t *rd = vd, *rm = vm; 1453 uint64_t zd = rd[0], zm = rm[0]; 1454 1455 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8) 1456 | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24) 1457 | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1458 
| (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56); 1459 uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8) 1460 | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24) 1461 | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40) 1462 | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1463 1464 rm[0] = m0; 1465 rd[0] = d0; 1466 } 1467 1468 void HELPER(neon_zip16)(void *vd, void *vm) 1469 { 1470 uint64_t *rd = vd, *rm = vm; 1471 uint64_t zd = rd[0], zm = rm[0]; 1472 1473 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16) 1474 | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48); 1475 uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16) 1476 | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1477 1478 rm[0] = m0; 1479 rd[0] = d0; 1480 } 1481