/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */
#include "qemu/osdep.h"

#include "cpu.h"
#include "exec/helper-proto.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

/* Sign-bit masks for a 32-bit and a 64-bit value. */
#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

/* Set the cumulative saturation (QC) flag; needs a CPUARMState *env
 * in scope, which the NEON_VOP_ENV-generated helpers provide. */
#define SET_QC() env->vfp.qc[0] = 1

/*
 * Struct types viewing a 32-bit value as 1, 2 or 4 lanes.  Member
 * order depends on host endianness so that lane v1 always maps to
 * the least significant bits of the packed uint32_t.
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type. */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.
 */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* Apply NEON_FN elementwise to each lane of a 1/2/4-lane vector. */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/* Body of a lanewise binary helper: unpack both 32-bit operands into
 * lane structs, apply NEON_FN per lane, and repack the result. */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Define helper neon_<name>(arg1, arg2) from the current NEON_FN. */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP, but the helper also receives env (so NEON_FN may
 * use SET_QC() or env->vfp.qc). */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* Pairwise operations. */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same. */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

/* Define a pairwise helper: lanes combine adjacent pairs, with the
 * results from arg1 in the low lanes and from arg2 in the high lanes. */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators.
 */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}


/* Unsigned saturating add for a lane of the given type: if the sum
 * does not fit in 'type', set QC and clamp to the all-ones maximum. */
#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = ~0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qadd_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qadd_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT

/* Unsigned saturating 32-bit add: clamp to UINT32_MAX on overflow. */
uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    if (res < a) {
        /* unsigned wrap-around implies overflow */
        SET_QC();
        res = ~0;
    }
    return res;
}

/* Unsigned saturating 64-bit add: clamp to UINT64_MAX on overflow. */
uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    if (res < src1) {
        SET_QC();
        res = ~(uint64_t)0;
    }
    return res;
}

/* Signed saturating add for a lane of the given type: on overflow set
 * QC and clamp to the type's max (positive addend) or min (otherwise;
 * the min is produced by the narrowing store into 'dest'). */
#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 > 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qadd_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qadd_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT

/* Signed saturating 32-bit add.  Overflow occurred iff the operands
 * have the same sign and the result's sign differs; the clamp value
 * (INT32_MAX or INT32_MIN) is derived from the sign of 'a'. */
uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
        SET_QC();
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}

/* Signed saturating 64-bit add; same sign logic as the 32-bit case. */
uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}

/* Unsigned saturating accumulate of signed value
 *
 * Op1/Rn is treated as signed
 * Op2/Rd is treated as unsigned
 *
 * Explicit casting is used to ensure the correct sign extension of
 * inputs. The result is treated as a unsigned value and saturated as such.
 *
 * We use a macro for the 8/16 bit cases which expects signed integers of va,
 * vb, and vr for interim calculation and an unsigned 32 bit result value r.
 */

#define USATACC(bits, shift) \
    do { \
        va = sextract32(a, shift, bits); \
        vb = extract32(b, shift, bits); \
        vr = va + vb; \
        if (vr > UINT##bits##_MAX) { \
            SET_QC(); \
            vr = UINT##bits##_MAX; \
        } else if (vr < 0) { \
            SET_QC(); \
            vr = 0; \
        } \
        r = deposit32(r, shift, bits, vr); \
    } while (0)

uint32_t HELPER(neon_uqadd_s8)(CPUARMState *env, uint32_t a, uint32_t b)
{
    /* va/vb/vr are int16_t: wide enough for the 8-bit interim sums. */
    int16_t va, vb, vr;
    uint32_t r = 0;

    USATACC(8, 0);
    USATACC(8, 8);
    USATACC(8, 16);
    USATACC(8, 24);
    return r;
}

uint32_t HELPER(neon_uqadd_s16)(CPUARMState *env, uint32_t a, uint32_t b)
{
    /* int32_t interim values hold the 16-bit sums without overflow. */
    int32_t va, vb, vr;
    uint64_t r = 0;

    USATACC(16, 0);
    USATACC(16, 16);
    return r;
}

#undef USATACC

/* 32-bit unsigned saturating accumulate of a signed value, done in
 * 64-bit arithmetic, clamped to [0, UINT32_MAX]. */
uint32_t HELPER(neon_uqadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    int64_t va = (int32_t)a;
    int64_t vb = (uint32_t)b;
    int64_t vr = va + vb;
    if (vr > UINT32_MAX) {
        SET_QC();
        vr = UINT32_MAX;
    } else if (vr < 0) {
        SET_QC();
        vr = 0;
    }
    return vr;
}

uint64_t HELPER(neon_uqadd_s64)(CPUARMState *env,
uint64_t a, uint64_t b)
{
    uint64_t res;
    res = a + b;
    /* We only need to look at the pattern of SIGN bits to detect
     * +ve/-ve saturation
     */
    if (~a & b & ~res & SIGNBIT64) {
        SET_QC();
        res = UINT64_MAX;
    } else if (a & ~b & res & SIGNBIT64) {
        SET_QC();
        res = 0;
    }
    return res;
}

/* Signed saturating accumulate of unsigned value
 *
 * Op1/Rn is treated as unsigned
 * Op2/Rd is treated as signed
 *
 * The result is treated as a signed value and saturated as such
 *
 * We use a macro for the 8/16 bit cases which expects signed integers of va,
 * vb, and vr for interim calculation and an unsigned 32 bit result value r.
 */

#define SSATACC(bits, shift) \
    do { \
        va = extract32(a, shift, bits); \
        vb = sextract32(b, shift, bits); \
        vr = va + vb; \
        if (vr > INT##bits##_MAX) { \
            SET_QC(); \
            vr = INT##bits##_MAX; \
        } else if (vr < INT##bits##_MIN) { \
            SET_QC(); \
            vr = INT##bits##_MIN; \
        } \
        r = deposit32(r, shift, bits, vr); \
    } while (0)

uint32_t HELPER(neon_sqadd_u8)(CPUARMState *env, uint32_t a, uint32_t b)
{
    /* int16_t interim values hold the 8-bit sums without overflow. */
    int16_t va, vb, vr;
    uint32_t r = 0;

    SSATACC(8, 0);
    SSATACC(8, 8);
    SSATACC(8, 16);
    SSATACC(8, 24);
    return r;
}

uint32_t HELPER(neon_sqadd_u16)(CPUARMState *env, uint32_t a, uint32_t b)
{
    /* int32_t interim values hold the 16-bit sums without overflow. */
    int32_t va, vb, vr;
    uint32_t r = 0;

    SSATACC(16, 0);
    SSATACC(16, 16);

    return r;
}

#undef SSATACC

/* 32-bit signed saturating accumulate of an unsigned value, computed
 * in 64 bits and clamped to [INT32_MIN, INT32_MAX]. */
uint32_t HELPER(neon_sqadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    int64_t res;
    int64_t op1 = (uint32_t)a;
    int64_t op2 = (int32_t)b;
    res = op1 + op2;
    if (res > INT32_MAX) {
        SET_QC();
        res = INT32_MAX;
    } else if (res < INT32_MIN) {
        SET_QC();
        res = INT32_MIN;
    }
    return res;
}

uint64_t HELPER(neon_sqadd_u64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t res;
    res = a + b;
    /* We only need to look at the pattern of SIGN bits to detect an overflow */
    if (((a & res)
         | (~b & res)
         | (a & ~b)) & SIGNBIT64) {
        SET_QC();
        res = INT64_MAX;
    }
    return res;
}


/* Unsigned saturating subtract for a lane of the given type: if the
 * difference underflows 'type', set QC and clamp to zero. */
#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = 0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qsub_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qsub_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT

/* Unsigned saturating 32-bit subtract: clamp to 0 on underflow. */
uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a - b;
    if (res > a) {
        /* unsigned wrap-around implies underflow */
        SET_QC();
        res = 0;
    }
    return res;
}

/* Unsigned saturating 64-bit subtract: clamp to 0 on underflow. */
uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    if (src1 < src2) {
        SET_QC();
        res = 0;
    } else {
        res = src1 - src2;
    }
    return res;
}

/* Signed saturating subtract for a lane of the given type: on
 * overflow set QC and clamp to the type's max (negative subtrahend)
 * or min (otherwise, via the narrowing store into 'dest'). */
#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 < 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qsub_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qsub_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT

/* Signed saturating 32-bit subtract.  Overflow occurred iff the
 * operands differ in sign and the result's sign differs from a's. */
uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a - b;
    if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
        SET_QC();
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}

/* Signed saturating 64-bit subtract; same sign logic as 32-bit. */
uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 - src2;
    if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}

/* Halving add: (src1 + src2) >> 1 per lane, truncating toward
 * negative infinity for the signed variants. */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN

/* 32-bit halving add, computed without intermediate overflow by
 * pre-shifting and adding back the lost carry of the two low bits. */
int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

/* Rounding halving add: (src1 + src2 + 1) >> 1 per lane. */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN

/* 32-bit rounding halving add; the rounding increment applies when
 * either operand has its low bit set. */
int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

/* Halving subtract: (src1 - src2) >> 1 per lane. */
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN

/* 32-bit halving subtract, with the borrow from the low bits
 * reapplied after the pre-shifted subtraction. */
int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

/* Pairwise minimum of adjacent lanes. */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

/* Pairwise maximum of adjacent lanes. */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

/* Plain (non-saturating, non-rounding) shifts by a signed per-lane
 * shift count, implemented via the shared do_*qrshl_bhs helpers with
 * rounding off and no saturation flag pointer. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

/* Rounding shifts: same helpers with the rounding flag set. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return
do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}

/* Saturating (non-rounding) shifts: pass env->vfp.qc so the shared
 * helpers can record saturation. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Saturating shift of a signed value producing an unsigned result
 * (do_suqrshl_* variants). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Saturating rounding shifts: rounding flag set, QC recorded. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

/* Modular (wrapping) lanewise byte add: mask off the per-lane top
 * bits, add, then restore the carry-free top bits via XOR so no
 * carry crosses a lane boundary. */
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}

/* As neon_add_u8 but for two 16-bit lanes. */
uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}

/* Pairwise add of adjacent lanes (wrapping). */
#define NEON_FN(dest, src1, src2) dest = src1 + src2
NEON_POP(padd_u8, neon_u8, 4)
NEON_POP(padd_u16, neon_u16, 2)
#undef NEON_FN

/* Lanewise wrapping subtract. */
#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

/* Lanewise wrapping multiply. */
#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* Lanewise test: all-ones if the lanes share any set bit, else 0. */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

/* Count Leading Sign/Zero Bits. */
static inline int do_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}

static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}

#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* Count leading sign bits: complement negative lanes so clz counts
 * copies of the sign bit, minus one for the sign bit itself. */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

/* 32-bit count leading sign bits. */
uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}

/* Bit count.
 */
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
{
    /* Classic parallel popcount, kept within each byte lane. */
    x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
    return x;
}

/* Reverse bits in each 8 bit word */
uint32_t HELPER(neon_rbit_u8)(uint32_t x)
{
    /* Swap nibbles, then rearrange the four bits within each nibble. */
    x = ((x & 0xf0f0f0f0) >> 4)
        | ((x & 0x0f0f0f0f) << 4);
    x = ((x & 0x88888888) >> 3)
        | ((x & 0x44444444) >> 1)
        | ((x & 0x22222222) << 1)
        | ((x & 0x11111111) << 3);
    return x;
}

/* Saturating doubling multiply returning high half, 16-bit lanes.
 * The doubled product saturates only for 0x8000 * 0x8000; with
 * 'round' set, add the rounding constant and saturate again if that
 * addition overflows. */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

/* As NEON_QDMULH16 but for a single 32-bit lane using a 64-bit
 * intermediate product. */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32

/* Narrow four 16-bit lanes to four bytes by truncation. */
uint32_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

/* Narrow two 32-bit lanes to two 16-bit lanes by truncation. */
uint32_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

/* Narrow by taking the high half of each 16-bit lane. */
uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

/* Narrow by taking the high half of each 32-bit lane. */
uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* Rounding narrow-high: add 0x80 to each 16-bit lane (carry already
 * masked off so it cannot cross lanes) before taking the high byte. */
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

/* Rounding narrow-high for two 32-bit lanes. */
uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* Narrow signed 16-bit lanes to unsigned bytes with saturation:
 * negative lanes clamp to 0 (the lane's bits in res stay zero),
 * values above 0xff clamp to 0xff; QC is set on any saturation. */
uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else  { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Narrow unsigned 16-bit lanes to bytes, clamping to 0xff. */
uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Narrow signed 16-bit lanes to signed bytes, clamping to the int8_t
 * range; the clamp value is 0x7f or 0x80 depending on the sign. */
uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Narrow signed 32-bit lanes to unsigned 16-bit, clamping to
 * [0, 0xffff] with QC on saturation. */
uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

/* Narrow unsigned 32-bit lanes to 16-bit, clamping to 0xffff. */
uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

/* Narrow signed 32-bit lanes to signed 16-bit, clamping to the
 * int16_t range (0x7fff / 0x8000 chosen by the sign bit). */
uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

/* Narrow a signed 64-bit value to unsigned 32-bit with saturation. */
uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Narrow an unsigned 64-bit value to 32-bit, clamping to UINT32_MAX. */
uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t
HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    /* Narrow signed 64-bit to signed 32-bit, clamping to the
     * int32_t range (0x7fffffff / 0x80000000 chosen by sign). */
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}

/* Widen four unsigned bytes to four 16-bit lanes in a uint64_t. */
uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

/* Widen four signed bytes to four sign-extended 16-bit lanes. */
uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

/* Widen two unsigned 16-bit lanes to two 32-bit lanes. */
uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

/* Widen two signed 16-bit lanes to two sign-extended 32-bit lanes. */
uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}

/* Lanewise wrapping add of four 16-bit lanes packed in uint64_t,
 * using the mask trick to stop carries crossing lane boundaries. */
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

/* As neon_addl_u16 but for two 32-bit lanes. */
uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}

/* Pairwise long add: sum adjacent 16-bit lanes of each operand into
 * 32-bit results; a's sums land in the low lanes, b's in the high. */
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return    ( tmp         & 0xffff)
            | ((tmp  >> 16) & 0xffff0000ull)
            | ((tmp2 << 16) & 0xffff00000000ull)
            | ( tmp2        & 0xffff000000000000ull);
}

/* Pairwise long add of 32-bit lanes into 64-bit results. */
uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}

/* Lanewise wrapping subtract of four 16-bit lanes: force the top bit
 * of each 'a' lane so borrows cannot propagate, then fix up with the
 * precomputed mask. */
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

/* As neon_subl_u16 but for two 32-bit lanes. */
uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}

/* Signed saturating add on each 32-bit half of a 64-bit pair. */
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

/* Signed saturating 64-bit add (same logic as neon_qadd_s64). */
uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}

/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? \
tmp_x - tmp_y : tmp_y - tmp_x); \ 1223 } while(0) 1224 1225 uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b) 1226 { 1227 uint64_t tmp; 1228 uint64_t result; 1229 DO_ABD(result, a, b, uint8_t, uint32_t); 1230 DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t); 1231 result |= tmp << 16; 1232 DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t); 1233 result |= tmp << 32; 1234 DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t); 1235 result |= tmp << 48; 1236 return result; 1237 } 1238 1239 uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b) 1240 { 1241 uint64_t tmp; 1242 uint64_t result; 1243 DO_ABD(result, a, b, int8_t, int32_t); 1244 DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t); 1245 result |= tmp << 16; 1246 DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t); 1247 result |= tmp << 32; 1248 DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t); 1249 result |= tmp << 48; 1250 return result; 1251 } 1252 1253 uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b) 1254 { 1255 uint64_t tmp; 1256 uint64_t result; 1257 DO_ABD(result, a, b, uint16_t, uint32_t); 1258 DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t); 1259 return result | (tmp << 32); 1260 } 1261 1262 uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b) 1263 { 1264 uint64_t tmp; 1265 uint64_t result; 1266 DO_ABD(result, a, b, int16_t, int32_t); 1267 DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t); 1268 return result | (tmp << 32); 1269 } 1270 1271 uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b) 1272 { 1273 uint64_t result; 1274 DO_ABD(result, a, b, uint32_t, uint64_t); 1275 return result; 1276 } 1277 1278 uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b) 1279 { 1280 uint64_t result; 1281 DO_ABD(result, a, b, int32_t, int64_t); 1282 return result; 1283 } 1284 #undef DO_ABD 1285 1286 /* Widening multiply. Named type is the source type. 
*/
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x;                           \
    type1 tmp_y = y;                           \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

/* Widening multiply: four u8 lane pairs -> four u16 products. */
uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/*
 * Signed variant.  The multiply is done in uint16_t: the low 16 bits
 * of the product of the sign-extended values equal the 2's-complement
 * representation of the signed 16-bit product, so each lane is correct.
 */
uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/* Widening multiply: two u16 lane pairs -> two u32 products. */
uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

/* Signed variant of neon_mull_u16 (products computed in uint32_t). */
uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}

/* Negate each of four 16-bit lanes, wrapping (non-saturating). */
uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

/* Negate each of two 32-bit lanes, wrapping (non-saturating). */
uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}

/* Saturating sign manipulation. */
/* ??? Make these use NEON_VOP1 */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
/* Saturating absolute value, four s8 lanes: INT8_MIN -> INT8_MAX + QC. */
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
/* Saturating negation, four s8 lanes: INT8_MIN -> INT8_MAX + QC. */
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
/* Saturating absolute value, two s16 lanes: INT16_MIN -> INT16_MAX + QC. */
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
/* Saturating negation, two s16 lanes: INT16_MIN -> INT16_MAX + QC. */
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16

/* Saturating absolute value, s32: INT32_MIN -> INT32_MAX + QC. */
uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if ((int32_t)x < 0) {
        x = -x;
    }
    return x;
}

/* Saturating negation, s32: INT32_MIN -> INT32_MAX + QC. */
uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else {
        x = -x;
    }
    return x;
}

/* Saturating absolute value, s64: INT64_MIN -> INT64_MAX + QC. */
uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else if ((int64_t)x < 0) {
        x = -x;
    }
    return x;
}

/* Saturating negation, s64: INT64_MIN -> INT64_MAX + QC. */
uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else {
        x = -x;
    }
    return x;
}

/* NEON Float helpers. */

/* Floating point comparisons produce an integer result.
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
}

/* a >= b, implemented as b <= a. */
uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_le(make_float32(b), make_float32(a), fpst);
}

/* a > b, implemented as b < a. */
uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_lt(make_float32(b), make_float32(a), fpst);
}

/* Absolute compare: |a| >= |b|. */
uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_le(f1, f0, fpst);
}

/* Absolute compare: |a| > |b|. */
uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_lt(f1, f0, fpst);
}

/* Absolute compare, double precision: |a| >= |b|. */
uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_le(f1, f0, fpst);
}

/* Absolute compare, double precision: |a| > |b|. */
uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_lt(f1, f0, fpst);
}

/* Extract element N of size SIZE bits from the 64-bit value V. */
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))

/* VUZP, quad, 8-bit: deinterleave -- 'd' collects the even-indexed
 * bytes of the d:m pair, 'm' collects the odd-indexed bytes. */
void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

/* VUZP, quad, 16-bit: as neon_qunzip8 but with 16-bit elements. */
void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

/* VUZP, quad, 32-bit: as neon_qunzip8 but with 32-bit elements. */
void HELPER(neon_qunzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

/* VUZP, double, 8-bit: deinterleave a single 64-bit register pair. */
void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

/* VUZP, double, 16-bit: as neon_unzip8 but with 16-bit elements. */
void HELPER(neon_unzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}

/* VZIP, quad, 8-bit: interleave the bytes of the d:m pair -- the
 * result pair d:m holds d[0],m[0],d[1],m[1],... in element order. */
void HELPER(neon_qzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

/* VZIP, quad, 16-bit: as neon_qzip8 but with 16-bit elements. */
void HELPER(neon_qzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

/* VZIP, quad, 32-bit: as neon_qzip8 but with 32-bit elements. */
void HELPER(neon_qzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

/* VZIP, double, 8-bit: interleave a single 64-bit register pair. */
void HELPER(neon_zip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

/* VZIP, double, 16-bit: as neon_zip8 but with 16-bit elements. */
void HELPER(neon_zip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}