1 /* 2 * ARM NEON vector operations. 3 * 4 * Copyright (c) 2007, 2008 CodeSourcery. 5 * Written by Paul Brook 6 * 7 * This code is licensed under the GNU GPL v2. 8 */ 9 #include "qemu/osdep.h" 10 11 #include "cpu.h" 12 #include "exec/helper-proto.h" 13 #include "fpu/softfloat.h" 14 #include "vec_internal.h" 15 16 #define SIGNBIT (uint32_t)0x80000000 17 #define SIGNBIT64 ((uint64_t)1 << 63) 18 19 #define SET_QC() env->vfp.qc[0] = 1 20 21 #define NEON_TYPE1(name, type) \ 22 typedef struct \ 23 { \ 24 type v1; \ 25 } neon_##name; 26 #if HOST_BIG_ENDIAN 27 #define NEON_TYPE2(name, type) \ 28 typedef struct \ 29 { \ 30 type v2; \ 31 type v1; \ 32 } neon_##name; 33 #define NEON_TYPE4(name, type) \ 34 typedef struct \ 35 { \ 36 type v4; \ 37 type v3; \ 38 type v2; \ 39 type v1; \ 40 } neon_##name; 41 #else 42 #define NEON_TYPE2(name, type) \ 43 typedef struct \ 44 { \ 45 type v1; \ 46 type v2; \ 47 } neon_##name; 48 #define NEON_TYPE4(name, type) \ 49 typedef struct \ 50 { \ 51 type v1; \ 52 type v2; \ 53 type v3; \ 54 type v4; \ 55 } neon_##name; 56 #endif 57 58 NEON_TYPE4(s8, int8_t) 59 NEON_TYPE4(u8, uint8_t) 60 NEON_TYPE2(s16, int16_t) 61 NEON_TYPE2(u16, uint16_t) 62 NEON_TYPE1(s32, int32_t) 63 NEON_TYPE1(u32, uint32_t) 64 #undef NEON_TYPE4 65 #undef NEON_TYPE2 66 #undef NEON_TYPE1 67 68 /* Copy from a uint32_t to a vector structure type. */ 69 #define NEON_UNPACK(vtype, dest, val) do { \ 70 union { \ 71 vtype v; \ 72 uint32_t i; \ 73 } conv_u; \ 74 conv_u.i = (val); \ 75 dest = conv_u.v; \ 76 } while(0) 77 78 /* Copy from a vector structure type to a uint32_t. */ 79 #define NEON_PACK(vtype, dest, val) do { \ 80 union { \ 81 vtype v; \ 82 uint32_t i; \ 83 } conv_u; \ 84 conv_u.v = (val); \ 85 dest = conv_u.i; \ 86 } while(0) 87 88 #define NEON_DO1 \ 89 NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); 90 #define NEON_DO2 \ 91 NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ 92 NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); 93 #define NEON_DO4 \ 94 NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ 95 NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \ 96 NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \ 97 NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4); 98 99 #define NEON_VOP_BODY(vtype, n) \ 100 { \ 101 uint32_t res; \ 102 vtype vsrc1; \ 103 vtype vsrc2; \ 104 vtype vdest; \ 105 NEON_UNPACK(vtype, vsrc1, arg1); \ 106 NEON_UNPACK(vtype, vsrc2, arg2); \ 107 NEON_DO##n; \ 108 NEON_PACK(vtype, res, vdest); \ 109 return res; \ 110 } 111 112 #define NEON_VOP(name, vtype, n) \ 113 uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ 114 NEON_VOP_BODY(vtype, n) 115 116 #define NEON_VOP_ENV(name, vtype, n) \ 117 uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \ 118 NEON_VOP_BODY(vtype, n) 119 120 /* Pairwise operations. */ 121 /* For 32-bit elements each segment only contains a single element, so 122 the elementwise and pairwise operations are the same. */ 123 #define NEON_PDO2 \ 124 NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ 125 NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2); 126 #define NEON_PDO4 \ 127 NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ 128 NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \ 129 NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \ 130 NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \ 131 132 #define NEON_POP(name, vtype, n) \ 133 uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ 134 { \ 135 uint32_t res; \ 136 vtype vsrc1; \ 137 vtype vsrc2; \ 138 vtype vdest; \ 139 NEON_UNPACK(vtype, vsrc1, arg1); \ 140 NEON_UNPACK(vtype, vsrc2, arg2); \ 141 NEON_PDO##n; \ 142 NEON_PACK(vtype, res, vdest); \ 143 return res; \ 144 } 145 146 /* Unary operators. */ 147 #define NEON_VOP1(name, vtype, n) \ 148 uint32_t HELPER(glue(neon_,name))(uint32_t arg) \ 149 { \ 150 vtype vsrc1; \ 151 vtype vdest; \ 152 NEON_UNPACK(vtype, vsrc1, arg); \ 153 NEON_DO##n; \ 154 NEON_PACK(vtype, arg, vdest); \ 155 return arg; \ 156 } 157 158 159 #define NEON_USAT(dest, src1, src2, type) do { \ 160 uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ 161 if (tmp != (type)tmp) { \ 162 SET_QC(); \ 163 dest = ~0; \ 164 } else { \ 165 dest = tmp; \ 166 }} while(0) 167 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) 168 NEON_VOP_ENV(qadd_u8, neon_u8, 4) 169 #undef NEON_FN 170 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) 171 NEON_VOP_ENV(qadd_u16, neon_u16, 2) 172 #undef NEON_FN 173 #undef NEON_USAT 174 175 uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b) 176 { 177 uint32_t res = a + b; 178 if (res < a) { 179 SET_QC(); 180 res = ~0; 181 } 182 return res; 183 } 184 185 uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2) 186 { 187 uint64_t res; 188 189 res = src1 + src2; 190 if (res < src1) { 191 SET_QC(); 192 res = ~(uint64_t)0; 193 } 194 return res; 195 } 196 197 #define NEON_SSAT(dest, src1, src2, type) do { \ 198 int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ 199 if (tmp != (type)tmp) { \ 200 SET_QC(); \ 201 if (src2 > 0) { \ 202 tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ 203 } else { \ 204 tmp = 1 << (sizeof(type) * 8 - 1); \ 205 } \ 206 } \ 207 dest = tmp; \ 208 } while(0) 209 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) 210 NEON_VOP_ENV(qadd_s8, neon_s8, 4) 211 #undef NEON_FN 212 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) 213 NEON_VOP_ENV(qadd_s16, neon_s16, 2) 214 #undef NEON_FN 215 #undef NEON_SSAT 216 217 uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b) 218 { 219 uint32_t res = a + b; 220 if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) { 221 SET_QC(); 222 res = ~(((int32_t)a >> 31) ^ SIGNBIT); 223 } 224 return res; 225 } 226 227 uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2) 228 { 229 uint64_t res; 230 231 res = src1 + src2; 232 if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) { 233 SET_QC(); 234 res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64; 235 } 236 return res; 237 } 238 239 /* Unsigned saturating accumulate of signed value 240 * 241 * Op1/Rn is treated as signed 242 * Op2/Rd is treated as unsigned 243 * 244 * Explicit casting is used to ensure the correct sign extension of 245 * inputs. The result is treated as a unsigned value and saturated as such. 246 * 247 * We use a macro for the 8/16 bit cases which expects signed integers of va, 248 * vb, and vr for interim calculation and an unsigned 32 bit result value r. 249 */ 250 251 #define USATACC(bits, shift) \ 252 do { \ 253 va = sextract32(a, shift, bits); \ 254 vb = extract32(b, shift, bits); \ 255 vr = va + vb; \ 256 if (vr > UINT##bits##_MAX) { \ 257 SET_QC(); \ 258 vr = UINT##bits##_MAX; \ 259 } else if (vr < 0) { \ 260 SET_QC(); \ 261 vr = 0; \ 262 } \ 263 r = deposit32(r, shift, bits, vr); \ 264 } while (0) 265 266 uint32_t HELPER(neon_uqadd_s8)(CPUARMState *env, uint32_t a, uint32_t b) 267 { 268 int16_t va, vb, vr; 269 uint32_t r = 0; 270 271 USATACC(8, 0); 272 USATACC(8, 8); 273 USATACC(8, 16); 274 USATACC(8, 24); 275 return r; 276 } 277 278 uint32_t HELPER(neon_uqadd_s16)(CPUARMState *env, uint32_t a, uint32_t b) 279 { 280 int32_t va, vb, vr; 281 uint64_t r = 0; 282 283 USATACC(16, 0); 284 USATACC(16, 16); 285 return r; 286 } 287 288 #undef USATACC 289 290 uint32_t HELPER(neon_uqadd_s32)(CPUARMState *env, uint32_t a, uint32_t b) 291 { 292 int64_t va = (int32_t)a; 293 int64_t vb = (uint32_t)b; 294 int64_t vr = va + vb; 295 if (vr > UINT32_MAX) { 296 SET_QC(); 297 vr = UINT32_MAX; 298 } else if (vr < 0) { 299 SET_QC(); 300 vr = 0; 301 } 302 return vr; 303 } 304 305 uint64_t HELPER(neon_uqadd_s64)(CPUARMState *env, uint64_t a, uint64_t b) 306 { 307 uint64_t res; 308 res = a + b; 309 /* We only need to look at the pattern of SIGN bits to detect 310 * +ve/-ve saturation 311 */ 312 if (~a & b & ~res & SIGNBIT64) { 313 SET_QC(); 314 res = UINT64_MAX; 315 } else if (a & ~b & res & SIGNBIT64) { 316 SET_QC(); 317 res = 0; 318 } 319 return res; 320 } 321 322 /* Signed saturating accumulate of unsigned value 323 * 324 * Op1/Rn is treated as unsigned 325 * Op2/Rd is treated as signed 326 * 327 * The result is treated as a signed value and saturated as such 328 * 329 * We use a macro for the 8/16 bit cases which expects signed integers of va, 330 * vb, and vr for interim calculation and an unsigned 32 bit result value r. 331 */ 332 333 #define SSATACC(bits, shift) \ 334 do { \ 335 va = extract32(a, shift, bits); \ 336 vb = sextract32(b, shift, bits); \ 337 vr = va + vb; \ 338 if (vr > INT##bits##_MAX) { \ 339 SET_QC(); \ 340 vr = INT##bits##_MAX; \ 341 } else if (vr < INT##bits##_MIN) { \ 342 SET_QC(); \ 343 vr = INT##bits##_MIN; \ 344 } \ 345 r = deposit32(r, shift, bits, vr); \ 346 } while (0) 347 348 uint32_t HELPER(neon_sqadd_u8)(CPUARMState *env, uint32_t a, uint32_t b) 349 { 350 int16_t va, vb, vr; 351 uint32_t r = 0; 352 353 SSATACC(8, 0); 354 SSATACC(8, 8); 355 SSATACC(8, 16); 356 SSATACC(8, 24); 357 return r; 358 } 359 360 uint32_t HELPER(neon_sqadd_u16)(CPUARMState *env, uint32_t a, uint32_t b) 361 { 362 int32_t va, vb, vr; 363 uint32_t r = 0; 364 365 SSATACC(16, 0); 366 SSATACC(16, 16); 367 368 return r; 369 } 370 371 #undef SSATACC 372 373 uint32_t HELPER(neon_sqadd_u32)(CPUARMState *env, uint32_t a, uint32_t b) 374 { 375 int64_t res; 376 int64_t op1 = (uint32_t)a; 377 int64_t op2 = (int32_t)b; 378 res = op1 + op2; 379 if (res > INT32_MAX) { 380 SET_QC(); 381 res = INT32_MAX; 382 } else if (res < INT32_MIN) { 383 SET_QC(); 384 res = INT32_MIN; 385 } 386 return res; 387 } 388 389 uint64_t HELPER(neon_sqadd_u64)(CPUARMState *env, uint64_t a, uint64_t b) 390 { 391 uint64_t res; 392 res = a + b; 393 /* We only need to look at the pattern of SIGN bits to detect an overflow */ 394 if (((a & res) 395 | (~b & res) 396 | (a & ~b)) & SIGNBIT64) { 397 SET_QC(); 398 res = INT64_MAX; 399 } 400 return res; 401 } 402 403 404 #define NEON_USAT(dest, src1, src2, type) do { \ 405 uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ 406 if (tmp != (type)tmp) { \ 407 SET_QC(); \ 408 dest = 0; \ 409 } else { \ 410 dest = tmp; \ 411 }} while(0) 412 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) 413 NEON_VOP_ENV(qsub_u8, neon_u8, 4) 414 #undef NEON_FN 415 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) 416 NEON_VOP_ENV(qsub_u16, neon_u16, 2) 417 #undef NEON_FN 418 #undef NEON_USAT 419 420 uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b) 421 { 422 uint32_t res = a - b; 423 if (res > a) { 424 SET_QC(); 425 res = 0; 426 } 427 return res; 428 } 429 430 uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2) 431 { 432 uint64_t res; 433 434 if (src1 < src2) { 435 SET_QC(); 436 res = 0; 437 } else { 438 res = src1 - src2; 439 } 440 return res; 441 } 442 443 #define NEON_SSAT(dest, src1, src2, type) do { \ 444 int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ 445 if (tmp != (type)tmp) { \ 446 SET_QC(); \ 447 if (src2 < 0) { \ 448 tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ 449 } else { \ 450 tmp = 1 << (sizeof(type) * 8 - 1); \ 451 } \ 452 } \ 453 dest = tmp; \ 454 } while(0) 455 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) 456 NEON_VOP_ENV(qsub_s8, neon_s8, 4) 457 #undef NEON_FN 458 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) 459 NEON_VOP_ENV(qsub_s16, neon_s16, 2) 460 #undef NEON_FN 461 #undef NEON_SSAT 462 463 uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b) 464 { 465 uint32_t res = a - b; 466 if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) { 467 SET_QC(); 468 res = ~(((int32_t)a >> 31) ^ SIGNBIT); 469 } 470 return res; 471 } 472 473 uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2) 474 { 475 uint64_t res; 476 477 res = src1 - src2; 478 if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) { 479 SET_QC(); 480 res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64; 481 } 482 return res; 483 } 484 485 #define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1 486 NEON_VOP(hadd_s8, neon_s8, 4) 487 NEON_VOP(hadd_u8, neon_u8, 4) 488 NEON_VOP(hadd_s16, neon_s16, 2) 489 NEON_VOP(hadd_u16, neon_u16, 2) 490 #undef NEON_FN 491 492 int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2) 493 { 494 int32_t dest; 495 496 dest = (src1 >> 1) + (src2 >> 1); 497 if (src1 & src2 & 1) 498 dest++; 499 return dest; 500 } 501 502 uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2) 503 { 504 uint32_t dest; 505 506 dest = (src1 >> 1) + (src2 >> 1); 507 if (src1 & src2 & 1) 508 dest++; 509 return dest; 510 } 511 512 #define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1 513 NEON_VOP(rhadd_s8, neon_s8, 4) 514 NEON_VOP(rhadd_u8, neon_u8, 4) 515 NEON_VOP(rhadd_s16, neon_s16, 2) 516 NEON_VOP(rhadd_u16, neon_u16, 2) 517 #undef NEON_FN 518 519 int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2) 520 { 521 int32_t dest; 522 523 dest = (src1 >> 1) + (src2 >> 1); 524 if ((src1 | src2) & 1) 525 dest++; 526 return dest; 527 } 528 529 uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2) 530 { 531 uint32_t dest; 532 533 dest = (src1 >> 1) + (src2 >> 1); 534 if ((src1 | src2) & 1) 535 dest++; 536 return dest; 537 } 538 539 #define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1 540 NEON_VOP(hsub_s8, neon_s8, 4) 541 NEON_VOP(hsub_u8, neon_u8, 4) 542 NEON_VOP(hsub_s16, neon_s16, 2) 543 NEON_VOP(hsub_u16, neon_u16, 2) 544 #undef NEON_FN 545 546 int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2) 547 { 548 int32_t dest; 549 550 dest = (src1 >> 1) - (src2 >> 1); 551 if ((~src1) & src2 & 1) 552 dest--; 553 return dest; 554 } 555 556 uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2) 557 { 558 uint32_t dest; 559 560 dest = (src1 >> 1) - (src2 >> 1); 561 if ((~src1) & src2 & 1) 562 dest--; 563 return dest; 564 } 565 566 #define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2 567 NEON_POP(pmin_s8, neon_s8, 4) 568 NEON_POP(pmin_u8, neon_u8, 4) 569 NEON_POP(pmin_s16, neon_s16, 2) 570 NEON_POP(pmin_u16, neon_u16, 2) 571 #undef NEON_FN 572 573 #define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2 574 NEON_POP(pmax_s8, neon_s8, 4) 575 NEON_POP(pmax_u8, neon_u8, 4) 576 NEON_POP(pmax_s16, neon_s16, 2) 577 NEON_POP(pmax_u16, neon_u16, 2) 578 #undef NEON_FN 579 580 #define NEON_FN(dest, src1, src2) \ 581 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL)) 582 NEON_VOP(shl_u16, neon_u16, 2) 583 #undef NEON_FN 584 585 #define NEON_FN(dest, src1, src2) \ 586 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL)) 587 NEON_VOP(shl_s16, neon_s16, 2) 588 #undef NEON_FN 589 590 #define NEON_FN(dest, src1, src2) \ 591 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL)) 592 NEON_VOP(rshl_s8, neon_s8, 4) 593 #undef NEON_FN 594 595 #define NEON_FN(dest, src1, src2) \ 596 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL)) 597 NEON_VOP(rshl_s16, neon_s16, 2) 598 #undef NEON_FN 599 600 uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift) 601 { 602 return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL); 603 } 604 605 uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift) 606 { 607 return do_sqrshl_d(val, (int8_t)shift, true, NULL); 608 } 609 610 #define NEON_FN(dest, src1, src2) \ 611 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL)) 612 NEON_VOP(rshl_u8, neon_u8, 4) 613 #undef NEON_FN 614 615 #define NEON_FN(dest, src1, src2) \ 616 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL)) 617 NEON_VOP(rshl_u16, neon_u16, 2) 618 #undef NEON_FN 619 620 uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift) 621 { 622 return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL); 623 } 624 625 uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift) 626 { 627 return do_uqrshl_d(val, (int8_t)shift, true, NULL); 628 } 629 630 #define NEON_FN(dest, src1, src2) \ 631 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) 632 NEON_VOP_ENV(qshl_u8, neon_u8, 4) 633 #undef NEON_FN 634 635 #define NEON_FN(dest, src1, src2) \ 636 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) 637 NEON_VOP_ENV(qshl_u16, neon_u16, 2) 638 #undef NEON_FN 639 640 uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift) 641 { 642 return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); 643 } 644 645 uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift) 646 { 647 return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc); 648 } 649 650 #define NEON_FN(dest, src1, src2) \ 651 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) 652 NEON_VOP_ENV(qshl_s8, neon_s8, 4) 653 #undef NEON_FN 654 655 #define NEON_FN(dest, src1, src2) \ 656 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) 657 NEON_VOP_ENV(qshl_s16, neon_s16, 2) 658 #undef NEON_FN 659 660 uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift) 661 { 662 return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); 663 } 664 665 uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift) 666 { 667 return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc); 668 } 669 670 #define NEON_FN(dest, src1, src2) \ 671 (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) 672 NEON_VOP_ENV(qshlu_s8, neon_s8, 4) 673 #undef NEON_FN 674 675 #define NEON_FN(dest, src1, src2) \ 676 (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) 677 NEON_VOP_ENV(qshlu_s16, neon_s16, 2) 678 #undef NEON_FN 679 680 uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift) 681 { 682 return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); 683 } 684 685 uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift) 686 { 687 return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc); 688 } 689 690 #define NEON_FN(dest, src1, src2) \ 691 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc)) 692 NEON_VOP_ENV(qrshl_u8, neon_u8, 4) 693 #undef NEON_FN 694 695 #define NEON_FN(dest, src1, src2) \ 696 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc)) 697 NEON_VOP_ENV(qrshl_u16, neon_u16, 2) 698 #undef NEON_FN 699 700 uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift) 701 { 702 return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc); 703 } 704 705 uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift) 706 { 707 return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc); 708 } 709 710 #define NEON_FN(dest, src1, src2) \ 711 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc)) 712 NEON_VOP_ENV(qrshl_s8, neon_s8, 4) 713 #undef NEON_FN 714 715 #define NEON_FN(dest, src1, src2) \ 716 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc)) 717 NEON_VOP_ENV(qrshl_s16, neon_s16, 2) 718 #undef NEON_FN 719 720 uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift) 721 { 722 return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc); 723 } 724 725 uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift) 726 { 727 return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc); 728 } 729 730 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b) 731 { 732 uint32_t mask; 733 mask = (a ^ b) & 0x80808080u; 734 a &= ~0x80808080u; 735 b &= ~0x80808080u; 736 return (a + b) ^ mask; 737 } 738 739 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b) 740 { 741 uint32_t mask; 742 mask = (a ^ b) & 0x80008000u; 743 a &= ~0x80008000u; 744 b &= ~0x80008000u; 745 return (a + b) ^ mask; 746 } 747 748 #define NEON_FN(dest, src1, src2) dest = src1 - src2 749 NEON_VOP(sub_u8, neon_u8, 4) 750 NEON_VOP(sub_u16, neon_u16, 2) 751 #undef NEON_FN 752 753 #define NEON_FN(dest, src1, src2) dest = src1 * src2 754 NEON_VOP(mul_u8, neon_u8, 4) 755 NEON_VOP(mul_u16, neon_u16, 2) 756 #undef NEON_FN 757 758 #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0 759 NEON_VOP(tst_u8, neon_u8, 4) 760 NEON_VOP(tst_u16, neon_u16, 2) 761 NEON_VOP(tst_u32, neon_u32, 1) 762 #undef NEON_FN 763 764 /* Count Leading Sign/Zero Bits. */ 765 static inline int do_clz8(uint8_t x) 766 { 767 int n; 768 for (n = 8; x; n--) 769 x >>= 1; 770 return n; 771 } 772 773 static inline int do_clz16(uint16_t x) 774 { 775 int n; 776 for (n = 16; x; n--) 777 x >>= 1; 778 return n; 779 } 780 781 #define NEON_FN(dest, src, dummy) dest = do_clz8(src) 782 NEON_VOP1(clz_u8, neon_u8, 4) 783 #undef NEON_FN 784 785 #define NEON_FN(dest, src, dummy) dest = do_clz16(src) 786 NEON_VOP1(clz_u16, neon_u16, 2) 787 #undef NEON_FN 788 789 #define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1 790 NEON_VOP1(cls_s8, neon_s8, 4) 791 #undef NEON_FN 792 793 #define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1 794 NEON_VOP1(cls_s16, neon_s16, 2) 795 #undef NEON_FN 796 797 uint32_t HELPER(neon_cls_s32)(uint32_t x) 798 { 799 int count; 800 if ((int32_t)x < 0) 801 x = ~x; 802 for (count = 32; x; count--) 803 x = x >> 1; 804 return count - 1; 805 } 806 807 /* Bit count. */ 808 uint32_t HELPER(neon_cnt_u8)(uint32_t x) 809 { 810 x = (x & 0x55555555) + ((x >> 1) & 0x55555555); 811 x = (x & 0x33333333) + ((x >> 2) & 0x33333333); 812 x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f); 813 return x; 814 } 815 816 /* Reverse bits in each 8 bit word */ 817 uint32_t HELPER(neon_rbit_u8)(uint32_t x) 818 { 819 x = ((x & 0xf0f0f0f0) >> 4) 820 | ((x & 0x0f0f0f0f) << 4); 821 x = ((x & 0x88888888) >> 3) 822 | ((x & 0x44444444) >> 1) 823 | ((x & 0x22222222) << 1) 824 | ((x & 0x11111111) << 3); 825 return x; 826 } 827 828 #define NEON_QDMULH16(dest, src1, src2, round) do { \ 829 uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \ 830 if ((tmp ^ (tmp << 1)) & SIGNBIT) { \ 831 SET_QC(); \ 832 tmp = (tmp >> 31) ^ ~SIGNBIT; \ 833 } else { \ 834 tmp <<= 1; \ 835 } \ 836 if (round) { \ 837 int32_t old = tmp; \ 838 tmp += 1 << 15; \ 839 if ((int32_t)tmp < old) { \ 840 SET_QC(); \ 841 tmp = SIGNBIT - 1; \ 842 } \ 843 } \ 844 dest = tmp >> 16; \ 845 } while(0) 846 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0) 847 NEON_VOP_ENV(qdmulh_s16, neon_s16, 2) 848 #undef NEON_FN 849 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1) 850 NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2) 851 #undef NEON_FN 852 #undef NEON_QDMULH16 853 854 #define NEON_QDMULH32(dest, src1, src2, round) do { \ 855 uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \ 856 if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \ 857 SET_QC(); \ 858 tmp = (tmp >> 63) ^ ~SIGNBIT64; \ 859 } else { \ 860 tmp <<= 1; \ 861 } \ 862 if (round) { \ 863 int64_t old = tmp; \ 864 tmp += (int64_t)1 << 31; \ 865 if ((int64_t)tmp < old) { \ 866 SET_QC(); \ 867 tmp = SIGNBIT64 - 1; \ 868 } \ 869 } \ 870 dest = tmp >> 32; \ 871 } while(0) 872 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0) 873 NEON_VOP_ENV(qdmulh_s32, neon_s32, 1) 874 #undef NEON_FN 875 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1) 876 NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1) 877 #undef NEON_FN 878 #undef NEON_QDMULH32 879 880 uint32_t HELPER(neon_narrow_u8)(uint64_t x) 881 { 882 return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u) 883 | ((x >> 24) & 0xff000000u); 884 } 885 886 uint32_t HELPER(neon_narrow_u16)(uint64_t x) 887 { 888 return (x & 0xffffu) | ((x >> 16) & 0xffff0000u); 889 } 890 891 uint32_t HELPER(neon_narrow_high_u8)(uint64_t x) 892 { 893 return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) 894 | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); 895 } 896 897 uint32_t HELPER(neon_narrow_high_u16)(uint64_t x) 898 { 899 return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); 900 } 901 902 uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x) 903 { 904 x &= 0xff80ff80ff80ff80ull; 905 x += 0x0080008000800080ull; 906 return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) 907 | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); 908 } 909 910 uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x) 911 { 912 x &= 0xffff8000ffff8000ull; 913 x += 0x0000800000008000ull; 914 return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); 915 } 916 917 uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x) 918 { 919 uint16_t s; 920 uint8_t d; 921 uint32_t res = 0; 922 #define SAT8(n) \ 923 s = x >> n; \ 924 if (s & 0x8000) { \ 925 SET_QC(); \ 926 } else { \ 927 if (s > 0xff) { \ 928 d = 0xff; \ 929 SET_QC(); \ 930 } else { \ 931 d = s; \ 932 } \ 933 res |= (uint32_t)d << (n / 2); \ 934 } 935 936 SAT8(0); 937 SAT8(16); 938 SAT8(32); 939 SAT8(48); 940 #undef SAT8 941 return res; 942 } 943 944 uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x) 945 { 946 uint16_t s; 947 uint8_t d; 948 uint32_t res = 0; 949 #define SAT8(n) \ 950 s = x >> n; \ 951 if (s > 0xff) { \ 952 d = 0xff; \ 953 SET_QC(); \ 954 } else { \ 955 d = s; \ 956 } \ 957 res |= (uint32_t)d << (n / 2); 958 959 SAT8(0); 960 SAT8(16); 961 SAT8(32); 962 SAT8(48); 963 #undef SAT8 964 return res; 965 } 966 967 uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x) 968 { 969 int16_t s; 970 uint8_t d; 971 uint32_t res = 0; 972 #define SAT8(n) \ 973 s = x >> n; \ 974 if (s != (int8_t)s) { \ 975 d = (s >> 15) ^ 0x7f; \ 976 SET_QC(); \ 977 } else { \ 978 d = s; \ 979 } \ 980 res |= (uint32_t)d << (n / 2); 981 982 SAT8(0); 983 SAT8(16); 984 SAT8(32); 985 SAT8(48); 986 #undef SAT8 987 return res; 988 } 989 990 uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x) 991 { 992 uint32_t high; 993 uint32_t low; 994 low = x; 995 if (low & 0x80000000) { 996 low = 0; 997 SET_QC(); 998 } else if (low > 0xffff) { 999 low = 0xffff; 1000 SET_QC(); 1001 } 1002 high = x >> 32; 1003 if (high & 0x80000000) { 1004 high = 0; 1005 SET_QC(); 1006 } else if (high > 0xffff) { 1007 high = 0xffff; 1008 SET_QC(); 1009 } 1010 return low | (high << 16); 1011 } 1012 1013 uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x) 1014 { 1015 uint32_t high; 1016 uint32_t low; 1017 low = x; 1018 if (low > 0xffff) { 1019 low = 0xffff; 1020 SET_QC(); 1021 } 1022 high = x >> 32; 1023 if (high > 0xffff) { 1024 high = 0xffff; 1025 SET_QC(); 1026 } 1027 return low | (high << 16); 1028 } 1029 1030 uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x) 1031 { 1032 int32_t low; 1033 int32_t high; 1034 low = x; 1035 if (low != (int16_t)low) { 1036 low = (low >> 31) ^ 0x7fff; 1037 SET_QC(); 1038 } 1039 high = x >> 32; 1040 if (high != (int16_t)high) { 1041 high = (high >> 31) ^ 0x7fff; 1042 SET_QC(); 1043 } 1044 return (uint16_t)low | (high << 16); 1045 } 1046 1047 uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x) 1048 { 1049 if (x & 0x8000000000000000ull) { 1050 SET_QC(); 1051 return 0; 1052 } 1053 if (x > 0xffffffffu) { 1054 SET_QC(); 1055 return 0xffffffffu; 1056 } 1057 return x; 1058 } 1059 1060 uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x) 1061 { 1062 if (x > 0xffffffffu) { 1063 SET_QC(); 1064 return 0xffffffffu; 1065 } 1066 return x; 1067 } 1068 1069 uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x) 1070 { 1071 if ((int64_t)x != (int32_t)x) { 1072 SET_QC(); 1073 return ((int64_t)x >> 63) ^ 0x7fffffff; 1074 } 1075 return x; 1076 } 1077 1078 uint64_t HELPER(neon_widen_u8)(uint32_t x) 1079 { 1080 uint64_t tmp; 1081 uint64_t ret; 1082 ret = (uint8_t)x; 1083 tmp = (uint8_t)(x >> 8); 1084 ret |= tmp << 16; 1085 tmp = (uint8_t)(x >> 16); 1086 ret |= tmp << 32; 1087 tmp = (uint8_t)(x >> 24); 1088 ret |= tmp << 48; 1089 return ret; 1090 } 1091 1092 uint64_t HELPER(neon_widen_s8)(uint32_t x) 1093 { 1094 uint64_t tmp; 1095 uint64_t ret; 1096 ret = (uint16_t)(int8_t)x; 1097 tmp = (uint16_t)(int8_t)(x >> 8); 1098 ret |= tmp << 16; 1099 tmp = (uint16_t)(int8_t)(x >> 16); 1100 ret |= tmp << 32; 1101 tmp = (uint16_t)(int8_t)(x >> 24); 1102 ret |= tmp << 48; 1103 return ret; 1104 } 1105 1106 uint64_t HELPER(neon_widen_u16)(uint32_t x) 1107 { 1108 uint64_t high = (uint16_t)(x >> 16); 1109 return ((uint16_t)x) | (high << 32); 1110 } 1111 1112 uint64_t HELPER(neon_widen_s16)(uint32_t x) 1113 { 1114 uint64_t high = (int16_t)(x >> 16); 1115 return ((uint32_t)(int16_t)x) | (high << 32); 1116 } 1117 1118 uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b) 1119 { 1120 uint64_t mask; 1121 mask = (a ^ b) & 0x8000800080008000ull; 1122 a &= ~0x8000800080008000ull; 1123 b &= ~0x8000800080008000ull; 1124 return (a + b) ^ mask; 1125 } 1126 1127 uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b) 1128 { 1129 uint64_t mask; 1130 mask = (a ^ b) & 0x8000000080000000ull; 1131 a &= ~0x8000000080000000ull; 1132 b &= ~0x8000000080000000ull; 1133 return (a + b) ^ mask; 1134 } 1135 1136 uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b) 1137 { 1138 uint64_t tmp; 1139 uint64_t tmp2; 1140 1141 tmp = a & 0x0000ffff0000ffffull; 1142 tmp += (a >> 16) & 0x0000ffff0000ffffull; 1143 tmp2 = b & 0xffff0000ffff0000ull; 1144 tmp2 += (b << 16) & 0xffff0000ffff0000ull; 1145 return ( tmp & 0xffff) 1146 | ((tmp >> 16) & 0xffff0000ull) 1147 | ((tmp2 << 16) & 0xffff00000000ull) 1148 | ( tmp2 & 0xffff000000000000ull); 1149 } 1150 1151 uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b) 1152 { 1153 uint32_t low = a + (a >> 32); 1154 uint32_t high = b + (b >> 32); 1155 return low + ((uint64_t)high << 32); 1156 } 1157 1158 uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b) 1159 { 1160 uint64_t mask; 1161 mask = (a ^ ~b) & 0x8000800080008000ull; 1162 a |= 0x8000800080008000ull; 1163 b &= ~0x8000800080008000ull; 1164 return (a - b) ^ mask; 1165 } 1166 1167 uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b) 1168 { 1169 uint64_t mask; 1170 mask = (a ^ ~b) & 0x8000000080000000ull; 1171 a |= 0x8000000080000000ull; 1172 b &= ~0x8000000080000000ull; 1173 return (a - b) ^ mask; 1174 } 1175 1176 uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b) 1177 { 1178 uint32_t x, y; 1179 uint32_t low, high; 1180 1181 x = a; 1182 y = b; 1183 low = x + y; 1184 if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { 1185 SET_QC(); 1186 low = ((int32_t)x >> 31) ^ ~SIGNBIT; 1187 } 1188 x = a >> 32; 1189 y = b >> 32; 1190 high = x + y; 1191 if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { 1192 SET_QC(); 1193 high = ((int32_t)x >> 31) ^ ~SIGNBIT; 1194 } 1195 return low | ((uint64_t)high << 32); 1196 } 1197 1198 uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b) 1199 { 1200 uint64_t result; 1201 1202 result = a + b; 1203 if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) { 1204 SET_QC(); 1205 result = ((int64_t)a >> 63) ^ ~SIGNBIT64; 1206 } 1207 return result; 1208 } 1209 1210 /* We have to do the arithmetic in a larger type than 1211 * the input type, because for example with a signed 32 bit 1212 * op the absolute difference can overflow a signed 32 bit value. 1213 */ 1214 #define DO_ABD(dest, x, y, intype, arithtype) do { \ 1215 arithtype tmp_x = (intype)(x); \ 1216 arithtype tmp_y = (intype)(y); \ 1217 dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \ 1218 } while(0) 1219 1220 uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b) 1221 { 1222 uint64_t tmp; 1223 uint64_t result; 1224 DO_ABD(result, a, b, uint8_t, uint32_t); 1225 DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t); 1226 result |= tmp << 16; 1227 DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t); 1228 result |= tmp << 32; 1229 DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t); 1230 result |= tmp << 48; 1231 return result; 1232 } 1233 1234 uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b) 1235 { 1236 uint64_t tmp; 1237 uint64_t result; 1238 DO_ABD(result, a, b, int8_t, int32_t); 1239 DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t); 1240 result |= tmp << 16; 1241 DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t); 1242 result |= tmp << 32; 1243 DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t); 1244 result |= tmp << 48; 1245 return result; 1246 } 1247 1248 uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b) 1249 { 1250 uint64_t tmp; 1251 uint64_t result; 1252 DO_ABD(result, a, b, uint16_t, uint32_t); 1253 DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t); 1254 return result | (tmp << 32); 1255 } 1256 1257 uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b) 1258 { 1259 uint64_t tmp; 1260 uint64_t result; 1261 DO_ABD(result, a, b, int16_t, int32_t); 1262 DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t); 1263 return result | (tmp << 32); 1264 } 1265 1266 uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b) 1267 { 1268 uint64_t result; 1269 DO_ABD(result, a, b, uint32_t, uint64_t); 1270 return result; 1271 } 1272 1273 uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b) 1274 { 1275 uint64_t result; 1276 DO_ABD(result, a, b, int32_t, int64_t); 1277 return result; 1278 } 1279 #undef DO_ABD 1280 1281 /* Widening multiply. Named type is the source type. */ 1282 #define DO_MULL(dest, x, y, type1, type2) do { \ 1283 type1 tmp_x = x; \ 1284 type1 tmp_y = y; \ 1285 dest = (type2)((type2)tmp_x * (type2)tmp_y); \ 1286 } while(0) 1287 1288 uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b) 1289 { 1290 uint64_t tmp; 1291 uint64_t result; 1292 1293 DO_MULL(result, a, b, uint8_t, uint16_t); 1294 DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t); 1295 result |= tmp << 16; 1296 DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t); 1297 result |= tmp << 32; 1298 DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t); 1299 result |= tmp << 48; 1300 return result; 1301 } 1302 1303 uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b) 1304 { 1305 uint64_t tmp; 1306 uint64_t result; 1307 1308 DO_MULL(result, a, b, int8_t, uint16_t); 1309 DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t); 1310 result |= tmp << 16; 1311 DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t); 1312 result |= tmp << 32; 1313 DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t); 1314 result |= tmp << 48; 1315 return result; 1316 } 1317 1318 uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b) 1319 { 1320 uint64_t tmp; 1321 uint64_t result; 1322 1323 DO_MULL(result, a, b, uint16_t, uint32_t); 1324 DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t); 1325 return result | (tmp << 32); 1326 } 1327 1328 uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b) 1329 { 1330 uint64_t tmp; 1331 uint64_t result; 1332 1333 DO_MULL(result, a, b, int16_t, uint32_t); 1334 DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t); 1335 return result | (tmp << 32); 1336 } 1337 1338 uint64_t HELPER(neon_negl_u16)(uint64_t x) 1339 { 1340 uint16_t tmp; 1341 uint64_t result; 1342 result = (uint16_t)-x; 1343 tmp = -(x >> 16); 1344 result |= (uint64_t)tmp << 16; 1345 tmp = -(x >> 32); 1346 result |= (uint64_t)tmp << 32; 1347 tmp = -(x >> 48); 1348 result |= (uint64_t)tmp << 48; 1349 return result; 1350 } 1351 1352 uint64_t HELPER(neon_negl_u32)(uint64_t x) 1353 { 1354 uint32_t low = -x; 1355 uint32_t high = -(x >> 32); 1356 return low | ((uint64_t)high << 32); 1357 } 1358 1359 /* Saturating sign manipulation. */ 1360 /* ??? Make these use NEON_VOP1 */ 1361 #define DO_QABS8(x) do { \ 1362 if (x == (int8_t)0x80) { \ 1363 x = 0x7f; \ 1364 SET_QC(); \ 1365 } else if (x < 0) { \ 1366 x = -x; \ 1367 }} while (0) 1368 uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x) 1369 { 1370 neon_s8 vec; 1371 NEON_UNPACK(neon_s8, vec, x); 1372 DO_QABS8(vec.v1); 1373 DO_QABS8(vec.v2); 1374 DO_QABS8(vec.v3); 1375 DO_QABS8(vec.v4); 1376 NEON_PACK(neon_s8, x, vec); 1377 return x; 1378 } 1379 #undef DO_QABS8 1380 1381 #define DO_QNEG8(x) do { \ 1382 if (x == (int8_t)0x80) { \ 1383 x = 0x7f; \ 1384 SET_QC(); \ 1385 } else { \ 1386 x = -x; \ 1387 }} while (0) 1388 uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x) 1389 { 1390 neon_s8 vec; 1391 NEON_UNPACK(neon_s8, vec, x); 1392 DO_QNEG8(vec.v1); 1393 DO_QNEG8(vec.v2); 1394 DO_QNEG8(vec.v3); 1395 DO_QNEG8(vec.v4); 1396 NEON_PACK(neon_s8, x, vec); 1397 return x; 1398 } 1399 #undef DO_QNEG8 1400 1401 #define DO_QABS16(x) do { \ 1402 if (x == (int16_t)0x8000) { \ 1403 x = 0x7fff; \ 1404 SET_QC(); \ 1405 } else if (x < 0) { \ 1406 x = -x; \ 1407 }} while (0) 1408 uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x) 1409 { 1410 neon_s16 vec; 1411 NEON_UNPACK(neon_s16, vec, x); 1412 DO_QABS16(vec.v1); 1413 DO_QABS16(vec.v2); 1414 NEON_PACK(neon_s16, x, vec); 1415 return x; 1416 } 1417 #undef DO_QABS16 1418 1419 #define DO_QNEG16(x) do { \ 1420 if (x == (int16_t)0x8000) { \ 1421 x = 0x7fff; \ 1422 SET_QC(); \ 1423 } else { \ 1424 x = -x; \ 1425 }} while (0) 1426 uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x) 1427 { 1428 neon_s16 vec; 1429 NEON_UNPACK(neon_s16, vec, x); 1430 DO_QNEG16(vec.v1); 1431 DO_QNEG16(vec.v2); 1432 NEON_PACK(neon_s16, x, vec); 1433 return x; 1434 } 1435 #undef DO_QNEG16 1436 1437 uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x) 1438 { 1439 if (x == SIGNBIT) { 1440 SET_QC(); 1441 x = ~SIGNBIT; 1442 } else if ((int32_t)x < 0) { 1443 x = -x; 1444 } 1445 return x; 1446 } 1447 1448 uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x) 1449 { 1450 if (x == SIGNBIT) { 1451 SET_QC(); 1452 x = ~SIGNBIT; 1453 } else { 1454 x = -x; 1455 } 1456 return x; 1457 } 1458 1459 uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x) 1460 { 1461 if (x == SIGNBIT64) { 1462 SET_QC(); 1463 x = ~SIGNBIT64; 1464 } else if ((int64_t)x < 0) { 1465 x = -x; 1466 } 1467 return x; 1468 } 1469 1470 uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x) 1471 { 1472 if (x == SIGNBIT64) { 1473 SET_QC(); 1474 x = ~SIGNBIT64; 1475 } else { 1476 x = -x; 1477 } 1478 return x; 1479 } 1480 1481 /* NEON Float helpers. */ 1482 1483 /* Floating point comparisons produce an integer result. 1484 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. 1485 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 1486 */ 1487 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp) 1488 { 1489 float_status *fpst = fpstp; 1490 return -float32_eq_quiet(make_float32(a), make_float32(b), fpst); 1491 } 1492 1493 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp) 1494 { 1495 float_status *fpst = fpstp; 1496 return -float32_le(make_float32(b), make_float32(a), fpst); 1497 } 1498 1499 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp) 1500 { 1501 float_status *fpst = fpstp; 1502 return -float32_lt(make_float32(b), make_float32(a), fpst); 1503 } 1504 1505 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp) 1506 { 1507 float_status *fpst = fpstp; 1508 float32 f0 = float32_abs(make_float32(a)); 1509 float32 f1 = float32_abs(make_float32(b)); 1510 return -float32_le(f1, f0, fpst); 1511 } 1512 1513 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp) 1514 { 1515 float_status *fpst = fpstp; 1516 float32 f0 = float32_abs(make_float32(a)); 1517 float32 f1 = float32_abs(make_float32(b)); 1518 return -float32_lt(f1, f0, fpst); 1519 } 1520 1521 uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp) 1522 { 1523 float_status *fpst = fpstp; 1524 float64 f0 = float64_abs(make_float64(a)); 1525 float64 f1 = float64_abs(make_float64(b)); 1526 return -float64_le(f1, f0, fpst); 1527 } 1528 1529 uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp) 1530 { 1531 float_status *fpst = fpstp; 1532 float64 f0 = float64_abs(make_float64(a)); 1533 float64 f1 = float64_abs(make_float64(b)); 1534 return -float64_lt(f1, f0, fpst); 1535 } 1536 1537 #define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1)) 1538 1539 void HELPER(neon_qunzip8)(void *vd, void *vm) 1540 { 1541 uint64_t *rd = vd, *rm = vm; 1542 uint64_t zd0 = rd[0], zd1 = rd[1]; 1543 uint64_t zm0 = rm[0], zm1 = rm[1]; 1544 1545 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8) 1546 | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24) 1547 | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40) 1548 | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56); 1549 uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8) 1550 | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24) 1551 | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40) 1552 | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56); 1553 uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8) 1554 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24) 1555 | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40) 1556 | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56); 1557 uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8) 1558 | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24) 1559 | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40) 1560 | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56); 1561 1562 rm[0] = m0; 1563 rm[1] = m1; 1564 rd[0] = d0; 1565 rd[1] = d1; 1566 } 1567 1568 void HELPER(neon_qunzip16)(void *vd, void *vm) 1569 { 1570 uint64_t *rd = vd, *rm = vm; 1571 uint64_t zd0 = rd[0], zd1 = rd[1]; 1572 uint64_t zm0 = rm[0], zm1 = rm[1]; 1573 1574 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16) 1575 | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48); 1576 uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16) 1577 | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48); 1578 uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16) 1579 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48); 1580 uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16) 1581 | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48); 1582 1583 rm[0] = m0; 1584 rm[1] = m1; 1585 rd[0] = d0; 1586 rd[1] = d1; 1587 } 1588 1589 void HELPER(neon_qunzip32)(void *vd, void *vm) 1590 { 1591 uint64_t *rd = vd, *rm = vm; 1592 uint64_t zd0 = rd[0], zd1 = rd[1]; 1593 uint64_t zm0 = rm[0], zm1 = rm[1]; 1594 1595 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32); 1596 uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1597 uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32); 1598 uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1599 1600 rm[0] = m0; 1601 rm[1] = m1; 1602 rd[0] = d0; 1603 rd[1] = d1; 1604 } 1605 1606 void HELPER(neon_unzip8)(void *vd, void *vm) 1607 { 1608 uint64_t *rd = vd, *rm = vm; 1609 uint64_t zd = rd[0], zm = rm[0]; 1610 1611 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8) 1612 | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24) 1613 | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1614 | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56); 1615 uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8) 1616 | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24) 1617 | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40) 1618 | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1619 1620 rm[0] = m0; 1621 rd[0] = d0; 1622 } 1623 1624 void HELPER(neon_unzip16)(void *vd, void *vm) 1625 { 1626 uint64_t *rd = vd, *rm = vm; 1627 uint64_t zd = rd[0], zm = rm[0]; 1628 1629 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16) 1630 | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48); 1631 uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16) 1632 | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1633 1634 rm[0] = m0; 1635 rd[0] = d0; 1636 } 1637 1638 void HELPER(neon_qzip8)(void *vd, void *vm) 1639 { 1640 uint64_t *rd = vd, *rm = vm; 1641 uint64_t zd0 = rd[0], zd1 = rd[1]; 1642 uint64_t zm0 = rm[0], zm1 = rm[1]; 1643 1644 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8) 1645 | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24) 1646 | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40) 1647 | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56); 1648 uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8) 1649 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24) 1650 | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40) 1651 | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56); 1652 uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8) 1653 | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24) 1654 | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40) 1655 | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56); 1656 uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8) 1657 | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24) 1658 | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40) 1659 | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56); 1660 1661 rm[0] = m0; 1662 rm[1] = m1; 1663 rd[0] = d0; 1664 rd[1] = d1; 1665 } 1666 1667 void HELPER(neon_qzip16)(void *vd, void *vm) 1668 { 1669 uint64_t *rd = vd, *rm = vm; 1670 uint64_t zd0 = rd[0], zd1 = rd[1]; 1671 uint64_t zm0 = rm[0], zm1 = rm[1]; 1672 1673 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16) 1674 | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48); 1675 uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16) 1676 | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48); 1677 uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16) 1678 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48); 1679 uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16) 1680 | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48); 1681 1682 rm[0] = m0; 1683 rm[1] = m1; 1684 rd[0] = d0; 1685 rd[1] = d1; 1686 } 1687 1688 void HELPER(neon_qzip32)(void *vd, void *vm) 1689 { 1690 uint64_t *rd = vd, *rm = vm; 1691 uint64_t zd0 = rd[0], zd1 = rd[1]; 1692 uint64_t zm0 = rm[0], zm1 = rm[1]; 1693 1694 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32); 1695 uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32); 1696 uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1697 uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1698 1699 rm[0] = m0; 1700 rm[1] = m1; 1701 rd[0] = d0; 1702 rd[1] = d1; 1703 } 1704 1705 void HELPER(neon_zip8)(void *vd, void *vm) 1706 { 1707 uint64_t *rd = vd, *rm = vm; 1708 uint64_t zd = rd[0], zm = rm[0]; 1709 1710 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8) 1711 | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24) 1712 | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1713 | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56); 1714 uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8) 1715 | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24) 1716 | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40) 1717 | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1718 1719 rm[0] = m0; 1720 rd[0] = d0; 1721 } 1722 1723 void HELPER(neon_zip16)(void *vd, void *vm) 1724 { 1725 uint64_t *rd = vd, *rm = vm; 1726 uint64_t zd = rd[0], zm = rm[0]; 1727 1728 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16) 1729 | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48); 1730 uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16) 1731 | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1732 1733 rm[0] = m0; 1734 rd[0] = d0; 1735 } 1736