/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */
#include "qemu/osdep.h"

#include "cpu.h"
#include "exec/helper-proto.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

/* Set the cumulative saturation (QC) flag in the VFP status. */
#define SET_QC() env->vfp.qc[0] = 1

/*
 * NEON_TYPEn defines a struct of n lane elements used to view a packed
 * uint32_t as a short vector.  On big-endian hosts the members are
 * declared in reverse order so that lane v1 always aliases the same
 * bits of the packed word regardless of host byte order.
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type.  The union makes
   this a plain byte copy, avoiding strict-aliasing problems. */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t. */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* Apply the (separately #defined) per-lane NEON_FN to 1, 2 or 4 lanes. */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/* Body of a binary lanewise helper: unpack both args, apply NEON_FN
   per lane, repack. */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Emit a helper_neon_<name> function built from the current NEON_FN. */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP but the helper also takes the CPU state (for SET_QC()). */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* Pairwise operations. */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

/* Emit a pairwise helper: lanes combine adjacent elements of each arg. */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators.  NEON_FN's third argument is ignored here. */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}


/* Unsigned saturating add of one narrow lane: compute in 32 bits and
   saturate to all-ones (setting QC) if the result does not fit. */
#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = ~0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qadd_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qadd_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT

/* 32-bit unsigned saturating add: wrap-around implies overflow. */
uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    if (res < a) {
        SET_QC();
        res = ~0;
    }
    return res;
}

/* 64-bit unsigned saturating add. */
uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    if (res < src1) {
        SET_QC();
        res = ~(uint64_t)0;
    }
    return res;
}

/* Signed saturating add of one narrow lane: compute in 32 bits and
   clamp to the type's min/max (by the sign of src2) on overflow. */
#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 > 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qadd_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qadd_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT

/* 32-bit signed saturating add.  Overflow iff the operands have the
   same sign and the result's sign differs; saturate toward the sign
   of the operands. */
uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
        SET_QC();
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}

/* 64-bit signed saturating add (same overflow test as the 32-bit case). */
uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}

/* Unsigned saturating subtract of one narrow lane: saturates to 0. */
#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = 0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qsub_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qsub_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT

/* 32-bit unsigned saturating subtract. */
uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a - b;
    if (res > a) {
        SET_QC();
        res = 0;
    }
    return res;
}

/* 64-bit unsigned saturating subtract. */
uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    if (src1 < src2) {
        SET_QC();
        res = 0;
    } else {
        res = src1 - src2;
    }
    return res;
}

/* Signed saturating subtract of one narrow lane: clamp to the type's
   max when subtracting a negative, min otherwise. */
#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 < 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qsub_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qsub_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT

/* 32-bit signed saturating subtract.  Overflow iff the operands have
   different signs and the result's sign differs from src1's. */
uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a - b;
    if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
        SET_QC();
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}

/* 64-bit signed saturating subtract. */
uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 - src2;
    if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}

/* Halving add: (a + b) >> 1 without intermediate overflow (the sum is
   computed in int, which is wider than the 8/16-bit lanes). */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN

/* 32-bit halving add: shift first, then re-add the carry from the two
   discarded low bits (set only if both low bits were 1). */
int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

/* Rounding halving add: (a + b + 1) >> 1. */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN

/* 32-bit rounding halving add: round up if either low bit was set. */
int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

/* Halving subtract: (a - b) >> 1 without intermediate overflow. */
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN

/* 32-bit halving subtract: shift first, then apply the borrow from
   the discarded low bits. */
int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

/* Pairwise min/max. */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

/*
 * Shift helpers.  The real work is done by the shared do_[s|u|su]qrshl_*
 * routines (declared in vec_internal.h): the flags select rounding and
 * whether a saturation pointer (here NULL = no saturation) is used.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
#undef NEON_FN

/* Unsigned rounding shifts, 32/64-bit (no saturation). */
uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}

/*
 * Saturating shifts: same shared do_*qrshl_* routines, but passing
 * env->vfp.qc so saturation sets the cumulative QC flag.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Signed-input, unsigned-result saturating shifts (VQSHLU). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Saturating rounding shifts (round flag = true, QC on saturation). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

/* Lanewise byte add in a single 32-bit op: clear the lane sign bits so
   carries cannot cross lane boundaries, then restore them via XOR. */
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}

/* Same carry-isolation trick for two 16-bit lanes. */
uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}

/* Lanewise subtract, multiply, and test (all-ones if any common bit). */
#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

/* Count Leading Sign/Zero Bits. */
static inline int do_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}

static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}

#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* CLS: complement negative inputs so leading sign bits become leading
   zeros; -1 because the sign bit itself is not counted. */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}

/* Bit count. */
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
{
    /* Parallel popcount per byte: pairwise sums of 1, 2, then 4 bits. */
    x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
    return x;
}

/* Reverse bits in each 8 bit word */
uint32_t HELPER(neon_rbit_u8)(uint32_t x)
{
    x = ((x & 0xf0f0f0f0) >> 4)
      | ((x & 0x0f0f0f0f) << 4);
    x = ((x & 0x88888888) >> 3)
      | ((x & 0x44444444) >> 1)
      | ((x & 0x22222222) << 1)
      | ((x & 0x11111111) << 3);
    return x;
}

/*
 * Saturating doubling multiply (high half), 16-bit lanes.  The
 * (tmp ^ (tmp << 1)) test detects overflow of the doubling; when
 * rounding, adding the rounding constant may overflow again.
 */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

/* As NEON_QDMULH16 but for 32-bit lanes using 64-bit intermediates. */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32

/* Narrow each 16-bit lane of a 64-bit value to 8 bits (truncating). */
uint32_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

/* Narrow each 32-bit lane to 16 bits (truncating). */
uint32_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

/* Narrow by taking the high half of each lane. */
uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
           | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* Narrow to the high half of each lane with rounding: mask the bits
   that survive, add half an LSB of the result per lane, then extract. */
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
           | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* Narrow signed 16-bit lanes to unsigned 8-bit with saturation:
   negative lanes saturate to 0, values > 0xff to 0xff. */
uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Narrow unsigned 16-bit lanes to 8-bit with saturation. */
uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Narrow signed 16-bit lanes to signed 8-bit with saturation. */
uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Narrow signed 32-bit lanes to unsigned 16-bit with saturation. */
uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

/* Narrow unsigned 32-bit lanes to 16-bit with saturation. */
uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

/* Narrow signed 32-bit lanes to signed 16-bit with saturation. */
uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

/* Narrow a signed 64-bit value to unsigned 32-bit with saturation. */
uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Narrow an unsigned 64-bit value to 32-bit with saturation. */
uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Narrow a signed 64-bit value to signed 32-bit with saturation. */
uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}

/* Widen four unsigned bytes to four 16-bit lanes of a 64-bit value. */
uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

/* Widen four signed bytes (sign-extended to 16 bits per lane). */
uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

/* Widen two 16-bit lanes to two 32-bit lanes. */
uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}

/* Lanewise add of 16-bit lanes in a 64-bit value, using the same
   sign-bit carry-isolation trick as neon_add_u8/u16. */
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

/* Lanewise add of 32-bit lanes in a 64-bit value. */
uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}

/* Pairwise add-long: sum adjacent 16-bit lanes of each operand into
   32-bit results; a supplies the low lanes, b the high lanes. */
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return    ( tmp         & 0xffff)
            | ((tmp  >> 16) & 0xffff0000ull)
            | ((tmp2 << 16) & 0xffff00000000ull)
            | ( tmp2        & 0xffff000000000000ull);
}

/* Pairwise add-long of 32-bit lanes (results truncated to 32 bits). */
uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}

/* Lanewise subtract of 16-bit lanes: force each a-lane's top bit on so
   borrows cannot propagate across lanes; mask repairs the top bits. */
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

/* Lanewise subtract of 32-bit lanes. */
uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}

/* Saturating add of two 32-bit lanes, setting QC on overflow. */
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

/* Saturating 64-bit add, setting QC on overflow. */
uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}

/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

/* Absolute-difference-long: widen each lane pair's |a - b|. */
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD

/* Widening multiply. Named type is the source type. */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}

/* Lanewise negate of 16-bit lanes in a 64-bit value. */
uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

/* Lanewise negate of 32-bit lanes. */
uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}

/* Saturating sign manipulation. */
/* ??? Make these use NEON_VOP1 */
/* Absolute value, saturating INT8_MIN to INT8_MAX with QC. */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

/* Negate, saturating INT8_MIN to INT8_MAX with QC. */
#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16

uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if ((int32_t)x < 0) {
        x = -x;
    }
    return x;
}

uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else if ((int64_t)x < 0) {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else {
        x = -x;
    }
    return x;
}

/* NEON Float helpers. */

/* Floating point comparisons produce an integer result.
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
}

uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_le(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_lt(make_float32(b), make_float32(a), fpst);
}

/* Absolute compares: |a| >= |b| and |a| > |b|. */
uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_le(f1, f0, fpst);
}

uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_lt(f1, f0, fpst);
}

uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_le(f1, f0, fpst);
}

uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_lt(f1, f0, fpst);
}

/* Extract element N of SIZE bits from 64-bit value V. */
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))

/* VUZP on a 128-bit d:m register pair of bytes: even-indexed elements
   go to d, odd-indexed elements to m. */
void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0],
zd1 = rd[1]; 1407 uint64_t zm0 = rm[0], zm1 = rm[1]; 1408 1409 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16) 1410 | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48); 1411 uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16) 1412 | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48); 1413 uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16) 1414 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48); 1415 uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16) 1416 | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48); 1417 1418 rm[0] = m0; 1419 rm[1] = m1; 1420 rd[0] = d0; 1421 rd[1] = d1; 1422 } 1423 1424 void HELPER(neon_qunzip32)(void *vd, void *vm) 1425 { 1426 uint64_t *rd = vd, *rm = vm; 1427 uint64_t zd0 = rd[0], zd1 = rd[1]; 1428 uint64_t zm0 = rm[0], zm1 = rm[1]; 1429 1430 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32); 1431 uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1432 uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32); 1433 uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1434 1435 rm[0] = m0; 1436 rm[1] = m1; 1437 rd[0] = d0; 1438 rd[1] = d1; 1439 } 1440 1441 void HELPER(neon_unzip8)(void *vd, void *vm) 1442 { 1443 uint64_t *rd = vd, *rm = vm; 1444 uint64_t zd = rd[0], zm = rm[0]; 1445 1446 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8) 1447 | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24) 1448 | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1449 | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56); 1450 uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8) 1451 | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24) 1452 | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40) 1453 | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1454 1455 rm[0] = m0; 1456 rd[0] = d0; 1457 } 1458 1459 void HELPER(neon_unzip16)(void *vd, void *vm) 1460 { 1461 uint64_t *rd = vd, *rm = vm; 1462 uint64_t zd = rd[0], zm = rm[0]; 1463 1464 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16) 1465 | (ELEM(zm, 
0, 16) << 32) | (ELEM(zm, 2, 16) << 48); 1466 uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16) 1467 | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1468 1469 rm[0] = m0; 1470 rd[0] = d0; 1471 } 1472 1473 void HELPER(neon_qzip8)(void *vd, void *vm) 1474 { 1475 uint64_t *rd = vd, *rm = vm; 1476 uint64_t zd0 = rd[0], zd1 = rd[1]; 1477 uint64_t zm0 = rm[0], zm1 = rm[1]; 1478 1479 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8) 1480 | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24) 1481 | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40) 1482 | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56); 1483 uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8) 1484 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24) 1485 | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40) 1486 | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56); 1487 uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8) 1488 | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24) 1489 | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40) 1490 | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56); 1491 uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8) 1492 | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24) 1493 | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40) 1494 | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56); 1495 1496 rm[0] = m0; 1497 rm[1] = m1; 1498 rd[0] = d0; 1499 rd[1] = d1; 1500 } 1501 1502 void HELPER(neon_qzip16)(void *vd, void *vm) 1503 { 1504 uint64_t *rd = vd, *rm = vm; 1505 uint64_t zd0 = rd[0], zd1 = rd[1]; 1506 uint64_t zm0 = rm[0], zm1 = rm[1]; 1507 1508 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16) 1509 | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48); 1510 uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16) 1511 | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48); 1512 uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16) 1513 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48); 1514 uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) 
<< 16) 1515 | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48); 1516 1517 rm[0] = m0; 1518 rm[1] = m1; 1519 rd[0] = d0; 1520 rd[1] = d1; 1521 } 1522 1523 void HELPER(neon_qzip32)(void *vd, void *vm) 1524 { 1525 uint64_t *rd = vd, *rm = vm; 1526 uint64_t zd0 = rd[0], zd1 = rd[1]; 1527 uint64_t zm0 = rm[0], zm1 = rm[1]; 1528 1529 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32); 1530 uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32); 1531 uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1532 uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1533 1534 rm[0] = m0; 1535 rm[1] = m1; 1536 rd[0] = d0; 1537 rd[1] = d1; 1538 } 1539 1540 void HELPER(neon_zip8)(void *vd, void *vm) 1541 { 1542 uint64_t *rd = vd, *rm = vm; 1543 uint64_t zd = rd[0], zm = rm[0]; 1544 1545 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8) 1546 | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24) 1547 | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1548 | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56); 1549 uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8) 1550 | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24) 1551 | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40) 1552 | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1553 1554 rm[0] = m0; 1555 rd[0] = d0; 1556 } 1557 1558 void HELPER(neon_zip16)(void *vd, void *vm) 1559 { 1560 uint64_t *rd = vd, *rm = vm; 1561 uint64_t zd = rd[0], zm = rm[0]; 1562 1563 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16) 1564 | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48); 1565 uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16) 1566 | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1567 1568 rm[0] = m0; 1569 rd[0] = d0; 1570 } 1571