/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

/* Sign-bit masks used by the saturation logic throughout this file. */
#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

/*
 * Raise the cumulative saturation (QC) flag.  Only usable inside
 * helpers that have a CPUARMState *env in scope.
 */
#define SET_QC() env->vfp.qc[0] = 1

/*
 * NEON_TYPEn declares a struct "neon_<name>" describing one 32-bit
 * operand split into n lanes.  v1 is always the least significant
 * lane; on big-endian hosts the members are declared in the opposite
 * order so that the struct has the same in-memory layout as the
 * uint32_t it is type-punned with (see NEON_UNPACK/NEON_PACK below)
 * on either host endianness.
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type. */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.
 */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* Expand the currently-#defined NEON_FN once per lane (1, 2 or 4). */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/*
 * Body of a binary 32-bit helper: unpack both operands into lane
 * structs, apply NEON_FN to each pair of lanes, repack the result.
 */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP, but the helper also receives the CPU state so that the
   per-lane NEON_FN may use SET_QC() / env->vfp.qc. */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/*
 * Whole-vector (gvec) variants: apply NEON_FN elementwise over
 * simd_oprsz(desc) bytes of the operands, then zero the tail bytes
 * up to simd_maxsz(desc).
 */
#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, void *venv, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    CPUARMState *env = venv; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

/* Pairwise operations.
 */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

/* Pairwise helper: adjacent lanes are combined; arg1 supplies the low
   half of the result, arg2 the high half. */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators.  NEON_FN's third argument is unused; the result is
   repacked into (and returned through) arg itself. */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}

/* Halving add: (a + b) >> 1 per lane.  For these narrow lane types the
   sum is evaluated in int, so it cannot overflow. */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    /* Shift each operand first so the sum cannot overflow 32 bits;
       re-add the carry that is lost when both low bits are set. */
    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

/* Rounding halving add: (a + b + 1) >> 1 per lane. */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    /* As hadd_s32, but rounding up: add one when either low bit is set. */
    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

/* Halving subtract: (a - b) >> 1 per lane. */
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    /* Shift first to avoid overflow; compensate for the borrow that is
       generated when src1 is even and src2 is odd. */
    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

/* Pairwise minimum/maximum. */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? \
    src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

/*
 * Shifts.  The do_[s|u|su]qrshl_* functions (from vec_internal.h)
 * implement the whole shift family; by the call pattern here the third
 * argument of the _bhs forms is the element width in bits, the bool
 * selects the rounding variant, and the final argument is the
 * saturation-flag pointer (NULL for non-saturating ops).  The shift
 * count is the signed low byte of the second operand.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

/* Signed rounding shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

/* Unsigned rounding shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}

/* Unsigned saturating shifts (set QC on saturation). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Signed saturating shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Saturating shifts with signed input and unsigned result (VQSHLU);
   do_suqrshl_* is the signed-in/unsigned-out variant from vec_internal. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Unsigned saturating rounding shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

/* Signed saturating rounding shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

/* Lanewise add on packed 8-bit lanes: mask off the per-lane top bits
   so carries cannot cross lane boundaries, then restore them by xor. */
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}

uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    /* Same lane-carry masking trick as neon_add_u8, for 16-bit lanes. */
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}

/* Lanewise subtract / multiply on 8- and 16-bit lanes. */
#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* VTST: lane becomes all-ones if the lanes share any set bit, else 0. */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

/* Count Leading Sign/Zero Bits.  */
static inline int do_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}

static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}

#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* CLS: leading sign bits, i.e. leading zeros of x (or ~x when x is
   negative) minus one for the sign bit itself. */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}

/* Bit count.
 */
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
{
    /* Classic parallel popcount, pairwise then nibble-wise; the last
       step leaves one per-byte count in each byte of the result. */
    x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
    return x;
}

/* Reverse bits in each 8 bit word */
uint32_t HELPER(neon_rbit_u8)(uint32_t x)
{
    x = ((x & 0xf0f0f0f0) >> 4)
      | ((x & 0x0f0f0f0f) << 4);
    x = ((x & 0x88888888) >> 3)
      | ((x & 0x44444444) >> 1)
      | ((x & 0x22222222) << 1)
      | ((x & 0x11111111) << 3);
    return x;
}

/*
 * Saturating doubling multiply returning high half:
 * dest = (2 * src1 * src2) >> 16, with QC set if the doubling
 * overflows or (for round != 0) if the rounding addition overflows.
 */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

/* 32-bit element version of the above, using a 64-bit intermediate. */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32

/*
 * Narrowing: pack the low (or high) half of each element of a 64-bit
 * vector into a 32-bit result with elements of half the width.
 */
uint32_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

uint32_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
           | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* Rounding narrow-high: add half an LSB of the result before taking
   the high half.  The pre-mask discards the low bits that cannot
   affect the result, so the additions cannot carry between elements. */
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
           | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* Narrow signed 16-bit elements to unsigned 8-bit with saturation
   (VQMOVUN): negative inputs saturate to 0, large ones to 0xff. */
uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        /* Negative: saturate to 0 (res bits already clear). */ \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Unsigned saturating narrow, 16 -> 8 bits per element. */
uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Signed saturating narrow, 16 -> 8 bits per element. */
uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        /* Out of range: 0x7f for positive, 0x80 for negative. */ \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* As unarrow_sat8, for two 32 -> 16 bit elements. */
uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

/* Single-element 64 -> 32 bit saturating narrows. */
uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}

/*
 * Widening: expand each element of a 32-bit vector to twice the width,
 * zero- or sign-extending, producing a 64-bit vector.
 */
uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    /* The (uint16_t) cast clips the sign extension to lane width. */
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}

/* Lanewise 64-bit add: mask per-lane top bits so carries do not cross
   lane boundaries, then restore them by xor (as neon_add_u8). */
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}

/* Pairwise add: sum adjacent 16-bit elements of each operand into the
   corresponding 32-bit elements of the result (a -> low, b -> high). */
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return ( tmp & 0xffff)
         | ((tmp >> 16) & 0xffff0000ull)
         | ((tmp2 << 16) & 0xffff00000000ull)
         | ( tmp2 & 0xffff000000000000ull);
}

uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a
                   + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}

/* Lanewise 64-bit subtract without cross-lane borrows: force the
   per-lane top bits of a and clear them in b, then patch the real top
   bits back in via the xor mask. */
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}

/* Signed saturating add, performed independently on each 32-bit half
   of the 64-bit operands; saturation sets QC. */
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    /* Overflow iff operands share a sign that differs from the sum's. */
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}

/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

/* Widening absolute difference (VABDL): per-lane |a - b|, each result
   stored in a lane of twice the input width. */
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD

/* Widening multiply. Named type is the source type.
 */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    /* Result type is the unsigned lane width so each signed product is
       truncated to its lane before being or-ed into place. */
    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}

/* Lanewise negate of a widened (64-bit) vector. */
uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t
             high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}

/* Saturating sign manipulation. */
/* ??? Make these use NEON_VOP1 */
/* Saturating absolute value: the most negative value saturates to the
   most positive one and sets QC. */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

/* Saturating negate, same saturation rule as above. */
#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16

uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if ((int32_t)x < 0) {
        x = -x;
    }
    return x;
}

uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else if ((int64_t)x < 0) {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else {
        x = -x;
    }
    return x;
}

/* NEON Float helpers. */

/* Floating point comparisons produce an integer result.
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
}

uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    /* a >= b is computed as b <= a. */
    float_status *fpst = fpstp;
    return -float32_le(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_lt(make_float32(b), make_float32(a), fpst);
}

/* Absolute compares: |a| >= |b| and |a| > |b|. */
uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_le(f1, f0, fpst);
}

uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_lt(f1, f0, fpst);
}

uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_le(f1, f0, fpst);
}

uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_lt(f1, f0, fpst);
}

/* Extract element N of SIZE bits from 64-bit value V. */
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))

/* VUZP (unzip): deinterleave even-indexed elements into d and
   odd-indexed elements into m.  The q variants operate on 128-bit
   (two uint64_t) registers, updating both in place. */
void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

/* 64-bit-register variants of the above. */
void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_unzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm,
0, 16) << 32) | (ELEM(zm, 2, 16) << 48); 1400 uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16) 1401 | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1402 1403 rm[0] = m0; 1404 rd[0] = d0; 1405 } 1406 1407 void HELPER(neon_qzip8)(void *vd, void *vm) 1408 { 1409 uint64_t *rd = vd, *rm = vm; 1410 uint64_t zd0 = rd[0], zd1 = rd[1]; 1411 uint64_t zm0 = rm[0], zm1 = rm[1]; 1412 1413 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8) 1414 | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24) 1415 | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40) 1416 | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56); 1417 uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8) 1418 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24) 1419 | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40) 1420 | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56); 1421 uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8) 1422 | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24) 1423 | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40) 1424 | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56); 1425 uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8) 1426 | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24) 1427 | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40) 1428 | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56); 1429 1430 rm[0] = m0; 1431 rm[1] = m1; 1432 rd[0] = d0; 1433 rd[1] = d1; 1434 } 1435 1436 void HELPER(neon_qzip16)(void *vd, void *vm) 1437 { 1438 uint64_t *rd = vd, *rm = vm; 1439 uint64_t zd0 = rd[0], zd1 = rd[1]; 1440 uint64_t zm0 = rm[0], zm1 = rm[1]; 1441 1442 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16) 1443 | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48); 1444 uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16) 1445 | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48); 1446 uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16) 1447 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48); 1448 uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) 
<< 16) 1449 | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48); 1450 1451 rm[0] = m0; 1452 rm[1] = m1; 1453 rd[0] = d0; 1454 rd[1] = d1; 1455 } 1456 1457 void HELPER(neon_qzip32)(void *vd, void *vm) 1458 { 1459 uint64_t *rd = vd, *rm = vm; 1460 uint64_t zd0 = rd[0], zd1 = rd[1]; 1461 uint64_t zm0 = rm[0], zm1 = rm[1]; 1462 1463 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32); 1464 uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32); 1465 uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1466 uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1467 1468 rm[0] = m0; 1469 rm[1] = m1; 1470 rd[0] = d0; 1471 rd[1] = d1; 1472 } 1473 1474 void HELPER(neon_zip8)(void *vd, void *vm) 1475 { 1476 uint64_t *rd = vd, *rm = vm; 1477 uint64_t zd = rd[0], zm = rm[0]; 1478 1479 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8) 1480 | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24) 1481 | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1482 | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56); 1483 uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8) 1484 | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24) 1485 | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40) 1486 | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1487 1488 rm[0] = m0; 1489 rd[0] = d0; 1490 } 1491 1492 void HELPER(neon_zip16)(void *vd, void *vm) 1493 { 1494 uint64_t *rd = vd, *rm = vm; 1495 uint64_t zd = rd[0], zm = rm[0]; 1496 1497 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16) 1498 | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48); 1499 uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16) 1500 | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1501 1502 rm[0] = m0; 1503 rd[0] = d0; 1504 } 1505