/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

/* Sign bit of a 32-bit / 64-bit value; used by the saturation logic below. */
#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

/* Record cumulative saturation; expands against a local 'env' variable. */
#define SET_QC() env->vfp.qc[0] = 1

/*
 * neon_<name> structs hold 1, 2 or 4 lanes packed into one 32-bit word.
 * On big-endian hosts the fields are declared in reverse order so that
 * lane v1 always occupies the low bits of the uint32_t image produced
 * by the NEON_PACK/NEON_UNPACK union punning below.
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type. */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.
 */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* Apply the current NEON_FN to each lane pair of vsrc1/vsrc2. */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/* Body of a binary lane-wise helper: unpack both 32-bit args, apply
 * NEON_FN per lane, repack.  'n' is the lane count (1, 2 or 4).
 */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* Variant whose NEON_FN expansion may reference 'env' (e.g. via SET_QC). */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* Full-vector (gvec) versions: apply NEON_FN over opr_sz bytes of the
 * operand vectors, then zero the tail up to maxsz via clear_tail().
 */
#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, void *venv, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    CPUARMState *env = venv; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

/* Pairwise operations.
*/ 145 /* For 32-bit elements each segment only contains a single element, so 146 the elementwise and pairwise operations are the same. */ 147 #define NEON_PDO2 \ 148 NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ 149 NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2); 150 #define NEON_PDO4 \ 151 NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ 152 NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \ 153 NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \ 154 NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \ 155 156 #define NEON_POP(name, vtype, n) \ 157 uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ 158 { \ 159 uint32_t res; \ 160 vtype vsrc1; \ 161 vtype vsrc2; \ 162 vtype vdest; \ 163 NEON_UNPACK(vtype, vsrc1, arg1); \ 164 NEON_UNPACK(vtype, vsrc2, arg2); \ 165 NEON_PDO##n; \ 166 NEON_PACK(vtype, res, vdest); \ 167 return res; \ 168 } 169 170 /* Unary operators. */ 171 #define NEON_VOP1(name, vtype, n) \ 172 uint32_t HELPER(glue(neon_,name))(uint32_t arg) \ 173 { \ 174 vtype vsrc1; \ 175 vtype vdest; \ 176 NEON_UNPACK(vtype, vsrc1, arg); \ 177 NEON_DO##n; \ 178 NEON_PACK(vtype, arg, vdest); \ 179 return arg; \ 180 } 181 182 #define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1 183 NEON_VOP(rhadd_s8, neon_s8, 4) 184 NEON_VOP(rhadd_u8, neon_u8, 4) 185 NEON_VOP(rhadd_s16, neon_s16, 2) 186 NEON_VOP(rhadd_u16, neon_u16, 2) 187 #undef NEON_FN 188 189 int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2) 190 { 191 int32_t dest; 192 193 dest = (src1 >> 1) + (src2 >> 1); 194 if ((src1 | src2) & 1) 195 dest++; 196 return dest; 197 } 198 199 uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2) 200 { 201 uint32_t dest; 202 203 dest = (src1 >> 1) + (src2 >> 1); 204 if ((src1 | src2) & 1) 205 dest++; 206 return dest; 207 } 208 209 #define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1 210 NEON_VOP(hsub_s8, neon_s8, 4) 211 NEON_VOP(hsub_u8, neon_u8, 4) 212 NEON_VOP(hsub_s16, neon_s16, 2) 213 NEON_VOP(hsub_u16, neon_u16, 2) 214 #undef NEON_FN 215 216 int32_t HELPER(neon_hsub_s32)(int32_t 
src1, int32_t src2) 217 { 218 int32_t dest; 219 220 dest = (src1 >> 1) - (src2 >> 1); 221 if ((~src1) & src2 & 1) 222 dest--; 223 return dest; 224 } 225 226 uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2) 227 { 228 uint32_t dest; 229 230 dest = (src1 >> 1) - (src2 >> 1); 231 if ((~src1) & src2 & 1) 232 dest--; 233 return dest; 234 } 235 236 #define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2 237 NEON_POP(pmin_s8, neon_s8, 4) 238 NEON_POP(pmin_u8, neon_u8, 4) 239 NEON_POP(pmin_s16, neon_s16, 2) 240 NEON_POP(pmin_u16, neon_u16, 2) 241 #undef NEON_FN 242 243 #define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2 244 NEON_POP(pmax_s8, neon_s8, 4) 245 NEON_POP(pmax_u8, neon_u8, 4) 246 NEON_POP(pmax_s16, neon_s16, 2) 247 NEON_POP(pmax_u16, neon_u16, 2) 248 #undef NEON_FN 249 250 #define NEON_FN(dest, src1, src2) \ 251 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL)) 252 NEON_VOP(shl_u16, neon_u16, 2) 253 #undef NEON_FN 254 255 #define NEON_FN(dest, src1, src2) \ 256 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL)) 257 NEON_VOP(shl_s16, neon_s16, 2) 258 #undef NEON_FN 259 260 #define NEON_FN(dest, src1, src2) \ 261 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL)) 262 NEON_VOP(rshl_s8, neon_s8, 4) 263 NEON_GVEC_VOP2(gvec_srshl_b, int8_t) 264 #undef NEON_FN 265 266 #define NEON_FN(dest, src1, src2) \ 267 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL)) 268 NEON_VOP(rshl_s16, neon_s16, 2) 269 NEON_GVEC_VOP2(gvec_srshl_h, int16_t) 270 #undef NEON_FN 271 272 #define NEON_FN(dest, src1, src2) \ 273 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL)) 274 NEON_GVEC_VOP2(gvec_srshl_s, int32_t) 275 #undef NEON_FN 276 277 #define NEON_FN(dest, src1, src2) \ 278 (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL)) 279 NEON_GVEC_VOP2(gvec_srshl_d, int64_t) 280 #undef NEON_FN 281 282 uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift) 283 { 284 return do_sqrshl_bhs(val, (int8_t)shift, 
32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

/* Unsigned rounding shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN

/* NOTE(review): int32_t lane type with the unsigned helper looks odd
 * but only the element width matters to do_uqrshl_bhs here.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}

/*
 * Unsigned saturating shifts: passing env->vfp.qc instead of NULL lets
 * the helpers record saturation in the cumulative QC flag (cf. SET_QC()).
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
#undef NEON_FN

uint32_t
HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Signed saturating shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Signed-input, unsigned-result saturating shifts (presumably the
 * VQSHLU family — confirm against the translator callers).
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t
HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Unsigned saturating rounding shifts (QC recorded via env->vfp.qc). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

/* Signed saturating rounding shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, \
env->vfp.qc)) 459 NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t) 460 #undef NEON_FN 461 462 uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift) 463 { 464 return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc); 465 } 466 467 uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift) 468 { 469 return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc); 470 } 471 472 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b) 473 { 474 uint32_t mask; 475 mask = (a ^ b) & 0x80808080u; 476 a &= ~0x80808080u; 477 b &= ~0x80808080u; 478 return (a + b) ^ mask; 479 } 480 481 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b) 482 { 483 uint32_t mask; 484 mask = (a ^ b) & 0x80008000u; 485 a &= ~0x80008000u; 486 b &= ~0x80008000u; 487 return (a + b) ^ mask; 488 } 489 490 #define NEON_FN(dest, src1, src2) dest = src1 - src2 491 NEON_VOP(sub_u8, neon_u8, 4) 492 NEON_VOP(sub_u16, neon_u16, 2) 493 #undef NEON_FN 494 495 #define NEON_FN(dest, src1, src2) dest = src1 * src2 496 NEON_VOP(mul_u8, neon_u8, 4) 497 NEON_VOP(mul_u16, neon_u16, 2) 498 #undef NEON_FN 499 500 #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0 501 NEON_VOP(tst_u8, neon_u8, 4) 502 NEON_VOP(tst_u16, neon_u16, 2) 503 NEON_VOP(tst_u32, neon_u32, 1) 504 #undef NEON_FN 505 506 /* Count Leading Sign/Zero Bits. */ 507 static inline int do_clz8(uint8_t x) 508 { 509 int n; 510 for (n = 8; x; n--) 511 x >>= 1; 512 return n; 513 } 514 515 static inline int do_clz16(uint16_t x) 516 { 517 int n; 518 for (n = 16; x; n--) 519 x >>= 1; 520 return n; 521 } 522 523 #define NEON_FN(dest, src, dummy) dest = do_clz8(src) 524 NEON_VOP1(clz_u8, neon_u8, 4) 525 #undef NEON_FN 526 527 #define NEON_FN(dest, src, dummy) dest = do_clz16(src) 528 NEON_VOP1(clz_u16, neon_u16, 2) 529 #undef NEON_FN 530 531 #define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? 
~src : src) - 1 532 NEON_VOP1(cls_s8, neon_s8, 4) 533 #undef NEON_FN 534 535 #define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1 536 NEON_VOP1(cls_s16, neon_s16, 2) 537 #undef NEON_FN 538 539 uint32_t HELPER(neon_cls_s32)(uint32_t x) 540 { 541 int count; 542 if ((int32_t)x < 0) 543 x = ~x; 544 for (count = 32; x; count--) 545 x = x >> 1; 546 return count - 1; 547 } 548 549 /* Bit count. */ 550 uint32_t HELPER(neon_cnt_u8)(uint32_t x) 551 { 552 x = (x & 0x55555555) + ((x >> 1) & 0x55555555); 553 x = (x & 0x33333333) + ((x >> 2) & 0x33333333); 554 x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f); 555 return x; 556 } 557 558 /* Reverse bits in each 8 bit word */ 559 uint32_t HELPER(neon_rbit_u8)(uint32_t x) 560 { 561 x = ((x & 0xf0f0f0f0) >> 4) 562 | ((x & 0x0f0f0f0f) << 4); 563 x = ((x & 0x88888888) >> 3) 564 | ((x & 0x44444444) >> 1) 565 | ((x & 0x22222222) << 1) 566 | ((x & 0x11111111) << 3); 567 return x; 568 } 569 570 #define NEON_QDMULH16(dest, src1, src2, round) do { \ 571 uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \ 572 if ((tmp ^ (tmp << 1)) & SIGNBIT) { \ 573 SET_QC(); \ 574 tmp = (tmp >> 31) ^ ~SIGNBIT; \ 575 } else { \ 576 tmp <<= 1; \ 577 } \ 578 if (round) { \ 579 int32_t old = tmp; \ 580 tmp += 1 << 15; \ 581 if ((int32_t)tmp < old) { \ 582 SET_QC(); \ 583 tmp = SIGNBIT - 1; \ 584 } \ 585 } \ 586 dest = tmp >> 16; \ 587 } while(0) 588 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0) 589 NEON_VOP_ENV(qdmulh_s16, neon_s16, 2) 590 #undef NEON_FN 591 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1) 592 NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2) 593 #undef NEON_FN 594 #undef NEON_QDMULH16 595 596 #define NEON_QDMULH32(dest, src1, src2, round) do { \ 597 uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \ 598 if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \ 599 SET_QC(); \ 600 tmp = (tmp >> 63) ^ ~SIGNBIT64; \ 601 } else { \ 602 tmp <<= 1; \ 603 } \ 604 if (round) { \ 605 int64_t old = 
tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32

/* Narrow four 16-bit (resp. two 32-bit) elements of a 64-bit vector
 * to the packed low halves of each element.
 */
uint32_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

uint32_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

/* As above but keeping the HIGH half of each element. */
uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* High-half narrow with rounding: add half a low-part unit to each
 * element (low bits below the rounding bit are cleared first; they
 * cannot influence the carry) before extracting the high halves.
 */
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* Narrow signed 16-bit elements to unsigned 8-bit with saturation:
 * negative lanes saturate to 0 (the res bits are simply left clear),
 * lanes above 0xff saturate to 0xff; QC is set on any saturation.
 */
uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t
HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
    /* Saturate each unsigned 16-bit element to 8 bits; QC on saturation. */
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
    /* Saturate each signed 16-bit element to the signed 8-bit range;
     * (s >> 15) ^ 0x7f yields 0x7f for positive and 0x80 for negative.
     */
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Narrow two signed 32-bit elements to unsigned 16-bit with saturation:
 * negative inputs clamp to 0, inputs above 0xffff clamp to 0xffff.
 */
uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

/* Narrow one signed 64-bit value to unsigned 32-bit with saturation. */
uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x
& 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    /* Clamp to INT32_MIN/INT32_MAX depending on the input's sign. */
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}

/* Widen each packed element to twice its width (zero-extended). */
uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    /* Sign-extend to 16 bits only (uint16_t cast) so that packed lanes
     * do not bleed into one another when OR-ed together.
     */
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}

/* Lane-isolated add of packed 16-bit (resp. 32-bit) elements within a
 * 64-bit value; same sign-bit masking trick as neon_add_u8.
 */
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    /* Pairwise add of eight 16-bit elements into four 32-bit sums:
     * tmp accumulates a's pairs in the low half of each 32-bit slot,
     * tmp2 accumulates b's pairs in the high half; the return expression
     * packs the four sums as 16-bit results.
     */
    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return    ( tmp         & 0xffff)
            | ((tmp  >> 16) & 0xffff0000ull)
            | ((tmp2 << 16) & 0xffff00000000ull)
            | ( tmp2        & 0xffff000000000000ull);
}

uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}

/* Lane-isolated subtract, the counterpart of the addl helpers above:
 * force a's per-lane top bits to 1 so borrows cannot cross lanes,
 * then patch the true top bits back in via the mask.
 */
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}

/* Saturating add of two pairs of signed 32-bit values; QC on overflow. */
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    /* Overflow iff the operands agree in sign but the sum does not. */
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}

/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
955 */ 956 #define DO_ABD(dest, x, y, intype, arithtype) do { \ 957 arithtype tmp_x = (intype)(x); \ 958 arithtype tmp_y = (intype)(y); \ 959 dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \ 960 } while(0) 961 962 uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b) 963 { 964 uint64_t tmp; 965 uint64_t result; 966 DO_ABD(result, a, b, uint8_t, uint32_t); 967 DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t); 968 result |= tmp << 16; 969 DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t); 970 result |= tmp << 32; 971 DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t); 972 result |= tmp << 48; 973 return result; 974 } 975 976 uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b) 977 { 978 uint64_t tmp; 979 uint64_t result; 980 DO_ABD(result, a, b, int8_t, int32_t); 981 DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t); 982 result |= tmp << 16; 983 DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t); 984 result |= tmp << 32; 985 DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t); 986 result |= tmp << 48; 987 return result; 988 } 989 990 uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b) 991 { 992 uint64_t tmp; 993 uint64_t result; 994 DO_ABD(result, a, b, uint16_t, uint32_t); 995 DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t); 996 return result | (tmp << 32); 997 } 998 999 uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b) 1000 { 1001 uint64_t tmp; 1002 uint64_t result; 1003 DO_ABD(result, a, b, int16_t, int32_t); 1004 DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t); 1005 return result | (tmp << 32); 1006 } 1007 1008 uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b) 1009 { 1010 uint64_t result; 1011 DO_ABD(result, a, b, uint32_t, uint64_t); 1012 return result; 1013 } 1014 1015 uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b) 1016 { 1017 uint64_t result; 1018 DO_ABD(result, a, b, int32_t, int64_t); 1019 return result; 1020 } 1021 #undef DO_ABD 1022 1023 /* Widening multiply. Named type is the source type. 
 */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    /* The product type is the UNSIGNED 16-bit type so each lane is
     * truncated to 16 bits rather than sign-extended into its neighbours.
     */
    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}

/* Per-lane negate of packed 16-bit (resp. 32-bit) elements. */
uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t
high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}

/* Saturating sign manipulation. */
/* ??? Make these use NEON_VOP1 */
/* Saturating absolute value per lane: the most negative value maps to
 * the most positive one and sets QC.
 */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16

/* 32-bit saturating absolute value: INT32_MIN -> INT32_MAX with QC. */
uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
SET_QC(); 1183 x = ~SIGNBIT; 1184 } else if ((int32_t)x < 0) { 1185 x = -x; 1186 } 1187 return x; 1188 } 1189 1190 uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x) 1191 { 1192 if (x == SIGNBIT) { 1193 SET_QC(); 1194 x = ~SIGNBIT; 1195 } else { 1196 x = -x; 1197 } 1198 return x; 1199 } 1200 1201 uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x) 1202 { 1203 if (x == SIGNBIT64) { 1204 SET_QC(); 1205 x = ~SIGNBIT64; 1206 } else if ((int64_t)x < 0) { 1207 x = -x; 1208 } 1209 return x; 1210 } 1211 1212 uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x) 1213 { 1214 if (x == SIGNBIT64) { 1215 SET_QC(); 1216 x = ~SIGNBIT64; 1217 } else { 1218 x = -x; 1219 } 1220 return x; 1221 } 1222 1223 /* NEON Float helpers. */ 1224 1225 /* Floating point comparisons produce an integer result. 1226 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. 1227 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 1228 */ 1229 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp) 1230 { 1231 float_status *fpst = fpstp; 1232 return -float32_eq_quiet(make_float32(a), make_float32(b), fpst); 1233 } 1234 1235 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp) 1236 { 1237 float_status *fpst = fpstp; 1238 return -float32_le(make_float32(b), make_float32(a), fpst); 1239 } 1240 1241 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp) 1242 { 1243 float_status *fpst = fpstp; 1244 return -float32_lt(make_float32(b), make_float32(a), fpst); 1245 } 1246 1247 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp) 1248 { 1249 float_status *fpst = fpstp; 1250 float32 f0 = float32_abs(make_float32(a)); 1251 float32 f1 = float32_abs(make_float32(b)); 1252 return -float32_le(f1, f0, fpst); 1253 } 1254 1255 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp) 1256 { 1257 float_status *fpst = fpstp; 1258 float32 f0 = float32_abs(make_float32(a)); 1259 float32 f1 = 
float32_abs(make_float32(b)); 1260 return -float32_lt(f1, f0, fpst); 1261 } 1262 1263 uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp) 1264 { 1265 float_status *fpst = fpstp; 1266 float64 f0 = float64_abs(make_float64(a)); 1267 float64 f1 = float64_abs(make_float64(b)); 1268 return -float64_le(f1, f0, fpst); 1269 } 1270 1271 uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp) 1272 { 1273 float_status *fpst = fpstp; 1274 float64 f0 = float64_abs(make_float64(a)); 1275 float64 f1 = float64_abs(make_float64(b)); 1276 return -float64_lt(f1, f0, fpst); 1277 } 1278 1279 #define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1)) 1280 1281 void HELPER(neon_qunzip8)(void *vd, void *vm) 1282 { 1283 uint64_t *rd = vd, *rm = vm; 1284 uint64_t zd0 = rd[0], zd1 = rd[1]; 1285 uint64_t zm0 = rm[0], zm1 = rm[1]; 1286 1287 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8) 1288 | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24) 1289 | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40) 1290 | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56); 1291 uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8) 1292 | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24) 1293 | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40) 1294 | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56); 1295 uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8) 1296 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24) 1297 | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40) 1298 | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56); 1299 uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8) 1300 | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24) 1301 | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40) 1302 | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56); 1303 1304 rm[0] = m0; 1305 rm[1] = m1; 1306 rd[0] = d0; 1307 rd[1] = d1; 1308 } 1309 1310 void HELPER(neon_qunzip16)(void *vd, void *vm) 1311 { 1312 uint64_t *rd = vd, *rm = vm; 1313 uint64_t zd0 = rd[0], 
zd1 = rd[1]; 1314 uint64_t zm0 = rm[0], zm1 = rm[1]; 1315 1316 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16) 1317 | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48); 1318 uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16) 1319 | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48); 1320 uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16) 1321 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48); 1322 uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16) 1323 | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48); 1324 1325 rm[0] = m0; 1326 rm[1] = m1; 1327 rd[0] = d0; 1328 rd[1] = d1; 1329 } 1330 1331 void HELPER(neon_qunzip32)(void *vd, void *vm) 1332 { 1333 uint64_t *rd = vd, *rm = vm; 1334 uint64_t zd0 = rd[0], zd1 = rd[1]; 1335 uint64_t zm0 = rm[0], zm1 = rm[1]; 1336 1337 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32); 1338 uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1339 uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32); 1340 uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1341 1342 rm[0] = m0; 1343 rm[1] = m1; 1344 rd[0] = d0; 1345 rd[1] = d1; 1346 } 1347 1348 void HELPER(neon_unzip8)(void *vd, void *vm) 1349 { 1350 uint64_t *rd = vd, *rm = vm; 1351 uint64_t zd = rd[0], zm = rm[0]; 1352 1353 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8) 1354 | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24) 1355 | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1356 | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56); 1357 uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8) 1358 | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24) 1359 | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40) 1360 | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1361 1362 rm[0] = m0; 1363 rd[0] = d0; 1364 } 1365 1366 void HELPER(neon_unzip16)(void *vd, void *vm) 1367 { 1368 uint64_t *rd = vd, *rm = vm; 1369 uint64_t zd = rd[0], zm = rm[0]; 1370 1371 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16) 1372 | (ELEM(zm, 
0, 16) << 32) | (ELEM(zm, 2, 16) << 48); 1373 uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16) 1374 | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1375 1376 rm[0] = m0; 1377 rd[0] = d0; 1378 } 1379 1380 void HELPER(neon_qzip8)(void *vd, void *vm) 1381 { 1382 uint64_t *rd = vd, *rm = vm; 1383 uint64_t zd0 = rd[0], zd1 = rd[1]; 1384 uint64_t zm0 = rm[0], zm1 = rm[1]; 1385 1386 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8) 1387 | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24) 1388 | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40) 1389 | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56); 1390 uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8) 1391 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24) 1392 | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40) 1393 | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56); 1394 uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8) 1395 | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24) 1396 | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40) 1397 | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56); 1398 uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8) 1399 | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24) 1400 | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40) 1401 | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56); 1402 1403 rm[0] = m0; 1404 rm[1] = m1; 1405 rd[0] = d0; 1406 rd[1] = d1; 1407 } 1408 1409 void HELPER(neon_qzip16)(void *vd, void *vm) 1410 { 1411 uint64_t *rd = vd, *rm = vm; 1412 uint64_t zd0 = rd[0], zd1 = rd[1]; 1413 uint64_t zm0 = rm[0], zm1 = rm[1]; 1414 1415 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16) 1416 | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48); 1417 uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16) 1418 | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48); 1419 uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16) 1420 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48); 1421 uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) 
<< 16) 1422 | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48); 1423 1424 rm[0] = m0; 1425 rm[1] = m1; 1426 rd[0] = d0; 1427 rd[1] = d1; 1428 } 1429 1430 void HELPER(neon_qzip32)(void *vd, void *vm) 1431 { 1432 uint64_t *rd = vd, *rm = vm; 1433 uint64_t zd0 = rd[0], zd1 = rd[1]; 1434 uint64_t zm0 = rm[0], zm1 = rm[1]; 1435 1436 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32); 1437 uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32); 1438 uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1439 uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1440 1441 rm[0] = m0; 1442 rm[1] = m1; 1443 rd[0] = d0; 1444 rd[1] = d1; 1445 } 1446 1447 void HELPER(neon_zip8)(void *vd, void *vm) 1448 { 1449 uint64_t *rd = vd, *rm = vm; 1450 uint64_t zd = rd[0], zm = rm[0]; 1451 1452 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8) 1453 | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24) 1454 | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1455 | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56); 1456 uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8) 1457 | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24) 1458 | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40) 1459 | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1460 1461 rm[0] = m0; 1462 rd[0] = d0; 1463 } 1464 1465 void HELPER(neon_zip16)(void *vd, void *vm) 1466 { 1467 uint64_t *rd = vd, *rm = vm; 1468 uint64_t zd = rd[0], zm = rm[0]; 1469 1470 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16) 1471 | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48); 1472 uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16) 1473 | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1474 1475 rm[0] = m0; 1476 rd[0] = d0; 1477 } 1478