1 /* 2 * ARM NEON vector operations. 3 * 4 * Copyright (c) 2007, 2008 CodeSourcery. 5 * Written by Paul Brook 6 * 7 * This code is licensed under the GNU GPL v2. 8 */ 9 10 #include "qemu/osdep.h" 11 #include "cpu.h" 12 #include "exec/helper-proto.h" 13 #include "tcg/tcg-gvec-desc.h" 14 #include "fpu/softfloat.h" 15 #include "vec_internal.h" 16 17 #define SIGNBIT (uint32_t)0x80000000 18 #define SIGNBIT64 ((uint64_t)1 << 63) 19 20 #define SET_QC() env->vfp.qc[0] = 1 21 22 #define NEON_TYPE1(name, type) \ 23 typedef struct \ 24 { \ 25 type v1; \ 26 } neon_##name; 27 #if HOST_BIG_ENDIAN 28 #define NEON_TYPE2(name, type) \ 29 typedef struct \ 30 { \ 31 type v2; \ 32 type v1; \ 33 } neon_##name; 34 #define NEON_TYPE4(name, type) \ 35 typedef struct \ 36 { \ 37 type v4; \ 38 type v3; \ 39 type v2; \ 40 type v1; \ 41 } neon_##name; 42 #else 43 #define NEON_TYPE2(name, type) \ 44 typedef struct \ 45 { \ 46 type v1; \ 47 type v2; \ 48 } neon_##name; 49 #define NEON_TYPE4(name, type) \ 50 typedef struct \ 51 { \ 52 type v1; \ 53 type v2; \ 54 type v3; \ 55 type v4; \ 56 } neon_##name; 57 #endif 58 59 NEON_TYPE4(s8, int8_t) 60 NEON_TYPE4(u8, uint8_t) 61 NEON_TYPE2(s16, int16_t) 62 NEON_TYPE2(u16, uint16_t) 63 NEON_TYPE1(s32, int32_t) 64 NEON_TYPE1(u32, uint32_t) 65 #undef NEON_TYPE4 66 #undef NEON_TYPE2 67 #undef NEON_TYPE1 68 69 /* Copy from a uint32_t to a vector structure type. */ 70 #define NEON_UNPACK(vtype, dest, val) do { \ 71 union { \ 72 vtype v; \ 73 uint32_t i; \ 74 } conv_u; \ 75 conv_u.i = (val); \ 76 dest = conv_u.v; \ 77 } while(0) 78 79 /* Copy from a vector structure type to a uint32_t. 
*/
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } pun; \
    pun.v = (val); \
    dest = pun.i; \
} while(0)

/*
 * Apply NEON_FN lane by lane to the locals vsrc1/vsrc2 and store into vdest.
 * The digit in the name is the number of lanes.  NEON_FN is (re)defined by
 * the code that instantiates a helper.
 */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/*
 * Function body shared by NEON_VOP and NEON_VOP_ENV: unpack arg1 and arg2
 * into n lanes of vtype, run NEON_FN on each lane pair and return the
 * repacked 32-bit result.
 */
#define NEON_VOP_BODY(vtype, n) \
{ \
    vtype vsrc1, vsrc2, vdest; \
    uint32_t res; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Define helper neon_<name>(arg1, arg2) operating on one packed word. */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP, but the helper also receives env so NEON_FN may use it. */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/*
 * Define a vector helper that applies NEON_FN(d[i], n[i], m[i]) to every
 * vtype element within simd_oprsz(desc) bytes, then calls clear_tail() for
 * the range up to simd_maxsz(desc).
 */
#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    intptr_t nelem = opr_sz / sizeof(vtype); \
    vtype *d = vd; \
    vtype *n = vn; \
    vtype *m = vm; \
    for (intptr_t i = 0; i < nelem; i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

/* As NEON_GVEC_VOP2, with venv exposed to NEON_FN as "env". */
#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, void *venv, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    intptr_t nelem = opr_sz / sizeof(vtype); \
    vtype *d = vd; \
    vtype *n = vn; \
    vtype *m = vm; \
    CPUARMState *env = venv; \
    for (intptr_t i = 0; i < nelem; i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

#define NEON_GVEC_VOP2i_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *venv, uint32_t desc) \
\ 146 { \ 147 intptr_t i, opr_sz = simd_oprsz(desc); \ 148 int imm = simd_data(desc); \ 149 vtype *d = vd, *n = vn; \ 150 CPUARMState *env = venv; \ 151 for (i = 0; i < opr_sz / sizeof(vtype); i++) { \ 152 NEON_FN(d[i], n[i], imm); \ 153 } \ 154 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 155 } 156 157 /* Pairwise operations. */ 158 /* For 32-bit elements each segment only contains a single element, so 159 the elementwise and pairwise operations are the same. */ 160 #define NEON_PDO2 \ 161 NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ 162 NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2); 163 #define NEON_PDO4 \ 164 NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ 165 NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \ 166 NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \ 167 NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \ 168 169 #define NEON_POP(name, vtype, n) \ 170 uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ 171 { \ 172 uint32_t res; \ 173 vtype vsrc1; \ 174 vtype vsrc2; \ 175 vtype vdest; \ 176 NEON_UNPACK(vtype, vsrc1, arg1); \ 177 NEON_UNPACK(vtype, vsrc2, arg2); \ 178 NEON_PDO##n; \ 179 NEON_PACK(vtype, res, vdest); \ 180 return res; \ 181 } 182 183 /* Unary operators. */ 184 #define NEON_VOP1(name, vtype, n) \ 185 uint32_t HELPER(glue(neon_,name))(uint32_t arg) \ 186 { \ 187 vtype vsrc1; \ 188 vtype vdest; \ 189 NEON_UNPACK(vtype, vsrc1, arg); \ 190 NEON_DO##n; \ 191 NEON_PACK(vtype, arg, vdest); \ 192 return arg; \ 193 } 194 195 #define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2 196 NEON_POP(pmin_s8, neon_s8, 4) 197 NEON_POP(pmin_u8, neon_u8, 4) 198 NEON_POP(pmin_s16, neon_s16, 2) 199 NEON_POP(pmin_u16, neon_u16, 2) 200 #undef NEON_FN 201 202 #define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? 
src1 : src2 203 NEON_POP(pmax_s8, neon_s8, 4) 204 NEON_POP(pmax_u8, neon_u8, 4) 205 NEON_POP(pmax_s16, neon_s16, 2) 206 NEON_POP(pmax_u16, neon_u16, 2) 207 #undef NEON_FN 208 209 #define NEON_FN(dest, src1, src2) \ 210 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL)) 211 NEON_VOP(shl_u16, neon_u16, 2) 212 #undef NEON_FN 213 214 #define NEON_FN(dest, src1, src2) \ 215 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL)) 216 NEON_VOP(shl_s16, neon_s16, 2) 217 #undef NEON_FN 218 219 #define NEON_FN(dest, src1, src2) \ 220 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL)) 221 NEON_VOP(rshl_s8, neon_s8, 4) 222 NEON_GVEC_VOP2(gvec_srshl_b, int8_t) 223 #undef NEON_FN 224 225 #define NEON_FN(dest, src1, src2) \ 226 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL)) 227 NEON_VOP(rshl_s16, neon_s16, 2) 228 NEON_GVEC_VOP2(gvec_srshl_h, int16_t) 229 #undef NEON_FN 230 231 #define NEON_FN(dest, src1, src2) \ 232 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL)) 233 NEON_GVEC_VOP2(gvec_srshl_s, int32_t) 234 #undef NEON_FN 235 236 #define NEON_FN(dest, src1, src2) \ 237 (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL)) 238 NEON_GVEC_VOP2(gvec_srshl_d, int64_t) 239 #undef NEON_FN 240 241 uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift) 242 { 243 return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL); 244 } 245 246 uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift) 247 { 248 return do_sqrshl_d(val, (int8_t)shift, true, NULL); 249 } 250 251 #define NEON_FN(dest, src1, src2) \ 252 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL)) 253 NEON_VOP(rshl_u8, neon_u8, 4) 254 NEON_GVEC_VOP2(gvec_urshl_b, uint8_t) 255 #undef NEON_FN 256 257 #define NEON_FN(dest, src1, src2) \ 258 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL)) 259 NEON_VOP(rshl_u16, neon_u16, 2) 260 NEON_GVEC_VOP2(gvec_urshl_h, uint16_t) 261 #undef NEON_FN 262 263 #define NEON_FN(dest, src1, src2) \ 264 (dest = do_uqrshl_bhs(src1, 
(int8_t)src2, 32, true, NULL)) 265 NEON_GVEC_VOP2(gvec_urshl_s, int32_t) 266 #undef NEON_FN 267 268 #define NEON_FN(dest, src1, src2) \ 269 (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL)) 270 NEON_GVEC_VOP2(gvec_urshl_d, int64_t) 271 #undef NEON_FN 272 273 uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift) 274 { 275 return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL); 276 } 277 278 uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift) 279 { 280 return do_uqrshl_d(val, (int8_t)shift, true, NULL); 281 } 282 283 #define NEON_FN(dest, src1, src2) \ 284 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) 285 NEON_VOP_ENV(qshl_u8, neon_u8, 4) 286 NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t) 287 NEON_GVEC_VOP2i_ENV(neon_uqshli_b, uint8_t) 288 #undef NEON_FN 289 290 #define NEON_FN(dest, src1, src2) \ 291 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) 292 NEON_VOP_ENV(qshl_u16, neon_u16, 2) 293 NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t) 294 NEON_GVEC_VOP2i_ENV(neon_uqshli_h, uint16_t) 295 #undef NEON_FN 296 297 #define NEON_FN(dest, src1, src2) \ 298 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc)) 299 NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t) 300 NEON_GVEC_VOP2i_ENV(neon_uqshli_s, uint32_t) 301 #undef NEON_FN 302 303 #define NEON_FN(dest, src1, src2) \ 304 (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc)) 305 NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t) 306 NEON_GVEC_VOP2i_ENV(neon_uqshli_d, uint64_t) 307 #undef NEON_FN 308 309 uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift) 310 { 311 return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); 312 } 313 314 uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift) 315 { 316 return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc); 317 } 318 319 #define NEON_FN(dest, src1, src2) \ 320 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) 321 NEON_VOP_ENV(qshl_s8, 
neon_s8, 4) 322 NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t) 323 NEON_GVEC_VOP2i_ENV(neon_sqshli_b, int8_t) 324 #undef NEON_FN 325 326 #define NEON_FN(dest, src1, src2) \ 327 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) 328 NEON_VOP_ENV(qshl_s16, neon_s16, 2) 329 NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t) 330 NEON_GVEC_VOP2i_ENV(neon_sqshli_h, int16_t) 331 #undef NEON_FN 332 333 #define NEON_FN(dest, src1, src2) \ 334 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc)) 335 NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t) 336 NEON_GVEC_VOP2i_ENV(neon_sqshli_s, int32_t) 337 #undef NEON_FN 338 339 #define NEON_FN(dest, src1, src2) \ 340 (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc)) 341 NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t) 342 NEON_GVEC_VOP2i_ENV(neon_sqshli_d, int64_t) 343 #undef NEON_FN 344 345 uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift) 346 { 347 return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); 348 } 349 350 uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift) 351 { 352 return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc); 353 } 354 355 #define NEON_FN(dest, src1, src2) \ 356 (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) 357 NEON_VOP_ENV(qshlu_s8, neon_s8, 4) 358 NEON_GVEC_VOP2i_ENV(neon_sqshlui_b, int8_t) 359 #undef NEON_FN 360 361 #define NEON_FN(dest, src1, src2) \ 362 (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) 363 NEON_VOP_ENV(qshlu_s16, neon_s16, 2) 364 NEON_GVEC_VOP2i_ENV(neon_sqshlui_h, int16_t) 365 #undef NEON_FN 366 367 uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift) 368 { 369 return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); 370 } 371 372 uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift) 373 { 374 return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc); 375 } 376 377 #define NEON_FN(dest, src1, src2) \ 
378 (dest = do_suqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc)) 379 NEON_GVEC_VOP2i_ENV(neon_sqshlui_s, int32_t) 380 #undef NEON_FN 381 382 #define NEON_FN(dest, src1, src2) \ 383 (dest = do_suqrshl_d(src1, (int8_t)src2, false, env->vfp.qc)) 384 NEON_GVEC_VOP2i_ENV(neon_sqshlui_d, int64_t) 385 #undef NEON_FN 386 387 #define NEON_FN(dest, src1, src2) \ 388 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc)) 389 NEON_VOP_ENV(qrshl_u8, neon_u8, 4) 390 NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t) 391 #undef NEON_FN 392 393 #define NEON_FN(dest, src1, src2) \ 394 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc)) 395 NEON_VOP_ENV(qrshl_u16, neon_u16, 2) 396 NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t) 397 #undef NEON_FN 398 399 #define NEON_FN(dest, src1, src2) \ 400 (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc)) 401 NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t) 402 #undef NEON_FN 403 404 #define NEON_FN(dest, src1, src2) \ 405 (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc)) 406 NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t) 407 #undef NEON_FN 408 409 uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift) 410 { 411 return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc); 412 } 413 414 uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift) 415 { 416 return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc); 417 } 418 419 #define NEON_FN(dest, src1, src2) \ 420 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc)) 421 NEON_VOP_ENV(qrshl_s8, neon_s8, 4) 422 NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t) 423 #undef NEON_FN 424 425 #define NEON_FN(dest, src1, src2) \ 426 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc)) 427 NEON_VOP_ENV(qrshl_s16, neon_s16, 2) 428 NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t) 429 #undef NEON_FN 430 431 #define NEON_FN(dest, src1, src2) \ 432 (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc)) 
433 NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t) 434 #undef NEON_FN 435 436 #define NEON_FN(dest, src1, src2) \ 437 (dest = do_sqrshl_d(src1, (int8_t)src2, true, env->vfp.qc)) 438 NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t) 439 #undef NEON_FN 440 441 uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift) 442 { 443 return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc); 444 } 445 446 uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift) 447 { 448 return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc); 449 } 450 451 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b) 452 { 453 uint32_t mask; 454 mask = (a ^ b) & 0x80808080u; 455 a &= ~0x80808080u; 456 b &= ~0x80808080u; 457 return (a + b) ^ mask; 458 } 459 460 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b) 461 { 462 uint32_t mask; 463 mask = (a ^ b) & 0x80008000u; 464 a &= ~0x80008000u; 465 b &= ~0x80008000u; 466 return (a + b) ^ mask; 467 } 468 469 #define NEON_FN(dest, src1, src2) dest = src1 - src2 470 NEON_VOP(sub_u8, neon_u8, 4) 471 NEON_VOP(sub_u16, neon_u16, 2) 472 #undef NEON_FN 473 474 #define NEON_FN(dest, src1, src2) dest = src1 * src2 475 NEON_VOP(mul_u8, neon_u8, 4) 476 NEON_VOP(mul_u16, neon_u16, 2) 477 #undef NEON_FN 478 479 #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0 480 NEON_VOP(tst_u8, neon_u8, 4) 481 NEON_VOP(tst_u16, neon_u16, 2) 482 NEON_VOP(tst_u32, neon_u32, 1) 483 #undef NEON_FN 484 485 /* Count Leading Sign/Zero Bits. 
*/ 486 static inline int do_clz8(uint8_t x) 487 { 488 int n; 489 for (n = 8; x; n--) 490 x >>= 1; 491 return n; 492 } 493 494 static inline int do_clz16(uint16_t x) 495 { 496 int n; 497 for (n = 16; x; n--) 498 x >>= 1; 499 return n; 500 } 501 502 #define NEON_FN(dest, src, dummy) dest = do_clz8(src) 503 NEON_VOP1(clz_u8, neon_u8, 4) 504 #undef NEON_FN 505 506 #define NEON_FN(dest, src, dummy) dest = do_clz16(src) 507 NEON_VOP1(clz_u16, neon_u16, 2) 508 #undef NEON_FN 509 510 #define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1 511 NEON_VOP1(cls_s8, neon_s8, 4) 512 #undef NEON_FN 513 514 #define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1 515 NEON_VOP1(cls_s16, neon_s16, 2) 516 #undef NEON_FN 517 518 uint32_t HELPER(neon_cls_s32)(uint32_t x) 519 { 520 int count; 521 if ((int32_t)x < 0) 522 x = ~x; 523 for (count = 32; x; count--) 524 x = x >> 1; 525 return count - 1; 526 } 527 528 /* Bit count. */ 529 uint32_t HELPER(neon_cnt_u8)(uint32_t x) 530 { 531 x = (x & 0x55555555) + ((x >> 1) & 0x55555555); 532 x = (x & 0x33333333) + ((x >> 2) & 0x33333333); 533 x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f); 534 return x; 535 } 536 537 /* Reverse bits in each 8 bit word */ 538 uint32_t HELPER(neon_rbit_u8)(uint32_t x) 539 { 540 x = ((x & 0xf0f0f0f0) >> 4) 541 | ((x & 0x0f0f0f0f) << 4); 542 x = ((x & 0x88888888) >> 3) 543 | ((x & 0x44444444) >> 1) 544 | ((x & 0x22222222) << 1) 545 | ((x & 0x11111111) << 3); 546 return x; 547 } 548 549 #define NEON_QDMULH16(dest, src1, src2, round) do { \ 550 uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \ 551 if ((tmp ^ (tmp << 1)) & SIGNBIT) { \ 552 SET_QC(); \ 553 tmp = (tmp >> 31) ^ ~SIGNBIT; \ 554 } else { \ 555 tmp <<= 1; \ 556 } \ 557 if (round) { \ 558 int32_t old = tmp; \ 559 tmp += 1 << 15; \ 560 if ((int32_t)tmp < old) { \ 561 SET_QC(); \ 562 tmp = SIGNBIT - 1; \ 563 } \ 564 } \ 565 dest = tmp >> 16; \ 566 } while(0) 567 #define NEON_FN(dest, src1, src2) 
NEON_QDMULH16(dest, src1, src2, 0) 568 NEON_VOP_ENV(qdmulh_s16, neon_s16, 2) 569 #undef NEON_FN 570 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1) 571 NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2) 572 #undef NEON_FN 573 #undef NEON_QDMULH16 574 575 #define NEON_QDMULH32(dest, src1, src2, round) do { \ 576 uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \ 577 if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \ 578 SET_QC(); \ 579 tmp = (tmp >> 63) ^ ~SIGNBIT64; \ 580 } else { \ 581 tmp <<= 1; \ 582 } \ 583 if (round) { \ 584 int64_t old = tmp; \ 585 tmp += (int64_t)1 << 31; \ 586 if ((int64_t)tmp < old) { \ 587 SET_QC(); \ 588 tmp = SIGNBIT64 - 1; \ 589 } \ 590 } \ 591 dest = tmp >> 32; \ 592 } while(0) 593 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0) 594 NEON_VOP_ENV(qdmulh_s32, neon_s32, 1) 595 #undef NEON_FN 596 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1) 597 NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1) 598 #undef NEON_FN 599 #undef NEON_QDMULH32 600 601 /* Only the low 32-bits of output are significant. */ 602 uint64_t HELPER(neon_narrow_u8)(uint64_t x) 603 { 604 return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u) 605 | ((x >> 24) & 0xff000000u); 606 } 607 608 /* Only the low 32-bits of output are significant. 
*/ 609 uint64_t HELPER(neon_narrow_u16)(uint64_t x) 610 { 611 return (x & 0xffffu) | ((x >> 16) & 0xffff0000u); 612 } 613 614 uint32_t HELPER(neon_narrow_high_u8)(uint64_t x) 615 { 616 return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) 617 | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); 618 } 619 620 uint32_t HELPER(neon_narrow_high_u16)(uint64_t x) 621 { 622 return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); 623 } 624 625 uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x) 626 { 627 x &= 0xff80ff80ff80ff80ull; 628 x += 0x0080008000800080ull; 629 return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) 630 | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); 631 } 632 633 uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x) 634 { 635 x &= 0xffff8000ffff8000ull; 636 x += 0x0000800000008000ull; 637 return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); 638 } 639 640 /* Only the low 32-bits of output are significant. */ 641 uint64_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x) 642 { 643 uint16_t s; 644 uint8_t d; 645 uint32_t res = 0; 646 #define SAT8(n) \ 647 s = x >> n; \ 648 if (s & 0x8000) { \ 649 SET_QC(); \ 650 } else { \ 651 if (s > 0xff) { \ 652 d = 0xff; \ 653 SET_QC(); \ 654 } else { \ 655 d = s; \ 656 } \ 657 res |= (uint32_t)d << (n / 2); \ 658 } 659 660 SAT8(0); 661 SAT8(16); 662 SAT8(32); 663 SAT8(48); 664 #undef SAT8 665 return res; 666 } 667 668 /* Only the low 32-bits of output are significant. */ 669 uint64_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x) 670 { 671 uint16_t s; 672 uint8_t d; 673 uint32_t res = 0; 674 #define SAT8(n) \ 675 s = x >> n; \ 676 if (s > 0xff) { \ 677 d = 0xff; \ 678 SET_QC(); \ 679 } else { \ 680 d = s; \ 681 } \ 682 res |= (uint32_t)d << (n / 2); 683 684 SAT8(0); 685 SAT8(16); 686 SAT8(32); 687 SAT8(48); 688 #undef SAT8 689 return res; 690 } 691 692 /* Only the low 32-bits of output are significant. 
*/ 693 uint64_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x) 694 { 695 int16_t s; 696 uint8_t d; 697 uint32_t res = 0; 698 #define SAT8(n) \ 699 s = x >> n; \ 700 if (s != (int8_t)s) { \ 701 d = (s >> 15) ^ 0x7f; \ 702 SET_QC(); \ 703 } else { \ 704 d = s; \ 705 } \ 706 res |= (uint32_t)d << (n / 2); 707 708 SAT8(0); 709 SAT8(16); 710 SAT8(32); 711 SAT8(48); 712 #undef SAT8 713 return res; 714 } 715 716 /* Only the low 32-bits of output are significant. */ 717 uint64_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x) 718 { 719 uint32_t high; 720 uint32_t low; 721 low = x; 722 if (low & 0x80000000) { 723 low = 0; 724 SET_QC(); 725 } else if (low > 0xffff) { 726 low = 0xffff; 727 SET_QC(); 728 } 729 high = x >> 32; 730 if (high & 0x80000000) { 731 high = 0; 732 SET_QC(); 733 } else if (high > 0xffff) { 734 high = 0xffff; 735 SET_QC(); 736 } 737 return deposit32(low, 16, 16, high); 738 } 739 740 /* Only the low 32-bits of output are significant. */ 741 uint64_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x) 742 { 743 uint32_t high; 744 uint32_t low; 745 low = x; 746 if (low > 0xffff) { 747 low = 0xffff; 748 SET_QC(); 749 } 750 high = x >> 32; 751 if (high > 0xffff) { 752 high = 0xffff; 753 SET_QC(); 754 } 755 return deposit32(low, 16, 16, high); 756 } 757 758 /* Only the low 32-bits of output are significant. */ 759 uint64_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x) 760 { 761 int32_t low; 762 int32_t high; 763 low = x; 764 if (low != (int16_t)low) { 765 low = (low >> 31) ^ 0x7fff; 766 SET_QC(); 767 } 768 high = x >> 32; 769 if (high != (int16_t)high) { 770 high = (high >> 31) ^ 0x7fff; 771 SET_QC(); 772 } 773 return deposit32(low, 16, 16, high); 774 } 775 776 /* Only the low 32-bits of output are significant. 
*/ 777 uint64_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x) 778 { 779 if (x & 0x8000000000000000ull) { 780 SET_QC(); 781 return 0; 782 } 783 if (x > 0xffffffffu) { 784 SET_QC(); 785 return 0xffffffffu; 786 } 787 return x; 788 } 789 790 /* Only the low 32-bits of output are significant. */ 791 uint64_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x) 792 { 793 if (x > 0xffffffffu) { 794 SET_QC(); 795 return 0xffffffffu; 796 } 797 return x; 798 } 799 800 /* Only the low 32-bits of output are significant. */ 801 uint64_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x) 802 { 803 if ((int64_t)x != (int32_t)x) { 804 SET_QC(); 805 return (uint32_t)((int64_t)x >> 63) ^ 0x7fffffff; 806 } 807 return (uint32_t)x; 808 } 809 810 uint64_t HELPER(neon_widen_u8)(uint32_t x) 811 { 812 uint64_t tmp; 813 uint64_t ret; 814 ret = (uint8_t)x; 815 tmp = (uint8_t)(x >> 8); 816 ret |= tmp << 16; 817 tmp = (uint8_t)(x >> 16); 818 ret |= tmp << 32; 819 tmp = (uint8_t)(x >> 24); 820 ret |= tmp << 48; 821 return ret; 822 } 823 824 uint64_t HELPER(neon_widen_s8)(uint32_t x) 825 { 826 uint64_t tmp; 827 uint64_t ret; 828 ret = (uint16_t)(int8_t)x; 829 tmp = (uint16_t)(int8_t)(x >> 8); 830 ret |= tmp << 16; 831 tmp = (uint16_t)(int8_t)(x >> 16); 832 ret |= tmp << 32; 833 tmp = (uint16_t)(int8_t)(x >> 24); 834 ret |= tmp << 48; 835 return ret; 836 } 837 838 uint64_t HELPER(neon_widen_u16)(uint32_t x) 839 { 840 uint64_t high = (uint16_t)(x >> 16); 841 return ((uint16_t)x) | (high << 32); 842 } 843 844 uint64_t HELPER(neon_widen_s16)(uint32_t x) 845 { 846 uint64_t high = (int16_t)(x >> 16); 847 return ((uint32_t)(int16_t)x) | (high << 32); 848 } 849 850 uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b) 851 { 852 uint64_t mask; 853 mask = (a ^ b) & 0x8000800080008000ull; 854 a &= ~0x8000800080008000ull; 855 b &= ~0x8000800080008000ull; 856 return (a + b) ^ mask; 857 } 858 859 uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b) 860 { 861 uint64_t mask; 862 mask = 
(a ^ b) & 0x8000000080000000ull; 863 a &= ~0x8000000080000000ull; 864 b &= ~0x8000000080000000ull; 865 return (a + b) ^ mask; 866 } 867 868 uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b) 869 { 870 uint64_t tmp; 871 uint64_t tmp2; 872 873 tmp = a & 0x0000ffff0000ffffull; 874 tmp += (a >> 16) & 0x0000ffff0000ffffull; 875 tmp2 = b & 0xffff0000ffff0000ull; 876 tmp2 += (b << 16) & 0xffff0000ffff0000ull; 877 return ( tmp & 0xffff) 878 | ((tmp >> 16) & 0xffff0000ull) 879 | ((tmp2 << 16) & 0xffff00000000ull) 880 | ( tmp2 & 0xffff000000000000ull); 881 } 882 883 uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b) 884 { 885 uint32_t low = a + (a >> 32); 886 uint32_t high = b + (b >> 32); 887 return low + ((uint64_t)high << 32); 888 } 889 890 uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b) 891 { 892 uint64_t mask; 893 mask = (a ^ ~b) & 0x8000800080008000ull; 894 a |= 0x8000800080008000ull; 895 b &= ~0x8000800080008000ull; 896 return (a - b) ^ mask; 897 } 898 899 uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b) 900 { 901 uint64_t mask; 902 mask = (a ^ ~b) & 0x8000000080000000ull; 903 a |= 0x8000000080000000ull; 904 b &= ~0x8000000080000000ull; 905 return (a - b) ^ mask; 906 } 907 908 uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b) 909 { 910 uint32_t x, y; 911 uint32_t low, high; 912 913 x = a; 914 y = b; 915 low = x + y; 916 if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { 917 SET_QC(); 918 low = ((int32_t)x >> 31) ^ ~SIGNBIT; 919 } 920 x = a >> 32; 921 y = b >> 32; 922 high = x + y; 923 if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { 924 SET_QC(); 925 high = ((int32_t)x >> 31) ^ ~SIGNBIT; 926 } 927 return low | ((uint64_t)high << 32); 928 } 929 930 uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b) 931 { 932 uint64_t result; 933 934 result = a + b; 935 if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) { 936 SET_QC(); 937 result = ((int64_t)a >> 63) ^ ~SIGNBIT64; 938 } 
939 return result; 940 } 941 942 /* We have to do the arithmetic in a larger type than 943 * the input type, because for example with a signed 32 bit 944 * op the absolute difference can overflow a signed 32 bit value. 945 */ 946 #define DO_ABD(dest, x, y, intype, arithtype) do { \ 947 arithtype tmp_x = (intype)(x); \ 948 arithtype tmp_y = (intype)(y); \ 949 dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \ 950 } while(0) 951 952 uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b) 953 { 954 uint64_t tmp; 955 uint64_t result; 956 DO_ABD(result, a, b, uint8_t, uint32_t); 957 DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t); 958 result |= tmp << 16; 959 DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t); 960 result |= tmp << 32; 961 DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t); 962 result |= tmp << 48; 963 return result; 964 } 965 966 uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b) 967 { 968 uint64_t tmp; 969 uint64_t result; 970 DO_ABD(result, a, b, int8_t, int32_t); 971 DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t); 972 result |= tmp << 16; 973 DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t); 974 result |= tmp << 32; 975 DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t); 976 result |= tmp << 48; 977 return result; 978 } 979 980 uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b) 981 { 982 uint64_t tmp; 983 uint64_t result; 984 DO_ABD(result, a, b, uint16_t, uint32_t); 985 DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t); 986 return result | (tmp << 32); 987 } 988 989 uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b) 990 { 991 uint64_t tmp; 992 uint64_t result; 993 DO_ABD(result, a, b, int16_t, int32_t); 994 DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t); 995 return result | (tmp << 32); 996 } 997 998 uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b) 999 { 1000 uint64_t result; 1001 DO_ABD(result, a, b, uint32_t, uint64_t); 1002 return result; 1003 } 1004 1005 uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b) 1006 { 1007 uint64_t 
result; 1008 DO_ABD(result, a, b, int32_t, int64_t); 1009 return result; 1010 } 1011 #undef DO_ABD 1012 1013 /* Widening multiply. Named type is the source type. */ 1014 #define DO_MULL(dest, x, y, type1, type2) do { \ 1015 type1 tmp_x = x; \ 1016 type1 tmp_y = y; \ 1017 dest = (type2)((type2)tmp_x * (type2)tmp_y); \ 1018 } while(0) 1019 1020 uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b) 1021 { 1022 uint64_t tmp; 1023 uint64_t result; 1024 1025 DO_MULL(result, a, b, uint8_t, uint16_t); 1026 DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t); 1027 result |= tmp << 16; 1028 DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t); 1029 result |= tmp << 32; 1030 DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t); 1031 result |= tmp << 48; 1032 return result; 1033 } 1034 1035 uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b) 1036 { 1037 uint64_t tmp; 1038 uint64_t result; 1039 1040 DO_MULL(result, a, b, int8_t, uint16_t); 1041 DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t); 1042 result |= tmp << 16; 1043 DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t); 1044 result |= tmp << 32; 1045 DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t); 1046 result |= tmp << 48; 1047 return result; 1048 } 1049 1050 uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b) 1051 { 1052 uint64_t tmp; 1053 uint64_t result; 1054 1055 DO_MULL(result, a, b, uint16_t, uint32_t); 1056 DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t); 1057 return result | (tmp << 32); 1058 } 1059 1060 uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b) 1061 { 1062 uint64_t tmp; 1063 uint64_t result; 1064 1065 DO_MULL(result, a, b, int16_t, uint32_t); 1066 DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t); 1067 return result | (tmp << 32); 1068 } 1069 1070 uint64_t HELPER(neon_negl_u16)(uint64_t x) 1071 { 1072 uint16_t tmp; 1073 uint64_t result; 1074 result = (uint16_t)-x; 1075 tmp = -(x >> 16); 1076 result |= (uint64_t)tmp << 16; 1077 tmp = -(x >> 32); 1078 result |= (uint64_t)tmp << 32; 1079 tmp = -(x >> 48); 
1080 result |= (uint64_t)tmp << 48; 1081 return result; 1082 } 1083 1084 uint64_t HELPER(neon_negl_u32)(uint64_t x) 1085 { 1086 uint32_t low = -x; 1087 uint32_t high = -(x >> 32); 1088 return low | ((uint64_t)high << 32); 1089 } 1090 1091 /* Saturating sign manipulation. */ 1092 /* ??? Make these use NEON_VOP1 */ 1093 #define DO_QABS8(x) do { \ 1094 if (x == (int8_t)0x80) { \ 1095 x = 0x7f; \ 1096 SET_QC(); \ 1097 } else if (x < 0) { \ 1098 x = -x; \ 1099 }} while (0) 1100 uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x) 1101 { 1102 neon_s8 vec; 1103 NEON_UNPACK(neon_s8, vec, x); 1104 DO_QABS8(vec.v1); 1105 DO_QABS8(vec.v2); 1106 DO_QABS8(vec.v3); 1107 DO_QABS8(vec.v4); 1108 NEON_PACK(neon_s8, x, vec); 1109 return x; 1110 } 1111 #undef DO_QABS8 1112 1113 #define DO_QNEG8(x) do { \ 1114 if (x == (int8_t)0x80) { \ 1115 x = 0x7f; \ 1116 SET_QC(); \ 1117 } else { \ 1118 x = -x; \ 1119 }} while (0) 1120 uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x) 1121 { 1122 neon_s8 vec; 1123 NEON_UNPACK(neon_s8, vec, x); 1124 DO_QNEG8(vec.v1); 1125 DO_QNEG8(vec.v2); 1126 DO_QNEG8(vec.v3); 1127 DO_QNEG8(vec.v4); 1128 NEON_PACK(neon_s8, x, vec); 1129 return x; 1130 } 1131 #undef DO_QNEG8 1132 1133 #define DO_QABS16(x) do { \ 1134 if (x == (int16_t)0x8000) { \ 1135 x = 0x7fff; \ 1136 SET_QC(); \ 1137 } else if (x < 0) { \ 1138 x = -x; \ 1139 }} while (0) 1140 uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x) 1141 { 1142 neon_s16 vec; 1143 NEON_UNPACK(neon_s16, vec, x); 1144 DO_QABS16(vec.v1); 1145 DO_QABS16(vec.v2); 1146 NEON_PACK(neon_s16, x, vec); 1147 return x; 1148 } 1149 #undef DO_QABS16 1150 1151 #define DO_QNEG16(x) do { \ 1152 if (x == (int16_t)0x8000) { \ 1153 x = 0x7fff; \ 1154 SET_QC(); \ 1155 } else { \ 1156 x = -x; \ 1157 }} while (0) 1158 uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x) 1159 { 1160 neon_s16 vec; 1161 NEON_UNPACK(neon_s16, vec, x); 1162 DO_QNEG16(vec.v1); 1163 DO_QNEG16(vec.v2); 1164 NEON_PACK(neon_s16, x, 
vec); 1165 return x; 1166 } 1167 #undef DO_QNEG16 1168 1169 uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x) 1170 { 1171 if (x == SIGNBIT) { 1172 SET_QC(); 1173 x = ~SIGNBIT; 1174 } else if ((int32_t)x < 0) { 1175 x = -x; 1176 } 1177 return x; 1178 } 1179 1180 uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x) 1181 { 1182 if (x == SIGNBIT) { 1183 SET_QC(); 1184 x = ~SIGNBIT; 1185 } else { 1186 x = -x; 1187 } 1188 return x; 1189 } 1190 1191 uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x) 1192 { 1193 if (x == SIGNBIT64) { 1194 SET_QC(); 1195 x = ~SIGNBIT64; 1196 } else if ((int64_t)x < 0) { 1197 x = -x; 1198 } 1199 return x; 1200 } 1201 1202 uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x) 1203 { 1204 if (x == SIGNBIT64) { 1205 SET_QC(); 1206 x = ~SIGNBIT64; 1207 } else { 1208 x = -x; 1209 } 1210 return x; 1211 } 1212 1213 /* NEON Float helpers. */ 1214 1215 /* Floating point comparisons produce an integer result. 1216 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. 1217 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 
1218 */ 1219 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp) 1220 { 1221 float_status *fpst = fpstp; 1222 return -float32_eq_quiet(make_float32(a), make_float32(b), fpst); 1223 } 1224 1225 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp) 1226 { 1227 float_status *fpst = fpstp; 1228 return -float32_le(make_float32(b), make_float32(a), fpst); 1229 } 1230 1231 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp) 1232 { 1233 float_status *fpst = fpstp; 1234 return -float32_lt(make_float32(b), make_float32(a), fpst); 1235 } 1236 1237 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp) 1238 { 1239 float_status *fpst = fpstp; 1240 float32 f0 = float32_abs(make_float32(a)); 1241 float32 f1 = float32_abs(make_float32(b)); 1242 return -float32_le(f1, f0, fpst); 1243 } 1244 1245 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp) 1246 { 1247 float_status *fpst = fpstp; 1248 float32 f0 = float32_abs(make_float32(a)); 1249 float32 f1 = float32_abs(make_float32(b)); 1250 return -float32_lt(f1, f0, fpst); 1251 } 1252 1253 uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp) 1254 { 1255 float_status *fpst = fpstp; 1256 float64 f0 = float64_abs(make_float64(a)); 1257 float64 f1 = float64_abs(make_float64(b)); 1258 return -float64_le(f1, f0, fpst); 1259 } 1260 1261 uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp) 1262 { 1263 float_status *fpst = fpstp; 1264 float64 f0 = float64_abs(make_float64(a)); 1265 float64 f1 = float64_abs(make_float64(b)); 1266 return -float64_lt(f1, f0, fpst); 1267 } 1268 1269 #define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1)) 1270 1271 void HELPER(neon_qunzip8)(void *vd, void *vm) 1272 { 1273 uint64_t *rd = vd, *rm = vm; 1274 uint64_t zd0 = rd[0], zd1 = rd[1]; 1275 uint64_t zm0 = rm[0], zm1 = rm[1]; 1276 1277 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8) 1278 | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 
24) 1279 | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40) 1280 | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56); 1281 uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8) 1282 | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24) 1283 | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40) 1284 | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56); 1285 uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8) 1286 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24) 1287 | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40) 1288 | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56); 1289 uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8) 1290 | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24) 1291 | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40) 1292 | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56); 1293 1294 rm[0] = m0; 1295 rm[1] = m1; 1296 rd[0] = d0; 1297 rd[1] = d1; 1298 } 1299 1300 void HELPER(neon_qunzip16)(void *vd, void *vm) 1301 { 1302 uint64_t *rd = vd, *rm = vm; 1303 uint64_t zd0 = rd[0], zd1 = rd[1]; 1304 uint64_t zm0 = rm[0], zm1 = rm[1]; 1305 1306 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16) 1307 | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48); 1308 uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16) 1309 | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48); 1310 uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16) 1311 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48); 1312 uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16) 1313 | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48); 1314 1315 rm[0] = m0; 1316 rm[1] = m1; 1317 rd[0] = d0; 1318 rd[1] = d1; 1319 } 1320 1321 void HELPER(neon_qunzip32)(void *vd, void *vm) 1322 { 1323 uint64_t *rd = vd, *rm = vm; 1324 uint64_t zd0 = rd[0], zd1 = rd[1]; 1325 uint64_t zm0 = rm[0], zm1 = rm[1]; 1326 1327 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32); 1328 uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1329 uint64_t m0 = ELEM(zd0, 1, 32) | 
(ELEM(zd1, 1, 32) << 32); 1330 uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1331 1332 rm[0] = m0; 1333 rm[1] = m1; 1334 rd[0] = d0; 1335 rd[1] = d1; 1336 } 1337 1338 void HELPER(neon_unzip8)(void *vd, void *vm) 1339 { 1340 uint64_t *rd = vd, *rm = vm; 1341 uint64_t zd = rd[0], zm = rm[0]; 1342 1343 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8) 1344 | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24) 1345 | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1346 | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56); 1347 uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8) 1348 | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24) 1349 | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40) 1350 | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1351 1352 rm[0] = m0; 1353 rd[0] = d0; 1354 } 1355 1356 void HELPER(neon_unzip16)(void *vd, void *vm) 1357 { 1358 uint64_t *rd = vd, *rm = vm; 1359 uint64_t zd = rd[0], zm = rm[0]; 1360 1361 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16) 1362 | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48); 1363 uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16) 1364 | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1365 1366 rm[0] = m0; 1367 rd[0] = d0; 1368 } 1369 1370 void HELPER(neon_qzip8)(void *vd, void *vm) 1371 { 1372 uint64_t *rd = vd, *rm = vm; 1373 uint64_t zd0 = rd[0], zd1 = rd[1]; 1374 uint64_t zm0 = rm[0], zm1 = rm[1]; 1375 1376 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8) 1377 | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24) 1378 | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40) 1379 | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56); 1380 uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8) 1381 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24) 1382 | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40) 1383 | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56); 1384 uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8) 1385 | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24) 1386 | 
(ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40) 1387 | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56); 1388 uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8) 1389 | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24) 1390 | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40) 1391 | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56); 1392 1393 rm[0] = m0; 1394 rm[1] = m1; 1395 rd[0] = d0; 1396 rd[1] = d1; 1397 } 1398 1399 void HELPER(neon_qzip16)(void *vd, void *vm) 1400 { 1401 uint64_t *rd = vd, *rm = vm; 1402 uint64_t zd0 = rd[0], zd1 = rd[1]; 1403 uint64_t zm0 = rm[0], zm1 = rm[1]; 1404 1405 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16) 1406 | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48); 1407 uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16) 1408 | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48); 1409 uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16) 1410 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48); 1411 uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16) 1412 | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48); 1413 1414 rm[0] = m0; 1415 rm[1] = m1; 1416 rd[0] = d0; 1417 rd[1] = d1; 1418 } 1419 1420 void HELPER(neon_qzip32)(void *vd, void *vm) 1421 { 1422 uint64_t *rd = vd, *rm = vm; 1423 uint64_t zd0 = rd[0], zd1 = rd[1]; 1424 uint64_t zm0 = rm[0], zm1 = rm[1]; 1425 1426 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32); 1427 uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32); 1428 uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1429 uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1430 1431 rm[0] = m0; 1432 rm[1] = m1; 1433 rd[0] = d0; 1434 rd[1] = d1; 1435 } 1436 1437 void HELPER(neon_zip8)(void *vd, void *vm) 1438 { 1439 uint64_t *rd = vd, *rm = vm; 1440 uint64_t zd = rd[0], zm = rm[0]; 1441 1442 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8) 1443 | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24) 1444 | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1445 
| (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56); 1446 uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8) 1447 | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24) 1448 | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40) 1449 | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1450 1451 rm[0] = m0; 1452 rd[0] = d0; 1453 } 1454 1455 void HELPER(neon_zip16)(void *vd, void *vm) 1456 { 1457 uint64_t *rd = vd, *rm = vm; 1458 uint64_t zd = rd[0], zm = rm[0]; 1459 1460 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16) 1461 | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48); 1462 uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16) 1463 | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1464 1465 rm[0] = m0; 1466 rd[0] = d0; 1467 } 1468