Lines Matching +full:width +full:- +full:mm
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
132 * Similarly for half-word elements.
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
168 ret = -ret;
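The b/h/s/d variants all implement the same arithmetic at different widths: double the product, add the accumulator scaled by 2^esize plus an optional rounding constant, shift back down by esize, and saturate. A minimal standalone sketch of the 8-bit case (names and the saturation out-parameter are illustrative, not this file's interface):

    static int8_t sqrdmlah8(int8_t n, int8_t m, int8_t a, bool round, bool *sat)
    {
        /* (a * 2^8 + 2 * n * m + rounding) >> 8, in 32-bit arithmetic */
        int32_t ret = 2 * (int32_t)n * m + ((int32_t)a << 8);
        if (round) {
            ret += 1 << 7;
        }
        ret >>= 8;
        if (ret != (int8_t)ret) {        /* saturate on 8-bit overflow */
            *sat = true;
            ret = ret < 0 ? INT8_MIN : INT8_MAX;
        }
        return ret;
    }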
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
228 ret = -ret;
243 uint32_t *sat = &env->vfp.qc[0];
268 uint32_t *sat = &env->vfp.qc[0];
324 int16_t mm = m[i];
326 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
342 int16_t mm = m[i];
344 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
360 int16_t mm = m[i];
362 d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
378 int16_t mm = m[i];
380 d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
440 int16_t mm = m[i];
442 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
455 int16_t mm = m[i];
457 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
469 ret = -ret;
484 uint32_t *sat = &env->vfp.qc[0];
506 uint32_t *sat = &env->vfp.qc[0];
559 int32_t mm = m[i];
561 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
577 int32_t mm = m[i];
579 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
595 int32_t mm = m[i];
597 d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
613 int32_t mm = m[i];
615 d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
675 int32_t mm = m[i];
677 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
690 int32_t mm = m[i];
692 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
714 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
783 int64_t mm = m[i];
785 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
797 int64_t mm = m[i];
799 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
804 /* Integer 8 and 16-bit dot-product.
807 * with respect to the ordering of data within the quad-width lanes.
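Each 32-bit destination lane accumulates the dot product of four adjacent 8-bit elements (the 16-bit form does the same into 64-bit lanes). That is why ordering within the quad-width lane is irrelevant: the four products are summed, and addition commutes. A minimal model of the signed 8-bit form (standalone sketch, not the file's actual macro):

    void sdot_b(int32_t *d, const int8_t *n, const int8_t *m,
                const int32_t *a, size_t opr_sz)
    {
        for (size_t i = 0; i < opr_sz / 4; i++) {
            d[i] = a[i]
                 + n[4 * i + 0] * m[4 * i + 0]
                 + n[4 * i + 1] * m[4 * i + 1]
                 + n[4 * i + 2] * m[4 * i + 2]
                 + n[4 * i + 3] * m[4 * i + 3];
        }
    }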
878 /* Similar for 2-way dot product */
1035 float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag;
1103 float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag;
1171 float64 e3 = m[i + 1 - flip] ^ negx_imag;
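The `^ negx_imag` idiom is conditional negation by sign-bit XOR: negx_imag is either zero or a mask with only the sign bit set, so the XOR flips the sign of the operand for the FCMLA rotations that need it, without going through the FP unit or raising exceptions. For example, at 32 bits:

    /* mask is either 0 (keep) or 1u << 31 (negate); float32 is a raw
       bit pattern here, so this never signals. */
    static inline float32 f32_cond_neg(float32 x, uint32_t mask)
    {
        return x ^ mask;
    }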
1182 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1186 return -float16_eq_quiet(op1, op2, stat);
1191 return -float32_eq_quiet(op1, op2, stat);
1196 return -float64_eq_quiet(op1, op2, stat);
1201 return -float16_le(op2, op1, stat);
1206 return -float32_le(op2, op1, stat);
1211 return -float64_le(op2, op1, stat);
1216 return -float16_lt(op2, op1, stat);
1221 return -float32_lt(op2, op1, stat);
1226 return -float64_lt(op2, op1, stat);
1231 return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1236 return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1241 return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1246 return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1251 return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1256 return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1339 /* Floating-point trigonometric starting value.
1405 * non-fused multiply-and-subtract.
1431 /* Reciprocal square-root step. AArch32 non-fused semantics. */
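FRECPS computes 2.0 - n * m and FRSQRTS computes (3.0 - n * m) / 2; under the AArch32 non-fused semantics each operation rounds separately rather than as one fused step. A sketch of the reciprocal step under those semantics (the real helpers first special-case the infinity-times-zero combinations):

    float32 recps_nf(float32 n, float32 m, float_status *stat)
    {
        /* two separately-rounded operations: multiply, then subtract */
        return float32_sub(float32_two, float32_mul(n, m, stat), stat);
    }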
1584 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
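The non-fused variant is just an ordinary multiply followed by an ordinary add, each rounding its own result; something along the lines of:

    static float16 float16_muladd_nf(float16 d, float16 n, float16 m,
                                     float_status *stat)
    {
        return float16_add(d, float16_mul(n, m, stat), stat);
    }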
1717 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1729 TYPE mm = m[H(i + idx)]; \
1731 d[i + j] = n[i + j] * mm; \
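The loop shape follows from the segment rule: `i` advances one 128-bit segment at a time, `idx` picks the multiplicand once per segment, and `j` walks the elements inside it. Expanded for 32-bit elements (standalone sketch, without the H() byte-order macros):

    void mul_idx_s(uint32_t *d, const uint32_t *n, const uint32_t *m,
                   unsigned idx, size_t opr_sz)
    {
        size_t segment = 16 / sizeof(uint32_t);   /* 4 elements per segment */
        for (size_t i = 0; i < opr_sz / 4; i += segment) {
            uint32_t mm = m[i + idx];             /* one pick per segment */
            for (size_t j = 0; j < segment; j++) {
                d[i + j] = n[i + j] * mm;
            }
        }
    }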
1751 TYPE mm = m[H(i + idx)]; \
1753 d[i + j] = a[i + j] OP n[i + j] * mm; \
1763 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1764 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1765 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1778 TYPE mm = m[H(i + idx)]; \
1780 d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \
1804 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1823 TYPE mm = m[H(i + idx)]; \
1825 d[i + j] = TYPE##_muladd(n[i + j] ^ NEGX, mm, \
1881 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1882 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1883 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1885 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1886 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1887 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
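DO_SAT widens to its first type argument, applies the operator, and clamps to [MIN, MAX]; the widening makes the raw arithmetic exact, so only the clamp can lose information. What the gvec_uqsub_b instantiation boils down to for one element (sketch; the real macro also records any clamping in the QC saturation flag):

    static uint8_t uqsub8(uint8_t n, uint8_t m, bool *sat)
    {
        int dd = (int)n - (int)m;   /* exact in the wider type */
        if (dd < 0) {               /* clamp to the 0..UINT8_MAX range */
            *sat = true;
            dd = 0;
        }
        return dd;
    }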
1907 uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1929 uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1930 if (nn < mm) {
1951 int64_t nn = n[i], mm = m[i], dd = nn + mm;
1952 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1973 int64_t nn = n[i], mm = m[i], dd = nn - mm;
1974 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
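Both conditions are the classic sign-bit overflow tests: signed addition overflows iff the operands share a sign and the result's sign differs, so `(dd ^ nn) & ~(nn ^ mm)` goes negative; subtraction overflows iff the operands differ in sign and the result's sign differs from the minuend, so `(dd ^ nn) & (nn ^ mm)` goes negative. When a test fires, the saturated value depends only on the sign of nn, which permits a branchless fixup:

    /* nn >= 0 saturates to INT64_MAX, nn < 0 to INT64_MIN (assumes the
       usual arithmetic right shift on signed values). */
    static int64_t sat64_from_sign(int64_t nn)
    {
        return (nn >> 63) ^ INT64_MAX;
    }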
1996 int64_t mm = m[i];
1997 uint64_t dd = nn + mm;
1999 if (mm < 0) {
2000 if (nn < (uint64_t)-mm) {
2028 uint64_t mm = m[i];
2029 int64_t dd = nn + mm;
2031 if (mm > (uint64_t)(INT64_MAX - nn)) {
2075 TYPE tmp = n[i] >> (shift - 1); \
2100 TYPE tmp = n[i] >> (shift - 1); \
2125 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
2144 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
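SRI and SLI replace only part of each destination element, which deposit64(dst, pos, len, val) expresses directly: insert the low `len` bits of `val` into `dst` at bit position `pos`, keeping the rest of `dst`. Spelled out for an 8-bit SRI (SLI is the mirror image, inserting `n << shift` at bit `shift`):

    uint8_t sri8(uint8_t d, uint8_t n, int shift)   /* 1 <= shift <= 8 */
    {
        uint8_t mask = (1u << (8 - shift)) - 1;
        return (d & ~mask) | ((n >> shift) & mask); /* top bits of d kept */
    }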
2186 int shift = clz32(frac) - 21;
2188 exp = f32_bias - f16_bias - shift + 1;
2193 exp += f32_bias - f16_bias;
2197 frac <<= 23 - 10;
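These fragments are the core of a by-bits half-to-single conversion: rebase the exponent from bias 15 to bias 127, widen the fraction from 10 to 23 bits, and for denormal inputs first normalize the fraction with a count-leading-zeros. A condensed model for a positive, finite, non-zero input (the real routine also handles zero, infinities, NaNs, the sign bit, and FZ16 flushing):

    uint32_t f16_to_f32_bits(uint32_t f16)
    {
        const int f16_bias = 15, f32_bias = 127;
        int exp = (f16 >> 10) & 0x1f;
        uint32_t frac = f16 & 0x3ff;

        if (exp == 0) {
            /* denormal: normalize; frac has at most 10 significant bits,
               so the leading one lands at bit 10 after the shift */
            int shift = __builtin_clz(frac) - 21;
            frac = (frac << shift) & 0x3ff;
            exp = f32_bias - f16_bias - shift + 1;
        } else {
            exp += f32_bias - f16_bias;
        }
        return (uint32_t)exp << 23 | frac << (23 - 10);
    }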
2223 float_status *fpst = &env->vfp.fp_status[fpst_idx];
2224 bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2231 * Pre-load all of the f16 data, avoiding overlap issues.
2262 if (env->vfp.fpcr & FPCR_AH) {
2278 float_status *status = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64];
2279 bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2283 if (env->vfp.fpcr & FPCR_AH) {
2294 float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2297 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, negf, status);
2306 float_status *fpst = &env->vfp.fp_status[fpst_idx];
2307 bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2316 * Pre-load all of the f16 data, avoiding overlap issues.
2346 if (env->vfp.fpcr & FPCR_AH) {
2363 float_status *status = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64];
2364 bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2368 if (env->vfp.fpcr & FPCR_AH) {
2376 float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2384 float32_muladd(nn, mm, aa, negf, status);
2395 int8_t mm = m[i];
2398 if (mm >= 0) {
2399 if (mm < 8) {
2400 res = nn << mm;
2403 res = nn >> (mm > -8 ? -mm : 7);
2416 int8_t mm = m[i]; /* only 8 bits of shift are significant */
2419 if (mm >= 0) {
2420 if (mm < 16) {
2421 res = nn << mm;
2424 res = nn >> (mm > -16 ? -mm : 15);
2437 int8_t mm = m[i];
2440 if (mm >= 0) {
2441 if (mm < 8) {
2442 res = nn << mm;
2445 if (mm > -8) {
2446 res = nn >> -mm;
2460 int8_t mm = m[i]; /* only 8 bits of shift are significant */
2463 if (mm >= 0) {
2464 if (mm < 16) {
2465 res = nn << mm;
2468 if (mm > -16) {
2469 res = nn >> -mm;
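All four helpers implement the same SSHL/USHL contract: the shift count is the signed low byte of the m element, positive counts shift left, negative counts shift right, and out-of-range counts do not wrap. For signed elements a right shift of esize or more clamps to esize - 1, so the result is the sign fill; for unsigned elements it simply becomes zero. One signed byte lane, spelled out:

    int8_t sshl8(int8_t nn, int8_t mm)
    {
        if (mm >= 8) {
            return 0;                          /* shifted out entirely */
        } else if (mm >= 0) {
            return nn << mm;
        } else {
            return nn >> (mm > -8 ? -mm : 7);  /* clamp keeps sign fill */
        }
    }

For example, nn = -1 with mm = -9 yields -1 >> 7 = -1, where the unsigned variant would produce 0.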
2478 * 8x8->8 polynomial multiply.
2498 * 64x64->128 polynomial multiply.
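A polynomial (carry-less) multiply XORs shifted copies of one operand instead of adding them, i.e. multiplication in GF(2)[x]. A reference 8x8 -> 16 bit version; the 8x8->8 form keeps only the low byte of this, and the 64x64->128 form is the same loop at width 64:

    uint16_t clmul8(uint8_t n, uint8_t m)
    {
        uint16_t result = 0;
        for (int i = 0; i < 8; i++) {
            if (n & (1u << i)) {
                result ^= (uint16_t)m << i;   /* XOR instead of add */
            }
        }
        return result;
    }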
2520 uint64_t nn = n[hi], mm = m[hi];
2522 d[0] = clmul_8x4_packed(nn, mm);
2524 mm >>= 32;
2525 d[1] = clmul_8x4_packed(nn, mm);
2561 *(TYPE *)(vd + i) = -(nn OP 0); \
2587 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2611 d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2829 * which is a series of 128-bit vectors concatenated)
2845 * NxN -> N highpart multiply
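The highpart helpers return the upper half of the double-width product. For elements up to 32 bits a wider native type suffices; the 64-bit case needs a 64x64->128 primitive (QEMU's mulu64/muls64). The 32-bit unsigned case:

    uint32_t umulh32(uint32_t n, uint32_t m)
    {
        return (uint32_t)(((uint64_t)n * m) >> 32);
    }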
2953 * Integer matrix-multiply accumulate
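SMMLA/UMMLA/USMMLA treat every 128-bit segment as a 2x2 matrix of int32 accumulators, updated with the product of a 2x8 byte matrix from n and the transpose of a 2x8 byte matrix from m, so each output lane absorbs eight byte products. One segment of the signed form, as a sketch:

    void smmla_segment(int32_t d[4], const int8_t n[16], const int8_t m[16])
    {
        for (int row = 0; row < 2; row++) {
            for (int col = 0; col < 2; col++) {
                int32_t sum = d[row * 2 + col];
                for (int k = 0; k < 8; k++) {
                    sum += n[row * 8 + k] * m[col * 8 + k];
                }
                d[row * 2 + col] = sum;
            }
        }
    }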
3038 * mode and denormal-flushing, and we do unfused multiplies and
3040 * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
3041 * and we perform a fused two-way sum-of-products without intermediate
3048 bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
3050 *statusp = env->vfp.fp_status[is_a64(env) ? FPST_A64 : FPST_A32];
3054 /* EBF=1 needs to do a step with round-to-odd semantics */
3130 * by performing the first multiply in round-to-odd, then doing
3131 * the second multiply as fused multiply-add, and rounding to
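The round-to-odd step avoids a double rounding: rounding the first product to odd preserves the sticky information a later rounding needs, then the second product is folded in with a fused multiply-add and only that final operation rounds to the FPCR mode. The shape of the computation, as a sketch with illustrative names (`odd` being a float_status copy with float_round_to_odd set):

    float32 bf16_dot2(float32 sum, uint16_t e1a, uint16_t e1b,
                      uint16_t e2a, uint16_t e2b,
                      float_status *std, float_status *odd)
    {
        /* bf16 -> f32 widening is just a 16-bit left shift of the bits */
        float32 n1 = (uint32_t)e1a << 16, m1 = (uint32_t)e2a << 16;
        float32 n2 = (uint32_t)e1b << 16, m2 = (uint32_t)e2b << 16;

        float32 t = float32_mul(n1, m1, odd);    /* round-to-odd */
        t = float32_muladd(n2, m2, t, 0, std);   /* fused, final rounding */
        return float32_add(sum, t, std);
    }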
3319 float32 mm = m[H2(i * 2 + sel)] << 16;
3320 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], negf, stat);
3388 TYPE mm = *(TYPE *)(m + i); \
3389 TYPE dd = MIN(MAX(aa, nn), mm); \
3405 /* Bit count in each 8-bit word. */
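The per-byte bit count is the usual SWAR popcount: sum adjacent bit pairs, then nibbles, masking so every byte accumulates independently:

    uint64_t pcnt_bytes(uint64_t x)
    {
        x = x - ((x >> 1) & 0x5555555555555555ull);   /* 2-bit sums */
        x = (x & 0x3333333333333333ull)
          + ((x >> 2) & 0x3333333333333333ull);       /* 4-bit sums */
        x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0full;   /* one count per byte */
        return x;
    }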
3509 unsigned segment = idx & (segments - 1); \
3512 do_lut_##SUFF(zd, indexes.d, (void *)env->za_state.zt0, elements, \