Lines Matching +full:library +full:- +full:sel
6 * This library is free software; you can redistribute it and/or
11 * This library is distributed in the hope that it will be useful,
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
132 * Similarly for half-word elements.
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
168 ret = -ret; in do_sqrdmlah_b()
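The fragments above come from the 8-bit SQRDMLAH helper. As a rough illustration of the arithmetic (a sketch under assumed semantics, not the QEMU source itself): widen to 32 bits, optionally negate the product for the subtracting SQRDMLSH form, fold in the accumulator and a rounding bit at the same fixed-point scale, shift back down, and saturate.

    #include <stdint.h>
    #include <stdbool.h>

    /* Hypothetical 8-bit SQRDMLAH/SQRDMLSH lane:
     * ((a << 7) + n*m + (1 << 6)) >> 7, saturated to int8_t. */
    static int8_t sqrdmlah_b_sketch(int8_t n, int8_t m, int8_t a,
                                    bool neg, bool *sat)
    {
        int32_t ret = (int32_t)n * m;           /* widening multiply */
        if (neg) {
            ret = -ret;                         /* SQRDMLSH form */
        }
        ret += ((int32_t)a << 7) + (1 << 6);    /* accumulate + round */
        ret >>= 7;
        if (ret != (int8_t)ret) {               /* out of range: saturate */
            *sat = true;                        /* sticky QC-style flag */
            ret = ret < 0 ? INT8_MIN : INT8_MAX;
        }
        return ret;
    }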
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
228 ret = -ret; in do_sqrdmlah_h()
243 uint32_t *sat = &env->vfp.qc[0]; in HELPER()
268 uint32_t *sat = &env->vfp.qc[0]; in HELPER()
462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
469 ret = -ret; in do_sqrdmlah_s()
484 uint32_t *sat = &env->vfp.qc[0]; in HELPER()
506 uint32_t *sat = &env->vfp.qc[0]; in HELPER()
697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
714 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */ in do_sqrdmlah_d()
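For the 64-bit lanes there is no wider native type, hence the 128-bit arithmetic mentioned above. A sketch using the compiler's __int128 for the wide intermediate (the real helper composes it from 64-bit pieces):

    #include <stdint.h>
    #include <stdbool.h>

    static int64_t sqrdmlah_d_sketch(int64_t n, int64_t m, int64_t a,
                                     bool *sat)
    {
        __int128 ret = (__int128)n * m;
        ret += ((__int128)a << 63) + ((__int128)1 << 62);  /* acc + round */
        ret >>= 63;
        if (ret != (int64_t)ret) {
            *sat = true;
            ret = ret < 0 ? INT64_MIN : INT64_MAX;
        }
        return ret;
    }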
804 /* Integer 8 and 16-bit dot-product.
807 * with respect to the ordering of data within the quad-width lanes.
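A sketch of the SDOT-style lane loop this comment introduces (assumed semantics): each 32-bit destination lane accumulates four signed byte products, and because addition commutes, the order of the four bytes within the quad does not matter, which is why host endianness is irrelevant here.

    #include <stdint.h>
    #include <stddef.h>

    static void sdot_sketch(int32_t *d, const int8_t *n, const int8_t *m,
                            size_t lanes)
    {
        for (size_t i = 0; i < lanes; i++) {
            d[i] += n[4 * i + 0] * m[4 * i + 0]
                  + n[4 * i + 1] * m[4 * i + 1]
                  + n[4 * i + 2] * m[4 * i + 2]
                  + n[4 * i + 3] * m[4 * i + 3];
        }
    }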
984 float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag; in HELPER()
1052 float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag; in HELPER()
1120 float64 e3 = m[i + 1 - flip] ^ negx_imag; in HELPER()
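The e3/flip/negx_imag lines above come from the complex multiply-accumulate (FCMLA) helpers. A plain-float sketch of the rotation decomposition, derived from the architectural definition (the helpers express the same selection with 'flip' indexing and XORed sign masks so they can use fused softfloat muladds):

    /* One FCMLA element pair; d, n, m are {real, imag} pairs and rot
     * counts 90-degree steps (0..3). */
    static void fcmla_pair_sketch(float d[2], const float n[2],
                                  const float m[2], int rot)
    {
        int flip = rot & 1;                     /* odd rots use n[imag] */
        float sr = (rot == 1 || rot == 2) ? -1.0f : 1.0f;
        float si = (rot == 2 || rot == 3) ? -1.0f : 1.0f;
        float e2 = n[flip];

        d[0] += sr * e2 * m[flip];              /* real accumulation */
        d[1] += si * e2 * m[1 - flip];          /* imag accumulation */
    }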
1131 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1135 return -float16_eq_quiet(op1, op2, stat); in float16_ceq()
1140 return -float32_eq_quiet(op1, op2, stat); in float32_ceq()
1145 return -float64_eq_quiet(op1, op2, stat); in float64_ceq()
1150 return -float16_le(op2, op1, stat); in float16_cge()
1155 return -float32_le(op2, op1, stat); in float32_cge()
1160 return -float64_le(op2, op1, stat); in float64_cge()
1165 return -float16_lt(op2, op1, stat); in float16_cgt()
1170 return -float32_lt(op2, op1, stat); in float32_cgt()
1175 return -float64_lt(op2, op1, stat); in float64_cgt()
1180 return -float16_le(float16_abs(op2), float16_abs(op1), stat); in float16_acge()
1185 return -float32_le(float32_abs(op2), float32_abs(op1), stat); in float32_acge()
1190 return -float64_le(float64_abs(op2), float64_abs(op1), stat); in float64_acge()
1195 return -float16_lt(float16_abs(op2), float16_abs(op1), stat); in float16_acgt()
1200 return -float32_lt(float32_abs(op2), float32_abs(op1), stat); in float32_acgt()
1205 return -float64_lt(float64_abs(op2), float64_abs(op1), stat); in float64_acgt()
1288 /* Floating-point trigonometric starting value.
1354 * non-fused multiply-and-subtract.
1380 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1521 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */ in DO_3OP()
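The distinction called out here, and in the VRECPS/VRSQRTS step comments above, is where rounding happens. Illustrative plain-C version (the helpers use softfloat equivalents; assumes the compiler does not contract the expression into an fma):

    #include <math.h>

    static float muladd_nonfused(float a, float b, float c)
    {
        return a * b + c;        /* product rounded, then add rounded */
    }

    static float muladd_fused(float a, float b, float c)
    {
        return fmaf(a, b, c);    /* single rounding of a*b + c */
    }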
1631 /* For the indexed ops, SVE applies the index per 128-bit vector segment. in DO_MULADD()
1677 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1678 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1679 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
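A sketch of the per-segment indexing described above, for 32-bit lanes (a hypothetical helper, not the macro expansion itself): the indexed element is re-read from each 128-bit segment rather than once for the whole vector.

    #include <stdint.h>
    #include <stddef.h>

    static void mla_idx_s_sketch(uint32_t *d, const uint32_t *n,
                                 const uint32_t *m, size_t lanes, int idx)
    {
        const size_t seg = 16 / sizeof(uint32_t);   /* 4 lanes per segment */
        for (size_t i = 0; i < lanes; i += seg) {
            uint32_t mm = m[i + idx];               /* this segment's scalar */
            for (size_t j = 0; j < seg; j++) {
                d[i + j] += n[i + j] * mm;
            }
        }
    }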
1717 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1791 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1792 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1793 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1795 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1796 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1797 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
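What one DO_SAT expansion does per lane, sketched for the unsigned 8-bit saturating subtract (assumed semantics): do the operation in a wider type, clamp to the lane's range, and record saturation.

    #include <stdint.h>
    #include <stdbool.h>

    static uint8_t uqsub_b_sketch(uint8_t n, uint8_t m, bool *qc)
    {
        int dd = (int)n - (int)m;   /* wide intermediate, cf. the macro */
        if (dd < 0) {
            *qc = true;             /* sticky saturation flag */
            dd = 0;
        }
        return dd;
    }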
1839 uint64_t nn = n[i], mm = m[i], dd = nn - mm; in HELPER()
1883 int64_t nn = n[i], mm = m[i], dd = nn - mm; in HELPER()
1910 if (nn < (uint64_t)-mm) { in HELPER()
1941 if (mm > (uint64_t)(INT64_MAX - nn)) { in HELPER()
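At 64 bits there is no wider type to compute in, so the helpers test for wrap directly, as the fragments above show. A sketch of an unsigned saturating subtract and of adding a signed value to an unsigned accumulator (assumed semantics):

    #include <stdint.h>
    #include <stdbool.h>

    static uint64_t uqsub_d_sketch(uint64_t nn, uint64_t mm, bool *qc)
    {
        uint64_t dd = nn - mm;
        if (nn < mm) {                  /* borrow: clamp to 0 */
            *qc = true;
            dd = 0;
        }
        return dd;
    }

    static uint64_t usqadd_d_sketch(uint64_t nn, int64_t mm, bool *qc)
    {
        if (mm < 0) {
            uint64_t neg = -(uint64_t)mm;   /* safe even for INT64_MIN */
            if (nn < neg) {                 /* would go below zero */
                *qc = true;
                return 0;
            }
            return nn - neg;
        }
        if ((uint64_t)mm > UINT64_MAX - nn) {   /* would wrap past max */
            *qc = true;
            return UINT64_MAX;
        }
        return nn + (uint64_t)mm;
    }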
1985 TYPE tmp = n[i] >> (shift - 1); \ in DO_SRA()
2010 TYPE tmp = n[i] >> (shift - 1); \
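Both macros above use the 'tmp = n[i] >> (shift - 1)' trick: stop one bit short so the last bit shifted out survives and can be added back as a rounding increment. One-lane sketch for a signed 32-bit rounding shift right (1 <= shift <= 32):

    #include <stdint.h>

    static int32_t rshr32_sketch(int32_t n, int shift)
    {
        int64_t tmp = (int64_t)n >> (shift - 1);   /* bit 0 = rounding bit */
        return (int32_t)((tmp >> 1) + (tmp & 1));
    }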
2035 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
2054 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
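The two deposit64() calls above are the shift-and-insert operations: SRI keeps the top 'shift' bits of d, SLI keeps the bottom 'shift' bits. A minimal sketch with a deposit64() equivalent (QEMU's bitops version has the same meaning; shift in [1, 63] here):

    #include <stdint.h>

    /* Overwrite 'len' bits of 'dst' starting at bit 'pos'. */
    static uint64_t deposit64_sketch(uint64_t dst, int pos, int len,
                                     uint64_t val)
    {
        uint64_t mask = (len < 64 ? (1ULL << len) - 1 : ~0ULL) << pos;
        return (dst & ~mask) | ((val << pos) & mask);
    }

    static uint64_t sri64_sketch(uint64_t d, uint64_t n, int shift)
    {
        return deposit64_sketch(d, 0, 64 - shift, n >> shift);
    }

    static uint64_t sli64_sketch(uint64_t d, uint64_t n, int shift)
    {
        return deposit64_sketch(d, shift, 64 - shift, n);
    }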
2096 int shift = clz32(frac) - 21;
2098 exp = f32_bias - f16_bias - shift + 1;
2103 exp += f32_bias - f16_bias;
2107 frac <<= 23 - 10;
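These fragments are a flush-free float16 to float32 widen. A sketch of the whole conversion using the same bias and shift arithmetic (f16 bias 15, f32 bias 127; NaN payload details omitted; __builtin_clz stands in for QEMU's clz32()):

    #include <stdint.h>

    static uint32_t f16_to_f32_sketch(uint16_t h)
    {
        uint32_t sign = (uint32_t)(h >> 15) << 31;
        int exp = (h >> 10) & 0x1f;
        uint32_t frac = h & 0x3ff;

        if (exp == 0) {
            if (frac == 0) {
                return sign;                       /* +/- zero */
            }
            /* Subnormal: shift the fraction up until the leading one
             * falls off the top, and rebias the exponent to match. */
            int shift = __builtin_clz(frac) - 21;  /* 10 significant bits */
            frac = (frac << shift) & 0x3ff;
            exp = 127 - 15 - shift + 1;
        } else if (exp == 0x1f) {
            exp = 0xff;                            /* inf / NaN */
        } else {
            exp += 127 - 15;
        }
        return sign | ((uint32_t)exp << 23) | (frac << (23 - 10));
    }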
2133 float_status *fpst = &env->vfp.fp_status[fpst_idx]; in do_fmlal()
2134 bool fz16 = env->vfp.fpcr & FPCR_FZ16; in do_fmlal()
2141 * Pre-load all of the f16 data, avoiding overlap issues. in do_fmlal()
2172 if (env->vfp.fpcr & FPCR_AH) { in HELPER()
2186 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); in HELPER()
2187 float_status *status = &env->vfp.fp_status[FPST_A64]; in HELPER()
2188 bool fz16 = env->vfp.fpcr & FPCR_FZ16; in HELPER()
2192 if (env->vfp.fpcr & FPCR_AH) { in HELPER()
2200 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negx; in HELPER()
2201 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); in HELPER()
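A small runnable demo of the 'sel' / 'negx' scheme used by the FMLAL helpers above: sel (scaled by sizeof(float16)) picks the even or odd half-width elements, and negx is a sign-bit mask XORed into the f16 inputs so the negating forms cost nothing extra.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint16_t n[8] = { 0x3c00, 0x4000, 0x4200, 0x4400,  /* f16 bits */
                          0x4500, 0x4600, 0x4700, 0x4800 };/* for 1.0..8.0 */
        int sel = 1;               /* 1: read elements 1, 3, 5, 7 */
        uint16_t negx = 0x8000;    /* flip the f16 sign bit */

        for (int i = 0; i < 4; i++) {
            uint16_t nn = n[2 * i + sel] ^ negx;
            printf("f32 lane %d <- f16 element %d, bits 0x%04x\n",
                   i, 2 * i + sel, nn);
        }
        return 0;
    }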
2215 float_status *fpst = &env->vfp.fp_status[fpst_idx]; in do_fmlal_idx()
2216 bool fz16 = env->vfp.fpcr & FPCR_FZ16; in do_fmlal_idx()
2225 * Pre-load all of the f16 data, avoiding overlap issues. in do_fmlal_idx()
2255 if (env->vfp.fpcr & FPCR_AH) { in HELPER()
2269 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); in HELPER()
2271 float_status *status = &env->vfp.fp_status[FPST_A64]; in HELPER()
2272 bool fz16 = env->vfp.fpcr & FPCR_FZ16; in HELPER()
2276 if (env->vfp.fpcr & FPCR_AH) { in HELPER()
2287 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negx; in HELPER()
2311 res = nn >> (mm > -8 ? -mm : 7); in HELPER()
2332 res = nn >> (mm > -16 ? -mm : 15); in HELPER()
2353 if (mm > -8) { in HELPER()
2354 res = nn >> -mm; in HELPER()
2376 if (mm > -16) { in HELPER()
2377 res = nn >> -mm; in HELPER()
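These fragments implement the register-controlled shifts, where the shift count is itself signed: positive counts shift left, negative shift right, and an over-large right shift is clamped so sign bits still propagate. One-lane sketch for signed 8-bit data:

    #include <stdint.h>

    static int8_t sshl_b_sketch(int8_t nn, int8_t mm)
    {
        if (mm >= 8) {
            return 0;                        /* everything shifted out */
        } else if (mm >= 0) {
            return (int8_t)((uint32_t)nn << mm);  /* truncate to the lane */
        } else if (mm > -8) {
            return nn >> -mm;                /* arithmetic shift right */
        } else {
            return nn >> 7;                  /* clamp: all sign bits */
        }
    }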
2386 * 8x8->8 polynomial multiply.
2406 * 64x64->128 polynomial multiply.
2452 intptr_t sel = H4(simd_data(desc)); in HELPER()
2458 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); in HELPER()
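Carry-less ("polynomial") multiplication treats the operands as polynomials over GF(2): partial products are combined with XOR rather than addition. A minimal 8x8 -> 16 sketch of what the clmul_*() calls compute per lane:

    #include <stdint.h>

    static uint16_t clmul8_sketch(uint8_t n, uint8_t m)
    {
        uint16_t result = 0;
        for (int i = 0; i < 8; i++) {
            if (n & (1u << i)) {
                result ^= (uint16_t)m << i;   /* XOR in a shifted copy */
            }
        }
        return result;
    }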
2469 *(TYPE *)(vd + i) = -(nn OP 0); \
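The -(nn OP 0) idiom in the macro body above: a C comparison yields 0 or 1, and negating 1 produces the all-ones lane mask that Neon compares are defined to return.

    #include <stdint.h>

    static int8_t cgt0_b_sketch(int8_t nn)
    {
        return -(nn > 0);            /* 0x00 if false, 0xff if true */
    }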
2495 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2519 d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ in DO_ABD()
2737 * which is a series of 128-bit vectors concatenated)
2753 * NxN -> N highpart multiply
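Highpart multiply means: widen, multiply, keep the top half. A 32-bit sketch; the 64-bit helpers need a 128-bit intermediate or a mulu64()-style decomposition instead.

    #include <stdint.h>

    static uint32_t umulh32_sketch(uint32_t n, uint32_t m)
    {
        return (uint32_t)(((uint64_t)n * m) >> 32);
    }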
2861 * Integer matrix-multiply accumulate
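A sketch of SMMLA-style semantics per 128-bit segment (assumed shape): d is a 2x2 matrix of int32 accumulators, n and m each hold two rows of eight signed bytes, and every accumulator gains a row-by-row dot product.

    #include <stdint.h>

    static void smmla_sketch(int32_t d[2][2], const int8_t n[2][8],
                             const int8_t m[2][8])
    {
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < 2; j++) {
                int32_t sum = d[i][j];
                for (int k = 0; k < 8; k++) {
                    sum += n[i][k] * m[j][k];
                }
                d[i][j] = sum;
            }
        }
    }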
2946 * mode and denormal-flushing, and we do unfused multiplies and in DO_MMLA_B()
2948 * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits, in DO_MMLA_B()
2949 * and we perform a fused two-way sum-of-products without intermediate in DO_MMLA_B()
2956 bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF; in DO_MMLA_B()
2958 *statusp = env->vfp.fp_status[is_a64(env) ? FPST_A64 : FPST_A32]; in DO_MMLA_B()
2962 /* EBF=1 needs to do a step with round-to-odd semantics */ in DO_MMLA_B()
3008 * by performing the first multiply in round-to-odd, then doing in bfdotadd_ebf()
3009 * the second multiply as fused multiply-add, and rounding to in bfdotadd_ebf()
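The rough shape of that EBF=1 two-way sum-of-products, expressed with QEMU's softfloat API (assumed wiring; the real bfdotadd_ebf() differs in detail): the first product is rounded to odd, then folded into a fused multiply-add, so only the final rounding is observable.

    /* Assumes QEMU's softfloat types and calls (fpu/softfloat.h);
     * *fpst_odd is a copy of *fpst with float_round_to_odd selected. */
    static float32 bfdot2_sketch(float32 sum, float32 e1a, float32 e1b,
                                 float32 e2a, float32 e2b,
                                 float_status *fpst, float_status *fpst_odd)
    {
        float32 t = float32_mul(e1a, e2a, fpst_odd);  /* round-to-odd step */
        t = float32_muladd(e1b, e2b, t, 0, fpst);     /* fused second step */
        return float32_add(sum, t, fpst);
    }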
3153 intptr_t sel = simd_data(desc); in HELPER()
3158 float32 nn = n[H2(i * 2 + sel)] << 16; in HELPER()
3159 float32 mm = m[H2(i * 2 + sel)] << 16; in HELPER()
3169 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); in HELPER()
3180 float32 n_j = n[H2(2 * j + sel)] << 16; in HELPER()
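The '<< 16' in these loops is a complete conversion: bfloat16 is defined as the top 16 bits of an IEEE float32, so widening the raw bits needs only a shift.

    #include <stdint.h>

    static uint32_t bf16_to_f32_bits(uint16_t bf)
    {
        return (uint32_t)bf << 16;
    }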
3211 /* Bit count in each 8-bit word. */ in DO_CLAMP()
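One way to count bits in each 8-bit word of a 64-bit value is the classic SWAR reduction (a sketch; the helper may be written differently):

    #include <stdint.h>

    static uint64_t cnt_b_sketch(uint64_t n)
    {
        n -= (n >> 1) & 0x5555555555555555ULL;            /* 2-bit sums */
        n = (n & 0x3333333333333333ULL)
          + ((n >> 2) & 0x3333333333333333ULL);           /* 4-bit sums */
        return (n + (n >> 4)) & 0x0f0f0f0f0f0f0f0fULL;    /* per byte */
    }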