Lines Matching +full:library +full:- +full:sel
6 * This library is free software; you can redistribute it and/or
11 * This library is distributed in the hope that it will be useful,
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
132 * Similarly for half-word elements.
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
168 ret = -ret; in do_sqrdmlah_b()
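The fragments above come from the 8-bit SQRDMLAH helper. As a rough illustration of the arithmetic (a sketch under assumed semantics, not the QEMU source itself): widen to 32 bits, optionally negate the product for the subtracting SQRDMLSH form, fold in the accumulator and a rounding bit at the same fixed-point scale, shift back down, and saturate.

    #include <stdint.h>
    #include <stdbool.h>

    /* Hypothetical 8-bit SQRDMLAH/SQRDMLSH lane:
     * ((a << 7) + n*m + (1 << 6)) >> 7, saturated to int8_t. */
    static int8_t sqrdmlah_b_sketch(int8_t n, int8_t m, int8_t a,
                                    bool neg, bool *sat)
    {
        int32_t ret = (int32_t)n * m;           /* widening multiply */
        if (neg) {
            ret = -ret;                         /* SQRDMLSH form */
        }
        ret += ((int32_t)a << 7) + (1 << 6);    /* accumulate + round */
        ret >>= 7;
        if (ret != (int8_t)ret) {               /* out of range: saturate */
            *sat = true;                        /* sticky QC-style flag */
            ret = ret < 0 ? INT8_MIN : INT8_MAX;
        }
        return ret;
    }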
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
228 ret = -ret; in do_sqrdmlah_h()
243 uint32_t *sat = &env->vfp.qc[0]; in HELPER()
268 uint32_t *sat = &env->vfp.qc[0]; in HELPER()
462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
469 ret = -ret; in do_sqrdmlah_s()
484 uint32_t *sat = &env->vfp.qc[0]; in HELPER()
506 uint32_t *sat = &env->vfp.qc[0]; in HELPER()
697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
714 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */ in do_sqrdmlah_d()
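For the 64-bit lanes there is no wider native type, hence the 128-bit arithmetic mentioned above. A sketch using the compiler's __int128 for the wide intermediate (the real helper composes it from 64-bit pieces):

    #include <stdint.h>
    #include <stdbool.h>

    static int64_t sqrdmlah_d_sketch(int64_t n, int64_t m, int64_t a,
                                     bool *sat)
    {
        __int128 ret = (__int128)n * m;
        ret += ((__int128)a << 63) + ((__int128)1 << 62);  /* acc + round */
        ret >>= 63;
        if (ret != (int64_t)ret) {
            *sat = true;
            ret = ret < 0 ? INT64_MIN : INT64_MAX;
        }
        return ret;
    }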
804 /* Integer 8 and 16-bit dot-product.
807 * with respect to the ordering of data within the quad-width lanes.
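A sketch of the SDOT-style lane loop this comment introduces (assumed semantics): each 32-bit destination lane accumulates four signed byte products, and because addition commutes, the order of the four bytes within the quad does not matter, which is why host endianness is irrelevant here.

    #include <stdint.h>
    #include <stddef.h>

    static void sdot_sketch(int32_t *d, const int8_t *n, const int8_t *m,
                            size_t lanes)
    {
        for (size_t i = 0; i < lanes; i++) {
            d[i] += n[4 * i + 0] * m[4 * i + 0]
                  + n[4 * i + 1] * m[4 * i + 1]
                  + n[4 * i + 2] * m[4 * i + 2]
                  + n[4 * i + 3] * m[4 * i + 3];
        }
    }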
984 float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag; in HELPER()
1052 float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag; in HELPER()
1120 float64 e3 = m[i + 1 - flip] ^ negx_imag; in HELPER()
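The e3/flip/negx_imag lines above come from the complex multiply-accumulate (FCMLA) helpers. A plain-float sketch of the rotation decomposition, derived from the architectural definition (the helpers express the same selection with 'flip' indexing and XORed sign masks so they can use fused softfloat muladds):

    /* One FCMLA element pair; d, n, m are {real, imag} pairs and rot
     * counts 90-degree steps (0..3). */
    static void fcmla_pair_sketch(float d[2], const float n[2],
                                  const float m[2], int rot)
    {
        int flip = rot & 1;                     /* odd rots use n[imag] */
        float sr = (rot == 1 || rot == 2) ? -1.0f : 1.0f;
        float si = (rot == 2 || rot == 3) ? -1.0f : 1.0f;
        float e2 = n[flip];

        d[0] += sr * e2 * m[flip];              /* real accumulation */
        d[1] += si * e2 * m[1 - flip];          /* imag accumulation */
    }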
1131 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1135 return -float16_eq_quiet(op1, op2, stat); in float16_ceq()
1140 return -float32_eq_quiet(op1, op2, stat); in float32_ceq()
1145 return -float64_eq_quiet(op1, op2, stat); in float64_ceq()
1150 return -float16_le(op2, op1, stat); in float16_cge()
1155 return -float32_le(op2, op1, stat); in float32_cge()
1160 return -float64_le(op2, op1, stat); in float64_cge()
1165 return -float16_lt(op2, op1, stat); in float16_cgt()
1170 return -float32_lt(op2, op1, stat); in float32_cgt()
1175 return -float64_lt(op2, op1, stat); in float64_cgt()
1180 return -float16_le(float16_abs(op2), float16_abs(op1), stat); in float16_acge()
1185 return -float32_le(float32_abs(op2), float32_abs(op1), stat); in float32_acge()
1190 return -float64_le(float64_abs(op2), float64_abs(op1), stat); in float64_acge()
1195 return -float16_lt(float16_abs(op2), float16_abs(op1), stat); in float16_acgt()
1200 return -float32_lt(float32_abs(op2), float32_abs(op1), stat); in float32_acgt()
1205 return -float64_lt(float64_abs(op2), float64_abs(op1), stat); in float64_acgt()
1288 /* Floating-point trigonometric starting value.
1354 * non-fused multiply-and-subtract.
1380 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1521 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */ in DO_3OP()
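The distinction called out here, and in the VRECPS/VRSQRTS step comments above, is where rounding happens. Illustrative plain-C version (the helpers use softfloat equivalents; assumes the compiler does not contract the expression into an fma):

    #include <math.h>

    static float muladd_nonfused(float a, float b, float c)
    {
        return a * b + c;        /* product rounded, then add rounded */
    }

    static float muladd_fused(float a, float b, float c)
    {
        return fmaf(a, b, c);    /* single rounding of a*b + c */
    }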
1631 /* For the indexed ops, SVE applies the index per 128-bit vector segment. in DO_MULADD()
1677 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1678 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1679 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
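A sketch of the per-segment indexing described above, for 32-bit lanes (a hypothetical helper, not the macro expansion itself): the indexed element is re-read from each 128-bit segment rather than once for the whole vector.

    #include <stdint.h>
    #include <stddef.h>

    static void mla_idx_s_sketch(uint32_t *d, const uint32_t *n,
                                 const uint32_t *m, size_t lanes, int idx)
    {
        const size_t seg = 16 / sizeof(uint32_t);   /* 4 lanes per segment */
        for (size_t i = 0; i < lanes; i += seg) {
            uint32_t mm = m[i + idx];               /* this segment's scalar */
            for (size_t j = 0; j < seg; j++) {
                d[i + j] += n[i + j] * mm;
            }
        }
    }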
1717 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1791 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1792 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1793 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1795 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1796 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1797 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
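What one DO_SAT expansion does per lane, sketched for the unsigned 8-bit saturating subtract (assumed semantics): do the operation in a wider type, clamp to the lane's range, and record saturation.

    #include <stdint.h>
    #include <stdbool.h>

    static uint8_t uqsub_b_sketch(uint8_t n, uint8_t m, bool *qc)
    {
        int dd = (int)n - (int)m;   /* wide intermediate, cf. the macro */
        if (dd < 0) {
            *qc = true;             /* sticky saturation flag */
            dd = 0;
        }
        return dd;
    }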
1839 uint64_t nn = n[i], mm = m[i], dd = nn - mm; in HELPER()
1883 int64_t nn = n[i], mm = m[i], dd = nn - mm; in HELPER()
1910 if (nn < (uint64_t)-mm) { in HELPER()
1941 if (mm > (uint64_t)(INT64_MAX - nn)) { in HELPER()
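At 64 bits there is no wider type to compute in, so the helpers test for wrap directly, as the fragments above show. A sketch of an unsigned saturating subtract and of adding a signed value to an unsigned accumulator (assumed semantics):

    #include <stdint.h>
    #include <stdbool.h>

    static uint64_t uqsub_d_sketch(uint64_t nn, uint64_t mm, bool *qc)
    {
        uint64_t dd = nn - mm;
        if (nn < mm) {                  /* borrow: clamp to 0 */
            *qc = true;
            dd = 0;
        }
        return dd;
    }

    static uint64_t usqadd_d_sketch(uint64_t nn, int64_t mm, bool *qc)
    {
        if (mm < 0) {
            uint64_t neg = -(uint64_t)mm;   /* safe even for INT64_MIN */
            if (nn < neg) {                 /* would go below zero */
                *qc = true;
                return 0;
            }
            return nn - neg;
        }
        if ((uint64_t)mm > UINT64_MAX - nn) {   /* would wrap past max */
            *qc = true;
            return UINT64_MAX;
        }
        return nn + (uint64_t)mm;
    }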
1985 TYPE tmp = n[i] >> (shift - 1); \ in DO_SRA()
2010 TYPE tmp = n[i] >> (shift - 1); \
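Both macros above use the 'tmp = n[i] >> (shift - 1)' trick: stop one bit short so the last bit shifted out survives and can be added back as a rounding increment. One-lane sketch for a signed 32-bit rounding shift right (1 <= shift <= 32):

    #include <stdint.h>

    static int32_t rshr32_sketch(int32_t n, int shift)
    {
        int64_t tmp = (int64_t)n >> (shift - 1);   /* bit 0 = rounding bit */
        return (int32_t)((tmp >> 1) + (tmp & 1));
    }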
2035 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
2054 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
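The two deposit64() calls above are the shift-and-insert operations: SRI keeps the top 'shift' bits of d, SLI keeps the bottom 'shift' bits. A minimal sketch with a deposit64() equivalent (QEMU's bitops version has the same meaning; shift in [1, 63] here):

    #include <stdint.h>

    /* Overwrite 'len' bits of 'dst' starting at bit 'pos'. */
    static uint64_t deposit64_sketch(uint64_t dst, int pos, int len,
                                     uint64_t val)
    {
        uint64_t mask = (len < 64 ? (1ULL << len) - 1 : ~0ULL) << pos;
        return (dst & ~mask) | ((val << pos) & mask);
    }

    static uint64_t sri64_sketch(uint64_t d, uint64_t n, int shift)
    {
        return deposit64_sketch(d, 0, 64 - shift, n >> shift);
    }

    static uint64_t sli64_sketch(uint64_t d, uint64_t n, int shift)
    {
        return deposit64_sketch(d, shift, 64 - shift, n);
    }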
2096 int shift = clz32(frac) - 21;
2098 exp = f32_bias - f16_bias - shift + 1;
2103 exp += f32_bias - f16_bias;
2107 frac <<= 23 - 10;
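These fragments are a flush-free float16 to float32 widen. A sketch of the whole conversion using the same bias and shift arithmetic (f16 bias 15, f32 bias 127; NaN payload details omitted; __builtin_clz stands in for QEMU's clz32()):

    #include <stdint.h>

    static uint32_t f16_to_f32_sketch(uint16_t h)
    {
        uint32_t sign = (uint32_t)(h >> 15) << 31;
        int exp = (h >> 10) & 0x1f;
        uint32_t frac = h & 0x3ff;

        if (exp == 0) {
            if (frac == 0) {
                return sign;                       /* +/- zero */
            }
            /* Subnormal: shift the fraction up until the leading one
             * falls off the top, and rebias the exponent to match. */
            int shift = __builtin_clz(frac) - 21;  /* 10 significant bits */
            frac = (frac << shift) & 0x3ff;
            exp = 127 - 15 - shift + 1;
        } else if (exp == 0x1f) {
            exp = 0xff;                            /* inf / NaN */
        } else {
            exp += 127 - 15;
        }
        return sign | ((uint32_t)exp << 23) | (frac << (23 - 10));
    }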
2133 float_status *fpst = &env->vfp.fp_status[fpst_idx]; in do_fmlal()
2134 bool fz16 = env->vfp.fpcr & FPCR_FZ16; in do_fmlal()
2141 * Pre-load all of the f16 data, avoiding overlap issues. in do_fmlal()
2172 if (env->vfp.fpcr & FPCR_AH) { in HELPER()
2186 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); in HELPER()
2187 float_status *status = &env->vfp.fp_status[FPST_A64]; in HELPER()
2188 bool fz16 = env->vfp.fpcr & FPCR_FZ16; in HELPER()
2192 if (env->vfp.fpcr & FPCR_AH) { in HELPER()
2200 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negx; in HELPER()
2201 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); in HELPER()
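A small runnable demo of the 'sel' / 'negx' scheme used by the FMLAL helpers above: sel (scaled by sizeof(float16)) picks the even or odd half-width elements, and negx is a sign-bit mask XORed into the f16 inputs so the negating forms cost nothing extra.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint16_t n[8] = { 0x3c00, 0x4000, 0x4200, 0x4400,  /* f16 bits */
                          0x4500, 0x4600, 0x4700, 0x4800 };/* for 1.0..8.0 */
        int sel = 1;               /* 1: read elements 1, 3, 5, 7 */
        uint16_t negx = 0x8000;    /* flip the f16 sign bit */

        for (int i = 0; i < 4; i++) {
            uint16_t nn = n[2 * i + sel] ^ negx;
            printf("f32 lane %d <- f16 element %d, bits 0x%04x\n",
                   i, 2 * i + sel, nn);
        }
        return 0;
    }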
2215 float_status *fpst = &env->vfp.fp_status[fpst_idx]; in do_fmlal_idx()
2216 bool fz16 = env->vfp.fpcr & FPCR_FZ16; in do_fmlal_idx()
2225 * Pre-load all of the f16 data, avoiding overlap issues. in do_fmlal_idx()
2255 if (env->vfp.fpcr & FPCR_AH) { in HELPER()
2269 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); in HELPER()
2271 float_status *status = &env->vfp.fp_status[FPST_A64]; in HELPER()
2272 bool fz16 = env->vfp.fpcr & FPCR_FZ16; in HELPER()
2276 if (env->vfp.fpcr & FPCR_AH) { in HELPER()
2287 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negx; in HELPER()
2311 res = nn >> (mm > -8 ? -mm : 7); in HELPER()
2332 res = nn >> (mm > -16 ? -mm : 15); in HELPER()
2353 if (mm > -8) { in HELPER()
2354 res = nn >> -mm; in HELPER()
2376 if (mm > -16) { in HELPER()
2377 res = nn >> -mm; in HELPER()
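These fragments implement the register-controlled shifts, where the shift count is itself signed: positive counts shift left, negative shift right, and an over-large right shift is clamped so sign bits still propagate. One-lane sketch for signed 8-bit data:

    #include <stdint.h>

    static int8_t sshl_b_sketch(int8_t nn, int8_t mm)
    {
        if (mm >= 8) {
            return 0;                        /* everything shifted out */
        } else if (mm >= 0) {
            return (int8_t)((uint32_t)nn << mm);  /* truncate to the lane */
        } else if (mm > -8) {
            return nn >> -mm;                /* arithmetic shift right */
        } else {
            return nn >> 7;                  /* clamp: all sign bits */
        }
    }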
2386 * 8x8->8 polynomial multiply.
2406 * 64x64->128 polynomial multiply.
2452 intptr_t sel = H4(simd_data(desc)); in HELPER()
2458 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); in HELPER()
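Carry-less ("polynomial") multiplication treats the operands as polynomials over GF(2): partial products are combined with XOR rather than addition. A minimal 8x8 -> 16 sketch of what the clmul_*() calls compute per lane:

    #include <stdint.h>

    static uint16_t clmul8_sketch(uint8_t n, uint8_t m)
    {
        uint16_t result = 0;
        for (int i = 0; i < 8; i++) {
            if (n & (1u << i)) {
                result ^= (uint16_t)m << i;   /* XOR in a shifted copy */
            }
        }
        return result;
    }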
2469 *(TYPE *)(vd + i) = -(nn OP 0); \
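The -(nn OP 0) idiom in the macro body above: a C comparison yields 0 or 1, and negating 1 produces the all-ones lane mask that Neon compares are defined to return.

    #include <stdint.h>

    static int8_t cgt0_b_sketch(int8_t nn)
    {
        return -(nn > 0);            /* 0x00 if false, 0xff if true */
    }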
2495 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2519 d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ in DO_ABD()
2737 * which is a series of 128-bit vectors concatenated)
2753 * NxN -> N highpart multiply
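Highpart multiply means: widen, multiply, keep the top half. A 32-bit sketch; the 64-bit helpers need a 128-bit intermediate or a mulu64()-style decomposition instead.

    #include <stdint.h>

    static uint32_t umulh32_sketch(uint32_t n, uint32_t m)
    {
        return (uint32_t)(((uint64_t)n * m) >> 32);
    }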
2861 * Integer matrix-multiply accumulate
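A sketch of SMMLA-style semantics per 128-bit segment (assumed shape): d is a 2x2 matrix of int32 accumulators, n and m each hold two rows of eight signed bytes, and every accumulator gains a row-by-row dot product.

    #include <stdint.h>

    static void smmla_sketch(int32_t d[2][2], const int8_t n[2][8],
                             const int8_t m[2][8])
    {
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < 2; j++) {
                int32_t sum = d[i][j];
                for (int k = 0; k < 8; k++) {
                    sum += n[i][k] * m[j][k];
                }
                d[i][j] = sum;
            }
        }
    }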
2946 * mode and denormal-flushing, and we do unfused multiplies and in DO_MMLA_B()
2948 * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits, in DO_MMLA_B()
2949 * and we perform a fused two-way sum-of-products without intermediate in DO_MMLA_B()
2956 bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF; in DO_MMLA_B()
2958 *statusp = env->vfp.fp_status[is_a64(env) ? FPST_A64 : FPST_A32]; in DO_MMLA_B()
2962 /* EBF=1 needs to do a step with round-to-odd semantics */ in DO_MMLA_B()
3008 * by performing the first multiply in round-to-odd, then doing in bfdotadd_ebf()
3009 * the second multiply as fused multiply-add, and rounding to in bfdotadd_ebf()
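The rough shape of that EBF=1 two-way sum-of-products, expressed with QEMU's softfloat API (assumed wiring; the real bfdotadd_ebf() differs in detail): the first product is rounded to odd, then folded into a fused multiply-add, so only the final rounding is observable.

    /* Assumes QEMU's softfloat types and calls (fpu/softfloat.h);
     * *fpst_odd is a copy of *fpst with float_round_to_odd selected. */
    static float32 bfdot2_sketch(float32 sum, float32 e1a, float32 e1b,
                                 float32 e2a, float32 e2b,
                                 float_status *fpst, float_status *fpst_odd)
    {
        float32 t = float32_mul(e1a, e2a, fpst_odd);  /* round-to-odd step */
        t = float32_muladd(e1b, e2b, t, 0, fpst);     /* fused second step */
        return float32_add(sum, t, fpst);
    }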
3153 intptr_t sel = simd_data(desc); in HELPER()
3158 float32 nn = n[H2(i * 2 + sel)] << 16; in HELPER()
3159 float32 mm = m[H2(i * 2 + sel)] << 16; in HELPER()
3169 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); in HELPER()
3180 float32 n_j = n[H2(2 * j + sel)] << 16; in HELPER()
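The '<< 16' in these loops is a complete conversion: bfloat16 is defined as the top 16 bits of an IEEE float32, so widening the raw bits needs only a shift.

    #include <stdint.h>

    static uint32_t bf16_to_f32_bits(uint16_t bf)
    {
        return (uint32_t)bf << 16;
    }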
3211 /* Bit count in each 8-bit word. */ in DO_CLAMP()
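One way to count bits in each 8-bit word of a 64-bit value is the classic SWAR reduction (a sketch; the helper may be written differently):

    #include <stdint.h>

    static uint64_t cnt_b_sketch(uint64_t n)
    {
        n -= (n >> 1) & 0x5555555555555555ULL;            /* 2-bit sums */
        n = (n & 0x3333333333333333ULL)
          + ((n >> 2) & 0x3333333333333333ULL);           /* 4-bit sums */
        return (n + (n >> 4)) & 0x0f0f0f0f0f0f0f0fULL;    /* per byte */
    }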