Lines Matching +full:width +full:- +full:mm
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
132 * Similarly for half-word elements.
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
168 ret = -ret;
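The b/h/s/d variants all implement the same arithmetic at different widths: double the product, add the accumulator scaled by 2^esize plus an optional rounding constant, shift back down by esize, and saturate. A minimal standalone sketch of the 8-bit case (names and the saturation out-parameter are illustrative, not this file's interface):

    static int8_t sqrdmlah8(int8_t n, int8_t m, int8_t a, bool round, bool *sat)
    {
        /* (a * 2^8 + 2 * n * m + rounding) >> 8, in 32-bit arithmetic */
        int32_t ret = 2 * (int32_t)n * m + ((int32_t)a << 8);
        if (round) {
            ret += 1 << 7;
        }
        ret >>= 8;
        if (ret != (int8_t)ret) {        /* saturate on 8-bit overflow */
            *sat = true;
            ret = ret < 0 ? INT8_MIN : INT8_MAX;
        }
        return ret;
    }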
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
228 ret = -ret;
243 uint32_t *sat = &env->vfp.qc[0];
268 uint32_t *sat = &env->vfp.qc[0];
324 int16_t mm = m[i];
326 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
342 int16_t mm = m[i];
344 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
360 int16_t mm = m[i];
362 d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
378 int16_t mm = m[i];
380 d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
440 int16_t mm = m[i];
442 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
455 int16_t mm = m[i];
457 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
469 ret = -ret;
484 uint32_t *sat = &env->vfp.qc[0];
506 uint32_t *sat = &env->vfp.qc[0];
559 int32_t mm = m[i];
561 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
577 int32_t mm = m[i];
579 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
595 int32_t mm = m[i];
597 d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
613 int32_t mm = m[i];
615 d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
675 int32_t mm = m[i];
677 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
690 int32_t mm = m[i];
692 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
714 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
783 int64_t mm = m[i];
785 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
797 int64_t mm = m[i];
799 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
804 /* Integer 8 and 16-bit dot-product.
807 * with respect to the ordering of data within the quad-width lanes.
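Each 32-bit destination lane accumulates the dot product of four adjacent 8-bit elements (the 16-bit form does the same into 64-bit lanes). That is why ordering within the quad-width lane is irrelevant: the four products are summed, and addition commutes. A minimal model of the signed 8-bit form (standalone sketch, not the file's actual macro):

    void sdot_b(int32_t *d, const int8_t *n, const int8_t *m,
                const int32_t *a, size_t opr_sz)
    {
        for (size_t i = 0; i < opr_sz / 4; i++) {
            d[i] = a[i]
                 + n[4 * i + 0] * m[4 * i + 0]
                 + n[4 * i + 1] * m[4 * i + 1]
                 + n[4 * i + 2] * m[4 * i + 2]
                 + n[4 * i + 3] * m[4 * i + 3];
        }
    }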
878 /* Similar for 2-way dot product */
1035 float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag;
1103 float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag;
1171 float64 e3 = m[i + 1 - flip] ^ negx_imag;
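The `^ negx_imag` idiom is conditional negation by sign-bit XOR: negx_imag is either zero or a mask with only the sign bit set, so the XOR flips the sign of the operand for the FCMLA rotations that need it, without going through the FP unit or raising exceptions. For example, at 32 bits:

    /* mask is either 0 (keep) or 1u << 31 (negate); float32 is a raw
       bit pattern here, so this never signals. */
    static inline float32 f32_cond_neg(float32 x, uint32_t mask)
    {
        return x ^ mask;
    }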
1182 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1186 return -float16_eq_quiet(op1, op2, stat);
1191 return -float32_eq_quiet(op1, op2, stat);
1196 return -float64_eq_quiet(op1, op2, stat);
1201 return -float16_le(op2, op1, stat);
1206 return -float32_le(op2, op1, stat);
1211 return -float64_le(op2, op1, stat);
1216 return -float16_lt(op2, op1, stat);
1221 return -float32_lt(op2, op1, stat);
1226 return -float64_lt(op2, op1, stat);
1231 return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1236 return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1241 return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1246 return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1251 return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1256 return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1339 /* Floating-point trigonometric starting value.
1405 * non-fused multiply-and-subtract.
1431 /* Reciprocal square-root step. AArch32 non-fused semantics. */
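FRECPS computes 2.0 - n * m and FRSQRTS computes (3.0 - n * m) / 2; under the AArch32 non-fused semantics each operation rounds separately rather than as one fused step. A sketch of the reciprocal step under those semantics (the real helpers first special-case the infinity-times-zero combinations):

    float32 recps_nf(float32 n, float32 m, float_status *stat)
    {
        /* two separately-rounded operations: multiply, then subtract */
        return float32_sub(float32_two, float32_mul(n, m, stat), stat);
    }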
1584 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
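The non-fused variant is just an ordinary multiply followed by an ordinary add, each rounding its own result; something along the lines of:

    static float16 float16_muladd_nf(float16 d, float16 n, float16 m,
                                     float_status *stat)
    {
        return float16_add(d, float16_mul(n, m, stat), stat);
    }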
1717 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1729 TYPE mm = m[H(i + idx)]; \
1731 d[i + j] = n[i + j] * mm; \
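The loop shape follows from the segment rule: `i` advances one 128-bit segment at a time, `idx` picks the multiplicand once per segment, and `j` walks the elements inside it. Expanded for 32-bit elements (standalone sketch, without the H() byte-order macros):

    void mul_idx_s(uint32_t *d, const uint32_t *n, const uint32_t *m,
                   unsigned idx, size_t opr_sz)
    {
        size_t segment = 16 / sizeof(uint32_t);   /* 4 elements per segment */
        for (size_t i = 0; i < opr_sz / 4; i += segment) {
            uint32_t mm = m[i + idx];             /* one pick per segment */
            for (size_t j = 0; j < segment; j++) {
                d[i + j] = n[i + j] * mm;
            }
        }
    }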
1751 TYPE mm = m[H(i + idx)]; \
1753 d[i + j] = a[i + j] OP n[i + j] * mm; \
1763 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1764 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1765 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1778 TYPE mm = m[H(i + idx)]; \
1780 d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \
1804 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1823 TYPE mm = m[H(i + idx)]; \
1825 d[i + j] = TYPE##_muladd(n[i + j] ^ NEGX, mm, \
1881 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1882 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1883 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1885 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1886 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1887 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
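DO_SAT widens to its first type argument, applies the operator, and clamps to [MIN, MAX]; the widening makes the raw arithmetic exact, so only the clamp can lose information. What the gvec_uqsub_b instantiation boils down to for one element (sketch; the real macro also records any clamping in the QC saturation flag):

    static uint8_t uqsub8(uint8_t n, uint8_t m, bool *sat)
    {
        int dd = (int)n - (int)m;   /* exact in the wider type */
        if (dd < 0) {               /* clamp to the 0..UINT8_MAX range */
            *sat = true;
            dd = 0;
        }
        return dd;
    }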
1907 uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1929 uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1930 if (nn < mm) {
1951 int64_t nn = n[i], mm = m[i], dd = nn + mm;
1952 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1973 int64_t nn = n[i], mm = m[i], dd = nn - mm;
1974 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
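Both conditions are the classic sign-bit overflow tests: signed addition overflows iff the operands share a sign and the result's sign differs, so `(dd ^ nn) & ~(nn ^ mm)` goes negative; subtraction overflows iff the operands differ in sign and the result's sign differs from the minuend, so `(dd ^ nn) & (nn ^ mm)` goes negative. When a test fires, the saturated value depends only on the sign of nn, which permits a branchless fixup:

    /* nn >= 0 saturates to INT64_MAX, nn < 0 to INT64_MIN (assumes the
       usual arithmetic right shift on signed values). */
    static int64_t sat64_from_sign(int64_t nn)
    {
        return (nn >> 63) ^ INT64_MAX;
    }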
1996 int64_t mm = m[i];
1997 uint64_t dd = nn + mm;
1999 if (mm < 0) {
2000 if (nn < (uint64_t)-mm) {
2028 uint64_t mm = m[i];
2029 int64_t dd = nn + mm;
2031 if (mm > (uint64_t)(INT64_MAX - nn)) {
2075 TYPE tmp = n[i] >> (shift - 1); \
2100 TYPE tmp = n[i] >> (shift - 1); \
2125 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
2144 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
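SRI and SLI replace only part of each destination element, which deposit64(dst, pos, len, val) expresses directly: insert the low `len` bits of `val` into `dst` at bit position `pos`, keeping the rest of `dst`. Spelled out for an 8-bit SRI (SLI is the mirror image, inserting `n << shift` at bit `shift`):

    uint8_t sri8(uint8_t d, uint8_t n, int shift)   /* 1 <= shift <= 8 */
    {
        uint8_t mask = (1u << (8 - shift)) - 1;
        return (d & ~mask) | ((n >> shift) & mask); /* top bits of d kept */
    }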
2186 int shift = clz32(frac) - 21;
2188 exp = f32_bias - f16_bias - shift + 1;
2193 exp += f32_bias - f16_bias;
2197 frac <<= 23 - 10;
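These fragments are the core of a by-bits half-to-single conversion: rebase the exponent from bias 15 to bias 127, widen the fraction from 10 to 23 bits, and for denormal inputs first normalize the fraction with a count-leading-zeros. A condensed model for a positive, finite, non-zero input (the real routine also handles zero, infinities, NaNs, the sign bit, and FZ16 flushing):

    uint32_t f16_to_f32_bits(uint32_t f16)
    {
        const int f16_bias = 15, f32_bias = 127;
        int exp = (f16 >> 10) & 0x1f;
        uint32_t frac = f16 & 0x3ff;

        if (exp == 0) {
            /* denormal: normalize; frac has at most 10 significant bits,
               so the leading one lands at bit 10 after the shift */
            int shift = __builtin_clz(frac) - 21;
            frac = (frac << shift) & 0x3ff;
            exp = f32_bias - f16_bias - shift + 1;
        } else {
            exp += f32_bias - f16_bias;
        }
        return (uint32_t)exp << 23 | frac << (23 - 10);
    }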
2223 float_status *fpst = &env->vfp.fp_status[fpst_idx];
2224 bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2231 * Pre-load all of the f16 data, avoiding overlap issues.
2262 if (env->vfp.fpcr & FPCR_AH) {
2278 float_status *status = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64];
2279 bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2283 if (env->vfp.fpcr & FPCR_AH) {
2294 float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2297 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, negf, status);
2306 float_status *fpst = &env->vfp.fp_status[fpst_idx];
2307 bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2316 * Pre-load all of the f16 data, avoiding overlap issues.
2346 if (env->vfp.fpcr & FPCR_AH) {
2363 float_status *status = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64];
2364 bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2368 if (env->vfp.fpcr & FPCR_AH) {
2376 float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2384 float32_muladd(nn, mm, aa, negf, status);
2395 int8_t mm = m[i];
2398 if (mm >= 0) {
2399 if (mm < 8) {
2400 res = nn << mm;
2403 res = nn >> (mm > -8 ? -mm : 7);
2416 int8_t mm = m[i]; /* only 8 bits of shift are significant */
2419 if (mm >= 0) {
2420 if (mm < 16) {
2421 res = nn << mm;
2424 res = nn >> (mm > -16 ? -mm : 15);
2437 int8_t mm = m[i];
2440 if (mm >= 0) {
2441 if (mm < 8) {
2442 res = nn << mm;
2445 if (mm > -8) {
2446 res = nn >> -mm;
2460 int8_t mm = m[i]; /* only 8 bits of shift are significant */
2463 if (mm >= 0) {
2464 if (mm < 16) {
2465 res = nn << mm;
2468 if (mm > -16) {
2469 res = nn >> -mm;
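All four helpers implement the same SSHL/USHL contract: the shift count is the signed low byte of the m element, positive counts shift left, negative counts shift right, and out-of-range counts do not wrap. For signed elements a right shift of esize or more clamps to esize - 1, so the result is the sign fill; for unsigned elements it simply becomes zero. One signed byte lane, spelled out:

    int8_t sshl8(int8_t nn, int8_t mm)
    {
        if (mm >= 8) {
            return 0;                          /* shifted out entirely */
        } else if (mm >= 0) {
            return nn << mm;
        } else {
            return nn >> (mm > -8 ? -mm : 7);  /* clamp keeps sign fill */
        }
    }

For example, nn = -1 with mm = -9 yields -1 >> 7 = -1, where the unsigned variant would produce 0.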
2478 * 8x8->8 polynomial multiply.
2498 * 64x64->128 polynomial multiply.
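A polynomial (carry-less) multiply XORs shifted copies of one operand instead of adding them, i.e. multiplication in GF(2)[x]. A reference 8x8 -> 16 bit version; the 8x8->8 form keeps only the low byte of this, and the 64x64->128 form is the same loop at width 64:

    uint16_t clmul8(uint8_t n, uint8_t m)
    {
        uint16_t result = 0;
        for (int i = 0; i < 8; i++) {
            if (n & (1u << i)) {
                result ^= (uint16_t)m << i;   /* XOR instead of add */
            }
        }
        return result;
    }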
2520 uint64_t nn = n[hi], mm = m[hi];
2522 d[0] = clmul_8x4_packed(nn, mm);
2524 mm >>= 32;
2525 d[1] = clmul_8x4_packed(nn, mm);
2561 *(TYPE *)(vd + i) = -(nn OP 0); \
2587 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2611 d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2829 * which is a series of 128-bit vectors concatenated)
2845 * NxN -> N highpart multiply
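The highpart helpers return the upper half of the double-width product. For elements up to 32 bits a wider native type suffices; the 64-bit case needs a 64x64->128 primitive (QEMU's mulu64/muls64). The 32-bit unsigned case:

    uint32_t umulh32(uint32_t n, uint32_t m)
    {
        return (uint32_t)(((uint64_t)n * m) >> 32);
    }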
2953 * Integer matrix-multiply accumulate
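SMMLA/UMMLA/USMMLA treat every 128-bit segment as a 2x2 matrix of int32 accumulators, updated with the product of a 2x8 byte matrix from n and the transpose of a 2x8 byte matrix from m, so each output lane absorbs eight byte products. One segment of the signed form, as a sketch:

    void smmla_segment(int32_t d[4], const int8_t n[16], const int8_t m[16])
    {
        for (int row = 0; row < 2; row++) {
            for (int col = 0; col < 2; col++) {
                int32_t sum = d[row * 2 + col];
                for (int k = 0; k < 8; k++) {
                    sum += n[row * 8 + k] * m[col * 8 + k];
                }
                d[row * 2 + col] = sum;
            }
        }
    }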
3038 * mode and denormal-flushing, and we do unfused multiplies and
3040 * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
3041 * and we perform a fused two-way sum-of-products without intermediate
3048 bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
3050 *statusp = env->vfp.fp_status[is_a64(env) ? FPST_A64 : FPST_A32];
3054 /* EBF=1 needs to do a step with round-to-odd semantics */
3130 * by performing the first multiply in round-to-odd, then doing
3131 * the second multiply as fused multiply-add, and rounding to
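The round-to-odd step avoids a double rounding: rounding the first product to odd preserves the sticky information a later rounding needs, then the second product is folded in with a fused multiply-add and only that final operation rounds to the FPCR mode. The shape of the computation, as a sketch with illustrative names (`odd` being a float_status copy with float_round_to_odd set):

    float32 bf16_dot2(float32 sum, uint16_t e1a, uint16_t e1b,
                      uint16_t e2a, uint16_t e2b,
                      float_status *std, float_status *odd)
    {
        /* bf16 -> f32 widening is just a 16-bit left shift of the bits */
        float32 n1 = (uint32_t)e1a << 16, m1 = (uint32_t)e2a << 16;
        float32 n2 = (uint32_t)e1b << 16, m2 = (uint32_t)e2b << 16;

        float32 t = float32_mul(n1, m1, odd);    /* round-to-odd */
        t = float32_muladd(n2, m2, t, 0, std);   /* fused, final rounding */
        return float32_add(sum, t, std);
    }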
3319 float32 mm = m[H2(i * 2 + sel)] << 16;
3320 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], negf, stat);
3388 TYPE mm = *(TYPE *)(m + i); \
3389 TYPE dd = MIN(MAX(aa, nn), mm); \
3405 /* Bit count in each 8-bit word. */
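The per-byte bit count is the usual SWAR popcount: sum adjacent bit pairs, then nibbles, masking so every byte accumulates independently:

    uint64_t pcnt_bytes(uint64_t x)
    {
        x = x - ((x >> 1) & 0x5555555555555555ull);   /* 2-bit sums */
        x = (x & 0x3333333333333333ull)
          + ((x >> 2) & 0x3333333333333333ull);       /* 4-bit sums */
        x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0full;   /* one count per byte */
        return x;
    }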
3509 unsigned segment = idx & (segments - 1); \
3512 do_lut_##SUFF(zd, indexes.d, (void *)env->za_state.zt0, elements, \