
/*
 * M-profile MVE Operations
 */

#include "exec/helper-proto.h"
#include "accel/tcg/cpu-ldst.h"
/* In mve_eci_mask(): */
    if ((env->condexec_bits & 0xf) != 0) {
        /* ... */
    }
    eci = env->condexec_bits >> 4;
/* In mve_element_mask(): */
    /*
     * ...
     * (3) low-overhead-branch tail predication will mask out part
     * ...
     * We combine all these into a 16-bit result with the same semantics
     * ...
     * 8-bit vector ops will look at all bits of the result;
     * 16-bit ops will look at bits 0, 2, 4, ...;
     * 32-bit ops will look at bits 0, 4, 8 and 12.
     * ...
     * the 4-bit slice of the mask corresponding to a single beat.
     */
    uint16_t mask = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0);

    if (!(env->v7m.vpr & R_V7M_VPR_MASK01_MASK)) {
        /* ... */
    }
    if (!(env->v7m.vpr & R_V7M_VPR_MASK23_MASK)) {
        /* ... */
    }
    if (env->v7m.ltpsize < 4 &&
        env->regs[14] <= (1 << (4 - env->v7m.ltpsize))) {
        /* ... */
    }
    /* ... */
    int masklen = env->regs[14] << env->v7m.ltpsize;
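/*
 * Hedged sketch (not from the file): how ops of different element
 * sizes consume the 16-bit mask described above. Each mask bit covers
 * one byte lane, so element e of an esize-byte op tests bit
 * (e * esize): esize 1 looks at every bit, esize 2 at bits 0, 2, 4,
 * ..., esize 4 at bits 0, 4, 8, 12. The helper name is hypothetical.
 */
static inline bool sketch_element_active(uint16_t mask, int e, int esize)
{
    return (mask >> (e * esize)) & 1;
}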
/* In mve_advance_vpt(): */
    uint32_t vpr = env->v7m.vpr;
    /* ... */
    if ((env->condexec_bits & 0xf) == 0) {
        env->condexec_bits = (env->condexec_bits == (ECI_A0A1A2B0 << 4)) ?
            /* ... */;
    }
    /* ... */
    env->v7m.vpr = vpr;
/*
 * 64-bit accesses are slightly different: they are done as two 32-bit
 * accesses ... and with a single 32-bit offset in the first of the two
 * Qm elements. ... stored in the even-beat element.
 */
            m[H4(e & ~1)] = addr - 4; /* write back the first access's address */ \
    /* ... */
            m[H4(e & ~1)] = addr - 4;                                   \
    /* ... one 32-bit memory access per beat. */
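/*
 * Sketch (assumptions flagged): a 64-bit gather element loaded as two
 * 32-bit accesses at addr and addr + 4, with the single 32-bit offset
 * taken from the even-numbered Qm element. ld32 stands in for the
 * real per-beat load; little-endian combination assumed.
 */
static inline uint64_t sketch_gather_ld64(uint32_t addr,
                                          uint32_t (*ld32)(uint32_t))
{
    uint32_t lo = ld32(addr);        /* even beat */
    uint32_t hi = ld32(addr + 4);    /* odd beat */
    return ((uint64_t)hi << 32) | lo;
}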
    for (e = 3; e >= 0; e--) {                                          \
    /* ... */
    for (e = 3; e >= 0; e--) {                                          \
    /* ... */
    for (e = 1; e >= 0; e--) {                                          \
    /*
     * ... into the 32-bit value, so we only need to write the 32-bit
     * ...
     */
#define DO_CLS_B(N)   (clrsb32(N) - 24)
#define DO_CLS_H(N)   (clrsb32(N) - 16)

#define DO_CLZ_B(N)   (clz32(N) - 24)
#define DO_CLZ_H(N)   (clz32(N) - 16)

#define DO_ABS(N)  ((N) < 0 ? -(N) : (N))
#define DO_NEG(N)  (-(N))
    /* ... All these insns work at 64-bit widths. */

/* provide unsigned 2-op helpers for all sizes */
/* ... */
/* provide signed 2-op helpers for all sizes */

/*
 * "Long" operations where two half-sized inputs (taken from either the
 * top or the bottom of the input vector) produce a double-width result.
 * ...
 */
        /* latch any accumulated saturation into FPSCR.QC: */
        env->vfp.qc[0] = qc;                                            \

/* provide unsigned 2-op helpers for all sizes */
/* ... */
/* provide signed 2-op helpers for all sizes */
#define DO_SUB(N, M) ((N) - (M))
/* ... */
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))

    return ((uint64_t)n - m) >> 1;   /* in do_vhsub_u() */
    return ((int64_t)n - m) >> 1;    /* in do_vhsub_s() */
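/*
 * Sketch of the halving-subtract trick above: do the subtraction at
 * 64 bits so the borrow is preserved, shift right once, and let the
 * truncation back to the element type produce the halved result,
 * e.g. VHSUB.U8 of 1 - 2 yields 0xff (-1/2, floored). The
 * instantiation below is hypothetical.
 */
static inline uint8_t sketch_vhsub_u8(uint8_t n, uint8_t m)
{
    return ((uint64_t)n - m) >> 1;
}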
    /* In do_vadc(): store C, clear NZV */
    env->vfp.fpsr &= ~FPSR_NZCV_MASK;
    env->vfp.fpsr |= carry_in * FPSR_C;

    /* In the VADC/VSBC HELPER()s: */
    bool carry_in = env->vfp.fpsr & FPSR_C;
    /* ... */
    bool carry_in = env->vfp.fpsr & FPSR_C;
    do_vadc(env, vd, vn, vm, -1, carry_in, false);
    /* ... */
    do_vadc(env, vd, vn, vm, -1, 1, true);
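/*
 * Sketch of the identity the subtract forms rely on when passing
 * inv == -1 above: in two's complement, n - m - (1 - carry) equals
 * n + ~m + carry, so a single add-with-carry routine serves both
 * VADC and VSBC. Hypothetical 32-bit scalar version:
 */
static inline uint32_t sketch_sbc32(uint32_t n, uint32_t m,
                                    bool carry_in, bool *carry_out)
{
    uint64_t r = (uint64_t)n + (uint32_t)~m + carry_in;
    *carry_out = r >> 32;
    return (uint32_t)r;
}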
            r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)]);             \
#define DO_SQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, INT8_MIN, INT8_MAX, s)
#define DO_SQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, INT16_MIN, INT16_MAX, s)
#define DO_SQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, INT32_MIN, INT32_MAX, s)

#define DO_UQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT8_MAX, s)
#define DO_UQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT16_MAX, s)
#define DO_UQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT32_MAX, s)
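/*
 * Hedged sketch in the style of the do_sat_bhw() calls above (not the
 * file's code): the widened result is clamped to the element's range
 * and the saturation flag is latched through *s.
 */
static inline int64_t sketch_sat(int64_t val, int64_t min, int64_t max,
                                 bool *s)
{
    if (val < min) {
        *s = true;
        return min;
    }
    if (val > max) {
        *s = true;
        return max;
    }
    return val;
}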
1247 * "shift by esize-1", adjusting the QRDMULH rounding constant to match.
1321 * (A * B - C * D) etc for VQDMLSDH. in DO_2OP_SAT_S()
1335 m[H##ESIZE(e - XCHG)], \ in DO_2OP_SAT_S()
1336 n[H##ESIZE(e + (1 - 2 * XCHG))], \ in DO_2OP_SAT_S()
1337 m[H##ESIZE(e + (1 - XCHG))], \ in DO_2OP_SAT_S()
1344 env->vfp.qc[0] = qc; \ in DO_2OP_SAT_S()
    /* ... bring it back into the non-saturated range. ... (in do_vqdmladh_w()) */

    /* In do_vqdmlsdh_b() / do_vqdmlsdh_h(): */
    int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 7);
    int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 15);
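/*
 * Sketch of how the lines above complete: the doubled dot-product
 * difference plus a rounding constant at bit (esize - 1) is clamped
 * at double width and the high half kept. Reuses the sketch_sat()
 * clamp sketched earlier; not necessarily the file's exact code.
 */
static inline int8_t sketch_vqdmlsdh_b(int8_t a, int8_t b, int8_t c,
                                       int8_t d, int round, bool *sat)
{
    int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 7);
    return sketch_sat(r, INT16_MIN, INT16_MAX, sat) >> 8;
}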
        env->vfp.qc[0] = qc;                                            \
    /* ... */
        env->vfp.qc[0] = qc;                                            \

/* provide unsigned 2-op scalar helpers for all sizes */

    /* ... bring it back into the non-saturated range. ... (in do_vqdmlah_w()) */
    /*
     * ... whether to propagate a saturation indication into FPSCR.QC -- for
     * the 16x16->32 case we must check only the bit corresponding to the T or B
     * half that we used, but for the 32x32->64 case we propagate if the mask
     * ...
     */
        env->vfp.qc[0] = qc;                                            \
    n >>= 8 - m;     /* in do_vbrsrb() */
    n >>= 16 - m;    /* in do_vbrsrh() */
    n >>= 32 - m;    /* in do_vbrsrw() */
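/*
 * Sketch of the VBRSR trick behind the shifts above: reversing the
 * low m bits of n is a full-width bit reverse followed by a right
 * shift that drops the (width - m) bits above the reversed field.
 * The portable reverse loop stands in for the real revbit8().
 */
static inline uint8_t sketch_brsr8(uint8_t n, unsigned m)
{
    uint8_t r = 0;
    for (unsigned i = 0; i < 8; i++) {
        r = (r << 1) | ((n >> i) & 1);   /* bit-reverse all 8 bits */
    }
    return m < 8 ? r >> (8 - m) : r;     /* m == 0 yields 0 */
}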
                (int64_t)n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)];    \
    /* ... */
DO_LDAV(vmlsldavsh, 2, int16_t, false, +=, -=)
DO_LDAV(vmlsldavxsh, 2, int16_t, true, +=, -=)
DO_LDAV(vmlsldavsw, 4, int32_t, false, +=, -=)
DO_LDAV(vmlsldavxsw, 4, int32_t, true, +=, -=)
    /* ... */
                n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)];             \
    /* ... */
DO_DAV_S(vmlsdav, false, +=, -=)
DO_DAV_S(vmlsdavx, true, +=, -=)
/*
 * ... this is implemented with a 72-bit internal accumulator value of which
 * ...
 * use 128-bit arithmetic -- we can do this because the 72-bit accumulator
 * is squashed back into 64 bits after each beat.
 */
                    mul = (LTYPE)n[H4(e - 1 * XCHG)] * m[H4(e)];        \
                    /* ... */
                        mul = -mul;                                     \
    m = -m;
    /* ... */
    m = -m;     /* in do_mina() */

        /* absolute difference: */
        uint32_t r = n0 >= m0 ? (n0 - m0) : (m0 - n0);                  \
        env->vfp.qc[0] = qc;                                            \

/* provide unsigned 2-op shift helpers for all sizes */
/* Shift-and-insert; we always work with 64 bits at a time */
    /*
     * ... this because it would try to shift by an out-of-range
     * ...
     */
#define SHL_MASK(EBITS, SHIFT) MAKE_64BIT_MASK((SHIFT), (EBITS) - (SHIFT))
#define SHR_MASK(EBITS, SHIFT) MAKE_64BIT_MASK(0, (EBITS) - (SHIFT))
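/*
 * Sketch of a shift-and-insert step using SHL_MASK above: for a
 * 32-bit VSLI-style insert (assuming 0 <= shift < 32), only the bits
 * the left shift can write come from the source; the rest keep the
 * old destination value. The function name is hypothetical.
 */
static inline uint32_t sketch_vsli32(uint32_t d, uint32_t m, unsigned shift)
{
    uint32_t mask = ~0u << shift;            /* == SHL_MASK(32, shift) */
    return (d & ~mask) | ((m << shift) & mask);
}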
/*
 * Long shifts taking half-sized inputs from top or bottom of the input
 * vector and producing a double-width result. ESIZE, TYPE are for
 * the input ...
 * ... because the long shift is strictly left-only.
 */
        env->vfp.qc[0] = qc;                                            \
    /* ... */
        env->vfp.qc[0] = qc;                                            \

    /*
     * For each 32-bit element, we shift it left, bringing in the
     * ...
     */
            rdm = d[H4(e)] >> (32 - shift);
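/*
 * Sketch of the per-element VSHLC step implied above: each 32-bit
 * element shifts left, the bits it pushes out become the carry-in
 * for the next element, and the final carry becomes the new RdM.
 * Assumes 0 < shift < 32; the helper name is hypothetical.
 */
static inline uint32_t sketch_shlc_step(uint32_t *elem, uint32_t carry_in,
                                        unsigned shift)
{
    uint32_t carry_out = *elem >> (32 - shift);
    *elem = (*elem << shift) | carry_in;
    return carry_out;
}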
    /* Scalar shift HELPER()s (a negated count selects a right shift): */
    return do_sqrshl_d(n, -(int8_t)shift, false, NULL);
    return do_sqrshl_d(n, (int8_t)shift, false, &env->QF);
    return do_uqrshl_d(n, (int8_t)shift, false, &env->QF);
    return do_sqrshl_d(n, -(int8_t)shift, true, &env->QF);
    return do_uqrshl_d(n, (int8_t)shift, true, &env->QF);
/* Operate on 64-bit values, but saturate at 48 bits */
    /* In do_sqrshl48_d(): */
    if (shift <= -48) {
        /* ... */
    }
    /* ... */
        src >>= -shift - 1;
    /* ... */
        val = src >> -shift;

/* Operate on 64-bit values, but saturate at 48 bits */
    /* In do_uqrshl48_d(): */
    if (shift <= -(48 + round)) {
        /* ... */
    }
        val = src >> (-shift - 1);
    /* ... */
        val = src >> -shift;
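/*
 * Sketch of the 48-bit saturation idea (left shifts only, no
 * rounding; the real helpers also handle right shifts): compute at
 * 64 bits, then clamp to [0, 2^48 - 1], latching the flag. Assumes
 * shift < 48; the helper name is hypothetical.
 */
static inline uint64_t sketch_uqshl48(uint64_t src, unsigned shift, bool *sat)
{
    uint64_t val = src << shift;
    uint64_t extval = val & MAKE_64BIT_MASK(0, 48);
    if ((val >> shift) != src || val != extval) {
        *sat = true;
        return MAKE_64BIT_MASK(0, 48);   /* saturate to 2^48 - 1 */
    }
    return val;
}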
    return do_sqrshl48_d(n, -(int8_t)shift, true, &env->QF);
    return do_uqrshl48_d(n, (int8_t)shift, true, &env->QF);

    return do_uqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF);
    return do_sqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF);
    return do_uqrshl_bhs(n, (int8_t)shift, 32, true, &env->QF);
    return do_sqrshl_bhs(n, -(int8_t)shift, 32, true, &env->QF);

    offset -= imm;      /* in do_sub_wrap() */
    /*
     * ... P0 bits for non-executed beats (where eci_mask is 0) are unchanged.
     */
        env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) |           \
    /* ... */
        env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) |           \

    uint16_t p0 = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0);

    /*
     * This insn is itself subject to predication and to beat-wise execution,
     * ...
     */
    uint16_t beatpred = ~env->v7m.vpr & mask;
    env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (beatpred & eci_mask);

    /*
     * ... ltpmask in mve_element_mask(), but we have pre-calculated
     * ...
     */
    env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (newmask & eci_mask);
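/*
 * Sketch of the recurring VPR update pattern above (hypothetical
 * helper name): new P0 bits are taken only for beats that actually
 * executed (eci_mask bit set); all other bits keep their old value.
 */
static inline uint32_t sketch_merge_vpr(uint32_t vpr, uint32_t newmask,
                                        uint32_t eci_mask)
{
    return (vpr & ~eci_mask) | (newmask & eci_mask);
}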
        env->vfp.qc[0] = qc;                                            \

#define DO_VQNEG_B(N, SATP) do_sat_bhs(-(int64_t)N, INT8_MIN, INT8_MAX, SATP)
#define DO_VQNEG_H(N, SATP) do_sat_bhs(-(int64_t)N, INT16_MIN, INT16_MAX, SATP)
#define DO_VQNEG_W(N, SATP) do_sat_bhs(-(int64_t)N, INT32_MIN, INT32_MAX, SATP)
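/*
 * Sketch: the widening in the DO_VQNEG_* macros matters because
 * -INT8_MIN does not fit in int8_t; negating at 64 bits and clamping
 * (sketch_sat() again) gives INT8_MAX with the flag set.
 */
static inline int8_t sketch_vqneg8(int8_t n, bool *satp)
{
    return sketch_sat(-(int64_t)n, INT8_MIN, INT8_MAX, satp);
}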
/*
 * 2-operand floating point. Note that if an element is partially
 * predicated we must do the FP operation to update the non-predicated
 * ...
 */
        fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
    /* ... */
        fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
        /* ... */
            r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)], fpst);       \
#define DO_VFMA(OP, ESIZE, TYPE, CHS)                                   \
    /* ... */
        fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
        /* ... */
        if (CHS) { /* CHS: flip the sign of one multiplicand (the VFMS forms) */ \
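/*
 * Sketch of the CHS ("change sign") trick: the fused
 * multiply-subtract forms reuse the multiply-add path after flipping
 * the sign of one multiplicand. float16_chs() and float16_muladd()
 * are QEMU softfloat routines; the wrapper name is hypothetical.
 */
static inline float16 sketch_fms_h(float16 d, float16 n, float16 m,
                                   float_status *fpst)
{
    return float16_muladd(float16_chs(n), m, d, 0, fpst);
}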
        fpst0 = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
    /*
     * Each FP helper macro selects its float_status the same way, and
     * the FP compare/VPT macros merge their predicate result under
     * eci_mask (repeated occurrences collapsed):
     */
        fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
    /* ... */
        env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) |           \
    /* In do_vcvt_sh(): */
    bool ieee = !(env->vfp.fpcr & FPCR_AHP);
    /* ... */
    float_status *base_fpst = &env->vfp.fp_status[FPST_STD];

    /* In do_vcvt_hs(): */
    bool ieee = !(env->vfp.fpcr & FPCR_AHP);
    /* ... */
    float_status *base_fpst = &env->vfp.fp_status[FPST_STD];
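/*
 * Sketch of the per-element conversion behind do_vcvt_hs(): the
 * FPCR.AHP bit (clear means IEEE half precision) becomes the 'ieee'
 * flag of the softfloat conversion. float16_to_float32() is the real
 * softfloat routine; the wrapper name is hypothetical.
 */
static inline float32 sketch_cvt_h_to_s(float16 h, bool ieee,
                                        float_status *fpst)
{
    return float16_to_float32(h, ieee, fpst);
}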
        fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \