Lines Matching +full:width +full:- +full:mm
23 #include "exec/page-protection.h"
24 #include "exec/helper-proto.h"
26 #include "exec/tlb-flags.h"
27 #include "tcg/tcg-gvec-desc.h"
32 #include "accel/tcg/cpu-ldst.h"
33 #include "accel/tcg/helper-retaddr.h"
34 #include "accel/tcg/cpu-ops.h"
37 #include "user/page-protection.h"
60 flags |= ((d & (g & -g)) != 0) << 31;
82 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
90 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
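As an aside, a minimal standalone sketch of the flag accumulation these lines rely on (not from sve_helper.c; the helper name is made up, and the C-flag handling of the original is omitted). The key trick is that g & -g isolates the lowest set bit of the governing predicate, so (d & (g & -g)) != 0 tests the first active element:

#include <stdint.h>
#include <stdio.h>

/* Accumulate N (bit 31), Z (bit 1) and a "first word seen" marker (bit 2)
   over one 64-bit predicate word, in the spirit of iter_predtest_fwd. */
static uint32_t predtest_word(uint64_t d, uint64_t g, uint32_t flags)
{
    if (g) {
        if (!(flags & 4)) {
            /* g & -g isolates the first active element of this word. */
            flags |= (uint32_t)((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }
        /* Z accumulates "some active element of D is set". */
        flags |= (uint32_t)((d & g) != 0) << 1;
    }
    return flags;
}

int main(void)
{
    printf("%08x\n", predtest_word(0x1, 0x3, 0));  /* first active set -> N */
    return 0;
}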
101 /* The same for a multi-word predicate. */
128 return -(uint64_t)(byte & 1);
170 /* Fully general three-operand expander, controlled by a predicate.
171 * This is complicated by the host-endian storage of the register file.
187 TYPE mm = *(TYPE *)(vm + H(i)); \
188 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
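A small standalone illustration of the "host-endian storage" point made in the comment above (not from sve_helper.c): the Z registers are stored as arrays of host-order uint64_t, so the byte that holds architectural element 0 depends on the host, which is what the H() index macros compensate for.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    /* Element i of this 8 x 8-bit "register" has value i in the LE layout. */
    uint64_t word = 0x0706050403020100ull;
    uint8_t bytes[8];

    memcpy(bytes, &word, sizeof(word));
    /* On a little-endian host bytes[0] is 0; on a big-endian host it is 7,
       which is why byte indices get XOR-adjusted there. */
    printf("host byte 0 holds element %u\n", bytes[0]);
    return 0;
}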
195 /* Similarly, specialized for 64-bit operands. */
204 TYPE nn = n[i], mm = m[i]; \
205 d[i] = OP(nn, mm); \
216 #define DO_SUB(N, M) (N - M)
219 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
225 * zero and signed division of INT_MIN by -1. Both of these
227 * We special case all signed divisions by -1 to avoid having
230 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
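A hedged, self-contained sketch of the special-casing in the DO_SDIV macro above (the function name is made up): division by zero yields 0, and any division by -1 is folded into a wrapping negation so the INT_MIN / -1 case neither traps nor overflows.

#include <stdint.h>
#include <stdio.h>

static int32_t sdiv32(int32_t n, int32_t m)
{
    if (m == 0) {
        return 0;
    }
    if (m == -1) {
        /* Wrapping negate: INT32_MIN stays INT32_MIN instead of overflowing. */
        return (int32_t)(0u - (uint32_t)n);
    }
    return n / m;
}

int main(void)
{
    printf("%d %d %d\n", sdiv32(7, 0), sdiv32(INT32_MIN, -1), sdiv32(7, -2));
    return 0;
}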
347 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
523 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
524 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
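The 64-bit halving subtract above avoids a 65-bit intermediate by using (n >> 1) - (m >> 1) - (~n & m & 1): the low bits only ever contribute -1, and they do so exactly when n is even and m is odd. A brute-force check over small values, as an illustration (assumes arithmetic right shift of negative integers, as QEMU itself does):

#include <assert.h>
#include <stdint.h>

static int64_t hsub_narrow(int64_t n, int64_t m)
{
    return (n >> 1) - (m >> 1) - (~n & m & 1);
}

int main(void)
{
    for (int64_t n = -8; n <= 8; n++) {
        for (int64_t m = -8; m <= 8; m++) {
            /* The widened form cannot overflow for these inputs. */
            assert(hsub_narrow(n, m) == ((n - m) >> 1));
        }
    }
    return 0;
}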
570 #define DO_SQSUB_B(n, m) do_ssat_b((int64_t)n - m)
571 #define DO_SQSUB_H(n, m) do_ssat_h((int64_t)n - m)
572 #define DO_SQSUB_S(n, m) do_ssat_s((int64_t)n - m)
576 int64_t r = n - m;
589 #define DO_UQSUB_B(n, m) do_usat_b((int64_t)n - m)
590 #define DO_UQSUB_H(n, m) do_usat_h((int64_t)n - m)
591 #define DO_UQSUB_S(n, m) do_usat_s((int64_t)n - m)
595 return n > m ? n - m : 0;
612 /* Note that m - abs(n) cannot underflow. */
615 if (m > -n) {
644 return n < -m ? 0 : r;
660 * If the slot I is odd, the elements from VM {I-1, I}.
686 /* Similarly, specialized for 64-bit operands. */
779 /* Three-operand expander, controlled by a predicate, in which the
780 * third operand is "wide". That is, for D = N op M, the same 64-bit
789 TYPEW mm = *(TYPEW *)(vm + i); \
793 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
814 /* Fully general two-operand expander, controlled by a predicate.
832 /* Similarly, specialized for 64-bit operands. */
847 #define DO_CLS_B(N) (clrsb32(N) - 24)
848 #define DO_CLS_H(N) (clrsb32(N) - 16)
855 #define DO_CLZ_B(N) (clz32(N) - 24)
856 #define DO_CLZ_H(N) (clz32(N) - 16)
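The "- 24" and "- 16" corrections above exist because a leading-zero (or sign-bit) count computed with a 32-bit primitive over-counts a narrower operand by the padding bits. A one-line illustration, using the GCC/Clang builtin __builtin_clz as a stand-in for clz32:

#include <stdio.h>

int main(void)
{
    unsigned char x = 0x10;                          /* 3 leading zeros within 8 bits */
    printf("%d\n", __builtin_clz((unsigned)x) - 24); /* prints 3 */
    return 0;
}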
875 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
889 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
931 #define DO_ABS(N) (N < 0 ? -N : N)
938 #define DO_NEG(N) (-N)
976 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
977 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
985 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
986 x_ == min_ ? -min_ - 1 : -x_; })
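The expressions above clamp the one value with no representable negation: a saturating absolute value or negate maps INT_MIN to INT_MAX instead of wrapping back to INT_MIN. Spelled out for 32 bits in a standalone sketch (illustrative, not one of the helpers):

#include <stdint.h>
#include <stdio.h>

static int32_t sqneg32(int32_t x)
{
    return x == INT32_MIN ? INT32_MAX : -x;
}

int main(void)
{
    printf("%d %d\n", sqneg32(5), sqneg32(INT32_MIN));  /* -5 2147483647 */
    return 0;
}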
996 /* Three-operand expander, unpredicated, in which the third operand is "wide".
1003 TYPEW mm = *(TYPEW *)(vm + i); \
1006 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
1039 * Three-operand expander, unpredicated, in which the two inputs are
1050 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1051 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1119 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1120 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1150 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
1151 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
1169 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
1171 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
1191 #define DO_NMUL(N, M) -(N * M)
1255 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1263 /* Compute and store the entire 33-bit result at once. */
1272 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1293 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1295 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
1333 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1425 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1440 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1456 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1476 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1495 TYPE mm = m[i]; \
1497 d[i + j] = OP(n[i + j], mm, a[i + j]); \
1531 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1535 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
1547 #define DO_MLS(N, M, A) (A - N * M)
1577 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1580 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
1602 TYPE mm = *(TYPE *)(vm + i); \
1603 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
1733 /* Two-operand reduction expander, controlled by a predicate.
1735 * sign-extension. E.g. for SMAX, TYPERED must be signed,
1736 * but TYPERET must be unsigned so that e.g. a 32-bit value
1737 * is not sign-extended to the ABI uint64_t return type.
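An illustration of why the reduction type and the return type differ, as the comment above explains (the function name here is hypothetical, not one of the DO_VPZ helpers): reduce in the signed element type, then convert through the unsigned type of the same width so the uint64_t ABI return value is zero-extended rather than sign-extended.

#include <stdint.h>
#include <stdio.h>

static uint64_t smax_reduce_bytes(const int8_t *n, size_t len)
{
    int8_t r = INT8_MIN;

    for (size_t i = 0; i < len; i++) {
        r = n[i] > r ? n[i] : r;
    }
    return (uint8_t)r;   /* e.g. -1 returns as 0xff, not 0xffffffffffffffff */
}

int main(void)
{
    int8_t v[3] = { -5, -1, -128 };
    printf("%#llx\n", (unsigned long long)smax_reduce_bytes(v, 3));  /* 0xff */
    return 0;
}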
1786 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1787 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1788 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1789 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1815 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1816 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1817 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1818 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1826 TYPE tmp[16 / sizeof(TYPE)] = { [0 ... 16 / sizeof(TYPE) - 1] = INIT }; \
1862 DO_VPQ(sve2p1_uminqv_b, uint8_t, H1, -1, DO_MIN)
1863 DO_VPQ(sve2p1_uminqv_h, uint16_t, H2, -1, DO_MIN)
1864 DO_VPQ(sve2p1_uminqv_s, uint32_t, H4, -1, DO_MIN)
1865 DO_VPQ(sve2p1_uminqv_d, uint64_t, H8, -1, DO_MIN)
1880 #define DO_SUBR(X, Y) (Y - X)
1937 DO_LOGIC_QV(sve2p1_andqv, b, -1, DO_AND, DO_ORC)
1938 DO_LOGIC_QV(sve2p1_andqv, h, -1, DO_AND, DO_ORC)
1939 DO_LOGIC_QV(sve2p1_andqv, s, -1, DO_AND, DO_ORC)
1940 DO_LOGIC_QV(sve2p1_andqv, d, -1, DO_AND, DO_ORC)
1963 indication; e.g. not found for esz=3 is -8. */
1970 uint64_t this_g = g[--i] & mask;
1972 return i * 64 + (63 - clz64(this_g));
1975 return (intptr_t)-1 << esz;
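A sketch of the backward scan these lines come from (not the real helper, which also masks the predicate by the element-size pattern first): return the bit index of the last set bit across an array of predicate words, or a negative marker when nothing is set, e.g. -8 for esz=3 as the comment notes. __builtin_clzll stands in for clz64.

#include <stdint.h>
#include <stdio.h>

static intptr_t last_set_bit(const uint64_t *g, intptr_t words, int esz)
{
    for (intptr_t i = words; i > 0; ) {
        uint64_t w = g[--i];
        if (w) {
            return i * 64 + (63 - __builtin_clzll(w));
        }
    }
    return -(intptr_t)(1 << esz);   /* "not found": -8 when esz == 3 */
}

int main(void)
{
    uint64_t g[2] = { 0x81, 0 };
    printf("%ld %ld\n", (long)last_set_bit(g, 2, 3), (long)last_set_bit(g + 1, 1, 3));
    return 0;
}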
1992 this_d |= this_g & -this_g;
2016 uint64_t mask = -1;
2019 mask = ~((1ull << (next & 63)) - 1);
2020 next &= -64;
2025 next = (next & -64) + ctz64(this_g);
2029 mask = -1;
2053 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2065 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2077 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2094 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2098 /* Three-operand expander, immediate operand, controlled by a predicate.
2117 /* Similarly, specialized for 64-bit operands. */
2138 when N is negative, add 2**M-1. */
2139 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
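A worked example of the bias the DO_ASRD comment describes (illustrative, assuming arithmetic shift of negative ints as the helpers do): a plain arithmetic shift rounds toward minus infinity, while ASRD wants truncation toward zero, hence adding 2**M - 1 to negative inputs first.

#include <stdio.h>

static int asrd32(int n, int m)
{
    return (n + (n < 0 ? (1 << m) - 1 : 0)) >> m;
}

int main(void)
{
    printf("%d %d\n", -5 >> 1, asrd32(-5, 1));  /* -3 (floor) vs -2 (toward zero) */
    return 0;
}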
2319 TYPEW mm = *(TYPEW *)(vm + i); \
2320 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \
2330 TYPEW mm = *(TYPEW *)(vm + HW(i)); \
2331 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
2336 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2337 #define DO_SUBHN(N, M, SH) ((N - M) >> SH)
2338 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
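A small illustration of the rounding term in DO_RADDHN/DO_RSUBHN above (standalone, for the 16-bit to 8-bit case, SH = 8): adding half of the weight of the discarded bits before shifting rounds the narrowed high half to nearest instead of truncating.

#include <stdint.h>
#include <stdio.h>

static uint8_t raddhn16(uint16_t n, uint16_t m)
{
    return (uint8_t)((n + m + (1u << 7)) >> 8);
}

int main(void)
{
    /* Truncated high half would be 1; the rounded form gives 2. */
    printf("%u vs %u\n", (0x017f + 0x0001) >> 8, raddhn16(0x017f, 0x0001));
    return 0;
}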
2379 /* Fully general four-operand expander, controlled by a predicate.
2391 TYPE mm = *(TYPE *)(vm + H(i)); \
2393 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
2400 /* Similarly, specialized for 64-bit operands. */
2410 TYPE aa = a[i], nn = n[i], mm = m[i]; \
2411 d[i] = OP(aa, nn, mm); \
2417 #define DO_MLS(A, N, M) (A - N * M)
2518 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2538 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2570 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2613 uint16_t mm = m[i];
2614 if (mm & 1) {
2617 if (mm & 2) {
2631 uint32_t mm = m[i];
2632 if (mm & 1) {
2635 if (mm & 2) {
2649 uint64_t mm = m[i];
2650 if (mm & 1) {
2653 if (mm & 2) {
2753 uint64_t mm, uint32_t desc)
2759 mm = dup_const(MO_8, mm);
2763 d[i] = (mm & pp) | (nn & ~pp);
2768 uint64_t mm, uint32_t desc)
2774 mm = dup_const(MO_16, mm);
2778 d[i] = (mm & pp) | (nn & ~pp);
2783 uint64_t mm, uint32_t desc)
2789 mm = dup_const(MO_32, mm);
2793 d[i] = (mm & pp) | (nn & ~pp);
2798 uint64_t mm, uint32_t desc)
2806 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2857 /* Big-endian hosts need to frob the byte indices. If the copy
2858 * happens to be 8-byte aligned, then no frobbing necessary.
2882 i -= 4;
2896 i -= 2;
2909 i -= 1;
2962 size_t n_siz = opr_sz - n_ofs;
2983 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
2997 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3008 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3019 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3030 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3052 depositn(d->p, e * ESIZE, 1, extractn(s->d, elements * idx + e, 1)); \
3078 depositn(d->d, elements * idx + e, 1, extractn(s->p, e * ESIZE, 1)); \
3132 index -= nelem; \
3170 if (unlikely(vn - vd < opr_sz)) { \
3188 /* Mask of bits included in the even numbered predicates of width esz.
3190 * same pattern out to 16-bit units.
3200 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3203 * section 7-2 Shuffling Bits.
3210 for (i = 4; i >= n; i--) {
3220 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
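For reference, the esz=0 step of the shuffle described above, shown on 8 bits in a standalone sketch (masks are the standard interleave constants; compress_bits performs the inverse transformation, per Hacker's Delight "Shuffling Bits"):

#include <stdint.h>
#include <stdio.h>

/* Spread the low byte of x so that bit i lands at bit 2*i with zeros between. */
static uint32_t spread8(uint32_t x)
{
    x &= 0xff;
    x = (x | (x << 4)) & 0x0f0f;
    x = (x | (x << 2)) & 0x3333;
    x = (x | (x << 1)) & 0x5555;
    return x;
}

int main(void)
{
    printf("%#x %#x\n", spread8(0xff), spread8(0x0b));  /* 0x5555 0x45 */
    return 0;
}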
3245 uint64_t mm = *(uint64_t *)vm;
3249 mm = extract64(mm, high * half, half);
3251 mm = expand_bits(mm, esz);
3252 d[0] = nn | (mm << esize);
3276 uint64_t mm = m[H4(high + i)];
3279 mm = expand_bits(mm, esz);
3280 d[i] = nn | (mm << esize);
3288 uint16_t mm = m[H1(high + i)];
3291 mm = expand_bits(mm, esz);
3292 d16[H2(i)] = nn | (mm << esize);
3315 if ((vm - vd) < (uintptr_t)oprsz) {
3388 uint64_t mm = (m[i] & mask) << shl;
3389 d[i] = nn + mm;
3399 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3411 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3425 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3429 intptr_t ih = oprsz - 8 - i;
3438 intptr_t ih = H1(oprsz - 1 - i);
3466 if ((vn - vd) < (uintptr_t)oprsz) {
3502 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3505 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3514 memset(vd + oprsz - 16, 0, 16); \
3531 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3539 p -= oprsz; \
3605 memset(vd + oprsz - 16, 0, 16); \
3655 * indication; e.g. not found for esz=3 is -8.
3677 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3692 last_i = last_i * 8 + 63 - clz64(last_g);
3693 len = last_i - first_i + (1 << esz);
3699 swap_memmove(vd + len, vm, opr_sz * 8 - len);
3710 uint64_t nn = n[i], mm = m[i];
3712 d[i] = (nn & pp) | (mm & ~pp);
3724 uint64_t nn = n[i], mm = m[i];
3726 d[i] = (nn & pp) | (mm & ~pp);
3738 uint64_t nn = n[i], mm = m[i];
3740 d[i] = (nn & pp) | (mm & ~pp);
3752 uint64_t nn = n[i], mm = m[i];
3753 d[i] = (pg[H1(i)] & 1 ? nn : mm);
3787 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3799 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3801 TYPE mm = *(TYPE *)(vm + H(i)); \
3802 out |= nn OP mm; \
3867 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3869 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3871 out |= nn OP mm; \
3940 TYPE mm = simd_data(desc); \
3945 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3947 out |= nn OP mm; \
4027 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
4053 b = b & -b; /* first such */
4055 b = b | (b - 1); /* break after same */
4057 b = b - 1; /* break before same */
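The three expressions above, spelled out on a toy predicate word as a standalone illustration: b & -b isolates the first active element, b | (b - 1) keeps everything up to and including it ("break after"), and b - 1 keeps everything strictly before it ("break before").

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t b = 0x28;   /* active elements at bits 3 and 5 */
    printf("%#llx %#llx %#llx\n",
           (unsigned long long)(b & -b),
           (unsigned long long)(b | (b - 1)),
           (unsigned long long)(b - 1));   /* 0x8 0x2f 0x27 */
    return 0;
}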
4244 flags = iter_predtest_fwd(d->p[i], -1, flags);
4247 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4248 flags = iter_predtest_fwd(d->p[i], mask, flags);
4284 count = maxelem - count;
4301 count = elements - count;
4341 d->p[i] = esz_mask;
4344 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4377 do_whilel(&d[1], esz_mask, count - oprbits, oprbits);
4404 uint32_t i, invcount = oprbits - count;
4408 d->p[i] = bits;
4412 d->p[i] = bits & MAKE_64BIT_MASK(0, oprbits & 63);
4445 do_whileg(&d[0], esz_mask, count - oprbits, oprbits);
4472 * little to gain with a more complex non-recursive form.
4569 float16 mm = *(float16 *)(vm + H1_2(i));
4570 result = float16_add(result, mm, status);
4589 float32 mm = *(float32 *)(vm + H1_2(i));
4590 result = float32_add(result, mm, status);
4615 /* Fully general three-operand expander, controlled by a predicate,
4625 uint64_t pg = g[(i - 1) >> 6]; \
4627 i -= sizeof(TYPE); \
4630 TYPE mm = *(TYPE *)(vm + H(i)); \
4631 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4743 /* Three-operand expander, with one scalar operand, controlled by
4752 TYPE mm = scalar; \
4754 uint64_t pg = g[(i - 1) >> 6]; \
4756 i -= sizeof(TYPE); \
4759 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4820 /* Fully general two-operand expander, controlled by a predicate,
4830 uint64_t pg = g[(i - 1) >> 6]; \
4832 i -= sizeof(TYPE); \
5026 /* denormal: bias - fractional_zeros */
5027 return -15 - clz32(frac);
5037 /* normal: exp - bias */
5038 return exp - 15;
5054 /* denormal: bias - fractional_zeros */
5055 return -127 - clz32(frac);
5065 /* normal: exp - bias */
5066 return exp - 127;
5082 /* denormal: bias - fractional_zeros */
5083 return -1023 - clz64(frac);
5093 /* normal: exp - bias */
5094 return exp - 1023;
5115 uint64_t pg = g[(i - 1) >> 6];
5117 i -= 2;
5184 uint64_t pg = g[(i - 1) >> 6];
5186 i -= 2;
5253 uint64_t pg = g[(i - 1) >> 6];
5255 i -= 4;
5322 uint64_t pg = g[(i - 1) >> 6];
5324 i -= 8;
5383 /* Two operand floating-point comparison controlled by a predicate.
5392 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
5397 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
5400 TYPE mm = *(TYPE *)(vm + H(i)); \
5401 out |= OP(TYPE, nn, mm, status); \
5404 d[j--] = out; \
5447 /* One operand floating-point comparison against zero, controlled
5454 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
5459 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
5465 d[j--] = out; \
5488 /* FP Trig Multiply-Add. */
5503 float16 mm = m[i];
5507 if (float16_is_neg(mm)) {
5511 mm = float16_abs(mm);
5515 d[i] = float16_muladd(n[i], mm, coeff[xx], flags, s);
5534 float32 mm = m[i];
5538 if (float32_is_neg(mm)) {
5542 mm = float32_abs(mm);
5546 d[i] = float32_muladd(n[i], mm, coeff[xx], flags, s);
5569 float64 mm = m[i];
5573 if (float64_is_neg(mm)) {
5577 mm = float64_abs(mm);
5581 d[i] = float64_muladd(n[i], mm, coeff[xx], flags, s);
5598 uint64_t pg = g[(i - 1) >> 6];
5603 j = i - sizeof(float16);
5604 i -= 2 * sizeof(float16);
5636 uint64_t pg = g[(i - 1) >> 6];
5641 j = i - sizeof(float32);
5642 i -= 2 * sizeof(float32);
5674 uint64_t pg = g[(i - 1) >> 6];
5679 j = i - sizeof(float64);
5680 i -= 2 * sizeof(float64);
5725 uint64_t pg = g[(i - 1) >> 6];
5730 j = i - sizeof(float16);
5731 i -= 2 * sizeof(float16);
5775 uint64_t pg = g[(i - 1) >> 6];
5780 j = i - sizeof(float32);
5781 i -= 2 * sizeof(float32);
5825 uint64_t pg = g[(i - 1) >> 6];
5830 j = i - sizeof(float64);
5831 i -= 2 * sizeof(float64);
5878 reg_off &= -64;
5896 * Resolve the guest virtual address to info->host and info->flags.
5910 * User-only currently always issues with TBI. See the comment
5915 * We currently always enable TBI for user-only, and do not provide
5923 &info->host, retaddr);
5927 &info->host, &full, retaddr);
5929 info->flags = flags;
5937 memset(&info->attrs, 0, sizeof(info->attrs));
5939 info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5941 info->attrs = full->attrs;
5942 info->tagged = full->extra.arm.pte_attrs == 0xf0;
5945 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5946 info->host -= mem_off;
5960 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5965 /* Set all of the element indices to -1, and the TLB data to 0. */
5966 memset(info, -1, offsetof(SVEContLdSt, page));
5967 memset(info->page, 0, sizeof(info->page));
5974 reg_off_last = i * 64 + 63 - clz64(pg);
5987 info->reg_off_first[0] = reg_off_first;
5988 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5991 page_split = -(addr | TARGET_PAGE_MASK);
5994 info->reg_off_last[0] = reg_off_last;
5998 info->page_split = page_split;
6006 * active element is the one that's split, this value remains -1.
6010 info->reg_off_last[0] = reg_off_split - esize;
6017 info->reg_off_split = reg_off_split;
6018 info->mem_off_split = mem_off_split;
6035 info->reg_off_first[1] = reg_off_split;
6036 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
6037 info->reg_off_last[1] = reg_off_last;
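As a side note on the page_split computation seen above: -(addr | TARGET_PAGE_MASK) is the number of bytes from addr to the end of its page, because OR-ing in the mask sets every bit above the page offset and the two's-complement negation turns the offset into page_size - offset. A standalone illustration with a 4 KiB page size assumed:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t page_mask = ~(uint64_t)0xfff;   /* illustrative 4 KiB pages */
    uint64_t addr = 0x12345f80;
    uint64_t to_page_end = -(addr | page_mask);

    printf("%llu\n", (unsigned long long)to_page_end);  /* 0x1000 - 0xf80 = 128 */
    return 0;
}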
6042 * Resolve the guest virtual addresses to info->page[].
6051 int mem_off = info->mem_off_first[0];
6055 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
6061 if (likely(info->page_split < 0)) {
6070 if (info->mem_off_split >= 0) {
6075 mem_off = info->page_split;
6078 * of the vector, then: For first-fault we should continue
6079 * to generate faults for the second page. For no-fault,
6082 if (info->mem_off_first[0] < info->mem_off_split) {
6091 mem_off = info->mem_off_first[1];
6094 * so we're out of first-fault territory.
6099 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
6111 int flags0 = info->page[0].flags;
6112 int flags1 = info->page[1].flags;
6119 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
6120 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
6123 mem_off = info->mem_off_first[0];
6124 reg_off = info->reg_off_first[0];
6125 reg_last = info->reg_off_last[0];
6132 msize, info->page[0].attrs,
6141 mem_off = info->mem_off_split;
6144 info->page[0].attrs, wp_access, retaddr);
6147 mem_off = info->mem_off_first[1];
6149 reg_off = info->reg_off_first[1];
6150 reg_last = info->reg_off_last[1];
6157 msize, info->page[1].attrs,
6175 if (info->page[0].tagged) {
6176 mem_off = info->mem_off_first[0];
6177 reg_off = info->reg_off_first[0];
6178 reg_last = info->reg_off_split;
6180 reg_last = info->reg_off_last[0];
6195 mem_off = info->mem_off_first[1];
6196 if (mem_off >= 0 && info->page[1].tagged) {
6197 reg_off = info->reg_off_first[1];
6198 reg_last = info->reg_off_last[1];
6214 * Common helper for all contiguous 1,2,3,4-register predicated loads.
6234 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
6290 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
6298 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
6313 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6325 * Use the slow path to manage the cross-page misalignment.
6332 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6350 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6512 * Load contiguous data, first-fault and no-fault.
6514 * For user-only, we control the race between page_check_range and
6527 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
6539 * Common helper for all contiguous no-fault and first-fault loads.
6549 void *vd = &env->vfp.zregs[rd];
6584 /* Trapping mte check for the first-fault element. */
6596 * Use the slow path for cross-page handling.
6605 swap_memzero(vd + reg_off, reg_max - reg_off);
6633 * Use the slow path for cross-page handling.
6644 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6645 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6698 * As an implementation choice, decline to handle a cross-page element
6716 * be low frequency as the guest walks through memory -- the next
6845 * Common helper for all contiguous 1,2,3,4-register predicated stores.
6907 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6930 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6942 * Use the slow path to manage the cross-page misalignment.
6949 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6967 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
7080 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
7131 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
7185 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7309 * Common helpers for all gather first-faulting loads.
7355 swap_memzero(vd + reg_off, reg_max - reg_off);
7365 in_page = -(addr | TARGET_PAGE_MASK);
7410 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7540 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
7583 * as a first-level check against the predicate, since only enabled
7584 * elements have non-null host addresses.
7612 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7714 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
7720 /* Set all of the element indices to -1, and the TLB data to 0. */
7721 memset(info, -1, offsetof(SVEContLdSt, page));
7722 memset(info->page, 0, sizeof(info->page));
7729 reg_off_last = reg_max * N - b_stride;
7735 reg_off_last = MIN(b_count - esize, reg_max * N - b_stride);
7738 info->reg_off_first[0] = reg_off_first;
7739 info->mem_off_first[0] = reg_off_first;
7741 page_split = -(addr | TARGET_PAGE_MASK);
7744 info->reg_off_last[0] = reg_off_last;
7748 info->page_split = page_split;
7754 * active element is the one that's split, this value remains -1.
7758 info->reg_off_last[0] = ROUND_DOWN(reg_off_split - esize, b_stride);
7762 if (page_split & (esize - 1)) {
7764 if ((reg_off_split & (b_stride - 1)) == 0) {
7765 info->reg_off_split = reg_off_split;
7766 info->mem_off_split = reg_off_split;
7777 info->reg_off_first[1] = reg_off_split;
7778 info->mem_off_first[1] = reg_off_split;
7779 info->reg_off_last[1] = reg_off_last;
7790 int flags0 = info->page[0].flags;
7791 int flags1 = info->page[1].flags;
7798 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
7799 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
7802 count_off = info->reg_off_first[0];
7803 count_last = info->reg_off_split;
7805 count_last = info->reg_off_last[0];
7809 esize, info->page[0].attrs, wp_access, ra);
7814 count_off = info->reg_off_first[1];
7816 count_last = info->reg_off_last[1];
7819 esize, info->page[1].attrs,
7837 * - first iteration hits addr + off, as required,
7838 * - second iteration hits ALIGN_UP(addr, 16),
7839 * - other iterations advance addr by 16.
7844 if (info->page[0].tagged) {
7845 count_off = info->reg_off_first[0];
7846 count_last = info->reg_off_split;
7848 count_last = info->reg_off_last[0];
7857 count_off = info->reg_off_first[1];
7858 if (count_off >= 0 && info->page[1].tagged) {
7859 count_last = info->reg_off_last[1];
7933 reg_last = MIN(count_last - count_off, reg_max - esize);
7964 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
7977 * Use the slow path to manage the cross-page misalignment.
7997 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
8101 reg_last = MIN(count_last - count_off, reg_max - esize);
8124 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
8137 * Use the slow path to manage the cross-page misalignment.
8157 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
8259 uint64_t signs = ones << (bits - 1);
8265 cmp0 = (cmp0 - ones) & ~cmp0;
8266 cmp1 = (cmp1 - ones) & ~cmp1;
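These lines use the classic SWAR zero-detection trick: after XOR-ing the value being searched for into each lane (so matching lanes become zero), (v - 0x01..01) & ~v & 0x80..80 sets the sign bit of every lane that held zero. A standalone illustration (note the well-known caveat that borrow propagation can also flag an 0x01 lane sitting directly above a zero lane, which users of the trick must tolerate or mask):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t ones  = 0x0101010101010101ull;
    uint64_t signs = 0x8080808080808080ull;
    uint64_t v = 0x1200ff0034005600ull;          /* zero bytes in lanes 0, 2, 4, 6 */
    uint64_t hit = (v - ones) & ~v & signs;

    printf("%#llx\n", (unsigned long long)hit);  /* 0x80008000800080 */
    return 0;
}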
8449 int shl = 8 - shr;
8463 int shl = 16 - shr;
8565 uint64_t pg = g[(i - 1) >> 6]; \
8567 i -= sizeof(TYPEW); \
8587 uint64_t pg = g[(i - 1) >> 6]; \
8589 i -= sizeof(TYPEW); \
8618 int b_count = (p.count << v_esz) - vl * part;
8625 do_whileg(vd, mask, vl - b_count, vl);