1 /* 2 * ARM SVE Operations 3 * 4 * Copyright (c) 2018 Linaro, Ltd. 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "cpu.h" 22 #include "internals.h" 23 #include "exec/exec-all.h" 24 #include "exec/helper-proto.h" 25 #include "tcg/tcg-gvec-desc.h" 26 #include "fpu/softfloat.h" 27 #include "tcg/tcg.h" 28 #include "vec_internal.h" 29 #include "sve_ldst_internal.h" 30 #include "hw/core/tcg-cpu-ops.h" 31 32 33 /* Return a value for NZCV as per the ARM PredTest pseudofunction. 34 * 35 * The return value has bit 31 set if N is set, bit 1 set if Z is clear, 36 * and bit 0 set if C is set. Compare the definitions of these variables 37 * within CPUARMState. 38 */ 39 40 /* For no G bits set, NZCV = C. */ 41 #define PREDTEST_INIT 1 42 43 /* This is an iterative function, called for each Pd and Pg word 44 * moving forward. 45 */ 46 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags) 47 { 48 if (likely(g)) { 49 /* Compute N from first D & G. 50 Use bit 2 to signal first G bit seen. */ 51 if (!(flags & 4)) { 52 flags |= ((d & (g & -g)) != 0) << 31; 53 flags |= 4; 54 } 55 56 /* Accumulate Z from each D & G. */ 57 flags |= ((d & g) != 0) << 1; 58 59 /* Compute C from last !(D & G). Replace previous. */ 60 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0); 61 } 62 return flags; 63 } 64 65 /* This is an iterative function, called for each Pd and Pg word 66 * moving backward. 67 */ 68 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags) 69 { 70 if (likely(g)) { 71 /* Compute C from first (i.e last) !(D & G). 72 Use bit 2 to signal first G bit seen. */ 73 if (!(flags & 4)) { 74 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */ 75 flags |= (d & pow2floor(g)) == 0; 76 } 77 78 /* Accumulate Z from each D & G. */ 79 flags |= ((d & g) != 0) << 1; 80 81 /* Compute N from last (i.e first) D & G. Replace previous. */ 82 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0); 83 } 84 return flags; 85 } 86 87 /* The same for a single word predicate. */ 88 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g) 89 { 90 return iter_predtest_fwd(d, g, PREDTEST_INIT); 91 } 92 93 /* The same for a multi-word predicate. */ 94 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words) 95 { 96 uint32_t flags = PREDTEST_INIT; 97 uint64_t *d = vd, *g = vg; 98 uintptr_t i = 0; 99 100 do { 101 flags = iter_predtest_fwd(d[i], g[i], flags); 102 } while (++i < words); 103 104 return flags; 105 } 106 107 /* Similarly for single word elements. 
*/ 108 static inline uint64_t expand_pred_s(uint8_t byte) 109 { 110 static const uint64_t word[] = { 111 [0x01] = 0x00000000ffffffffull, 112 [0x10] = 0xffffffff00000000ull, 113 [0x11] = 0xffffffffffffffffull, 114 }; 115 return word[byte & 0x11]; 116 } 117 118 #define LOGICAL_PPPP(NAME, FUNC) \ 119 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 120 { \ 121 uintptr_t opr_sz = simd_oprsz(desc); \ 122 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \ 123 uintptr_t i; \ 124 for (i = 0; i < opr_sz / 8; ++i) { \ 125 d[i] = FUNC(n[i], m[i], g[i]); \ 126 } \ 127 } 128 129 #define DO_AND(N, M, G) (((N) & (M)) & (G)) 130 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G)) 131 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G)) 132 #define DO_ORR(N, M, G) (((N) | (M)) & (G)) 133 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G)) 134 #define DO_NOR(N, M, G) (~((N) | (M)) & (G)) 135 #define DO_NAND(N, M, G) (~((N) & (M)) & (G)) 136 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G))) 137 138 LOGICAL_PPPP(sve_and_pppp, DO_AND) 139 LOGICAL_PPPP(sve_bic_pppp, DO_BIC) 140 LOGICAL_PPPP(sve_eor_pppp, DO_EOR) 141 LOGICAL_PPPP(sve_sel_pppp, DO_SEL) 142 LOGICAL_PPPP(sve_orr_pppp, DO_ORR) 143 LOGICAL_PPPP(sve_orn_pppp, DO_ORN) 144 LOGICAL_PPPP(sve_nor_pppp, DO_NOR) 145 LOGICAL_PPPP(sve_nand_pppp, DO_NAND) 146 147 #undef DO_AND 148 #undef DO_BIC 149 #undef DO_EOR 150 #undef DO_ORR 151 #undef DO_ORN 152 #undef DO_NOR 153 #undef DO_NAND 154 #undef DO_SEL 155 #undef LOGICAL_PPPP 156 157 /* Fully general three-operand expander, controlled by a predicate. 158 * This is complicated by the host-endian storage of the register file. 159 */ 160 /* ??? I don't expect the compiler could ever vectorize this itself. 161 * With some tables we can convert bit masks to byte masks, and with 162 * extra care wrt byte/word ordering we could use gcc generic vectors 163 * and do 16 bytes at a time. 164 */ 165 #define DO_ZPZZ(NAME, TYPE, H, OP) \ 166 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 167 { \ 168 intptr_t i, opr_sz = simd_oprsz(desc); \ 169 for (i = 0; i < opr_sz; ) { \ 170 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 171 do { \ 172 if (pg & 1) { \ 173 TYPE nn = *(TYPE *)(vn + H(i)); \ 174 TYPE mm = *(TYPE *)(vm + H(i)); \ 175 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 176 } \ 177 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 178 } while (i & 15); \ 179 } \ 180 } 181 182 /* Similarly, specialized for 64-bit operands. */ 183 #define DO_ZPZZ_D(NAME, TYPE, OP) \ 184 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 185 { \ 186 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 187 TYPE *d = vd, *n = vn, *m = vm; \ 188 uint8_t *pg = vg; \ 189 for (i = 0; i < opr_sz; i += 1) { \ 190 if (pg[H1(i)] & 1) { \ 191 TYPE nn = n[i], mm = m[i]; \ 192 d[i] = OP(nn, mm); \ 193 } \ 194 } \ 195 } 196 197 #define DO_AND(N, M) (N & M) 198 #define DO_EOR(N, M) (N ^ M) 199 #define DO_ORR(N, M) (N | M) 200 #define DO_BIC(N, M) (N & ~M) 201 #define DO_ADD(N, M) (N + M) 202 #define DO_SUB(N, M) (N - M) 203 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) 204 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) 205 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N)) 206 #define DO_MUL(N, M) (N * M) 207 208 209 /* 210 * We must avoid the C undefined behaviour cases: division by 211 * zero and signed division of INT_MIN by -1. Both of these 212 * have architecturally defined required results for Arm. 
213 * We special case all signed divisions by -1 to avoid having 214 * to deduce the minimum integer for the type involved. 215 */ 216 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M) 217 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M) 218 219 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND) 220 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND) 221 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND) 222 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND) 223 224 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR) 225 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR) 226 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR) 227 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR) 228 229 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR) 230 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR) 231 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR) 232 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR) 233 234 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC) 235 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC) 236 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC) 237 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC) 238 239 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) 240 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD) 241 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD) 242 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD) 243 244 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB) 245 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB) 246 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB) 247 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB) 248 249 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX) 250 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX) 251 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX) 252 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX) 253 254 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX) 255 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX) 256 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX) 257 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX) 258 259 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN) 260 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN) 261 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN) 262 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN) 263 264 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN) 265 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN) 266 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN) 267 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN) 268 269 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD) 270 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD) 271 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD) 272 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD) 273 274 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD) 275 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD) 276 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD) 277 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD) 278 279 /* Because the computation type is at least twice as large as required, 280 these work for both signed and unsigned source types. 
*/ 281 static inline uint8_t do_mulh_b(int32_t n, int32_t m) 282 { 283 return (n * m) >> 8; 284 } 285 286 static inline uint16_t do_mulh_h(int32_t n, int32_t m) 287 { 288 return (n * m) >> 16; 289 } 290 291 static inline uint32_t do_mulh_s(int64_t n, int64_t m) 292 { 293 return (n * m) >> 32; 294 } 295 296 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m) 297 { 298 uint64_t lo, hi; 299 muls64(&lo, &hi, n, m); 300 return hi; 301 } 302 303 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m) 304 { 305 uint64_t lo, hi; 306 mulu64(&lo, &hi, n, m); 307 return hi; 308 } 309 310 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL) 311 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL) 312 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL) 313 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL) 314 315 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b) 316 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h) 317 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s) 318 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d) 319 320 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b) 321 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h) 322 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s) 323 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d) 324 325 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV) 326 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV) 327 328 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV) 329 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV) 330 331 /* Note that all bits of the shift are significant 332 and not modulo the element size. */ 333 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1)) 334 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0) 335 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0) 336 337 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR) 338 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR) 339 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL) 340 341 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR) 342 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR) 343 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL) 344 345 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR) 346 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR) 347 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL) 348 349 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR) 350 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR) 351 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL) 352 353 static inline uint16_t do_sadalp_h(int16_t n, int16_t m) 354 { 355 int8_t n1 = n, n2 = n >> 8; 356 return m + n1 + n2; 357 } 358 359 static inline uint32_t do_sadalp_s(int32_t n, int32_t m) 360 { 361 int16_t n1 = n, n2 = n >> 16; 362 return m + n1 + n2; 363 } 364 365 static inline uint64_t do_sadalp_d(int64_t n, int64_t m) 366 { 367 int32_t n1 = n, n2 = n >> 32; 368 return m + n1 + n2; 369 } 370 371 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h) 372 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s) 373 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d) 374 375 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m) 376 { 377 uint8_t n1 = n, n2 = n >> 8; 378 return m + n1 + n2; 379 } 380 381 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m) 382 { 383 uint16_t n1 = n, n2 = n >> 16; 384 return m + n1 + n2; 385 } 386 387 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m) 388 { 389 uint32_t n1 = n, n2 = n >> 32; 390 return m + n1 + n2; 391 } 392 393 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h) 394 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s) 395 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d) 396 397 #define do_srshl_b(n, m) 
do_sqrshl_bhs(n, m, 8, true, NULL) 398 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL) 399 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL) 400 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL) 401 402 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b) 403 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h) 404 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s) 405 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d) 406 407 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL) 408 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL) 409 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL) 410 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL) 411 412 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b) 413 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h) 414 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s) 415 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d) 416 417 /* 418 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set. 419 * We pass in a pointer to a dummy saturation field to trigger 420 * the saturating arithmetic but discard the information about 421 * whether it has occurred. 422 */ 423 #define do_sqshl_b(n, m) \ 424 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); }) 425 #define do_sqshl_h(n, m) \ 426 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); }) 427 #define do_sqshl_s(n, m) \ 428 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); }) 429 #define do_sqshl_d(n, m) \ 430 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); }) 431 432 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b) 433 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h) 434 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s) 435 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d) 436 437 #define do_uqshl_b(n, m) \ 438 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 439 #define do_uqshl_h(n, m) \ 440 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 441 #define do_uqshl_s(n, m) \ 442 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); }) 443 #define do_uqshl_d(n, m) \ 444 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); }) 445 446 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b) 447 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h) 448 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s) 449 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d) 450 451 #define do_sqrshl_b(n, m) \ 452 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); }) 453 #define do_sqrshl_h(n, m) \ 454 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); }) 455 #define do_sqrshl_s(n, m) \ 456 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); }) 457 #define do_sqrshl_d(n, m) \ 458 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); }) 459 460 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b) 461 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h) 462 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s) 463 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d) 464 465 #undef do_sqrshl_d 466 467 #define do_uqrshl_b(n, m) \ 468 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); }) 469 #define do_uqrshl_h(n, m) \ 470 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); }) 471 #define do_uqrshl_s(n, m) \ 472 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); }) 473 #define do_uqrshl_d(n, m) \ 474 ({ uint32_t discard; do_uqrshl_d(n, 
m, true, &discard); }) 475 476 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b) 477 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h) 478 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s) 479 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d) 480 481 #undef do_uqrshl_d 482 483 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1) 484 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1)) 485 486 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS) 487 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS) 488 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS) 489 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D) 490 491 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS) 492 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS) 493 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS) 494 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D) 495 496 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1) 497 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1)) 498 499 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS) 500 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS) 501 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS) 502 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D) 503 504 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS) 505 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS) 506 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS) 507 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D) 508 509 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1) 510 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1)) 511 512 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS) 513 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS) 514 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS) 515 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D) 516 517 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS) 518 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS) 519 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS) 520 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D) 521 522 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max) 523 { 524 return val >= max ? max : val <= min ? min : val; 525 } 526 527 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX) 528 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX) 529 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX) 530 531 static inline int64_t do_sqadd_d(int64_t n, int64_t m) 532 { 533 int64_t r = n + m; 534 if (((r ^ n) & ~(n ^ m)) < 0) { 535 /* Signed overflow. */ 536 return r < 0 ? INT64_MAX : INT64_MIN; 537 } 538 return r; 539 } 540 541 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B) 542 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H) 543 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S) 544 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d) 545 546 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX) 547 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX) 548 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX) 549 550 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m) 551 { 552 uint64_t r = n + m; 553 return r < n ? 
UINT64_MAX : r; 554 } 555 556 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B) 557 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H) 558 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S) 559 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d) 560 561 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX) 562 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX) 563 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX) 564 565 static inline int64_t do_sqsub_d(int64_t n, int64_t m) 566 { 567 int64_t r = n - m; 568 if (((r ^ n) & (n ^ m)) < 0) { 569 /* Signed overflow. */ 570 return r < 0 ? INT64_MAX : INT64_MIN; 571 } 572 return r; 573 } 574 575 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B) 576 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H) 577 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S) 578 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d) 579 580 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX) 581 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX) 582 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX) 583 584 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m) 585 { 586 return n > m ? n - m : 0; 587 } 588 589 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B) 590 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H) 591 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S) 592 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d) 593 594 #define DO_SUQADD_B(n, m) \ 595 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX) 596 #define DO_SUQADD_H(n, m) \ 597 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX) 598 #define DO_SUQADD_S(n, m) \ 599 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX) 600 601 static inline int64_t do_suqadd_d(int64_t n, uint64_t m) 602 { 603 uint64_t r = n + m; 604 605 if (n < 0) { 606 /* Note that m - abs(n) cannot underflow. */ 607 if (r > INT64_MAX) { 608 /* Result is either very large positive or negative. */ 609 if (m > -n) { 610 /* m > abs(n), so r is a very large positive. */ 611 return INT64_MAX; 612 } 613 /* Result is negative. */ 614 } 615 } else { 616 /* Both inputs are positive: check for overflow. */ 617 if (r < m || r > INT64_MAX) { 618 return INT64_MAX; 619 } 620 } 621 return r; 622 } 623 624 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B) 625 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H) 626 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S) 627 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d) 628 629 #define DO_USQADD_B(n, m) \ 630 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX) 631 #define DO_USQADD_H(n, m) \ 632 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX) 633 #define DO_USQADD_S(n, m) \ 634 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX) 635 636 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m) 637 { 638 uint64_t r = n + m; 639 640 if (m < 0) { 641 return n < -m ? 0 : r; 642 } 643 return r < n ? UINT64_MAX : r; 644 } 645 646 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B) 647 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H) 648 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S) 649 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d) 650 651 #undef DO_ZPZZ 652 #undef DO_ZPZZ_D 653 654 /* 655 * Three operand expander, operating on element pairs. 656 * If the slot I is even, the elements from from VN {I, I+1}. 657 * If the slot I is odd, the elements from from VM {I-1, I}. 
658 * Load all of the input elements in each pair before overwriting output. 659 */ 660 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \ 661 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 662 { \ 663 intptr_t i, opr_sz = simd_oprsz(desc); \ 664 for (i = 0; i < opr_sz; ) { \ 665 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 666 do { \ 667 TYPE n0 = *(TYPE *)(vn + H(i)); \ 668 TYPE m0 = *(TYPE *)(vm + H(i)); \ 669 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 670 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 671 if (pg & 1) { \ 672 *(TYPE *)(vd + H(i)) = OP(n0, n1); \ 673 } \ 674 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 675 if (pg & 1) { \ 676 *(TYPE *)(vd + H(i)) = OP(m0, m1); \ 677 } \ 678 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 679 } while (i & 15); \ 680 } \ 681 } 682 683 /* Similarly, specialized for 64-bit operands. */ 684 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \ 685 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 686 { \ 687 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 688 TYPE *d = vd, *n = vn, *m = vm; \ 689 uint8_t *pg = vg; \ 690 for (i = 0; i < opr_sz; i += 2) { \ 691 TYPE n0 = n[i], n1 = n[i + 1]; \ 692 TYPE m0 = m[i], m1 = m[i + 1]; \ 693 if (pg[H1(i)] & 1) { \ 694 d[i] = OP(n0, n1); \ 695 } \ 696 if (pg[H1(i + 1)] & 1) { \ 697 d[i + 1] = OP(m0, m1); \ 698 } \ 699 } \ 700 } 701 702 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD) 703 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD) 704 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD) 705 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD) 706 707 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX) 708 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX) 709 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX) 710 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX) 711 712 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN) 713 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN) 714 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN) 715 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN) 716 717 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX) 718 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX) 719 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX) 720 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX) 721 722 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN) 723 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN) 724 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN) 725 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN) 726 727 #undef DO_ZPZZ_PAIR 728 #undef DO_ZPZZ_PAIR_D 729 730 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \ 731 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 732 void *status, uint32_t desc) \ 733 { \ 734 intptr_t i, opr_sz = simd_oprsz(desc); \ 735 for (i = 0; i < opr_sz; ) { \ 736 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 737 do { \ 738 TYPE n0 = *(TYPE *)(vn + H(i)); \ 739 TYPE m0 = *(TYPE *)(vm + H(i)); \ 740 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 741 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 742 if (pg & 1) { \ 743 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \ 744 } \ 745 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 746 if (pg & 1) { \ 747 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \ 748 } \ 749 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 750 } while (i & 15); \ 751 } \ 752 } 753 754 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add) 755 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add) 756 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, 
H1_8, float64_add) 757 758 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum) 759 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum) 760 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum) 761 762 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum) 763 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum) 764 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum) 765 766 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max) 767 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max) 768 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max) 769 770 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min) 771 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min) 772 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min) 773 774 #undef DO_ZPZZ_PAIR_FP 775 776 /* Three-operand expander, controlled by a predicate, in which the 777 * third operand is "wide". That is, for D = N op M, the same 64-bit 778 * value of M is used with all of the narrower values of N. 779 */ 780 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \ 781 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 782 { \ 783 intptr_t i, opr_sz = simd_oprsz(desc); \ 784 for (i = 0; i < opr_sz; ) { \ 785 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \ 786 TYPEW mm = *(TYPEW *)(vm + i); \ 787 do { \ 788 if (pg & 1) { \ 789 TYPE nn = *(TYPE *)(vn + H(i)); \ 790 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 791 } \ 792 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 793 } while (i & 7); \ 794 } \ 795 } 796 797 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR) 798 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR) 799 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL) 800 801 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR) 802 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 803 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 804 805 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR) 806 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 807 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 808 809 #undef DO_ZPZW 810 811 /* Fully general two-operand expander, controlled by a predicate. 812 */ 813 #define DO_ZPZ(NAME, TYPE, H, OP) \ 814 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 815 { \ 816 intptr_t i, opr_sz = simd_oprsz(desc); \ 817 for (i = 0; i < opr_sz; ) { \ 818 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 819 do { \ 820 if (pg & 1) { \ 821 TYPE nn = *(TYPE *)(vn + H(i)); \ 822 *(TYPE *)(vd + H(i)) = OP(nn); \ 823 } \ 824 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 825 } while (i & 15); \ 826 } \ 827 } 828 829 /* Similarly, specialized for 64-bit operands. 
*/ 830 #define DO_ZPZ_D(NAME, TYPE, OP) \ 831 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 832 { \ 833 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 834 TYPE *d = vd, *n = vn; \ 835 uint8_t *pg = vg; \ 836 for (i = 0; i < opr_sz; i += 1) { \ 837 if (pg[H1(i)] & 1) { \ 838 TYPE nn = n[i]; \ 839 d[i] = OP(nn); \ 840 } \ 841 } \ 842 } 843 844 #define DO_CLS_B(N) (clrsb32(N) - 24) 845 #define DO_CLS_H(N) (clrsb32(N) - 16) 846 847 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B) 848 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H) 849 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32) 850 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64) 851 852 #define DO_CLZ_B(N) (clz32(N) - 24) 853 #define DO_CLZ_H(N) (clz32(N) - 16) 854 855 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B) 856 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H) 857 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32) 858 DO_ZPZ_D(sve_clz_d, uint64_t, clz64) 859 860 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8) 861 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16) 862 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32) 863 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64) 864 865 #define DO_CNOT(N) (N == 0) 866 867 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT) 868 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT) 869 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT) 870 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT) 871 872 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1)) 873 874 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS) 875 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS) 876 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS) 877 878 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1)) 879 880 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG) 881 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG) 882 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG) 883 884 #define DO_NOT(N) (~N) 885 886 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT) 887 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT) 888 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT) 889 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT) 890 891 #define DO_SXTB(N) ((int8_t)N) 892 #define DO_SXTH(N) ((int16_t)N) 893 #define DO_SXTS(N) ((int32_t)N) 894 #define DO_UXTB(N) ((uint8_t)N) 895 #define DO_UXTH(N) ((uint16_t)N) 896 #define DO_UXTS(N) ((uint32_t)N) 897 898 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB) 899 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB) 900 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH) 901 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB) 902 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH) 903 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS) 904 905 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB) 906 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB) 907 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH) 908 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB) 909 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH) 910 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS) 911 912 #define DO_ABS(N) (N < 0 ? 
-N : N) 913 914 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS) 915 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS) 916 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS) 917 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS) 918 919 #define DO_NEG(N) (-N) 920 921 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG) 922 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG) 923 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG) 924 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG) 925 926 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16) 927 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32) 928 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64) 929 930 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32) 931 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64) 932 933 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64) 934 935 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc) 936 { 937 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 938 uint64_t *d = vd, *n = vn; 939 uint8_t *pg = vg; 940 941 for (i = 0; i < opr_sz; i += 2) { 942 if (pg[H1(i)] & 1) { 943 uint64_t n0 = n[i + 0]; 944 uint64_t n1 = n[i + 1]; 945 d[i + 0] = n1; 946 d[i + 1] = n0; 947 } 948 } 949 } 950 951 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8) 952 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16) 953 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32) 954 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64) 955 956 #define DO_SQABS(X) \ 957 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 958 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; }) 959 960 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS) 961 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS) 962 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS) 963 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS) 964 965 #define DO_SQNEG(X) \ 966 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 967 x_ == min_ ? -min_ - 1 : -x_; }) 968 969 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG) 970 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG) 971 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG) 972 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG) 973 974 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32) 975 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32) 976 977 /* Three-operand expander, unpredicated, in which the third operand is "wide". 978 */ 979 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \ 980 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 981 { \ 982 intptr_t i, opr_sz = simd_oprsz(desc); \ 983 for (i = 0; i < opr_sz; ) { \ 984 TYPEW mm = *(TYPEW *)(vm + i); \ 985 do { \ 986 TYPE nn = *(TYPE *)(vn + H(i)); \ 987 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 988 i += sizeof(TYPE); \ 989 } while (i & 7); \ 990 } \ 991 } 992 993 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR) 994 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR) 995 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL) 996 997 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR) 998 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 999 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 1000 1001 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR) 1002 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 1003 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 1004 1005 #undef DO_ZZW 1006 1007 #undef DO_CLS_B 1008 #undef DO_CLS_H 1009 #undef DO_CLZ_B 1010 #undef DO_CLZ_H 1011 #undef DO_CNOT 1012 #undef DO_FABS 1013 #undef DO_FNEG 1014 #undef DO_ABS 1015 #undef DO_NEG 1016 #undef DO_ZPZ 1017 #undef DO_ZPZ_D 1018 1019 /* 1020 * Three-operand expander, unpredicated, in which the two inputs are 1021 * selected from the top or bottom half of the wide column. 
1022 */ 1023 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1024 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1025 { \ 1026 intptr_t i, opr_sz = simd_oprsz(desc); \ 1027 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1028 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1029 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1030 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1031 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1032 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1033 } \ 1034 } 1035 1036 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1037 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1038 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1039 1040 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1041 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1042 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1043 1044 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1045 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1046 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1047 1048 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1049 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1050 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1051 1052 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1053 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1054 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1055 1056 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 1057 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1058 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1059 1060 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1061 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1062 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1063 1064 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1065 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1066 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1067 1068 /* Note that the multiply cannot overflow, but the doubling can. 
*/ 1069 static inline int16_t do_sqdmull_h(int16_t n, int16_t m) 1070 { 1071 int16_t val = n * m; 1072 return DO_SQADD_H(val, val); 1073 } 1074 1075 static inline int32_t do_sqdmull_s(int32_t n, int32_t m) 1076 { 1077 int32_t val = n * m; 1078 return DO_SQADD_S(val, val); 1079 } 1080 1081 static inline int64_t do_sqdmull_d(int64_t n, int64_t m) 1082 { 1083 int64_t val = n * m; 1084 return do_sqadd_d(val, val); 1085 } 1086 1087 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h) 1088 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1089 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1090 1091 #undef DO_ZZZ_TB 1092 1093 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1094 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1095 { \ 1096 intptr_t i, opr_sz = simd_oprsz(desc); \ 1097 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1098 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1099 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 1100 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1101 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1102 } \ 1103 } 1104 1105 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1106 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1107 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1108 1109 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1110 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1111 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1112 1113 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1114 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1115 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1116 1117 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1118 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1119 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1120 1121 #undef DO_ZZZ_WTB 1122 1123 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \ 1124 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1125 { \ 1126 intptr_t i, opr_sz = simd_oprsz(desc); \ 1127 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \ 1128 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \ 1129 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1130 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \ 1131 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \ 1132 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \ 1133 } \ 1134 } 1135 1136 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR) 1137 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR) 1138 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR) 1139 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR) 1140 1141 #undef DO_ZZZ_NTB 1142 1143 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1144 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1145 { \ 1146 intptr_t i, opr_sz = simd_oprsz(desc); \ 1147 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \ 1148 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1149 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1150 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \ 1151 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1152 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \ 1153 } \ 1154 } 1155 1156 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1157 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1158 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1159 1160 
DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 1161 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1162 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1163 1164 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1165 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1166 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1167 1168 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1169 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1170 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1171 1172 #define DO_NMUL(N, M) -(N * M) 1173 1174 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL) 1175 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL) 1176 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL) 1177 1178 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL) 1179 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL) 1180 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL) 1181 1182 #undef DO_ZZZW_ACC 1183 1184 #define DO_XTNB(NAME, TYPE, OP) \ 1185 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1186 { \ 1187 intptr_t i, opr_sz = simd_oprsz(desc); \ 1188 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1189 TYPE nn = *(TYPE *)(vn + i); \ 1190 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \ 1191 *(TYPE *)(vd + i) = nn; \ 1192 } \ 1193 } 1194 1195 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \ 1196 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1197 { \ 1198 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \ 1199 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1200 TYPE nn = *(TYPE *)(vn + i); \ 1201 *(TYPEN *)(vd + i + odd) = OP(nn); \ 1202 } \ 1203 } 1204 1205 #define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX) 1206 #define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX) 1207 #define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX) 1208 1209 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H) 1210 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S) 1211 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D) 1212 1213 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H) 1214 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S) 1215 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D) 1216 1217 #define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX) 1218 #define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX) 1219 #define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX) 1220 1221 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H) 1222 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S) 1223 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D) 1224 1225 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H) 1226 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S) 1227 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D) 1228 1229 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H) 1230 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S) 1231 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D) 1232 1233 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H) 1234 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S) 1235 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D) 1236 1237 #undef DO_XTNB 1238 #undef DO_XTNT 1239 1240 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1241 { 1242 intptr_t i, opr_sz = simd_oprsz(desc); 1243 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1)); 1244 uint32_t 
inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1245 uint32_t *a = va, *n = vn; 1246 uint64_t *d = vd, *m = vm; 1247 1248 for (i = 0; i < opr_sz / 8; ++i) { 1249 uint32_t e1 = a[2 * i + H4(0)]; 1250 uint32_t e2 = n[2 * i + sel] ^ inv; 1251 uint64_t c = extract64(m[i], 32, 1); 1252 /* Compute and store the entire 33-bit result at once. */ 1253 d[i] = c + e1 + e2; 1254 } 1255 } 1256 1257 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1258 { 1259 intptr_t i, opr_sz = simd_oprsz(desc); 1260 int sel = extract32(desc, SIMD_DATA_SHIFT, 1); 1261 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1262 uint64_t *d = vd, *a = va, *n = vn, *m = vm; 1263 1264 for (i = 0; i < opr_sz / 8; i += 2) { 1265 Int128 e1 = int128_make64(a[i]); 1266 Int128 e2 = int128_make64(n[i + sel] ^ inv); 1267 Int128 c = int128_make64(m[i + 1] & 1); 1268 Int128 r = int128_add(int128_add(e1, e2), c); 1269 d[i + 0] = int128_getlo(r); 1270 d[i + 1] = int128_gethi(r); 1271 } 1272 } 1273 1274 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \ 1275 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1276 { \ 1277 intptr_t i, opr_sz = simd_oprsz(desc); \ 1278 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1279 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1280 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1281 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1282 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1283 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1284 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \ 1285 } \ 1286 } 1287 1288 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1, 1289 do_sqdmull_h, DO_SQADD_H) 1290 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1291 do_sqdmull_s, DO_SQADD_S) 1292 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1293 do_sqdmull_d, do_sqadd_d) 1294 1295 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1, 1296 do_sqdmull_h, DO_SQSUB_H) 1297 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1298 do_sqdmull_s, DO_SQSUB_S) 1299 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1300 do_sqdmull_d, do_sqsub_d) 1301 1302 #undef DO_SQDMLAL 1303 1304 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \ 1305 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1306 { \ 1307 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1308 int rot = simd_data(desc); \ 1309 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1310 bool sub_r = rot == 1 || rot == 2; \ 1311 bool sub_i = rot >= 2; \ 1312 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1313 for (i = 0; i < opr_sz; i += 2) { \ 1314 TYPE elt1_a = n[H(i + sel_a)]; \ 1315 TYPE elt2_a = m[H(i + sel_a)]; \ 1316 TYPE elt2_b = m[H(i + sel_b)]; \ 1317 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \ 1318 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \ 1319 } \ 1320 } 1321 1322 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? 
-1 : 1)) 1323 1324 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA) 1325 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA) 1326 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA) 1327 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA) 1328 1329 #define DO_SQRDMLAH_B(N, M, A, S) \ 1330 do_sqrdmlah_b(N, M, A, S, true) 1331 #define DO_SQRDMLAH_H(N, M, A, S) \ 1332 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); }) 1333 #define DO_SQRDMLAH_S(N, M, A, S) \ 1334 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); }) 1335 #define DO_SQRDMLAH_D(N, M, A, S) \ 1336 do_sqrdmlah_d(N, M, A, S, true) 1337 1338 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B) 1339 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H) 1340 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S) 1341 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D) 1342 1343 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \ 1344 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1345 { \ 1346 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1347 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \ 1348 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \ 1349 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1350 bool sub_r = rot == 1 || rot == 2; \ 1351 bool sub_i = rot >= 2; \ 1352 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1353 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \ 1354 TYPE elt2_a = m[H(i + idx + sel_a)]; \ 1355 TYPE elt2_b = m[H(i + idx + sel_b)]; \ 1356 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \ 1357 TYPE elt1_a = n[H(i + j + sel_a)]; \ 1358 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \ 1359 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \ 1360 } \ 1361 } \ 1362 } 1363 1364 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA) 1365 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA) 1366 1367 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1368 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1369 1370 #undef DO_CMLA 1371 #undef DO_CMLA_FUNC 1372 #undef DO_CMLA_IDX_FUNC 1373 #undef DO_SQRDMLAH_B 1374 #undef DO_SQRDMLAH_H 1375 #undef DO_SQRDMLAH_S 1376 #undef DO_SQRDMLAH_D 1377 1378 /* Note N and M are 4 elements bundled into one unit. */ 1379 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a, 1380 int sel_a, int sel_b, int sub_i) 1381 { 1382 for (int i = 0; i <= 1; i++) { 1383 int32_t elt1_r = (int8_t)(n >> (16 * i)); 1384 int32_t elt1_i = (int8_t)(n >> (16 * i + 8)); 1385 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a)); 1386 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b)); 1387 1388 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1389 } 1390 return a; 1391 } 1392 1393 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a, 1394 int sel_a, int sel_b, int sub_i) 1395 { 1396 for (int i = 0; i <= 1; i++) { 1397 int64_t elt1_r = (int16_t)(n >> (32 * i + 0)); 1398 int64_t elt1_i = (int16_t)(n >> (32 * i + 16)); 1399 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a)); 1400 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b)); 1401 1402 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1403 } 1404 return a; 1405 } 1406 1407 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm, 1408 void *va, uint32_t desc) 1409 { 1410 int opr_sz = simd_oprsz(desc); 1411 int rot = simd_data(desc); 1412 int sel_a = rot & 1; 1413 int sel_b = sel_a ^ 1; 1414 int sub_i = (rot == 0 || rot == 3 ? 
-1 : 1); 1415 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1416 1417 for (int e = 0; e < opr_sz / 4; e++) { 1418 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1419 } 1420 } 1421 1422 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm, 1423 void *va, uint32_t desc) 1424 { 1425 int opr_sz = simd_oprsz(desc); 1426 int rot = simd_data(desc); 1427 int sel_a = rot & 1; 1428 int sel_b = sel_a ^ 1; 1429 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1430 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1431 1432 for (int e = 0; e < opr_sz / 8; e++) { 1433 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1434 } 1435 } 1436 1437 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm, 1438 void *va, uint32_t desc) 1439 { 1440 int opr_sz = simd_oprsz(desc); 1441 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1442 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2)); 1443 int sel_a = rot & 1; 1444 int sel_b = sel_a ^ 1; 1445 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1446 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1447 1448 for (int seg = 0; seg < opr_sz / 4; seg += 4) { 1449 uint32_t seg_m = m[seg + idx]; 1450 for (int e = 0; e < 4; e++) { 1451 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e], 1452 sel_a, sel_b, sub_i); 1453 } 1454 } 1455 } 1456 1457 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm, 1458 void *va, uint32_t desc) 1459 { 1460 int seg, opr_sz = simd_oprsz(desc); 1461 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1462 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 1463 int sel_a = rot & 1; 1464 int sel_b = sel_a ^ 1; 1465 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1466 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1467 1468 for (seg = 0; seg < opr_sz / 8; seg += 2) { 1469 uint64_t seg_m = m[seg + idx]; 1470 for (int e = 0; e < 2; e++) { 1471 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e], 1472 sel_a, sel_b, sub_i); 1473 } 1474 } 1475 } 1476 1477 #define DO_ZZXZ(NAME, TYPE, H, OP) \ 1478 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1479 { \ 1480 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \ 1481 intptr_t i, j, idx = simd_data(desc); \ 1482 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \ 1483 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1484 TYPE mm = m[i]; \ 1485 for (j = 0; j < segment; j++) { \ 1486 d[i + j] = OP(n[i + j], mm, a[i + j]); \ 1487 } \ 1488 } \ 1489 } 1490 1491 #define DO_SQRDMLAH_H(N, M, A) \ 1492 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); }) 1493 #define DO_SQRDMLAH_S(N, M, A) \ 1494 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); }) 1495 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true) 1496 1497 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1498 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1499 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D) 1500 1501 #define DO_SQRDMLSH_H(N, M, A) \ 1502 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); }) 1503 #define DO_SQRDMLSH_S(N, M, A) \ 1504 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); }) 1505 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true) 1506 1507 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H) 1508 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S) 1509 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D) 1510 1511 #undef DO_ZZXZ 1512 1513 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1514 void HELPER(NAME)(void *vd, void *vn, 
void *vm, void *va, uint32_t desc) \ 1515 { \ 1516 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1517 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1518 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1519 for (i = 0; i < oprsz; i += 16) { \ 1520 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1521 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1522 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1523 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \ 1524 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \ 1525 } \ 1526 } \ 1527 } 1528 1529 #define DO_MLA(N, M, A) (A + N * M) 1530 1531 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA) 1532 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA) 1533 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA) 1534 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA) 1535 1536 #define DO_MLS(N, M, A) (A - N * M) 1537 1538 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS) 1539 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS) 1540 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS) 1541 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS) 1542 1543 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M)) 1544 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M)) 1545 1546 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S) 1547 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D) 1548 1549 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M)) 1550 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M)) 1551 1552 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S) 1553 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D) 1554 1555 #undef DO_MLA 1556 #undef DO_MLS 1557 #undef DO_ZZXW 1558 1559 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1560 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1561 { \ 1562 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1563 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1564 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1565 for (i = 0; i < oprsz; i += 16) { \ 1566 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1567 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1568 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1569 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \ 1570 } \ 1571 } \ 1572 } 1573 1574 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1575 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1576 1577 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1578 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1579 1580 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1581 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1582 1583 #undef DO_ZZX 1584 1585 #define DO_BITPERM(NAME, TYPE, OP) \ 1586 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1587 { \ 1588 intptr_t i, opr_sz = simd_oprsz(desc); \ 1589 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1590 TYPE nn = *(TYPE *)(vn + i); \ 1591 TYPE mm = *(TYPE *)(vm + i); \ 1592 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \ 1593 } \ 1594 } 1595 1596 static uint64_t bitextract(uint64_t data, uint64_t mask, int n) 1597 { 1598 uint64_t res = 0; 1599 int db, rb = 0; 1600 1601 for (db = 0; db < n; ++db) { 1602 if ((mask >> db) & 1) { 1603 res |= ((data >> db) & 1) 
<< rb; 1604 ++rb; 1605 } 1606 } 1607 return res; 1608 } 1609 1610 DO_BITPERM(sve2_bext_b, uint8_t, bitextract) 1611 DO_BITPERM(sve2_bext_h, uint16_t, bitextract) 1612 DO_BITPERM(sve2_bext_s, uint32_t, bitextract) 1613 DO_BITPERM(sve2_bext_d, uint64_t, bitextract) 1614 1615 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n) 1616 { 1617 uint64_t res = 0; 1618 int rb, db = 0; 1619 1620 for (rb = 0; rb < n; ++rb) { 1621 if ((mask >> rb) & 1) { 1622 res |= ((data >> db) & 1) << rb; 1623 ++db; 1624 } 1625 } 1626 return res; 1627 } 1628 1629 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit) 1630 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit) 1631 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit) 1632 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit) 1633 1634 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n) 1635 { 1636 uint64_t resm = 0, resu = 0; 1637 int db, rbm = 0, rbu = 0; 1638 1639 for (db = 0; db < n; ++db) { 1640 uint64_t val = (data >> db) & 1; 1641 if ((mask >> db) & 1) { 1642 resm |= val << rbm++; 1643 } else { 1644 resu |= val << rbu++; 1645 } 1646 } 1647 1648 return resm | (resu << rbm); 1649 } 1650 1651 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup) 1652 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup) 1653 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup) 1654 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup) 1655 1656 #undef DO_BITPERM 1657 1658 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \ 1659 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1660 { \ 1661 intptr_t i, opr_sz = simd_oprsz(desc); \ 1662 int sub_r = simd_data(desc); \ 1663 if (sub_r) { \ 1664 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1665 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1666 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1667 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1668 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1669 acc_r = ADD_OP(acc_r, el2_i); \ 1670 acc_i = SUB_OP(acc_i, el2_r); \ 1671 *(TYPE *)(vd + H(i)) = acc_r; \ 1672 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1673 } \ 1674 } else { \ 1675 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1676 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1677 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1678 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1679 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1680 acc_r = SUB_OP(acc_r, el2_i); \ 1681 acc_i = ADD_OP(acc_i, el2_r); \ 1682 *(TYPE *)(vd + H(i)) = acc_r; \ 1683 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1684 } \ 1685 } \ 1686 } 1687 1688 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB) 1689 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB) 1690 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB) 1691 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB) 1692 1693 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B) 1694 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H) 1695 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S) 1696 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d) 1697 1698 #undef DO_CADD 1699 1700 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \ 1701 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1702 { \ 1703 intptr_t i, opr_sz = simd_oprsz(desc); \ 1704 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \ 1705 int shift = simd_data(desc) >> 1; \ 1706 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1707 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \ 1708 *(TYPEW *)(vd + HW(i)) = nn << shift; \ 1709 } \ 1710 } 1711 1712 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1) 1713 DO_ZZI_SHLL(sve2_sshll_s, int32_t, 
int16_t, H1_4, H1_2) 1714 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4) 1715 1716 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1) 1717 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2) 1718 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4) 1719 1720 #undef DO_ZZI_SHLL 1721 1722 /* Two-operand reduction expander, controlled by a predicate. 1723 * The difference between TYPERED and TYPERET has to do with 1724 * sign-extension. E.g. for SMAX, TYPERED must be signed, 1725 * but TYPERET must be unsigned so that e.g. a 32-bit value 1726 * is not sign-extended to the ABI uint64_t return type. 1727 */ 1728 /* ??? If we were to vectorize this by hand the reduction ordering 1729 * would change. For integer operands, this is perfectly fine. 1730 */ 1731 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \ 1732 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1733 { \ 1734 intptr_t i, opr_sz = simd_oprsz(desc); \ 1735 TYPERED ret = INIT; \ 1736 for (i = 0; i < opr_sz; ) { \ 1737 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 1738 do { \ 1739 if (pg & 1) { \ 1740 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \ 1741 ret = OP(ret, nn); \ 1742 } \ 1743 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \ 1744 } while (i & 15); \ 1745 } \ 1746 return (TYPERET)ret; \ 1747 } 1748 1749 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \ 1750 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1751 { \ 1752 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 1753 TYPEE *n = vn; \ 1754 uint8_t *pg = vg; \ 1755 TYPER ret = INIT; \ 1756 for (i = 0; i < opr_sz; i += 1) { \ 1757 if (pg[H1(i)] & 1) { \ 1758 TYPEE nn = n[i]; \ 1759 ret = OP(ret, nn); \ 1760 } \ 1761 } \ 1762 return ret; \ 1763 } 1764 1765 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR) 1766 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR) 1767 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR) 1768 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR) 1769 1770 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR) 1771 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR) 1772 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR) 1773 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR) 1774 1775 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND) 1776 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND) 1777 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND) 1778 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND) 1779 1780 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1781 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1782 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1783 1784 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1785 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1786 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1787 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD) 1788 1789 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX) 1790 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX) 1791 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX) 1792 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX) 1793 1794 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX) 1795 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX) 1796 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX) 
1797 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX) 1798 1799 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN) 1800 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN) 1801 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN) 1802 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN) 1803 1804 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN) 1805 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN) 1806 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN) 1807 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) 1808 1809 #undef DO_VPZ 1810 #undef DO_VPZ_D 1811 1812 /* Two vector operand, one scalar operand, unpredicated. */ 1813 #define DO_ZZI(NAME, TYPE, OP) \ 1814 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ 1815 { \ 1816 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1817 TYPE s = s64, *d = vd, *n = vn; \ 1818 for (i = 0; i < opr_sz; ++i) { \ 1819 d[i] = OP(n[i], s); \ 1820 } \ 1821 } 1822 1823 #define DO_SUBR(X, Y) (Y - X) 1824 1825 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) 1826 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) 1827 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) 1828 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) 1829 1830 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) 1831 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) 1832 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) 1833 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) 1834 1835 DO_ZZI(sve_smini_b, int8_t, DO_MIN) 1836 DO_ZZI(sve_smini_h, int16_t, DO_MIN) 1837 DO_ZZI(sve_smini_s, int32_t, DO_MIN) 1838 DO_ZZI(sve_smini_d, int64_t, DO_MIN) 1839 1840 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) 1841 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) 1842 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) 1843 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) 1844 1845 DO_ZZI(sve_umini_b, uint8_t, DO_MIN) 1846 DO_ZZI(sve_umini_h, uint16_t, DO_MIN) 1847 DO_ZZI(sve_umini_s, uint32_t, DO_MIN) 1848 DO_ZZI(sve_umini_d, uint64_t, DO_MIN) 1849 1850 #undef DO_ZZI 1851 1852 #undef DO_AND 1853 #undef DO_ORR 1854 #undef DO_EOR 1855 #undef DO_BIC 1856 #undef DO_ADD 1857 #undef DO_SUB 1858 #undef DO_MAX 1859 #undef DO_MIN 1860 #undef DO_ABD 1861 #undef DO_MUL 1862 #undef DO_DIV 1863 #undef DO_ASR 1864 #undef DO_LSR 1865 #undef DO_LSL 1866 #undef DO_SUBR 1867 1868 /* Similar to the ARM LastActiveElement pseudocode function, except the 1869 result is multiplied by the element size. This includes the not found 1870 indication; e.g. not found for esz=3 is -8. */ 1871 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) 1872 { 1873 uint64_t mask = pred_esz_masks[esz]; 1874 intptr_t i = words; 1875 1876 do { 1877 uint64_t this_g = g[--i] & mask; 1878 if (this_g) { 1879 return i * 64 + (63 - clz64(this_g)); 1880 } 1881 } while (i > 0); 1882 return (intptr_t)-1 << esz; 1883 } 1884 1885 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc) 1886 { 1887 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1888 uint32_t flags = PREDTEST_INIT; 1889 uint64_t *d = vd, *g = vg; 1890 intptr_t i = 0; 1891 1892 do { 1893 uint64_t this_d = d[i]; 1894 uint64_t this_g = g[i]; 1895 1896 if (this_g) { 1897 if (!(flags & 4)) { 1898 /* Set in D the first bit of G. 
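 * The expression this_g & -this_g isolates the lowest set bit of G:
 * e.g. for this_g == 0x28 (0b101000), -this_g ends in ...011000, so the
 * AND leaves only 0x08, which is then ORed into this word of D.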
*/ 1899 this_d |= this_g & -this_g; 1900 d[i] = this_d; 1901 } 1902 flags = iter_predtest_fwd(this_d, this_g, flags); 1903 } 1904 } while (++i < words); 1905 1906 return flags; 1907 } 1908 1909 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc) 1910 { 1911 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1912 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 1913 uint32_t flags = PREDTEST_INIT; 1914 uint64_t *d = vd, *g = vg, esz_mask; 1915 intptr_t i, next; 1916 1917 next = last_active_element(vd, words, esz) + (1 << esz); 1918 esz_mask = pred_esz_masks[esz]; 1919 1920 /* Similar to the pseudocode for pnext, but scaled by ESZ 1921 so that we find the correct bit. */ 1922 if (next < words * 64) { 1923 uint64_t mask = -1; 1924 1925 if (next & 63) { 1926 mask = ~((1ull << (next & 63)) - 1); 1927 next &= -64; 1928 } 1929 do { 1930 uint64_t this_g = g[next / 64] & esz_mask & mask; 1931 if (this_g != 0) { 1932 next = (next & -64) + ctz64(this_g); 1933 break; 1934 } 1935 next += 64; 1936 mask = -1; 1937 } while (next < words * 64); 1938 } 1939 1940 i = 0; 1941 do { 1942 uint64_t this_d = 0; 1943 if (i == next / 64) { 1944 this_d = 1ull << (next & 63); 1945 } 1946 d[i] = this_d; 1947 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags); 1948 } while (++i < words); 1949 1950 return flags; 1951 } 1952 1953 /* 1954 * Copy Zn into Zd, and store zero into inactive elements. 1955 * If inv, store zeros into the active elements. 1956 */ 1957 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc) 1958 { 1959 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1960 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1961 uint64_t *d = vd, *n = vn; 1962 uint8_t *pg = vg; 1963 1964 for (i = 0; i < opr_sz; i += 1) { 1965 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv); 1966 } 1967 } 1968 1969 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc) 1970 { 1971 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1972 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1973 uint64_t *d = vd, *n = vn; 1974 uint8_t *pg = vg; 1975 1976 for (i = 0; i < opr_sz; i += 1) { 1977 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv); 1978 } 1979 } 1980 1981 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc) 1982 { 1983 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1984 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1985 uint64_t *d = vd, *n = vn; 1986 uint8_t *pg = vg; 1987 1988 for (i = 0; i < opr_sz; i += 1) { 1989 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv); 1990 } 1991 } 1992 1993 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc) 1994 { 1995 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1996 uint64_t *d = vd, *n = vn; 1997 uint8_t *pg = vg; 1998 uint8_t inv = simd_data(desc); 1999 2000 for (i = 0; i < opr_sz; i += 1) { 2001 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1); 2002 } 2003 } 2004 2005 /* Three-operand expander, immediate operand, controlled by a predicate. 2006 */ 2007 #define DO_ZPZI(NAME, TYPE, H, OP) \ 2008 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2009 { \ 2010 intptr_t i, opr_sz = simd_oprsz(desc); \ 2011 TYPE imm = simd_data(desc); \ 2012 for (i = 0; i < opr_sz; ) { \ 2013 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2014 do { \ 2015 if (pg & 1) { \ 2016 TYPE nn = *(TYPE *)(vn + H(i)); \ 2017 *(TYPE *)(vd + H(i)) = OP(nn, imm); \ 2018 } \ 2019 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2020 } while (i & 15); \ 2021 } \ 2022 } 2023 2024 /* Similarly, specialized for 64-bit operands. 
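 * Note that the predicate allocates one bit per vector byte, so the general
 * DO_ZPZI expander above consumes its 16-bit predicate chunk sizeof(TYPE)
 * bits at a time; e.g. for 32-bit elements only bits 0, 4, 8 and 12 of each
 * chunk are tested.  For 64-bit elements only bit 0 of each predicate byte
 * matters, so the specialization below simply indexes the predicate as bytes.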
*/ 2025 #define DO_ZPZI_D(NAME, TYPE, OP) \ 2026 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2027 { \ 2028 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2029 TYPE *d = vd, *n = vn; \ 2030 TYPE imm = simd_data(desc); \ 2031 uint8_t *pg = vg; \ 2032 for (i = 0; i < opr_sz; i += 1) { \ 2033 if (pg[H1(i)] & 1) { \ 2034 TYPE nn = n[i]; \ 2035 d[i] = OP(nn, imm); \ 2036 } \ 2037 } \ 2038 } 2039 2040 #define DO_SHR(N, M) (N >> M) 2041 #define DO_SHL(N, M) (N << M) 2042 2043 /* Arithmetic shift right for division. This rounds negative numbers 2044 toward zero as per signed division. Therefore before shifting, 2045 when N is negative, add 2**M-1. */ 2046 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M) 2047 2048 static inline uint64_t do_urshr(uint64_t x, unsigned sh) 2049 { 2050 if (likely(sh < 64)) { 2051 return (x >> sh) + ((x >> (sh - 1)) & 1); 2052 } else if (sh == 64) { 2053 return x >> 63; 2054 } else { 2055 return 0; 2056 } 2057 } 2058 2059 static inline int64_t do_srshr(int64_t x, unsigned sh) 2060 { 2061 if (likely(sh < 64)) { 2062 return (x >> sh) + ((x >> (sh - 1)) & 1); 2063 } else { 2064 /* Rounding the sign bit always produces 0. */ 2065 return 0; 2066 } 2067 } 2068 2069 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) 2070 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) 2071 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) 2072 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR) 2073 2074 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR) 2075 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR) 2076 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR) 2077 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR) 2078 2079 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL) 2080 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL) 2081 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL) 2082 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL) 2083 2084 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD) 2085 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD) 2086 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD) 2087 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD) 2088 2089 /* SVE2 bitwise shift by immediate */ 2090 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b) 2091 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h) 2092 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s) 2093 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d) 2094 2095 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b) 2096 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h) 2097 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s) 2098 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d) 2099 2100 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr) 2101 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr) 2102 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr) 2103 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr) 2104 2105 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr) 2106 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr) 2107 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr) 2108 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr) 2109 2110 #define do_suqrshl_b(n, m) \ 2111 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 2112 #define do_suqrshl_h(n, m) \ 2113 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 2114 #define do_suqrshl_s(n, m) \ 2115 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); }) 2116 #define do_suqrshl_d(n, m) \ 2117 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); }) 2118 2119 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b) 2120 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h) 2121 
DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s) 2122 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d) 2123 2124 #undef DO_ASRD 2125 #undef DO_ZPZI 2126 #undef DO_ZPZI_D 2127 2128 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \ 2129 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2130 { \ 2131 intptr_t i, opr_sz = simd_oprsz(desc); \ 2132 int shift = simd_data(desc); \ 2133 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2134 TYPEW nn = *(TYPEW *)(vn + i); \ 2135 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \ 2136 } \ 2137 } 2138 2139 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 2140 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2141 { \ 2142 intptr_t i, opr_sz = simd_oprsz(desc); \ 2143 int shift = simd_data(desc); \ 2144 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2145 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2146 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \ 2147 } \ 2148 } 2149 2150 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR) 2151 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR) 2152 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR) 2153 2154 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR) 2155 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR) 2156 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR) 2157 2158 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr) 2159 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr) 2160 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr) 2161 2162 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) 2163 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) 2164 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) 2165 2166 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX) 2167 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX) 2168 #define DO_SQSHRUN_D(x, sh) \ 2169 do_sat_bhs((int64_t)(x) >> (sh < 64 ? 
sh : 63), 0, UINT32_MAX) 2170 2171 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) 2172 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) 2173 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D) 2174 2175 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) 2176 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) 2177 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) 2178 2179 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX) 2180 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX) 2181 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX) 2182 2183 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) 2184 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) 2185 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D) 2186 2187 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) 2188 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) 2189 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) 2190 2191 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX) 2192 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX) 2193 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX) 2194 2195 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) 2196 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) 2197 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D) 2198 2199 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) 2200 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) 2201 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) 2202 2203 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX) 2204 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX) 2205 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX) 2206 2207 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) 2208 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) 2209 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D) 2210 2211 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H) 2212 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S) 2213 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D) 2214 2215 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX) 2216 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX) 2217 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX) 2218 2219 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H) 2220 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S) 2221 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D) 2222 2223 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H) 2224 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S) 2225 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D) 2226 2227 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX) 2228 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX) 2229 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX) 2230 2231 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H) 2232 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S) 2233 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D) 2234 2235 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H) 2236 DO_SHRNT(sve2_uqrshrnt_s, 
uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S) 2237 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D) 2238 2239 #undef DO_SHRNB 2240 #undef DO_SHRNT 2241 2242 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \ 2243 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2244 { \ 2245 intptr_t i, opr_sz = simd_oprsz(desc); \ 2246 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2247 TYPEW nn = *(TYPEW *)(vn + i); \ 2248 TYPEW mm = *(TYPEW *)(vm + i); \ 2249 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \ 2250 } \ 2251 } 2252 2253 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \ 2254 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2255 { \ 2256 intptr_t i, opr_sz = simd_oprsz(desc); \ 2257 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2258 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2259 TYPEW mm = *(TYPEW *)(vm + HW(i)); \ 2260 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \ 2261 } \ 2262 } 2263 2264 #define DO_ADDHN(N, M, SH) ((N + M) >> SH) 2265 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH) 2266 #define DO_SUBHN(N, M, SH) ((N - M) >> SH) 2267 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH) 2268 2269 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN) 2270 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN) 2271 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN) 2272 2273 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN) 2274 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN) 2275 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN) 2276 2277 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN) 2278 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN) 2279 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN) 2280 2281 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN) 2282 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN) 2283 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN) 2284 2285 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN) 2286 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN) 2287 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN) 2288 2289 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN) 2290 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN) 2291 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN) 2292 2293 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN) 2294 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN) 2295 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN) 2296 2297 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN) 2298 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN) 2299 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN) 2300 2301 #undef DO_RSUBHN 2302 #undef DO_SUBHN 2303 #undef DO_RADDHN 2304 #undef DO_ADDHN 2305 2306 #undef DO_BINOPNB 2307 2308 /* Fully general four-operand expander, controlled by a predicate. 
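 * In outline (with the predicate walk elided), an instance such as
 * DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA) below behaves as:
 *
 *     for (i = 0; i < opr_sz; i += 4) {
 *         if (element i is active) {
 *             *(uint32_t *)(vd + H1_4(i)) = *(uint32_t *)(va + H1_4(i))
 *                 + *(uint32_t *)(vn + H1_4(i)) * *(uint32_t *)(vm + H1_4(i));
 *         }
 *     }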
2309 */ 2310 #define DO_ZPZZZ(NAME, TYPE, H, OP) \ 2311 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2312 void *vg, uint32_t desc) \ 2313 { \ 2314 intptr_t i, opr_sz = simd_oprsz(desc); \ 2315 for (i = 0; i < opr_sz; ) { \ 2316 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2317 do { \ 2318 if (pg & 1) { \ 2319 TYPE nn = *(TYPE *)(vn + H(i)); \ 2320 TYPE mm = *(TYPE *)(vm + H(i)); \ 2321 TYPE aa = *(TYPE *)(va + H(i)); \ 2322 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \ 2323 } \ 2324 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2325 } while (i & 15); \ 2326 } \ 2327 } 2328 2329 /* Similarly, specialized for 64-bit operands. */ 2330 #define DO_ZPZZZ_D(NAME, TYPE, OP) \ 2331 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2332 void *vg, uint32_t desc) \ 2333 { \ 2334 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2335 TYPE *d = vd, *a = va, *n = vn, *m = vm; \ 2336 uint8_t *pg = vg; \ 2337 for (i = 0; i < opr_sz; i += 1) { \ 2338 if (pg[H1(i)] & 1) { \ 2339 TYPE aa = a[i], nn = n[i], mm = m[i]; \ 2340 d[i] = OP(aa, nn, mm); \ 2341 } \ 2342 } \ 2343 } 2344 2345 #define DO_MLA(A, N, M) (A + N * M) 2346 #define DO_MLS(A, N, M) (A - N * M) 2347 2348 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) 2349 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS) 2350 2351 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA) 2352 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS) 2353 2354 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA) 2355 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS) 2356 2357 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA) 2358 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS) 2359 2360 #undef DO_MLA 2361 #undef DO_MLS 2362 #undef DO_ZPZZZ 2363 #undef DO_ZPZZZ_D 2364 2365 void HELPER(sve_index_b)(void *vd, uint32_t start, 2366 uint32_t incr, uint32_t desc) 2367 { 2368 intptr_t i, opr_sz = simd_oprsz(desc); 2369 uint8_t *d = vd; 2370 for (i = 0; i < opr_sz; i += 1) { 2371 d[H1(i)] = start + i * incr; 2372 } 2373 } 2374 2375 void HELPER(sve_index_h)(void *vd, uint32_t start, 2376 uint32_t incr, uint32_t desc) 2377 { 2378 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2379 uint16_t *d = vd; 2380 for (i = 0; i < opr_sz; i += 1) { 2381 d[H2(i)] = start + i * incr; 2382 } 2383 } 2384 2385 void HELPER(sve_index_s)(void *vd, uint32_t start, 2386 uint32_t incr, uint32_t desc) 2387 { 2388 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2389 uint32_t *d = vd; 2390 for (i = 0; i < opr_sz; i += 1) { 2391 d[H4(i)] = start + i * incr; 2392 } 2393 } 2394 2395 void HELPER(sve_index_d)(void *vd, uint64_t start, 2396 uint64_t incr, uint32_t desc) 2397 { 2398 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2399 uint64_t *d = vd; 2400 for (i = 0; i < opr_sz; i += 1) { 2401 d[i] = start + i * incr; 2402 } 2403 } 2404 2405 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc) 2406 { 2407 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2408 uint32_t sh = simd_data(desc); 2409 uint32_t *d = vd, *n = vn, *m = vm; 2410 for (i = 0; i < opr_sz; i += 1) { 2411 d[i] = n[i] + (m[i] << sh); 2412 } 2413 } 2414 2415 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc) 2416 { 2417 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2418 uint64_t sh = simd_data(desc); 2419 uint64_t *d = vd, *n = vn, *m = vm; 2420 for (i = 0; i < opr_sz; i += 1) { 2421 d[i] = n[i] + (m[i] << sh); 2422 } 2423 } 2424 2425 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc) 2426 { 2427 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2428 uint64_t sh = simd_data(desc); 2429 uint64_t *d = vd, *n = vn, *m = vm; 2430 for (i = 0; i < opr_sz; i += 1) { 2431 d[i] = 
n[i] + ((uint64_t)(int32_t)m[i] << sh); 2432 } 2433 } 2434 2435 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc) 2436 { 2437 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2438 uint64_t sh = simd_data(desc); 2439 uint64_t *d = vd, *n = vn, *m = vm; 2440 for (i = 0; i < opr_sz; i += 1) { 2441 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh); 2442 } 2443 } 2444 2445 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc) 2446 { 2447 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2448 static const uint16_t coeff[] = { 2449 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8, 2450 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189, 2451 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295, 2452 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4, 2453 }; 2454 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2455 uint16_t *d = vd, *n = vn; 2456 2457 for (i = 0; i < opr_sz; i++) { 2458 uint16_t nn = n[i]; 2459 intptr_t idx = extract32(nn, 0, 5); 2460 uint16_t exp = extract32(nn, 5, 5); 2461 d[i] = coeff[idx] | (exp << 10); 2462 } 2463 } 2464 2465 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc) 2466 { 2467 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2468 static const uint32_t coeff[] = { 2469 0x000000, 0x0164d2, 0x02cd87, 0x043a29, 2470 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5, 2471 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc, 2472 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d, 2473 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda, 2474 0x1ef532, 0x20b051, 0x227043, 0x243516, 2475 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a, 2476 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4, 2477 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b, 2478 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd, 2479 0x45672a, 0x478d75, 0x49b9be, 0x4bec15, 2480 0x4e248c, 0x506334, 0x52a81e, 0x54f35b, 2481 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5, 2482 0x60ccdf, 0x633f89, 0x65b907, 0x68396a, 2483 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177, 2484 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c, 2485 }; 2486 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2487 uint32_t *d = vd, *n = vn; 2488 2489 for (i = 0; i < opr_sz; i++) { 2490 uint32_t nn = n[i]; 2491 intptr_t idx = extract32(nn, 0, 6); 2492 uint32_t exp = extract32(nn, 6, 8); 2493 d[i] = coeff[idx] | (exp << 23); 2494 } 2495 } 2496 2497 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc) 2498 { 2499 /* These constants are cut-and-paste directly from the ARM pseudocode. 
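 * Each entry below is the 52-bit fraction of 2^(idx/64) as a double; for
 * instance entry 32, 0x6A09E667F3BCD, is the fraction of sqrt(2) == 2^(32/64).
 * The loop pastes the exponent field of the source element on top of that
 * fraction, using the same construction as the 16-bit and 32-bit tables above.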
*/ 2500 static const uint64_t coeff[] = { 2501 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull, 2502 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull, 2503 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull, 2504 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull, 2505 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull, 2506 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull, 2507 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull, 2508 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull, 2509 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull, 2510 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull, 2511 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull, 2512 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull, 2513 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull, 2514 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull, 2515 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull, 2516 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull, 2517 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull, 2518 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull, 2519 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull, 2520 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull, 2521 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull, 2522 0xFA7C1819E90D8ull, 2523 }; 2524 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2525 uint64_t *d = vd, *n = vn; 2526 2527 for (i = 0; i < opr_sz; i++) { 2528 uint64_t nn = n[i]; 2529 intptr_t idx = extract32(nn, 0, 6); 2530 uint64_t exp = extract32(nn, 6, 11); 2531 d[i] = coeff[idx] | (exp << 52); 2532 } 2533 } 2534 2535 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc) 2536 { 2537 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2538 uint16_t *d = vd, *n = vn, *m = vm; 2539 for (i = 0; i < opr_sz; i += 1) { 2540 uint16_t nn = n[i]; 2541 uint16_t mm = m[i]; 2542 if (mm & 1) { 2543 nn = float16_one; 2544 } 2545 d[i] = nn ^ (mm & 2) << 14; 2546 } 2547 } 2548 2549 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc) 2550 { 2551 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2552 uint32_t *d = vd, *n = vn, *m = vm; 2553 for (i = 0; i < opr_sz; i += 1) { 2554 uint32_t nn = n[i]; 2555 uint32_t mm = m[i]; 2556 if (mm & 1) { 2557 nn = float32_one; 2558 } 2559 d[i] = nn ^ (mm & 2) << 30; 2560 } 2561 } 2562 2563 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc) 2564 { 2565 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2566 uint64_t *d = vd, *n = vn, *m = vm; 2567 for (i = 0; i < opr_sz; i += 1) { 2568 uint64_t nn = n[i]; 2569 uint64_t mm = m[i]; 2570 if (mm & 1) { 2571 nn = float64_one; 2572 } 2573 d[i] = nn ^ (mm & 2) << 62; 2574 } 2575 } 2576 2577 /* 2578 * Signed saturating addition with scalar operand. 
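 * Saturating addition clamps the mathematical sum to the range of the
 * element type instead of letting it wrap: e.g. for the byte helper below,
 * 100 + 32 yields INT8_MAX (127) rather than wrapping to -124.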
2579 */ 2580 2581 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2582 { 2583 intptr_t i, oprsz = simd_oprsz(desc); 2584 2585 for (i = 0; i < oprsz; i += sizeof(int8_t)) { 2586 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i)); 2587 } 2588 } 2589 2590 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2591 { 2592 intptr_t i, oprsz = simd_oprsz(desc); 2593 2594 for (i = 0; i < oprsz; i += sizeof(int16_t)) { 2595 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i)); 2596 } 2597 } 2598 2599 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2600 { 2601 intptr_t i, oprsz = simd_oprsz(desc); 2602 2603 for (i = 0; i < oprsz; i += sizeof(int32_t)) { 2604 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i)); 2605 } 2606 } 2607 2608 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc) 2609 { 2610 intptr_t i, oprsz = simd_oprsz(desc); 2611 2612 for (i = 0; i < oprsz; i += sizeof(int64_t)) { 2613 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i)); 2614 } 2615 } 2616 2617 /* 2618 * Unsigned saturating addition with scalar operand. 2619 */ 2620 2621 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2622 { 2623 intptr_t i, oprsz = simd_oprsz(desc); 2624 2625 for (i = 0; i < oprsz; i += sizeof(uint8_t)) { 2626 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i)); 2627 } 2628 } 2629 2630 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2631 { 2632 intptr_t i, oprsz = simd_oprsz(desc); 2633 2634 for (i = 0; i < oprsz; i += sizeof(uint16_t)) { 2635 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i)); 2636 } 2637 } 2638 2639 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2640 { 2641 intptr_t i, oprsz = simd_oprsz(desc); 2642 2643 for (i = 0; i < oprsz; i += sizeof(uint32_t)) { 2644 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i)); 2645 } 2646 } 2647 2648 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2649 { 2650 intptr_t i, oprsz = simd_oprsz(desc); 2651 2652 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2653 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i)); 2654 } 2655 } 2656 2657 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2658 { 2659 intptr_t i, oprsz = simd_oprsz(desc); 2660 2661 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2662 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b); 2663 } 2664 } 2665 2666 /* Two operand predicated copy immediate with merge. All valid immediates 2667 * can fit within 17 signed bits in the simd_data field. 
2668 */ 2669 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg, 2670 uint64_t mm, uint32_t desc) 2671 { 2672 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2673 uint64_t *d = vd, *n = vn; 2674 uint8_t *pg = vg; 2675 2676 mm = dup_const(MO_8, mm); 2677 for (i = 0; i < opr_sz; i += 1) { 2678 uint64_t nn = n[i]; 2679 uint64_t pp = expand_pred_b(pg[H1(i)]); 2680 d[i] = (mm & pp) | (nn & ~pp); 2681 } 2682 } 2683 2684 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg, 2685 uint64_t mm, uint32_t desc) 2686 { 2687 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2688 uint64_t *d = vd, *n = vn; 2689 uint8_t *pg = vg; 2690 2691 mm = dup_const(MO_16, mm); 2692 for (i = 0; i < opr_sz; i += 1) { 2693 uint64_t nn = n[i]; 2694 uint64_t pp = expand_pred_h(pg[H1(i)]); 2695 d[i] = (mm & pp) | (nn & ~pp); 2696 } 2697 } 2698 2699 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg, 2700 uint64_t mm, uint32_t desc) 2701 { 2702 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2703 uint64_t *d = vd, *n = vn; 2704 uint8_t *pg = vg; 2705 2706 mm = dup_const(MO_32, mm); 2707 for (i = 0; i < opr_sz; i += 1) { 2708 uint64_t nn = n[i]; 2709 uint64_t pp = expand_pred_s(pg[H1(i)]); 2710 d[i] = (mm & pp) | (nn & ~pp); 2711 } 2712 } 2713 2714 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg, 2715 uint64_t mm, uint32_t desc) 2716 { 2717 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2718 uint64_t *d = vd, *n = vn; 2719 uint8_t *pg = vg; 2720 2721 for (i = 0; i < opr_sz; i += 1) { 2722 uint64_t nn = n[i]; 2723 d[i] = (pg[H1(i)] & 1 ? mm : nn); 2724 } 2725 } 2726 2727 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc) 2728 { 2729 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2730 uint64_t *d = vd; 2731 uint8_t *pg = vg; 2732 2733 val = dup_const(MO_8, val); 2734 for (i = 0; i < opr_sz; i += 1) { 2735 d[i] = val & expand_pred_b(pg[H1(i)]); 2736 } 2737 } 2738 2739 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc) 2740 { 2741 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2742 uint64_t *d = vd; 2743 uint8_t *pg = vg; 2744 2745 val = dup_const(MO_16, val); 2746 for (i = 0; i < opr_sz; i += 1) { 2747 d[i] = val & expand_pred_h(pg[H1(i)]); 2748 } 2749 } 2750 2751 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc) 2752 { 2753 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2754 uint64_t *d = vd; 2755 uint8_t *pg = vg; 2756 2757 val = dup_const(MO_32, val); 2758 for (i = 0; i < opr_sz; i += 1) { 2759 d[i] = val & expand_pred_s(pg[H1(i)]); 2760 } 2761 } 2762 2763 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc) 2764 { 2765 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2766 uint64_t *d = vd; 2767 uint8_t *pg = vg; 2768 2769 for (i = 0; i < opr_sz; i += 1) { 2770 d[i] = (pg[H1(i)] & 1 ? val : 0); 2771 } 2772 } 2773 2774 /* Big-endian hosts need to frob the byte indices. If the copy 2775 * happens to be 8-byte aligned, then no frobbing necessary. 
2776 */ 2777 static void swap_memmove(void *vd, void *vs, size_t n) 2778 { 2779 uintptr_t d = (uintptr_t)vd; 2780 uintptr_t s = (uintptr_t)vs; 2781 uintptr_t o = (d | s | n) & 7; 2782 size_t i; 2783 2784 #if !HOST_BIG_ENDIAN 2785 o = 0; 2786 #endif 2787 switch (o) { 2788 case 0: 2789 memmove(vd, vs, n); 2790 break; 2791 2792 case 4: 2793 if (d < s || d >= s + n) { 2794 for (i = 0; i < n; i += 4) { 2795 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2796 } 2797 } else { 2798 for (i = n; i > 0; ) { 2799 i -= 4; 2800 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2801 } 2802 } 2803 break; 2804 2805 case 2: 2806 case 6: 2807 if (d < s || d >= s + n) { 2808 for (i = 0; i < n; i += 2) { 2809 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2810 } 2811 } else { 2812 for (i = n; i > 0; ) { 2813 i -= 2; 2814 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2815 } 2816 } 2817 break; 2818 2819 default: 2820 if (d < s || d >= s + n) { 2821 for (i = 0; i < n; i++) { 2822 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2823 } 2824 } else { 2825 for (i = n; i > 0; ) { 2826 i -= 1; 2827 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2828 } 2829 } 2830 break; 2831 } 2832 } 2833 2834 /* Similarly for memset of 0. */ 2835 static void swap_memzero(void *vd, size_t n) 2836 { 2837 uintptr_t d = (uintptr_t)vd; 2838 uintptr_t o = (d | n) & 7; 2839 size_t i; 2840 2841 /* Usually, the first bit of a predicate is set, so N is 0. */ 2842 if (likely(n == 0)) { 2843 return; 2844 } 2845 2846 #if !HOST_BIG_ENDIAN 2847 o = 0; 2848 #endif 2849 switch (o) { 2850 case 0: 2851 memset(vd, 0, n); 2852 break; 2853 2854 case 4: 2855 for (i = 0; i < n; i += 4) { 2856 *(uint32_t *)H1_4(d + i) = 0; 2857 } 2858 break; 2859 2860 case 2: 2861 case 6: 2862 for (i = 0; i < n; i += 2) { 2863 *(uint16_t *)H1_2(d + i) = 0; 2864 } 2865 break; 2866 2867 default: 2868 for (i = 0; i < n; i++) { 2869 *(uint8_t *)H1(d + i) = 0; 2870 } 2871 break; 2872 } 2873 } 2874 2875 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) 2876 { 2877 intptr_t opr_sz = simd_oprsz(desc); 2878 size_t n_ofs = simd_data(desc); 2879 size_t n_siz = opr_sz - n_ofs; 2880 2881 if (vd != vm) { 2882 swap_memmove(vd, vn + n_ofs, n_siz); 2883 swap_memmove(vd + n_siz, vm, n_ofs); 2884 } else if (vd != vn) { 2885 swap_memmove(vd + n_siz, vd, n_ofs); 2886 swap_memmove(vd, vn + n_ofs, n_siz); 2887 } else { 2888 /* vd == vn == vm. Need temp space. 
*/ 2889 ARMVectorReg tmp; 2890 swap_memmove(&tmp, vm, n_ofs); 2891 swap_memmove(vd, vd + n_ofs, n_siz); 2892 memcpy(vd + n_siz, &tmp, n_ofs); 2893 } 2894 } 2895 2896 #define DO_INSR(NAME, TYPE, H) \ 2897 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ 2898 { \ 2899 intptr_t opr_sz = simd_oprsz(desc); \ 2900 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ 2901 *(TYPE *)(vd + H(0)) = val; \ 2902 } 2903 2904 DO_INSR(sve_insr_b, uint8_t, H1) 2905 DO_INSR(sve_insr_h, uint16_t, H1_2) 2906 DO_INSR(sve_insr_s, uint32_t, H1_4) 2907 DO_INSR(sve_insr_d, uint64_t, H1_8) 2908 2909 #undef DO_INSR 2910 2911 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) 2912 { 2913 intptr_t i, j, opr_sz = simd_oprsz(desc); 2914 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2915 uint64_t f = *(uint64_t *)(vn + i); 2916 uint64_t b = *(uint64_t *)(vn + j); 2917 *(uint64_t *)(vd + i) = bswap64(b); 2918 *(uint64_t *)(vd + j) = bswap64(f); 2919 } 2920 } 2921 2922 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) 2923 { 2924 intptr_t i, j, opr_sz = simd_oprsz(desc); 2925 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2926 uint64_t f = *(uint64_t *)(vn + i); 2927 uint64_t b = *(uint64_t *)(vn + j); 2928 *(uint64_t *)(vd + i) = hswap64(b); 2929 *(uint64_t *)(vd + j) = hswap64(f); 2930 } 2931 } 2932 2933 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) 2934 { 2935 intptr_t i, j, opr_sz = simd_oprsz(desc); 2936 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2937 uint64_t f = *(uint64_t *)(vn + i); 2938 uint64_t b = *(uint64_t *)(vn + j); 2939 *(uint64_t *)(vd + i) = rol64(b, 32); 2940 *(uint64_t *)(vd + j) = rol64(f, 32); 2941 } 2942 } 2943 2944 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) 2945 { 2946 intptr_t i, j, opr_sz = simd_oprsz(desc); 2947 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2948 uint64_t f = *(uint64_t *)(vn + i); 2949 uint64_t b = *(uint64_t *)(vn + j); 2950 *(uint64_t *)(vd + i) = b; 2951 *(uint64_t *)(vd + j) = f; 2952 } 2953 } 2954 2955 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); 2956 2957 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, 2958 bool is_tbx, tb_impl_fn *fn) 2959 { 2960 ARMVectorReg scratch; 2961 uintptr_t oprsz = simd_oprsz(desc); 2962 2963 if (unlikely(vd == vn)) { 2964 vn = memcpy(&scratch, vn, oprsz); 2965 } 2966 2967 fn(vd, vn, NULL, vm, oprsz, is_tbx); 2968 } 2969 2970 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm, 2971 uint32_t desc, bool is_tbx, tb_impl_fn *fn) 2972 { 2973 ARMVectorReg scratch; 2974 uintptr_t oprsz = simd_oprsz(desc); 2975 2976 if (unlikely(vd == vn0)) { 2977 vn0 = memcpy(&scratch, vn0, oprsz); 2978 if (vd == vn1) { 2979 vn1 = vn0; 2980 } 2981 } else if (unlikely(vd == vn1)) { 2982 vn1 = memcpy(&scratch, vn1, oprsz); 2983 } 2984 2985 fn(vd, vn0, vn1, vm, oprsz, is_tbx); 2986 } 2987 2988 #define DO_TB(SUFF, TYPE, H) \ 2989 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \ 2990 void *vm, uintptr_t oprsz, bool is_tbx) \ 2991 { \ 2992 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \ 2993 uintptr_t i, nelem = oprsz / sizeof(TYPE); \ 2994 for (i = 0; i < nelem; ++i) { \ 2995 TYPE index = indexes[H1(i)], val = 0; \ 2996 if (index < nelem) { \ 2997 val = tbl0[H(index)]; \ 2998 } else { \ 2999 index -= nelem; \ 3000 if (tbl1 && index < nelem) { \ 3001 val = tbl1[H(index)]; \ 3002 } else if (is_tbx) { \ 3003 continue; \ 3004 } \ 3005 } \ 3006 
d[H(i)] = val; \ 3007 } \ 3008 } \ 3009 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3010 { \ 3011 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \ 3012 } \ 3013 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \ 3014 void *vm, uint32_t desc) \ 3015 { \ 3016 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \ 3017 } \ 3018 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3019 { \ 3020 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \ 3021 } 3022 3023 DO_TB(b, uint8_t, H1) 3024 DO_TB(h, uint16_t, H2) 3025 DO_TB(s, uint32_t, H4) 3026 DO_TB(d, uint64_t, H8) 3027 3028 #undef DO_TB 3029 3030 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ 3031 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 3032 { \ 3033 intptr_t i, opr_sz = simd_oprsz(desc); \ 3034 TYPED *d = vd; \ 3035 TYPES *n = vn; \ 3036 ARMVectorReg tmp; \ 3037 if (unlikely(vn - vd < opr_sz)) { \ 3038 n = memcpy(&tmp, n, opr_sz / 2); \ 3039 } \ 3040 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ 3041 d[HD(i)] = n[HS(i)]; \ 3042 } \ 3043 } 3044 3045 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) 3046 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) 3047 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4) 3048 3049 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) 3050 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) 3051 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4) 3052 3053 #undef DO_UNPK 3054 3055 /* Mask of bits included in the even numbered predicates of width esz. 3056 * We also use this for expand_bits/compress_bits, and so extend the 3057 * same pattern out to 16-bit units. 3058 */ 3059 static const uint64_t even_bit_esz_masks[5] = { 3060 0x5555555555555555ull, 3061 0x3333333333333333ull, 3062 0x0f0f0f0f0f0f0f0full, 3063 0x00ff00ff00ff00ffull, 3064 0x0000ffff0000ffffull, 3065 }; 3066 3067 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits. 3068 * For N==0, this corresponds to the operation that in qemu/bitops.h 3069 * we call half_shuffle64; this algorithm is from Hacker's Delight, 3070 * section 7-2 Shuffling Bits. 3071 */ 3072 static uint64_t expand_bits(uint64_t x, int n) 3073 { 3074 int i; 3075 3076 x &= 0xffffffffu; 3077 for (i = 4; i >= n; i--) { 3078 int sh = 1 << i; 3079 x = ((x << sh) | x) & even_bit_esz_masks[i]; 3080 } 3081 return x; 3082 } 3083 3084 /* Compress units of 2**(N+1) bits to units of 2**N bits. 3085 * For N==0, this corresponds to the operation that in qemu/bitops.h 3086 * we call half_unshuffle64; this algorithm is from Hacker's Delight, 3087 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. 
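 * A small worked example for N == 0:
 *     expand_bits(0x0b, 0)   == 0x45    (0b1011 -> 0b01000101)
 *     compress_bits(0x45, 0) == 0x0b    (the inverse of the above)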
3088 */ 3089 static uint64_t compress_bits(uint64_t x, int n) 3090 { 3091 int i; 3092 3093 for (i = n; i <= 4; i++) { 3094 int sh = 1 << i; 3095 x &= even_bit_esz_masks[i]; 3096 x = (x >> sh) | x; 3097 } 3098 return x & 0xffffffffu; 3099 } 3100 3101 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3102 { 3103 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3104 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3105 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3106 int esize = 1 << esz; 3107 uint64_t *d = vd; 3108 intptr_t i; 3109 3110 if (oprsz <= 8) { 3111 uint64_t nn = *(uint64_t *)vn; 3112 uint64_t mm = *(uint64_t *)vm; 3113 int half = 4 * oprsz; 3114 3115 nn = extract64(nn, high * half, half); 3116 mm = extract64(mm, high * half, half); 3117 nn = expand_bits(nn, esz); 3118 mm = expand_bits(mm, esz); 3119 d[0] = nn | (mm << esize); 3120 } else { 3121 ARMPredicateReg tmp; 3122 3123 /* We produce output faster than we consume input. 3124 Therefore we must be mindful of possible overlap. */ 3125 if (vd == vn) { 3126 vn = memcpy(&tmp, vn, oprsz); 3127 if (vd == vm) { 3128 vm = vn; 3129 } 3130 } else if (vd == vm) { 3131 vm = memcpy(&tmp, vm, oprsz); 3132 } 3133 if (high) { 3134 high = oprsz >> 1; 3135 } 3136 3137 if ((oprsz & 7) == 0) { 3138 uint32_t *n = vn, *m = vm; 3139 high >>= 2; 3140 3141 for (i = 0; i < oprsz / 8; i++) { 3142 uint64_t nn = n[H4(high + i)]; 3143 uint64_t mm = m[H4(high + i)]; 3144 3145 nn = expand_bits(nn, esz); 3146 mm = expand_bits(mm, esz); 3147 d[i] = nn | (mm << esize); 3148 } 3149 } else { 3150 uint8_t *n = vn, *m = vm; 3151 uint16_t *d16 = vd; 3152 3153 for (i = 0; i < oprsz / 2; i++) { 3154 uint16_t nn = n[H1(high + i)]; 3155 uint16_t mm = m[H1(high + i)]; 3156 3157 nn = expand_bits(nn, esz); 3158 mm = expand_bits(mm, esz); 3159 d16[H2(i)] = nn | (mm << esize); 3160 } 3161 } 3162 } 3163 } 3164 3165 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3166 { 3167 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3168 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3169 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz; 3170 uint64_t *d = vd, *n = vn, *m = vm; 3171 uint64_t l, h; 3172 intptr_t i; 3173 3174 if (oprsz <= 8) { 3175 l = compress_bits(n[0] >> odd, esz); 3176 h = compress_bits(m[0] >> odd, esz); 3177 d[0] = l | (h << (4 * oprsz)); 3178 } else { 3179 ARMPredicateReg tmp_m; 3180 intptr_t oprsz_16 = oprsz / 16; 3181 3182 if ((vm - vd) < (uintptr_t)oprsz) { 3183 m = memcpy(&tmp_m, vm, oprsz); 3184 } 3185 3186 for (i = 0; i < oprsz_16; i++) { 3187 l = n[2 * i + 0]; 3188 h = n[2 * i + 1]; 3189 l = compress_bits(l >> odd, esz); 3190 h = compress_bits(h >> odd, esz); 3191 d[i] = l | (h << 32); 3192 } 3193 3194 /* 3195 * For VL which is not a multiple of 512, the results from M do not 3196 * align nicely with the uint64_t for D. Put the aligned results 3197 * from M into TMP_M and then copy it into place afterward. 
3198 */ 3199 if (oprsz & 15) { 3200 int final_shift = (oprsz & 15) * 2; 3201 3202 l = n[2 * i + 0]; 3203 h = n[2 * i + 1]; 3204 l = compress_bits(l >> odd, esz); 3205 h = compress_bits(h >> odd, esz); 3206 d[i] = l | (h << final_shift); 3207 3208 for (i = 0; i < oprsz_16; i++) { 3209 l = m[2 * i + 0]; 3210 h = m[2 * i + 1]; 3211 l = compress_bits(l >> odd, esz); 3212 h = compress_bits(h >> odd, esz); 3213 tmp_m.p[i] = l | (h << 32); 3214 } 3215 l = m[2 * i + 0]; 3216 h = m[2 * i + 1]; 3217 l = compress_bits(l >> odd, esz); 3218 h = compress_bits(h >> odd, esz); 3219 tmp_m.p[i] = l | (h << final_shift); 3220 3221 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); 3222 } else { 3223 for (i = 0; i < oprsz_16; i++) { 3224 l = m[2 * i + 0]; 3225 h = m[2 * i + 1]; 3226 l = compress_bits(l >> odd, esz); 3227 h = compress_bits(h >> odd, esz); 3228 d[oprsz_16 + i] = l | (h << 32); 3229 } 3230 } 3231 } 3232 } 3233 3234 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3235 { 3236 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3237 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3238 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA); 3239 uint64_t *d = vd, *n = vn, *m = vm; 3240 uint64_t mask; 3241 int shr, shl; 3242 intptr_t i; 3243 3244 shl = 1 << esz; 3245 shr = 0; 3246 mask = even_bit_esz_masks[esz]; 3247 if (odd) { 3248 mask <<= shl; 3249 shr = shl; 3250 shl = 0; 3251 } 3252 3253 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { 3254 uint64_t nn = (n[i] & mask) >> shr; 3255 uint64_t mm = (m[i] & mask) << shl; 3256 d[i] = nn + mm; 3257 } 3258 } 3259 3260 /* Reverse units of 2**N bits. */ 3261 static uint64_t reverse_bits_64(uint64_t x, int n) 3262 { 3263 int i, sh; 3264 3265 x = bswap64(x); 3266 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3267 uint64_t mask = even_bit_esz_masks[i]; 3268 x = ((x & mask) << sh) | ((x >> sh) & mask); 3269 } 3270 return x; 3271 } 3272 3273 static uint8_t reverse_bits_8(uint8_t x, int n) 3274 { 3275 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; 3276 int i, sh; 3277 3278 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3279 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); 3280 } 3281 return x; 3282 } 3283 3284 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) 3285 { 3286 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3287 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3288 intptr_t i, oprsz_2 = oprsz / 2; 3289 3290 if (oprsz <= 8) { 3291 uint64_t l = *(uint64_t *)vn; 3292 l = reverse_bits_64(l << (64 - 8 * oprsz), esz); 3293 *(uint64_t *)vd = l; 3294 } else if ((oprsz & 15) == 0) { 3295 for (i = 0; i < oprsz_2; i += 8) { 3296 intptr_t ih = oprsz - 8 - i; 3297 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); 3298 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); 3299 *(uint64_t *)(vd + i) = h; 3300 *(uint64_t *)(vd + ih) = l; 3301 } 3302 } else { 3303 for (i = 0; i < oprsz_2; i += 1) { 3304 intptr_t il = H1(i); 3305 intptr_t ih = H1(oprsz - 1 - i); 3306 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); 3307 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); 3308 *(uint8_t *)(vd + il) = h; 3309 *(uint8_t *)(vd + ih) = l; 3310 } 3311 } 3312 } 3313 3314 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) 3315 { 3316 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3317 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3318 uint64_t *d = vd; 3319 intptr_t i; 3320 3321 if (oprsz <= 8) { 3322 uint64_t nn = *(uint64_t *)vn; 3323 int half = 4 * oprsz; 3324 3325 nn = 
extract64(nn, high * half, half); 3326 nn = expand_bits(nn, 0); 3327 d[0] = nn; 3328 } else { 3329 ARMPredicateReg tmp_n; 3330 3331 /* We produce output faster than we consume input. 3332 Therefore we must be mindful of possible overlap. */ 3333 if ((vn - vd) < (uintptr_t)oprsz) { 3334 vn = memcpy(&tmp_n, vn, oprsz); 3335 } 3336 if (high) { 3337 high = oprsz >> 1; 3338 } 3339 3340 if ((oprsz & 7) == 0) { 3341 uint32_t *n = vn; 3342 high >>= 2; 3343 3344 for (i = 0; i < oprsz / 8; i++) { 3345 uint64_t nn = n[H4(high + i)]; 3346 d[i] = expand_bits(nn, 0); 3347 } 3348 } else { 3349 uint16_t *d16 = vd; 3350 uint8_t *n = vn; 3351 3352 for (i = 0; i < oprsz / 2; i++) { 3353 uint16_t nn = n[H1(high + i)]; 3354 d16[H2(i)] = expand_bits(nn, 0); 3355 } 3356 } 3357 } 3358 } 3359 3360 #define DO_ZIP(NAME, TYPE, H) \ 3361 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3362 { \ 3363 intptr_t oprsz = simd_oprsz(desc); \ 3364 intptr_t odd_ofs = simd_data(desc); \ 3365 intptr_t i, oprsz_2 = oprsz / 2; \ 3366 ARMVectorReg tmp_n, tmp_m; \ 3367 /* We produce output faster than we consume input. \ 3368 Therefore we must be mindful of possible overlap. */ \ 3369 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ 3370 vn = memcpy(&tmp_n, vn, oprsz); \ 3371 } \ 3372 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3373 vm = memcpy(&tmp_m, vm, oprsz); \ 3374 } \ 3375 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ 3376 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \ 3377 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \ 3378 *(TYPE *)(vm + odd_ofs + H(i)); \ 3379 } \ 3380 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3381 memset(vd + oprsz - 16, 0, 16); \ 3382 } \ 3383 } 3384 3385 DO_ZIP(sve_zip_b, uint8_t, H1) 3386 DO_ZIP(sve_zip_h, uint16_t, H1_2) 3387 DO_ZIP(sve_zip_s, uint32_t, H1_4) 3388 DO_ZIP(sve_zip_d, uint64_t, H1_8) 3389 DO_ZIP(sve2_zip_q, Int128, ) 3390 3391 #define DO_UZP(NAME, TYPE, H) \ 3392 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3393 { \ 3394 intptr_t oprsz = simd_oprsz(desc); \ 3395 intptr_t odd_ofs = simd_data(desc); \ 3396 intptr_t i, p; \ 3397 ARMVectorReg tmp_m; \ 3398 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3399 vm = memcpy(&tmp_m, vm, oprsz); \ 3400 } \ 3401 i = 0, p = odd_ofs; \ 3402 do { \ 3403 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \ 3404 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3405 } while (p < oprsz); \ 3406 p -= oprsz; \ 3407 do { \ 3408 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \ 3409 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3410 } while (p < oprsz); \ 3411 tcg_debug_assert(i == oprsz); \ 3412 } 3413 3414 DO_UZP(sve_uzp_b, uint8_t, H1) 3415 DO_UZP(sve_uzp_h, uint16_t, H1_2) 3416 DO_UZP(sve_uzp_s, uint32_t, H1_4) 3417 DO_UZP(sve_uzp_d, uint64_t, H1_8) 3418 DO_UZP(sve2_uzp_q, Int128, ) 3419 3420 #define DO_TRN(NAME, TYPE, H) \ 3421 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3422 { \ 3423 intptr_t oprsz = simd_oprsz(desc); \ 3424 intptr_t odd_ofs = simd_data(desc); \ 3425 intptr_t i; \ 3426 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ 3427 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ 3428 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ 3429 *(TYPE *)(vd + H(i + 0)) = ae; \ 3430 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ 3431 } \ 3432 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3433 memset(vd + oprsz - 16, 0, 16); \ 3434 } \ 3435 } 3436 3437 DO_TRN(sve_trn_b, uint8_t, H1) 3438 DO_TRN(sve_trn_h, uint16_t, H1_2) 3439 DO_TRN(sve_trn_s, uint32_t, H1_4) 3440 DO_TRN(sve_trn_d, 
uint64_t, H1_8) 3441 DO_TRN(sve2_trn_q, Int128, ) 3442 3443 #undef DO_ZIP 3444 #undef DO_UZP 3445 #undef DO_TRN 3446 3447 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) 3448 { 3449 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; 3450 uint32_t *d = vd, *n = vn; 3451 uint8_t *pg = vg; 3452 3453 for (i = j = 0; i < opr_sz; i++) { 3454 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { 3455 d[H4(j)] = n[H4(i)]; 3456 j++; 3457 } 3458 } 3459 for (; j < opr_sz; j++) { 3460 d[H4(j)] = 0; 3461 } 3462 } 3463 3464 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) 3465 { 3466 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; 3467 uint64_t *d = vd, *n = vn; 3468 uint8_t *pg = vg; 3469 3470 for (i = j = 0; i < opr_sz; i++) { 3471 if (pg[H1(i)] & 1) { 3472 d[j] = n[i]; 3473 j++; 3474 } 3475 } 3476 for (; j < opr_sz; j++) { 3477 d[j] = 0; 3478 } 3479 } 3480 3481 /* Similar to the ARM LastActiveElement pseudocode function, except the 3482 * result is multiplied by the element size. This includes the not found 3483 * indication; e.g. not found for esz=3 is -8. 3484 */ 3485 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) 3486 { 3487 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 3488 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3489 3490 return last_active_element(vg, words, esz); 3491 } 3492 3493 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) 3494 { 3495 intptr_t opr_sz = simd_oprsz(desc) / 8; 3496 int esz = simd_data(desc); 3497 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; 3498 intptr_t i, first_i, last_i; 3499 ARMVectorReg tmp; 3500 3501 first_i = last_i = 0; 3502 first_g = last_g = 0; 3503 3504 /* Find the extent of the active elements within VG. 
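     * The loop below walks the predicate words from the highest byte
     * offset downward: the first word seen with any active bit supplies
     * last_g/last_i, and whichever active word survives to the end of
     * the scan supplies first_g/first_i.  The per-esz mask keeps only
     * the least significant predicate bit of each element.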
*/ 3505 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { 3506 pg = *(uint64_t *)(vg + i) & mask; 3507 if (pg) { 3508 if (last_g == 0) { 3509 last_g = pg; 3510 last_i = i; 3511 } 3512 first_g = pg; 3513 first_i = i; 3514 } 3515 } 3516 3517 len = 0; 3518 if (first_g != 0) { 3519 first_i = first_i * 8 + ctz64(first_g); 3520 last_i = last_i * 8 + 63 - clz64(last_g); 3521 len = last_i - first_i + (1 << esz); 3522 if (vd == vm) { 3523 vm = memcpy(&tmp, vm, opr_sz * 8); 3524 } 3525 swap_memmove(vd, vn + first_i, len); 3526 } 3527 swap_memmove(vd + len, vm, opr_sz * 8 - len); 3528 } 3529 3530 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, 3531 void *vg, uint32_t desc) 3532 { 3533 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3534 uint64_t *d = vd, *n = vn, *m = vm; 3535 uint8_t *pg = vg; 3536 3537 for (i = 0; i < opr_sz; i += 1) { 3538 uint64_t nn = n[i], mm = m[i]; 3539 uint64_t pp = expand_pred_b(pg[H1(i)]); 3540 d[i] = (nn & pp) | (mm & ~pp); 3541 } 3542 } 3543 3544 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, 3545 void *vg, uint32_t desc) 3546 { 3547 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3548 uint64_t *d = vd, *n = vn, *m = vm; 3549 uint8_t *pg = vg; 3550 3551 for (i = 0; i < opr_sz; i += 1) { 3552 uint64_t nn = n[i], mm = m[i]; 3553 uint64_t pp = expand_pred_h(pg[H1(i)]); 3554 d[i] = (nn & pp) | (mm & ~pp); 3555 } 3556 } 3557 3558 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, 3559 void *vg, uint32_t desc) 3560 { 3561 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3562 uint64_t *d = vd, *n = vn, *m = vm; 3563 uint8_t *pg = vg; 3564 3565 for (i = 0; i < opr_sz; i += 1) { 3566 uint64_t nn = n[i], mm = m[i]; 3567 uint64_t pp = expand_pred_s(pg[H1(i)]); 3568 d[i] = (nn & pp) | (mm & ~pp); 3569 } 3570 } 3571 3572 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, 3573 void *vg, uint32_t desc) 3574 { 3575 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3576 uint64_t *d = vd, *n = vn, *m = vm; 3577 uint8_t *pg = vg; 3578 3579 for (i = 0; i < opr_sz; i += 1) { 3580 uint64_t nn = n[i], mm = m[i]; 3581 d[i] = (pg[H1(i)] & 1 ? nn : mm); 3582 } 3583 } 3584 3585 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm, 3586 void *vg, uint32_t desc) 3587 { 3588 intptr_t i, opr_sz = simd_oprsz(desc) / 16; 3589 Int128 *d = vd, *n = vn, *m = vm; 3590 uint16_t *pg = vg; 3591 3592 for (i = 0; i < opr_sz; i += 1) { 3593 d[i] = (pg[H2(i)] & 1 ? n : m)[i]; 3594 } 3595 } 3596 3597 /* Two operand comparison controlled by a predicate. 3598 * ??? It is very tempting to want to be able to expand this inline 3599 * with x86 instructions, e.g. 3600 * 3601 * vcmpeqw zm, zn, %ymm0 3602 * vpmovmskb %ymm0, %eax 3603 * and $0x5555, %eax 3604 * and pg, %eax 3605 * 3606 * or even aarch64, e.g. 3607 * 3608 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 3609 * cmeq v0.8h, zn, zm 3610 * and v0.8h, v0.8h, mask 3611 * addv h0, v0.8h 3612 * and v0.8b, pg 3613 * 3614 * However, coming up with an abstraction that allows vector inputs and 3615 * a scalar output, and also handles the byte-ordering of sub-uint64_t 3616 * scalar outputs, is tricky. 
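 *
 * The portable expansion below instead works on 64 predicate bits at a
 * time, iterating backward from the end of the vector.  Each element's
 * boolean result is shifted into OUT at the bit position of that
 * element's least significant predicate bit; OUT is then ANDed with the
 * governing predicate word (restricted by MASK to one bit per element),
 * stored to the destination, and folded into the NZCV flags with
 * iter_predtest_bwd.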
3617 */ 3618 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ 3619 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3620 { \ 3621 intptr_t opr_sz = simd_oprsz(desc); \ 3622 uint32_t flags = PREDTEST_INIT; \ 3623 intptr_t i = opr_sz; \ 3624 do { \ 3625 uint64_t out = 0, pg; \ 3626 do { \ 3627 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3628 TYPE nn = *(TYPE *)(vn + H(i)); \ 3629 TYPE mm = *(TYPE *)(vm + H(i)); \ 3630 out |= nn OP mm; \ 3631 } while (i & 63); \ 3632 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3633 out &= pg; \ 3634 *(uint64_t *)(vd + (i >> 3)) = out; \ 3635 flags = iter_predtest_bwd(out, pg, flags); \ 3636 } while (i > 0); \ 3637 return flags; \ 3638 } 3639 3640 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ 3641 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3642 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ 3643 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3644 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ 3645 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3646 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ 3647 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3648 3649 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) 3650 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) 3651 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) 3652 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) 3653 3654 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) 3655 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) 3656 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) 3657 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) 3658 3659 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) 3660 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) 3661 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) 3662 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) 3663 3664 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) 3665 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) 3666 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) 3667 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) 3668 3669 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) 3670 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) 3671 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) 3672 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) 3673 3674 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) 3675 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) 3676 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) 3677 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) 3678 3679 #undef DO_CMP_PPZZ_B 3680 #undef DO_CMP_PPZZ_H 3681 #undef DO_CMP_PPZZ_S 3682 #undef DO_CMP_PPZZ_D 3683 #undef DO_CMP_PPZZ 3684 3685 /* Similar, but the second source is "wide". 
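 * Each 64-bit element of the second operand is compared against every
 * narrow element of the first operand within the same 64-bit chunk:
 * the single wide MM value is loaded once and reused by the inner
 * `while (i & 7)` loop.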
*/ 3686 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ 3687 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3688 { \ 3689 intptr_t opr_sz = simd_oprsz(desc); \ 3690 uint32_t flags = PREDTEST_INIT; \ 3691 intptr_t i = opr_sz; \ 3692 do { \ 3693 uint64_t out = 0, pg; \ 3694 do { \ 3695 TYPEW mm = *(TYPEW *)(vm + i - 8); \ 3696 do { \ 3697 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3698 TYPE nn = *(TYPE *)(vn + H(i)); \ 3699 out |= nn OP mm; \ 3700 } while (i & 7); \ 3701 } while (i & 63); \ 3702 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3703 out &= pg; \ 3704 *(uint64_t *)(vd + (i >> 3)) = out; \ 3705 flags = iter_predtest_bwd(out, pg, flags); \ 3706 } while (i > 0); \ 3707 return flags; \ 3708 } 3709 3710 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ 3711 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) 3712 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ 3713 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) 3714 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ 3715 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) 3716 3717 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==) 3718 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==) 3719 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==) 3720 3721 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=) 3722 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=) 3723 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=) 3724 3725 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) 3726 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) 3727 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) 3728 3729 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) 3730 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) 3731 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) 3732 3733 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) 3734 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) 3735 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) 3736 3737 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) 3738 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) 3739 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) 3740 3741 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) 3742 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) 3743 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) 3744 3745 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) 3746 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) 3747 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) 3748 3749 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) 3750 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) 3751 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) 3752 3753 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) 3754 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) 3755 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) 3756 3757 #undef DO_CMP_PPZW_B 3758 #undef DO_CMP_PPZW_H 3759 #undef DO_CMP_PPZW_S 3760 #undef DO_CMP_PPZW 3761 3762 /* Similar, but the second source is immediate. 
*/ 3763 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ 3764 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 3765 { \ 3766 intptr_t opr_sz = simd_oprsz(desc); \ 3767 uint32_t flags = PREDTEST_INIT; \ 3768 TYPE mm = simd_data(desc); \ 3769 intptr_t i = opr_sz; \ 3770 do { \ 3771 uint64_t out = 0, pg; \ 3772 do { \ 3773 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3774 TYPE nn = *(TYPE *)(vn + H(i)); \ 3775 out |= nn OP mm; \ 3776 } while (i & 63); \ 3777 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3778 out &= pg; \ 3779 *(uint64_t *)(vd + (i >> 3)) = out; \ 3780 flags = iter_predtest_bwd(out, pg, flags); \ 3781 } while (i > 0); \ 3782 return flags; \ 3783 } 3784 3785 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \ 3786 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3787 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \ 3788 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3789 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \ 3790 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3791 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \ 3792 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3793 3794 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) 3795 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) 3796 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) 3797 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) 3798 3799 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) 3800 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) 3801 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) 3802 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) 3803 3804 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) 3805 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) 3806 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) 3807 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) 3808 3809 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) 3810 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) 3811 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) 3812 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) 3813 3814 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) 3815 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) 3816 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) 3817 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) 3818 3819 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) 3820 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) 3821 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) 3822 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) 3823 3824 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) 3825 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) 3826 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) 3827 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) 3828 3829 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) 3830 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) 3831 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) 3832 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) 3833 3834 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) 3835 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) 3836 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) 3837 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) 3838 3839 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) 3840 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) 3841 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) 3842 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) 3843 3844 #undef DO_CMP_PPZI_B 3845 #undef DO_CMP_PPZI_H 3846 #undef DO_CMP_PPZI_S 3847 #undef DO_CMP_PPZI_D 3848 #undef DO_CMP_PPZI 3849 3850 /* Similar to the ARM LastActive pseudocode function. 
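 * pow2floor(pg) isolates the most significant set bit of the governing
 * predicate word, i.e. the last active element; the result is whether D
 * also has that bit set.  For example, with pg = 0x14 and the matching
 * word of D = 0x04, pow2floor(pg) = 0x10 and the result is false.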
*/ 3851 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz) 3852 { 3853 intptr_t i; 3854 3855 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) { 3856 uint64_t pg = *(uint64_t *)(vg + i); 3857 if (pg) { 3858 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0; 3859 } 3860 } 3861 return 0; 3862 } 3863 3864 /* Compute a mask into RETB that is true for all G, up to and including 3865 * (if after) or excluding (if !after) the first G & N. 3866 * Return true if BRK found. 3867 */ 3868 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g, 3869 bool brk, bool after) 3870 { 3871 uint64_t b; 3872 3873 if (brk) { 3874 b = 0; 3875 } else if ((g & n) == 0) { 3876 /* For all G, no N are set; break not found. */ 3877 b = g; 3878 } else { 3879 /* Break somewhere in N. Locate it. */ 3880 b = g & n; /* guard true, pred true */ 3881 b = b & -b; /* first such */ 3882 if (after) { 3883 b = b | (b - 1); /* break after same */ 3884 } else { 3885 b = b - 1; /* break before same */ 3886 } 3887 brk = true; 3888 } 3889 3890 *retb = b; 3891 return brk; 3892 } 3893 3894 /* Compute a zeroing BRK. */ 3895 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g, 3896 intptr_t oprsz, bool after) 3897 { 3898 bool brk = false; 3899 intptr_t i; 3900 3901 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3902 uint64_t this_b, this_g = g[i]; 3903 3904 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3905 d[i] = this_b & this_g; 3906 } 3907 } 3908 3909 /* Likewise, but also compute flags. */ 3910 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g, 3911 intptr_t oprsz, bool after) 3912 { 3913 uint32_t flags = PREDTEST_INIT; 3914 bool brk = false; 3915 intptr_t i; 3916 3917 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3918 uint64_t this_b, this_d, this_g = g[i]; 3919 3920 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3921 d[i] = this_d = this_b & this_g; 3922 flags = iter_predtest_fwd(this_d, this_g, flags); 3923 } 3924 return flags; 3925 } 3926 3927 /* Compute a merging BRK. */ 3928 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g, 3929 intptr_t oprsz, bool after) 3930 { 3931 bool brk = false; 3932 intptr_t i; 3933 3934 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3935 uint64_t this_b, this_g = g[i]; 3936 3937 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3938 d[i] = (this_b & this_g) | (d[i] & ~this_g); 3939 } 3940 } 3941 3942 /* Likewise, but also compute flags. */ 3943 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g, 3944 intptr_t oprsz, bool after) 3945 { 3946 uint32_t flags = PREDTEST_INIT; 3947 bool brk = false; 3948 intptr_t i; 3949 3950 for (i = 0; i < oprsz / 8; ++i) { 3951 uint64_t this_b, this_d = d[i], this_g = g[i]; 3952 3953 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3954 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g); 3955 flags = iter_predtest_fwd(this_d, this_g, flags); 3956 } 3957 return flags; 3958 } 3959 3960 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz) 3961 { 3962 /* It is quicker to zero the whole predicate than loop on OPRSZ. 3963 * The compiler should turn this into 4 64-bit integer stores. 
3964 */ 3965 memset(d, 0, sizeof(ARMPredicateReg)); 3966 return PREDTEST_INIT; 3967 } 3968 3969 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, 3970 uint32_t pred_desc) 3971 { 3972 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3973 if (last_active_pred(vn, vg, oprsz)) { 3974 compute_brk_z(vd, vm, vg, oprsz, true); 3975 } else { 3976 do_zero(vd, oprsz); 3977 } 3978 } 3979 3980 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, 3981 uint32_t pred_desc) 3982 { 3983 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3984 if (last_active_pred(vn, vg, oprsz)) { 3985 return compute_brks_z(vd, vm, vg, oprsz, true); 3986 } else { 3987 return do_zero(vd, oprsz); 3988 } 3989 } 3990 3991 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, 3992 uint32_t pred_desc) 3993 { 3994 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3995 if (last_active_pred(vn, vg, oprsz)) { 3996 compute_brk_z(vd, vm, vg, oprsz, false); 3997 } else { 3998 do_zero(vd, oprsz); 3999 } 4000 } 4001 4002 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, 4003 uint32_t pred_desc) 4004 { 4005 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4006 if (last_active_pred(vn, vg, oprsz)) { 4007 return compute_brks_z(vd, vm, vg, oprsz, false); 4008 } else { 4009 return do_zero(vd, oprsz); 4010 } 4011 } 4012 4013 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4014 { 4015 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4016 compute_brk_z(vd, vn, vg, oprsz, true); 4017 } 4018 4019 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4020 { 4021 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4022 return compute_brks_z(vd, vn, vg, oprsz, true); 4023 } 4024 4025 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4026 { 4027 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4028 compute_brk_z(vd, vn, vg, oprsz, false); 4029 } 4030 4031 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4032 { 4033 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4034 return compute_brks_z(vd, vn, vg, oprsz, false); 4035 } 4036 4037 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4038 { 4039 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4040 compute_brk_m(vd, vn, vg, oprsz, true); 4041 } 4042 4043 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4044 { 4045 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4046 return compute_brks_m(vd, vn, vg, oprsz, true); 4047 } 4048 4049 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4050 { 4051 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4052 compute_brk_m(vd, vn, vg, oprsz, false); 4053 } 4054 4055 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4056 { 4057 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4058 return compute_brks_m(vd, vn, vg, oprsz, false); 4059 } 4060 4061 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4062 { 4063 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4064 if (!last_active_pred(vn, vg, oprsz)) { 4065 do_zero(vd, oprsz); 4066 } 4067 } 4068 4069 /* As if PredTest(Ones(PL), D, esz). 
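 * That is, compute NZCV as if every element of the governing predicate
 * were active.  ESZ_MASK keeps only the least significant bit of each
 * element, and the tail step below additionally masks off bits beyond
 * OPRSZ when the predicate length is not a multiple of 8 bytes.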
*/ 4070 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz, 4071 uint64_t esz_mask) 4072 { 4073 uint32_t flags = PREDTEST_INIT; 4074 intptr_t i; 4075 4076 for (i = 0; i < oprsz / 8; i++) { 4077 flags = iter_predtest_fwd(d->p[i], esz_mask, flags); 4078 } 4079 if (oprsz & 7) { 4080 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); 4081 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags); 4082 } 4083 return flags; 4084 } 4085 4086 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4087 { 4088 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4089 if (last_active_pred(vn, vg, oprsz)) { 4090 return predtest_ones(vd, oprsz, -1); 4091 } else { 4092 return do_zero(vd, oprsz); 4093 } 4094 } 4095 4096 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) 4097 { 4098 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 4099 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4100 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; 4101 intptr_t i; 4102 4103 for (i = 0; i < words; ++i) { 4104 uint64_t t = n[i] & g[i] & mask; 4105 sum += ctpop64(t); 4106 } 4107 return sum; 4108 } 4109 4110 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) 4111 { 4112 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4113 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4114 uint64_t esz_mask = pred_esz_masks[esz]; 4115 ARMPredicateReg *d = vd; 4116 uint32_t flags; 4117 intptr_t i; 4118 4119 /* Begin with a zero predicate register. */ 4120 flags = do_zero(d, oprsz); 4121 if (count == 0) { 4122 return flags; 4123 } 4124 4125 /* Set all of the requested bits. */ 4126 for (i = 0; i < count / 64; ++i) { 4127 d->p[i] = esz_mask; 4128 } 4129 if (count & 63) { 4130 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; 4131 } 4132 4133 return predtest_ones(d, oprsz, esz_mask); 4134 } 4135 4136 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) 4137 { 4138 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4139 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4140 uint64_t esz_mask = pred_esz_masks[esz]; 4141 ARMPredicateReg *d = vd; 4142 intptr_t i, invcount, oprbits; 4143 uint64_t bits; 4144 4145 if (count == 0) { 4146 return do_zero(d, oprsz); 4147 } 4148 4149 oprbits = oprsz * 8; 4150 tcg_debug_assert(count <= oprbits); 4151 4152 bits = esz_mask; 4153 if (oprbits & 63) { 4154 bits &= MAKE_64BIT_MASK(0, oprbits & 63); 4155 } 4156 4157 invcount = oprbits - count; 4158 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) { 4159 d->p[i] = bits; 4160 bits = esz_mask; 4161 } 4162 4163 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64); 4164 4165 while (--i >= 0) { 4166 d->p[i] = 0; 4167 } 4168 4169 return predtest_ones(d, oprsz, esz_mask); 4170 } 4171 4172 /* Recursive reduction on a function; 4173 * C.f. the ARM ARM function ReducePredicated. 4174 * 4175 * While it would be possible to write this without the DATA temporary, 4176 * it is much simpler to process the predicate register this way. 4177 * The recursion is bounded to depth 7 (128 fp16 elements), so there's 4178 * little to gain with a more complex non-recursive form. 
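 *
 * Inactive elements are replaced by IDENT in the DATA temporary, and the
 * tail up to MAXSZ is padded with IDENT as well; MAXSZ is expected to be
 * a power of two so that the halving recursion never drops an element.
 * An 8-element reduction is thus evaluated as
 * ((e0 op e1) op (e2 op e3)) op ((e4 op e5) op (e6 op e7)).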
4179 */ 4180 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \ 4181 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \ 4182 { \ 4183 if (n == 1) { \ 4184 return *data; \ 4185 } else { \ 4186 uintptr_t half = n / 2; \ 4187 TYPE lo = NAME##_reduce(data, status, half); \ 4188 TYPE hi = NAME##_reduce(data + half, status, half); \ 4189 return TYPE##_##FUNC(lo, hi, status); \ 4190 } \ 4191 } \ 4192 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \ 4193 { \ 4194 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ 4195 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ 4196 for (i = 0; i < oprsz; ) { \ 4197 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 4198 do { \ 4199 TYPE nn = *(TYPE *)(vn + H(i)); \ 4200 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \ 4201 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 4202 } while (i & 15); \ 4203 } \ 4204 for (; i < maxsz; i += sizeof(TYPE)) { \ 4205 *(TYPE *)((void *)data + i) = IDENT; \ 4206 } \ 4207 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \ 4208 } 4209 4210 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero) 4211 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero) 4212 DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero) 4213 4214 /* Identity is floatN_default_nan, without the function call. */ 4215 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00) 4216 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000) 4217 DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL) 4218 4219 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00) 4220 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000) 4221 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL) 4222 4223 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity) 4224 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity) 4225 DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity) 4226 4227 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity)) 4228 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity)) 4229 DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity)) 4230 4231 #undef DO_REDUCE 4232 4233 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg, 4234 void *status, uint32_t desc) 4235 { 4236 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4237 float16 result = nn; 4238 4239 do { 4240 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4241 do { 4242 if (pg & 1) { 4243 float16 mm = *(float16 *)(vm + H1_2(i)); 4244 result = float16_add(result, mm, status); 4245 } 4246 i += sizeof(float16), pg >>= sizeof(float16); 4247 } while (i & 15); 4248 } while (i < opr_sz); 4249 4250 return result; 4251 } 4252 4253 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg, 4254 void *status, uint32_t desc) 4255 { 4256 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4257 float32 result = nn; 4258 4259 do { 4260 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4261 do { 4262 if (pg & 1) { 4263 float32 mm = *(float32 *)(vm + H1_2(i)); 4264 result = float32_add(result, mm, status); 4265 } 4266 i += sizeof(float32), pg >>= sizeof(float32); 4267 } while (i & 15); 4268 } while (i < opr_sz); 4269 4270 return result; 4271 } 4272 4273 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg, 4274 void *status, uint32_t desc) 4275 { 4276 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8; 4277 uint64_t *m = vm; 4278 uint8_t *pg = vg; 4279 4280 for (i = 0; i < opr_sz; i++) { 4281 if (pg[H1(i)] & 1) { 4282 nn = float64_add(nn, 
m[i], status); 4283 } 4284 } 4285 4286 return nn; 4287 } 4288 4289 /* Fully general three-operand expander, controlled by a predicate, 4290 * With the extra float_status parameter. 4291 */ 4292 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \ 4293 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4294 void *status, uint32_t desc) \ 4295 { \ 4296 intptr_t i = simd_oprsz(desc); \ 4297 uint64_t *g = vg; \ 4298 do { \ 4299 uint64_t pg = g[(i - 1) >> 6]; \ 4300 do { \ 4301 i -= sizeof(TYPE); \ 4302 if (likely((pg >> (i & 63)) & 1)) { \ 4303 TYPE nn = *(TYPE *)(vn + H(i)); \ 4304 TYPE mm = *(TYPE *)(vm + H(i)); \ 4305 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4306 } \ 4307 } while (i & 63); \ 4308 } while (i != 0); \ 4309 } 4310 4311 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) 4312 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) 4313 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) 4314 4315 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) 4316 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) 4317 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) 4318 4319 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) 4320 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) 4321 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) 4322 4323 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) 4324 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) 4325 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) 4326 4327 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) 4328 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) 4329 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) 4330 4331 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) 4332 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) 4333 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) 4334 4335 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) 4336 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) 4337 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) 4338 4339 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) 4340 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) 4341 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) 4342 4343 static inline float16 abd_h(float16 a, float16 b, float_status *s) 4344 { 4345 return float16_abs(float16_sub(a, b, s)); 4346 } 4347 4348 static inline float32 abd_s(float32 a, float32 b, float_status *s) 4349 { 4350 return float32_abs(float32_sub(a, b, s)); 4351 } 4352 4353 static inline float64 abd_d(float64 a, float64 b, float_status *s) 4354 { 4355 return float64_abs(float64_sub(a, b, s)); 4356 } 4357 4358 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) 4359 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) 4360 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) 4361 4362 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) 4363 { 4364 int b_int = MIN(MAX(b, INT_MIN), INT_MAX); 4365 return float64_scalbn(a, b_int, s); 4366 } 4367 4368 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) 4369 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) 4370 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d) 4371 4372 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) 4373 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) 4374 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd) 4375 4376 #undef DO_ZPZZ_FP 4377 4378 /* Three-operand expander, with one scalar operand, controlled by 4379 * a predicate, with the extra float_status parameter. 
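 * The scalar is passed by value and reused for every active element;
 * inactive elements leave the destination unchanged (merging).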
4380 */ 4381 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \ 4382 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ 4383 void *status, uint32_t desc) \ 4384 { \ 4385 intptr_t i = simd_oprsz(desc); \ 4386 uint64_t *g = vg; \ 4387 TYPE mm = scalar; \ 4388 do { \ 4389 uint64_t pg = g[(i - 1) >> 6]; \ 4390 do { \ 4391 i -= sizeof(TYPE); \ 4392 if (likely((pg >> (i & 63)) & 1)) { \ 4393 TYPE nn = *(TYPE *)(vn + H(i)); \ 4394 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4395 } \ 4396 } while (i & 63); \ 4397 } while (i != 0); \ 4398 } 4399 4400 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) 4401 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) 4402 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add) 4403 4404 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) 4405 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) 4406 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub) 4407 4408 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) 4409 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) 4410 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul) 4411 4412 static inline float16 subr_h(float16 a, float16 b, float_status *s) 4413 { 4414 return float16_sub(b, a, s); 4415 } 4416 4417 static inline float32 subr_s(float32 a, float32 b, float_status *s) 4418 { 4419 return float32_sub(b, a, s); 4420 } 4421 4422 static inline float64 subr_d(float64 a, float64 b, float_status *s) 4423 { 4424 return float64_sub(b, a, s); 4425 } 4426 4427 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) 4428 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) 4429 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d) 4430 4431 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) 4432 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) 4433 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum) 4434 4435 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) 4436 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) 4437 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum) 4438 4439 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) 4440 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) 4441 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max) 4442 4443 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) 4444 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) 4445 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) 4446 4447 /* Fully general two-operand expander, controlled by a predicate, 4448 * With the extra float_status parameter. 4449 */ 4450 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \ 4451 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ 4452 { \ 4453 intptr_t i = simd_oprsz(desc); \ 4454 uint64_t *g = vg; \ 4455 do { \ 4456 uint64_t pg = g[(i - 1) >> 6]; \ 4457 do { \ 4458 i -= sizeof(TYPE); \ 4459 if (likely((pg >> (i & 63)) & 1)) { \ 4460 TYPE nn = *(TYPE *)(vn + H(i)); \ 4461 *(TYPE *)(vd + H(i)) = OP(nn, status); \ 4462 } \ 4463 } while (i & 63); \ 4464 } while (i != 0); \ 4465 } 4466 4467 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore 4468 * FZ16. When converting from fp16, this affects flushing input denormals; 4469 * when converting to fp16, this affects flushing output denormals. 
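 * The helpers below implement this by saving the relevant flush-to-zero
 * flag from the float_status, clearing it around the softfloat
 * conversion, and restoring the saved value afterwards.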
4470 */ 4471 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst) 4472 { 4473 bool save = get_flush_inputs_to_zero(fpst); 4474 float32 ret; 4475 4476 set_flush_inputs_to_zero(false, fpst); 4477 ret = float16_to_float32(f, true, fpst); 4478 set_flush_inputs_to_zero(save, fpst); 4479 return ret; 4480 } 4481 4482 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) 4483 { 4484 bool save = get_flush_inputs_to_zero(fpst); 4485 float64 ret; 4486 4487 set_flush_inputs_to_zero(false, fpst); 4488 ret = float16_to_float64(f, true, fpst); 4489 set_flush_inputs_to_zero(save, fpst); 4490 return ret; 4491 } 4492 4493 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst) 4494 { 4495 bool save = get_flush_to_zero(fpst); 4496 float16 ret; 4497 4498 set_flush_to_zero(false, fpst); 4499 ret = float32_to_float16(f, true, fpst); 4500 set_flush_to_zero(save, fpst); 4501 return ret; 4502 } 4503 4504 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst) 4505 { 4506 bool save = get_flush_to_zero(fpst); 4507 float16 ret; 4508 4509 set_flush_to_zero(false, fpst); 4510 ret = float64_to_float16(f, true, fpst); 4511 set_flush_to_zero(save, fpst); 4512 return ret; 4513 } 4514 4515 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s) 4516 { 4517 if (float16_is_any_nan(f)) { 4518 float_raise(float_flag_invalid, s); 4519 return 0; 4520 } 4521 return float16_to_int16_round_to_zero(f, s); 4522 } 4523 4524 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s) 4525 { 4526 if (float16_is_any_nan(f)) { 4527 float_raise(float_flag_invalid, s); 4528 return 0; 4529 } 4530 return float16_to_int64_round_to_zero(f, s); 4531 } 4532 4533 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s) 4534 { 4535 if (float32_is_any_nan(f)) { 4536 float_raise(float_flag_invalid, s); 4537 return 0; 4538 } 4539 return float32_to_int64_round_to_zero(f, s); 4540 } 4541 4542 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s) 4543 { 4544 if (float64_is_any_nan(f)) { 4545 float_raise(float_flag_invalid, s); 4546 return 0; 4547 } 4548 return float64_to_int64_round_to_zero(f, s); 4549 } 4550 4551 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s) 4552 { 4553 if (float16_is_any_nan(f)) { 4554 float_raise(float_flag_invalid, s); 4555 return 0; 4556 } 4557 return float16_to_uint16_round_to_zero(f, s); 4558 } 4559 4560 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s) 4561 { 4562 if (float16_is_any_nan(f)) { 4563 float_raise(float_flag_invalid, s); 4564 return 0; 4565 } 4566 return float16_to_uint64_round_to_zero(f, s); 4567 } 4568 4569 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s) 4570 { 4571 if (float32_is_any_nan(f)) { 4572 float_raise(float_flag_invalid, s); 4573 return 0; 4574 } 4575 return float32_to_uint64_round_to_zero(f, s); 4576 } 4577 4578 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) 4579 { 4580 if (float64_is_any_nan(f)) { 4581 float_raise(float_flag_invalid, s); 4582 return 0; 4583 } 4584 return float64_to_uint64_round_to_zero(f, s); 4585 } 4586 4587 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) 4588 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) 4589 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16) 4590 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16) 4591 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64) 4592 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, 
float64_to_float32) 4593 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64) 4594 4595 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) 4596 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) 4597 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) 4598 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz) 4599 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz) 4600 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd) 4601 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz) 4602 4603 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) 4604 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) 4605 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) 4606 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz) 4607 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz) 4608 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd) 4609 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz) 4610 4611 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) 4612 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) 4613 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd) 4614 4615 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) 4616 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) 4617 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int) 4618 4619 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) 4620 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) 4621 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64) 4622 4623 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) 4624 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) 4625 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt) 4626 4627 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) 4628 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) 4629 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) 4630 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64) 4631 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16) 4632 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32) 4633 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64) 4634 4635 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) 4636 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) 4637 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) 4638 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64) 4639 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16) 4640 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32) 4641 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64) 4642 4643 static int16_t do_float16_logb_as_int(float16 a, float_status *s) 4644 { 4645 /* Extract frac to the top of the uint32_t. 
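     * A float16 fraction occupies bits [9:0], so shifting left by
     * 16 + 6 = 22 places it in bits [31:22] of the uint32_t.  E.g.
     * float16 1.0 (0x3c00) has exp = 15 and frac = 0, giving the
     * normal-case result 15 - 15 = 0; 2.0 (0x4000) gives 16 - 15 = 1.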
*/ 4646 uint32_t frac = (uint32_t)a << (16 + 6); 4647 int16_t exp = extract32(a, 10, 5); 4648 4649 if (unlikely(exp == 0)) { 4650 if (frac != 0) { 4651 if (!get_flush_inputs_to_zero(s)) { 4652 /* denormal: bias - fractional_zeros */ 4653 return -15 - clz32(frac); 4654 } 4655 /* flush to zero */ 4656 float_raise(float_flag_input_denormal, s); 4657 } 4658 } else if (unlikely(exp == 0x1f)) { 4659 if (frac == 0) { 4660 return INT16_MAX; /* infinity */ 4661 } 4662 } else { 4663 /* normal: exp - bias */ 4664 return exp - 15; 4665 } 4666 /* nan or zero */ 4667 float_raise(float_flag_invalid, s); 4668 return INT16_MIN; 4669 } 4670 4671 static int32_t do_float32_logb_as_int(float32 a, float_status *s) 4672 { 4673 /* Extract frac to the top of the uint32_t. */ 4674 uint32_t frac = a << 9; 4675 int32_t exp = extract32(a, 23, 8); 4676 4677 if (unlikely(exp == 0)) { 4678 if (frac != 0) { 4679 if (!get_flush_inputs_to_zero(s)) { 4680 /* denormal: bias - fractional_zeros */ 4681 return -127 - clz32(frac); 4682 } 4683 /* flush to zero */ 4684 float_raise(float_flag_input_denormal, s); 4685 } 4686 } else if (unlikely(exp == 0xff)) { 4687 if (frac == 0) { 4688 return INT32_MAX; /* infinity */ 4689 } 4690 } else { 4691 /* normal: exp - bias */ 4692 return exp - 127; 4693 } 4694 /* nan or zero */ 4695 float_raise(float_flag_invalid, s); 4696 return INT32_MIN; 4697 } 4698 4699 static int64_t do_float64_logb_as_int(float64 a, float_status *s) 4700 { 4701 /* Extract frac to the top of the uint64_t. */ 4702 uint64_t frac = a << 12; 4703 int64_t exp = extract64(a, 52, 11); 4704 4705 if (unlikely(exp == 0)) { 4706 if (frac != 0) { 4707 if (!get_flush_inputs_to_zero(s)) { 4708 /* denormal: bias - fractional_zeros */ 4709 return -1023 - clz64(frac); 4710 } 4711 /* flush to zero */ 4712 float_raise(float_flag_input_denormal, s); 4713 } 4714 } else if (unlikely(exp == 0x7ff)) { 4715 if (frac == 0) { 4716 return INT64_MAX; /* infinity */ 4717 } 4718 } else { 4719 /* normal: exp - bias */ 4720 return exp - 1023; 4721 } 4722 /* nan or zero */ 4723 float_raise(float_flag_invalid, s); 4724 return INT64_MIN; 4725 } 4726 4727 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int) 4728 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int) 4729 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) 4730 4731 #undef DO_ZPZ_FP 4732 4733 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, 4734 float_status *status, uint32_t desc, 4735 uint16_t neg1, uint16_t neg3) 4736 { 4737 intptr_t i = simd_oprsz(desc); 4738 uint64_t *g = vg; 4739 4740 do { 4741 uint64_t pg = g[(i - 1) >> 6]; 4742 do { 4743 i -= 2; 4744 if (likely((pg >> (i & 63)) & 1)) { 4745 float16 e1, e2, e3, r; 4746 4747 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; 4748 e2 = *(uint16_t *)(vm + H1_2(i)); 4749 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; 4750 r = float16_muladd(e1, e2, e3, 0, status); 4751 *(uint16_t *)(vd + H1_2(i)) = r; 4752 } 4753 } while (i & 63); 4754 } while (i != 0); 4755 } 4756 4757 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4758 void *vg, void *status, uint32_t desc) 4759 { 4760 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0); 4761 } 4762 4763 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4764 void *vg, void *status, uint32_t desc) 4765 { 4766 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0); 4767 } 4768 4769 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4770 void *vg, void *status, uint32_t desc) 4771 { 4772 
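    /* Negate both the multiplicand N and the addend A: d = -(n * m) - a. */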
do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000); 4773 } 4774 4775 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4776 void *vg, void *status, uint32_t desc) 4777 { 4778 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000); 4779 } 4780 4781 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, 4782 float_status *status, uint32_t desc, 4783 uint32_t neg1, uint32_t neg3) 4784 { 4785 intptr_t i = simd_oprsz(desc); 4786 uint64_t *g = vg; 4787 4788 do { 4789 uint64_t pg = g[(i - 1) >> 6]; 4790 do { 4791 i -= 4; 4792 if (likely((pg >> (i & 63)) & 1)) { 4793 float32 e1, e2, e3, r; 4794 4795 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1; 4796 e2 = *(uint32_t *)(vm + H1_4(i)); 4797 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; 4798 r = float32_muladd(e1, e2, e3, 0, status); 4799 *(uint32_t *)(vd + H1_4(i)) = r; 4800 } 4801 } while (i & 63); 4802 } while (i != 0); 4803 } 4804 4805 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4806 void *vg, void *status, uint32_t desc) 4807 { 4808 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0); 4809 } 4810 4811 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4812 void *vg, void *status, uint32_t desc) 4813 { 4814 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0); 4815 } 4816 4817 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4818 void *vg, void *status, uint32_t desc) 4819 { 4820 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000); 4821 } 4822 4823 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4824 void *vg, void *status, uint32_t desc) 4825 { 4826 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000); 4827 } 4828 4829 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, 4830 float_status *status, uint32_t desc, 4831 uint64_t neg1, uint64_t neg3) 4832 { 4833 intptr_t i = simd_oprsz(desc); 4834 uint64_t *g = vg; 4835 4836 do { 4837 uint64_t pg = g[(i - 1) >> 6]; 4838 do { 4839 i -= 8; 4840 if (likely((pg >> (i & 63)) & 1)) { 4841 float64 e1, e2, e3, r; 4842 4843 e1 = *(uint64_t *)(vn + i) ^ neg1; 4844 e2 = *(uint64_t *)(vm + i); 4845 e3 = *(uint64_t *)(va + i) ^ neg3; 4846 r = float64_muladd(e1, e2, e3, 0, status); 4847 *(uint64_t *)(vd + i) = r; 4848 } 4849 } while (i & 63); 4850 } while (i != 0); 4851 } 4852 4853 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4854 void *vg, void *status, uint32_t desc) 4855 { 4856 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0); 4857 } 4858 4859 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4860 void *vg, void *status, uint32_t desc) 4861 { 4862 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0); 4863 } 4864 4865 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4866 void *vg, void *status, uint32_t desc) 4867 { 4868 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN); 4869 } 4870 4871 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4872 void *vg, void *status, uint32_t desc) 4873 { 4874 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN); 4875 } 4876 4877 /* Two operand floating-point comparison controlled by a predicate. 4878 * Unlike the integer version, we are not allowed to optimistically 4879 * compare operands, since the comparison may have side effects wrt 4880 * the FPSR. 
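 *
 * The ordering comparisons below (FCMGE, FCMGT, FCMLE, FCMLT, FACGE,
 * FACGT) use the signalling compare, while FCMEQ, FCMNE and FCMUO use
 * the quiet compare, so that NaN operands raise Invalid Operation only
 * where the architecture requires it.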
4881 */ 4882 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \ 4883 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4884 void *status, uint32_t desc) \ 4885 { \ 4886 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 4887 uint64_t *d = vd, *g = vg; \ 4888 do { \ 4889 uint64_t out = 0, pg = g[j]; \ 4890 do { \ 4891 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 4892 if (likely((pg >> (i & 63)) & 1)) { \ 4893 TYPE nn = *(TYPE *)(vn + H(i)); \ 4894 TYPE mm = *(TYPE *)(vm + H(i)); \ 4895 out |= OP(TYPE, nn, mm, status); \ 4896 } \ 4897 } while (i & 63); \ 4898 d[j--] = out; \ 4899 } while (i > 0); \ 4900 } 4901 4902 #define DO_FPCMP_PPZZ_H(NAME, OP) \ 4903 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP) 4904 #define DO_FPCMP_PPZZ_S(NAME, OP) \ 4905 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) 4906 #define DO_FPCMP_PPZZ_D(NAME, OP) \ 4907 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP) 4908 4909 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \ 4910 DO_FPCMP_PPZZ_H(NAME, OP) \ 4911 DO_FPCMP_PPZZ_S(NAME, OP) \ 4912 DO_FPCMP_PPZZ_D(NAME, OP) 4913 4914 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0 4915 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0 4916 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0 4917 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0 4918 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0 4919 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0 4920 #define DO_FCMUO(TYPE, X, Y, ST) \ 4921 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered 4922 #define DO_FACGE(TYPE, X, Y, ST) \ 4923 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0 4924 #define DO_FACGT(TYPE, X, Y, ST) \ 4925 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0 4926 4927 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) 4928 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT) 4929 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ) 4930 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE) 4931 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO) 4932 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE) 4933 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT) 4934 4935 #undef DO_FPCMP_PPZZ_ALL 4936 #undef DO_FPCMP_PPZZ_D 4937 #undef DO_FPCMP_PPZZ_S 4938 #undef DO_FPCMP_PPZZ_H 4939 #undef DO_FPCMP_PPZZ 4940 4941 /* One operand floating-point comparison against zero, controlled 4942 * by a predicate. 
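 * The zero operand is passed to the comparison macros as the integer
 * constant 0, which is also the bit pattern of +0.0 in each of the
 * formats used here.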
4943 */ 4944 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \ 4945 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 4946 void *status, uint32_t desc) \ 4947 { \ 4948 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 4949 uint64_t *d = vd, *g = vg; \ 4950 do { \ 4951 uint64_t out = 0, pg = g[j]; \ 4952 do { \ 4953 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 4954 if ((pg >> (i & 63)) & 1) { \ 4955 TYPE nn = *(TYPE *)(vn + H(i)); \ 4956 out |= OP(TYPE, nn, 0, status); \ 4957 } \ 4958 } while (i & 63); \ 4959 d[j--] = out; \ 4960 } while (i > 0); \ 4961 } 4962 4963 #define DO_FPCMP_PPZ0_H(NAME, OP) \ 4964 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP) 4965 #define DO_FPCMP_PPZ0_S(NAME, OP) \ 4966 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) 4967 #define DO_FPCMP_PPZ0_D(NAME, OP) \ 4968 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP) 4969 4970 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \ 4971 DO_FPCMP_PPZ0_H(NAME, OP) \ 4972 DO_FPCMP_PPZ0_S(NAME, OP) \ 4973 DO_FPCMP_PPZ0_D(NAME, OP) 4974 4975 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE) 4976 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT) 4977 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE) 4978 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT) 4979 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ) 4980 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE) 4981 4982 /* FP Trig Multiply-Add. */ 4983 4984 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) 4985 { 4986 static const float16 coeff[16] = { 4987 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 4988 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 4989 }; 4990 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); 4991 intptr_t x = simd_data(desc); 4992 float16 *d = vd, *n = vn, *m = vm; 4993 for (i = 0; i < opr_sz; i++) { 4994 float16 mm = m[i]; 4995 intptr_t xx = x; 4996 if (float16_is_neg(mm)) { 4997 mm = float16_abs(mm); 4998 xx += 8; 4999 } 5000 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs); 5001 } 5002 } 5003 5004 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) 5005 { 5006 static const float32 coeff[16] = { 5007 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9, 5008 0x36369d6d, 0x00000000, 0x00000000, 0x00000000, 5009 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705, 5010 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, 5011 }; 5012 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); 5013 intptr_t x = simd_data(desc); 5014 float32 *d = vd, *n = vn, *m = vm; 5015 for (i = 0; i < opr_sz; i++) { 5016 float32 mm = m[i]; 5017 intptr_t xx = x; 5018 if (float32_is_neg(mm)) { 5019 mm = float32_abs(mm); 5020 xx += 8; 5021 } 5022 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs); 5023 } 5024 } 5025 5026 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) 5027 { 5028 static const float64 coeff[16] = { 5029 0x3ff0000000000000ull, 0xbfc5555555555543ull, 5030 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull, 5031 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull, 5032 0x3de5d8408868552full, 0x0000000000000000ull, 5033 0x3ff0000000000000ull, 0xbfe0000000000000ull, 5034 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull, 5035 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull, 5036 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, 5037 }; 5038 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); 5039 intptr_t x = simd_data(desc); 5040 float64 *d = vd, *n = vn, *m = vm; 5041 for (i = 0; i < opr_sz; i++) { 5042 float64 mm = m[i]; 5043 intptr_t xx = x; 5044 if (float64_is_neg(mm)) { 5045 mm = float64_abs(mm); 5046 xx += 8; 5047 } 5048 d[i] = float64_muladd(n[i], mm, 
coeff[xx], 0, vs); 5049 } 5050 } 5051 5052 /* 5053 * FP Complex Add 5054 */ 5055 5056 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, 5057 void *vs, uint32_t desc) 5058 { 5059 intptr_t j, i = simd_oprsz(desc); 5060 uint64_t *g = vg; 5061 float16 neg_imag = float16_set_sign(0, simd_data(desc)); 5062 float16 neg_real = float16_chs(neg_imag); 5063 5064 do { 5065 uint64_t pg = g[(i - 1) >> 6]; 5066 do { 5067 float16 e0, e1, e2, e3; 5068 5069 /* I holds the real index; J holds the imag index. */ 5070 j = i - sizeof(float16); 5071 i -= 2 * sizeof(float16); 5072 5073 e0 = *(float16 *)(vn + H1_2(i)); 5074 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real; 5075 e2 = *(float16 *)(vn + H1_2(j)); 5076 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag; 5077 5078 if (likely((pg >> (i & 63)) & 1)) { 5079 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs); 5080 } 5081 if (likely((pg >> (j & 63)) & 1)) { 5082 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs); 5083 } 5084 } while (i & 63); 5085 } while (i != 0); 5086 } 5087 5088 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, 5089 void *vs, uint32_t desc) 5090 { 5091 intptr_t j, i = simd_oprsz(desc); 5092 uint64_t *g = vg; 5093 float32 neg_imag = float32_set_sign(0, simd_data(desc)); 5094 float32 neg_real = float32_chs(neg_imag); 5095 5096 do { 5097 uint64_t pg = g[(i - 1) >> 6]; 5098 do { 5099 float32 e0, e1, e2, e3; 5100 5101 /* I holds the real index; J holds the imag index. */ 5102 j = i - sizeof(float32); 5103 i -= 2 * sizeof(float32); 5104 5105 e0 = *(float32 *)(vn + H1_2(i)); 5106 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real; 5107 e2 = *(float32 *)(vn + H1_2(j)); 5108 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag; 5109 5110 if (likely((pg >> (i & 63)) & 1)) { 5111 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs); 5112 } 5113 if (likely((pg >> (j & 63)) & 1)) { 5114 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs); 5115 } 5116 } while (i & 63); 5117 } while (i != 0); 5118 } 5119 5120 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, 5121 void *vs, uint32_t desc) 5122 { 5123 intptr_t j, i = simd_oprsz(desc); 5124 uint64_t *g = vg; 5125 float64 neg_imag = float64_set_sign(0, simd_data(desc)); 5126 float64 neg_real = float64_chs(neg_imag); 5127 5128 do { 5129 uint64_t pg = g[(i - 1) >> 6]; 5130 do { 5131 float64 e0, e1, e2, e3; 5132 5133 /* I holds the real index; J holds the imag index. */ 5134 j = i - sizeof(float64); 5135 i -= 2 * sizeof(float64); 5136 5137 e0 = *(float64 *)(vn + H1_2(i)); 5138 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real; 5139 e2 = *(float64 *)(vn + H1_2(j)); 5140 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag; 5141 5142 if (likely((pg >> (i & 63)) & 1)) { 5143 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs); 5144 } 5145 if (likely((pg >> (j & 63)) & 1)) { 5146 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs); 5147 } 5148 } while (i & 63); 5149 } while (i != 0); 5150 } 5151 5152 /* 5153 * FP Complex Multiply 5154 */ 5155 5156 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5157 void *vg, void *status, uint32_t desc) 5158 { 5159 intptr_t j, i = simd_oprsz(desc); 5160 unsigned rot = simd_data(desc); 5161 bool flip = rot & 1; 5162 float16 neg_imag, neg_real; 5163 uint64_t *g = vg; 5164 5165 neg_imag = float16_set_sign(0, (rot & 2) != 0); 5166 neg_real = float16_set_sign(0, rot == 1 || rot == 2); 5167 5168 do { 5169 uint64_t pg = g[(i - 1) >> 6]; 5170 do { 5171 float16 e1, e2, e3, e4, nr, ni, mr, mi, d; 5172 5173 /* I holds the real index; J holds the imag index. 
*/ 5174 j = i - sizeof(float16); 5175 i -= 2 * sizeof(float16); 5176 5177 nr = *(float16 *)(vn + H1_2(i)); 5178 ni = *(float16 *)(vn + H1_2(j)); 5179 mr = *(float16 *)(vm + H1_2(i)); 5180 mi = *(float16 *)(vm + H1_2(j)); 5181 5182 e2 = (flip ? ni : nr); 5183 e1 = (flip ? mi : mr) ^ neg_real; 5184 e4 = e2; 5185 e3 = (flip ? mr : mi) ^ neg_imag; 5186 5187 if (likely((pg >> (i & 63)) & 1)) { 5188 d = *(float16 *)(va + H1_2(i)); 5189 d = float16_muladd(e2, e1, d, 0, status); 5190 *(float16 *)(vd + H1_2(i)) = d; 5191 } 5192 if (likely((pg >> (j & 63)) & 1)) { 5193 d = *(float16 *)(va + H1_2(j)); 5194 d = float16_muladd(e4, e3, d, 0, status); 5195 *(float16 *)(vd + H1_2(j)) = d; 5196 } 5197 } while (i & 63); 5198 } while (i != 0); 5199 } 5200 5201 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5202 void *vg, void *status, uint32_t desc) 5203 { 5204 intptr_t j, i = simd_oprsz(desc); 5205 unsigned rot = simd_data(desc); 5206 bool flip = rot & 1; 5207 float32 neg_imag, neg_real; 5208 uint64_t *g = vg; 5209 5210 neg_imag = float32_set_sign(0, (rot & 2) != 0); 5211 neg_real = float32_set_sign(0, rot == 1 || rot == 2); 5212 5213 do { 5214 uint64_t pg = g[(i - 1) >> 6]; 5215 do { 5216 float32 e1, e2, e3, e4, nr, ni, mr, mi, d; 5217 5218 /* I holds the real index; J holds the imag index. */ 5219 j = i - sizeof(float32); 5220 i -= 2 * sizeof(float32); 5221 5222 nr = *(float32 *)(vn + H1_2(i)); 5223 ni = *(float32 *)(vn + H1_2(j)); 5224 mr = *(float32 *)(vm + H1_2(i)); 5225 mi = *(float32 *)(vm + H1_2(j)); 5226 5227 e2 = (flip ? ni : nr); 5228 e1 = (flip ? mi : mr) ^ neg_real; 5229 e4 = e2; 5230 e3 = (flip ? mr : mi) ^ neg_imag; 5231 5232 if (likely((pg >> (i & 63)) & 1)) { 5233 d = *(float32 *)(va + H1_2(i)); 5234 d = float32_muladd(e2, e1, d, 0, status); 5235 *(float32 *)(vd + H1_2(i)) = d; 5236 } 5237 if (likely((pg >> (j & 63)) & 1)) { 5238 d = *(float32 *)(va + H1_2(j)); 5239 d = float32_muladd(e4, e3, d, 0, status); 5240 *(float32 *)(vd + H1_2(j)) = d; 5241 } 5242 } while (i & 63); 5243 } while (i != 0); 5244 } 5245 5246 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5247 void *vg, void *status, uint32_t desc) 5248 { 5249 intptr_t j, i = simd_oprsz(desc); 5250 unsigned rot = simd_data(desc); 5251 bool flip = rot & 1; 5252 float64 neg_imag, neg_real; 5253 uint64_t *g = vg; 5254 5255 neg_imag = float64_set_sign(0, (rot & 2) != 0); 5256 neg_real = float64_set_sign(0, rot == 1 || rot == 2); 5257 5258 do { 5259 uint64_t pg = g[(i - 1) >> 6]; 5260 do { 5261 float64 e1, e2, e3, e4, nr, ni, mr, mi, d; 5262 5263 /* I holds the real index; J holds the imag index. */ 5264 j = i - sizeof(float64); 5265 i -= 2 * sizeof(float64); 5266 5267 nr = *(float64 *)(vn + H1_2(i)); 5268 ni = *(float64 *)(vn + H1_2(j)); 5269 mr = *(float64 *)(vm + H1_2(i)); 5270 mi = *(float64 *)(vm + H1_2(j)); 5271 5272 e2 = (flip ? ni : nr); 5273 e1 = (flip ? mi : mr) ^ neg_real; 5274 e4 = e2; 5275 e3 = (flip ? mr : mi) ^ neg_imag; 5276 5277 if (likely((pg >> (i & 63)) & 1)) { 5278 d = *(float64 *)(va + H1_2(i)); 5279 d = float64_muladd(e2, e1, d, 0, status); 5280 *(float64 *)(vd + H1_2(i)) = d; 5281 } 5282 if (likely((pg >> (j & 63)) & 1)) { 5283 d = *(float64 *)(va + H1_2(j)); 5284 d = float64_muladd(e4, e3, d, 0, status); 5285 *(float64 *)(vd + H1_2(j)) = d; 5286 } 5287 } while (i & 63); 5288 } while (i != 0); 5289 } 5290 5291 /* 5292 * Load contiguous data, protected by a governing predicate. 
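 *
 * The helpers below first locate the active elements and the one or two
 * pages they touch (find_next_active, sve_cont_ldst_elements), then
 * resolve those pages to host addresses (sve_probe_page,
 * sve_cont_ldst_pages) before any data is moved.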
5293 */ 5294 5295 /* 5296 * Skip through a sequence of inactive elements in the guarding predicate @vg, 5297 * beginning at @reg_off bounded by @reg_max. Return the offset of the active 5298 * element >= @reg_off, or @reg_max if there were no active elements at all. 5299 */ 5300 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off, 5301 intptr_t reg_max, int esz) 5302 { 5303 uint64_t pg_mask = pred_esz_masks[esz]; 5304 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63); 5305 5306 /* In normal usage, the first element is active. */ 5307 if (likely(pg & 1)) { 5308 return reg_off; 5309 } 5310 5311 if (pg == 0) { 5312 reg_off &= -64; 5313 do { 5314 reg_off += 64; 5315 if (unlikely(reg_off >= reg_max)) { 5316 /* The entire predicate was false. */ 5317 return reg_max; 5318 } 5319 pg = vg[reg_off >> 6] & pg_mask; 5320 } while (pg == 0); 5321 } 5322 reg_off += ctz64(pg); 5323 5324 /* We should never see an out of range predicate bit set. */ 5325 tcg_debug_assert(reg_off < reg_max); 5326 return reg_off; 5327 } 5328 5329 /* 5330 * Resolve the guest virtual address to info->host and info->flags. 5331 * If @nofault, return false if the page is invalid, otherwise 5332 * exit via page fault exception. 5333 */ 5334 5335 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env, 5336 target_ulong addr, int mem_off, MMUAccessType access_type, 5337 int mmu_idx, uintptr_t retaddr) 5338 { 5339 int flags; 5340 5341 addr += mem_off; 5342 5343 /* 5344 * User-only currently always issues with TBI. See the comment 5345 * above useronly_clean_ptr. Usually we clean this top byte away 5346 * during translation, but we can't do that for e.g. vector + imm 5347 * addressing modes. 5348 * 5349 * We currently always enable TBI for user-only, and do not provide 5350 * a way to turn it off. So clean the pointer unconditionally here, 5351 * rather than look it up here, or pass it down from above. 5352 */ 5353 addr = useronly_clean_ptr(addr); 5354 5355 #ifdef CONFIG_USER_ONLY 5356 flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault, 5357 &info->host, retaddr); 5358 #else 5359 CPUTLBEntryFull *full; 5360 flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault, 5361 &info->host, &full, retaddr); 5362 #endif 5363 info->flags = flags; 5364 5365 if (flags & TLB_INVALID_MASK) { 5366 g_assert(nofault); 5367 return false; 5368 } 5369 5370 #ifdef CONFIG_USER_ONLY 5371 memset(&info->attrs, 0, sizeof(info->attrs)); 5372 /* Require both ANON and MTE; see allocation_tag_mem(). */ 5373 info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE); 5374 #else 5375 info->attrs = full->attrs; 5376 info->tagged = full->pte_attrs == 0xf0; 5377 #endif 5378 5379 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */ 5380 info->host -= mem_off; 5381 return true; 5382 } 5383 5384 /* 5385 * Find first active element on each page, and a loose bound for the 5386 * final element on each page. Identify any single element that spans 5387 * the page boundary. Return true if there are any active elements. 5388 */ 5389 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg, 5390 intptr_t reg_max, int esz, int msize) 5391 { 5392 const int esize = 1 << esz; 5393 const uint64_t pg_mask = pred_esz_masks[esz]; 5394 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; 5395 intptr_t mem_off_last, mem_off_split; 5396 intptr_t page_split, elt_split; 5397 intptr_t i; 5398 5399 /* Set all of the element indices to -1, and the TLB data to 0. 
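 * Filling with 0xff bytes works because the members ahead of 'page'
 * are all signed offsets (reg_off_first/last, reg_off_split,
 * mem_off_*, page_split), and the all-ones pattern reads back as -1
 * in each of them; the "< 0" tests below therefore see "not present"
 * until a field is explicitly filled in.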
*/ 5400 memset(info, -1, offsetof(SVEContLdSt, page)); 5401 memset(info->page, 0, sizeof(info->page)); 5402 5403 /* Gross scan over the entire predicate to find bounds. */ 5404 i = 0; 5405 do { 5406 uint64_t pg = vg[i] & pg_mask; 5407 if (pg) { 5408 reg_off_last = i * 64 + 63 - clz64(pg); 5409 if (reg_off_first < 0) { 5410 reg_off_first = i * 64 + ctz64(pg); 5411 } 5412 } 5413 } while (++i * 64 < reg_max); 5414 5415 if (unlikely(reg_off_first < 0)) { 5416 /* No active elements, no pages touched. */ 5417 return false; 5418 } 5419 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max); 5420 5421 info->reg_off_first[0] = reg_off_first; 5422 info->mem_off_first[0] = (reg_off_first >> esz) * msize; 5423 mem_off_last = (reg_off_last >> esz) * msize; 5424 5425 page_split = -(addr | TARGET_PAGE_MASK); 5426 if (likely(mem_off_last + msize <= page_split)) { 5427 /* The entire operation fits within a single page. */ 5428 info->reg_off_last[0] = reg_off_last; 5429 return true; 5430 } 5431 5432 info->page_split = page_split; 5433 elt_split = page_split / msize; 5434 reg_off_split = elt_split << esz; 5435 mem_off_split = elt_split * msize; 5436 5437 /* 5438 * This is the last full element on the first page, but it is not 5439 * necessarily active. If there is no full element, i.e. the first 5440 * active element is the one that's split, this value remains -1. 5441 * It is useful as iteration bounds. 5442 */ 5443 if (elt_split != 0) { 5444 info->reg_off_last[0] = reg_off_split - esize; 5445 } 5446 5447 /* Determine if an unaligned element spans the pages. */ 5448 if (page_split % msize != 0) { 5449 /* It is helpful to know if the split element is active. */ 5450 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) { 5451 info->reg_off_split = reg_off_split; 5452 info->mem_off_split = mem_off_split; 5453 5454 if (reg_off_split == reg_off_last) { 5455 /* The page crossing element is last. */ 5456 return true; 5457 } 5458 } 5459 reg_off_split += esize; 5460 mem_off_split += msize; 5461 } 5462 5463 /* 5464 * We do want the first active element on the second page, because 5465 * this may affect the address reported in an exception. 5466 */ 5467 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz); 5468 tcg_debug_assert(reg_off_split <= reg_off_last); 5469 info->reg_off_first[1] = reg_off_split; 5470 info->mem_off_first[1] = (reg_off_split >> esz) * msize; 5471 info->reg_off_last[1] = reg_off_last; 5472 return true; 5473 } 5474 5475 /* 5476 * Resolve the guest virtual addresses to info->page[]. 5477 * Control the generation of page faults with @fault. Return false if 5478 * there is no work to do, which can only happen with @fault == FAULT_NO. 5479 */ 5480 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault, 5481 CPUARMState *env, target_ulong addr, 5482 MMUAccessType access_type, uintptr_t retaddr) 5483 { 5484 int mmu_idx = cpu_mmu_index(env, false); 5485 int mem_off = info->mem_off_first[0]; 5486 bool nofault = fault == FAULT_NO; 5487 bool have_work = true; 5488 5489 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off, 5490 access_type, mmu_idx, retaddr)) { 5491 /* No work to be done. */ 5492 return false; 5493 } 5494 5495 if (likely(info->page_split < 0)) { 5496 /* The entire operation was on the one page. */ 5497 return true; 5498 } 5499 5500 /* 5501 * If the second page is invalid, then we want the fault address to be 5502 * the first byte on that page which is accessed. 
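 * Concretely: if an active element straddles the boundary, the first
 * byte accessed on the second page is the boundary itself, i.e.
 * addr + info->page_split; otherwise it is the first active element
 * on that page, addr + info->mem_off_first[1].  That is what the two
 * branches below select into mem_off.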
5503 */ 5504 if (info->mem_off_split >= 0) { 5505 /* 5506 * There is an element split across the pages. The fault address 5507 * should be the first byte of the second page. 5508 */ 5509 mem_off = info->page_split; 5510 /* 5511 * If the split element is also the first active element 5512 * of the vector, then: For first-fault we should continue 5513 * to generate faults for the second page. For no-fault, 5514 * we have work only if the second page is valid. 5515 */ 5516 if (info->mem_off_first[0] < info->mem_off_split) { 5517 nofault = FAULT_FIRST; 5518 have_work = false; 5519 } 5520 } else { 5521 /* 5522 * There is no element split across the pages. The fault address 5523 * should be the first active element on the second page. 5524 */ 5525 mem_off = info->mem_off_first[1]; 5526 /* 5527 * There must have been one active element on the first page, 5528 * so we're out of first-fault territory. 5529 */ 5530 nofault = fault != FAULT_ALL; 5531 } 5532 5533 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off, 5534 access_type, mmu_idx, retaddr); 5535 return have_work; 5536 } 5537 5538 #ifndef CONFIG_USER_ONLY 5539 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, 5540 uint64_t *vg, target_ulong addr, 5541 int esize, int msize, int wp_access, 5542 uintptr_t retaddr) 5543 { 5544 intptr_t mem_off, reg_off, reg_last; 5545 int flags0 = info->page[0].flags; 5546 int flags1 = info->page[1].flags; 5547 5548 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { 5549 return; 5550 } 5551 5552 /* Indicate that watchpoints are handled. */ 5553 info->page[0].flags = flags0 & ~TLB_WATCHPOINT; 5554 info->page[1].flags = flags1 & ~TLB_WATCHPOINT; 5555 5556 if (flags0 & TLB_WATCHPOINT) { 5557 mem_off = info->mem_off_first[0]; 5558 reg_off = info->reg_off_first[0]; 5559 reg_last = info->reg_off_last[0]; 5560 5561 while (reg_off <= reg_last) { 5562 uint64_t pg = vg[reg_off >> 6]; 5563 do { 5564 if ((pg >> (reg_off & 63)) & 1) { 5565 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 5566 msize, info->page[0].attrs, 5567 wp_access, retaddr); 5568 } 5569 reg_off += esize; 5570 mem_off += msize; 5571 } while (reg_off <= reg_last && (reg_off & 63)); 5572 } 5573 } 5574 5575 mem_off = info->mem_off_split; 5576 if (mem_off >= 0) { 5577 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize, 5578 info->page[0].attrs, wp_access, retaddr); 5579 } 5580 5581 mem_off = info->mem_off_first[1]; 5582 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) { 5583 reg_off = info->reg_off_first[1]; 5584 reg_last = info->reg_off_last[1]; 5585 5586 do { 5587 uint64_t pg = vg[reg_off >> 6]; 5588 do { 5589 if ((pg >> (reg_off & 63)) & 1) { 5590 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 5591 msize, info->page[1].attrs, 5592 wp_access, retaddr); 5593 } 5594 reg_off += esize; 5595 mem_off += msize; 5596 } while (reg_off & 63); 5597 } while (reg_off <= reg_last); 5598 } 5599 } 5600 #endif 5601 5602 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env, 5603 uint64_t *vg, target_ulong addr, int esize, 5604 int msize, uint32_t mtedesc, uintptr_t ra) 5605 { 5606 intptr_t mem_off, reg_off, reg_last; 5607 5608 /* Process the page only if MemAttr == Tagged. 
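 * (page[x].tagged was filled in by sve_probe_page above, from the
 * page's memory attributes.)
 *
 * The walk below is the predicate-iteration idiom used throughout
 * this file: the outer loop fetches one 64-bit predicate word, the
 * inner loop steps element by element up to the next 64-bit
 * boundary.  A minimal sketch, with a hypothetical visit() standing
 * in for the per-element work:
 *
 *     do {
 *         uint64_t pg = vg[reg_off >> 6];
 *         do {
 *             if ((pg >> (reg_off & 63)) & 1) {
 *                 visit(reg_off, mem_off);
 *             }
 *             reg_off += esize;
 *             mem_off += msize;
 *         } while (reg_off <= reg_last && (reg_off & 63));
 *     } while (reg_off <= reg_last);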
*/ 5609 if (info->page[0].tagged) { 5610 mem_off = info->mem_off_first[0]; 5611 reg_off = info->reg_off_first[0]; 5612 reg_last = info->reg_off_split; 5613 if (reg_last < 0) { 5614 reg_last = info->reg_off_last[0]; 5615 } 5616 5617 do { 5618 uint64_t pg = vg[reg_off >> 6]; 5619 do { 5620 if ((pg >> (reg_off & 63)) & 1) { 5621 mte_check(env, mtedesc, addr, ra); 5622 } 5623 reg_off += esize; 5624 mem_off += msize; 5625 } while (reg_off <= reg_last && (reg_off & 63)); 5626 } while (reg_off <= reg_last); 5627 } 5628 5629 mem_off = info->mem_off_first[1]; 5630 if (mem_off >= 0 && info->page[1].tagged) { 5631 reg_off = info->reg_off_first[1]; 5632 reg_last = info->reg_off_last[1]; 5633 5634 do { 5635 uint64_t pg = vg[reg_off >> 6]; 5636 do { 5637 if ((pg >> (reg_off & 63)) & 1) { 5638 mte_check(env, mtedesc, addr, ra); 5639 } 5640 reg_off += esize; 5641 mem_off += msize; 5642 } while (reg_off & 63); 5643 } while (reg_off <= reg_last); 5644 } 5645 } 5646 5647 /* 5648 * Common helper for all contiguous 1,2,3,4-register predicated stores. 5649 */ 5650 static inline QEMU_ALWAYS_INLINE 5651 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr, 5652 uint32_t desc, const uintptr_t retaddr, 5653 const int esz, const int msz, const int N, uint32_t mtedesc, 5654 sve_ldst1_host_fn *host_fn, 5655 sve_ldst1_tlb_fn *tlb_fn) 5656 { 5657 const unsigned rd = simd_data(desc); 5658 const intptr_t reg_max = simd_oprsz(desc); 5659 intptr_t reg_off, reg_last, mem_off; 5660 SVEContLdSt info; 5661 void *host; 5662 int flags, i; 5663 5664 /* Find the active elements. */ 5665 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 5666 /* The entire predicate was false; no load occurs. */ 5667 for (i = 0; i < N; ++i) { 5668 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 5669 } 5670 return; 5671 } 5672 5673 /* Probe the page(s). Exit with exception for any invalid page. */ 5674 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr); 5675 5676 /* Handle watchpoints for all active elements. */ 5677 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 5678 BP_MEM_READ, retaddr); 5679 5680 /* 5681 * Handle mte checks for all active elements. 5682 * Since TBI must be set for MTE, !mtedesc => !mte_active. 5683 */ 5684 if (mtedesc) { 5685 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 5686 mtedesc, retaddr); 5687 } 5688 5689 flags = info.page[0].flags | info.page[1].flags; 5690 if (unlikely(flags != 0)) { 5691 #ifdef CONFIG_USER_ONLY 5692 g_assert_not_reached(); 5693 #else 5694 /* 5695 * At least one page includes MMIO. 5696 * Any bus operation can fail with cpu_transaction_failed, 5697 * which for ARM will raise SyncExternal. Perform the load 5698 * into scratch memory to preserve register state until the end. 
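 * If a bus error does unwind mid-way, the architectural Zd registers
 * are thereby left with their old contents; they are overwritten from
 * the scratch copies only after every element has loaded successfully.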
5699 */ 5700 ARMVectorReg scratch[4] = { }; 5701 5702 mem_off = info.mem_off_first[0]; 5703 reg_off = info.reg_off_first[0]; 5704 reg_last = info.reg_off_last[1]; 5705 if (reg_last < 0) { 5706 reg_last = info.reg_off_split; 5707 if (reg_last < 0) { 5708 reg_last = info.reg_off_last[0]; 5709 } 5710 } 5711 5712 do { 5713 uint64_t pg = vg[reg_off >> 6]; 5714 do { 5715 if ((pg >> (reg_off & 63)) & 1) { 5716 for (i = 0; i < N; ++i) { 5717 tlb_fn(env, &scratch[i], reg_off, 5718 addr + mem_off + (i << msz), retaddr); 5719 } 5720 } 5721 reg_off += 1 << esz; 5722 mem_off += N << msz; 5723 } while (reg_off & 63); 5724 } while (reg_off <= reg_last); 5725 5726 for (i = 0; i < N; ++i) { 5727 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max); 5728 } 5729 return; 5730 #endif 5731 } 5732 5733 /* The entire operation is in RAM, on valid pages. */ 5734 5735 for (i = 0; i < N; ++i) { 5736 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 5737 } 5738 5739 mem_off = info.mem_off_first[0]; 5740 reg_off = info.reg_off_first[0]; 5741 reg_last = info.reg_off_last[0]; 5742 host = info.page[0].host; 5743 5744 while (reg_off <= reg_last) { 5745 uint64_t pg = vg[reg_off >> 6]; 5746 do { 5747 if ((pg >> (reg_off & 63)) & 1) { 5748 for (i = 0; i < N; ++i) { 5749 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5750 host + mem_off + (i << msz)); 5751 } 5752 } 5753 reg_off += 1 << esz; 5754 mem_off += N << msz; 5755 } while (reg_off <= reg_last && (reg_off & 63)); 5756 } 5757 5758 /* 5759 * Use the slow path to manage the cross-page misalignment. 5760 * But we know this is RAM and cannot trap. 5761 */ 5762 mem_off = info.mem_off_split; 5763 if (unlikely(mem_off >= 0)) { 5764 reg_off = info.reg_off_split; 5765 for (i = 0; i < N; ++i) { 5766 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 5767 addr + mem_off + (i << msz), retaddr); 5768 } 5769 } 5770 5771 mem_off = info.mem_off_first[1]; 5772 if (unlikely(mem_off >= 0)) { 5773 reg_off = info.reg_off_first[1]; 5774 reg_last = info.reg_off_last[1]; 5775 host = info.page[1].host; 5776 5777 do { 5778 uint64_t pg = vg[reg_off >> 6]; 5779 do { 5780 if ((pg >> (reg_off & 63)) & 1) { 5781 for (i = 0; i < N; ++i) { 5782 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5783 host + mem_off + (i << msz)); 5784 } 5785 } 5786 reg_off += 1 << esz; 5787 mem_off += N << msz; 5788 } while (reg_off & 63); 5789 } while (reg_off <= reg_last); 5790 } 5791 } 5792 5793 static inline QEMU_ALWAYS_INLINE 5794 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 5795 uint32_t desc, const uintptr_t ra, 5796 const int esz, const int msz, const int N, 5797 sve_ldst1_host_fn *host_fn, 5798 sve_ldst1_tlb_fn *tlb_fn) 5799 { 5800 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 5801 int bit55 = extract64(addr, 55, 1); 5802 5803 /* Remove mtedesc from the normal sve descriptor. */ 5804 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 5805 5806 /* Perform gross MTE suppression early. 
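 * If TBI is not enabled for this half of the address space (selected
 * by bit 55), or the logical tag lies in the TCMA unchecked range,
 * no tag check can fire for this access, so mtedesc is cleared and
 * the common helper below skips the MTE checks entirely.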
*/ 5807 if (!tbi_check(desc, bit55) || 5808 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { 5809 mtedesc = 0; 5810 } 5811 5812 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 5813 } 5814 5815 #define DO_LD1_1(NAME, ESZ) \ 5816 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ 5817 target_ulong addr, uint32_t desc) \ 5818 { \ 5819 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ 5820 sve_##NAME##_host, sve_##NAME##_tlb); \ 5821 } \ 5822 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ 5823 target_ulong addr, uint32_t desc) \ 5824 { \ 5825 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ 5826 sve_##NAME##_host, sve_##NAME##_tlb); \ 5827 } 5828 5829 #define DO_LD1_2(NAME, ESZ, MSZ) \ 5830 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ 5831 target_ulong addr, uint32_t desc) \ 5832 { \ 5833 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 5834 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 5835 } \ 5836 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ 5837 target_ulong addr, uint32_t desc) \ 5838 { \ 5839 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 5840 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 5841 } \ 5842 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 5843 target_ulong addr, uint32_t desc) \ 5844 { \ 5845 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 5846 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 5847 } \ 5848 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 5849 target_ulong addr, uint32_t desc) \ 5850 { \ 5851 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 5852 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 5853 } 5854 5855 DO_LD1_1(ld1bb, MO_8) 5856 DO_LD1_1(ld1bhu, MO_16) 5857 DO_LD1_1(ld1bhs, MO_16) 5858 DO_LD1_1(ld1bsu, MO_32) 5859 DO_LD1_1(ld1bss, MO_32) 5860 DO_LD1_1(ld1bdu, MO_64) 5861 DO_LD1_1(ld1bds, MO_64) 5862 5863 DO_LD1_2(ld1hh, MO_16, MO_16) 5864 DO_LD1_2(ld1hsu, MO_32, MO_16) 5865 DO_LD1_2(ld1hss, MO_32, MO_16) 5866 DO_LD1_2(ld1hdu, MO_64, MO_16) 5867 DO_LD1_2(ld1hds, MO_64, MO_16) 5868 5869 DO_LD1_2(ld1ss, MO_32, MO_32) 5870 DO_LD1_2(ld1sdu, MO_64, MO_32) 5871 DO_LD1_2(ld1sds, MO_64, MO_32) 5872 5873 DO_LD1_2(ld1dd, MO_64, MO_64) 5874 5875 #undef DO_LD1_1 5876 #undef DO_LD1_2 5877 5878 #define DO_LDN_1(N) \ 5879 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ 5880 target_ulong addr, uint32_t desc) \ 5881 { \ 5882 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ 5883 sve_ld1bb_host, sve_ld1bb_tlb); \ 5884 } \ 5885 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ 5886 target_ulong addr, uint32_t desc) \ 5887 { \ 5888 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ 5889 sve_ld1bb_host, sve_ld1bb_tlb); \ 5890 } 5891 5892 #define DO_LDN_2(N, SUFF, ESZ) \ 5893 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ 5894 target_ulong addr, uint32_t desc) \ 5895 { \ 5896 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 5897 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 5898 } \ 5899 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ 5900 target_ulong addr, uint32_t desc) \ 5901 { \ 5902 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 5903 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 5904 } \ 5905 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ 5906 target_ulong addr, uint32_t desc) \ 5907 { \ 5908 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 5909 
sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 5910 } \ 5911 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ 5912 target_ulong addr, uint32_t desc) \ 5913 { \ 5914 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 5915 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 5916 } 5917 5918 DO_LDN_1(2) 5919 DO_LDN_1(3) 5920 DO_LDN_1(4) 5921 5922 DO_LDN_2(2, hh, MO_16) 5923 DO_LDN_2(3, hh, MO_16) 5924 DO_LDN_2(4, hh, MO_16) 5925 5926 DO_LDN_2(2, ss, MO_32) 5927 DO_LDN_2(3, ss, MO_32) 5928 DO_LDN_2(4, ss, MO_32) 5929 5930 DO_LDN_2(2, dd, MO_64) 5931 DO_LDN_2(3, dd, MO_64) 5932 DO_LDN_2(4, dd, MO_64) 5933 5934 #undef DO_LDN_1 5935 #undef DO_LDN_2 5936 5937 /* 5938 * Load contiguous data, first-fault and no-fault. 5939 * 5940 * For user-only, one could argue that we should hold the mmap_lock during 5941 * the operation so that there is no race between page_check_range and the 5942 * load operation. However, unmapping pages out from under a running thread 5943 * is extraordinarily unlikely. This theoretical race condition also affects 5944 * linux-user/ in its get_user/put_user macros. 5945 * 5946 * TODO: Construct some helpers, written in assembly, that interact with 5947 * host_signal_handler to produce memory ops which can properly report errors 5948 * without racing. 5949 */ 5950 5951 /* Fault on byte I. All bits in FFR from I are cleared. The vector 5952 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE 5953 * option, which leaves subsequent data unchanged. 5954 */ 5955 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz) 5956 { 5957 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; 5958 5959 if (i & 63) { 5960 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63); 5961 i = ROUND_UP(i, 64); 5962 } 5963 for (; i < oprsz; i += 64) { 5964 ffr[i / 64] = 0; 5965 } 5966 } 5967 5968 /* 5969 * Common helper for all contiguous no-fault and first-fault loads. 5970 */ 5971 static inline QEMU_ALWAYS_INLINE 5972 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, 5973 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc, 5974 const int esz, const int msz, const SVEContFault fault, 5975 sve_ldst1_host_fn *host_fn, 5976 sve_ldst1_tlb_fn *tlb_fn) 5977 { 5978 const unsigned rd = simd_data(desc); 5979 void *vd = &env->vfp.zregs[rd]; 5980 const intptr_t reg_max = simd_oprsz(desc); 5981 intptr_t reg_off, mem_off, reg_last; 5982 SVEContLdSt info; 5983 int flags; 5984 void *host; 5985 5986 /* Find the active elements. */ 5987 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) { 5988 /* The entire predicate was false; no load occurs. */ 5989 memset(vd, 0, reg_max); 5990 return; 5991 } 5992 reg_off = info.reg_off_first[0]; 5993 5994 /* Probe the page(s). */ 5995 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) { 5996 /* Fault on first element. */ 5997 tcg_debug_assert(fault == FAULT_NO); 5998 memset(vd, 0, reg_max); 5999 goto do_fault; 6000 } 6001 6002 mem_off = info.mem_off_first[0]; 6003 flags = info.page[0].flags; 6004 6005 /* 6006 * Disable MTE checking if the Tagged bit is not set. Since TBI must 6007 * be set within MTEDESC for MTE, !mtedesc => !mte_active. 6008 */ 6009 if (!info.page[0].tagged) { 6010 mtedesc = 0; 6011 } 6012 6013 if (fault == FAULT_FIRST) { 6014 /* Trapping mte check for the first-fault element. 
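 * For FAULT_FIRST the first active element behaves like a normal
 * load and may take any fault, including a tag check fault, hence
 * the trapping mte_check just below.  Subsequent elements only ever
 * use the non-trapping mte_probe and, on a mismatch, truncate the
 * FFR via record_fault rather than raising an exception.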
*/ 6015 if (mtedesc) { 6016 mte_check(env, mtedesc, addr + mem_off, retaddr); 6017 } 6018 6019 /* 6020 * Special handling of the first active element, 6021 * if it crosses a page boundary or is MMIO. 6022 */ 6023 bool is_split = mem_off == info.mem_off_split; 6024 if (unlikely(flags != 0) || unlikely(is_split)) { 6025 /* 6026 * Use the slow path for cross-page handling. 6027 * Might trap for MMIO or watchpoints. 6028 */ 6029 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6030 6031 /* After any fault, zero the other elements. */ 6032 swap_memzero(vd, reg_off); 6033 reg_off += 1 << esz; 6034 mem_off += 1 << msz; 6035 swap_memzero(vd + reg_off, reg_max - reg_off); 6036 6037 if (is_split) { 6038 goto second_page; 6039 } 6040 } else { 6041 memset(vd, 0, reg_max); 6042 } 6043 } else { 6044 memset(vd, 0, reg_max); 6045 if (unlikely(mem_off == info.mem_off_split)) { 6046 /* The first active element crosses a page boundary. */ 6047 flags |= info.page[1].flags; 6048 if (unlikely(flags & TLB_MMIO)) { 6049 /* Some page is MMIO, see below. */ 6050 goto do_fault; 6051 } 6052 if (unlikely(flags & TLB_WATCHPOINT) && 6053 (cpu_watchpoint_address_matches 6054 (env_cpu(env), addr + mem_off, 1 << msz) 6055 & BP_MEM_READ)) { 6056 /* Watchpoint hit, see below. */ 6057 goto do_fault; 6058 } 6059 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6060 goto do_fault; 6061 } 6062 /* 6063 * Use the slow path for cross-page handling. 6064 * This is RAM, without a watchpoint, and will not trap. 6065 */ 6066 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6067 goto second_page; 6068 } 6069 } 6070 6071 /* 6072 * From this point on, all memory operations are MemSingleNF. 6073 * 6074 * Per the MemSingleNF pseudocode, a no-fault load from Device memory 6075 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead. 6076 * 6077 * Unfortuately we do not have access to the memory attributes from the 6078 * PTE to tell Device memory from Normal memory. So we make a mostly 6079 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO. 6080 * This gives the right answer for the common cases of "Normal memory, 6081 * backed by host RAM" and "Device memory, backed by MMIO". 6082 * The architecture allows us to suppress an NF load and return 6083 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner 6084 * case of "Normal memory, backed by MMIO" is permitted. The case we 6085 * get wrong is "Device memory, backed by host RAM", for which we 6086 * should return (UNKNOWN, FAULT) for but do not. 6087 * 6088 * Similarly, CPU_BP breakpoints would raise exceptions, and so 6089 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and 6090 * architectural breakpoints the same. 6091 */ 6092 if (unlikely(flags & TLB_MMIO)) { 6093 goto do_fault; 6094 } 6095 6096 reg_last = info.reg_off_last[0]; 6097 host = info.page[0].host; 6098 6099 do { 6100 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3)); 6101 do { 6102 if ((pg >> (reg_off & 63)) & 1) { 6103 if (unlikely(flags & TLB_WATCHPOINT) && 6104 (cpu_watchpoint_address_matches 6105 (env_cpu(env), addr + mem_off, 1 << msz) 6106 & BP_MEM_READ)) { 6107 goto do_fault; 6108 } 6109 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6110 goto do_fault; 6111 } 6112 host_fn(vd, reg_off, host + mem_off); 6113 } 6114 reg_off += 1 << esz; 6115 mem_off += 1 << msz; 6116 } while (reg_off <= reg_last && (reg_off & 63)); 6117 } while (reg_off <= reg_last); 6118 6119 /* 6120 * MemSingleNF is allowed to fail for any reason. 
We have special 6121 * code above to handle the first element crossing a page boundary. 6122 * As an implementation choice, decline to handle a cross-page element 6123 * in any other position. 6124 */ 6125 reg_off = info.reg_off_split; 6126 if (reg_off >= 0) { 6127 goto do_fault; 6128 } 6129 6130 second_page: 6131 reg_off = info.reg_off_first[1]; 6132 if (likely(reg_off < 0)) { 6133 /* No active elements on the second page. All done. */ 6134 return; 6135 } 6136 6137 /* 6138 * MemSingleNF is allowed to fail for any reason. As an implementation 6139 * choice, decline to handle elements on the second page. This should 6140 * be low frequency as the guest walks through memory -- the next 6141 * iteration of the guest's loop should be aligned on the page boundary, 6142 * and then all following iterations will stay aligned. 6143 */ 6144 6145 do_fault: 6146 record_fault(env, reg_off, reg_max); 6147 } 6148 6149 static inline QEMU_ALWAYS_INLINE 6150 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, 6151 uint32_t desc, const uintptr_t retaddr, 6152 const int esz, const int msz, const SVEContFault fault, 6153 sve_ldst1_host_fn *host_fn, 6154 sve_ldst1_tlb_fn *tlb_fn) 6155 { 6156 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6157 int bit55 = extract64(addr, 55, 1); 6158 6159 /* Remove mtedesc from the normal sve descriptor. */ 6160 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6161 6162 /* Perform gross MTE suppression early. */ 6163 if (!tbi_check(desc, bit55) || 6164 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { 6165 mtedesc = 0; 6166 } 6167 6168 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc, 6169 esz, msz, fault, host_fn, tlb_fn); 6170 } 6171 6172 #define DO_LDFF1_LDNF1_1(PART, ESZ) \ 6173 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ 6174 target_ulong addr, uint32_t desc) \ 6175 { \ 6176 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ 6177 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6178 } \ 6179 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ 6180 target_ulong addr, uint32_t desc) \ 6181 { \ 6182 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ 6183 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6184 } \ 6185 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6186 target_ulong addr, uint32_t desc) \ 6187 { \ 6188 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ 6189 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6190 } \ 6191 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6192 target_ulong addr, uint32_t desc) \ 6193 { \ 6194 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ 6195 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6196 } 6197 6198 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ 6199 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ 6200 target_ulong addr, uint32_t desc) \ 6201 { \ 6202 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6203 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6204 } \ 6205 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ 6206 target_ulong addr, uint32_t desc) \ 6207 { \ 6208 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6209 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6210 } \ 6211 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ 6212 target_ulong addr, uint32_t desc) \ 6213 { \ 6214 sve_ldnfff1_r(env, vg, addr, desc, 
GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6215 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6216 } \ 6217 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ 6218 target_ulong addr, uint32_t desc) \ 6219 { \ 6220 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6221 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6222 } \ 6223 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6224 target_ulong addr, uint32_t desc) \ 6225 { \ 6226 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6227 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6228 } \ 6229 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6230 target_ulong addr, uint32_t desc) \ 6231 { \ 6232 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6233 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6234 } \ 6235 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6236 target_ulong addr, uint32_t desc) \ 6237 { \ 6238 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6239 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6240 } \ 6241 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6242 target_ulong addr, uint32_t desc) \ 6243 { \ 6244 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6245 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6246 } 6247 6248 DO_LDFF1_LDNF1_1(bb, MO_8) 6249 DO_LDFF1_LDNF1_1(bhu, MO_16) 6250 DO_LDFF1_LDNF1_1(bhs, MO_16) 6251 DO_LDFF1_LDNF1_1(bsu, MO_32) 6252 DO_LDFF1_LDNF1_1(bss, MO_32) 6253 DO_LDFF1_LDNF1_1(bdu, MO_64) 6254 DO_LDFF1_LDNF1_1(bds, MO_64) 6255 6256 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16) 6257 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16) 6258 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16) 6259 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16) 6260 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16) 6261 6262 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32) 6263 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32) 6264 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32) 6265 6266 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64) 6267 6268 #undef DO_LDFF1_LDNF1_1 6269 #undef DO_LDFF1_LDNF1_2 6270 6271 /* 6272 * Common helper for all contiguous 1,2,3,4-register predicated stores. 6273 */ 6274 6275 static inline QEMU_ALWAYS_INLINE 6276 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, 6277 uint32_t desc, const uintptr_t retaddr, 6278 const int esz, const int msz, const int N, uint32_t mtedesc, 6279 sve_ldst1_host_fn *host_fn, 6280 sve_ldst1_tlb_fn *tlb_fn) 6281 { 6282 const unsigned rd = simd_data(desc); 6283 const intptr_t reg_max = simd_oprsz(desc); 6284 intptr_t reg_off, reg_last, mem_off; 6285 SVEContLdSt info; 6286 void *host; 6287 int i, flags; 6288 6289 /* Find the active elements. */ 6290 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 6291 /* The entire predicate was false; no store occurs. */ 6292 return; 6293 } 6294 6295 /* Probe the page(s). Exit with exception for any invalid page. */ 6296 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr); 6297 6298 /* Handle watchpoints for all active elements. */ 6299 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 6300 BP_MEM_WRITE, retaddr); 6301 6302 /* 6303 * Handle mte checks for all active elements. 6304 * Since TBI must be set for MTE, !mtedesc => !mte_active. 
6305 */ 6306 if (mtedesc) { 6307 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 6308 mtedesc, retaddr); 6309 } 6310 6311 flags = info.page[0].flags | info.page[1].flags; 6312 if (unlikely(flags != 0)) { 6313 #ifdef CONFIG_USER_ONLY 6314 g_assert_not_reached(); 6315 #else 6316 /* 6317 * At least one page includes MMIO. 6318 * Any bus operation can fail with cpu_transaction_failed, 6319 * which for ARM will raise SyncExternal. We cannot avoid 6320 * this fault and will leave with the store incomplete. 6321 */ 6322 mem_off = info.mem_off_first[0]; 6323 reg_off = info.reg_off_first[0]; 6324 reg_last = info.reg_off_last[1]; 6325 if (reg_last < 0) { 6326 reg_last = info.reg_off_split; 6327 if (reg_last < 0) { 6328 reg_last = info.reg_off_last[0]; 6329 } 6330 } 6331 6332 do { 6333 uint64_t pg = vg[reg_off >> 6]; 6334 do { 6335 if ((pg >> (reg_off & 63)) & 1) { 6336 for (i = 0; i < N; ++i) { 6337 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6338 addr + mem_off + (i << msz), retaddr); 6339 } 6340 } 6341 reg_off += 1 << esz; 6342 mem_off += N << msz; 6343 } while (reg_off & 63); 6344 } while (reg_off <= reg_last); 6345 return; 6346 #endif 6347 } 6348 6349 mem_off = info.mem_off_first[0]; 6350 reg_off = info.reg_off_first[0]; 6351 reg_last = info.reg_off_last[0]; 6352 host = info.page[0].host; 6353 6354 while (reg_off <= reg_last) { 6355 uint64_t pg = vg[reg_off >> 6]; 6356 do { 6357 if ((pg >> (reg_off & 63)) & 1) { 6358 for (i = 0; i < N; ++i) { 6359 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6360 host + mem_off + (i << msz)); 6361 } 6362 } 6363 reg_off += 1 << esz; 6364 mem_off += N << msz; 6365 } while (reg_off <= reg_last && (reg_off & 63)); 6366 } 6367 6368 /* 6369 * Use the slow path to manage the cross-page misalignment. 6370 * But we know this is RAM and cannot trap. 6371 */ 6372 mem_off = info.mem_off_split; 6373 if (unlikely(mem_off >= 0)) { 6374 reg_off = info.reg_off_split; 6375 for (i = 0; i < N; ++i) { 6376 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6377 addr + mem_off + (i << msz), retaddr); 6378 } 6379 } 6380 6381 mem_off = info.mem_off_first[1]; 6382 if (unlikely(mem_off >= 0)) { 6383 reg_off = info.reg_off_first[1]; 6384 reg_last = info.reg_off_last[1]; 6385 host = info.page[1].host; 6386 6387 do { 6388 uint64_t pg = vg[reg_off >> 6]; 6389 do { 6390 if ((pg >> (reg_off & 63)) & 1) { 6391 for (i = 0; i < N; ++i) { 6392 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6393 host + mem_off + (i << msz)); 6394 } 6395 } 6396 reg_off += 1 << esz; 6397 mem_off += N << msz; 6398 } while (reg_off & 63); 6399 } while (reg_off <= reg_last); 6400 } 6401 } 6402 6403 static inline QEMU_ALWAYS_INLINE 6404 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6405 uint32_t desc, const uintptr_t ra, 6406 const int esz, const int msz, const int N, 6407 sve_ldst1_host_fn *host_fn, 6408 sve_ldst1_tlb_fn *tlb_fn) 6409 { 6410 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6411 int bit55 = extract64(addr, 55, 1); 6412 6413 /* Remove mtedesc from the normal sve descriptor. */ 6414 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6415 6416 /* Perform gross MTE suppression early. 
*/ 6417 if (!tbi_check(desc, bit55) || 6418 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { 6419 mtedesc = 0; 6420 } 6421 6422 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6423 } 6424 6425 #define DO_STN_1(N, NAME, ESZ) \ 6426 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ 6427 target_ulong addr, uint32_t desc) \ 6428 { \ 6429 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ 6430 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6431 } \ 6432 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ 6433 target_ulong addr, uint32_t desc) \ 6434 { \ 6435 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ 6436 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6437 } 6438 6439 #define DO_STN_2(N, NAME, ESZ, MSZ) \ 6440 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ 6441 target_ulong addr, uint32_t desc) \ 6442 { \ 6443 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6444 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6445 } \ 6446 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ 6447 target_ulong addr, uint32_t desc) \ 6448 { \ 6449 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6450 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6451 } \ 6452 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6453 target_ulong addr, uint32_t desc) \ 6454 { \ 6455 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6456 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6457 } \ 6458 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6459 target_ulong addr, uint32_t desc) \ 6460 { \ 6461 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6462 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6463 } 6464 6465 DO_STN_1(1, bb, MO_8) 6466 DO_STN_1(1, bh, MO_16) 6467 DO_STN_1(1, bs, MO_32) 6468 DO_STN_1(1, bd, MO_64) 6469 DO_STN_1(2, bb, MO_8) 6470 DO_STN_1(3, bb, MO_8) 6471 DO_STN_1(4, bb, MO_8) 6472 6473 DO_STN_2(1, hh, MO_16, MO_16) 6474 DO_STN_2(1, hs, MO_32, MO_16) 6475 DO_STN_2(1, hd, MO_64, MO_16) 6476 DO_STN_2(2, hh, MO_16, MO_16) 6477 DO_STN_2(3, hh, MO_16, MO_16) 6478 DO_STN_2(4, hh, MO_16, MO_16) 6479 6480 DO_STN_2(1, ss, MO_32, MO_32) 6481 DO_STN_2(1, sd, MO_64, MO_32) 6482 DO_STN_2(2, ss, MO_32, MO_32) 6483 DO_STN_2(3, ss, MO_32, MO_32) 6484 DO_STN_2(4, ss, MO_32, MO_32) 6485 6486 DO_STN_2(1, dd, MO_64, MO_64) 6487 DO_STN_2(2, dd, MO_64, MO_64) 6488 DO_STN_2(3, dd, MO_64, MO_64) 6489 DO_STN_2(4, dd, MO_64, MO_64) 6490 6491 #undef DO_STN_1 6492 #undef DO_STN_2 6493 6494 /* 6495 * Loads with a vector index. 6496 */ 6497 6498 /* 6499 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed. 
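 *
 * The extraction helpers below cover the five offset forms used by
 * the gather/scatter helpers:
 *
 *     off_zsu_s   32-bit elements, 32-bit offsets zero-extended
 *     off_zss_s   32-bit elements, 32-bit offsets sign-extended
 *     off_zsu_d   64-bit elements, low 32 bits zero-extended
 *     off_zss_d   64-bit elements, low 32 bits sign-extended
 *     off_zd_d    64-bit elements, full 64-bit offsets
 *
 * The element address is then computed in the loops below as
 *
 *     addr = base + (off_fn(vm, reg_off) << scale);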
6500 */ 6501 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs); 6502 6503 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs) 6504 { 6505 return *(uint32_t *)(reg + H1_4(reg_ofs)); 6506 } 6507 6508 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs) 6509 { 6510 return *(int32_t *)(reg + H1_4(reg_ofs)); 6511 } 6512 6513 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs) 6514 { 6515 return (uint32_t)*(uint64_t *)(reg + reg_ofs); 6516 } 6517 6518 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs) 6519 { 6520 return (int32_t)*(uint64_t *)(reg + reg_ofs); 6521 } 6522 6523 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs) 6524 { 6525 return *(uint64_t *)(reg + reg_ofs); 6526 } 6527 6528 static inline QEMU_ALWAYS_INLINE 6529 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6530 target_ulong base, uint32_t desc, uintptr_t retaddr, 6531 uint32_t mtedesc, int esize, int msize, 6532 zreg_off_fn *off_fn, 6533 sve_ldst1_host_fn *host_fn, 6534 sve_ldst1_tlb_fn *tlb_fn) 6535 { 6536 const int mmu_idx = cpu_mmu_index(env, false); 6537 const intptr_t reg_max = simd_oprsz(desc); 6538 const int scale = simd_data(desc); 6539 ARMVectorReg scratch; 6540 intptr_t reg_off; 6541 SVEHostPage info, info2; 6542 6543 memset(&scratch, 0, reg_max); 6544 reg_off = 0; 6545 do { 6546 uint64_t pg = vg[reg_off >> 6]; 6547 do { 6548 if (likely(pg & 1)) { 6549 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6550 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 6551 6552 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD, 6553 mmu_idx, retaddr); 6554 6555 if (likely(in_page >= msize)) { 6556 if (unlikely(info.flags & TLB_WATCHPOINT)) { 6557 cpu_check_watchpoint(env_cpu(env), addr, msize, 6558 info.attrs, BP_MEM_READ, retaddr); 6559 } 6560 if (mtedesc && info.tagged) { 6561 mte_check(env, mtedesc, addr, retaddr); 6562 } 6563 if (unlikely(info.flags & TLB_MMIO)) { 6564 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6565 } else { 6566 host_fn(&scratch, reg_off, info.host); 6567 } 6568 } else { 6569 /* Element crosses the page boundary. */ 6570 sve_probe_page(&info2, false, env, addr + in_page, 0, 6571 MMU_DATA_LOAD, mmu_idx, retaddr); 6572 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) { 6573 cpu_check_watchpoint(env_cpu(env), addr, 6574 msize, info.attrs, 6575 BP_MEM_READ, retaddr); 6576 } 6577 if (mtedesc && info.tagged) { 6578 mte_check(env, mtedesc, addr, retaddr); 6579 } 6580 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6581 } 6582 } 6583 reg_off += esize; 6584 pg >>= esize; 6585 } while (reg_off & 63); 6586 } while (reg_off < reg_max); 6587 6588 /* Wait until all exceptions have been raised to write back. */ 6589 memcpy(vd, &scratch, reg_max); 6590 } 6591 6592 static inline QEMU_ALWAYS_INLINE 6593 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6594 target_ulong base, uint32_t desc, uintptr_t retaddr, 6595 int esize, int msize, zreg_off_fn *off_fn, 6596 sve_ldst1_host_fn *host_fn, 6597 sve_ldst1_tlb_fn *tlb_fn) 6598 { 6599 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6600 /* Remove mtedesc from the normal sve descriptor. */ 6601 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6602 6603 /* 6604 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 6605 * offset base entirely over the address space hole to change the 6606 * pointer tag, or change the bit55 selector. So we could here 6607 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
6608 */ 6609 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 6610 esize, msize, off_fn, host_fn, tlb_fn); 6611 } 6612 6613 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ 6614 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6615 void *vm, target_ulong base, uint32_t desc) \ 6616 { \ 6617 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 6618 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6619 } \ 6620 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6621 void *vm, target_ulong base, uint32_t desc) \ 6622 { \ 6623 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 6624 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6625 } 6626 6627 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ 6628 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6629 void *vm, target_ulong base, uint32_t desc) \ 6630 { \ 6631 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 6632 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6633 } \ 6634 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6635 void *vm, target_ulong base, uint32_t desc) \ 6636 { \ 6637 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 6638 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6639 } 6640 6641 DO_LD1_ZPZ_S(bsu, zsu, MO_8) 6642 DO_LD1_ZPZ_S(bsu, zss, MO_8) 6643 DO_LD1_ZPZ_D(bdu, zsu, MO_8) 6644 DO_LD1_ZPZ_D(bdu, zss, MO_8) 6645 DO_LD1_ZPZ_D(bdu, zd, MO_8) 6646 6647 DO_LD1_ZPZ_S(bss, zsu, MO_8) 6648 DO_LD1_ZPZ_S(bss, zss, MO_8) 6649 DO_LD1_ZPZ_D(bds, zsu, MO_8) 6650 DO_LD1_ZPZ_D(bds, zss, MO_8) 6651 DO_LD1_ZPZ_D(bds, zd, MO_8) 6652 6653 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16) 6654 DO_LD1_ZPZ_S(hsu_le, zss, MO_16) 6655 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16) 6656 DO_LD1_ZPZ_D(hdu_le, zss, MO_16) 6657 DO_LD1_ZPZ_D(hdu_le, zd, MO_16) 6658 6659 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16) 6660 DO_LD1_ZPZ_S(hsu_be, zss, MO_16) 6661 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16) 6662 DO_LD1_ZPZ_D(hdu_be, zss, MO_16) 6663 DO_LD1_ZPZ_D(hdu_be, zd, MO_16) 6664 6665 DO_LD1_ZPZ_S(hss_le, zsu, MO_16) 6666 DO_LD1_ZPZ_S(hss_le, zss, MO_16) 6667 DO_LD1_ZPZ_D(hds_le, zsu, MO_16) 6668 DO_LD1_ZPZ_D(hds_le, zss, MO_16) 6669 DO_LD1_ZPZ_D(hds_le, zd, MO_16) 6670 6671 DO_LD1_ZPZ_S(hss_be, zsu, MO_16) 6672 DO_LD1_ZPZ_S(hss_be, zss, MO_16) 6673 DO_LD1_ZPZ_D(hds_be, zsu, MO_16) 6674 DO_LD1_ZPZ_D(hds_be, zss, MO_16) 6675 DO_LD1_ZPZ_D(hds_be, zd, MO_16) 6676 6677 DO_LD1_ZPZ_S(ss_le, zsu, MO_32) 6678 DO_LD1_ZPZ_S(ss_le, zss, MO_32) 6679 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32) 6680 DO_LD1_ZPZ_D(sdu_le, zss, MO_32) 6681 DO_LD1_ZPZ_D(sdu_le, zd, MO_32) 6682 6683 DO_LD1_ZPZ_S(ss_be, zsu, MO_32) 6684 DO_LD1_ZPZ_S(ss_be, zss, MO_32) 6685 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32) 6686 DO_LD1_ZPZ_D(sdu_be, zss, MO_32) 6687 DO_LD1_ZPZ_D(sdu_be, zd, MO_32) 6688 6689 DO_LD1_ZPZ_D(sds_le, zsu, MO_32) 6690 DO_LD1_ZPZ_D(sds_le, zss, MO_32) 6691 DO_LD1_ZPZ_D(sds_le, zd, MO_32) 6692 6693 DO_LD1_ZPZ_D(sds_be, zsu, MO_32) 6694 DO_LD1_ZPZ_D(sds_be, zss, MO_32) 6695 DO_LD1_ZPZ_D(sds_be, zd, MO_32) 6696 6697 DO_LD1_ZPZ_D(dd_le, zsu, MO_64) 6698 DO_LD1_ZPZ_D(dd_le, zss, MO_64) 6699 DO_LD1_ZPZ_D(dd_le, zd, MO_64) 6700 6701 DO_LD1_ZPZ_D(dd_be, zsu, MO_64) 6702 DO_LD1_ZPZ_D(dd_be, zss, MO_64) 6703 DO_LD1_ZPZ_D(dd_be, zd, MO_64) 6704 6705 #undef DO_LD1_ZPZ_S 6706 #undef DO_LD1_ZPZ_D 6707 6708 /* First fault loads with a vector index. */ 6709 6710 /* 6711 * Common helpers for all gather first-faulting loads. 
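 *
 * As with the contiguous first-fault forms, only the first active
 * element may take a fault: it is loaded through the trapping tlb
 * path (with a trapping mte_check).  Every later element is probed
 * with the nofault sve_probe_page and mte_probe; on any problem --
 * invalid page, MMIO, a watchpoint hit, a tag mismatch, or an
 * element spanning a page boundary -- the load stops and
 * record_fault() clears the FFR from that element onward.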
6712 */ 6713 6714 static inline QEMU_ALWAYS_INLINE 6715 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6716 target_ulong base, uint32_t desc, uintptr_t retaddr, 6717 uint32_t mtedesc, const int esz, const int msz, 6718 zreg_off_fn *off_fn, 6719 sve_ldst1_host_fn *host_fn, 6720 sve_ldst1_tlb_fn *tlb_fn) 6721 { 6722 const int mmu_idx = cpu_mmu_index(env, false); 6723 const intptr_t reg_max = simd_oprsz(desc); 6724 const int scale = simd_data(desc); 6725 const int esize = 1 << esz; 6726 const int msize = 1 << msz; 6727 intptr_t reg_off; 6728 SVEHostPage info; 6729 target_ulong addr, in_page; 6730 ARMVectorReg scratch; 6731 6732 /* Skip to the first true predicate. */ 6733 reg_off = find_next_active(vg, 0, reg_max, esz); 6734 if (unlikely(reg_off >= reg_max)) { 6735 /* The entire predicate was false; no load occurs. */ 6736 memset(vd, 0, reg_max); 6737 return; 6738 } 6739 6740 /* Protect against overlap between vd and vm. */ 6741 if (unlikely(vd == vm)) { 6742 vm = memcpy(&scratch, vm, reg_max); 6743 } 6744 6745 /* 6746 * Probe the first element, allowing faults. 6747 */ 6748 addr = base + (off_fn(vm, reg_off) << scale); 6749 if (mtedesc) { 6750 mte_check(env, mtedesc, addr, retaddr); 6751 } 6752 tlb_fn(env, vd, reg_off, addr, retaddr); 6753 6754 /* After any fault, zero the other elements. */ 6755 swap_memzero(vd, reg_off); 6756 reg_off += esize; 6757 swap_memzero(vd + reg_off, reg_max - reg_off); 6758 6759 /* 6760 * Probe the remaining elements, not allowing faults. 6761 */ 6762 while (reg_off < reg_max) { 6763 uint64_t pg = vg[reg_off >> 6]; 6764 do { 6765 if (likely((pg >> (reg_off & 63)) & 1)) { 6766 addr = base + (off_fn(vm, reg_off) << scale); 6767 in_page = -(addr | TARGET_PAGE_MASK); 6768 6769 if (unlikely(in_page < msize)) { 6770 /* Stop if the element crosses a page boundary. */ 6771 goto fault; 6772 } 6773 6774 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD, 6775 mmu_idx, retaddr); 6776 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) { 6777 goto fault; 6778 } 6779 if (unlikely(info.flags & TLB_WATCHPOINT) && 6780 (cpu_watchpoint_address_matches 6781 (env_cpu(env), addr, msize) & BP_MEM_READ)) { 6782 goto fault; 6783 } 6784 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) { 6785 goto fault; 6786 } 6787 6788 host_fn(vd, reg_off, info.host); 6789 } 6790 reg_off += esize; 6791 } while (reg_off & 63); 6792 } 6793 return; 6794 6795 fault: 6796 record_fault(env, reg_off, reg_max); 6797 } 6798 6799 static inline QEMU_ALWAYS_INLINE 6800 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6801 target_ulong base, uint32_t desc, uintptr_t retaddr, 6802 const int esz, const int msz, 6803 zreg_off_fn *off_fn, 6804 sve_ldst1_host_fn *host_fn, 6805 sve_ldst1_tlb_fn *tlb_fn) 6806 { 6807 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6808 /* Remove mtedesc from the normal sve descriptor. */ 6809 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6810 6811 /* 6812 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 6813 * offset base entirely over the address space hole to change the 6814 * pointer tag, or change the bit55 selector. So we could here 6815 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
6816 */ 6817 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 6818 esz, msz, off_fn, host_fn, tlb_fn); 6819 } 6820 6821 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ 6822 void HELPER(sve_ldff##MEM##_##OFS) \ 6823 (CPUARMState *env, void *vd, void *vg, \ 6824 void *vm, target_ulong base, uint32_t desc) \ 6825 { \ 6826 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \ 6827 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6828 } \ 6829 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 6830 (CPUARMState *env, void *vd, void *vg, \ 6831 void *vm, target_ulong base, uint32_t desc) \ 6832 { \ 6833 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ 6834 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6835 } 6836 6837 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ 6838 void HELPER(sve_ldff##MEM##_##OFS) \ 6839 (CPUARMState *env, void *vd, void *vg, \ 6840 void *vm, target_ulong base, uint32_t desc) \ 6841 { \ 6842 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \ 6843 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6844 } \ 6845 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 6846 (CPUARMState *env, void *vd, void *vg, \ 6847 void *vm, target_ulong base, uint32_t desc) \ 6848 { \ 6849 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ 6850 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6851 } 6852 6853 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) 6854 DO_LDFF1_ZPZ_S(bsu, zss, MO_8) 6855 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8) 6856 DO_LDFF1_ZPZ_D(bdu, zss, MO_8) 6857 DO_LDFF1_ZPZ_D(bdu, zd, MO_8) 6858 6859 DO_LDFF1_ZPZ_S(bss, zsu, MO_8) 6860 DO_LDFF1_ZPZ_S(bss, zss, MO_8) 6861 DO_LDFF1_ZPZ_D(bds, zsu, MO_8) 6862 DO_LDFF1_ZPZ_D(bds, zss, MO_8) 6863 DO_LDFF1_ZPZ_D(bds, zd, MO_8) 6864 6865 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16) 6866 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16) 6867 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16) 6868 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16) 6869 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16) 6870 6871 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16) 6872 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16) 6873 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16) 6874 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16) 6875 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16) 6876 6877 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16) 6878 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16) 6879 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16) 6880 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16) 6881 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16) 6882 6883 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16) 6884 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16) 6885 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16) 6886 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16) 6887 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16) 6888 6889 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32) 6890 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32) 6891 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32) 6892 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32) 6893 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32) 6894 6895 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32) 6896 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32) 6897 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32) 6898 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32) 6899 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32) 6900 6901 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32) 6902 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32) 6903 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32) 6904 6905 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32) 6906 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32) 6907 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32) 6908 6909 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64) 6910 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64) 6911 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64) 6912 6913 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64) 6914 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64) 6915 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64) 6916 6917 /* Stores with a vector index. 
*/ 6918 6919 static inline QEMU_ALWAYS_INLINE 6920 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6921 target_ulong base, uint32_t desc, uintptr_t retaddr, 6922 uint32_t mtedesc, int esize, int msize, 6923 zreg_off_fn *off_fn, 6924 sve_ldst1_host_fn *host_fn, 6925 sve_ldst1_tlb_fn *tlb_fn) 6926 { 6927 const int mmu_idx = cpu_mmu_index(env, false); 6928 const intptr_t reg_max = simd_oprsz(desc); 6929 const int scale = simd_data(desc); 6930 void *host[ARM_MAX_VQ * 4]; 6931 intptr_t reg_off, i; 6932 SVEHostPage info, info2; 6933 6934 /* 6935 * Probe all of the elements for host addresses and flags. 6936 */ 6937 i = reg_off = 0; 6938 do { 6939 uint64_t pg = vg[reg_off >> 6]; 6940 do { 6941 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6942 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 6943 6944 host[i] = NULL; 6945 if (likely((pg >> (reg_off & 63)) & 1)) { 6946 if (likely(in_page >= msize)) { 6947 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE, 6948 mmu_idx, retaddr); 6949 if (!(info.flags & TLB_MMIO)) { 6950 host[i] = info.host; 6951 } 6952 } else { 6953 /* 6954 * Element crosses the page boundary. 6955 * Probe both pages, but do not record the host address, 6956 * so that we use the slow path. 6957 */ 6958 sve_probe_page(&info, false, env, addr, 0, 6959 MMU_DATA_STORE, mmu_idx, retaddr); 6960 sve_probe_page(&info2, false, env, addr + in_page, 0, 6961 MMU_DATA_STORE, mmu_idx, retaddr); 6962 info.flags |= info2.flags; 6963 } 6964 6965 if (unlikely(info.flags & TLB_WATCHPOINT)) { 6966 cpu_check_watchpoint(env_cpu(env), addr, msize, 6967 info.attrs, BP_MEM_WRITE, retaddr); 6968 } 6969 6970 if (mtedesc && info.tagged) { 6971 mte_check(env, mtedesc, addr, retaddr); 6972 } 6973 } 6974 i += 1; 6975 reg_off += esize; 6976 } while (reg_off & 63); 6977 } while (reg_off < reg_max); 6978 6979 /* 6980 * Now that we have recognized all exceptions except SyncExternal 6981 * (from TLB_MMIO), which we cannot avoid, perform all of the stores. 6982 * 6983 * Note for the common case of an element in RAM, not crossing a page 6984 * boundary, we have stored the host address in host[]. This doubles 6985 * as a first-level check against the predicate, since only enabled 6986 * elements have non-null host addresses. 6987 */ 6988 i = reg_off = 0; 6989 do { 6990 void *h = host[i]; 6991 if (likely(h != NULL)) { 6992 host_fn(vd, reg_off, h); 6993 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) { 6994 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6995 tlb_fn(env, vd, reg_off, addr, retaddr); 6996 } 6997 i += 1; 6998 reg_off += esize; 6999 } while (reg_off < reg_max); 7000 } 7001 7002 static inline QEMU_ALWAYS_INLINE 7003 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7004 target_ulong base, uint32_t desc, uintptr_t retaddr, 7005 int esize, int msize, zreg_off_fn *off_fn, 7006 sve_ldst1_host_fn *host_fn, 7007 sve_ldst1_tlb_fn *tlb_fn) 7008 { 7009 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7010 /* Remove mtedesc from the normal sve descriptor. */ 7011 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7012 7013 /* 7014 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7015 * offset base entirely over the address space hole to change the 7016 * pointer tag, or change the bit55 selector. So we could here 7017 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
7018 */ 7019 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 7020 esize, msize, off_fn, host_fn, tlb_fn); 7021 } 7022 7023 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \ 7024 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7025 void *vm, target_ulong base, uint32_t desc) \ 7026 { \ 7027 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 7028 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7029 } \ 7030 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7031 void *vm, target_ulong base, uint32_t desc) \ 7032 { \ 7033 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 7034 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7035 } 7036 7037 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \ 7038 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7039 void *vm, target_ulong base, uint32_t desc) \ 7040 { \ 7041 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 7042 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7043 } \ 7044 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7045 void *vm, target_ulong base, uint32_t desc) \ 7046 { \ 7047 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 7048 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7049 } 7050 7051 DO_ST1_ZPZ_S(bs, zsu, MO_8) 7052 DO_ST1_ZPZ_S(hs_le, zsu, MO_16) 7053 DO_ST1_ZPZ_S(hs_be, zsu, MO_16) 7054 DO_ST1_ZPZ_S(ss_le, zsu, MO_32) 7055 DO_ST1_ZPZ_S(ss_be, zsu, MO_32) 7056 7057 DO_ST1_ZPZ_S(bs, zss, MO_8) 7058 DO_ST1_ZPZ_S(hs_le, zss, MO_16) 7059 DO_ST1_ZPZ_S(hs_be, zss, MO_16) 7060 DO_ST1_ZPZ_S(ss_le, zss, MO_32) 7061 DO_ST1_ZPZ_S(ss_be, zss, MO_32) 7062 7063 DO_ST1_ZPZ_D(bd, zsu, MO_8) 7064 DO_ST1_ZPZ_D(hd_le, zsu, MO_16) 7065 DO_ST1_ZPZ_D(hd_be, zsu, MO_16) 7066 DO_ST1_ZPZ_D(sd_le, zsu, MO_32) 7067 DO_ST1_ZPZ_D(sd_be, zsu, MO_32) 7068 DO_ST1_ZPZ_D(dd_le, zsu, MO_64) 7069 DO_ST1_ZPZ_D(dd_be, zsu, MO_64) 7070 7071 DO_ST1_ZPZ_D(bd, zss, MO_8) 7072 DO_ST1_ZPZ_D(hd_le, zss, MO_16) 7073 DO_ST1_ZPZ_D(hd_be, zss, MO_16) 7074 DO_ST1_ZPZ_D(sd_le, zss, MO_32) 7075 DO_ST1_ZPZ_D(sd_be, zss, MO_32) 7076 DO_ST1_ZPZ_D(dd_le, zss, MO_64) 7077 DO_ST1_ZPZ_D(dd_be, zss, MO_64) 7078 7079 DO_ST1_ZPZ_D(bd, zd, MO_8) 7080 DO_ST1_ZPZ_D(hd_le, zd, MO_16) 7081 DO_ST1_ZPZ_D(hd_be, zd, MO_16) 7082 DO_ST1_ZPZ_D(sd_le, zd, MO_32) 7083 DO_ST1_ZPZ_D(sd_be, zd, MO_32) 7084 DO_ST1_ZPZ_D(dd_le, zd, MO_64) 7085 DO_ST1_ZPZ_D(dd_be, zd, MO_64) 7086 7087 #undef DO_ST1_ZPZ_S 7088 #undef DO_ST1_ZPZ_D 7089 7090 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7091 { 7092 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7093 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7094 7095 for (i = 0; i < opr_sz; ++i) { 7096 d[i] = n[i] ^ m[i] ^ k[i]; 7097 } 7098 } 7099 7100 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7101 { 7102 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7103 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7104 7105 for (i = 0; i < opr_sz; ++i) { 7106 d[i] = n[i] ^ (m[i] & ~k[i]); 7107 } 7108 } 7109 7110 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7111 { 7112 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7113 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7114 7115 for (i = 0; i < opr_sz; ++i) { 7116 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]); 7117 } 7118 } 7119 7120 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7121 { 7122 intptr_t i, opr_sz = simd_oprsz(desc) / 

/*
 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
 * See hasless(v,1) from
 *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    int bits = 8 << esz;
    uint64_t ones = dup_const(esz, 1);
    uint64_t signs = ones << (bits - 1);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(esz, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;
    cmp0 = (cmp0 - ones) & ~cmp0;
    cmp1 = (cmp1 - ones) & ~cmp1;
    return (cmp0 | cmp1) & signs;
}
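
/*
 * Illustrative sketch (not part of the build): an element-at-a-time
 * equivalent of do_match2 above, useful as a mental model for the bit
 * trick.  The name ref_match2 is hypothetical.
 */
#if 0
static bool ref_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    int bits = 8 << esz;                 /* 8 (MO_8) or 16 (MO_16) */
    uint64_t emask = (1ull << bits) - 1;
    uint64_t elt = n & emask;            /* low element of n */

    for (int i = 0; i < 64; i += bits) {
        if (((m0 >> i) & emask) == elt || ((m1 >> i) & emask) == elt) {
            return true;
        }
    }
    return false;
}
#endif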

static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
                                uint32_t desc, int esz, bool nmatch)
{
    uint16_t esz_mask = pred_esz_masks[esz];
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t flags = PREDTEST_INIT;
    intptr_t i, j, k;

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
        uint16_t out = 0;

        for (j = 0; j < 16; j += 8) {
            uint64_t n = *(uint64_t *)(vn + i + j);

            for (k = 0; k < 8; k += 1 << esz) {
                if (pg & (1 << (j + k))) {
                    bool o = do_match2(n >> (k * 8), m0, m1, esz);
                    out |= (o ^ nmatch) << (j + k);
                }
            }
        }
        *(uint16_t *)(vd + H1_2(i >> 3)) = out;
        flags = iter_predtest_fwd(out, pg, flags);
    }
    return flags;
}

#define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                             \
    return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
}

DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)

DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)

#undef DO_PPZZ_MATCH

void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz; i += 4) {
        uint64_t count = 0;
        uint8_t pred;

        pred = pg[H1(i >> 3)] >> (i & 7);
        if (pred & 1) {
            uint32_t nn = n[H4(i >> 2)];

            for (j = 0; j <= i; j += 4) {
                pred = pg[H1(j >> 3)] >> (j & 7);
                if ((pred & 1) && nn == m[H4(j >> 2)]) {
                    ++count;
                }
            }
        }
        d[H4(i >> 2)] = count;
    }
}

void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t count = 0;
        if (pg[H1(i)] & 1) {
            uint64_t nn = n[i];
            for (j = 0; j <= i; ++j) {
                if ((pg[H1(j)] & 1) && nn == m[j]) {
                    ++count;
                }
            }
        }
        d[i] = count;
    }
}

/*
 * Returns the number of bytes in m0 and m1 that match n.
 * Unlike do_match2 we don't just need true/false, we need an exact count.
 * This requires two extra logical operations.
 */
static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
{
    const uint64_t mask = dup_const(MO_8, 0x7f);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(MO_8, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;

    /*
     * 1: clear msb of each byte to avoid carry to next byte (& mask)
     * 2: carry in to msb if byte != 0 (+ mask)
     * 3: set msb if cmp has msb set (| cmp)
     * 4: set ~msb to ignore them (| mask)
     * We now have 0xff for byte != 0 or 0x7f for byte == 0.
     * 5: invert, resulting in 0x80 if and only if byte == 0.
     */
    cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
    cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);

    /*
     * Combine the two compares in a way that the bits do
     * not overlap, and so preserves the count of set bits.
     * If the host has an efficient instruction for ctpop,
     * then ctpop(x) + ctpop(y) has the same number of
     * operations as ctpop(x | (y >> 1)).  If the host does
     * not have an efficient ctpop, then we only want to
     * use it once.
     */
    return ctpop64(cmp0 | (cmp1 >> 1));
}
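
/*
 * Illustrative sketch (not part of the build): a naive byte-at-a-time
 * equivalent of do_histseg_cnt above.  The name ref_histseg_cnt is
 * hypothetical.
 */
#if 0
static uint64_t ref_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
{
    uint64_t count = 0;

    for (int i = 0; i < 64; i += 8) {
        count += ((m0 >> i) & 0xff) == n;
        count += ((m1 >> i) & 0xff) == n;
    }
    return count;
}
#endif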

void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t n0 = *(uint64_t *)(vn + i);
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t n1 = *(uint64_t *)(vn + i + 8);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint64_t out0 = 0;
        uint64_t out1 = 0;

        for (j = 0; j < 64; j += 8) {
            uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
            uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
            out0 |= cnt0 << j;
            out1 |= cnt1 << j;
        }

        *(uint64_t *)(vd + i) = out0;
        *(uint64_t *)(vd + i + 8) = out1;
    }
}

void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 8 - shr;
    uint64_t mask = dup_const(MO_8, 0xff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 16 - shr;
    uint64_t mask = dup_const(MO_16, 0xffff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    int shr = simd_data(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror32(n[i] ^ m[i], shr);
    }
}

void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float32 *n = vn + s * sizeof(float32) * 4;
        float32 *m = vm + s * sizeof(float32) * 4;
        float32 *a = va + s * sizeof(float32) * 4;
        float32 *d = vd + s * sizeof(float32) * 4;
        float32 n00 = n[H4(0)], n01 = n[H4(1)];
        float32 n10 = n[H4(2)], n11 = n[H4(3)];
        float32 m00 = m[H4(0)], m01 = m[H4(1)];
        float32 m10 = m[H4(2)], m11 = m[H4(3)];
        float32 p0, p1;

        /* i = 0, j = 0 */
        p0 = float32_mul(n00, m00, status);
        p1 = float32_mul(n01, m01, status);
        d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float32_mul(n00, m10, status);
        p1 = float32_mul(n01, m11, status);
        d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float32_mul(n10, m00, status);
        p1 = float32_mul(n11, m01, status);
        d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float32_mul(n10, m10, status);
        p1 = float32_mul(n11, m11, status);
        d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
    }
}

void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float64 *n = vn + s * sizeof(float64) * 4;
        float64 *m = vm + s * sizeof(float64) * 4;
        float64 *a = va + s * sizeof(float64) * 4;
        float64 *d = vd + s * sizeof(float64) * 4;
        float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
        float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
        float64 p0, p1;

        /* i = 0, j = 0 */
        p0 = float64_mul(n00, m00, status);
        p1 = float64_mul(n01, m01, status);
        d[0] = float64_add(a[0], float64_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float64_mul(n00, m10, status);
        p1 = float64_mul(n01, m11, status);
        d[1] = float64_add(a[1], float64_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float64_mul(n10, m00, status);
        p1 = float64_mul(n11, m01, status);
        d[2] = float64_add(a[2], float64_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float64_mul(n10, m10, status);
        p1 = float64_mul(n11, m11, status);
        d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
    }
}
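
/*
 * Note (illustrative): in both helpers above, each group of four elements
 * holds a 2x2 matrix in row-major order,
 *     N = | n00 n01 |        M = | m00 m01 |
 *         | n10 n11 |            | m10 m11 |
 * and the per-segment result is D = A + N * transpose(M), with every
 * multiply and add routed through softfloat so that rounding and
 * exception flags honour the supplied float_status.
 */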

#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPEW);                                               \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
                *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}

DO_FCVTNT(sve_bfcvtnt,    uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)

#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPEW);                                               \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
                *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}

DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)

#undef DO_FCVTLT
#undef DO_FCVTNT
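
/*
 * Note (illustrative): in the two conversion expanders above, wide element
 * i of a vector overlaps narrow elements 2i (bottom half) and 2i+1 (top
 * half).  DO_FCVTNT narrows and writes only the top narrow half of each
 * active element (vd + HN(i + sizeof(TYPEN))), leaving the bottom half of
 * vd untouched, while DO_FCVTLT reads that same top half as its source and
 * widens it into the full wide element.
 */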