1 /* 2 * ARM SVE Operations 3 * 4 * Copyright (c) 2018 Linaro, Ltd. 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "cpu.h" 22 #include "internals.h" 23 #include "exec/page-protection.h" 24 #include "exec/helper-proto.h" 25 #include "exec/target_page.h" 26 #include "exec/tlb-flags.h" 27 #include "tcg/tcg-gvec-desc.h" 28 #include "fpu/softfloat.h" 29 #include "tcg/tcg.h" 30 #include "vec_internal.h" 31 #include "sve_ldst_internal.h" 32 #include "accel/tcg/cpu-ldst.h" 33 #include "accel/tcg/helper-retaddr.h" 34 #include "accel/tcg/cpu-ops.h" 35 #include "accel/tcg/probe.h" 36 #ifdef CONFIG_USER_ONLY 37 #include "user/page-protection.h" 38 #endif 39 40 41 /* Return a value for NZCV as per the ARM PredTest pseudofunction. 42 * 43 * The return value has bit 31 set if N is set, bit 1 set if Z is clear, 44 * and bit 0 set if C is set. Compare the definitions of these variables 45 * within CPUARMState. 46 */ 47 48 /* For no G bits set, NZCV = C. */ 49 #define PREDTEST_INIT 1 50 51 /* This is an iterative function, called for each Pd and Pg word 52 * moving forward. 53 */ 54 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags) 55 { 56 if (likely(g)) { 57 /* Compute N from first D & G. 58 Use bit 2 to signal first G bit seen. */ 59 if (!(flags & 4)) { 60 flags |= ((d & (g & -g)) != 0) << 31; 61 flags |= 4; 62 } 63 64 /* Accumulate Z from each D & G. */ 65 flags |= ((d & g) != 0) << 1; 66 67 /* Compute C from last !(D & G). Replace previous. */ 68 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0); 69 } 70 return flags; 71 } 72 73 /* This is an iterative function, called for each Pd and Pg word 74 * moving backward. 75 */ 76 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags) 77 { 78 if (likely(g)) { 79 /* Compute C from first (i.e last) !(D & G). 80 Use bit 2 to signal first G bit seen. */ 81 if (!(flags & 4)) { 82 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */ 83 flags |= (d & pow2floor(g)) == 0; 84 } 85 86 /* Accumulate Z from each D & G. */ 87 flags |= ((d & g) != 0) << 1; 88 89 /* Compute N from last (i.e first) D & G. Replace previous. */ 90 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0); 91 } 92 return flags; 93 } 94 95 /* The same for a single word predicate. */ 96 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g) 97 { 98 return iter_predtest_fwd(d, g, PREDTEST_INIT); 99 } 100 101 /* The same for a multi-word predicate. */ 102 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words) 103 { 104 uint32_t flags = PREDTEST_INIT; 105 uint64_t *d = vd, *g = vg; 106 uintptr_t i = 0; 107 108 do { 109 flags = iter_predtest_fwd(d[i], g[i], flags); 110 } while (++i < words); 111 112 return flags; 113 } 114 115 /* Similarly for single word elements. */ 116 static inline uint64_t expand_pred_s(uint8_t byte) 117 { 118 static const uint64_t word[] = { 119 [0x01] = 0x00000000ffffffffull, 120 [0x10] = 0xffffffff00000000ull, 121 [0x11] = 0xffffffffffffffffull, 122 }; 123 return word[byte & 0x11]; 124 } 125 126 static inline uint64_t expand_pred_d(uint8_t byte) 127 { 128 return -(uint64_t)(byte & 1); 129 } 130 131 #define LOGICAL_PPPP(NAME, FUNC) \ 132 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 133 { \ 134 uintptr_t opr_sz = simd_oprsz(desc); \ 135 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \ 136 uintptr_t i; \ 137 for (i = 0; i < opr_sz / 8; ++i) { \ 138 d[i] = FUNC(n[i], m[i], g[i]); \ 139 } \ 140 } 141 142 #define DO_AND(N, M, G) (((N) & (M)) & (G)) 143 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G)) 144 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G)) 145 #define DO_ORR(N, M, G) (((N) | (M)) & (G)) 146 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G)) 147 #define DO_NOR(N, M, G) (~((N) | (M)) & (G)) 148 #define DO_NAND(N, M, G) (~((N) & (M)) & (G)) 149 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G))) 150 151 LOGICAL_PPPP(sve_and_pppp, DO_AND) 152 LOGICAL_PPPP(sve_bic_pppp, DO_BIC) 153 LOGICAL_PPPP(sve_eor_pppp, DO_EOR) 154 LOGICAL_PPPP(sve_sel_pppp, DO_SEL) 155 LOGICAL_PPPP(sve_orr_pppp, DO_ORR) 156 LOGICAL_PPPP(sve_orn_pppp, DO_ORN) 157 LOGICAL_PPPP(sve_nor_pppp, DO_NOR) 158 LOGICAL_PPPP(sve_nand_pppp, DO_NAND) 159 160 #undef DO_AND 161 #undef DO_BIC 162 #undef DO_EOR 163 #undef DO_ORR 164 #undef DO_ORN 165 #undef DO_NOR 166 #undef DO_NAND 167 #undef DO_SEL 168 #undef LOGICAL_PPPP 169 170 /* Fully general three-operand expander, controlled by a predicate. 171 * This is complicated by the host-endian storage of the register file. 172 */ 173 /* ??? I don't expect the compiler could ever vectorize this itself. 174 * With some tables we can convert bit masks to byte masks, and with 175 * extra care wrt byte/word ordering we could use gcc generic vectors 176 * and do 16 bytes at a time. 177 */ 178 #define DO_ZPZZ(NAME, TYPE, H, OP) \ 179 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 180 { \ 181 intptr_t i, opr_sz = simd_oprsz(desc); \ 182 for (i = 0; i < opr_sz; ) { \ 183 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 184 do { \ 185 if (pg & 1) { \ 186 TYPE nn = *(TYPE *)(vn + H(i)); \ 187 TYPE mm = *(TYPE *)(vm + H(i)); \ 188 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 189 } \ 190 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 191 } while (i & 15); \ 192 } \ 193 } 194 195 /* Similarly, specialized for 64-bit operands. */ 196 #define DO_ZPZZ_D(NAME, TYPE, OP) \ 197 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 198 { \ 199 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 200 TYPE *d = vd, *n = vn, *m = vm; \ 201 uint8_t *pg = vg; \ 202 for (i = 0; i < opr_sz; i += 1) { \ 203 if (pg[H1(i)] & 1) { \ 204 TYPE nn = n[i], mm = m[i]; \ 205 d[i] = OP(nn, mm); \ 206 } \ 207 } \ 208 } 209 210 #define DO_AND(N, M) (N & M) 211 #define DO_EOR(N, M) (N ^ M) 212 #define DO_ORR(N, M) (N | M) 213 #define DO_BIC(N, M) (N & ~M) 214 #define DO_ORC(N, M) (N | ~M) 215 #define DO_ADD(N, M) (N + M) 216 #define DO_SUB(N, M) (N - M) 217 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) 218 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) 219 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N)) 220 #define DO_MUL(N, M) (N * M) 221 222 223 /* 224 * We must avoid the C undefined behaviour cases: division by 225 * zero and signed division of INT_MIN by -1. Both of these 226 * have architecturally defined required results for Arm. 227 * We special case all signed divisions by -1 to avoid having 228 * to deduce the minimum integer for the type involved. 229 */ 230 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M) 231 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M) 232 233 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND) 234 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND) 235 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND) 236 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND) 237 238 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR) 239 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR) 240 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR) 241 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR) 242 243 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR) 244 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR) 245 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR) 246 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR) 247 248 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC) 249 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC) 250 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC) 251 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC) 252 253 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) 254 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD) 255 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD) 256 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD) 257 258 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB) 259 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB) 260 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB) 261 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB) 262 263 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX) 264 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX) 265 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX) 266 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX) 267 268 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX) 269 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX) 270 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX) 271 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX) 272 273 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN) 274 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN) 275 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN) 276 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN) 277 278 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN) 279 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN) 280 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN) 281 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN) 282 283 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD) 284 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD) 285 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD) 286 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD) 287 288 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD) 289 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD) 290 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD) 291 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD) 292 293 /* Because the computation type is at least twice as large as required, 294 these work for both signed and unsigned source types. */ 295 static inline uint8_t do_mulh_b(int32_t n, int32_t m) 296 { 297 return (n * m) >> 8; 298 } 299 300 static inline uint16_t do_mulh_h(int32_t n, int32_t m) 301 { 302 return (n * m) >> 16; 303 } 304 305 static inline uint32_t do_mulh_s(int64_t n, int64_t m) 306 { 307 return (n * m) >> 32; 308 } 309 310 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m) 311 { 312 uint64_t lo, hi; 313 muls64(&lo, &hi, n, m); 314 return hi; 315 } 316 317 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m) 318 { 319 uint64_t lo, hi; 320 mulu64(&lo, &hi, n, m); 321 return hi; 322 } 323 324 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL) 325 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL) 326 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL) 327 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL) 328 329 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b) 330 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h) 331 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s) 332 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d) 333 334 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b) 335 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h) 336 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s) 337 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d) 338 339 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV) 340 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV) 341 342 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV) 343 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV) 344 345 /* Note that all bits of the shift are significant 346 and not modulo the element size. */ 347 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1)) 348 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0) 349 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0) 350 351 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR) 352 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR) 353 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL) 354 355 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR) 356 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR) 357 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL) 358 359 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR) 360 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR) 361 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL) 362 363 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR) 364 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR) 365 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL) 366 367 static inline uint16_t do_sadalp_h(int16_t n, int16_t m) 368 { 369 int8_t n1 = n, n2 = n >> 8; 370 return m + n1 + n2; 371 } 372 373 static inline uint32_t do_sadalp_s(int32_t n, int32_t m) 374 { 375 int16_t n1 = n, n2 = n >> 16; 376 return m + n1 + n2; 377 } 378 379 static inline uint64_t do_sadalp_d(int64_t n, int64_t m) 380 { 381 int32_t n1 = n, n2 = n >> 32; 382 return m + n1 + n2; 383 } 384 385 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h) 386 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s) 387 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d) 388 389 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m) 390 { 391 uint8_t n1 = n, n2 = n >> 8; 392 return m + n1 + n2; 393 } 394 395 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m) 396 { 397 uint16_t n1 = n, n2 = n >> 16; 398 return m + n1 + n2; 399 } 400 401 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m) 402 { 403 uint32_t n1 = n, n2 = n >> 32; 404 return m + n1 + n2; 405 } 406 407 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h) 408 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s) 409 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d) 410 411 #define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL) 412 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL) 413 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL) 414 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL) 415 416 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b) 417 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h) 418 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s) 419 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d) 420 421 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL) 422 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL) 423 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL) 424 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL) 425 426 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b) 427 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h) 428 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s) 429 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d) 430 431 /* 432 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set. 433 * We pass in a pointer to a dummy saturation field to trigger 434 * the saturating arithmetic but discard the information about 435 * whether it has occurred. 436 */ 437 #define do_sqshl_b(n, m) \ 438 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); }) 439 #define do_sqshl_h(n, m) \ 440 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); }) 441 #define do_sqshl_s(n, m) \ 442 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); }) 443 #define do_sqshl_d(n, m) \ 444 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); }) 445 446 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b) 447 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h) 448 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s) 449 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d) 450 451 #define do_uqshl_b(n, m) \ 452 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 453 #define do_uqshl_h(n, m) \ 454 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 455 #define do_uqshl_s(n, m) \ 456 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); }) 457 #define do_uqshl_d(n, m) \ 458 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); }) 459 460 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b) 461 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h) 462 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s) 463 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d) 464 465 #define do_sqrshl_b(n, m) \ 466 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); }) 467 #define do_sqrshl_h(n, m) \ 468 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); }) 469 #define do_sqrshl_s(n, m) \ 470 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); }) 471 #define do_sqrshl_d(n, m) \ 472 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); }) 473 474 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b) 475 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h) 476 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s) 477 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d) 478 479 #undef do_sqrshl_d 480 481 #define do_uqrshl_b(n, m) \ 482 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); }) 483 #define do_uqrshl_h(n, m) \ 484 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); }) 485 #define do_uqrshl_s(n, m) \ 486 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); }) 487 #define do_uqrshl_d(n, m) \ 488 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); }) 489 490 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b) 491 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h) 492 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s) 493 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d) 494 495 #undef do_uqrshl_d 496 497 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1) 498 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1)) 499 500 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS) 501 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS) 502 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS) 503 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D) 504 505 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS) 506 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS) 507 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS) 508 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D) 509 510 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1) 511 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1)) 512 513 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS) 514 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS) 515 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS) 516 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D) 517 518 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS) 519 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS) 520 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS) 521 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D) 522 523 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1) 524 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1)) 525 526 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS) 527 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS) 528 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS) 529 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D) 530 531 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS) 532 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS) 533 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS) 534 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D) 535 536 #define DO_SQADD_B(n, m) do_ssat_b((int64_t)n + m) 537 #define DO_SQADD_H(n, m) do_ssat_h((int64_t)n + m) 538 #define DO_SQADD_S(n, m) do_ssat_s((int64_t)n + m) 539 540 static inline int64_t do_sqadd_d(int64_t n, int64_t m) 541 { 542 int64_t r = n + m; 543 if (((r ^ n) & ~(n ^ m)) < 0) { 544 /* Signed overflow. */ 545 return r < 0 ? INT64_MAX : INT64_MIN; 546 } 547 return r; 548 } 549 550 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B) 551 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H) 552 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S) 553 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d) 554 555 #define DO_UQADD_B(n, m) do_usat_b((int64_t)n + m) 556 #define DO_UQADD_H(n, m) do_usat_h((int64_t)n + m) 557 #define DO_UQADD_S(n, m) do_usat_s((int64_t)n + m) 558 559 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m) 560 { 561 uint64_t r = n + m; 562 return r < n ? UINT64_MAX : r; 563 } 564 565 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B) 566 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H) 567 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S) 568 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d) 569 570 #define DO_SQSUB_B(n, m) do_ssat_b((int64_t)n - m) 571 #define DO_SQSUB_H(n, m) do_ssat_h((int64_t)n - m) 572 #define DO_SQSUB_S(n, m) do_ssat_s((int64_t)n - m) 573 574 static inline int64_t do_sqsub_d(int64_t n, int64_t m) 575 { 576 int64_t r = n - m; 577 if (((r ^ n) & (n ^ m)) < 0) { 578 /* Signed overflow. */ 579 return r < 0 ? INT64_MAX : INT64_MIN; 580 } 581 return r; 582 } 583 584 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B) 585 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H) 586 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S) 587 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d) 588 589 #define DO_UQSUB_B(n, m) do_usat_b((int64_t)n - m) 590 #define DO_UQSUB_H(n, m) do_usat_h((int64_t)n - m) 591 #define DO_UQSUB_S(n, m) do_usat_s((int64_t)n - m) 592 593 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m) 594 { 595 return n > m ? n - m : 0; 596 } 597 598 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B) 599 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H) 600 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S) 601 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d) 602 603 #define DO_SUQADD_B(n, m) do_ssat_b((int64_t)(int8_t)n + m) 604 #define DO_SUQADD_H(n, m) do_ssat_h((int64_t)(int16_t)n + m) 605 #define DO_SUQADD_S(n, m) do_ssat_s((int64_t)(int32_t)n + m) 606 607 static inline int64_t do_suqadd_d(int64_t n, uint64_t m) 608 { 609 uint64_t r = n + m; 610 611 if (n < 0) { 612 /* Note that m - abs(n) cannot underflow. */ 613 if (r > INT64_MAX) { 614 /* Result is either very large positive or negative. */ 615 if (m > -n) { 616 /* m > abs(n), so r is a very large positive. */ 617 return INT64_MAX; 618 } 619 /* Result is negative. */ 620 } 621 } else { 622 /* Both inputs are positive: check for overflow. */ 623 if (r < m || r > INT64_MAX) { 624 return INT64_MAX; 625 } 626 } 627 return r; 628 } 629 630 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B) 631 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H) 632 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S) 633 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d) 634 635 #define DO_USQADD_B(n, m) do_usat_b((int64_t)n + (int8_t)m) 636 #define DO_USQADD_H(n, m) do_usat_h((int64_t)n + (int16_t)m) 637 #define DO_USQADD_S(n, m) do_usat_s((int64_t)n + (int32_t)m) 638 639 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m) 640 { 641 uint64_t r = n + m; 642 643 if (m < 0) { 644 return n < -m ? 0 : r; 645 } 646 return r < n ? UINT64_MAX : r; 647 } 648 649 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B) 650 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H) 651 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S) 652 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d) 653 654 #undef DO_ZPZZ 655 #undef DO_ZPZZ_D 656 657 /* 658 * Three operand expander, operating on element pairs. 659 * If the slot I is even, the elements from from VN {I, I+1}. 660 * If the slot I is odd, the elements from from VM {I-1, I}. 661 * Load all of the input elements in each pair before overwriting output. 662 */ 663 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \ 664 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 665 { \ 666 intptr_t i, opr_sz = simd_oprsz(desc); \ 667 for (i = 0; i < opr_sz; ) { \ 668 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 669 do { \ 670 TYPE n0 = *(TYPE *)(vn + H(i)); \ 671 TYPE m0 = *(TYPE *)(vm + H(i)); \ 672 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 673 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 674 if (pg & 1) { \ 675 *(TYPE *)(vd + H(i)) = OP(n0, n1); \ 676 } \ 677 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 678 if (pg & 1) { \ 679 *(TYPE *)(vd + H(i)) = OP(m0, m1); \ 680 } \ 681 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 682 } while (i & 15); \ 683 } \ 684 } 685 686 /* Similarly, specialized for 64-bit operands. */ 687 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \ 688 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 689 { \ 690 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 691 TYPE *d = vd, *n = vn, *m = vm; \ 692 uint8_t *pg = vg; \ 693 for (i = 0; i < opr_sz; i += 2) { \ 694 TYPE n0 = n[i], n1 = n[i + 1]; \ 695 TYPE m0 = m[i], m1 = m[i + 1]; \ 696 if (pg[H1(i)] & 1) { \ 697 d[i] = OP(n0, n1); \ 698 } \ 699 if (pg[H1(i + 1)] & 1) { \ 700 d[i + 1] = OP(m0, m1); \ 701 } \ 702 } \ 703 } 704 705 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD) 706 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD) 707 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD) 708 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD) 709 710 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX) 711 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX) 712 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX) 713 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX) 714 715 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN) 716 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN) 717 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN) 718 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN) 719 720 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX) 721 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX) 722 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX) 723 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX) 724 725 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN) 726 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN) 727 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN) 728 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN) 729 730 #undef DO_ZPZZ_PAIR 731 #undef DO_ZPZZ_PAIR_D 732 733 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \ 734 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 735 float_status *status, uint32_t desc) \ 736 { \ 737 intptr_t i, opr_sz = simd_oprsz(desc); \ 738 for (i = 0; i < opr_sz; ) { \ 739 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 740 do { \ 741 TYPE n0 = *(TYPE *)(vn + H(i)); \ 742 TYPE m0 = *(TYPE *)(vm + H(i)); \ 743 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 744 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 745 if (pg & 1) { \ 746 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \ 747 } \ 748 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 749 if (pg & 1) { \ 750 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \ 751 } \ 752 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 753 } while (i & 15); \ 754 } \ 755 } 756 757 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add) 758 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add) 759 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add) 760 761 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum) 762 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum) 763 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum) 764 765 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum) 766 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum) 767 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum) 768 769 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max) 770 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max) 771 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max) 772 773 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min) 774 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min) 775 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min) 776 777 #undef DO_ZPZZ_PAIR_FP 778 779 /* Three-operand expander, controlled by a predicate, in which the 780 * third operand is "wide". That is, for D = N op M, the same 64-bit 781 * value of M is used with all of the narrower values of N. 782 */ 783 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \ 784 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 785 { \ 786 intptr_t i, opr_sz = simd_oprsz(desc); \ 787 for (i = 0; i < opr_sz; ) { \ 788 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \ 789 TYPEW mm = *(TYPEW *)(vm + i); \ 790 do { \ 791 if (pg & 1) { \ 792 TYPE nn = *(TYPE *)(vn + H(i)); \ 793 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 794 } \ 795 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 796 } while (i & 7); \ 797 } \ 798 } 799 800 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR) 801 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR) 802 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL) 803 804 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR) 805 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 806 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 807 808 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR) 809 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 810 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 811 812 #undef DO_ZPZW 813 814 /* Fully general two-operand expander, controlled by a predicate. 815 */ 816 #define DO_ZPZ(NAME, TYPE, H, OP) \ 817 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 818 { \ 819 intptr_t i, opr_sz = simd_oprsz(desc); \ 820 for (i = 0; i < opr_sz; ) { \ 821 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 822 do { \ 823 if (pg & 1) { \ 824 TYPE nn = *(TYPE *)(vn + H(i)); \ 825 *(TYPE *)(vd + H(i)) = OP(nn); \ 826 } \ 827 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 828 } while (i & 15); \ 829 } \ 830 } 831 832 /* Similarly, specialized for 64-bit operands. */ 833 #define DO_ZPZ_D(NAME, TYPE, OP) \ 834 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 835 { \ 836 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 837 TYPE *d = vd, *n = vn; \ 838 uint8_t *pg = vg; \ 839 for (i = 0; i < opr_sz; i += 1) { \ 840 if (pg[H1(i)] & 1) { \ 841 TYPE nn = n[i]; \ 842 d[i] = OP(nn); \ 843 } \ 844 } \ 845 } 846 847 #define DO_CLS_B(N) (clrsb32(N) - 24) 848 #define DO_CLS_H(N) (clrsb32(N) - 16) 849 850 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B) 851 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H) 852 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32) 853 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64) 854 855 #define DO_CLZ_B(N) (clz32(N) - 24) 856 #define DO_CLZ_H(N) (clz32(N) - 16) 857 858 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B) 859 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H) 860 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32) 861 DO_ZPZ_D(sve_clz_d, uint64_t, clz64) 862 863 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8) 864 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16) 865 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32) 866 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64) 867 868 #define DO_CNOT(N) (N == 0) 869 870 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT) 871 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT) 872 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT) 873 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT) 874 875 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1)) 876 877 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS) 878 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS) 879 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS) 880 881 #define DO_AH_FABS_H(N) (float16_is_any_nan(N) ? (N) : DO_FABS(N)) 882 #define DO_AH_FABS_S(N) (float32_is_any_nan(N) ? (N) : DO_FABS(N)) 883 #define DO_AH_FABS_D(N) (float64_is_any_nan(N) ? (N) : DO_FABS(N)) 884 885 DO_ZPZ(sve_ah_fabs_h, uint16_t, H1_2, DO_AH_FABS_H) 886 DO_ZPZ(sve_ah_fabs_s, uint32_t, H1_4, DO_AH_FABS_S) 887 DO_ZPZ_D(sve_ah_fabs_d, uint64_t, DO_AH_FABS_D) 888 889 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1)) 890 891 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG) 892 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG) 893 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG) 894 895 #define DO_AH_FNEG_H(N) (float16_is_any_nan(N) ? (N) : DO_FNEG(N)) 896 #define DO_AH_FNEG_S(N) (float32_is_any_nan(N) ? (N) : DO_FNEG(N)) 897 #define DO_AH_FNEG_D(N) (float64_is_any_nan(N) ? (N) : DO_FNEG(N)) 898 899 DO_ZPZ(sve_ah_fneg_h, uint16_t, H1_2, DO_AH_FNEG_H) 900 DO_ZPZ(sve_ah_fneg_s, uint32_t, H1_4, DO_AH_FNEG_S) 901 DO_ZPZ_D(sve_ah_fneg_d, uint64_t, DO_AH_FNEG_D) 902 903 #define DO_NOT(N) (~N) 904 905 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT) 906 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT) 907 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT) 908 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT) 909 910 #define DO_SXTB(N) ((int8_t)N) 911 #define DO_SXTH(N) ((int16_t)N) 912 #define DO_SXTS(N) ((int32_t)N) 913 #define DO_UXTB(N) ((uint8_t)N) 914 #define DO_UXTH(N) ((uint16_t)N) 915 #define DO_UXTS(N) ((uint32_t)N) 916 917 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB) 918 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB) 919 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH) 920 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB) 921 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH) 922 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS) 923 924 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB) 925 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB) 926 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH) 927 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB) 928 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH) 929 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS) 930 931 #define DO_ABS(N) (N < 0 ? -N : N) 932 933 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS) 934 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS) 935 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS) 936 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS) 937 938 #define DO_NEG(N) (-N) 939 940 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG) 941 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG) 942 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG) 943 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG) 944 945 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16) 946 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32) 947 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64) 948 949 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32) 950 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64) 951 952 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64) 953 954 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc) 955 { 956 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 957 uint64_t *d = vd, *n = vn; 958 uint8_t *pg = vg; 959 960 for (i = 0; i < opr_sz; i += 2) { 961 if (pg[H1(i)] & 1) { 962 uint64_t n0 = n[i + 0]; 963 uint64_t n1 = n[i + 1]; 964 d[i + 0] = n1; 965 d[i + 1] = n0; 966 } 967 } 968 } 969 970 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8) 971 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16) 972 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32) 973 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64) 974 975 #define DO_SQABS(X) \ 976 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 977 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; }) 978 979 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS) 980 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS) 981 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS) 982 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS) 983 984 #define DO_SQNEG(X) \ 985 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 986 x_ == min_ ? -min_ - 1 : -x_; }) 987 988 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG) 989 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG) 990 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG) 991 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG) 992 993 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32) 994 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32) 995 996 /* Three-operand expander, unpredicated, in which the third operand is "wide". 997 */ 998 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \ 999 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1000 { \ 1001 intptr_t i, opr_sz = simd_oprsz(desc); \ 1002 for (i = 0; i < opr_sz; ) { \ 1003 TYPEW mm = *(TYPEW *)(vm + i); \ 1004 do { \ 1005 TYPE nn = *(TYPE *)(vn + H(i)); \ 1006 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 1007 i += sizeof(TYPE); \ 1008 } while (i & 7); \ 1009 } \ 1010 } 1011 1012 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR) 1013 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR) 1014 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL) 1015 1016 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR) 1017 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 1018 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 1019 1020 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR) 1021 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 1022 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 1023 1024 #undef DO_ZZW 1025 1026 #undef DO_CLS_B 1027 #undef DO_CLS_H 1028 #undef DO_CLZ_B 1029 #undef DO_CLZ_H 1030 #undef DO_CNOT 1031 #undef DO_FABS 1032 #undef DO_FNEG 1033 #undef DO_ABS 1034 #undef DO_NEG 1035 #undef DO_ZPZ 1036 #undef DO_ZPZ_D 1037 1038 /* 1039 * Three-operand expander, unpredicated, in which the two inputs are 1040 * selected from the top or bottom half of the wide column. 1041 */ 1042 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1043 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1044 { \ 1045 intptr_t i, opr_sz = simd_oprsz(desc); \ 1046 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1047 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1048 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1049 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1050 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1051 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1052 } \ 1053 } 1054 1055 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1056 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1057 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1058 1059 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1060 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1061 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1062 1063 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1064 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1065 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1066 1067 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1068 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1069 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1070 1071 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1072 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1073 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1074 1075 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 1076 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1077 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1078 1079 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1080 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1081 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1082 1083 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1084 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1085 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1086 1087 /* Note that the multiply cannot overflow, but the doubling can. */ 1088 static inline int16_t do_sqdmull_h(int16_t n, int16_t m) 1089 { 1090 int16_t val = n * m; 1091 return DO_SQADD_H(val, val); 1092 } 1093 1094 static inline int32_t do_sqdmull_s(int32_t n, int32_t m) 1095 { 1096 int32_t val = n * m; 1097 return DO_SQADD_S(val, val); 1098 } 1099 1100 static inline int64_t do_sqdmull_d(int64_t n, int64_t m) 1101 { 1102 int64_t val = n * m; 1103 return do_sqadd_d(val, val); 1104 } 1105 1106 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h) 1107 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1108 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1109 1110 #undef DO_ZZZ_TB 1111 1112 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1113 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1114 { \ 1115 intptr_t i, opr_sz = simd_oprsz(desc); \ 1116 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1117 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1118 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 1119 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1120 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1121 } \ 1122 } 1123 1124 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1125 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1126 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1127 1128 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1129 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1130 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1131 1132 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1133 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1134 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1135 1136 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1137 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1138 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1139 1140 #undef DO_ZZZ_WTB 1141 1142 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \ 1143 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1144 { \ 1145 intptr_t i, opr_sz = simd_oprsz(desc); \ 1146 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \ 1147 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \ 1148 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1149 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \ 1150 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \ 1151 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \ 1152 } \ 1153 } 1154 1155 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR) 1156 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR) 1157 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR) 1158 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR) 1159 1160 #undef DO_ZZZ_NTB 1161 1162 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1163 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1164 { \ 1165 intptr_t i, opr_sz = simd_oprsz(desc); \ 1166 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \ 1167 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1168 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1169 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \ 1170 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1171 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \ 1172 } \ 1173 } 1174 1175 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1176 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1177 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1178 1179 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 1180 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1181 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1182 1183 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1184 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1185 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1186 1187 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1188 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1189 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1190 1191 #define DO_NMUL(N, M) -(N * M) 1192 1193 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL) 1194 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL) 1195 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL) 1196 1197 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL) 1198 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL) 1199 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL) 1200 1201 #undef DO_ZZZW_ACC 1202 1203 #define DO_XTNB(NAME, TYPE, OP) \ 1204 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1205 { \ 1206 intptr_t i, opr_sz = simd_oprsz(desc); \ 1207 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1208 TYPE nn = *(TYPE *)(vn + i); \ 1209 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \ 1210 *(TYPE *)(vd + i) = nn; \ 1211 } \ 1212 } 1213 1214 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \ 1215 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1216 { \ 1217 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \ 1218 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1219 TYPE nn = *(TYPE *)(vn + i); \ 1220 *(TYPEN *)(vd + i + odd) = OP(nn); \ 1221 } \ 1222 } 1223 1224 DO_XTNB(sve2_sqxtnb_h, int16_t, do_ssat_b) 1225 DO_XTNB(sve2_sqxtnb_s, int32_t, do_ssat_h) 1226 DO_XTNB(sve2_sqxtnb_d, int64_t, do_ssat_s) 1227 1228 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, do_ssat_b) 1229 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, do_ssat_h) 1230 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, do_ssat_s) 1231 1232 DO_XTNB(sve2_uqxtnb_h, uint16_t, do_usat_b) 1233 DO_XTNB(sve2_uqxtnb_s, uint32_t, do_usat_h) 1234 DO_XTNB(sve2_uqxtnb_d, uint64_t, do_usat_s) 1235 1236 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, do_usat_b) 1237 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, do_usat_h) 1238 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, do_usat_s) 1239 1240 DO_XTNB(sve2_sqxtunb_h, int16_t, do_usat_b) 1241 DO_XTNB(sve2_sqxtunb_s, int32_t, do_usat_h) 1242 DO_XTNB(sve2_sqxtunb_d, int64_t, do_usat_s) 1243 1244 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, do_usat_b) 1245 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, do_usat_h) 1246 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, do_usat_s) 1247 1248 #undef DO_XTNB 1249 #undef DO_XTNT 1250 1251 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1252 { 1253 intptr_t i, opr_sz = simd_oprsz(desc); 1254 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1)); 1255 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1256 uint32_t *a = va, *n = vn; 1257 uint64_t *d = vd, *m = vm; 1258 1259 for (i = 0; i < opr_sz / 8; ++i) { 1260 uint32_t e1 = a[2 * i + H4(0)]; 1261 uint32_t e2 = n[2 * i + sel] ^ inv; 1262 uint64_t c = extract64(m[i], 32, 1); 1263 /* Compute and store the entire 33-bit result at once. */ 1264 d[i] = c + e1 + e2; 1265 } 1266 } 1267 1268 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1269 { 1270 intptr_t i, opr_sz = simd_oprsz(desc); 1271 int sel = extract32(desc, SIMD_DATA_SHIFT, 1); 1272 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1273 uint64_t *d = vd, *a = va, *n = vn, *m = vm; 1274 1275 for (i = 0; i < opr_sz / 8; i += 2) { 1276 Int128 e1 = int128_make64(a[i]); 1277 Int128 e2 = int128_make64(n[i + sel] ^ inv); 1278 Int128 c = int128_make64(m[i + 1] & 1); 1279 Int128 r = int128_add(int128_add(e1, e2), c); 1280 d[i + 0] = int128_getlo(r); 1281 d[i + 1] = int128_gethi(r); 1282 } 1283 } 1284 1285 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \ 1286 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1287 { \ 1288 intptr_t i, opr_sz = simd_oprsz(desc); \ 1289 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1290 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1291 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1292 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1293 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1294 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1295 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \ 1296 } \ 1297 } 1298 1299 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1, 1300 do_sqdmull_h, DO_SQADD_H) 1301 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1302 do_sqdmull_s, DO_SQADD_S) 1303 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1304 do_sqdmull_d, do_sqadd_d) 1305 1306 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1, 1307 do_sqdmull_h, DO_SQSUB_H) 1308 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1309 do_sqdmull_s, DO_SQSUB_S) 1310 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1311 do_sqdmull_d, do_sqsub_d) 1312 1313 #undef DO_SQDMLAL 1314 1315 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \ 1316 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1317 { \ 1318 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1319 int rot = simd_data(desc); \ 1320 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1321 bool sub_r = rot == 1 || rot == 2; \ 1322 bool sub_i = rot >= 2; \ 1323 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1324 for (i = 0; i < opr_sz; i += 2) { \ 1325 TYPE elt1_a = n[H(i + sel_a)]; \ 1326 TYPE elt2_a = m[H(i + sel_a)]; \ 1327 TYPE elt2_b = m[H(i + sel_b)]; \ 1328 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \ 1329 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \ 1330 } \ 1331 } 1332 1333 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1)) 1334 1335 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA) 1336 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA) 1337 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA) 1338 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA) 1339 1340 #define DO_SQRDMLAH_B(N, M, A, S) \ 1341 do_sqrdmlah_b(N, M, A, S, true) 1342 #define DO_SQRDMLAH_H(N, M, A, S) \ 1343 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); }) 1344 #define DO_SQRDMLAH_S(N, M, A, S) \ 1345 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); }) 1346 #define DO_SQRDMLAH_D(N, M, A, S) \ 1347 do_sqrdmlah_d(N, M, A, S, true) 1348 1349 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B) 1350 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H) 1351 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S) 1352 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D) 1353 1354 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \ 1355 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1356 { \ 1357 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1358 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \ 1359 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \ 1360 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1361 bool sub_r = rot == 1 || rot == 2; \ 1362 bool sub_i = rot >= 2; \ 1363 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1364 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \ 1365 TYPE elt2_a = m[H(i + idx + sel_a)]; \ 1366 TYPE elt2_b = m[H(i + idx + sel_b)]; \ 1367 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \ 1368 TYPE elt1_a = n[H(i + j + sel_a)]; \ 1369 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \ 1370 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \ 1371 } \ 1372 } \ 1373 } 1374 1375 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA) 1376 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA) 1377 1378 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1379 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1380 1381 #undef DO_CMLA 1382 #undef DO_CMLA_FUNC 1383 #undef DO_CMLA_IDX_FUNC 1384 #undef DO_SQRDMLAH_B 1385 #undef DO_SQRDMLAH_H 1386 #undef DO_SQRDMLAH_S 1387 #undef DO_SQRDMLAH_D 1388 1389 /* Note N and M are 4 elements bundled into one unit. */ 1390 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a, 1391 int sel_a, int sel_b, int sub_i) 1392 { 1393 for (int i = 0; i <= 1; i++) { 1394 int32_t elt1_r = (int8_t)(n >> (16 * i)); 1395 int32_t elt1_i = (int8_t)(n >> (16 * i + 8)); 1396 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a)); 1397 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b)); 1398 1399 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1400 } 1401 return a; 1402 } 1403 1404 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a, 1405 int sel_a, int sel_b, int sub_i) 1406 { 1407 for (int i = 0; i <= 1; i++) { 1408 int64_t elt1_r = (int16_t)(n >> (32 * i + 0)); 1409 int64_t elt1_i = (int16_t)(n >> (32 * i + 16)); 1410 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a)); 1411 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b)); 1412 1413 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1414 } 1415 return a; 1416 } 1417 1418 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm, 1419 void *va, uint32_t desc) 1420 { 1421 int opr_sz = simd_oprsz(desc); 1422 int rot = simd_data(desc); 1423 int sel_a = rot & 1; 1424 int sel_b = sel_a ^ 1; 1425 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1426 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1427 1428 for (int e = 0; e < opr_sz / 4; e++) { 1429 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1430 } 1431 } 1432 1433 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm, 1434 void *va, uint32_t desc) 1435 { 1436 int opr_sz = simd_oprsz(desc); 1437 int rot = simd_data(desc); 1438 int sel_a = rot & 1; 1439 int sel_b = sel_a ^ 1; 1440 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1441 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1442 1443 for (int e = 0; e < opr_sz / 8; e++) { 1444 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1445 } 1446 } 1447 1448 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm, 1449 void *va, uint32_t desc) 1450 { 1451 int opr_sz = simd_oprsz(desc); 1452 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1453 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2)); 1454 int sel_a = rot & 1; 1455 int sel_b = sel_a ^ 1; 1456 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1457 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1458 1459 for (int seg = 0; seg < opr_sz / 4; seg += 4) { 1460 uint32_t seg_m = m[seg + idx]; 1461 for (int e = 0; e < 4; e++) { 1462 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e], 1463 sel_a, sel_b, sub_i); 1464 } 1465 } 1466 } 1467 1468 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm, 1469 void *va, uint32_t desc) 1470 { 1471 int seg, opr_sz = simd_oprsz(desc); 1472 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1473 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 1474 int sel_a = rot & 1; 1475 int sel_b = sel_a ^ 1; 1476 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1477 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1478 1479 for (seg = 0; seg < opr_sz / 8; seg += 2) { 1480 uint64_t seg_m = m[seg + idx]; 1481 for (int e = 0; e < 2; e++) { 1482 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e], 1483 sel_a, sel_b, sub_i); 1484 } 1485 } 1486 } 1487 1488 #define DO_ZZXZ(NAME, TYPE, H, OP) \ 1489 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1490 { \ 1491 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \ 1492 intptr_t i, j, idx = simd_data(desc); \ 1493 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \ 1494 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1495 TYPE mm = m[i]; \ 1496 for (j = 0; j < segment; j++) { \ 1497 d[i + j] = OP(n[i + j], mm, a[i + j]); \ 1498 } \ 1499 } \ 1500 } 1501 1502 #define DO_SQRDMLAH_H(N, M, A) \ 1503 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); }) 1504 #define DO_SQRDMLAH_S(N, M, A) \ 1505 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); }) 1506 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true) 1507 1508 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1509 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1510 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D) 1511 1512 #define DO_SQRDMLSH_H(N, M, A) \ 1513 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); }) 1514 #define DO_SQRDMLSH_S(N, M, A) \ 1515 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); }) 1516 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true) 1517 1518 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H) 1519 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S) 1520 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D) 1521 1522 #undef DO_ZZXZ 1523 1524 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1525 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1526 { \ 1527 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1528 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1529 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1530 for (i = 0; i < oprsz; i += 16) { \ 1531 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1532 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1533 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1534 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \ 1535 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \ 1536 } \ 1537 } \ 1538 } 1539 1540 #define DO_MLA(N, M, A) (A + N * M) 1541 1542 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA) 1543 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA) 1544 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA) 1545 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA) 1546 1547 #define DO_MLS(N, M, A) (A - N * M) 1548 1549 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS) 1550 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS) 1551 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS) 1552 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS) 1553 1554 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M)) 1555 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M)) 1556 1557 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S) 1558 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D) 1559 1560 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M)) 1561 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M)) 1562 1563 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S) 1564 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D) 1565 1566 #undef DO_MLA 1567 #undef DO_MLS 1568 #undef DO_ZZXW 1569 1570 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1571 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1572 { \ 1573 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1574 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1575 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1576 for (i = 0; i < oprsz; i += 16) { \ 1577 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1578 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1579 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1580 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \ 1581 } \ 1582 } \ 1583 } 1584 1585 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1586 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1587 1588 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1589 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1590 1591 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1592 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1593 1594 #undef DO_ZZX 1595 1596 #define DO_BITPERM(NAME, TYPE, OP) \ 1597 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1598 { \ 1599 intptr_t i, opr_sz = simd_oprsz(desc); \ 1600 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1601 TYPE nn = *(TYPE *)(vn + i); \ 1602 TYPE mm = *(TYPE *)(vm + i); \ 1603 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \ 1604 } \ 1605 } 1606 1607 static uint64_t bitextract(uint64_t data, uint64_t mask, int n) 1608 { 1609 uint64_t res = 0; 1610 int db, rb = 0; 1611 1612 for (db = 0; db < n; ++db) { 1613 if ((mask >> db) & 1) { 1614 res |= ((data >> db) & 1) << rb; 1615 ++rb; 1616 } 1617 } 1618 return res; 1619 } 1620 1621 DO_BITPERM(sve2_bext_b, uint8_t, bitextract) 1622 DO_BITPERM(sve2_bext_h, uint16_t, bitextract) 1623 DO_BITPERM(sve2_bext_s, uint32_t, bitextract) 1624 DO_BITPERM(sve2_bext_d, uint64_t, bitextract) 1625 1626 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n) 1627 { 1628 uint64_t res = 0; 1629 int rb, db = 0; 1630 1631 for (rb = 0; rb < n; ++rb) { 1632 if ((mask >> rb) & 1) { 1633 res |= ((data >> db) & 1) << rb; 1634 ++db; 1635 } 1636 } 1637 return res; 1638 } 1639 1640 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit) 1641 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit) 1642 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit) 1643 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit) 1644 1645 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n) 1646 { 1647 uint64_t resm = 0, resu = 0; 1648 int db, rbm = 0, rbu = 0; 1649 1650 for (db = 0; db < n; ++db) { 1651 uint64_t val = (data >> db) & 1; 1652 if ((mask >> db) & 1) { 1653 resm |= val << rbm++; 1654 } else { 1655 resu |= val << rbu++; 1656 } 1657 } 1658 1659 return resm | (resu << rbm); 1660 } 1661 1662 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup) 1663 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup) 1664 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup) 1665 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup) 1666 1667 #undef DO_BITPERM 1668 1669 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \ 1670 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1671 { \ 1672 intptr_t i, opr_sz = simd_oprsz(desc); \ 1673 int sub_r = simd_data(desc); \ 1674 if (sub_r) { \ 1675 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1676 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1677 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1678 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1679 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1680 acc_r = ADD_OP(acc_r, el2_i); \ 1681 acc_i = SUB_OP(acc_i, el2_r); \ 1682 *(TYPE *)(vd + H(i)) = acc_r; \ 1683 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1684 } \ 1685 } else { \ 1686 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1687 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1688 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1689 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1690 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1691 acc_r = SUB_OP(acc_r, el2_i); \ 1692 acc_i = ADD_OP(acc_i, el2_r); \ 1693 *(TYPE *)(vd + H(i)) = acc_r; \ 1694 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1695 } \ 1696 } \ 1697 } 1698 1699 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB) 1700 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB) 1701 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB) 1702 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB) 1703 1704 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B) 1705 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H) 1706 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S) 1707 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d) 1708 1709 #undef DO_CADD 1710 1711 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \ 1712 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1713 { \ 1714 intptr_t i, opr_sz = simd_oprsz(desc); \ 1715 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \ 1716 int shift = simd_data(desc) >> 1; \ 1717 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1718 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \ 1719 *(TYPEW *)(vd + HW(i)) = nn << shift; \ 1720 } \ 1721 } 1722 1723 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1) 1724 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2) 1725 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4) 1726 1727 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1) 1728 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2) 1729 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4) 1730 1731 #undef DO_ZZI_SHLL 1732 1733 /* Two-operand reduction expander, controlled by a predicate. 1734 * The difference between TYPERED and TYPERET has to do with 1735 * sign-extension. E.g. for SMAX, TYPERED must be signed, 1736 * but TYPERET must be unsigned so that e.g. a 32-bit value 1737 * is not sign-extended to the ABI uint64_t return type. 1738 */ 1739 /* ??? If we were to vectorize this by hand the reduction ordering 1740 * would change. For integer operands, this is perfectly fine. 1741 */ 1742 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \ 1743 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1744 { \ 1745 intptr_t i, opr_sz = simd_oprsz(desc); \ 1746 TYPERED ret = INIT; \ 1747 for (i = 0; i < opr_sz; ) { \ 1748 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 1749 do { \ 1750 if (pg & 1) { \ 1751 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \ 1752 ret = OP(ret, nn); \ 1753 } \ 1754 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \ 1755 } while (i & 15); \ 1756 } \ 1757 return (TYPERET)ret; \ 1758 } 1759 1760 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \ 1761 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1762 { \ 1763 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 1764 TYPEE *n = vn; \ 1765 uint8_t *pg = vg; \ 1766 TYPER ret = INIT; \ 1767 for (i = 0; i < opr_sz; i += 1) { \ 1768 if (pg[H1(i)] & 1) { \ 1769 TYPEE nn = n[i]; \ 1770 ret = OP(ret, nn); \ 1771 } \ 1772 } \ 1773 return ret; \ 1774 } 1775 1776 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR) 1777 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR) 1778 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR) 1779 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR) 1780 1781 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR) 1782 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR) 1783 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR) 1784 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR) 1785 1786 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND) 1787 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND) 1788 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND) 1789 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND) 1790 1791 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1792 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1793 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1794 1795 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1796 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1797 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1798 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD) 1799 1800 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX) 1801 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX) 1802 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX) 1803 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX) 1804 1805 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX) 1806 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX) 1807 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX) 1808 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX) 1809 1810 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN) 1811 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN) 1812 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN) 1813 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN) 1814 1815 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN) 1816 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN) 1817 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN) 1818 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) 1819 1820 #undef DO_VPZ 1821 #undef DO_VPZ_D 1822 1823 #define DO_VPQ(NAME, TYPE, H, INIT, OP) \ 1824 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 1825 { \ 1826 TYPE tmp[16 / sizeof(TYPE)] = { [0 ... 16 / sizeof(TYPE) - 1] = INIT }; \ 1827 TYPE *n = vn; uint16_t *g = vg; \ 1828 uintptr_t oprsz = simd_oprsz(desc); \ 1829 uintptr_t nseg = oprsz / 16, nsegelt = 16 / sizeof(TYPE); \ 1830 for (uintptr_t s = 0; s < nseg; s++) { \ 1831 uint16_t pg = g[H2(s)]; \ 1832 for (uintptr_t e = 0; e < nsegelt; e++, pg >>= sizeof(TYPE)) { \ 1833 if (pg & 1) { \ 1834 tmp[e] = OP(tmp[H(e)], n[s * nsegelt + H(e)]); \ 1835 } \ 1836 } \ 1837 } \ 1838 memcpy(vd, tmp, 16); \ 1839 clear_tail(vd, 16, simd_maxsz(desc)); \ 1840 } 1841 1842 DO_VPQ(sve2p1_addqv_b, uint8_t, H1, 0, DO_ADD) 1843 DO_VPQ(sve2p1_addqv_h, uint16_t, H2, 0, DO_ADD) 1844 DO_VPQ(sve2p1_addqv_s, uint32_t, H4, 0, DO_ADD) 1845 DO_VPQ(sve2p1_addqv_d, uint64_t, H8, 0, DO_ADD) 1846 1847 DO_VPQ(sve2p1_smaxqv_b, int8_t, H1, INT8_MIN, DO_MAX) 1848 DO_VPQ(sve2p1_smaxqv_h, int16_t, H2, INT16_MIN, DO_MAX) 1849 DO_VPQ(sve2p1_smaxqv_s, int32_t, H4, INT32_MIN, DO_MAX) 1850 DO_VPQ(sve2p1_smaxqv_d, int64_t, H8, INT64_MIN, DO_MAX) 1851 1852 DO_VPQ(sve2p1_sminqv_b, int8_t, H1, INT8_MAX, DO_MIN) 1853 DO_VPQ(sve2p1_sminqv_h, int16_t, H2, INT16_MAX, DO_MIN) 1854 DO_VPQ(sve2p1_sminqv_s, int32_t, H4, INT32_MAX, DO_MIN) 1855 DO_VPQ(sve2p1_sminqv_d, int64_t, H8, INT64_MAX, DO_MIN) 1856 1857 DO_VPQ(sve2p1_umaxqv_b, uint8_t, H1, 0, DO_MAX) 1858 DO_VPQ(sve2p1_umaxqv_h, uint16_t, H2, 0, DO_MAX) 1859 DO_VPQ(sve2p1_umaxqv_s, uint32_t, H4, 0, DO_MAX) 1860 DO_VPQ(sve2p1_umaxqv_d, uint64_t, H8, 0, DO_MAX) 1861 1862 DO_VPQ(sve2p1_uminqv_b, uint8_t, H1, -1, DO_MIN) 1863 DO_VPQ(sve2p1_uminqv_h, uint16_t, H2, -1, DO_MIN) 1864 DO_VPQ(sve2p1_uminqv_s, uint32_t, H4, -1, DO_MIN) 1865 DO_VPQ(sve2p1_uminqv_d, uint64_t, H8, -1, DO_MIN) 1866 1867 #undef DO_VPQ 1868 1869 /* Two vector operand, one scalar operand, unpredicated. */ 1870 #define DO_ZZI(NAME, TYPE, OP) \ 1871 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ 1872 { \ 1873 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1874 TYPE s = s64, *d = vd, *n = vn; \ 1875 for (i = 0; i < opr_sz; ++i) { \ 1876 d[i] = OP(n[i], s); \ 1877 } \ 1878 } 1879 1880 #define DO_SUBR(X, Y) (Y - X) 1881 1882 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) 1883 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) 1884 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) 1885 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) 1886 1887 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) 1888 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) 1889 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) 1890 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) 1891 1892 DO_ZZI(sve_smini_b, int8_t, DO_MIN) 1893 DO_ZZI(sve_smini_h, int16_t, DO_MIN) 1894 DO_ZZI(sve_smini_s, int32_t, DO_MIN) 1895 DO_ZZI(sve_smini_d, int64_t, DO_MIN) 1896 1897 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) 1898 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) 1899 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) 1900 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) 1901 1902 DO_ZZI(sve_umini_b, uint8_t, DO_MIN) 1903 DO_ZZI(sve_umini_h, uint16_t, DO_MIN) 1904 DO_ZZI(sve_umini_s, uint32_t, DO_MIN) 1905 DO_ZZI(sve_umini_d, uint64_t, DO_MIN) 1906 1907 #undef DO_ZZI 1908 1909 #define DO_LOGIC_QV(NAME, SUFF, INIT, VOP, POP) \ 1910 void HELPER(NAME ## _ ## SUFF)(void *vd, void *vn, void *vg, uint32_t desc) \ 1911 { \ 1912 unsigned seg = simd_oprsz(desc) / 16; \ 1913 uint64_t r0 = INIT, r1 = INIT; \ 1914 for (unsigned s = 0; s < seg; s++) { \ 1915 uint64_t p0 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2))); \ 1916 uint64_t p1 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2 + 1))); \ 1917 uint64_t v0 = *(uint64_t *)(vn + s * 16); \ 1918 uint64_t v1 = *(uint64_t *)(vn + s * 16 + 8); \ 1919 v0 = POP(v0, p0), v1 = POP(v1, p1); \ 1920 r0 = VOP(r0, v0), r1 = VOP(r1, v1); \ 1921 } \ 1922 *(uint64_t *)(vd + 0) = r0; \ 1923 *(uint64_t *)(vd + 8) = r1; \ 1924 clear_tail(vd, 16, simd_maxsz(desc)); \ 1925 } 1926 1927 DO_LOGIC_QV(sve2p1_orqv, b, 0, DO_ORR, DO_AND) 1928 DO_LOGIC_QV(sve2p1_orqv, h, 0, DO_ORR, DO_AND) 1929 DO_LOGIC_QV(sve2p1_orqv, s, 0, DO_ORR, DO_AND) 1930 DO_LOGIC_QV(sve2p1_orqv, d, 0, DO_ORR, DO_AND) 1931 1932 DO_LOGIC_QV(sve2p1_eorqv, b, 0, DO_EOR, DO_AND) 1933 DO_LOGIC_QV(sve2p1_eorqv, h, 0, DO_EOR, DO_AND) 1934 DO_LOGIC_QV(sve2p1_eorqv, s, 0, DO_EOR, DO_AND) 1935 DO_LOGIC_QV(sve2p1_eorqv, d, 0, DO_EOR, DO_AND) 1936 1937 DO_LOGIC_QV(sve2p1_andqv, b, -1, DO_AND, DO_ORC) 1938 DO_LOGIC_QV(sve2p1_andqv, h, -1, DO_AND, DO_ORC) 1939 DO_LOGIC_QV(sve2p1_andqv, s, -1, DO_AND, DO_ORC) 1940 DO_LOGIC_QV(sve2p1_andqv, d, -1, DO_AND, DO_ORC) 1941 1942 #undef DO_LOGIC_QV 1943 1944 #undef DO_AND 1945 #undef DO_ORR 1946 #undef DO_EOR 1947 #undef DO_BIC 1948 #undef DO_ORC 1949 #undef DO_ADD 1950 #undef DO_SUB 1951 #undef DO_MAX 1952 #undef DO_MIN 1953 #undef DO_ABD 1954 #undef DO_MUL 1955 #undef DO_DIV 1956 #undef DO_ASR 1957 #undef DO_LSR 1958 #undef DO_LSL 1959 #undef DO_SUBR 1960 1961 /* Similar to the ARM LastActiveElement pseudocode function, except the 1962 result is multiplied by the element size. This includes the not found 1963 indication; e.g. not found for esz=3 is -8. */ 1964 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) 1965 { 1966 uint64_t mask = pred_esz_masks[esz]; 1967 intptr_t i = words; 1968 1969 do { 1970 uint64_t this_g = g[--i] & mask; 1971 if (this_g) { 1972 return i * 64 + (63 - clz64(this_g)); 1973 } 1974 } while (i > 0); 1975 return (intptr_t)-1 << esz; 1976 } 1977 1978 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc) 1979 { 1980 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1981 uint32_t flags = PREDTEST_INIT; 1982 uint64_t *d = vd, *g = vg; 1983 intptr_t i = 0; 1984 1985 do { 1986 uint64_t this_d = d[i]; 1987 uint64_t this_g = g[i]; 1988 1989 if (this_g) { 1990 if (!(flags & 4)) { 1991 /* Set in D the first bit of G. */ 1992 this_d |= this_g & -this_g; 1993 d[i] = this_d; 1994 } 1995 flags = iter_predtest_fwd(this_d, this_g, flags); 1996 } 1997 } while (++i < words); 1998 1999 return flags; 2000 } 2001 2002 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc) 2003 { 2004 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 2005 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 2006 uint32_t flags = PREDTEST_INIT; 2007 uint64_t *d = vd, *g = vg, esz_mask; 2008 intptr_t i, next; 2009 2010 next = last_active_element(vd, words, esz) + (1 << esz); 2011 esz_mask = pred_esz_masks[esz]; 2012 2013 /* Similar to the pseudocode for pnext, but scaled by ESZ 2014 so that we find the correct bit. */ 2015 if (next < words * 64) { 2016 uint64_t mask = -1; 2017 2018 if (next & 63) { 2019 mask = ~((1ull << (next & 63)) - 1); 2020 next &= -64; 2021 } 2022 do { 2023 uint64_t this_g = g[next / 64] & esz_mask & mask; 2024 if (this_g != 0) { 2025 next = (next & -64) + ctz64(this_g); 2026 break; 2027 } 2028 next += 64; 2029 mask = -1; 2030 } while (next < words * 64); 2031 } 2032 2033 i = 0; 2034 do { 2035 uint64_t this_d = 0; 2036 if (i == next / 64) { 2037 this_d = 1ull << (next & 63); 2038 } 2039 d[i] = this_d; 2040 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags); 2041 } while (++i < words); 2042 2043 return flags; 2044 } 2045 2046 /* 2047 * Copy Zn into Zd, and store zero into inactive elements. 2048 * If inv, store zeros into the active elements. 2049 */ 2050 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc) 2051 { 2052 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2053 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 2054 uint64_t *d = vd, *n = vn; 2055 uint8_t *pg = vg; 2056 2057 for (i = 0; i < opr_sz; i += 1) { 2058 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv); 2059 } 2060 } 2061 2062 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc) 2063 { 2064 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2065 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 2066 uint64_t *d = vd, *n = vn; 2067 uint8_t *pg = vg; 2068 2069 for (i = 0; i < opr_sz; i += 1) { 2070 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv); 2071 } 2072 } 2073 2074 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc) 2075 { 2076 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2077 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 2078 uint64_t *d = vd, *n = vn; 2079 uint8_t *pg = vg; 2080 2081 for (i = 0; i < opr_sz; i += 1) { 2082 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv); 2083 } 2084 } 2085 2086 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc) 2087 { 2088 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2089 uint64_t *d = vd, *n = vn; 2090 uint8_t *pg = vg; 2091 uint8_t inv = simd_data(desc); 2092 2093 for (i = 0; i < opr_sz; i += 1) { 2094 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1); 2095 } 2096 } 2097 2098 /* Three-operand expander, immediate operand, controlled by a predicate. 2099 */ 2100 #define DO_ZPZI(NAME, TYPE, H, OP) \ 2101 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2102 { \ 2103 intptr_t i, opr_sz = simd_oprsz(desc); \ 2104 TYPE imm = simd_data(desc); \ 2105 for (i = 0; i < opr_sz; ) { \ 2106 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2107 do { \ 2108 if (pg & 1) { \ 2109 TYPE nn = *(TYPE *)(vn + H(i)); \ 2110 *(TYPE *)(vd + H(i)) = OP(nn, imm); \ 2111 } \ 2112 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2113 } while (i & 15); \ 2114 } \ 2115 } 2116 2117 /* Similarly, specialized for 64-bit operands. */ 2118 #define DO_ZPZI_D(NAME, TYPE, OP) \ 2119 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2120 { \ 2121 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2122 TYPE *d = vd, *n = vn; \ 2123 TYPE imm = simd_data(desc); \ 2124 uint8_t *pg = vg; \ 2125 for (i = 0; i < opr_sz; i += 1) { \ 2126 if (pg[H1(i)] & 1) { \ 2127 TYPE nn = n[i]; \ 2128 d[i] = OP(nn, imm); \ 2129 } \ 2130 } \ 2131 } 2132 2133 #define DO_SHR(N, M) (N >> M) 2134 #define DO_SHL(N, M) (N << M) 2135 2136 /* Arithmetic shift right for division. This rounds negative numbers 2137 toward zero as per signed division. Therefore before shifting, 2138 when N is negative, add 2**M-1. */ 2139 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M) 2140 2141 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) 2142 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) 2143 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) 2144 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR) 2145 2146 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR) 2147 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR) 2148 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR) 2149 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR) 2150 2151 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL) 2152 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL) 2153 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL) 2154 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL) 2155 2156 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD) 2157 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD) 2158 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD) 2159 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD) 2160 2161 /* SVE2 bitwise shift by immediate */ 2162 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b) 2163 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h) 2164 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s) 2165 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d) 2166 2167 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b) 2168 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h) 2169 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s) 2170 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d) 2171 2172 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr) 2173 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr) 2174 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr) 2175 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr) 2176 2177 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr) 2178 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr) 2179 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr) 2180 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr) 2181 2182 #define do_suqrshl_b(n, m) \ 2183 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 2184 #define do_suqrshl_h(n, m) \ 2185 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 2186 #define do_suqrshl_s(n, m) \ 2187 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); }) 2188 #define do_suqrshl_d(n, m) \ 2189 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); }) 2190 2191 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b) 2192 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h) 2193 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s) 2194 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d) 2195 2196 #undef DO_ASRD 2197 #undef DO_ZPZI 2198 #undef DO_ZPZI_D 2199 2200 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \ 2201 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2202 { \ 2203 intptr_t i, opr_sz = simd_oprsz(desc); \ 2204 int shift = simd_data(desc); \ 2205 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2206 TYPEW nn = *(TYPEW *)(vn + i); \ 2207 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \ 2208 } \ 2209 } 2210 2211 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 2212 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2213 { \ 2214 intptr_t i, opr_sz = simd_oprsz(desc); \ 2215 int shift = simd_data(desc); \ 2216 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2217 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2218 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \ 2219 } \ 2220 } 2221 2222 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR) 2223 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR) 2224 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR) 2225 2226 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR) 2227 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR) 2228 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR) 2229 2230 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr) 2231 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr) 2232 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr) 2233 2234 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) 2235 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) 2236 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) 2237 2238 #define DO_SQSHRUN_H(x, sh) do_usat_b((int64_t)(x) >> sh) 2239 #define DO_SQSHRUN_S(x, sh) do_usat_h((int64_t)(x) >> sh) 2240 #define DO_SQSHRUN_D(x, sh) do_usat_s((int64_t)(x) >> (sh < 64 ? sh : 63)) 2241 2242 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) 2243 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) 2244 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D) 2245 2246 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) 2247 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) 2248 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) 2249 2250 #define DO_SQRSHRUN_H(x, sh) do_usat_b(do_srshr(x, sh)) 2251 #define DO_SQRSHRUN_S(x, sh) do_usat_h(do_srshr(x, sh)) 2252 #define DO_SQRSHRUN_D(x, sh) do_usat_s(do_srshr(x, sh)) 2253 2254 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) 2255 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) 2256 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D) 2257 2258 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) 2259 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) 2260 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) 2261 2262 #define DO_SQSHRN_H(x, sh) do_ssat_b(x >> sh) 2263 #define DO_SQSHRN_S(x, sh) do_ssat_h(x >> sh) 2264 #define DO_SQSHRN_D(x, sh) do_ssat_s(x >> sh) 2265 2266 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) 2267 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) 2268 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D) 2269 2270 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) 2271 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) 2272 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) 2273 2274 #define DO_SQRSHRN_H(x, sh) do_ssat_b(do_srshr(x, sh)) 2275 #define DO_SQRSHRN_S(x, sh) do_ssat_h(do_srshr(x, sh)) 2276 #define DO_SQRSHRN_D(x, sh) do_ssat_s(do_srshr(x, sh)) 2277 2278 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) 2279 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) 2280 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D) 2281 2282 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H) 2283 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S) 2284 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D) 2285 2286 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX) 2287 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX) 2288 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX) 2289 2290 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H) 2291 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S) 2292 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D) 2293 2294 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H) 2295 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S) 2296 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D) 2297 2298 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX) 2299 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX) 2300 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX) 2301 2302 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H) 2303 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S) 2304 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D) 2305 2306 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H) 2307 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S) 2308 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D) 2309 2310 #undef DO_SHRNB 2311 #undef DO_SHRNT 2312 2313 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \ 2314 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2315 { \ 2316 intptr_t i, opr_sz = simd_oprsz(desc); \ 2317 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2318 TYPEW nn = *(TYPEW *)(vn + i); \ 2319 TYPEW mm = *(TYPEW *)(vm + i); \ 2320 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \ 2321 } \ 2322 } 2323 2324 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \ 2325 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2326 { \ 2327 intptr_t i, opr_sz = simd_oprsz(desc); \ 2328 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2329 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2330 TYPEW mm = *(TYPEW *)(vm + HW(i)); \ 2331 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \ 2332 } \ 2333 } 2334 2335 #define DO_ADDHN(N, M, SH) ((N + M) >> SH) 2336 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH) 2337 #define DO_SUBHN(N, M, SH) ((N - M) >> SH) 2338 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH) 2339 2340 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN) 2341 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN) 2342 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN) 2343 2344 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN) 2345 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN) 2346 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN) 2347 2348 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN) 2349 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN) 2350 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN) 2351 2352 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN) 2353 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN) 2354 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN) 2355 2356 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN) 2357 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN) 2358 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN) 2359 2360 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN) 2361 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN) 2362 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN) 2363 2364 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN) 2365 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN) 2366 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN) 2367 2368 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN) 2369 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN) 2370 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN) 2371 2372 #undef DO_RSUBHN 2373 #undef DO_SUBHN 2374 #undef DO_RADDHN 2375 #undef DO_ADDHN 2376 2377 #undef DO_BINOPNB 2378 2379 /* Fully general four-operand expander, controlled by a predicate. 2380 */ 2381 #define DO_ZPZZZ(NAME, TYPE, H, OP) \ 2382 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2383 void *vg, uint32_t desc) \ 2384 { \ 2385 intptr_t i, opr_sz = simd_oprsz(desc); \ 2386 for (i = 0; i < opr_sz; ) { \ 2387 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2388 do { \ 2389 if (pg & 1) { \ 2390 TYPE nn = *(TYPE *)(vn + H(i)); \ 2391 TYPE mm = *(TYPE *)(vm + H(i)); \ 2392 TYPE aa = *(TYPE *)(va + H(i)); \ 2393 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \ 2394 } \ 2395 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2396 } while (i & 15); \ 2397 } \ 2398 } 2399 2400 /* Similarly, specialized for 64-bit operands. */ 2401 #define DO_ZPZZZ_D(NAME, TYPE, OP) \ 2402 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2403 void *vg, uint32_t desc) \ 2404 { \ 2405 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2406 TYPE *d = vd, *a = va, *n = vn, *m = vm; \ 2407 uint8_t *pg = vg; \ 2408 for (i = 0; i < opr_sz; i += 1) { \ 2409 if (pg[H1(i)] & 1) { \ 2410 TYPE aa = a[i], nn = n[i], mm = m[i]; \ 2411 d[i] = OP(aa, nn, mm); \ 2412 } \ 2413 } \ 2414 } 2415 2416 #define DO_MLA(A, N, M) (A + N * M) 2417 #define DO_MLS(A, N, M) (A - N * M) 2418 2419 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) 2420 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS) 2421 2422 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA) 2423 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS) 2424 2425 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA) 2426 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS) 2427 2428 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA) 2429 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS) 2430 2431 #undef DO_MLA 2432 #undef DO_MLS 2433 #undef DO_ZPZZZ 2434 #undef DO_ZPZZZ_D 2435 2436 void HELPER(sve_index_b)(void *vd, uint32_t start, 2437 uint32_t incr, uint32_t desc) 2438 { 2439 intptr_t i, opr_sz = simd_oprsz(desc); 2440 uint8_t *d = vd; 2441 for (i = 0; i < opr_sz; i += 1) { 2442 d[H1(i)] = start + i * incr; 2443 } 2444 } 2445 2446 void HELPER(sve_index_h)(void *vd, uint32_t start, 2447 uint32_t incr, uint32_t desc) 2448 { 2449 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2450 uint16_t *d = vd; 2451 for (i = 0; i < opr_sz; i += 1) { 2452 d[H2(i)] = start + i * incr; 2453 } 2454 } 2455 2456 void HELPER(sve_index_s)(void *vd, uint32_t start, 2457 uint32_t incr, uint32_t desc) 2458 { 2459 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2460 uint32_t *d = vd; 2461 for (i = 0; i < opr_sz; i += 1) { 2462 d[H4(i)] = start + i * incr; 2463 } 2464 } 2465 2466 void HELPER(sve_index_d)(void *vd, uint64_t start, 2467 uint64_t incr, uint32_t desc) 2468 { 2469 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2470 uint64_t *d = vd; 2471 for (i = 0; i < opr_sz; i += 1) { 2472 d[i] = start + i * incr; 2473 } 2474 } 2475 2476 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc) 2477 { 2478 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2479 uint32_t sh = simd_data(desc); 2480 uint32_t *d = vd, *n = vn, *m = vm; 2481 for (i = 0; i < opr_sz; i += 1) { 2482 d[i] = n[i] + (m[i] << sh); 2483 } 2484 } 2485 2486 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc) 2487 { 2488 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2489 uint64_t sh = simd_data(desc); 2490 uint64_t *d = vd, *n = vn, *m = vm; 2491 for (i = 0; i < opr_sz; i += 1) { 2492 d[i] = n[i] + (m[i] << sh); 2493 } 2494 } 2495 2496 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc) 2497 { 2498 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2499 uint64_t sh = simd_data(desc); 2500 uint64_t *d = vd, *n = vn, *m = vm; 2501 for (i = 0; i < opr_sz; i += 1) { 2502 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh); 2503 } 2504 } 2505 2506 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc) 2507 { 2508 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2509 uint64_t sh = simd_data(desc); 2510 uint64_t *d = vd, *n = vn, *m = vm; 2511 for (i = 0; i < opr_sz; i += 1) { 2512 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh); 2513 } 2514 } 2515 2516 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc) 2517 { 2518 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2519 static const uint16_t coeff[] = { 2520 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8, 2521 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189, 2522 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295, 2523 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4, 2524 }; 2525 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2526 uint16_t *d = vd, *n = vn; 2527 2528 for (i = 0; i < opr_sz; i++) { 2529 uint16_t nn = n[i]; 2530 intptr_t idx = extract32(nn, 0, 5); 2531 uint16_t exp = extract32(nn, 5, 5); 2532 d[i] = coeff[idx] | (exp << 10); 2533 } 2534 } 2535 2536 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc) 2537 { 2538 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2539 static const uint32_t coeff[] = { 2540 0x000000, 0x0164d2, 0x02cd87, 0x043a29, 2541 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5, 2542 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc, 2543 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d, 2544 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda, 2545 0x1ef532, 0x20b051, 0x227043, 0x243516, 2546 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a, 2547 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4, 2548 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b, 2549 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd, 2550 0x45672a, 0x478d75, 0x49b9be, 0x4bec15, 2551 0x4e248c, 0x506334, 0x52a81e, 0x54f35b, 2552 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5, 2553 0x60ccdf, 0x633f89, 0x65b907, 0x68396a, 2554 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177, 2555 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c, 2556 }; 2557 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2558 uint32_t *d = vd, *n = vn; 2559 2560 for (i = 0; i < opr_sz; i++) { 2561 uint32_t nn = n[i]; 2562 intptr_t idx = extract32(nn, 0, 6); 2563 uint32_t exp = extract32(nn, 6, 8); 2564 d[i] = coeff[idx] | (exp << 23); 2565 } 2566 } 2567 2568 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc) 2569 { 2570 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2571 static const uint64_t coeff[] = { 2572 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull, 2573 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull, 2574 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull, 2575 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull, 2576 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull, 2577 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull, 2578 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull, 2579 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull, 2580 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull, 2581 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull, 2582 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull, 2583 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull, 2584 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull, 2585 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull, 2586 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull, 2587 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull, 2588 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull, 2589 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull, 2590 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull, 2591 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull, 2592 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull, 2593 0xFA7C1819E90D8ull, 2594 }; 2595 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2596 uint64_t *d = vd, *n = vn; 2597 2598 for (i = 0; i < opr_sz; i++) { 2599 uint64_t nn = n[i]; 2600 intptr_t idx = extract32(nn, 0, 6); 2601 uint64_t exp = extract32(nn, 6, 11); 2602 d[i] = coeff[idx] | (exp << 52); 2603 } 2604 } 2605 2606 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc) 2607 { 2608 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2609 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2610 uint16_t *d = vd, *n = vn, *m = vm; 2611 for (i = 0; i < opr_sz; i += 1) { 2612 uint16_t nn = n[i]; 2613 uint16_t mm = m[i]; 2614 if (mm & 1) { 2615 nn = float16_one; 2616 } 2617 if (mm & 2) { 2618 nn = float16_maybe_ah_chs(nn, fpcr_ah); 2619 } 2620 d[i] = nn; 2621 } 2622 } 2623 2624 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc) 2625 { 2626 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2627 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2628 uint32_t *d = vd, *n = vn, *m = vm; 2629 for (i = 0; i < opr_sz; i += 1) { 2630 uint32_t nn = n[i]; 2631 uint32_t mm = m[i]; 2632 if (mm & 1) { 2633 nn = float32_one; 2634 } 2635 if (mm & 2) { 2636 nn = float32_maybe_ah_chs(nn, fpcr_ah); 2637 } 2638 d[i] = nn; 2639 } 2640 } 2641 2642 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc) 2643 { 2644 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2645 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2646 uint64_t *d = vd, *n = vn, *m = vm; 2647 for (i = 0; i < opr_sz; i += 1) { 2648 uint64_t nn = n[i]; 2649 uint64_t mm = m[i]; 2650 if (mm & 1) { 2651 nn = float64_one; 2652 } 2653 if (mm & 2) { 2654 nn = float64_maybe_ah_chs(nn, fpcr_ah); 2655 } 2656 d[i] = nn; 2657 } 2658 } 2659 2660 /* 2661 * Signed saturating addition with scalar operand. 2662 */ 2663 2664 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2665 { 2666 intptr_t i, oprsz = simd_oprsz(desc); 2667 2668 for (i = 0; i < oprsz; i += sizeof(int8_t)) { 2669 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i)); 2670 } 2671 } 2672 2673 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2674 { 2675 intptr_t i, oprsz = simd_oprsz(desc); 2676 2677 for (i = 0; i < oprsz; i += sizeof(int16_t)) { 2678 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i)); 2679 } 2680 } 2681 2682 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2683 { 2684 intptr_t i, oprsz = simd_oprsz(desc); 2685 2686 for (i = 0; i < oprsz; i += sizeof(int32_t)) { 2687 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i)); 2688 } 2689 } 2690 2691 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc) 2692 { 2693 intptr_t i, oprsz = simd_oprsz(desc); 2694 2695 for (i = 0; i < oprsz; i += sizeof(int64_t)) { 2696 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i)); 2697 } 2698 } 2699 2700 /* 2701 * Unsigned saturating addition with scalar operand. 2702 */ 2703 2704 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2705 { 2706 intptr_t i, oprsz = simd_oprsz(desc); 2707 2708 for (i = 0; i < oprsz; i += sizeof(uint8_t)) { 2709 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i)); 2710 } 2711 } 2712 2713 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2714 { 2715 intptr_t i, oprsz = simd_oprsz(desc); 2716 2717 for (i = 0; i < oprsz; i += sizeof(uint16_t)) { 2718 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i)); 2719 } 2720 } 2721 2722 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2723 { 2724 intptr_t i, oprsz = simd_oprsz(desc); 2725 2726 for (i = 0; i < oprsz; i += sizeof(uint32_t)) { 2727 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i)); 2728 } 2729 } 2730 2731 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2732 { 2733 intptr_t i, oprsz = simd_oprsz(desc); 2734 2735 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2736 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i)); 2737 } 2738 } 2739 2740 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2741 { 2742 intptr_t i, oprsz = simd_oprsz(desc); 2743 2744 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2745 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b); 2746 } 2747 } 2748 2749 /* Two operand predicated copy immediate with merge. All valid immediates 2750 * can fit within 17 signed bits in the simd_data field. 2751 */ 2752 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg, 2753 uint64_t mm, uint32_t desc) 2754 { 2755 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2756 uint64_t *d = vd, *n = vn; 2757 uint8_t *pg = vg; 2758 2759 mm = dup_const(MO_8, mm); 2760 for (i = 0; i < opr_sz; i += 1) { 2761 uint64_t nn = n[i]; 2762 uint64_t pp = expand_pred_b(pg[H1(i)]); 2763 d[i] = (mm & pp) | (nn & ~pp); 2764 } 2765 } 2766 2767 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg, 2768 uint64_t mm, uint32_t desc) 2769 { 2770 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2771 uint64_t *d = vd, *n = vn; 2772 uint8_t *pg = vg; 2773 2774 mm = dup_const(MO_16, mm); 2775 for (i = 0; i < opr_sz; i += 1) { 2776 uint64_t nn = n[i]; 2777 uint64_t pp = expand_pred_h(pg[H1(i)]); 2778 d[i] = (mm & pp) | (nn & ~pp); 2779 } 2780 } 2781 2782 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg, 2783 uint64_t mm, uint32_t desc) 2784 { 2785 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2786 uint64_t *d = vd, *n = vn; 2787 uint8_t *pg = vg; 2788 2789 mm = dup_const(MO_32, mm); 2790 for (i = 0; i < opr_sz; i += 1) { 2791 uint64_t nn = n[i]; 2792 uint64_t pp = expand_pred_s(pg[H1(i)]); 2793 d[i] = (mm & pp) | (nn & ~pp); 2794 } 2795 } 2796 2797 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg, 2798 uint64_t mm, uint32_t desc) 2799 { 2800 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2801 uint64_t *d = vd, *n = vn; 2802 uint8_t *pg = vg; 2803 2804 for (i = 0; i < opr_sz; i += 1) { 2805 uint64_t nn = n[i]; 2806 d[i] = (pg[H1(i)] & 1 ? mm : nn); 2807 } 2808 } 2809 2810 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc) 2811 { 2812 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2813 uint64_t *d = vd; 2814 uint8_t *pg = vg; 2815 2816 val = dup_const(MO_8, val); 2817 for (i = 0; i < opr_sz; i += 1) { 2818 d[i] = val & expand_pred_b(pg[H1(i)]); 2819 } 2820 } 2821 2822 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc) 2823 { 2824 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2825 uint64_t *d = vd; 2826 uint8_t *pg = vg; 2827 2828 val = dup_const(MO_16, val); 2829 for (i = 0; i < opr_sz; i += 1) { 2830 d[i] = val & expand_pred_h(pg[H1(i)]); 2831 } 2832 } 2833 2834 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc) 2835 { 2836 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2837 uint64_t *d = vd; 2838 uint8_t *pg = vg; 2839 2840 val = dup_const(MO_32, val); 2841 for (i = 0; i < opr_sz; i += 1) { 2842 d[i] = val & expand_pred_s(pg[H1(i)]); 2843 } 2844 } 2845 2846 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc) 2847 { 2848 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2849 uint64_t *d = vd; 2850 uint8_t *pg = vg; 2851 2852 for (i = 0; i < opr_sz; i += 1) { 2853 d[i] = (pg[H1(i)] & 1 ? val : 0); 2854 } 2855 } 2856 2857 /* Big-endian hosts need to frob the byte indices. If the copy 2858 * happens to be 8-byte aligned, then no frobbing necessary. 2859 */ 2860 static void swap_memmove(void *vd, void *vs, size_t n) 2861 { 2862 uintptr_t d = (uintptr_t)vd; 2863 uintptr_t s = (uintptr_t)vs; 2864 uintptr_t o = (d | s | n) & 7; 2865 size_t i; 2866 2867 #if !HOST_BIG_ENDIAN 2868 o = 0; 2869 #endif 2870 switch (o) { 2871 case 0: 2872 memmove(vd, vs, n); 2873 break; 2874 2875 case 4: 2876 if (d < s || d >= s + n) { 2877 for (i = 0; i < n; i += 4) { 2878 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2879 } 2880 } else { 2881 for (i = n; i > 0; ) { 2882 i -= 4; 2883 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2884 } 2885 } 2886 break; 2887 2888 case 2: 2889 case 6: 2890 if (d < s || d >= s + n) { 2891 for (i = 0; i < n; i += 2) { 2892 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2893 } 2894 } else { 2895 for (i = n; i > 0; ) { 2896 i -= 2; 2897 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2898 } 2899 } 2900 break; 2901 2902 default: 2903 if (d < s || d >= s + n) { 2904 for (i = 0; i < n; i++) { 2905 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2906 } 2907 } else { 2908 for (i = n; i > 0; ) { 2909 i -= 1; 2910 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2911 } 2912 } 2913 break; 2914 } 2915 } 2916 2917 /* Similarly for memset of 0. */ 2918 static void swap_memzero(void *vd, size_t n) 2919 { 2920 uintptr_t d = (uintptr_t)vd; 2921 uintptr_t o = (d | n) & 7; 2922 size_t i; 2923 2924 /* Usually, the first bit of a predicate is set, so N is 0. */ 2925 if (likely(n == 0)) { 2926 return; 2927 } 2928 2929 #if !HOST_BIG_ENDIAN 2930 o = 0; 2931 #endif 2932 switch (o) { 2933 case 0: 2934 memset(vd, 0, n); 2935 break; 2936 2937 case 4: 2938 for (i = 0; i < n; i += 4) { 2939 *(uint32_t *)H1_4(d + i) = 0; 2940 } 2941 break; 2942 2943 case 2: 2944 case 6: 2945 for (i = 0; i < n; i += 2) { 2946 *(uint16_t *)H1_2(d + i) = 0; 2947 } 2948 break; 2949 2950 default: 2951 for (i = 0; i < n; i++) { 2952 *(uint8_t *)H1(d + i) = 0; 2953 } 2954 break; 2955 } 2956 } 2957 2958 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) 2959 { 2960 intptr_t opr_sz = simd_oprsz(desc); 2961 size_t n_ofs = simd_data(desc); 2962 size_t n_siz = opr_sz - n_ofs; 2963 2964 if (vd != vm) { 2965 swap_memmove(vd, vn + n_ofs, n_siz); 2966 swap_memmove(vd + n_siz, vm, n_ofs); 2967 } else if (vd != vn) { 2968 swap_memmove(vd + n_siz, vd, n_ofs); 2969 swap_memmove(vd, vn + n_ofs, n_siz); 2970 } else { 2971 /* vd == vn == vm. Need temp space. */ 2972 ARMVectorReg tmp; 2973 swap_memmove(&tmp, vm, n_ofs); 2974 swap_memmove(vd, vd + n_ofs, n_siz); 2975 memcpy(vd + n_siz, &tmp, n_ofs); 2976 } 2977 } 2978 2979 #define DO_INSR(NAME, TYPE, H) \ 2980 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ 2981 { \ 2982 intptr_t opr_sz = simd_oprsz(desc); \ 2983 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ 2984 *(TYPE *)(vd + H(0)) = val; \ 2985 } 2986 2987 DO_INSR(sve_insr_b, uint8_t, H1) 2988 DO_INSR(sve_insr_h, uint16_t, H1_2) 2989 DO_INSR(sve_insr_s, uint32_t, H1_4) 2990 DO_INSR(sve_insr_d, uint64_t, H1_8) 2991 2992 #undef DO_INSR 2993 2994 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) 2995 { 2996 intptr_t i, j, opr_sz = simd_oprsz(desc); 2997 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2998 uint64_t f = *(uint64_t *)(vn + i); 2999 uint64_t b = *(uint64_t *)(vn + j); 3000 *(uint64_t *)(vd + i) = bswap64(b); 3001 *(uint64_t *)(vd + j) = bswap64(f); 3002 } 3003 } 3004 3005 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) 3006 { 3007 intptr_t i, j, opr_sz = simd_oprsz(desc); 3008 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 3009 uint64_t f = *(uint64_t *)(vn + i); 3010 uint64_t b = *(uint64_t *)(vn + j); 3011 *(uint64_t *)(vd + i) = hswap64(b); 3012 *(uint64_t *)(vd + j) = hswap64(f); 3013 } 3014 } 3015 3016 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) 3017 { 3018 intptr_t i, j, opr_sz = simd_oprsz(desc); 3019 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 3020 uint64_t f = *(uint64_t *)(vn + i); 3021 uint64_t b = *(uint64_t *)(vn + j); 3022 *(uint64_t *)(vd + i) = rol64(b, 32); 3023 *(uint64_t *)(vd + j) = rol64(f, 32); 3024 } 3025 } 3026 3027 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) 3028 { 3029 intptr_t i, j, opr_sz = simd_oprsz(desc); 3030 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 3031 uint64_t f = *(uint64_t *)(vn + i); 3032 uint64_t b = *(uint64_t *)(vn + j); 3033 *(uint64_t *)(vd + i) = b; 3034 *(uint64_t *)(vd + j) = f; 3035 } 3036 } 3037 3038 /* 3039 * TODO: This could use half_shuffle64 and similar bit tricks to 3040 * expand blocks of bits at once. 3041 */ 3042 #define DO_PMOV_PV(NAME, ESIZE) \ 3043 void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ 3044 { \ 3045 unsigned vl = simd_oprsz(desc); \ 3046 unsigned idx = simd_data(desc); \ 3047 unsigned elements = vl / ESIZE; \ 3048 ARMPredicateReg *d = vd; \ 3049 ARMVectorReg *s = vs; \ 3050 memset(d, 0, sizeof(*d)); \ 3051 for (unsigned e = 0; e < elements; ++e) { \ 3052 depositn(d->p, e * ESIZE, 1, extractn(s->d, elements * idx + e, 1)); \ 3053 } \ 3054 } 3055 3056 DO_PMOV_PV(pmov_pv_h, 2) 3057 DO_PMOV_PV(pmov_pv_s, 4) 3058 DO_PMOV_PV(pmov_pv_d, 8) 3059 3060 #undef DO_PMOV_PV 3061 3062 /* 3063 * TODO: This could use half_unshuffle64 and similar bit tricks to 3064 * compress blocks of bits at once. 3065 */ 3066 #define DO_PMOV_VP(NAME, ESIZE) \ 3067 void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ 3068 { \ 3069 unsigned vl = simd_oprsz(desc); \ 3070 unsigned idx = simd_data(desc); \ 3071 unsigned elements = vl / ESIZE; \ 3072 ARMVectorReg *d = vd; \ 3073 ARMPredicateReg *s = vs; \ 3074 if (idx == 0) { \ 3075 memset(d, 0, vl); \ 3076 } \ 3077 for (unsigned e = 0; e < elements; ++e) { \ 3078 depositn(d->d, elements * idx + e, 1, extractn(s->p, e * ESIZE, 1)); \ 3079 } \ 3080 } 3081 3082 DO_PMOV_VP(pmov_vp_h, 2) 3083 DO_PMOV_VP(pmov_vp_s, 4) 3084 DO_PMOV_VP(pmov_vp_d, 8) 3085 3086 #undef DO_PMOV_VP 3087 3088 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); 3089 3090 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, 3091 bool is_tbx, tb_impl_fn *fn) 3092 { 3093 ARMVectorReg scratch; 3094 uintptr_t oprsz = simd_oprsz(desc); 3095 3096 if (unlikely(vd == vn)) { 3097 vn = memcpy(&scratch, vn, oprsz); 3098 } 3099 3100 fn(vd, vn, NULL, vm, oprsz, is_tbx); 3101 } 3102 3103 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm, 3104 uint32_t desc, bool is_tbx, tb_impl_fn *fn) 3105 { 3106 ARMVectorReg scratch; 3107 uintptr_t oprsz = simd_oprsz(desc); 3108 3109 if (unlikely(vd == vn0)) { 3110 vn0 = memcpy(&scratch, vn0, oprsz); 3111 if (vd == vn1) { 3112 vn1 = vn0; 3113 } 3114 } else if (unlikely(vd == vn1)) { 3115 vn1 = memcpy(&scratch, vn1, oprsz); 3116 } 3117 3118 fn(vd, vn0, vn1, vm, oprsz, is_tbx); 3119 } 3120 3121 #define DO_TB(SUFF, TYPE, H) \ 3122 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \ 3123 void *vm, uintptr_t oprsz, bool is_tbx) \ 3124 { \ 3125 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \ 3126 uintptr_t i, nelem = oprsz / sizeof(TYPE); \ 3127 for (i = 0; i < nelem; ++i) { \ 3128 TYPE index = indexes[H1(i)], val = 0; \ 3129 if (index < nelem) { \ 3130 val = tbl0[H(index)]; \ 3131 } else { \ 3132 index -= nelem; \ 3133 if (tbl1 && index < nelem) { \ 3134 val = tbl1[H(index)]; \ 3135 } else if (is_tbx) { \ 3136 continue; \ 3137 } \ 3138 } \ 3139 d[H(i)] = val; \ 3140 } \ 3141 } \ 3142 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3143 { \ 3144 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \ 3145 } \ 3146 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \ 3147 void *vm, uint32_t desc) \ 3148 { \ 3149 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \ 3150 } \ 3151 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3152 { \ 3153 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \ 3154 } 3155 3156 DO_TB(b, uint8_t, H1) 3157 DO_TB(h, uint16_t, H2) 3158 DO_TB(s, uint32_t, H4) 3159 DO_TB(d, uint64_t, H8) 3160 3161 #undef DO_TB 3162 3163 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ 3164 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 3165 { \ 3166 intptr_t i, opr_sz = simd_oprsz(desc); \ 3167 TYPED *d = vd; \ 3168 TYPES *n = vn; \ 3169 ARMVectorReg tmp; \ 3170 if (unlikely(vn - vd < opr_sz)) { \ 3171 n = memcpy(&tmp, n, opr_sz / 2); \ 3172 } \ 3173 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ 3174 d[HD(i)] = n[HS(i)]; \ 3175 } \ 3176 } 3177 3178 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) 3179 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) 3180 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4) 3181 3182 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) 3183 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) 3184 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4) 3185 3186 #undef DO_UNPK 3187 3188 /* Mask of bits included in the even numbered predicates of width esz. 3189 * We also use this for expand_bits/compress_bits, and so extend the 3190 * same pattern out to 16-bit units. 3191 */ 3192 static const uint64_t even_bit_esz_masks[5] = { 3193 0x5555555555555555ull, 3194 0x3333333333333333ull, 3195 0x0f0f0f0f0f0f0f0full, 3196 0x00ff00ff00ff00ffull, 3197 0x0000ffff0000ffffull, 3198 }; 3199 3200 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits. 3201 * For N==0, this corresponds to the operation that in qemu/bitops.h 3202 * we call half_shuffle64; this algorithm is from Hacker's Delight, 3203 * section 7-2 Shuffling Bits. 3204 */ 3205 static uint64_t expand_bits(uint64_t x, int n) 3206 { 3207 int i; 3208 3209 x &= 0xffffffffu; 3210 for (i = 4; i >= n; i--) { 3211 int sh = 1 << i; 3212 x = ((x << sh) | x) & even_bit_esz_masks[i]; 3213 } 3214 return x; 3215 } 3216 3217 /* Compress units of 2**(N+1) bits to units of 2**N bits. 3218 * For N==0, this corresponds to the operation that in qemu/bitops.h 3219 * we call half_unshuffle64; this algorithm is from Hacker's Delight, 3220 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. 3221 */ 3222 static uint64_t compress_bits(uint64_t x, int n) 3223 { 3224 int i; 3225 3226 for (i = n; i <= 4; i++) { 3227 int sh = 1 << i; 3228 x &= even_bit_esz_masks[i]; 3229 x = (x >> sh) | x; 3230 } 3231 return x & 0xffffffffu; 3232 } 3233 3234 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3235 { 3236 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3237 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3238 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3239 int esize = 1 << esz; 3240 uint64_t *d = vd; 3241 intptr_t i; 3242 3243 if (oprsz <= 8) { 3244 uint64_t nn = *(uint64_t *)vn; 3245 uint64_t mm = *(uint64_t *)vm; 3246 int half = 4 * oprsz; 3247 3248 nn = extract64(nn, high * half, half); 3249 mm = extract64(mm, high * half, half); 3250 nn = expand_bits(nn, esz); 3251 mm = expand_bits(mm, esz); 3252 d[0] = nn | (mm << esize); 3253 } else { 3254 ARMPredicateReg tmp; 3255 3256 /* We produce output faster than we consume input. 3257 Therefore we must be mindful of possible overlap. */ 3258 if (vd == vn) { 3259 vn = memcpy(&tmp, vn, oprsz); 3260 if (vd == vm) { 3261 vm = vn; 3262 } 3263 } else if (vd == vm) { 3264 vm = memcpy(&tmp, vm, oprsz); 3265 } 3266 if (high) { 3267 high = oprsz >> 1; 3268 } 3269 3270 if ((oprsz & 7) == 0) { 3271 uint32_t *n = vn, *m = vm; 3272 high >>= 2; 3273 3274 for (i = 0; i < oprsz / 8; i++) { 3275 uint64_t nn = n[H4(high + i)]; 3276 uint64_t mm = m[H4(high + i)]; 3277 3278 nn = expand_bits(nn, esz); 3279 mm = expand_bits(mm, esz); 3280 d[i] = nn | (mm << esize); 3281 } 3282 } else { 3283 uint8_t *n = vn, *m = vm; 3284 uint16_t *d16 = vd; 3285 3286 for (i = 0; i < oprsz / 2; i++) { 3287 uint16_t nn = n[H1(high + i)]; 3288 uint16_t mm = m[H1(high + i)]; 3289 3290 nn = expand_bits(nn, esz); 3291 mm = expand_bits(mm, esz); 3292 d16[H2(i)] = nn | (mm << esize); 3293 } 3294 } 3295 } 3296 } 3297 3298 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3299 { 3300 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3301 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3302 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz; 3303 uint64_t *d = vd, *n = vn, *m = vm; 3304 uint64_t l, h; 3305 intptr_t i; 3306 3307 if (oprsz <= 8) { 3308 l = compress_bits(n[0] >> odd, esz); 3309 h = compress_bits(m[0] >> odd, esz); 3310 d[0] = l | (h << (4 * oprsz)); 3311 } else { 3312 ARMPredicateReg tmp_m; 3313 intptr_t oprsz_16 = oprsz / 16; 3314 3315 if ((vm - vd) < (uintptr_t)oprsz) { 3316 m = memcpy(&tmp_m, vm, oprsz); 3317 } 3318 3319 for (i = 0; i < oprsz_16; i++) { 3320 l = n[2 * i + 0]; 3321 h = n[2 * i + 1]; 3322 l = compress_bits(l >> odd, esz); 3323 h = compress_bits(h >> odd, esz); 3324 d[i] = l | (h << 32); 3325 } 3326 3327 /* 3328 * For VL which is not a multiple of 512, the results from M do not 3329 * align nicely with the uint64_t for D. Put the aligned results 3330 * from M into TMP_M and then copy it into place afterward. 3331 */ 3332 if (oprsz & 15) { 3333 int final_shift = (oprsz & 15) * 2; 3334 3335 l = n[2 * i + 0]; 3336 h = n[2 * i + 1]; 3337 l = compress_bits(l >> odd, esz); 3338 h = compress_bits(h >> odd, esz); 3339 d[i] = l | (h << final_shift); 3340 3341 for (i = 0; i < oprsz_16; i++) { 3342 l = m[2 * i + 0]; 3343 h = m[2 * i + 1]; 3344 l = compress_bits(l >> odd, esz); 3345 h = compress_bits(h >> odd, esz); 3346 tmp_m.p[i] = l | (h << 32); 3347 } 3348 l = m[2 * i + 0]; 3349 h = m[2 * i + 1]; 3350 l = compress_bits(l >> odd, esz); 3351 h = compress_bits(h >> odd, esz); 3352 tmp_m.p[i] = l | (h << final_shift); 3353 3354 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); 3355 } else { 3356 for (i = 0; i < oprsz_16; i++) { 3357 l = m[2 * i + 0]; 3358 h = m[2 * i + 1]; 3359 l = compress_bits(l >> odd, esz); 3360 h = compress_bits(h >> odd, esz); 3361 d[oprsz_16 + i] = l | (h << 32); 3362 } 3363 } 3364 } 3365 } 3366 3367 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3368 { 3369 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3370 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3371 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA); 3372 uint64_t *d = vd, *n = vn, *m = vm; 3373 uint64_t mask; 3374 int shr, shl; 3375 intptr_t i; 3376 3377 shl = 1 << esz; 3378 shr = 0; 3379 mask = even_bit_esz_masks[esz]; 3380 if (odd) { 3381 mask <<= shl; 3382 shr = shl; 3383 shl = 0; 3384 } 3385 3386 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { 3387 uint64_t nn = (n[i] & mask) >> shr; 3388 uint64_t mm = (m[i] & mask) << shl; 3389 d[i] = nn + mm; 3390 } 3391 } 3392 3393 /* Reverse units of 2**N bits. */ 3394 static uint64_t reverse_bits_64(uint64_t x, int n) 3395 { 3396 int i, sh; 3397 3398 x = bswap64(x); 3399 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3400 uint64_t mask = even_bit_esz_masks[i]; 3401 x = ((x & mask) << sh) | ((x >> sh) & mask); 3402 } 3403 return x; 3404 } 3405 3406 static uint8_t reverse_bits_8(uint8_t x, int n) 3407 { 3408 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; 3409 int i, sh; 3410 3411 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3412 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); 3413 } 3414 return x; 3415 } 3416 3417 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) 3418 { 3419 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3420 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3421 intptr_t i, oprsz_2 = oprsz / 2; 3422 3423 if (oprsz <= 8) { 3424 uint64_t l = *(uint64_t *)vn; 3425 l = reverse_bits_64(l << (64 - 8 * oprsz), esz); 3426 *(uint64_t *)vd = l; 3427 } else if ((oprsz & 15) == 0) { 3428 for (i = 0; i < oprsz_2; i += 8) { 3429 intptr_t ih = oprsz - 8 - i; 3430 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); 3431 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); 3432 *(uint64_t *)(vd + i) = h; 3433 *(uint64_t *)(vd + ih) = l; 3434 } 3435 } else { 3436 for (i = 0; i < oprsz_2; i += 1) { 3437 intptr_t il = H1(i); 3438 intptr_t ih = H1(oprsz - 1 - i); 3439 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); 3440 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); 3441 *(uint8_t *)(vd + il) = h; 3442 *(uint8_t *)(vd + ih) = l; 3443 } 3444 } 3445 } 3446 3447 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) 3448 { 3449 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3450 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3451 uint64_t *d = vd; 3452 intptr_t i; 3453 3454 if (oprsz <= 8) { 3455 uint64_t nn = *(uint64_t *)vn; 3456 int half = 4 * oprsz; 3457 3458 nn = extract64(nn, high * half, half); 3459 nn = expand_bits(nn, 0); 3460 d[0] = nn; 3461 } else { 3462 ARMPredicateReg tmp_n; 3463 3464 /* We produce output faster than we consume input. 3465 Therefore we must be mindful of possible overlap. */ 3466 if ((vn - vd) < (uintptr_t)oprsz) { 3467 vn = memcpy(&tmp_n, vn, oprsz); 3468 } 3469 if (high) { 3470 high = oprsz >> 1; 3471 } 3472 3473 if ((oprsz & 7) == 0) { 3474 uint32_t *n = vn; 3475 high >>= 2; 3476 3477 for (i = 0; i < oprsz / 8; i++) { 3478 uint64_t nn = n[H4(high + i)]; 3479 d[i] = expand_bits(nn, 0); 3480 } 3481 } else { 3482 uint16_t *d16 = vd; 3483 uint8_t *n = vn; 3484 3485 for (i = 0; i < oprsz / 2; i++) { 3486 uint16_t nn = n[H1(high + i)]; 3487 d16[H2(i)] = expand_bits(nn, 0); 3488 } 3489 } 3490 } 3491 } 3492 3493 #define DO_ZIP(NAME, TYPE, H) \ 3494 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3495 { \ 3496 intptr_t oprsz = simd_oprsz(desc); \ 3497 intptr_t odd_ofs = simd_data(desc); \ 3498 intptr_t i, oprsz_2 = oprsz / 2; \ 3499 ARMVectorReg tmp_n, tmp_m; \ 3500 /* We produce output faster than we consume input. \ 3501 Therefore we must be mindful of possible overlap. */ \ 3502 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ 3503 vn = memcpy(&tmp_n, vn, oprsz); \ 3504 } \ 3505 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3506 vm = memcpy(&tmp_m, vm, oprsz); \ 3507 } \ 3508 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ 3509 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \ 3510 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \ 3511 *(TYPE *)(vm + odd_ofs + H(i)); \ 3512 } \ 3513 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3514 memset(vd + oprsz - 16, 0, 16); \ 3515 } \ 3516 } 3517 3518 DO_ZIP(sve_zip_b, uint8_t, H1) 3519 DO_ZIP(sve_zip_h, uint16_t, H1_2) 3520 DO_ZIP(sve_zip_s, uint32_t, H1_4) 3521 DO_ZIP(sve_zip_d, uint64_t, H1_8) 3522 DO_ZIP(sve2_zip_q, Int128, ) 3523 3524 #define DO_UZP(NAME, TYPE, H) \ 3525 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3526 { \ 3527 intptr_t oprsz = simd_oprsz(desc); \ 3528 intptr_t odd_ofs = simd_data(desc); \ 3529 intptr_t i, p; \ 3530 ARMVectorReg tmp_m; \ 3531 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3532 vm = memcpy(&tmp_m, vm, oprsz); \ 3533 } \ 3534 i = 0, p = odd_ofs; \ 3535 do { \ 3536 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \ 3537 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3538 } while (p < oprsz); \ 3539 p -= oprsz; \ 3540 do { \ 3541 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \ 3542 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3543 } while (p < oprsz); \ 3544 tcg_debug_assert(i == oprsz); \ 3545 } 3546 3547 DO_UZP(sve_uzp_b, uint8_t, H1) 3548 DO_UZP(sve_uzp_h, uint16_t, H1_2) 3549 DO_UZP(sve_uzp_s, uint32_t, H1_4) 3550 DO_UZP(sve_uzp_d, uint64_t, H1_8) 3551 DO_UZP(sve2_uzp_q, Int128, ) 3552 3553 typedef void perseg_zzz_fn(void *vd, void *vn, void *vm, uint32_t desc); 3554 3555 static void do_perseg_zzz(void *vd, void *vn, void *vm, 3556 uint32_t desc, perseg_zzz_fn *fn) 3557 { 3558 intptr_t oprsz = simd_oprsz(desc); 3559 3560 desc = simd_desc(16, 16, simd_data(desc)); 3561 for (intptr_t i = 0; i < oprsz; i += 16) { 3562 fn(vd + i, vn + i, vm + i, desc); 3563 } 3564 } 3565 3566 #define DO_PERSEG_ZZZ(NAME, FUNC) \ 3567 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3568 { do_perseg_zzz(vd, vn, vm, desc, FUNC); } 3569 3570 DO_PERSEG_ZZZ(sve2p1_uzpq_b, helper_sve_uzp_b) 3571 DO_PERSEG_ZZZ(sve2p1_uzpq_h, helper_sve_uzp_h) 3572 DO_PERSEG_ZZZ(sve2p1_uzpq_s, helper_sve_uzp_s) 3573 DO_PERSEG_ZZZ(sve2p1_uzpq_d, helper_sve_uzp_d) 3574 3575 DO_PERSEG_ZZZ(sve2p1_zipq_b, helper_sve_zip_b) 3576 DO_PERSEG_ZZZ(sve2p1_zipq_h, helper_sve_zip_h) 3577 DO_PERSEG_ZZZ(sve2p1_zipq_s, helper_sve_zip_s) 3578 DO_PERSEG_ZZZ(sve2p1_zipq_d, helper_sve_zip_d) 3579 3580 DO_PERSEG_ZZZ(sve2p1_tblq_b, helper_sve_tbl_b) 3581 DO_PERSEG_ZZZ(sve2p1_tblq_h, helper_sve_tbl_h) 3582 DO_PERSEG_ZZZ(sve2p1_tblq_s, helper_sve_tbl_s) 3583 DO_PERSEG_ZZZ(sve2p1_tblq_d, helper_sve_tbl_d) 3584 3585 DO_PERSEG_ZZZ(sve2p1_tbxq_b, helper_sve2_tbx_b) 3586 DO_PERSEG_ZZZ(sve2p1_tbxq_h, helper_sve2_tbx_h) 3587 DO_PERSEG_ZZZ(sve2p1_tbxq_s, helper_sve2_tbx_s) 3588 DO_PERSEG_ZZZ(sve2p1_tbxq_d, helper_sve2_tbx_d) 3589 3590 #undef DO_PERSEG_ZZZ 3591 3592 #define DO_TRN(NAME, TYPE, H) \ 3593 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3594 { \ 3595 intptr_t oprsz = simd_oprsz(desc); \ 3596 intptr_t odd_ofs = simd_data(desc); \ 3597 intptr_t i; \ 3598 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ 3599 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ 3600 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ 3601 *(TYPE *)(vd + H(i + 0)) = ae; \ 3602 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ 3603 } \ 3604 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3605 memset(vd + oprsz - 16, 0, 16); \ 3606 } \ 3607 } 3608 3609 DO_TRN(sve_trn_b, uint8_t, H1) 3610 DO_TRN(sve_trn_h, uint16_t, H1_2) 3611 DO_TRN(sve_trn_s, uint32_t, H1_4) 3612 DO_TRN(sve_trn_d, uint64_t, H1_8) 3613 DO_TRN(sve2_trn_q, Int128, ) 3614 3615 #undef DO_ZIP 3616 #undef DO_UZP 3617 #undef DO_TRN 3618 3619 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) 3620 { 3621 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; 3622 uint32_t *d = vd, *n = vn; 3623 uint8_t *pg = vg; 3624 3625 for (i = j = 0; i < opr_sz; i++) { 3626 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { 3627 d[H4(j)] = n[H4(i)]; 3628 j++; 3629 } 3630 } 3631 for (; j < opr_sz; j++) { 3632 d[H4(j)] = 0; 3633 } 3634 } 3635 3636 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) 3637 { 3638 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; 3639 uint64_t *d = vd, *n = vn; 3640 uint8_t *pg = vg; 3641 3642 for (i = j = 0; i < opr_sz; i++) { 3643 if (pg[H1(i)] & 1) { 3644 d[j] = n[i]; 3645 j++; 3646 } 3647 } 3648 for (; j < opr_sz; j++) { 3649 d[j] = 0; 3650 } 3651 } 3652 3653 /* Similar to the ARM LastActiveElement pseudocode function, except the 3654 * result is multiplied by the element size. This includes the not found 3655 * indication; e.g. not found for esz=3 is -8. 3656 */ 3657 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) 3658 { 3659 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 3660 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3661 3662 return last_active_element(vg, words, esz); 3663 } 3664 3665 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) 3666 { 3667 intptr_t opr_sz = simd_oprsz(desc) / 8; 3668 int esz = simd_data(desc); 3669 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; 3670 intptr_t i, first_i, last_i; 3671 ARMVectorReg tmp; 3672 3673 first_i = last_i = 0; 3674 first_g = last_g = 0; 3675 3676 /* Find the extent of the active elements within VG. */ 3677 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { 3678 pg = *(uint64_t *)(vg + i) & mask; 3679 if (pg) { 3680 if (last_g == 0) { 3681 last_g = pg; 3682 last_i = i; 3683 } 3684 first_g = pg; 3685 first_i = i; 3686 } 3687 } 3688 3689 len = 0; 3690 if (first_g != 0) { 3691 first_i = first_i * 8 + ctz64(first_g); 3692 last_i = last_i * 8 + 63 - clz64(last_g); 3693 len = last_i - first_i + (1 << esz); 3694 if (vd == vm) { 3695 vm = memcpy(&tmp, vm, opr_sz * 8); 3696 } 3697 swap_memmove(vd, vn + first_i, len); 3698 } 3699 swap_memmove(vd + len, vm, opr_sz * 8 - len); 3700 } 3701 3702 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, 3703 void *vg, uint32_t desc) 3704 { 3705 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3706 uint64_t *d = vd, *n = vn, *m = vm; 3707 uint8_t *pg = vg; 3708 3709 for (i = 0; i < opr_sz; i += 1) { 3710 uint64_t nn = n[i], mm = m[i]; 3711 uint64_t pp = expand_pred_b(pg[H1(i)]); 3712 d[i] = (nn & pp) | (mm & ~pp); 3713 } 3714 } 3715 3716 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, 3717 void *vg, uint32_t desc) 3718 { 3719 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3720 uint64_t *d = vd, *n = vn, *m = vm; 3721 uint8_t *pg = vg; 3722 3723 for (i = 0; i < opr_sz; i += 1) { 3724 uint64_t nn = n[i], mm = m[i]; 3725 uint64_t pp = expand_pred_h(pg[H1(i)]); 3726 d[i] = (nn & pp) | (mm & ~pp); 3727 } 3728 } 3729 3730 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, 3731 void *vg, uint32_t desc) 3732 { 3733 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3734 uint64_t *d = vd, *n = vn, *m = vm; 3735 uint8_t *pg = vg; 3736 3737 for (i = 0; i < opr_sz; i += 1) { 3738 uint64_t nn = n[i], mm = m[i]; 3739 uint64_t pp = expand_pred_s(pg[H1(i)]); 3740 d[i] = (nn & pp) | (mm & ~pp); 3741 } 3742 } 3743 3744 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, 3745 void *vg, uint32_t desc) 3746 { 3747 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3748 uint64_t *d = vd, *n = vn, *m = vm; 3749 uint8_t *pg = vg; 3750 3751 for (i = 0; i < opr_sz; i += 1) { 3752 uint64_t nn = n[i], mm = m[i]; 3753 d[i] = (pg[H1(i)] & 1 ? nn : mm); 3754 } 3755 } 3756 3757 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm, 3758 void *vg, uint32_t desc) 3759 { 3760 intptr_t i, opr_sz = simd_oprsz(desc) / 16; 3761 Int128 *d = vd, *n = vn, *m = vm; 3762 uint16_t *pg = vg; 3763 3764 for (i = 0; i < opr_sz; i += 1) { 3765 d[i] = (pg[H2(i)] & 1 ? n : m)[i]; 3766 } 3767 } 3768 3769 /* Two operand comparison controlled by a predicate. 3770 * ??? It is very tempting to want to be able to expand this inline 3771 * with x86 instructions, e.g. 3772 * 3773 * vcmpeqw zm, zn, %ymm0 3774 * vpmovmskb %ymm0, %eax 3775 * and $0x5555, %eax 3776 * and pg, %eax 3777 * 3778 * or even aarch64, e.g. 3779 * 3780 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 3781 * cmeq v0.8h, zn, zm 3782 * and v0.8h, v0.8h, mask 3783 * addv h0, v0.8h 3784 * and v0.8b, pg 3785 * 3786 * However, coming up with an abstraction that allows vector inputs and 3787 * a scalar output, and also handles the byte-ordering of sub-uint64_t 3788 * scalar outputs, is tricky. 3789 */ 3790 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ 3791 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3792 { \ 3793 intptr_t opr_sz = simd_oprsz(desc); \ 3794 uint32_t flags = PREDTEST_INIT; \ 3795 intptr_t i = opr_sz; \ 3796 do { \ 3797 uint64_t out = 0, pg; \ 3798 do { \ 3799 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3800 TYPE nn = *(TYPE *)(vn + H(i)); \ 3801 TYPE mm = *(TYPE *)(vm + H(i)); \ 3802 out |= nn OP mm; \ 3803 } while (i & 63); \ 3804 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3805 out &= pg; \ 3806 *(uint64_t *)(vd + (i >> 3)) = out; \ 3807 flags = iter_predtest_bwd(out, pg, flags); \ 3808 } while (i > 0); \ 3809 return flags; \ 3810 } 3811 3812 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ 3813 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3814 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ 3815 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3816 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ 3817 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3818 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ 3819 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3820 3821 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) 3822 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) 3823 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) 3824 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) 3825 3826 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) 3827 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) 3828 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) 3829 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) 3830 3831 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) 3832 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) 3833 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) 3834 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) 3835 3836 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) 3837 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) 3838 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) 3839 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) 3840 3841 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) 3842 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) 3843 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) 3844 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) 3845 3846 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) 3847 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) 3848 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) 3849 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) 3850 3851 #undef DO_CMP_PPZZ_B 3852 #undef DO_CMP_PPZZ_H 3853 #undef DO_CMP_PPZZ_S 3854 #undef DO_CMP_PPZZ_D 3855 #undef DO_CMP_PPZZ 3856 3857 /* Similar, but the second source is "wide". */ 3858 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ 3859 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3860 { \ 3861 intptr_t opr_sz = simd_oprsz(desc); \ 3862 uint32_t flags = PREDTEST_INIT; \ 3863 intptr_t i = opr_sz; \ 3864 do { \ 3865 uint64_t out = 0, pg; \ 3866 do { \ 3867 TYPEW mm = *(TYPEW *)(vm + i - 8); \ 3868 do { \ 3869 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3870 TYPE nn = *(TYPE *)(vn + H(i)); \ 3871 out |= nn OP mm; \ 3872 } while (i & 7); \ 3873 } while (i & 63); \ 3874 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3875 out &= pg; \ 3876 *(uint64_t *)(vd + (i >> 3)) = out; \ 3877 flags = iter_predtest_bwd(out, pg, flags); \ 3878 } while (i > 0); \ 3879 return flags; \ 3880 } 3881 3882 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ 3883 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) 3884 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ 3885 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) 3886 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ 3887 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) 3888 3889 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==) 3890 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==) 3891 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==) 3892 3893 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=) 3894 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=) 3895 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=) 3896 3897 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) 3898 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) 3899 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) 3900 3901 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) 3902 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) 3903 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) 3904 3905 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) 3906 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) 3907 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) 3908 3909 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) 3910 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) 3911 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) 3912 3913 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) 3914 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) 3915 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) 3916 3917 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) 3918 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) 3919 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) 3920 3921 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) 3922 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) 3923 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) 3924 3925 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) 3926 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) 3927 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) 3928 3929 #undef DO_CMP_PPZW_B 3930 #undef DO_CMP_PPZW_H 3931 #undef DO_CMP_PPZW_S 3932 #undef DO_CMP_PPZW 3933 3934 /* Similar, but the second source is immediate. */ 3935 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ 3936 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 3937 { \ 3938 intptr_t opr_sz = simd_oprsz(desc); \ 3939 uint32_t flags = PREDTEST_INIT; \ 3940 TYPE mm = simd_data(desc); \ 3941 intptr_t i = opr_sz; \ 3942 do { \ 3943 uint64_t out = 0, pg; \ 3944 do { \ 3945 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3946 TYPE nn = *(TYPE *)(vn + H(i)); \ 3947 out |= nn OP mm; \ 3948 } while (i & 63); \ 3949 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3950 out &= pg; \ 3951 *(uint64_t *)(vd + (i >> 3)) = out; \ 3952 flags = iter_predtest_bwd(out, pg, flags); \ 3953 } while (i > 0); \ 3954 return flags; \ 3955 } 3956 3957 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \ 3958 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3959 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \ 3960 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3961 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \ 3962 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3963 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \ 3964 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3965 3966 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) 3967 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) 3968 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) 3969 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) 3970 3971 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) 3972 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) 3973 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) 3974 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) 3975 3976 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) 3977 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) 3978 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) 3979 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) 3980 3981 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) 3982 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) 3983 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) 3984 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) 3985 3986 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) 3987 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) 3988 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) 3989 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) 3990 3991 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) 3992 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) 3993 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) 3994 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) 3995 3996 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) 3997 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) 3998 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) 3999 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) 4000 4001 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) 4002 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) 4003 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) 4004 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) 4005 4006 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) 4007 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) 4008 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) 4009 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) 4010 4011 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) 4012 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) 4013 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) 4014 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) 4015 4016 #undef DO_CMP_PPZI_B 4017 #undef DO_CMP_PPZI_H 4018 #undef DO_CMP_PPZI_S 4019 #undef DO_CMP_PPZI_D 4020 #undef DO_CMP_PPZI 4021 4022 /* Similar to the ARM LastActive pseudocode function. */ 4023 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz) 4024 { 4025 intptr_t i; 4026 4027 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) { 4028 uint64_t pg = *(uint64_t *)(vg + i); 4029 if (pg) { 4030 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0; 4031 } 4032 } 4033 return 0; 4034 } 4035 4036 /* Compute a mask into RETB that is true for all G, up to and including 4037 * (if after) or excluding (if !after) the first G & N. 4038 * Return true if BRK found. 4039 */ 4040 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g, 4041 bool brk, bool after) 4042 { 4043 uint64_t b; 4044 4045 if (brk) { 4046 b = 0; 4047 } else if ((g & n) == 0) { 4048 /* For all G, no N are set; break not found. */ 4049 b = g; 4050 } else { 4051 /* Break somewhere in N. Locate it. */ 4052 b = g & n; /* guard true, pred true */ 4053 b = b & -b; /* first such */ 4054 if (after) { 4055 b = b | (b - 1); /* break after same */ 4056 } else { 4057 b = b - 1; /* break before same */ 4058 } 4059 brk = true; 4060 } 4061 4062 *retb = b; 4063 return brk; 4064 } 4065 4066 /* Compute a zeroing BRK. */ 4067 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g, 4068 intptr_t oprsz, bool after) 4069 { 4070 bool brk = false; 4071 intptr_t i; 4072 4073 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 4074 uint64_t this_b, this_g = g[i]; 4075 4076 brk = compute_brk(&this_b, n[i], this_g, brk, after); 4077 d[i] = this_b & this_g; 4078 } 4079 } 4080 4081 /* Likewise, but also compute flags. */ 4082 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g, 4083 intptr_t oprsz, bool after) 4084 { 4085 uint32_t flags = PREDTEST_INIT; 4086 bool brk = false; 4087 intptr_t i; 4088 4089 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 4090 uint64_t this_b, this_d, this_g = g[i]; 4091 4092 brk = compute_brk(&this_b, n[i], this_g, brk, after); 4093 d[i] = this_d = this_b & this_g; 4094 flags = iter_predtest_fwd(this_d, this_g, flags); 4095 } 4096 return flags; 4097 } 4098 4099 /* Compute a merging BRK. */ 4100 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g, 4101 intptr_t oprsz, bool after) 4102 { 4103 bool brk = false; 4104 intptr_t i; 4105 4106 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 4107 uint64_t this_b, this_g = g[i]; 4108 4109 brk = compute_brk(&this_b, n[i], this_g, brk, after); 4110 d[i] = (this_b & this_g) | (d[i] & ~this_g); 4111 } 4112 } 4113 4114 /* Likewise, but also compute flags. */ 4115 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g, 4116 intptr_t oprsz, bool after) 4117 { 4118 uint32_t flags = PREDTEST_INIT; 4119 bool brk = false; 4120 intptr_t i; 4121 4122 for (i = 0; i < oprsz / 8; ++i) { 4123 uint64_t this_b, this_d = d[i], this_g = g[i]; 4124 4125 brk = compute_brk(&this_b, n[i], this_g, brk, after); 4126 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g); 4127 flags = iter_predtest_fwd(this_d, this_g, flags); 4128 } 4129 return flags; 4130 } 4131 4132 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, 4133 uint32_t pred_desc) 4134 { 4135 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4136 if (last_active_pred(vn, vg, oprsz)) { 4137 compute_brk_z(vd, vm, vg, oprsz, true); 4138 } else { 4139 memset(vd, 0, sizeof(ARMPredicateReg)); 4140 } 4141 } 4142 4143 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, 4144 uint32_t pred_desc) 4145 { 4146 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4147 if (last_active_pred(vn, vg, oprsz)) { 4148 return compute_brks_z(vd, vm, vg, oprsz, true); 4149 } else { 4150 memset(vd, 0, sizeof(ARMPredicateReg)); 4151 return PREDTEST_INIT; 4152 } 4153 } 4154 4155 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, 4156 uint32_t pred_desc) 4157 { 4158 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4159 if (last_active_pred(vn, vg, oprsz)) { 4160 compute_brk_z(vd, vm, vg, oprsz, false); 4161 } else { 4162 memset(vd, 0, sizeof(ARMPredicateReg)); 4163 } 4164 } 4165 4166 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, 4167 uint32_t pred_desc) 4168 { 4169 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4170 if (last_active_pred(vn, vg, oprsz)) { 4171 return compute_brks_z(vd, vm, vg, oprsz, false); 4172 } else { 4173 memset(vd, 0, sizeof(ARMPredicateReg)); 4174 return PREDTEST_INIT; 4175 } 4176 } 4177 4178 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4179 { 4180 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4181 compute_brk_z(vd, vn, vg, oprsz, true); 4182 } 4183 4184 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4185 { 4186 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4187 return compute_brks_z(vd, vn, vg, oprsz, true); 4188 } 4189 4190 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4191 { 4192 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4193 compute_brk_z(vd, vn, vg, oprsz, false); 4194 } 4195 4196 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4197 { 4198 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4199 return compute_brks_z(vd, vn, vg, oprsz, false); 4200 } 4201 4202 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4203 { 4204 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4205 compute_brk_m(vd, vn, vg, oprsz, true); 4206 } 4207 4208 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4209 { 4210 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4211 return compute_brks_m(vd, vn, vg, oprsz, true); 4212 } 4213 4214 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4215 { 4216 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4217 compute_brk_m(vd, vn, vg, oprsz, false); 4218 } 4219 4220 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4221 { 4222 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4223 return compute_brks_m(vd, vn, vg, oprsz, false); 4224 } 4225 4226 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4227 { 4228 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4229 if (!last_active_pred(vn, vg, oprsz)) { 4230 memset(vd, 0, sizeof(ARMPredicateReg)); 4231 } 4232 } 4233 4234 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4235 { 4236 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4237 if (last_active_pred(vn, vg, oprsz)) { 4238 ARMPredicateReg *d = vd; 4239 uint32_t flags = PREDTEST_INIT; 4240 intptr_t i; 4241 4242 /* As if PredTest(Ones(PL), D, MO_8). */ 4243 for (i = 0; i < oprsz / 8; i++) { 4244 flags = iter_predtest_fwd(d->p[i], -1, flags); 4245 } 4246 if (oprsz & 7) { 4247 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); 4248 flags = iter_predtest_fwd(d->p[i], mask, flags); 4249 } 4250 return flags; 4251 } 4252 memset(vd, 0, sizeof(ARMPredicateReg)); 4253 return PREDTEST_INIT; 4254 } 4255 4256 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) 4257 { 4258 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 4259 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4260 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; 4261 intptr_t i; 4262 4263 for (i = 0; i < words; ++i) { 4264 uint64_t t = n[i] & g[i] & mask; 4265 sum += ctpop64(t); 4266 } 4267 return sum; 4268 } 4269 4270 uint64_t HELPER(sve2p1_cntp_c)(uint32_t png, uint32_t desc) 4271 { 4272 int pl = FIELD_EX32(desc, PREDDESC, OPRSZ); 4273 int vl = pl * 8; 4274 unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ); 4275 int lg2_width = FIELD_EX32(desc, PREDDESC, DATA) + 1; 4276 DecodeCounter p = decode_counter(png, vl, v_esz); 4277 unsigned maxelem = (vl << lg2_width) >> v_esz; 4278 unsigned count = p.count; 4279 4280 if (p.invert) { 4281 if (count >= maxelem) { 4282 return 0; 4283 } 4284 count = maxelem - count; 4285 } else { 4286 count = MIN(count, maxelem); 4287 } 4288 return count >> p.lg2_stride; 4289 } 4290 4291 /* C.f. Arm pseudocode EncodePredCount */ 4292 static uint64_t encode_pred_count(uint32_t elements, uint32_t count, 4293 uint32_t esz, bool invert) 4294 { 4295 uint32_t pred; 4296 4297 if (count == 0) { 4298 return 0; 4299 } 4300 if (invert) { 4301 count = elements - count; 4302 } else if (count == elements) { 4303 count = 0; 4304 invert = true; 4305 } 4306 4307 pred = (count << 1) | 1; 4308 pred <<= esz; 4309 pred |= invert << 15; 4310 4311 return pred; 4312 } 4313 4314 /* C.f. Arm pseudocode PredCountTest */ 4315 static uint32_t pred_count_test(uint32_t elements, uint32_t count, bool invert) 4316 { 4317 uint32_t flags; 4318 4319 if (count == 0) { 4320 flags = 1; /* !N, Z, C */ 4321 } else if (!invert) { 4322 flags = (1u << 31) | 2; /* N, !Z */ 4323 flags |= count != elements; /* C */ 4324 } else { 4325 flags = 2; /* !Z, !C */ 4326 flags |= (count == elements) << 31; /* N */ 4327 } 4328 return flags; 4329 } 4330 4331 /* D must be cleared on entry. */ 4332 static void do_whilel(ARMPredicateReg *d, uint64_t esz_mask, 4333 uint32_t count, uint32_t oprbits) 4334 { 4335 tcg_debug_assert(count <= oprbits); 4336 if (count) { 4337 uint32_t i; 4338 4339 /* Set all of the requested bits. */ 4340 for (i = 0; i < count / 64; ++i) { 4341 d->p[i] = esz_mask; 4342 } 4343 if (count & 63) { 4344 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; 4345 } 4346 } 4347 } 4348 4349 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) 4350 { 4351 uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4352 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4353 uint32_t oprbits = oprsz * 8; 4354 uint64_t esz_mask = pred_esz_masks[esz]; 4355 ARMPredicateReg *d = vd; 4356 4357 count <<= esz; 4358 memset(d, 0, sizeof(*d)); 4359 do_whilel(d, esz_mask, count, oprbits); 4360 return pred_count_test(oprbits, count, false); 4361 } 4362 4363 uint32_t HELPER(sve_while2l)(void *vd, uint32_t count, uint32_t pred_desc) 4364 { 4365 uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4366 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4367 uint32_t oprbits = oprsz * 8; 4368 uint64_t esz_mask = pred_esz_masks[esz]; 4369 ARMPredicateReg *d = vd; 4370 4371 count <<= esz; 4372 memset(d, 0, 2 * sizeof(*d)); 4373 if (count <= oprbits) { 4374 do_whilel(&d[0], esz_mask, count, oprbits); 4375 } else { 4376 do_whilel(&d[0], esz_mask, oprbits, oprbits); 4377 do_whilel(&d[1], esz_mask, count - oprbits, oprbits); 4378 } 4379 4380 return pred_count_test(2 * oprbits, count, false); 4381 } 4382 4383 uint32_t HELPER(sve_whilecl)(void *vd, uint32_t count, uint32_t pred_desc) 4384 { 4385 uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4386 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4387 uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA); 4388 uint32_t vl = pl * 8; 4389 uint32_t elements = (vl >> esz) << scale; 4390 ARMPredicateReg *d = vd; 4391 4392 *d = (ARMPredicateReg) { 4393 .p[0] = encode_pred_count(elements, count, esz, false) 4394 }; 4395 return pred_count_test(elements, count, false); 4396 } 4397 4398 /* D must be cleared on entry. */ 4399 static void do_whileg(ARMPredicateReg *d, uint64_t esz_mask, 4400 uint32_t count, uint32_t oprbits) 4401 { 4402 tcg_debug_assert(count <= oprbits); 4403 if (count) { 4404 uint32_t i, invcount = oprbits - count; 4405 uint64_t bits = esz_mask & MAKE_64BIT_MASK(invcount & 63, 64); 4406 4407 for (i = invcount / 64; i < oprbits / 64; ++i) { 4408 d->p[i] = bits; 4409 bits = esz_mask; 4410 } 4411 if (oprbits & 63) { 4412 d->p[i] = bits & MAKE_64BIT_MASK(0, oprbits & 63); 4413 } 4414 } 4415 } 4416 4417 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) 4418 { 4419 uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4420 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4421 uint32_t oprbits = oprsz * 8; 4422 uint64_t esz_mask = pred_esz_masks[esz]; 4423 ARMPredicateReg *d = vd; 4424 4425 count <<= esz; 4426 memset(d, 0, sizeof(*d)); 4427 do_whileg(d, esz_mask, count, oprbits); 4428 return pred_count_test(oprbits, count, true); 4429 } 4430 4431 uint32_t HELPER(sve_while2g)(void *vd, uint32_t count, uint32_t pred_desc) 4432 { 4433 uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4434 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4435 uint32_t oprbits = oprsz * 8; 4436 uint64_t esz_mask = pred_esz_masks[esz]; 4437 ARMPredicateReg *d = vd; 4438 4439 count <<= esz; 4440 memset(d, 0, 2 * sizeof(*d)); 4441 if (count <= oprbits) { 4442 do_whileg(&d[1], esz_mask, count, oprbits); 4443 } else { 4444 do_whilel(&d[1], esz_mask, oprbits, oprbits); 4445 do_whileg(&d[0], esz_mask, count - oprbits, oprbits); 4446 } 4447 4448 return pred_count_test(2 * oprbits, count, true); 4449 } 4450 4451 uint32_t HELPER(sve_whilecg)(void *vd, uint32_t count, uint32_t pred_desc) 4452 { 4453 uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4454 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4455 uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA); 4456 uint32_t vl = pl * 8; 4457 uint32_t elements = (vl >> esz) << scale; 4458 ARMPredicateReg *d = vd; 4459 4460 *d = (ARMPredicateReg) { 4461 .p[0] = encode_pred_count(elements, count, esz, true) 4462 }; 4463 return pred_count_test(elements, count, true); 4464 } 4465 4466 /* Recursive reduction on a function; 4467 * C.f. the ARM ARM function ReducePredicated. 4468 * 4469 * While it would be possible to write this without the DATA temporary, 4470 * it is much simpler to process the predicate register this way. 4471 * The recursion is bounded to depth 7 (128 fp16 elements), so there's 4472 * little to gain with a more complex non-recursive form. 4473 */ 4474 #define DO_REDUCE(NAME, SUF, TYPE, H, FUNC, IDENT) \ 4475 static TYPE FUNC##_reduce(TYPE *data, float_status *status, uintptr_t n) \ 4476 { \ 4477 if (n == 1) { \ 4478 return *data; \ 4479 } else { \ 4480 uintptr_t half = n / 2; \ 4481 TYPE lo = FUNC##_reduce(data, status, half); \ 4482 TYPE hi = FUNC##_reduce(data + half, status, half); \ 4483 return FUNC(lo, hi, status); \ 4484 } \ 4485 } \ 4486 uint64_t helper_sve_##NAME##v_##SUF(void *vn, void *vg, \ 4487 float_status *status, uint32_t desc) \ 4488 { \ 4489 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ 4490 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ 4491 TYPE ident = IDENT; \ 4492 for (i = 0; i < oprsz; ) { \ 4493 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 4494 do { \ 4495 TYPE nn = *(TYPE *)(vn + H(i)); \ 4496 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : ident); \ 4497 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 4498 } while (i & 15); \ 4499 } \ 4500 for (; i < maxsz; i += sizeof(TYPE)) { \ 4501 *(TYPE *)((void *)data + i) = ident; \ 4502 } \ 4503 return FUNC##_reduce(data, status, maxsz / sizeof(TYPE)); \ 4504 } \ 4505 void helper_sve2p1_##NAME##qv_##SUF(void *vd, void *vn, void *vg, \ 4506 float_status *status, uint32_t desc) \ 4507 { \ 4508 unsigned oprsz = simd_oprsz(desc), segments = oprsz / 16; \ 4509 TYPE ident = IDENT; \ 4510 for (unsigned e = 0; e < 16; e += sizeof(TYPE)) { \ 4511 TYPE data[ARM_MAX_VQ]; \ 4512 for (unsigned s = 0; s < segments; s++) { \ 4513 uint16_t pg = *(uint16_t *)(vg + H1_2(s * 2)); \ 4514 TYPE nn = *(TYPE *)(vn + (s * 16 + H(e))); \ 4515 data[s] = (pg >> e) & 1 ? nn : ident; \ 4516 } \ 4517 *(TYPE *)(vd + H(e)) = FUNC##_reduce(data, status, segments); \ 4518 } \ 4519 clear_tail(vd, 16, simd_maxsz(desc)); \ 4520 } 4521 4522 DO_REDUCE(fadd,h, float16, H1_2, float16_add, float16_zero) 4523 DO_REDUCE(fadd,s, float32, H1_4, float32_add, float32_zero) 4524 DO_REDUCE(fadd,d, float64, H1_8, float64_add, float64_zero) 4525 4526 /* 4527 * We can't avoid the function call for the default NaN value, because 4528 * it changes when FPCR.AH is set. 4529 */ 4530 DO_REDUCE(fminnm,h, float16, H1_2, float16_minnum, float16_default_nan(status)) 4531 DO_REDUCE(fminnm,s, float32, H1_4, float32_minnum, float32_default_nan(status)) 4532 DO_REDUCE(fminnm,d, float64, H1_8, float64_minnum, float64_default_nan(status)) 4533 4534 DO_REDUCE(fmaxnm,h, float16, H1_2, float16_maxnum, float16_default_nan(status)) 4535 DO_REDUCE(fmaxnm,s, float32, H1_4, float32_maxnum, float32_default_nan(status)) 4536 DO_REDUCE(fmaxnm,d, float64, H1_8, float64_maxnum, float64_default_nan(status)) 4537 4538 DO_REDUCE(fmin,h, float16, H1_2, float16_min, float16_infinity) 4539 DO_REDUCE(fmin,s, float32, H1_4, float32_min, float32_infinity) 4540 DO_REDUCE(fmin,d, float64, H1_8, float64_min, float64_infinity) 4541 4542 DO_REDUCE(fmax,h, float16, H1_2, float16_max, float16_chs(float16_infinity)) 4543 DO_REDUCE(fmax,s, float32, H1_4, float32_max, float32_chs(float32_infinity)) 4544 DO_REDUCE(fmax,d, float64, H1_8, float64_max, float64_chs(float64_infinity)) 4545 4546 DO_REDUCE(ah_fmin,h, float16, H1_2, helper_vfp_ah_minh, float16_infinity) 4547 DO_REDUCE(ah_fmin,s, float32, H1_4, helper_vfp_ah_mins, float32_infinity) 4548 DO_REDUCE(ah_fmin,d, float64, H1_8, helper_vfp_ah_mind, float64_infinity) 4549 4550 DO_REDUCE(ah_fmax,h, float16, H1_2, helper_vfp_ah_maxh, 4551 float16_chs(float16_infinity)) 4552 DO_REDUCE(ah_fmax,s, float32, H1_4, helper_vfp_ah_maxs, 4553 float32_chs(float32_infinity)) 4554 DO_REDUCE(ah_fmax,d, float64, H1_8, helper_vfp_ah_maxd, 4555 float64_chs(float64_infinity)) 4556 4557 #undef DO_REDUCE 4558 4559 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg, 4560 float_status *status, uint32_t desc) 4561 { 4562 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4563 float16 result = nn; 4564 4565 do { 4566 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4567 do { 4568 if (pg & 1) { 4569 float16 mm = *(float16 *)(vm + H1_2(i)); 4570 result = float16_add(result, mm, status); 4571 } 4572 i += sizeof(float16), pg >>= sizeof(float16); 4573 } while (i & 15); 4574 } while (i < opr_sz); 4575 4576 return result; 4577 } 4578 4579 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg, 4580 float_status *status, uint32_t desc) 4581 { 4582 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4583 float32 result = nn; 4584 4585 do { 4586 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4587 do { 4588 if (pg & 1) { 4589 float32 mm = *(float32 *)(vm + H1_2(i)); 4590 result = float32_add(result, mm, status); 4591 } 4592 i += sizeof(float32), pg >>= sizeof(float32); 4593 } while (i & 15); 4594 } while (i < opr_sz); 4595 4596 return result; 4597 } 4598 4599 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg, 4600 float_status *status, uint32_t desc) 4601 { 4602 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8; 4603 uint64_t *m = vm; 4604 uint8_t *pg = vg; 4605 4606 for (i = 0; i < opr_sz; i++) { 4607 if (pg[H1(i)] & 1) { 4608 nn = float64_add(nn, m[i], status); 4609 } 4610 } 4611 4612 return nn; 4613 } 4614 4615 /* Fully general three-operand expander, controlled by a predicate, 4616 * With the extra float_status parameter. 4617 */ 4618 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \ 4619 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4620 float_status *status, uint32_t desc) \ 4621 { \ 4622 intptr_t i = simd_oprsz(desc); \ 4623 uint64_t *g = vg; \ 4624 do { \ 4625 uint64_t pg = g[(i - 1) >> 6]; \ 4626 do { \ 4627 i -= sizeof(TYPE); \ 4628 if (likely((pg >> (i & 63)) & 1)) { \ 4629 TYPE nn = *(TYPE *)(vn + H(i)); \ 4630 TYPE mm = *(TYPE *)(vm + H(i)); \ 4631 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4632 } \ 4633 } while (i & 63); \ 4634 } while (i != 0); \ 4635 } 4636 4637 DO_ZPZZ_FP(sve_fadd_b16, uint16_t, H1_2, bfloat16_add) 4638 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) 4639 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) 4640 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) 4641 4642 DO_ZPZZ_FP(sve_fsub_b16, uint16_t, H1_2, bfloat16_sub) 4643 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) 4644 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) 4645 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) 4646 4647 DO_ZPZZ_FP(sve_fmul_b16, uint16_t, H1_2, bfloat16_mul) 4648 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) 4649 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) 4650 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) 4651 4652 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) 4653 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) 4654 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) 4655 4656 DO_ZPZZ_FP(sve_fmin_b16, uint16_t, H1_2, bfloat16_min) 4657 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) 4658 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) 4659 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) 4660 4661 DO_ZPZZ_FP(sve_fmax_b16, uint16_t, H1_2, bfloat16_max) 4662 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) 4663 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) 4664 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) 4665 4666 DO_ZPZZ_FP(sve_ah_fmin_b16, uint16_t, H1_2, helper_sme2_ah_fmin_b16) 4667 DO_ZPZZ_FP(sve_ah_fmin_h, uint16_t, H1_2, helper_vfp_ah_minh) 4668 DO_ZPZZ_FP(sve_ah_fmin_s, uint32_t, H1_4, helper_vfp_ah_mins) 4669 DO_ZPZZ_FP(sve_ah_fmin_d, uint64_t, H1_8, helper_vfp_ah_mind) 4670 4671 DO_ZPZZ_FP(sve_ah_fmax_b16, uint16_t, H1_2, helper_sme2_ah_fmax_b16) 4672 DO_ZPZZ_FP(sve_ah_fmax_h, uint16_t, H1_2, helper_vfp_ah_maxh) 4673 DO_ZPZZ_FP(sve_ah_fmax_s, uint32_t, H1_4, helper_vfp_ah_maxs) 4674 DO_ZPZZ_FP(sve_ah_fmax_d, uint64_t, H1_8, helper_vfp_ah_maxd) 4675 4676 DO_ZPZZ_FP(sve_fminnum_b16, uint16_t, H1_2, bfloat16_minnum) 4677 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) 4678 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) 4679 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) 4680 4681 DO_ZPZZ_FP(sve_fmaxnum_b16, uint16_t, H1_2, bfloat16_maxnum) 4682 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) 4683 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) 4684 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) 4685 4686 static inline float16 abd_h(float16 a, float16 b, float_status *s) 4687 { 4688 return float16_abs(float16_sub(a, b, s)); 4689 } 4690 4691 static inline float32 abd_s(float32 a, float32 b, float_status *s) 4692 { 4693 return float32_abs(float32_sub(a, b, s)); 4694 } 4695 4696 static inline float64 abd_d(float64 a, float64 b, float_status *s) 4697 { 4698 return float64_abs(float64_sub(a, b, s)); 4699 } 4700 4701 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */ 4702 static float16 ah_abd_h(float16 op1, float16 op2, float_status *stat) 4703 { 4704 float16 r = float16_sub(op1, op2, stat); 4705 return float16_is_any_nan(r) ? r : float16_abs(r); 4706 } 4707 4708 static float32 ah_abd_s(float32 op1, float32 op2, float_status *stat) 4709 { 4710 float32 r = float32_sub(op1, op2, stat); 4711 return float32_is_any_nan(r) ? r : float32_abs(r); 4712 } 4713 4714 static float64 ah_abd_d(float64 op1, float64 op2, float_status *stat) 4715 { 4716 float64 r = float64_sub(op1, op2, stat); 4717 return float64_is_any_nan(r) ? r : float64_abs(r); 4718 } 4719 4720 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) 4721 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) 4722 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) 4723 DO_ZPZZ_FP(sve_ah_fabd_h, uint16_t, H1_2, ah_abd_h) 4724 DO_ZPZZ_FP(sve_ah_fabd_s, uint32_t, H1_4, ah_abd_s) 4725 DO_ZPZZ_FP(sve_ah_fabd_d, uint64_t, H1_8, ah_abd_d) 4726 4727 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) 4728 { 4729 int b_int = MIN(MAX(b, INT_MIN), INT_MAX); 4730 return float64_scalbn(a, b_int, s); 4731 } 4732 4733 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) 4734 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) 4735 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d) 4736 4737 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) 4738 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) 4739 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd) 4740 4741 #undef DO_ZPZZ_FP 4742 4743 /* Three-operand expander, with one scalar operand, controlled by 4744 * a predicate, with the extra float_status parameter. 4745 */ 4746 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \ 4747 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ 4748 float_status *status, uint32_t desc) \ 4749 { \ 4750 intptr_t i = simd_oprsz(desc); \ 4751 uint64_t *g = vg; \ 4752 TYPE mm = scalar; \ 4753 do { \ 4754 uint64_t pg = g[(i - 1) >> 6]; \ 4755 do { \ 4756 i -= sizeof(TYPE); \ 4757 if (likely((pg >> (i & 63)) & 1)) { \ 4758 TYPE nn = *(TYPE *)(vn + H(i)); \ 4759 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4760 } \ 4761 } while (i & 63); \ 4762 } while (i != 0); \ 4763 } 4764 4765 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) 4766 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) 4767 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add) 4768 4769 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) 4770 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) 4771 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub) 4772 4773 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) 4774 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) 4775 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul) 4776 4777 static inline float16 subr_h(float16 a, float16 b, float_status *s) 4778 { 4779 return float16_sub(b, a, s); 4780 } 4781 4782 static inline float32 subr_s(float32 a, float32 b, float_status *s) 4783 { 4784 return float32_sub(b, a, s); 4785 } 4786 4787 static inline float64 subr_d(float64 a, float64 b, float_status *s) 4788 { 4789 return float64_sub(b, a, s); 4790 } 4791 4792 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) 4793 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) 4794 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d) 4795 4796 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) 4797 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) 4798 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum) 4799 4800 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) 4801 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) 4802 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum) 4803 4804 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) 4805 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) 4806 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max) 4807 4808 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) 4809 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) 4810 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) 4811 4812 DO_ZPZS_FP(sve_ah_fmaxs_h, float16, H1_2, helper_vfp_ah_maxh) 4813 DO_ZPZS_FP(sve_ah_fmaxs_s, float32, H1_4, helper_vfp_ah_maxs) 4814 DO_ZPZS_FP(sve_ah_fmaxs_d, float64, H1_8, helper_vfp_ah_maxd) 4815 4816 DO_ZPZS_FP(sve_ah_fmins_h, float16, H1_2, helper_vfp_ah_minh) 4817 DO_ZPZS_FP(sve_ah_fmins_s, float32, H1_4, helper_vfp_ah_mins) 4818 DO_ZPZS_FP(sve_ah_fmins_d, float64, H1_8, helper_vfp_ah_mind) 4819 4820 /* Fully general two-operand expander, controlled by a predicate, 4821 * With the extra float_status parameter. 4822 */ 4823 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \ 4824 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 4825 float_status *status, uint32_t desc) \ 4826 { \ 4827 intptr_t i = simd_oprsz(desc); \ 4828 uint64_t *g = vg; \ 4829 do { \ 4830 uint64_t pg = g[(i - 1) >> 6]; \ 4831 do { \ 4832 i -= sizeof(TYPE); \ 4833 if (likely((pg >> (i & 63)) & 1)) { \ 4834 TYPE nn = *(TYPE *)(vn + H(i)); \ 4835 *(TYPE *)(vd + H(i)) = OP(nn, status); \ 4836 } \ 4837 } while (i & 63); \ 4838 } while (i != 0); \ 4839 } 4840 4841 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore 4842 * FZ16. When converting from fp16, this affects flushing input denormals; 4843 * when converting to fp16, this affects flushing output denormals. 4844 */ 4845 float32 sve_f16_to_f32(float16 f, float_status *fpst) 4846 { 4847 bool save = get_flush_inputs_to_zero(fpst); 4848 float32 ret; 4849 4850 set_flush_inputs_to_zero(false, fpst); 4851 ret = float16_to_float32(f, true, fpst); 4852 set_flush_inputs_to_zero(save, fpst); 4853 return ret; 4854 } 4855 4856 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) 4857 { 4858 bool save = get_flush_inputs_to_zero(fpst); 4859 float64 ret; 4860 4861 set_flush_inputs_to_zero(false, fpst); 4862 ret = float16_to_float64(f, true, fpst); 4863 set_flush_inputs_to_zero(save, fpst); 4864 return ret; 4865 } 4866 4867 float16 sve_f32_to_f16(float32 f, float_status *fpst) 4868 { 4869 bool save = get_flush_to_zero(fpst); 4870 float16 ret; 4871 4872 set_flush_to_zero(false, fpst); 4873 ret = float32_to_float16(f, true, fpst); 4874 set_flush_to_zero(save, fpst); 4875 return ret; 4876 } 4877 4878 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst) 4879 { 4880 bool save = get_flush_to_zero(fpst); 4881 float16 ret; 4882 4883 set_flush_to_zero(false, fpst); 4884 ret = float64_to_float16(f, true, fpst); 4885 set_flush_to_zero(save, fpst); 4886 return ret; 4887 } 4888 4889 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s) 4890 { 4891 if (float16_is_any_nan(f)) { 4892 float_raise(float_flag_invalid, s); 4893 return 0; 4894 } 4895 return float16_to_int16_round_to_zero(f, s); 4896 } 4897 4898 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s) 4899 { 4900 if (float16_is_any_nan(f)) { 4901 float_raise(float_flag_invalid, s); 4902 return 0; 4903 } 4904 return float16_to_int64_round_to_zero(f, s); 4905 } 4906 4907 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s) 4908 { 4909 if (float32_is_any_nan(f)) { 4910 float_raise(float_flag_invalid, s); 4911 return 0; 4912 } 4913 return float32_to_int64_round_to_zero(f, s); 4914 } 4915 4916 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s) 4917 { 4918 if (float64_is_any_nan(f)) { 4919 float_raise(float_flag_invalid, s); 4920 return 0; 4921 } 4922 return float64_to_int64_round_to_zero(f, s); 4923 } 4924 4925 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s) 4926 { 4927 if (float16_is_any_nan(f)) { 4928 float_raise(float_flag_invalid, s); 4929 return 0; 4930 } 4931 return float16_to_uint16_round_to_zero(f, s); 4932 } 4933 4934 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s) 4935 { 4936 if (float16_is_any_nan(f)) { 4937 float_raise(float_flag_invalid, s); 4938 return 0; 4939 } 4940 return float16_to_uint64_round_to_zero(f, s); 4941 } 4942 4943 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s) 4944 { 4945 if (float32_is_any_nan(f)) { 4946 float_raise(float_flag_invalid, s); 4947 return 0; 4948 } 4949 return float32_to_uint64_round_to_zero(f, s); 4950 } 4951 4952 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) 4953 { 4954 if (float64_is_any_nan(f)) { 4955 float_raise(float_flag_invalid, s); 4956 return 0; 4957 } 4958 return float64_to_uint64_round_to_zero(f, s); 4959 } 4960 4961 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) 4962 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) 4963 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16) 4964 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16) 4965 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64) 4966 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32) 4967 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64) 4968 4969 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) 4970 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) 4971 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) 4972 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz) 4973 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz) 4974 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd) 4975 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz) 4976 4977 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) 4978 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) 4979 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) 4980 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz) 4981 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz) 4982 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd) 4983 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz) 4984 4985 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) 4986 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) 4987 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd) 4988 4989 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) 4990 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) 4991 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int) 4992 4993 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) 4994 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) 4995 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64) 4996 4997 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) 4998 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) 4999 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt) 5000 5001 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) 5002 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) 5003 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) 5004 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64) 5005 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16) 5006 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32) 5007 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64) 5008 5009 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) 5010 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) 5011 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) 5012 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64) 5013 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16) 5014 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32) 5015 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64) 5016 5017 static int16_t do_float16_logb_as_int(float16 a, float_status *s) 5018 { 5019 /* Extract frac to the top of the uint32_t. */ 5020 uint32_t frac = (uint32_t)a << (16 + 6); 5021 int16_t exp = extract32(a, 10, 5); 5022 5023 if (unlikely(exp == 0)) { 5024 if (frac != 0) { 5025 if (!get_flush_inputs_to_zero(s)) { 5026 /* denormal: bias - fractional_zeros */ 5027 return -15 - clz32(frac); 5028 } 5029 /* flush to zero */ 5030 float_raise(float_flag_input_denormal_flushed, s); 5031 } 5032 } else if (unlikely(exp == 0x1f)) { 5033 if (frac == 0) { 5034 return INT16_MAX; /* infinity */ 5035 } 5036 } else { 5037 /* normal: exp - bias */ 5038 return exp - 15; 5039 } 5040 /* nan or zero */ 5041 float_raise(float_flag_invalid, s); 5042 return INT16_MIN; 5043 } 5044 5045 static int32_t do_float32_logb_as_int(float32 a, float_status *s) 5046 { 5047 /* Extract frac to the top of the uint32_t. */ 5048 uint32_t frac = a << 9; 5049 int32_t exp = extract32(a, 23, 8); 5050 5051 if (unlikely(exp == 0)) { 5052 if (frac != 0) { 5053 if (!get_flush_inputs_to_zero(s)) { 5054 /* denormal: bias - fractional_zeros */ 5055 return -127 - clz32(frac); 5056 } 5057 /* flush to zero */ 5058 float_raise(float_flag_input_denormal_flushed, s); 5059 } 5060 } else if (unlikely(exp == 0xff)) { 5061 if (frac == 0) { 5062 return INT32_MAX; /* infinity */ 5063 } 5064 } else { 5065 /* normal: exp - bias */ 5066 return exp - 127; 5067 } 5068 /* nan or zero */ 5069 float_raise(float_flag_invalid, s); 5070 return INT32_MIN; 5071 } 5072 5073 static int64_t do_float64_logb_as_int(float64 a, float_status *s) 5074 { 5075 /* Extract frac to the top of the uint64_t. */ 5076 uint64_t frac = a << 12; 5077 int64_t exp = extract64(a, 52, 11); 5078 5079 if (unlikely(exp == 0)) { 5080 if (frac != 0) { 5081 if (!get_flush_inputs_to_zero(s)) { 5082 /* denormal: bias - fractional_zeros */ 5083 return -1023 - clz64(frac); 5084 } 5085 /* flush to zero */ 5086 float_raise(float_flag_input_denormal_flushed, s); 5087 } 5088 } else if (unlikely(exp == 0x7ff)) { 5089 if (frac == 0) { 5090 return INT64_MAX; /* infinity */ 5091 } 5092 } else { 5093 /* normal: exp - bias */ 5094 return exp - 1023; 5095 } 5096 /* nan or zero */ 5097 float_raise(float_flag_invalid, s); 5098 return INT64_MIN; 5099 } 5100 5101 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int) 5102 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int) 5103 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) 5104 5105 #undef DO_ZPZ_FP 5106 5107 static void do_fmla_zpzzz_b16(void *vd, void *vn, void *vm, void *va, void *vg, 5108 float_status *status, uint32_t desc, 5109 uint16_t neg1, uint16_t neg3, int flags) 5110 { 5111 intptr_t i = simd_oprsz(desc); 5112 uint64_t *g = vg; 5113 5114 do { 5115 uint64_t pg = g[(i - 1) >> 6]; 5116 do { 5117 i -= 2; 5118 if (likely((pg >> (i & 63)) & 1)) { 5119 float16 e1, e2, e3, r; 5120 5121 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; 5122 e2 = *(uint16_t *)(vm + H1_2(i)); 5123 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; 5124 r = bfloat16_muladd(e1, e2, e3, flags, status); 5125 *(uint16_t *)(vd + H1_2(i)) = r; 5126 } 5127 } while (i & 63); 5128 } while (i != 0); 5129 } 5130 5131 void HELPER(sve_fmla_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, 5132 void *vg, float_status *status, uint32_t desc) 5133 { 5134 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 5135 } 5136 5137 void HELPER(sve_fmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, 5138 void *vg, float_status *status, uint32_t desc) 5139 { 5140 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0); 5141 } 5142 5143 void HELPER(sve_fnmla_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, 5144 void *vg, float_status *status, uint32_t desc) 5145 { 5146 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0); 5147 } 5148 5149 void HELPER(sve_fnmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, 5150 void *vg, float_status *status, uint32_t desc) 5151 { 5152 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0); 5153 } 5154 5155 void HELPER(sve_ah_fmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, 5156 void *vg, float_status *status, uint32_t desc) 5157 { 5158 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0, 5159 float_muladd_negate_product); 5160 } 5161 5162 void HELPER(sve_ah_fnmla_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, 5163 void *vg, float_status *status, uint32_t desc) 5164 { 5165 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0, 5166 float_muladd_negate_product | float_muladd_negate_c); 5167 } 5168 5169 void HELPER(sve_ah_fnmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, 5170 void *vg, float_status *status, uint32_t desc) 5171 { 5172 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0, 5173 float_muladd_negate_c); 5174 } 5175 5176 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, 5177 float_status *status, uint32_t desc, 5178 uint16_t neg1, uint16_t neg3, int flags) 5179 { 5180 intptr_t i = simd_oprsz(desc); 5181 uint64_t *g = vg; 5182 5183 do { 5184 uint64_t pg = g[(i - 1) >> 6]; 5185 do { 5186 i -= 2; 5187 if (likely((pg >> (i & 63)) & 1)) { 5188 float16 e1, e2, e3, r; 5189 5190 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; 5191 e2 = *(uint16_t *)(vm + H1_2(i)); 5192 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; 5193 r = float16_muladd(e1, e2, e3, flags, status); 5194 *(uint16_t *)(vd + H1_2(i)) = r; 5195 } 5196 } while (i & 63); 5197 } while (i != 0); 5198 } 5199 5200 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5201 void *vg, float_status *status, uint32_t desc) 5202 { 5203 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 5204 } 5205 5206 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5207 void *vg, float_status *status, uint32_t desc) 5208 { 5209 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0); 5210 } 5211 5212 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5213 void *vg, float_status *status, uint32_t desc) 5214 { 5215 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0); 5216 } 5217 5218 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5219 void *vg, float_status *status, uint32_t desc) 5220 { 5221 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0); 5222 } 5223 5224 void HELPER(sve_ah_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5225 void *vg, float_status *status, uint32_t desc) 5226 { 5227 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 5228 float_muladd_negate_product); 5229 } 5230 5231 void HELPER(sve_ah_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5232 void *vg, float_status *status, uint32_t desc) 5233 { 5234 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 5235 float_muladd_negate_product | float_muladd_negate_c); 5236 } 5237 5238 void HELPER(sve_ah_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5239 void *vg, float_status *status, uint32_t desc) 5240 { 5241 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 5242 float_muladd_negate_c); 5243 } 5244 5245 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, 5246 float_status *status, uint32_t desc, 5247 uint32_t neg1, uint32_t neg3, int flags) 5248 { 5249 intptr_t i = simd_oprsz(desc); 5250 uint64_t *g = vg; 5251 5252 do { 5253 uint64_t pg = g[(i - 1) >> 6]; 5254 do { 5255 i -= 4; 5256 if (likely((pg >> (i & 63)) & 1)) { 5257 float32 e1, e2, e3, r; 5258 5259 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1; 5260 e2 = *(uint32_t *)(vm + H1_4(i)); 5261 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; 5262 r = float32_muladd(e1, e2, e3, flags, status); 5263 *(uint32_t *)(vd + H1_4(i)) = r; 5264 } 5265 } while (i & 63); 5266 } while (i != 0); 5267 } 5268 5269 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5270 void *vg, float_status *status, uint32_t desc) 5271 { 5272 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 5273 } 5274 5275 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5276 void *vg, float_status *status, uint32_t desc) 5277 { 5278 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0, 0); 5279 } 5280 5281 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5282 void *vg, float_status *status, uint32_t desc) 5283 { 5284 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000, 0); 5285 } 5286 5287 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5288 void *vg, float_status *status, uint32_t desc) 5289 { 5290 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000, 0); 5291 } 5292 5293 void HELPER(sve_ah_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5294 void *vg, float_status *status, uint32_t desc) 5295 { 5296 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 5297 float_muladd_negate_product); 5298 } 5299 5300 void HELPER(sve_ah_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5301 void *vg, float_status *status, uint32_t desc) 5302 { 5303 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 5304 float_muladd_negate_product | float_muladd_negate_c); 5305 } 5306 5307 void HELPER(sve_ah_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5308 void *vg, float_status *status, uint32_t desc) 5309 { 5310 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 5311 float_muladd_negate_c); 5312 } 5313 5314 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, 5315 float_status *status, uint32_t desc, 5316 uint64_t neg1, uint64_t neg3, int flags) 5317 { 5318 intptr_t i = simd_oprsz(desc); 5319 uint64_t *g = vg; 5320 5321 do { 5322 uint64_t pg = g[(i - 1) >> 6]; 5323 do { 5324 i -= 8; 5325 if (likely((pg >> (i & 63)) & 1)) { 5326 float64 e1, e2, e3, r; 5327 5328 e1 = *(uint64_t *)(vn + i) ^ neg1; 5329 e2 = *(uint64_t *)(vm + i); 5330 e3 = *(uint64_t *)(va + i) ^ neg3; 5331 r = float64_muladd(e1, e2, e3, flags, status); 5332 *(uint64_t *)(vd + i) = r; 5333 } 5334 } while (i & 63); 5335 } while (i != 0); 5336 } 5337 5338 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5339 void *vg, float_status *status, uint32_t desc) 5340 { 5341 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 5342 } 5343 5344 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5345 void *vg, float_status *status, uint32_t desc) 5346 { 5347 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0, 0); 5348 } 5349 5350 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5351 void *vg, float_status *status, uint32_t desc) 5352 { 5353 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN, 0); 5354 } 5355 5356 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5357 void *vg, float_status *status, uint32_t desc) 5358 { 5359 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN, 0); 5360 } 5361 5362 void HELPER(sve_ah_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5363 void *vg, float_status *status, uint32_t desc) 5364 { 5365 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5366 float_muladd_negate_product); 5367 } 5368 5369 void HELPER(sve_ah_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5370 void *vg, float_status *status, uint32_t desc) 5371 { 5372 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5373 float_muladd_negate_product | float_muladd_negate_c); 5374 } 5375 5376 void HELPER(sve_ah_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5377 void *vg, float_status *status, uint32_t desc) 5378 { 5379 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5380 float_muladd_negate_c); 5381 } 5382 5383 /* Two operand floating-point comparison controlled by a predicate. 5384 * Unlike the integer version, we are not allowed to optimistically 5385 * compare operands, since the comparison may have side effects wrt 5386 * the FPSR. 5387 */ 5388 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \ 5389 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 5390 float_status *status, uint32_t desc) \ 5391 { \ 5392 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 5393 uint64_t *d = vd, *g = vg; \ 5394 do { \ 5395 uint64_t out = 0, pg = g[j]; \ 5396 do { \ 5397 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 5398 if (likely((pg >> (i & 63)) & 1)) { \ 5399 TYPE nn = *(TYPE *)(vn + H(i)); \ 5400 TYPE mm = *(TYPE *)(vm + H(i)); \ 5401 out |= OP(TYPE, nn, mm, status); \ 5402 } \ 5403 } while (i & 63); \ 5404 d[j--] = out; \ 5405 } while (i > 0); \ 5406 } 5407 5408 #define DO_FPCMP_PPZZ_H(NAME, OP) \ 5409 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP) 5410 #define DO_FPCMP_PPZZ_S(NAME, OP) \ 5411 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) 5412 #define DO_FPCMP_PPZZ_D(NAME, OP) \ 5413 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP) 5414 5415 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \ 5416 DO_FPCMP_PPZZ_H(NAME, OP) \ 5417 DO_FPCMP_PPZZ_S(NAME, OP) \ 5418 DO_FPCMP_PPZZ_D(NAME, OP) 5419 5420 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0 5421 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0 5422 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0 5423 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0 5424 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0 5425 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0 5426 #define DO_FCMUO(TYPE, X, Y, ST) \ 5427 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered 5428 #define DO_FACGE(TYPE, X, Y, ST) \ 5429 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0 5430 #define DO_FACGT(TYPE, X, Y, ST) \ 5431 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0 5432 5433 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) 5434 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT) 5435 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ) 5436 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE) 5437 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO) 5438 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE) 5439 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT) 5440 5441 #undef DO_FPCMP_PPZZ_ALL 5442 #undef DO_FPCMP_PPZZ_D 5443 #undef DO_FPCMP_PPZZ_S 5444 #undef DO_FPCMP_PPZZ_H 5445 #undef DO_FPCMP_PPZZ 5446 5447 /* One operand floating-point comparison against zero, controlled 5448 * by a predicate. 5449 */ 5450 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \ 5451 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 5452 float_status *status, uint32_t desc) \ 5453 { \ 5454 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 5455 uint64_t *d = vd, *g = vg; \ 5456 do { \ 5457 uint64_t out = 0, pg = g[j]; \ 5458 do { \ 5459 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 5460 if ((pg >> (i & 63)) & 1) { \ 5461 TYPE nn = *(TYPE *)(vn + H(i)); \ 5462 out |= OP(TYPE, nn, 0, status); \ 5463 } \ 5464 } while (i & 63); \ 5465 d[j--] = out; \ 5466 } while (i > 0); \ 5467 } 5468 5469 #define DO_FPCMP_PPZ0_H(NAME, OP) \ 5470 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP) 5471 #define DO_FPCMP_PPZ0_S(NAME, OP) \ 5472 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) 5473 #define DO_FPCMP_PPZ0_D(NAME, OP) \ 5474 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP) 5475 5476 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \ 5477 DO_FPCMP_PPZ0_H(NAME, OP) \ 5478 DO_FPCMP_PPZ0_S(NAME, OP) \ 5479 DO_FPCMP_PPZ0_D(NAME, OP) 5480 5481 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE) 5482 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT) 5483 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE) 5484 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT) 5485 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ) 5486 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE) 5487 5488 /* FP Trig Multiply-Add. */ 5489 5490 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, 5491 float_status *s, uint32_t desc) 5492 { 5493 static const float16 coeff[16] = { 5494 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 5495 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 5496 }; 5497 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); 5498 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5499 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5500 float16 *d = vd, *n = vn, *m = vm; 5501 5502 for (i = 0; i < opr_sz; i++) { 5503 float16 mm = m[i]; 5504 intptr_t xx = x; 5505 int flags = 0; 5506 5507 if (float16_is_neg(mm)) { 5508 if (fpcr_ah) { 5509 flags = float_muladd_negate_product; 5510 } else { 5511 mm = float16_abs(mm); 5512 } 5513 xx += 8; 5514 } 5515 d[i] = float16_muladd(n[i], mm, coeff[xx], flags, s); 5516 } 5517 } 5518 5519 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, 5520 float_status *s, uint32_t desc) 5521 { 5522 static const float32 coeff[16] = { 5523 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9, 5524 0x36369d6d, 0x00000000, 0x00000000, 0x00000000, 5525 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705, 5526 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, 5527 }; 5528 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); 5529 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5530 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5531 float32 *d = vd, *n = vn, *m = vm; 5532 5533 for (i = 0; i < opr_sz; i++) { 5534 float32 mm = m[i]; 5535 intptr_t xx = x; 5536 int flags = 0; 5537 5538 if (float32_is_neg(mm)) { 5539 if (fpcr_ah) { 5540 flags = float_muladd_negate_product; 5541 } else { 5542 mm = float32_abs(mm); 5543 } 5544 xx += 8; 5545 } 5546 d[i] = float32_muladd(n[i], mm, coeff[xx], flags, s); 5547 } 5548 } 5549 5550 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, 5551 float_status *s, uint32_t desc) 5552 { 5553 static const float64 coeff[16] = { 5554 0x3ff0000000000000ull, 0xbfc5555555555543ull, 5555 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull, 5556 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull, 5557 0x3de5d8408868552full, 0x0000000000000000ull, 5558 0x3ff0000000000000ull, 0xbfe0000000000000ull, 5559 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull, 5560 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull, 5561 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, 5562 }; 5563 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); 5564 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5565 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5566 float64 *d = vd, *n = vn, *m = vm; 5567 5568 for (i = 0; i < opr_sz; i++) { 5569 float64 mm = m[i]; 5570 intptr_t xx = x; 5571 int flags = 0; 5572 5573 if (float64_is_neg(mm)) { 5574 if (fpcr_ah) { 5575 flags = float_muladd_negate_product; 5576 } else { 5577 mm = float64_abs(mm); 5578 } 5579 xx += 8; 5580 } 5581 d[i] = float64_muladd(n[i], mm, coeff[xx], flags, s); 5582 } 5583 } 5584 5585 /* 5586 * FP Complex Add 5587 */ 5588 5589 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, 5590 float_status *s, uint32_t desc) 5591 { 5592 intptr_t j, i = simd_oprsz(desc); 5593 uint64_t *g = vg; 5594 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5595 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5596 5597 do { 5598 uint64_t pg = g[(i - 1) >> 6]; 5599 do { 5600 float16 e0, e1, e2, e3; 5601 5602 /* I holds the real index; J holds the imag index. */ 5603 j = i - sizeof(float16); 5604 i -= 2 * sizeof(float16); 5605 5606 e0 = *(float16 *)(vn + H1_2(i)); 5607 e1 = *(float16 *)(vm + H1_2(j)); 5608 e2 = *(float16 *)(vn + H1_2(j)); 5609 e3 = *(float16 *)(vm + H1_2(i)); 5610 5611 if (rot) { 5612 e3 = float16_maybe_ah_chs(e3, fpcr_ah); 5613 } else { 5614 e1 = float16_maybe_ah_chs(e1, fpcr_ah); 5615 } 5616 5617 if (likely((pg >> (i & 63)) & 1)) { 5618 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s); 5619 } 5620 if (likely((pg >> (j & 63)) & 1)) { 5621 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s); 5622 } 5623 } while (i & 63); 5624 } while (i != 0); 5625 } 5626 5627 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, 5628 float_status *s, uint32_t desc) 5629 { 5630 intptr_t j, i = simd_oprsz(desc); 5631 uint64_t *g = vg; 5632 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5633 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5634 5635 do { 5636 uint64_t pg = g[(i - 1) >> 6]; 5637 do { 5638 float32 e0, e1, e2, e3; 5639 5640 /* I holds the real index; J holds the imag index. */ 5641 j = i - sizeof(float32); 5642 i -= 2 * sizeof(float32); 5643 5644 e0 = *(float32 *)(vn + H1_2(i)); 5645 e1 = *(float32 *)(vm + H1_2(j)); 5646 e2 = *(float32 *)(vn + H1_2(j)); 5647 e3 = *(float32 *)(vm + H1_2(i)); 5648 5649 if (rot) { 5650 e3 = float32_maybe_ah_chs(e3, fpcr_ah); 5651 } else { 5652 e1 = float32_maybe_ah_chs(e1, fpcr_ah); 5653 } 5654 5655 if (likely((pg >> (i & 63)) & 1)) { 5656 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, s); 5657 } 5658 if (likely((pg >> (j & 63)) & 1)) { 5659 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, s); 5660 } 5661 } while (i & 63); 5662 } while (i != 0); 5663 } 5664 5665 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, 5666 float_status *s, uint32_t desc) 5667 { 5668 intptr_t j, i = simd_oprsz(desc); 5669 uint64_t *g = vg; 5670 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5671 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5672 5673 do { 5674 uint64_t pg = g[(i - 1) >> 6]; 5675 do { 5676 float64 e0, e1, e2, e3; 5677 5678 /* I holds the real index; J holds the imag index. */ 5679 j = i - sizeof(float64); 5680 i -= 2 * sizeof(float64); 5681 5682 e0 = *(float64 *)(vn + H1_2(i)); 5683 e1 = *(float64 *)(vm + H1_2(j)); 5684 e2 = *(float64 *)(vn + H1_2(j)); 5685 e3 = *(float64 *)(vm + H1_2(i)); 5686 5687 if (rot) { 5688 e3 = float64_maybe_ah_chs(e3, fpcr_ah); 5689 } else { 5690 e1 = float64_maybe_ah_chs(e1, fpcr_ah); 5691 } 5692 5693 if (likely((pg >> (i & 63)) & 1)) { 5694 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, s); 5695 } 5696 if (likely((pg >> (j & 63)) & 1)) { 5697 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, s); 5698 } 5699 } while (i & 63); 5700 } while (i != 0); 5701 } 5702 5703 /* 5704 * FP Complex Multiply 5705 */ 5706 5707 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5708 void *vg, float_status *status, uint32_t desc) 5709 { 5710 intptr_t j, i = simd_oprsz(desc); 5711 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5712 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5713 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5714 uint32_t negf_real = flip ^ negf_imag; 5715 float16 negx_imag, negx_real; 5716 uint64_t *g = vg; 5717 5718 /* With AH=0, use negx; with AH=1 use negf. */ 5719 negx_real = (negf_real & ~fpcr_ah) << 15; 5720 negx_imag = (negf_imag & ~fpcr_ah) << 15; 5721 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5722 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5723 5724 do { 5725 uint64_t pg = g[(i - 1) >> 6]; 5726 do { 5727 float16 e1, e2, e3, e4, nr, ni, mr, mi, d; 5728 5729 /* I holds the real index; J holds the imag index. */ 5730 j = i - sizeof(float16); 5731 i -= 2 * sizeof(float16); 5732 5733 nr = *(float16 *)(vn + H1_2(i)); 5734 ni = *(float16 *)(vn + H1_2(j)); 5735 mr = *(float16 *)(vm + H1_2(i)); 5736 mi = *(float16 *)(vm + H1_2(j)); 5737 5738 e2 = (flip ? ni : nr); 5739 e1 = (flip ? mi : mr) ^ negx_real; 5740 e4 = e2; 5741 e3 = (flip ? mr : mi) ^ negx_imag; 5742 5743 if (likely((pg >> (i & 63)) & 1)) { 5744 d = *(float16 *)(va + H1_2(i)); 5745 d = float16_muladd(e2, e1, d, negf_real, status); 5746 *(float16 *)(vd + H1_2(i)) = d; 5747 } 5748 if (likely((pg >> (j & 63)) & 1)) { 5749 d = *(float16 *)(va + H1_2(j)); 5750 d = float16_muladd(e4, e3, d, negf_imag, status); 5751 *(float16 *)(vd + H1_2(j)) = d; 5752 } 5753 } while (i & 63); 5754 } while (i != 0); 5755 } 5756 5757 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5758 void *vg, float_status *status, uint32_t desc) 5759 { 5760 intptr_t j, i = simd_oprsz(desc); 5761 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5762 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5763 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5764 uint32_t negf_real = flip ^ negf_imag; 5765 float32 negx_imag, negx_real; 5766 uint64_t *g = vg; 5767 5768 /* With AH=0, use negx; with AH=1 use negf. */ 5769 negx_real = (negf_real & ~fpcr_ah) << 31; 5770 negx_imag = (negf_imag & ~fpcr_ah) << 31; 5771 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5772 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5773 5774 do { 5775 uint64_t pg = g[(i - 1) >> 6]; 5776 do { 5777 float32 e1, e2, e3, e4, nr, ni, mr, mi, d; 5778 5779 /* I holds the real index; J holds the imag index. */ 5780 j = i - sizeof(float32); 5781 i -= 2 * sizeof(float32); 5782 5783 nr = *(float32 *)(vn + H1_2(i)); 5784 ni = *(float32 *)(vn + H1_2(j)); 5785 mr = *(float32 *)(vm + H1_2(i)); 5786 mi = *(float32 *)(vm + H1_2(j)); 5787 5788 e2 = (flip ? ni : nr); 5789 e1 = (flip ? mi : mr) ^ negx_real; 5790 e4 = e2; 5791 e3 = (flip ? mr : mi) ^ negx_imag; 5792 5793 if (likely((pg >> (i & 63)) & 1)) { 5794 d = *(float32 *)(va + H1_2(i)); 5795 d = float32_muladd(e2, e1, d, negf_real, status); 5796 *(float32 *)(vd + H1_2(i)) = d; 5797 } 5798 if (likely((pg >> (j & 63)) & 1)) { 5799 d = *(float32 *)(va + H1_2(j)); 5800 d = float32_muladd(e4, e3, d, negf_imag, status); 5801 *(float32 *)(vd + H1_2(j)) = d; 5802 } 5803 } while (i & 63); 5804 } while (i != 0); 5805 } 5806 5807 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5808 void *vg, float_status *status, uint32_t desc) 5809 { 5810 intptr_t j, i = simd_oprsz(desc); 5811 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5812 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5813 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5814 uint32_t negf_real = flip ^ negf_imag; 5815 float64 negx_imag, negx_real; 5816 uint64_t *g = vg; 5817 5818 /* With AH=0, use negx; with AH=1 use negf. */ 5819 negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63; 5820 negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63; 5821 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5822 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5823 5824 do { 5825 uint64_t pg = g[(i - 1) >> 6]; 5826 do { 5827 float64 e1, e2, e3, e4, nr, ni, mr, mi, d; 5828 5829 /* I holds the real index; J holds the imag index. */ 5830 j = i - sizeof(float64); 5831 i -= 2 * sizeof(float64); 5832 5833 nr = *(float64 *)(vn + H1_2(i)); 5834 ni = *(float64 *)(vn + H1_2(j)); 5835 mr = *(float64 *)(vm + H1_2(i)); 5836 mi = *(float64 *)(vm + H1_2(j)); 5837 5838 e2 = (flip ? ni : nr); 5839 e1 = (flip ? mi : mr) ^ negx_real; 5840 e4 = e2; 5841 e3 = (flip ? mr : mi) ^ negx_imag; 5842 5843 if (likely((pg >> (i & 63)) & 1)) { 5844 d = *(float64 *)(va + H1_2(i)); 5845 d = float64_muladd(e2, e1, d, negf_real, status); 5846 *(float64 *)(vd + H1_2(i)) = d; 5847 } 5848 if (likely((pg >> (j & 63)) & 1)) { 5849 d = *(float64 *)(va + H1_2(j)); 5850 d = float64_muladd(e4, e3, d, negf_imag, status); 5851 *(float64 *)(vd + H1_2(j)) = d; 5852 } 5853 } while (i & 63); 5854 } while (i != 0); 5855 } 5856 5857 /* 5858 * Load contiguous data, protected by a governing predicate. 5859 */ 5860 5861 /* 5862 * Skip through a sequence of inactive elements in the guarding predicate @vg, 5863 * beginning at @reg_off bounded by @reg_max. Return the offset of the active 5864 * element >= @reg_off, or @reg_max if there were no active elements at all. 5865 */ 5866 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off, 5867 intptr_t reg_max, int esz) 5868 { 5869 uint64_t pg_mask = pred_esz_masks[esz]; 5870 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63); 5871 5872 /* In normal usage, the first element is active. */ 5873 if (likely(pg & 1)) { 5874 return reg_off; 5875 } 5876 5877 if (pg == 0) { 5878 reg_off &= -64; 5879 do { 5880 reg_off += 64; 5881 if (unlikely(reg_off >= reg_max)) { 5882 /* The entire predicate was false. */ 5883 return reg_max; 5884 } 5885 pg = vg[reg_off >> 6] & pg_mask; 5886 } while (pg == 0); 5887 } 5888 reg_off += ctz64(pg); 5889 5890 /* We should never see an out of range predicate bit set. */ 5891 tcg_debug_assert(reg_off < reg_max); 5892 return reg_off; 5893 } 5894 5895 /* 5896 * Resolve the guest virtual address to info->host and info->flags. 5897 * If @nofault, return false if the page is invalid, otherwise 5898 * exit via page fault exception. 5899 */ 5900 5901 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env, 5902 target_ulong addr, int mem_off, MMUAccessType access_type, 5903 int mmu_idx, uintptr_t retaddr) 5904 { 5905 int flags; 5906 5907 addr += mem_off; 5908 5909 /* 5910 * User-only currently always issues with TBI. See the comment 5911 * above useronly_clean_ptr. Usually we clean this top byte away 5912 * during translation, but we can't do that for e.g. vector + imm 5913 * addressing modes. 5914 * 5915 * We currently always enable TBI for user-only, and do not provide 5916 * a way to turn it off. So clean the pointer unconditionally here, 5917 * rather than look it up here, or pass it down from above. 5918 */ 5919 addr = useronly_clean_ptr(addr); 5920 5921 #ifdef CONFIG_USER_ONLY 5922 flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault, 5923 &info->host, retaddr); 5924 #else 5925 CPUTLBEntryFull *full; 5926 flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault, 5927 &info->host, &full, retaddr); 5928 #endif 5929 info->flags = flags; 5930 5931 if (flags & TLB_INVALID_MASK) { 5932 g_assert(nofault); 5933 return false; 5934 } 5935 5936 #ifdef CONFIG_USER_ONLY 5937 memset(&info->attrs, 0, sizeof(info->attrs)); 5938 /* Require both ANON and MTE; see allocation_tag_mem(). */ 5939 info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE); 5940 #else 5941 info->attrs = full->attrs; 5942 info->tagged = full->extra.arm.pte_attrs == 0xf0; 5943 #endif 5944 5945 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */ 5946 info->host -= mem_off; 5947 return true; 5948 } 5949 5950 /* 5951 * Find first active element on each page, and a loose bound for the 5952 * final element on each page. Identify any single element that spans 5953 * the page boundary. Return true if there are any active elements. 5954 */ 5955 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg, 5956 intptr_t reg_max, int esz, int msize) 5957 { 5958 const int esize = 1 << esz; 5959 const uint64_t pg_mask = pred_esz_masks[esz]; 5960 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; 5961 intptr_t mem_off_last, mem_off_split; 5962 intptr_t page_split, elt_split; 5963 intptr_t i; 5964 5965 /* Set all of the element indices to -1, and the TLB data to 0. */ 5966 memset(info, -1, offsetof(SVEContLdSt, page)); 5967 memset(info->page, 0, sizeof(info->page)); 5968 5969 /* Gross scan over the entire predicate to find bounds. */ 5970 i = 0; 5971 do { 5972 uint64_t pg = vg[i] & pg_mask; 5973 if (pg) { 5974 reg_off_last = i * 64 + 63 - clz64(pg); 5975 if (reg_off_first < 0) { 5976 reg_off_first = i * 64 + ctz64(pg); 5977 } 5978 } 5979 } while (++i * 64 < reg_max); 5980 5981 if (unlikely(reg_off_first < 0)) { 5982 /* No active elements, no pages touched. */ 5983 return false; 5984 } 5985 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max); 5986 5987 info->reg_off_first[0] = reg_off_first; 5988 info->mem_off_first[0] = (reg_off_first >> esz) * msize; 5989 mem_off_last = (reg_off_last >> esz) * msize; 5990 5991 page_split = -(addr | TARGET_PAGE_MASK); 5992 if (likely(mem_off_last + msize <= page_split)) { 5993 /* The entire operation fits within a single page. */ 5994 info->reg_off_last[0] = reg_off_last; 5995 return true; 5996 } 5997 5998 info->page_split = page_split; 5999 elt_split = page_split / msize; 6000 reg_off_split = elt_split << esz; 6001 mem_off_split = elt_split * msize; 6002 6003 /* 6004 * This is the last full element on the first page, but it is not 6005 * necessarily active. If there is no full element, i.e. the first 6006 * active element is the one that's split, this value remains -1. 6007 * It is useful as iteration bounds. 6008 */ 6009 if (elt_split != 0) { 6010 info->reg_off_last[0] = reg_off_split - esize; 6011 } 6012 6013 /* Determine if an unaligned element spans the pages. */ 6014 if (page_split % msize != 0) { 6015 /* It is helpful to know if the split element is active. */ 6016 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) { 6017 info->reg_off_split = reg_off_split; 6018 info->mem_off_split = mem_off_split; 6019 6020 if (reg_off_split == reg_off_last) { 6021 /* The page crossing element is last. */ 6022 return true; 6023 } 6024 } 6025 reg_off_split += esize; 6026 mem_off_split += msize; 6027 } 6028 6029 /* 6030 * We do want the first active element on the second page, because 6031 * this may affect the address reported in an exception. 6032 */ 6033 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz); 6034 tcg_debug_assert(reg_off_split <= reg_off_last); 6035 info->reg_off_first[1] = reg_off_split; 6036 info->mem_off_first[1] = (reg_off_split >> esz) * msize; 6037 info->reg_off_last[1] = reg_off_last; 6038 return true; 6039 } 6040 6041 /* 6042 * Resolve the guest virtual addresses to info->page[]. 6043 * Control the generation of page faults with @fault. Return false if 6044 * there is no work to do, which can only happen with @fault == FAULT_NO. 6045 */ 6046 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault, 6047 CPUARMState *env, target_ulong addr, 6048 MMUAccessType access_type, uintptr_t retaddr) 6049 { 6050 int mmu_idx = arm_env_mmu_index(env); 6051 int mem_off = info->mem_off_first[0]; 6052 bool nofault = fault == FAULT_NO; 6053 bool have_work = true; 6054 6055 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off, 6056 access_type, mmu_idx, retaddr)) { 6057 /* No work to be done. */ 6058 return false; 6059 } 6060 6061 if (likely(info->page_split < 0)) { 6062 /* The entire operation was on the one page. */ 6063 return true; 6064 } 6065 6066 /* 6067 * If the second page is invalid, then we want the fault address to be 6068 * the first byte on that page which is accessed. 6069 */ 6070 if (info->mem_off_split >= 0) { 6071 /* 6072 * There is an element split across the pages. The fault address 6073 * should be the first byte of the second page. 6074 */ 6075 mem_off = info->page_split; 6076 /* 6077 * If the split element is also the first active element 6078 * of the vector, then: For first-fault we should continue 6079 * to generate faults for the second page. For no-fault, 6080 * we have work only if the second page is valid. 6081 */ 6082 if (info->mem_off_first[0] < info->mem_off_split) { 6083 nofault = FAULT_FIRST; 6084 have_work = false; 6085 } 6086 } else { 6087 /* 6088 * There is no element split across the pages. The fault address 6089 * should be the first active element on the second page. 6090 */ 6091 mem_off = info->mem_off_first[1]; 6092 /* 6093 * There must have been one active element on the first page, 6094 * so we're out of first-fault territory. 6095 */ 6096 nofault = fault != FAULT_ALL; 6097 } 6098 6099 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off, 6100 access_type, mmu_idx, retaddr); 6101 return have_work; 6102 } 6103 6104 #ifndef CONFIG_USER_ONLY 6105 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, 6106 uint64_t *vg, target_ulong addr, 6107 int esize, int msize, int wp_access, 6108 uintptr_t retaddr) 6109 { 6110 intptr_t mem_off, reg_off, reg_last; 6111 int flags0 = info->page[0].flags; 6112 int flags1 = info->page[1].flags; 6113 6114 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { 6115 return; 6116 } 6117 6118 /* Indicate that watchpoints are handled. */ 6119 info->page[0].flags = flags0 & ~TLB_WATCHPOINT; 6120 info->page[1].flags = flags1 & ~TLB_WATCHPOINT; 6121 6122 if (flags0 & TLB_WATCHPOINT) { 6123 mem_off = info->mem_off_first[0]; 6124 reg_off = info->reg_off_first[0]; 6125 reg_last = info->reg_off_last[0]; 6126 6127 while (reg_off <= reg_last) { 6128 uint64_t pg = vg[reg_off >> 6]; 6129 do { 6130 if ((pg >> (reg_off & 63)) & 1) { 6131 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 6132 msize, info->page[0].attrs, 6133 wp_access, retaddr); 6134 } 6135 reg_off += esize; 6136 mem_off += msize; 6137 } while (reg_off <= reg_last && (reg_off & 63)); 6138 } 6139 } 6140 6141 mem_off = info->mem_off_split; 6142 if (mem_off >= 0) { 6143 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize, 6144 info->page[0].attrs, wp_access, retaddr); 6145 } 6146 6147 mem_off = info->mem_off_first[1]; 6148 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) { 6149 reg_off = info->reg_off_first[1]; 6150 reg_last = info->reg_off_last[1]; 6151 6152 do { 6153 uint64_t pg = vg[reg_off >> 6]; 6154 do { 6155 if ((pg >> (reg_off & 63)) & 1) { 6156 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 6157 msize, info->page[1].attrs, 6158 wp_access, retaddr); 6159 } 6160 reg_off += esize; 6161 mem_off += msize; 6162 } while (reg_off & 63); 6163 } while (reg_off <= reg_last); 6164 } 6165 } 6166 #endif 6167 6168 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env, 6169 uint64_t *vg, target_ulong addr, int esize, 6170 int msize, uint32_t mtedesc, uintptr_t ra) 6171 { 6172 intptr_t mem_off, reg_off, reg_last; 6173 6174 /* Process the page only if MemAttr == Tagged. */ 6175 if (info->page[0].tagged) { 6176 mem_off = info->mem_off_first[0]; 6177 reg_off = info->reg_off_first[0]; 6178 reg_last = info->reg_off_split; 6179 if (reg_last < 0) { 6180 reg_last = info->reg_off_last[0]; 6181 } 6182 6183 do { 6184 uint64_t pg = vg[reg_off >> 6]; 6185 do { 6186 if ((pg >> (reg_off & 63)) & 1) { 6187 mte_check(env, mtedesc, addr, ra); 6188 } 6189 reg_off += esize; 6190 mem_off += msize; 6191 } while (reg_off <= reg_last && (reg_off & 63)); 6192 } while (reg_off <= reg_last); 6193 } 6194 6195 mem_off = info->mem_off_first[1]; 6196 if (mem_off >= 0 && info->page[1].tagged) { 6197 reg_off = info->reg_off_first[1]; 6198 reg_last = info->reg_off_last[1]; 6199 6200 do { 6201 uint64_t pg = vg[reg_off >> 6]; 6202 do { 6203 if ((pg >> (reg_off & 63)) & 1) { 6204 mte_check(env, mtedesc, addr, ra); 6205 } 6206 reg_off += esize; 6207 mem_off += msize; 6208 } while (reg_off & 63); 6209 } while (reg_off <= reg_last); 6210 } 6211 } 6212 6213 /* 6214 * Common helper for all contiguous 1,2,3,4-register predicated stores. 6215 */ 6216 static inline QEMU_ALWAYS_INLINE 6217 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr, 6218 uint32_t desc, const uintptr_t retaddr, 6219 const int esz, const int msz, const int N, uint32_t mtedesc, 6220 sve_ldst1_host_fn *host_fn, 6221 sve_ldst1_tlb_fn *tlb_fn) 6222 { 6223 const unsigned rd = simd_data(desc); 6224 const intptr_t reg_max = simd_oprsz(desc); 6225 intptr_t reg_off, reg_last, mem_off; 6226 SVEContLdSt info; 6227 void *host; 6228 int flags, i; 6229 6230 /* Find the active elements. */ 6231 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 6232 /* The entire predicate was false; no load occurs. */ 6233 for (i = 0; i < N; ++i) { 6234 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 6235 } 6236 return; 6237 } 6238 6239 /* Probe the page(s). Exit with exception for any invalid page. */ 6240 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr); 6241 6242 /* Handle watchpoints for all active elements. */ 6243 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 6244 BP_MEM_READ, retaddr); 6245 6246 /* 6247 * Handle mte checks for all active elements. 6248 * Since TBI must be set for MTE, !mtedesc => !mte_active. 6249 */ 6250 if (mtedesc) { 6251 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 6252 mtedesc, retaddr); 6253 } 6254 6255 flags = info.page[0].flags | info.page[1].flags; 6256 if (unlikely(flags != 0)) { 6257 /* 6258 * At least one page includes MMIO. 6259 * Any bus operation can fail with cpu_transaction_failed, 6260 * which for ARM will raise SyncExternal. Perform the load 6261 * into scratch memory to preserve register state until the end. 6262 */ 6263 ARMVectorReg scratch[4] = { }; 6264 6265 mem_off = info.mem_off_first[0]; 6266 reg_off = info.reg_off_first[0]; 6267 reg_last = info.reg_off_last[1]; 6268 if (reg_last < 0) { 6269 reg_last = info.reg_off_split; 6270 if (reg_last < 0) { 6271 reg_last = info.reg_off_last[0]; 6272 } 6273 } 6274 6275 do { 6276 uint64_t pg = vg[reg_off >> 6]; 6277 do { 6278 if ((pg >> (reg_off & 63)) & 1) { 6279 for (i = 0; i < N; ++i) { 6280 tlb_fn(env, &scratch[i], reg_off, 6281 addr + mem_off + (i << msz), retaddr); 6282 } 6283 } 6284 reg_off += 1 << esz; 6285 mem_off += N << msz; 6286 } while (reg_off & 63); 6287 } while (reg_off <= reg_last); 6288 6289 for (i = 0; i < N; ++i) { 6290 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max); 6291 } 6292 return; 6293 } 6294 6295 /* The entire operation is in RAM, on valid pages. */ 6296 6297 for (i = 0; i < N; ++i) { 6298 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 6299 } 6300 6301 mem_off = info.mem_off_first[0]; 6302 reg_off = info.reg_off_first[0]; 6303 reg_last = info.reg_off_last[0]; 6304 host = info.page[0].host; 6305 6306 set_helper_retaddr(retaddr); 6307 6308 while (reg_off <= reg_last) { 6309 uint64_t pg = vg[reg_off >> 6]; 6310 do { 6311 if ((pg >> (reg_off & 63)) & 1) { 6312 for (i = 0; i < N; ++i) { 6313 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6314 host + mem_off + (i << msz)); 6315 } 6316 } 6317 reg_off += 1 << esz; 6318 mem_off += N << msz; 6319 } while (reg_off <= reg_last && (reg_off & 63)); 6320 } 6321 6322 clear_helper_retaddr(); 6323 6324 /* 6325 * Use the slow path to manage the cross-page misalignment. 6326 * But we know this is RAM and cannot trap. 6327 */ 6328 mem_off = info.mem_off_split; 6329 if (unlikely(mem_off >= 0)) { 6330 reg_off = info.reg_off_split; 6331 for (i = 0; i < N; ++i) { 6332 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6333 addr + mem_off + (i << msz), retaddr); 6334 } 6335 } 6336 6337 mem_off = info.mem_off_first[1]; 6338 if (unlikely(mem_off >= 0)) { 6339 reg_off = info.reg_off_first[1]; 6340 reg_last = info.reg_off_last[1]; 6341 host = info.page[1].host; 6342 6343 set_helper_retaddr(retaddr); 6344 6345 do { 6346 uint64_t pg = vg[reg_off >> 6]; 6347 do { 6348 if ((pg >> (reg_off & 63)) & 1) { 6349 for (i = 0; i < N; ++i) { 6350 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6351 host + mem_off + (i << msz)); 6352 } 6353 } 6354 reg_off += 1 << esz; 6355 mem_off += N << msz; 6356 } while (reg_off & 63); 6357 } while (reg_off <= reg_last); 6358 6359 clear_helper_retaddr(); 6360 } 6361 } 6362 6363 static inline QEMU_ALWAYS_INLINE 6364 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6365 uint32_t desc, const uintptr_t ra, 6366 const int esz, const int msz, const int N, 6367 sve_ldst1_host_fn *host_fn, 6368 sve_ldst1_tlb_fn *tlb_fn) 6369 { 6370 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6371 int bit55 = extract64(addr, 55, 1); 6372 6373 /* Remove mtedesc from the normal sve descriptor. */ 6374 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6375 6376 /* Perform gross MTE suppression early. */ 6377 if (!tbi_check(mtedesc, bit55) || 6378 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6379 mtedesc = 0; 6380 } 6381 6382 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6383 } 6384 6385 #define DO_LD1_1(NAME, ESZ) \ 6386 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ 6387 target_ulong addr, uint32_t desc) \ 6388 { \ 6389 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ 6390 sve_##NAME##_host, sve_##NAME##_tlb); \ 6391 } \ 6392 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ 6393 target_ulong addr, uint32_t desc) \ 6394 { \ 6395 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ 6396 sve_##NAME##_host, sve_##NAME##_tlb); \ 6397 } 6398 6399 #define DO_LD1_2(NAME, ESZ, MSZ) \ 6400 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ 6401 target_ulong addr, uint32_t desc) \ 6402 { \ 6403 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 6404 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 6405 } \ 6406 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ 6407 target_ulong addr, uint32_t desc) \ 6408 { \ 6409 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 6410 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 6411 } \ 6412 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6413 target_ulong addr, uint32_t desc) \ 6414 { \ 6415 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 6416 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 6417 } \ 6418 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6419 target_ulong addr, uint32_t desc) \ 6420 { \ 6421 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 6422 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 6423 } 6424 6425 DO_LD1_1(ld1bb, MO_8) 6426 DO_LD1_1(ld1bhu, MO_16) 6427 DO_LD1_1(ld1bhs, MO_16) 6428 DO_LD1_1(ld1bsu, MO_32) 6429 DO_LD1_1(ld1bss, MO_32) 6430 DO_LD1_1(ld1bdu, MO_64) 6431 DO_LD1_1(ld1bds, MO_64) 6432 6433 DO_LD1_2(ld1hh, MO_16, MO_16) 6434 DO_LD1_2(ld1hsu, MO_32, MO_16) 6435 DO_LD1_2(ld1hss, MO_32, MO_16) 6436 DO_LD1_2(ld1hdu, MO_64, MO_16) 6437 DO_LD1_2(ld1hds, MO_64, MO_16) 6438 6439 DO_LD1_2(ld1ss, MO_32, MO_32) 6440 DO_LD1_2(ld1sdu, MO_64, MO_32) 6441 DO_LD1_2(ld1sds, MO_64, MO_32) 6442 6443 DO_LD1_2(ld1dd, MO_64, MO_64) 6444 6445 DO_LD1_2(ld1squ, MO_32, MO_128) 6446 DO_LD1_2(ld1dqu, MO_64, MO_128) 6447 6448 #undef DO_LD1_1 6449 #undef DO_LD1_2 6450 6451 #define DO_LDN_1(N) \ 6452 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ 6453 target_ulong addr, uint32_t desc) \ 6454 { \ 6455 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ 6456 sve_ld1bb_host, sve_ld1bb_tlb); \ 6457 } \ 6458 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ 6459 target_ulong addr, uint32_t desc) \ 6460 { \ 6461 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ 6462 sve_ld1bb_host, sve_ld1bb_tlb); \ 6463 } 6464 6465 #define DO_LDN_2(N, SUFF, ESZ) \ 6466 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ 6467 target_ulong addr, uint32_t desc) \ 6468 { \ 6469 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 6470 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 6471 } \ 6472 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ 6473 target_ulong addr, uint32_t desc) \ 6474 { \ 6475 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 6476 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 6477 } \ 6478 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ 6479 target_ulong addr, uint32_t desc) \ 6480 { \ 6481 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 6482 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 6483 } \ 6484 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ 6485 target_ulong addr, uint32_t desc) \ 6486 { \ 6487 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 6488 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 6489 } 6490 6491 DO_LDN_1(2) 6492 DO_LDN_1(3) 6493 DO_LDN_1(4) 6494 6495 DO_LDN_2(2, hh, MO_16) 6496 DO_LDN_2(3, hh, MO_16) 6497 DO_LDN_2(4, hh, MO_16) 6498 6499 DO_LDN_2(2, ss, MO_32) 6500 DO_LDN_2(3, ss, MO_32) 6501 DO_LDN_2(4, ss, MO_32) 6502 6503 DO_LDN_2(2, dd, MO_64) 6504 DO_LDN_2(3, dd, MO_64) 6505 DO_LDN_2(4, dd, MO_64) 6506 6507 DO_LDN_2(2, qq, MO_128) 6508 DO_LDN_2(3, qq, MO_128) 6509 DO_LDN_2(4, qq, MO_128) 6510 6511 #undef DO_LDN_1 6512 #undef DO_LDN_2 6513 6514 /* 6515 * Load contiguous data, first-fault and no-fault. 6516 * 6517 * For user-only, we control the race between page_check_range and 6518 * another thread's munmap by using set/clear_helper_retaddr. Any 6519 * SEGV that occurs between those markers is assumed to be because 6520 * the guest page vanished. Keep that block as small as possible 6521 * so that unrelated QEMU bugs are not blamed on the guest. 6522 */ 6523 6524 /* Fault on byte I. All bits in FFR from I are cleared. The vector 6525 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE 6526 * option, which leaves subsequent data unchanged. 6527 */ 6528 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz) 6529 { 6530 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; 6531 6532 if (i & 63) { 6533 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63); 6534 i = ROUND_UP(i, 64); 6535 } 6536 for (; i < oprsz; i += 64) { 6537 ffr[i / 64] = 0; 6538 } 6539 } 6540 6541 /* 6542 * Common helper for all contiguous no-fault and first-fault loads. 6543 */ 6544 static inline QEMU_ALWAYS_INLINE 6545 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, 6546 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc, 6547 const int esz, const int msz, const SVEContFault fault, 6548 sve_ldst1_host_fn *host_fn, 6549 sve_ldst1_tlb_fn *tlb_fn) 6550 { 6551 const unsigned rd = simd_data(desc); 6552 void *vd = &env->vfp.zregs[rd]; 6553 const intptr_t reg_max = simd_oprsz(desc); 6554 intptr_t reg_off, mem_off, reg_last; 6555 SVEContLdSt info; 6556 int flags; 6557 void *host; 6558 6559 /* Find the active elements. */ 6560 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) { 6561 /* The entire predicate was false; no load occurs. */ 6562 memset(vd, 0, reg_max); 6563 return; 6564 } 6565 reg_off = info.reg_off_first[0]; 6566 6567 /* Probe the page(s). */ 6568 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) { 6569 /* Fault on first element. */ 6570 tcg_debug_assert(fault == FAULT_NO); 6571 memset(vd, 0, reg_max); 6572 goto do_fault; 6573 } 6574 6575 mem_off = info.mem_off_first[0]; 6576 flags = info.page[0].flags; 6577 6578 /* 6579 * Disable MTE checking if the Tagged bit is not set. Since TBI must 6580 * be set within MTEDESC for MTE, !mtedesc => !mte_active. 6581 */ 6582 if (!info.page[0].tagged) { 6583 mtedesc = 0; 6584 } 6585 6586 if (fault == FAULT_FIRST) { 6587 /* Trapping mte check for the first-fault element. */ 6588 if (mtedesc) { 6589 mte_check(env, mtedesc, addr + mem_off, retaddr); 6590 } 6591 6592 /* 6593 * Special handling of the first active element, 6594 * if it crosses a page boundary or is MMIO. 6595 */ 6596 bool is_split = mem_off == info.mem_off_split; 6597 if (unlikely(flags != 0) || unlikely(is_split)) { 6598 /* 6599 * Use the slow path for cross-page handling. 6600 * Might trap for MMIO or watchpoints. 6601 */ 6602 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6603 6604 /* After any fault, zero the other elements. */ 6605 swap_memzero(vd, reg_off); 6606 reg_off += 1 << esz; 6607 mem_off += 1 << msz; 6608 swap_memzero(vd + reg_off, reg_max - reg_off); 6609 6610 if (is_split) { 6611 goto second_page; 6612 } 6613 } else { 6614 memset(vd, 0, reg_max); 6615 } 6616 } else { 6617 memset(vd, 0, reg_max); 6618 if (unlikely(mem_off == info.mem_off_split)) { 6619 /* The first active element crosses a page boundary. */ 6620 flags |= info.page[1].flags; 6621 if (unlikely(flags & TLB_MMIO)) { 6622 /* Some page is MMIO, see below. */ 6623 goto do_fault; 6624 } 6625 if (unlikely(flags & TLB_WATCHPOINT) && 6626 (cpu_watchpoint_address_matches 6627 (env_cpu(env), addr + mem_off, 1 << msz) 6628 & BP_MEM_READ)) { 6629 /* Watchpoint hit, see below. */ 6630 goto do_fault; 6631 } 6632 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6633 goto do_fault; 6634 } 6635 /* 6636 * Use the slow path for cross-page handling. 6637 * This is RAM, without a watchpoint, and will not trap. 6638 */ 6639 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6640 goto second_page; 6641 } 6642 } 6643 6644 /* 6645 * From this point on, all memory operations are MemSingleNF. 6646 * 6647 * Per the MemSingleNF pseudocode, a no-fault load from Device memory 6648 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead. 6649 * 6650 * Unfortuately we do not have access to the memory attributes from the 6651 * PTE to tell Device memory from Normal memory. So we make a mostly 6652 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO. 6653 * This gives the right answer for the common cases of "Normal memory, 6654 * backed by host RAM" and "Device memory, backed by MMIO". 6655 * The architecture allows us to suppress an NF load and return 6656 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner 6657 * case of "Normal memory, backed by MMIO" is permitted. The case we 6658 * get wrong is "Device memory, backed by host RAM", for which we 6659 * should return (UNKNOWN, FAULT) for but do not. 6660 * 6661 * Similarly, CPU_BP breakpoints would raise exceptions, and so 6662 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and 6663 * architectural breakpoints the same. 6664 */ 6665 if (unlikely(flags & TLB_MMIO)) { 6666 goto do_fault; 6667 } 6668 6669 reg_last = info.reg_off_last[0]; 6670 host = info.page[0].host; 6671 6672 set_helper_retaddr(retaddr); 6673 6674 do { 6675 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3)); 6676 do { 6677 if ((pg >> (reg_off & 63)) & 1) { 6678 if (unlikely(flags & TLB_WATCHPOINT) && 6679 (cpu_watchpoint_address_matches 6680 (env_cpu(env), addr + mem_off, 1 << msz) 6681 & BP_MEM_READ)) { 6682 clear_helper_retaddr(); 6683 goto do_fault; 6684 } 6685 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6686 clear_helper_retaddr(); 6687 goto do_fault; 6688 } 6689 host_fn(vd, reg_off, host + mem_off); 6690 } 6691 reg_off += 1 << esz; 6692 mem_off += 1 << msz; 6693 } while (reg_off <= reg_last && (reg_off & 63)); 6694 } while (reg_off <= reg_last); 6695 6696 clear_helper_retaddr(); 6697 6698 /* 6699 * MemSingleNF is allowed to fail for any reason. We have special 6700 * code above to handle the first element crossing a page boundary. 6701 * As an implementation choice, decline to handle a cross-page element 6702 * in any other position. 6703 */ 6704 reg_off = info.reg_off_split; 6705 if (reg_off >= 0) { 6706 goto do_fault; 6707 } 6708 6709 second_page: 6710 reg_off = info.reg_off_first[1]; 6711 if (likely(reg_off < 0)) { 6712 /* No active elements on the second page. All done. */ 6713 return; 6714 } 6715 6716 /* 6717 * MemSingleNF is allowed to fail for any reason. As an implementation 6718 * choice, decline to handle elements on the second page. This should 6719 * be low frequency as the guest walks through memory -- the next 6720 * iteration of the guest's loop should be aligned on the page boundary, 6721 * and then all following iterations will stay aligned. 6722 */ 6723 6724 do_fault: 6725 record_fault(env, reg_off, reg_max); 6726 } 6727 6728 static inline QEMU_ALWAYS_INLINE 6729 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, 6730 uint32_t desc, const uintptr_t retaddr, 6731 const int esz, const int msz, const SVEContFault fault, 6732 sve_ldst1_host_fn *host_fn, 6733 sve_ldst1_tlb_fn *tlb_fn) 6734 { 6735 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6736 int bit55 = extract64(addr, 55, 1); 6737 6738 /* Remove mtedesc from the normal sve descriptor. */ 6739 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6740 6741 /* Perform gross MTE suppression early. */ 6742 if (!tbi_check(mtedesc, bit55) || 6743 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6744 mtedesc = 0; 6745 } 6746 6747 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc, 6748 esz, msz, fault, host_fn, tlb_fn); 6749 } 6750 6751 #define DO_LDFF1_LDNF1_1(PART, ESZ) \ 6752 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ 6753 target_ulong addr, uint32_t desc) \ 6754 { \ 6755 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ 6756 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6757 } \ 6758 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ 6759 target_ulong addr, uint32_t desc) \ 6760 { \ 6761 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ 6762 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6763 } \ 6764 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6765 target_ulong addr, uint32_t desc) \ 6766 { \ 6767 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ 6768 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6769 } \ 6770 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6771 target_ulong addr, uint32_t desc) \ 6772 { \ 6773 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ 6774 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6775 } 6776 6777 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ 6778 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ 6779 target_ulong addr, uint32_t desc) \ 6780 { \ 6781 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6782 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6783 } \ 6784 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ 6785 target_ulong addr, uint32_t desc) \ 6786 { \ 6787 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6788 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6789 } \ 6790 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ 6791 target_ulong addr, uint32_t desc) \ 6792 { \ 6793 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6794 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6795 } \ 6796 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ 6797 target_ulong addr, uint32_t desc) \ 6798 { \ 6799 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6800 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6801 } \ 6802 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6803 target_ulong addr, uint32_t desc) \ 6804 { \ 6805 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6806 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6807 } \ 6808 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6809 target_ulong addr, uint32_t desc) \ 6810 { \ 6811 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6812 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6813 } \ 6814 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6815 target_ulong addr, uint32_t desc) \ 6816 { \ 6817 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6818 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6819 } \ 6820 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6821 target_ulong addr, uint32_t desc) \ 6822 { \ 6823 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6824 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6825 } 6826 6827 DO_LDFF1_LDNF1_1(bb, MO_8) 6828 DO_LDFF1_LDNF1_1(bhu, MO_16) 6829 DO_LDFF1_LDNF1_1(bhs, MO_16) 6830 DO_LDFF1_LDNF1_1(bsu, MO_32) 6831 DO_LDFF1_LDNF1_1(bss, MO_32) 6832 DO_LDFF1_LDNF1_1(bdu, MO_64) 6833 DO_LDFF1_LDNF1_1(bds, MO_64) 6834 6835 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16) 6836 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16) 6837 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16) 6838 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16) 6839 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16) 6840 6841 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32) 6842 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32) 6843 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32) 6844 6845 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64) 6846 6847 #undef DO_LDFF1_LDNF1_1 6848 #undef DO_LDFF1_LDNF1_2 6849 6850 /* 6851 * Common helper for all contiguous 1,2,3,4-register predicated stores. 6852 */ 6853 6854 static inline QEMU_ALWAYS_INLINE 6855 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, 6856 uint32_t desc, const uintptr_t retaddr, 6857 const int esz, const int msz, const int N, uint32_t mtedesc, 6858 sve_ldst1_host_fn *host_fn, 6859 sve_ldst1_tlb_fn *tlb_fn) 6860 { 6861 const unsigned rd = simd_data(desc); 6862 const intptr_t reg_max = simd_oprsz(desc); 6863 intptr_t reg_off, reg_last, mem_off; 6864 SVEContLdSt info; 6865 void *host; 6866 int i, flags; 6867 6868 /* Find the active elements. */ 6869 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 6870 /* The entire predicate was false; no store occurs. */ 6871 return; 6872 } 6873 6874 /* Probe the page(s). Exit with exception for any invalid page. */ 6875 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr); 6876 6877 /* Handle watchpoints for all active elements. */ 6878 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 6879 BP_MEM_WRITE, retaddr); 6880 6881 /* 6882 * Handle mte checks for all active elements. 6883 * Since TBI must be set for MTE, !mtedesc => !mte_active. 6884 */ 6885 if (mtedesc) { 6886 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 6887 mtedesc, retaddr); 6888 } 6889 6890 flags = info.page[0].flags | info.page[1].flags; 6891 if (unlikely(flags != 0)) { 6892 /* 6893 * At least one page includes MMIO. 6894 * Any bus operation can fail with cpu_transaction_failed, 6895 * which for ARM will raise SyncExternal. We cannot avoid 6896 * this fault and will leave with the store incomplete. 6897 */ 6898 mem_off = info.mem_off_first[0]; 6899 reg_off = info.reg_off_first[0]; 6900 reg_last = info.reg_off_last[1]; 6901 if (reg_last < 0) { 6902 reg_last = info.reg_off_split; 6903 if (reg_last < 0) { 6904 reg_last = info.reg_off_last[0]; 6905 } 6906 } 6907 6908 do { 6909 uint64_t pg = vg[reg_off >> 6]; 6910 do { 6911 if ((pg >> (reg_off & 63)) & 1) { 6912 for (i = 0; i < N; ++i) { 6913 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6914 addr + mem_off + (i << msz), retaddr); 6915 } 6916 } 6917 reg_off += 1 << esz; 6918 mem_off += N << msz; 6919 } while (reg_off & 63); 6920 } while (reg_off <= reg_last); 6921 return; 6922 } 6923 6924 mem_off = info.mem_off_first[0]; 6925 reg_off = info.reg_off_first[0]; 6926 reg_last = info.reg_off_last[0]; 6927 host = info.page[0].host; 6928 6929 set_helper_retaddr(retaddr); 6930 6931 while (reg_off <= reg_last) { 6932 uint64_t pg = vg[reg_off >> 6]; 6933 do { 6934 if ((pg >> (reg_off & 63)) & 1) { 6935 for (i = 0; i < N; ++i) { 6936 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6937 host + mem_off + (i << msz)); 6938 } 6939 } 6940 reg_off += 1 << esz; 6941 mem_off += N << msz; 6942 } while (reg_off <= reg_last && (reg_off & 63)); 6943 } 6944 6945 clear_helper_retaddr(); 6946 6947 /* 6948 * Use the slow path to manage the cross-page misalignment. 6949 * But we know this is RAM and cannot trap. 6950 */ 6951 mem_off = info.mem_off_split; 6952 if (unlikely(mem_off >= 0)) { 6953 reg_off = info.reg_off_split; 6954 for (i = 0; i < N; ++i) { 6955 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6956 addr + mem_off + (i << msz), retaddr); 6957 } 6958 } 6959 6960 mem_off = info.mem_off_first[1]; 6961 if (unlikely(mem_off >= 0)) { 6962 reg_off = info.reg_off_first[1]; 6963 reg_last = info.reg_off_last[1]; 6964 host = info.page[1].host; 6965 6966 set_helper_retaddr(retaddr); 6967 6968 do { 6969 uint64_t pg = vg[reg_off >> 6]; 6970 do { 6971 if ((pg >> (reg_off & 63)) & 1) { 6972 for (i = 0; i < N; ++i) { 6973 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6974 host + mem_off + (i << msz)); 6975 } 6976 } 6977 reg_off += 1 << esz; 6978 mem_off += N << msz; 6979 } while (reg_off & 63); 6980 } while (reg_off <= reg_last); 6981 6982 clear_helper_retaddr(); 6983 } 6984 } 6985 6986 static inline QEMU_ALWAYS_INLINE 6987 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6988 uint32_t desc, const uintptr_t ra, 6989 const int esz, const int msz, const int N, 6990 sve_ldst1_host_fn *host_fn, 6991 sve_ldst1_tlb_fn *tlb_fn) 6992 { 6993 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6994 int bit55 = extract64(addr, 55, 1); 6995 6996 /* Remove mtedesc from the normal sve descriptor. */ 6997 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6998 6999 /* Perform gross MTE suppression early. */ 7000 if (!tbi_check(mtedesc, bit55) || 7001 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 7002 mtedesc = 0; 7003 } 7004 7005 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 7006 } 7007 7008 #define DO_STN_1(N, NAME, ESZ) \ 7009 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ 7010 target_ulong addr, uint32_t desc) \ 7011 { \ 7012 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ 7013 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 7014 } \ 7015 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ 7016 target_ulong addr, uint32_t desc) \ 7017 { \ 7018 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ 7019 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 7020 } 7021 7022 #define DO_STN_2(N, NAME, ESZ, MSZ) \ 7023 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ 7024 target_ulong addr, uint32_t desc) \ 7025 { \ 7026 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 7027 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 7028 } \ 7029 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ 7030 target_ulong addr, uint32_t desc) \ 7031 { \ 7032 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 7033 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 7034 } \ 7035 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 7036 target_ulong addr, uint32_t desc) \ 7037 { \ 7038 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 7039 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 7040 } \ 7041 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 7042 target_ulong addr, uint32_t desc) \ 7043 { \ 7044 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 7045 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 7046 } 7047 7048 DO_STN_1(1, bb, MO_8) 7049 DO_STN_1(1, bh, MO_16) 7050 DO_STN_1(1, bs, MO_32) 7051 DO_STN_1(1, bd, MO_64) 7052 DO_STN_1(2, bb, MO_8) 7053 DO_STN_1(3, bb, MO_8) 7054 DO_STN_1(4, bb, MO_8) 7055 7056 DO_STN_2(1, hh, MO_16, MO_16) 7057 DO_STN_2(1, hs, MO_32, MO_16) 7058 DO_STN_2(1, hd, MO_64, MO_16) 7059 DO_STN_2(2, hh, MO_16, MO_16) 7060 DO_STN_2(3, hh, MO_16, MO_16) 7061 DO_STN_2(4, hh, MO_16, MO_16) 7062 7063 DO_STN_2(1, ss, MO_32, MO_32) 7064 DO_STN_2(1, sd, MO_64, MO_32) 7065 DO_STN_2(2, ss, MO_32, MO_32) 7066 DO_STN_2(3, ss, MO_32, MO_32) 7067 DO_STN_2(4, ss, MO_32, MO_32) 7068 7069 DO_STN_2(1, dd, MO_64, MO_64) 7070 DO_STN_2(2, dd, MO_64, MO_64) 7071 DO_STN_2(3, dd, MO_64, MO_64) 7072 DO_STN_2(4, dd, MO_64, MO_64) 7073 7074 DO_STN_2(1, sq, MO_128, MO_32) 7075 DO_STN_2(1, dq, MO_128, MO_64) 7076 7077 DO_STN_2(2, qq, MO_128, MO_128) 7078 DO_STN_2(3, qq, MO_128, MO_128) 7079 DO_STN_2(4, qq, MO_128, MO_128) 7080 7081 #undef DO_STN_1 7082 #undef DO_STN_2 7083 7084 /* 7085 * Loads with a vector index. 7086 */ 7087 7088 /* 7089 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed. 7090 */ 7091 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs); 7092 7093 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs) 7094 { 7095 return *(uint32_t *)(reg + H1_4(reg_ofs)); 7096 } 7097 7098 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs) 7099 { 7100 return *(int32_t *)(reg + H1_4(reg_ofs)); 7101 } 7102 7103 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs) 7104 { 7105 return (uint32_t)*(uint64_t *)(reg + reg_ofs); 7106 } 7107 7108 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs) 7109 { 7110 return (int32_t)*(uint64_t *)(reg + reg_ofs); 7111 } 7112 7113 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs) 7114 { 7115 return *(uint64_t *)(reg + reg_ofs); 7116 } 7117 7118 static inline QEMU_ALWAYS_INLINE 7119 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7120 target_ulong base, uint32_t desc, uintptr_t retaddr, 7121 uint32_t mtedesc, int esize, int msize, 7122 zreg_off_fn *off_fn, 7123 sve_ldst1_host_fn *host_fn, 7124 sve_ldst1_tlb_fn *tlb_fn) 7125 { 7126 const int mmu_idx = arm_env_mmu_index(env); 7127 const intptr_t reg_max = simd_oprsz(desc); 7128 const int scale = simd_data(desc); 7129 ARMVectorReg scratch; 7130 intptr_t reg_off; 7131 SVEHostPage info, info2; 7132 7133 memset(&scratch, 0, reg_max); 7134 reg_off = 0; 7135 do { 7136 uint64_t pg = vg[reg_off >> 6]; 7137 do { 7138 if (likely(pg & 1)) { 7139 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 7140 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 7141 7142 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD, 7143 mmu_idx, retaddr); 7144 7145 if (likely(in_page >= msize)) { 7146 if (unlikely(info.flags & TLB_WATCHPOINT)) { 7147 cpu_check_watchpoint(env_cpu(env), addr, msize, 7148 info.attrs, BP_MEM_READ, retaddr); 7149 } 7150 if (mtedesc && info.tagged) { 7151 mte_check(env, mtedesc, addr, retaddr); 7152 } 7153 if (unlikely(info.flags & TLB_MMIO)) { 7154 tlb_fn(env, &scratch, reg_off, addr, retaddr); 7155 } else { 7156 set_helper_retaddr(retaddr); 7157 host_fn(&scratch, reg_off, info.host); 7158 clear_helper_retaddr(); 7159 } 7160 } else { 7161 /* Element crosses the page boundary. */ 7162 sve_probe_page(&info2, false, env, addr + in_page, 0, 7163 MMU_DATA_LOAD, mmu_idx, retaddr); 7164 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) { 7165 cpu_check_watchpoint(env_cpu(env), addr, 7166 msize, info.attrs, 7167 BP_MEM_READ, retaddr); 7168 } 7169 if (mtedesc && info.tagged) { 7170 mte_check(env, mtedesc, addr, retaddr); 7171 } 7172 tlb_fn(env, &scratch, reg_off, addr, retaddr); 7173 } 7174 } 7175 reg_off += esize; 7176 pg >>= esize; 7177 } while (reg_off & 63); 7178 } while (reg_off < reg_max); 7179 7180 /* Wait until all exceptions have been raised to write back. */ 7181 memcpy(vd, &scratch, reg_max); 7182 } 7183 7184 static inline QEMU_ALWAYS_INLINE 7185 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7186 target_ulong base, uint32_t desc, uintptr_t retaddr, 7187 int esize, int msize, zreg_off_fn *off_fn, 7188 sve_ldst1_host_fn *host_fn, 7189 sve_ldst1_tlb_fn *tlb_fn) 7190 { 7191 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7192 /* Remove mtedesc from the normal sve descriptor. */ 7193 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7194 7195 /* 7196 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7197 * offset base entirely over the address space hole to change the 7198 * pointer tag, or change the bit55 selector. So we could here 7199 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 7200 */ 7201 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 7202 esize, msize, off_fn, host_fn, tlb_fn); 7203 } 7204 7205 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ 7206 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7207 void *vm, target_ulong base, uint32_t desc) \ 7208 { \ 7209 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 7210 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7211 } \ 7212 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7213 void *vm, target_ulong base, uint32_t desc) \ 7214 { \ 7215 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 7216 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7217 } 7218 7219 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ 7220 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7221 void *vm, target_ulong base, uint32_t desc) \ 7222 { \ 7223 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 7224 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7225 } \ 7226 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7227 void *vm, target_ulong base, uint32_t desc) \ 7228 { \ 7229 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 7230 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7231 } 7232 7233 DO_LD1_ZPZ_S(bsu, zsu, MO_8) 7234 DO_LD1_ZPZ_S(bsu, zss, MO_8) 7235 DO_LD1_ZPZ_D(bdu, zsu, MO_8) 7236 DO_LD1_ZPZ_D(bdu, zss, MO_8) 7237 DO_LD1_ZPZ_D(bdu, zd, MO_8) 7238 7239 DO_LD1_ZPZ_S(bss, zsu, MO_8) 7240 DO_LD1_ZPZ_S(bss, zss, MO_8) 7241 DO_LD1_ZPZ_D(bds, zsu, MO_8) 7242 DO_LD1_ZPZ_D(bds, zss, MO_8) 7243 DO_LD1_ZPZ_D(bds, zd, MO_8) 7244 7245 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16) 7246 DO_LD1_ZPZ_S(hsu_le, zss, MO_16) 7247 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16) 7248 DO_LD1_ZPZ_D(hdu_le, zss, MO_16) 7249 DO_LD1_ZPZ_D(hdu_le, zd, MO_16) 7250 7251 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16) 7252 DO_LD1_ZPZ_S(hsu_be, zss, MO_16) 7253 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16) 7254 DO_LD1_ZPZ_D(hdu_be, zss, MO_16) 7255 DO_LD1_ZPZ_D(hdu_be, zd, MO_16) 7256 7257 DO_LD1_ZPZ_S(hss_le, zsu, MO_16) 7258 DO_LD1_ZPZ_S(hss_le, zss, MO_16) 7259 DO_LD1_ZPZ_D(hds_le, zsu, MO_16) 7260 DO_LD1_ZPZ_D(hds_le, zss, MO_16) 7261 DO_LD1_ZPZ_D(hds_le, zd, MO_16) 7262 7263 DO_LD1_ZPZ_S(hss_be, zsu, MO_16) 7264 DO_LD1_ZPZ_S(hss_be, zss, MO_16) 7265 DO_LD1_ZPZ_D(hds_be, zsu, MO_16) 7266 DO_LD1_ZPZ_D(hds_be, zss, MO_16) 7267 DO_LD1_ZPZ_D(hds_be, zd, MO_16) 7268 7269 DO_LD1_ZPZ_S(ss_le, zsu, MO_32) 7270 DO_LD1_ZPZ_S(ss_le, zss, MO_32) 7271 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32) 7272 DO_LD1_ZPZ_D(sdu_le, zss, MO_32) 7273 DO_LD1_ZPZ_D(sdu_le, zd, MO_32) 7274 7275 DO_LD1_ZPZ_S(ss_be, zsu, MO_32) 7276 DO_LD1_ZPZ_S(ss_be, zss, MO_32) 7277 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32) 7278 DO_LD1_ZPZ_D(sdu_be, zss, MO_32) 7279 DO_LD1_ZPZ_D(sdu_be, zd, MO_32) 7280 7281 DO_LD1_ZPZ_D(sds_le, zsu, MO_32) 7282 DO_LD1_ZPZ_D(sds_le, zss, MO_32) 7283 DO_LD1_ZPZ_D(sds_le, zd, MO_32) 7284 7285 DO_LD1_ZPZ_D(sds_be, zsu, MO_32) 7286 DO_LD1_ZPZ_D(sds_be, zss, MO_32) 7287 DO_LD1_ZPZ_D(sds_be, zd, MO_32) 7288 7289 DO_LD1_ZPZ_D(dd_le, zsu, MO_64) 7290 DO_LD1_ZPZ_D(dd_le, zss, MO_64) 7291 DO_LD1_ZPZ_D(dd_le, zd, MO_64) 7292 7293 DO_LD1_ZPZ_D(dd_be, zsu, MO_64) 7294 DO_LD1_ZPZ_D(dd_be, zss, MO_64) 7295 DO_LD1_ZPZ_D(dd_be, zd, MO_64) 7296 7297 DO_LD1_ZPZ_D(qq_le, zd, MO_128) 7298 DO_LD1_ZPZ_D(qq_be, zd, MO_128) 7299 7300 #undef DO_LD1_ZPZ_S 7301 #undef DO_LD1_ZPZ_D 7302 7303 /* First fault loads with a vector index. */ 7304 7305 /* 7306 * Common helpers for all gather first-faulting loads. 7307 */ 7308 7309 static inline QEMU_ALWAYS_INLINE 7310 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7311 target_ulong base, uint32_t desc, uintptr_t retaddr, 7312 uint32_t mtedesc, const int esz, const int msz, 7313 zreg_off_fn *off_fn, 7314 sve_ldst1_host_fn *host_fn, 7315 sve_ldst1_tlb_fn *tlb_fn) 7316 { 7317 const int mmu_idx = arm_env_mmu_index(env); 7318 const intptr_t reg_max = simd_oprsz(desc); 7319 const int scale = simd_data(desc); 7320 const int esize = 1 << esz; 7321 const int msize = 1 << msz; 7322 intptr_t reg_off; 7323 SVEHostPage info; 7324 target_ulong addr, in_page; 7325 ARMVectorReg scratch; 7326 7327 /* Skip to the first true predicate. */ 7328 reg_off = find_next_active(vg, 0, reg_max, esz); 7329 if (unlikely(reg_off >= reg_max)) { 7330 /* The entire predicate was false; no load occurs. */ 7331 memset(vd, 0, reg_max); 7332 return; 7333 } 7334 7335 /* Protect against overlap between vd and vm. */ 7336 if (unlikely(vd == vm)) { 7337 vm = memcpy(&scratch, vm, reg_max); 7338 } 7339 7340 /* 7341 * Probe the first element, allowing faults. 7342 */ 7343 addr = base + (off_fn(vm, reg_off) << scale); 7344 if (mtedesc) { 7345 mte_check(env, mtedesc, addr, retaddr); 7346 } 7347 tlb_fn(env, vd, reg_off, addr, retaddr); 7348 7349 /* After any fault, zero the other elements. */ 7350 swap_memzero(vd, reg_off); 7351 reg_off += esize; 7352 swap_memzero(vd + reg_off, reg_max - reg_off); 7353 7354 /* 7355 * Probe the remaining elements, not allowing faults. 7356 */ 7357 while (reg_off < reg_max) { 7358 uint64_t pg = vg[reg_off >> 6]; 7359 do { 7360 if (likely((pg >> (reg_off & 63)) & 1)) { 7361 addr = base + (off_fn(vm, reg_off) << scale); 7362 in_page = -(addr | TARGET_PAGE_MASK); 7363 7364 if (unlikely(in_page < msize)) { 7365 /* Stop if the element crosses a page boundary. */ 7366 goto fault; 7367 } 7368 7369 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD, 7370 mmu_idx, retaddr); 7371 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) { 7372 goto fault; 7373 } 7374 if (unlikely(info.flags & TLB_WATCHPOINT) && 7375 (cpu_watchpoint_address_matches 7376 (env_cpu(env), addr, msize) & BP_MEM_READ)) { 7377 goto fault; 7378 } 7379 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) { 7380 goto fault; 7381 } 7382 7383 set_helper_retaddr(retaddr); 7384 host_fn(vd, reg_off, info.host); 7385 clear_helper_retaddr(); 7386 } 7387 reg_off += esize; 7388 } while (reg_off & 63); 7389 } 7390 return; 7391 7392 fault: 7393 record_fault(env, reg_off, reg_max); 7394 } 7395 7396 static inline QEMU_ALWAYS_INLINE 7397 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7398 target_ulong base, uint32_t desc, uintptr_t retaddr, 7399 const int esz, const int msz, 7400 zreg_off_fn *off_fn, 7401 sve_ldst1_host_fn *host_fn, 7402 sve_ldst1_tlb_fn *tlb_fn) 7403 { 7404 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7405 /* Remove mtedesc from the normal sve descriptor. */ 7406 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7407 7408 /* 7409 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7410 * offset base entirely over the address space hole to change the 7411 * pointer tag, or change the bit55 selector. So we could here 7412 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 7413 */ 7414 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 7415 esz, msz, off_fn, host_fn, tlb_fn); 7416 } 7417 7418 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ 7419 void HELPER(sve_ldff##MEM##_##OFS) \ 7420 (CPUARMState *env, void *vd, void *vg, \ 7421 void *vm, target_ulong base, uint32_t desc) \ 7422 { \ 7423 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \ 7424 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7425 } \ 7426 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 7427 (CPUARMState *env, void *vd, void *vg, \ 7428 void *vm, target_ulong base, uint32_t desc) \ 7429 { \ 7430 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ 7431 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7432 } 7433 7434 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ 7435 void HELPER(sve_ldff##MEM##_##OFS) \ 7436 (CPUARMState *env, void *vd, void *vg, \ 7437 void *vm, target_ulong base, uint32_t desc) \ 7438 { \ 7439 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \ 7440 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7441 } \ 7442 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 7443 (CPUARMState *env, void *vd, void *vg, \ 7444 void *vm, target_ulong base, uint32_t desc) \ 7445 { \ 7446 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ 7447 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7448 } 7449 7450 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) 7451 DO_LDFF1_ZPZ_S(bsu, zss, MO_8) 7452 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8) 7453 DO_LDFF1_ZPZ_D(bdu, zss, MO_8) 7454 DO_LDFF1_ZPZ_D(bdu, zd, MO_8) 7455 7456 DO_LDFF1_ZPZ_S(bss, zsu, MO_8) 7457 DO_LDFF1_ZPZ_S(bss, zss, MO_8) 7458 DO_LDFF1_ZPZ_D(bds, zsu, MO_8) 7459 DO_LDFF1_ZPZ_D(bds, zss, MO_8) 7460 DO_LDFF1_ZPZ_D(bds, zd, MO_8) 7461 7462 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16) 7463 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16) 7464 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16) 7465 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16) 7466 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16) 7467 7468 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16) 7469 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16) 7470 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16) 7471 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16) 7472 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16) 7473 7474 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16) 7475 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16) 7476 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16) 7477 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16) 7478 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16) 7479 7480 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16) 7481 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16) 7482 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16) 7483 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16) 7484 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16) 7485 7486 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32) 7487 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32) 7488 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32) 7489 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32) 7490 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32) 7491 7492 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32) 7493 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32) 7494 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32) 7495 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32) 7496 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32) 7497 7498 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32) 7499 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32) 7500 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32) 7501 7502 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32) 7503 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32) 7504 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32) 7505 7506 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64) 7507 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64) 7508 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64) 7509 7510 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64) 7511 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64) 7512 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64) 7513 7514 /* Stores with a vector index. */ 7515 7516 static inline QEMU_ALWAYS_INLINE 7517 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7518 target_ulong base, uint32_t desc, uintptr_t retaddr, 7519 uint32_t mtedesc, int esize, int msize, 7520 zreg_off_fn *off_fn, 7521 sve_ldst1_host_fn *host_fn, 7522 sve_ldst1_tlb_fn *tlb_fn) 7523 { 7524 const int mmu_idx = arm_env_mmu_index(env); 7525 const intptr_t reg_max = simd_oprsz(desc); 7526 const int scale = simd_data(desc); 7527 void *host[ARM_MAX_VQ * 4]; 7528 intptr_t reg_off, i; 7529 SVEHostPage info, info2; 7530 7531 /* 7532 * Probe all of the elements for host addresses and flags. 7533 */ 7534 i = reg_off = 0; 7535 do { 7536 uint64_t pg = vg[reg_off >> 6]; 7537 do { 7538 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 7539 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 7540 7541 host[i] = NULL; 7542 if (likely((pg >> (reg_off & 63)) & 1)) { 7543 if (likely(in_page >= msize)) { 7544 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE, 7545 mmu_idx, retaddr); 7546 if (!(info.flags & TLB_MMIO)) { 7547 host[i] = info.host; 7548 } 7549 } else { 7550 /* 7551 * Element crosses the page boundary. 7552 * Probe both pages, but do not record the host address, 7553 * so that we use the slow path. 7554 */ 7555 sve_probe_page(&info, false, env, addr, 0, 7556 MMU_DATA_STORE, mmu_idx, retaddr); 7557 sve_probe_page(&info2, false, env, addr + in_page, 0, 7558 MMU_DATA_STORE, mmu_idx, retaddr); 7559 info.flags |= info2.flags; 7560 } 7561 7562 if (unlikely(info.flags & TLB_WATCHPOINT)) { 7563 cpu_check_watchpoint(env_cpu(env), addr, msize, 7564 info.attrs, BP_MEM_WRITE, retaddr); 7565 } 7566 7567 if (mtedesc && info.tagged) { 7568 mte_check(env, mtedesc, addr, retaddr); 7569 } 7570 } 7571 i += 1; 7572 reg_off += esize; 7573 } while (reg_off & 63); 7574 } while (reg_off < reg_max); 7575 7576 /* 7577 * Now that we have recognized all exceptions except SyncExternal 7578 * (from TLB_MMIO), which we cannot avoid, perform all of the stores. 7579 * 7580 * Note for the common case of an element in RAM, not crossing a page 7581 * boundary, we have stored the host address in host[]. This doubles 7582 * as a first-level check against the predicate, since only enabled 7583 * elements have non-null host addresses. 7584 */ 7585 i = reg_off = 0; 7586 do { 7587 void *h = host[i]; 7588 if (likely(h != NULL)) { 7589 set_helper_retaddr(retaddr); 7590 host_fn(vd, reg_off, h); 7591 clear_helper_retaddr(); 7592 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) { 7593 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 7594 tlb_fn(env, vd, reg_off, addr, retaddr); 7595 } 7596 i += 1; 7597 reg_off += esize; 7598 } while (reg_off < reg_max); 7599 } 7600 7601 static inline QEMU_ALWAYS_INLINE 7602 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7603 target_ulong base, uint32_t desc, uintptr_t retaddr, 7604 int esize, int msize, zreg_off_fn *off_fn, 7605 sve_ldst1_host_fn *host_fn, 7606 sve_ldst1_tlb_fn *tlb_fn) 7607 { 7608 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7609 /* Remove mtedesc from the normal sve descriptor. */ 7610 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7611 7612 /* 7613 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7614 * offset base entirely over the address space hole to change the 7615 * pointer tag, or change the bit55 selector. So we could here 7616 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 7617 */ 7618 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 7619 esize, msize, off_fn, host_fn, tlb_fn); 7620 } 7621 7622 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \ 7623 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7624 void *vm, target_ulong base, uint32_t desc) \ 7625 { \ 7626 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 7627 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7628 } \ 7629 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7630 void *vm, target_ulong base, uint32_t desc) \ 7631 { \ 7632 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 7633 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7634 } 7635 7636 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \ 7637 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7638 void *vm, target_ulong base, uint32_t desc) \ 7639 { \ 7640 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 7641 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7642 } \ 7643 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7644 void *vm, target_ulong base, uint32_t desc) \ 7645 { \ 7646 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 7647 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7648 } 7649 7650 DO_ST1_ZPZ_S(bs, zsu, MO_8) 7651 DO_ST1_ZPZ_S(hs_le, zsu, MO_16) 7652 DO_ST1_ZPZ_S(hs_be, zsu, MO_16) 7653 DO_ST1_ZPZ_S(ss_le, zsu, MO_32) 7654 DO_ST1_ZPZ_S(ss_be, zsu, MO_32) 7655 7656 DO_ST1_ZPZ_S(bs, zss, MO_8) 7657 DO_ST1_ZPZ_S(hs_le, zss, MO_16) 7658 DO_ST1_ZPZ_S(hs_be, zss, MO_16) 7659 DO_ST1_ZPZ_S(ss_le, zss, MO_32) 7660 DO_ST1_ZPZ_S(ss_be, zss, MO_32) 7661 7662 DO_ST1_ZPZ_D(bd, zsu, MO_8) 7663 DO_ST1_ZPZ_D(hd_le, zsu, MO_16) 7664 DO_ST1_ZPZ_D(hd_be, zsu, MO_16) 7665 DO_ST1_ZPZ_D(sd_le, zsu, MO_32) 7666 DO_ST1_ZPZ_D(sd_be, zsu, MO_32) 7667 DO_ST1_ZPZ_D(dd_le, zsu, MO_64) 7668 DO_ST1_ZPZ_D(dd_be, zsu, MO_64) 7669 7670 DO_ST1_ZPZ_D(bd, zss, MO_8) 7671 DO_ST1_ZPZ_D(hd_le, zss, MO_16) 7672 DO_ST1_ZPZ_D(hd_be, zss, MO_16) 7673 DO_ST1_ZPZ_D(sd_le, zss, MO_32) 7674 DO_ST1_ZPZ_D(sd_be, zss, MO_32) 7675 DO_ST1_ZPZ_D(dd_le, zss, MO_64) 7676 DO_ST1_ZPZ_D(dd_be, zss, MO_64) 7677 7678 DO_ST1_ZPZ_D(bd, zd, MO_8) 7679 DO_ST1_ZPZ_D(hd_le, zd, MO_16) 7680 DO_ST1_ZPZ_D(hd_be, zd, MO_16) 7681 DO_ST1_ZPZ_D(sd_le, zd, MO_32) 7682 DO_ST1_ZPZ_D(sd_be, zd, MO_32) 7683 DO_ST1_ZPZ_D(dd_le, zd, MO_64) 7684 DO_ST1_ZPZ_D(dd_be, zd, MO_64) 7685 7686 DO_ST1_ZPZ_D(qq_le, zd, MO_128) 7687 DO_ST1_ZPZ_D(qq_be, zd, MO_128) 7688 7689 #undef DO_ST1_ZPZ_S 7690 #undef DO_ST1_ZPZ_D 7691 7692 /* 7693 * SVE2.1 consecutive register load/store 7694 */ 7695 7696 static unsigned sve2p1_cont_ldst_elements(SVEContLdSt *info, vaddr addr, 7697 uint32_t png, intptr_t reg_max, 7698 int N, int v_esz) 7699 { 7700 const int esize = 1 << v_esz; 7701 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; 7702 DecodeCounter p = decode_counter(png, reg_max, v_esz); 7703 unsigned b_count = p.count << v_esz; 7704 unsigned b_stride = 1 << (v_esz + p.lg2_stride); 7705 intptr_t page_split; 7706 7707 /* Set all of the element indices to -1, and the TLB data to 0. */ 7708 memset(info, -1, offsetof(SVEContLdSt, page)); 7709 memset(info->page, 0, sizeof(info->page)); 7710 7711 if (p.invert) { 7712 if (b_count >= reg_max * N) { 7713 return 0; 7714 } 7715 reg_off_first = b_count; 7716 reg_off_last = reg_max * N - b_stride; 7717 } else { 7718 if (b_count == 0) { 7719 return 0; 7720 } 7721 reg_off_first = 0; 7722 reg_off_last = MIN(b_count - esize, reg_max * N - b_stride); 7723 } 7724 7725 info->reg_off_first[0] = reg_off_first; 7726 info->mem_off_first[0] = reg_off_first; 7727 7728 page_split = -(addr | TARGET_PAGE_MASK); 7729 if (reg_off_last + esize <= page_split || reg_off_first >= page_split) { 7730 /* The entire operation fits within a single page. */ 7731 info->reg_off_last[0] = reg_off_last; 7732 return b_stride; 7733 } 7734 7735 info->page_split = page_split; 7736 reg_off_split = ROUND_DOWN(page_split, esize); 7737 7738 /* 7739 * This is the last full element on the first page, but it is not 7740 * necessarily active. If there is no full element, i.e. the first 7741 * active element is the one that's split, this value remains -1. 7742 * It is useful as iteration bounds. 7743 */ 7744 if (reg_off_split != 0) { 7745 info->reg_off_last[0] = ROUND_DOWN(reg_off_split - esize, b_stride); 7746 } 7747 7748 /* Determine if an unaligned element spans the pages. */ 7749 if (page_split & (esize - 1)) { 7750 /* It is helpful to know if the split element is active. */ 7751 if ((reg_off_split & (b_stride - 1)) == 0) { 7752 info->reg_off_split = reg_off_split; 7753 info->mem_off_split = reg_off_split; 7754 } 7755 reg_off_split += esize; 7756 } 7757 7758 /* 7759 * We do want the first active element on the second page, because 7760 * this may affect the address reported in an exception. 7761 */ 7762 reg_off_split = ROUND_UP(reg_off_split, b_stride); 7763 if (reg_off_split <= reg_off_last) { 7764 info->reg_off_first[1] = reg_off_split; 7765 info->mem_off_first[1] = reg_off_split; 7766 info->reg_off_last[1] = reg_off_last; 7767 } 7768 return b_stride; 7769 } 7770 7771 static void sve2p1_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, 7772 target_ulong addr, unsigned estride, 7773 int esize, int wp_access, uintptr_t ra) 7774 { 7775 #ifndef CONFIG_USER_ONLY 7776 intptr_t count_off, count_last; 7777 int flags0 = info->page[0].flags; 7778 int flags1 = info->page[1].flags; 7779 7780 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { 7781 return; 7782 } 7783 7784 /* Indicate that watchpoints are handled. */ 7785 info->page[0].flags = flags0 & ~TLB_WATCHPOINT; 7786 info->page[1].flags = flags1 & ~TLB_WATCHPOINT; 7787 7788 if (flags0 & TLB_WATCHPOINT) { 7789 count_off = info->reg_off_first[0]; 7790 count_last = info->reg_off_split; 7791 if (count_last < 0) { 7792 count_last = info->reg_off_last[0]; 7793 } 7794 do { 7795 cpu_check_watchpoint(env_cpu(env), addr + count_off, 7796 esize, info->page[0].attrs, wp_access, ra); 7797 count_off += estride; 7798 } while (count_off <= count_last); 7799 } 7800 7801 count_off = info->reg_off_first[1]; 7802 if ((flags1 & TLB_WATCHPOINT) && count_off >= 0) { 7803 count_last = info->reg_off_last[1]; 7804 do { 7805 cpu_check_watchpoint(env_cpu(env), addr + count_off, 7806 esize, info->page[1].attrs, 7807 wp_access, ra); 7808 count_off += estride; 7809 } while (count_off <= count_last); 7810 } 7811 #endif 7812 } 7813 7814 static void sve2p1_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env, 7815 target_ulong addr, unsigned estride, 7816 int esize, uint32_t mtedesc, 7817 uintptr_t ra) 7818 { 7819 intptr_t count_off, count_last; 7820 7821 /* 7822 * TODO: estride is always a small power of two, <= 8. 7823 * Manipulate the stride within the loops such that 7824 * - first iteration hits addr + off, as required, 7825 * - second iteration hits ALIGN_UP(addr, 16), 7826 * - other iterations advance addr by 16. 7827 * This will minimize the probing to once per MTE granule. 7828 */ 7829 7830 /* Process the page only if MemAttr == Tagged. */ 7831 if (info->page[0].tagged) { 7832 count_off = info->reg_off_first[0]; 7833 count_last = info->reg_off_split; 7834 if (count_last < 0) { 7835 count_last = info->reg_off_last[0]; 7836 } 7837 7838 do { 7839 mte_check(env, mtedesc, addr + count_off, ra); 7840 count_off += estride; 7841 } while (count_off <= count_last); 7842 } 7843 7844 count_off = info->reg_off_first[1]; 7845 if (count_off >= 0 && info->page[1].tagged) { 7846 count_last = info->reg_off_last[1]; 7847 do { 7848 mte_check(env, mtedesc, addr + count_off, ra); 7849 count_off += estride; 7850 } while (count_off <= count_last); 7851 } 7852 } 7853 7854 static inline QEMU_ALWAYS_INLINE 7855 void sve2p1_ld1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr, 7856 uint32_t png, uint32_t desc, 7857 const uintptr_t ra, const MemOp esz, 7858 sve_ldst1_host_fn *host_fn, 7859 sve_ldst1_tlb_fn *tlb_fn) 7860 { 7861 const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 4 : 2; 7862 const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4); 7863 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7864 const intptr_t reg_max = simd_oprsz(desc); 7865 const unsigned esize = 1 << esz; 7866 intptr_t count_off, count_last; 7867 intptr_t reg_off, reg_last, reg_n; 7868 SVEContLdSt info; 7869 unsigned estride, flags; 7870 void *host; 7871 7872 estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz); 7873 if (estride == 0) { 7874 /* The entire predicate was false; no load occurs. */ 7875 for (unsigned n = 0; n < N; n++) { 7876 memset(zd + n * rstride, 0, reg_max); 7877 } 7878 return; 7879 } 7880 7881 /* Probe the page(s). Exit with exception for any invalid page. */ 7882 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra); 7883 7884 /* Handle watchpoints for all active elements. */ 7885 sve2p1_cont_ldst_watchpoints(&info, env, addr, estride, 7886 esize, BP_MEM_READ, ra); 7887 7888 /* 7889 * Handle mte checks for all active elements. 7890 * Since TBI must be set for MTE, !mtedesc => !mte_active. 7891 */ 7892 if (mtedesc) { 7893 sve2p1_cont_ldst_mte_check(&info, env, estride, addr, 7894 esize, mtedesc, ra); 7895 } 7896 7897 flags = info.page[0].flags | info.page[1].flags; 7898 if (unlikely(flags != 0)) { 7899 /* 7900 * At least one page includes MMIO. 7901 * Any bus operation can fail with cpu_transaction_failed, 7902 * which for ARM will raise SyncExternal. Perform the load 7903 * into scratch memory to preserve register state until the end. 7904 */ 7905 ARMVectorReg scratch[4] = { }; 7906 7907 count_off = info.reg_off_first[0]; 7908 count_last = info.reg_off_last[1]; 7909 if (count_last < 0) { 7910 count_last = info.reg_off_split; 7911 if (count_last < 0) { 7912 count_last = info.reg_off_last[0]; 7913 } 7914 } 7915 reg_off = count_off % reg_max; 7916 reg_n = count_off / reg_max; 7917 7918 do { 7919 reg_last = MIN(count_last - count_off, reg_max - esize); 7920 do { 7921 tlb_fn(env, &scratch[reg_n], reg_off, addr + count_off, ra); 7922 reg_off += estride; 7923 count_off += estride; 7924 } while (reg_off <= reg_last); 7925 reg_off = 0; 7926 reg_n++; 7927 } while (count_off <= count_last); 7928 7929 for (unsigned n = 0; n < N; ++n) { 7930 memcpy(&zd[n * rstride], &scratch[n], reg_max); 7931 } 7932 return; 7933 } 7934 7935 /* The entire operation is in RAM, on valid pages. */ 7936 7937 for (unsigned n = 0; n < N; ++n) { 7938 memset(&zd[n * rstride], 0, reg_max); 7939 } 7940 7941 count_off = info.reg_off_first[0]; 7942 count_last = info.reg_off_last[0]; 7943 reg_off = count_off % reg_max; 7944 reg_n = count_off / reg_max; 7945 host = info.page[0].host; 7946 7947 set_helper_retaddr(ra); 7948 7949 do { 7950 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); 7951 do { 7952 host_fn(&zd[reg_n * rstride], reg_off, host + count_off); 7953 reg_off += estride; 7954 count_off += estride; 7955 } while (reg_off <= reg_last); 7956 reg_off = 0; 7957 reg_n++; 7958 } while (count_off <= count_last); 7959 7960 clear_helper_retaddr(); 7961 7962 /* 7963 * Use the slow path to manage the cross-page misalignment. 7964 * But we know this is RAM and cannot trap. 7965 */ 7966 count_off = info.reg_off_split; 7967 if (unlikely(count_off >= 0)) { 7968 reg_off = count_off % reg_max; 7969 reg_n = count_off / reg_max; 7970 tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra); 7971 } 7972 7973 count_off = info.reg_off_first[1]; 7974 if (unlikely(count_off >= 0)) { 7975 count_last = info.reg_off_last[1]; 7976 reg_off = count_off % reg_max; 7977 reg_n = count_off / reg_max; 7978 host = info.page[1].host; 7979 7980 set_helper_retaddr(ra); 7981 7982 do { 7983 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); 7984 do { 7985 host_fn(&zd[reg_n * rstride], reg_off, host + count_off); 7986 reg_off += estride; 7987 count_off += estride; 7988 } while (reg_off <= reg_last); 7989 reg_off = 0; 7990 reg_n++; 7991 } while (count_off <= count_last); 7992 7993 clear_helper_retaddr(); 7994 } 7995 } 7996 7997 void HELPER(sve2p1_ld1bb_c)(CPUARMState *env, void *vd, target_ulong addr, 7998 uint32_t png, uint32_t desc) 7999 { 8000 sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), MO_8, 8001 sve_ld1bb_host, sve_ld1bb_tlb); 8002 } 8003 8004 #define DO_LD1_2(NAME, ESZ) \ 8005 void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd, \ 8006 target_ulong addr, uint32_t png, \ 8007 uint32_t desc) \ 8008 { \ 8009 sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ 8010 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 8011 } \ 8012 void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd, \ 8013 target_ulong addr, uint32_t png, \ 8014 uint32_t desc) \ 8015 { \ 8016 sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ 8017 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 8018 } 8019 8020 DO_LD1_2(ld1hh, MO_16) 8021 DO_LD1_2(ld1ss, MO_32) 8022 DO_LD1_2(ld1dd, MO_64) 8023 8024 #undef DO_LD1_2 8025 8026 static inline QEMU_ALWAYS_INLINE 8027 void sve2p1_st1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr, 8028 uint32_t png, uint32_t desc, 8029 const uintptr_t ra, const int esz, 8030 sve_ldst1_host_fn *host_fn, 8031 sve_ldst1_tlb_fn *tlb_fn) 8032 { 8033 const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 4 : 2; 8034 const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4); 8035 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 8036 const intptr_t reg_max = simd_oprsz(desc); 8037 const unsigned esize = 1 << esz; 8038 intptr_t count_off, count_last; 8039 intptr_t reg_off, reg_last, reg_n; 8040 SVEContLdSt info; 8041 unsigned estride, flags; 8042 void *host; 8043 8044 estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz); 8045 if (estride == 0) { 8046 /* The entire predicate was false; no store occurs. */ 8047 return; 8048 } 8049 8050 /* Probe the page(s). Exit with exception for any invalid page. */ 8051 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra); 8052 8053 /* Handle watchpoints for all active elements. */ 8054 sve2p1_cont_ldst_watchpoints(&info, env, addr, estride, 8055 esize, BP_MEM_WRITE, ra); 8056 8057 /* 8058 * Handle mte checks for all active elements. 8059 * Since TBI must be set for MTE, !mtedesc => !mte_active. 8060 */ 8061 if (mtedesc) { 8062 sve2p1_cont_ldst_mte_check(&info, env, estride, addr, 8063 esize, mtedesc, ra); 8064 } 8065 8066 flags = info.page[0].flags | info.page[1].flags; 8067 if (unlikely(flags != 0)) { 8068 /* 8069 * At least one page includes MMIO. 8070 * Any bus operation can fail with cpu_transaction_failed, 8071 * which for ARM will raise SyncExternal. Perform the load 8072 * into scratch memory to preserve register state until the end. 8073 */ 8074 count_off = info.reg_off_first[0]; 8075 count_last = info.reg_off_last[1]; 8076 if (count_last < 0) { 8077 count_last = info.reg_off_split; 8078 if (count_last < 0) { 8079 count_last = info.reg_off_last[0]; 8080 } 8081 } 8082 reg_off = count_off % reg_max; 8083 reg_n = count_off / reg_max; 8084 8085 do { 8086 reg_last = MIN(count_last - count_off, reg_max - esize); 8087 do { 8088 tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra); 8089 reg_off += estride; 8090 count_off += estride; 8091 } while (reg_off <= reg_last); 8092 reg_off = 0; 8093 reg_n++; 8094 } while (count_off <= count_last); 8095 return; 8096 } 8097 8098 /* The entire operation is in RAM, on valid pages. */ 8099 8100 count_off = info.reg_off_first[0]; 8101 count_last = info.reg_off_last[0]; 8102 reg_off = count_off % reg_max; 8103 reg_n = count_off / reg_max; 8104 host = info.page[0].host; 8105 8106 set_helper_retaddr(ra); 8107 8108 do { 8109 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); 8110 do { 8111 host_fn(&zd[reg_n * rstride], reg_off, host + count_off); 8112 reg_off += estride; 8113 count_off += estride; 8114 } while (reg_off <= reg_last); 8115 reg_off = 0; 8116 reg_n++; 8117 } while (count_off <= count_last); 8118 8119 clear_helper_retaddr(); 8120 8121 /* 8122 * Use the slow path to manage the cross-page misalignment. 8123 * But we know this is RAM and cannot trap. 8124 */ 8125 count_off = info.reg_off_split; 8126 if (unlikely(count_off >= 0)) { 8127 reg_off = count_off % reg_max; 8128 reg_n = count_off / reg_max; 8129 tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra); 8130 } 8131 8132 count_off = info.reg_off_first[1]; 8133 if (unlikely(count_off >= 0)) { 8134 count_last = info.reg_off_last[1]; 8135 reg_off = count_off % reg_max; 8136 reg_n = count_off / reg_max; 8137 host = info.page[1].host; 8138 8139 set_helper_retaddr(ra); 8140 8141 do { 8142 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); 8143 do { 8144 host_fn(&zd[reg_n * rstride], reg_off, host + count_off); 8145 reg_off += estride; 8146 count_off += estride; 8147 } while (reg_off <= reg_last); 8148 reg_off = 0; 8149 reg_n++; 8150 } while (count_off <= count_last); 8151 8152 clear_helper_retaddr(); 8153 } 8154 } 8155 8156 void HELPER(sve2p1_st1bb_c)(CPUARMState *env, void *vd, target_ulong addr, 8157 uint32_t png, uint32_t desc) 8158 { 8159 sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), MO_8, 8160 sve_st1bb_host, sve_st1bb_tlb); 8161 } 8162 8163 #define DO_ST1_2(NAME, ESZ) \ 8164 void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd, \ 8165 target_ulong addr, uint32_t png, \ 8166 uint32_t desc) \ 8167 { \ 8168 sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ 8169 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 8170 } \ 8171 void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd, \ 8172 target_ulong addr, uint32_t png, \ 8173 uint32_t desc) \ 8174 { \ 8175 sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ 8176 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 8177 } 8178 8179 DO_ST1_2(st1hh, MO_16) 8180 DO_ST1_2(st1ss, MO_32) 8181 DO_ST1_2(st1dd, MO_64) 8182 8183 #undef DO_ST1_2 8184 8185 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 8186 { 8187 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 8188 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 8189 8190 for (i = 0; i < opr_sz; ++i) { 8191 d[i] = n[i] ^ m[i] ^ k[i]; 8192 } 8193 } 8194 8195 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 8196 { 8197 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 8198 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 8199 8200 for (i = 0; i < opr_sz; ++i) { 8201 d[i] = n[i] ^ (m[i] & ~k[i]); 8202 } 8203 } 8204 8205 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 8206 { 8207 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 8208 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 8209 8210 for (i = 0; i < opr_sz; ++i) { 8211 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]); 8212 } 8213 } 8214 8215 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 8216 { 8217 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 8218 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 8219 8220 for (i = 0; i < opr_sz; ++i) { 8221 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]); 8222 } 8223 } 8224 8225 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 8226 { 8227 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 8228 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 8229 8230 for (i = 0; i < opr_sz; ++i) { 8231 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i])); 8232 } 8233 } 8234 8235 /* 8236 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n. 8237 * See hasless(v,1) from 8238 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord 8239 */ 8240 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz) 8241 { 8242 int bits = 8 << esz; 8243 uint64_t ones = dup_const(esz, 1); 8244 uint64_t signs = ones << (bits - 1); 8245 uint64_t cmp0, cmp1; 8246 8247 cmp1 = dup_const(esz, n); 8248 cmp0 = cmp1 ^ m0; 8249 cmp1 = cmp1 ^ m1; 8250 cmp0 = (cmp0 - ones) & ~cmp0; 8251 cmp1 = (cmp1 - ones) & ~cmp1; 8252 return (cmp0 | cmp1) & signs; 8253 } 8254 8255 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg, 8256 uint32_t desc, int esz, bool nmatch) 8257 { 8258 uint16_t esz_mask = pred_esz_masks[esz]; 8259 intptr_t opr_sz = simd_oprsz(desc); 8260 uint32_t flags = PREDTEST_INIT; 8261 intptr_t i, j, k; 8262 8263 for (i = 0; i < opr_sz; i += 16) { 8264 uint64_t m0 = *(uint64_t *)(vm + i); 8265 uint64_t m1 = *(uint64_t *)(vm + i + 8); 8266 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask; 8267 uint16_t out = 0; 8268 8269 for (j = 0; j < 16; j += 8) { 8270 uint64_t n = *(uint64_t *)(vn + i + j); 8271 8272 for (k = 0; k < 8; k += 1 << esz) { 8273 if (pg & (1 << (j + k))) { 8274 bool o = do_match2(n >> (k * 8), m0, m1, esz); 8275 out |= (o ^ nmatch) << (j + k); 8276 } 8277 } 8278 } 8279 *(uint16_t *)(vd + H1_2(i >> 3)) = out; 8280 flags = iter_predtest_fwd(out, pg, flags); 8281 } 8282 return flags; 8283 } 8284 8285 #define DO_PPZZ_MATCH(NAME, ESZ, INV) \ 8286 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 8287 { \ 8288 return do_match(vd, vn, vm, vg, desc, ESZ, INV); \ 8289 } 8290 8291 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false) 8292 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false) 8293 8294 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true) 8295 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true) 8296 8297 #undef DO_PPZZ_MATCH 8298 8299 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg, 8300 uint32_t desc) 8301 { 8302 ARMVectorReg scratch; 8303 intptr_t i, j; 8304 intptr_t opr_sz = simd_oprsz(desc); 8305 uint32_t *d = vd, *n = vn, *m = vm; 8306 uint8_t *pg = vg; 8307 8308 if (d == n) { 8309 n = memcpy(&scratch, n, opr_sz); 8310 if (d == m) { 8311 m = n; 8312 } 8313 } else if (d == m) { 8314 m = memcpy(&scratch, m, opr_sz); 8315 } 8316 8317 for (i = 0; i < opr_sz; i += 4) { 8318 uint64_t count = 0; 8319 uint8_t pred; 8320 8321 pred = pg[H1(i >> 3)] >> (i & 7); 8322 if (pred & 1) { 8323 uint32_t nn = n[H4(i >> 2)]; 8324 8325 for (j = 0; j <= i; j += 4) { 8326 pred = pg[H1(j >> 3)] >> (j & 7); 8327 if ((pred & 1) && nn == m[H4(j >> 2)]) { 8328 ++count; 8329 } 8330 } 8331 } 8332 d[H4(i >> 2)] = count; 8333 } 8334 } 8335 8336 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg, 8337 uint32_t desc) 8338 { 8339 ARMVectorReg scratch; 8340 intptr_t i, j; 8341 intptr_t opr_sz = simd_oprsz(desc); 8342 uint64_t *d = vd, *n = vn, *m = vm; 8343 uint8_t *pg = vg; 8344 8345 if (d == n) { 8346 n = memcpy(&scratch, n, opr_sz); 8347 if (d == m) { 8348 m = n; 8349 } 8350 } else if (d == m) { 8351 m = memcpy(&scratch, m, opr_sz); 8352 } 8353 8354 for (i = 0; i < opr_sz / 8; ++i) { 8355 uint64_t count = 0; 8356 if (pg[H1(i)] & 1) { 8357 uint64_t nn = n[i]; 8358 for (j = 0; j <= i; ++j) { 8359 if ((pg[H1(j)] & 1) && nn == m[j]) { 8360 ++count; 8361 } 8362 } 8363 } 8364 d[i] = count; 8365 } 8366 } 8367 8368 /* 8369 * Returns the number of bytes in m0 and m1 that match n. 8370 * Unlike do_match2 we don't just need true/false, we need an exact count. 8371 * This requires two extra logical operations. 8372 */ 8373 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1) 8374 { 8375 const uint64_t mask = dup_const(MO_8, 0x7f); 8376 uint64_t cmp0, cmp1; 8377 8378 cmp1 = dup_const(MO_8, n); 8379 cmp0 = cmp1 ^ m0; 8380 cmp1 = cmp1 ^ m1; 8381 8382 /* 8383 * 1: clear msb of each byte to avoid carry to next byte (& mask) 8384 * 2: carry in to msb if byte != 0 (+ mask) 8385 * 3: set msb if cmp has msb set (| cmp) 8386 * 4: set ~msb to ignore them (| mask) 8387 * We now have 0xff for byte != 0 or 0x7f for byte == 0. 8388 * 5: invert, resulting in 0x80 if and only if byte == 0. 8389 */ 8390 cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask); 8391 cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask); 8392 8393 /* 8394 * Combine the two compares in a way that the bits do 8395 * not overlap, and so preserves the count of set bits. 8396 * If the host has an efficient instruction for ctpop, 8397 * then ctpop(x) + ctpop(y) has the same number of 8398 * operations as ctpop(x | (y >> 1)). If the host does 8399 * not have an efficient ctpop, then we only want to 8400 * use it once. 8401 */ 8402 return ctpop64(cmp0 | (cmp1 >> 1)); 8403 } 8404 8405 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc) 8406 { 8407 intptr_t i, j; 8408 intptr_t opr_sz = simd_oprsz(desc); 8409 8410 for (i = 0; i < opr_sz; i += 16) { 8411 uint64_t n0 = *(uint64_t *)(vn + i); 8412 uint64_t m0 = *(uint64_t *)(vm + i); 8413 uint64_t n1 = *(uint64_t *)(vn + i + 8); 8414 uint64_t m1 = *(uint64_t *)(vm + i + 8); 8415 uint64_t out0 = 0; 8416 uint64_t out1 = 0; 8417 8418 for (j = 0; j < 64; j += 8) { 8419 uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1); 8420 uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1); 8421 out0 |= cnt0 << j; 8422 out1 |= cnt1 << j; 8423 } 8424 8425 *(uint64_t *)(vd + i) = out0; 8426 *(uint64_t *)(vd + i + 8) = out1; 8427 } 8428 } 8429 8430 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc) 8431 { 8432 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 8433 int shr = simd_data(desc); 8434 int shl = 8 - shr; 8435 uint64_t mask = dup_const(MO_8, 0xff >> shr); 8436 uint64_t *d = vd, *n = vn, *m = vm; 8437 8438 for (i = 0; i < opr_sz; ++i) { 8439 uint64_t t = n[i] ^ m[i]; 8440 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); 8441 } 8442 } 8443 8444 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc) 8445 { 8446 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 8447 int shr = simd_data(desc); 8448 int shl = 16 - shr; 8449 uint64_t mask = dup_const(MO_16, 0xffff >> shr); 8450 uint64_t *d = vd, *n = vn, *m = vm; 8451 8452 for (i = 0; i < opr_sz; ++i) { 8453 uint64_t t = n[i] ^ m[i]; 8454 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); 8455 } 8456 } 8457 8458 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc) 8459 { 8460 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 8461 int shr = simd_data(desc); 8462 uint32_t *d = vd, *n = vn, *m = vm; 8463 8464 for (i = 0; i < opr_sz; ++i) { 8465 d[i] = ror32(n[i] ^ m[i], shr); 8466 } 8467 } 8468 8469 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va, 8470 float_status *status, uint32_t desc) 8471 { 8472 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4); 8473 8474 for (s = 0; s < opr_sz; ++s) { 8475 float32 *n = vn + s * sizeof(float32) * 4; 8476 float32 *m = vm + s * sizeof(float32) * 4; 8477 float32 *a = va + s * sizeof(float32) * 4; 8478 float32 *d = vd + s * sizeof(float32) * 4; 8479 float32 n00 = n[H4(0)], n01 = n[H4(1)]; 8480 float32 n10 = n[H4(2)], n11 = n[H4(3)]; 8481 float32 m00 = m[H4(0)], m01 = m[H4(1)]; 8482 float32 m10 = m[H4(2)], m11 = m[H4(3)]; 8483 float32 p0, p1; 8484 8485 /* i = 0, j = 0 */ 8486 p0 = float32_mul(n00, m00, status); 8487 p1 = float32_mul(n01, m01, status); 8488 d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status); 8489 8490 /* i = 0, j = 1 */ 8491 p0 = float32_mul(n00, m10, status); 8492 p1 = float32_mul(n01, m11, status); 8493 d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status); 8494 8495 /* i = 1, j = 0 */ 8496 p0 = float32_mul(n10, m00, status); 8497 p1 = float32_mul(n11, m01, status); 8498 d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status); 8499 8500 /* i = 1, j = 1 */ 8501 p0 = float32_mul(n10, m10, status); 8502 p1 = float32_mul(n11, m11, status); 8503 d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status); 8504 } 8505 } 8506 8507 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va, 8508 float_status *status, uint32_t desc) 8509 { 8510 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4); 8511 8512 for (s = 0; s < opr_sz; ++s) { 8513 float64 *n = vn + s * sizeof(float64) * 4; 8514 float64 *m = vm + s * sizeof(float64) * 4; 8515 float64 *a = va + s * sizeof(float64) * 4; 8516 float64 *d = vd + s * sizeof(float64) * 4; 8517 float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3]; 8518 float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3]; 8519 float64 p0, p1; 8520 8521 /* i = 0, j = 0 */ 8522 p0 = float64_mul(n00, m00, status); 8523 p1 = float64_mul(n01, m01, status); 8524 d[0] = float64_add(a[0], float64_add(p0, p1, status), status); 8525 8526 /* i = 0, j = 1 */ 8527 p0 = float64_mul(n00, m10, status); 8528 p1 = float64_mul(n01, m11, status); 8529 d[1] = float64_add(a[1], float64_add(p0, p1, status), status); 8530 8531 /* i = 1, j = 0 */ 8532 p0 = float64_mul(n10, m00, status); 8533 p1 = float64_mul(n11, m01, status); 8534 d[2] = float64_add(a[2], float64_add(p0, p1, status), status); 8535 8536 /* i = 1, j = 1 */ 8537 p0 = float64_mul(n10, m10, status); 8538 p1 = float64_mul(n11, m11, status); 8539 d[3] = float64_add(a[3], float64_add(p0, p1, status), status); 8540 } 8541 } 8542 8543 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 8544 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 8545 float_status *status, uint32_t desc) \ 8546 { \ 8547 intptr_t i = simd_oprsz(desc); \ 8548 uint64_t *g = vg; \ 8549 do { \ 8550 uint64_t pg = g[(i - 1) >> 6]; \ 8551 do { \ 8552 i -= sizeof(TYPEW); \ 8553 if (likely((pg >> (i & 63)) & 1)) { \ 8554 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 8555 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \ 8556 } \ 8557 } while (i & 63); \ 8558 } while (i != 0); \ 8559 } 8560 8561 DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16) 8562 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16) 8563 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32) 8564 8565 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 8566 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 8567 float_status *status, uint32_t desc) \ 8568 { \ 8569 intptr_t i = simd_oprsz(desc); \ 8570 uint64_t *g = vg; \ 8571 do { \ 8572 uint64_t pg = g[(i - 1) >> 6]; \ 8573 do { \ 8574 i -= sizeof(TYPEW); \ 8575 if (likely((pg >> (i & 63)) & 1)) { \ 8576 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \ 8577 *(TYPEW *)(vd + HW(i)) = OP(nn, status); \ 8578 } \ 8579 } while (i & 63); \ 8580 } while (i != 0); \ 8581 } 8582 8583 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32) 8584 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64) 8585 8586 #undef DO_FCVTLT 8587 #undef DO_FCVTNT 8588 8589 void HELPER(pext)(void *vd, uint32_t png, uint32_t desc) 8590 { 8591 int pl = FIELD_EX32(desc, PREDDESC, OPRSZ); 8592 int vl = pl * 8; 8593 unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ); 8594 int part = FIELD_EX32(desc, PREDDESC, DATA); 8595 DecodeCounter p = decode_counter(png, vl, v_esz); 8596 uint64_t mask = pred_esz_masks[v_esz + p.lg2_stride]; 8597 ARMPredicateReg *d = vd; 8598 8599 /* 8600 * Convert from element count to byte count and adjust 8601 * for the portion of the 4*VL counter to be extracted. 8602 */ 8603 int b_count = (p.count << v_esz) - vl * part; 8604 8605 memset(d, 0, sizeof(*d)); 8606 if (p.invert) { 8607 if (b_count <= 0) { 8608 do_whilel(vd, mask, vl, vl); 8609 } else if (b_count < vl) { 8610 do_whileg(vd, mask, vl - b_count, vl); 8611 } 8612 } else if (b_count > 0) { 8613 do_whilel(vd, mask, MIN(b_count, vl), vl); 8614 } 8615 } 8616