/*
 * ARM SVE Operations
 *
 * Copyright (c) 2018 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "internals.h"
#include "exec/page-protection.h"
#include "exec/helper-proto.h"
#include "exec/target_page.h"
#include "exec/tlb-flags.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "tcg/tcg.h"
#include "vec_internal.h"
#include "sve_ldst_internal.h"
#include "accel/tcg/cpu-ldst.h"
#include "accel/tcg/helper-retaddr.h"
#include "accel/tcg/cpu-ops.h"
#include "accel/tcg/probe.h"
#ifdef CONFIG_USER_ONLY
#include "user/page-protection.h"
#endif


/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C.  */
#define PREDTEST_INIT  1

/* This is an iterative function, called for each Pd and Pg word
 * moving forward.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G).  Replace previous.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}

/* This is an iterative function, called for each Pd and Pg word
 * moving backward.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from first (i.e. last) !(D & G).
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from last (i.e. first) D & G.  Replace previous.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}

/* The same for a single word predicate.  */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}

/* The same for a multi-word predicate.  */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}

/* Similarly for single word elements.
*/ 116 static inline uint64_t expand_pred_s(uint8_t byte) 117 { 118 static const uint64_t word[] = { 119 [0x01] = 0x00000000ffffffffull, 120 [0x10] = 0xffffffff00000000ull, 121 [0x11] = 0xffffffffffffffffull, 122 }; 123 return word[byte & 0x11]; 124 } 125 126 static inline uint64_t expand_pred_d(uint8_t byte) 127 { 128 return -(uint64_t)(byte & 1); 129 } 130 131 #define LOGICAL_PPPP(NAME, FUNC) \ 132 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 133 { \ 134 uintptr_t opr_sz = simd_oprsz(desc); \ 135 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \ 136 uintptr_t i; \ 137 for (i = 0; i < opr_sz / 8; ++i) { \ 138 d[i] = FUNC(n[i], m[i], g[i]); \ 139 } \ 140 } 141 142 #define DO_AND(N, M, G) (((N) & (M)) & (G)) 143 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G)) 144 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G)) 145 #define DO_ORR(N, M, G) (((N) | (M)) & (G)) 146 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G)) 147 #define DO_NOR(N, M, G) (~((N) | (M)) & (G)) 148 #define DO_NAND(N, M, G) (~((N) & (M)) & (G)) 149 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G))) 150 151 LOGICAL_PPPP(sve_and_pppp, DO_AND) 152 LOGICAL_PPPP(sve_bic_pppp, DO_BIC) 153 LOGICAL_PPPP(sve_eor_pppp, DO_EOR) 154 LOGICAL_PPPP(sve_sel_pppp, DO_SEL) 155 LOGICAL_PPPP(sve_orr_pppp, DO_ORR) 156 LOGICAL_PPPP(sve_orn_pppp, DO_ORN) 157 LOGICAL_PPPP(sve_nor_pppp, DO_NOR) 158 LOGICAL_PPPP(sve_nand_pppp, DO_NAND) 159 160 #undef DO_AND 161 #undef DO_BIC 162 #undef DO_EOR 163 #undef DO_ORR 164 #undef DO_ORN 165 #undef DO_NOR 166 #undef DO_NAND 167 #undef DO_SEL 168 #undef LOGICAL_PPPP 169 170 /* Fully general three-operand expander, controlled by a predicate. 171 * This is complicated by the host-endian storage of the register file. 172 */ 173 /* ??? I don't expect the compiler could ever vectorize this itself. 174 * With some tables we can convert bit masks to byte masks, and with 175 * extra care wrt byte/word ordering we could use gcc generic vectors 176 * and do 16 bytes at a time. 177 */ 178 #define DO_ZPZZ(NAME, TYPE, H, OP) \ 179 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 180 { \ 181 intptr_t i, opr_sz = simd_oprsz(desc); \ 182 for (i = 0; i < opr_sz; ) { \ 183 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 184 do { \ 185 if (pg & 1) { \ 186 TYPE nn = *(TYPE *)(vn + H(i)); \ 187 TYPE mm = *(TYPE *)(vm + H(i)); \ 188 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 189 } \ 190 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 191 } while (i & 15); \ 192 } \ 193 } 194 195 /* Similarly, specialized for 64-bit operands. */ 196 #define DO_ZPZZ_D(NAME, TYPE, OP) \ 197 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 198 { \ 199 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 200 TYPE *d = vd, *n = vn, *m = vm; \ 201 uint8_t *pg = vg; \ 202 for (i = 0; i < opr_sz; i += 1) { \ 203 if (pg[H1(i)] & 1) { \ 204 TYPE nn = n[i], mm = m[i]; \ 205 d[i] = OP(nn, mm); \ 206 } \ 207 } \ 208 } 209 210 #define DO_AND(N, M) (N & M) 211 #define DO_EOR(N, M) (N ^ M) 212 #define DO_ORR(N, M) (N | M) 213 #define DO_BIC(N, M) (N & ~M) 214 #define DO_ORC(N, M) (N | ~M) 215 #define DO_ADD(N, M) (N + M) 216 #define DO_SUB(N, M) (N - M) 217 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) 218 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) 219 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N)) 220 #define DO_MUL(N, M) (N * M) 221 222 223 /* 224 * We must avoid the C undefined behaviour cases: division by 225 * zero and signed division of INT_MIN by -1. 
Both of these 226 * have architecturally defined required results for Arm. 227 * We special case all signed divisions by -1 to avoid having 228 * to deduce the minimum integer for the type involved. 229 */ 230 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M) 231 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M) 232 233 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND) 234 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND) 235 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND) 236 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND) 237 238 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR) 239 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR) 240 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR) 241 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR) 242 243 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR) 244 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR) 245 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR) 246 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR) 247 248 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC) 249 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC) 250 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC) 251 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC) 252 253 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) 254 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD) 255 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD) 256 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD) 257 258 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB) 259 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB) 260 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB) 261 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB) 262 263 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX) 264 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX) 265 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX) 266 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX) 267 268 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX) 269 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX) 270 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX) 271 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX) 272 273 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN) 274 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN) 275 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN) 276 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN) 277 278 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN) 279 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN) 280 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN) 281 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN) 282 283 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD) 284 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD) 285 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD) 286 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD) 287 288 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD) 289 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD) 290 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD) 291 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD) 292 293 /* Because the computation type is at least twice as large as required, 294 these work for both signed and unsigned source types. 
*/ 295 static inline uint8_t do_mulh_b(int32_t n, int32_t m) 296 { 297 return (n * m) >> 8; 298 } 299 300 static inline uint16_t do_mulh_h(int32_t n, int32_t m) 301 { 302 return (n * m) >> 16; 303 } 304 305 static inline uint32_t do_mulh_s(int64_t n, int64_t m) 306 { 307 return (n * m) >> 32; 308 } 309 310 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m) 311 { 312 uint64_t lo, hi; 313 muls64(&lo, &hi, n, m); 314 return hi; 315 } 316 317 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m) 318 { 319 uint64_t lo, hi; 320 mulu64(&lo, &hi, n, m); 321 return hi; 322 } 323 324 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL) 325 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL) 326 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL) 327 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL) 328 329 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b) 330 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h) 331 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s) 332 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d) 333 334 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b) 335 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h) 336 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s) 337 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d) 338 339 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV) 340 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV) 341 342 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV) 343 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV) 344 345 /* Note that all bits of the shift are significant 346 and not modulo the element size. */ 347 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1)) 348 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0) 349 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0) 350 351 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR) 352 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR) 353 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL) 354 355 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR) 356 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR) 357 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL) 358 359 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR) 360 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR) 361 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL) 362 363 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR) 364 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR) 365 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL) 366 367 static inline uint16_t do_sadalp_h(int16_t n, int16_t m) 368 { 369 int8_t n1 = n, n2 = n >> 8; 370 return m + n1 + n2; 371 } 372 373 static inline uint32_t do_sadalp_s(int32_t n, int32_t m) 374 { 375 int16_t n1 = n, n2 = n >> 16; 376 return m + n1 + n2; 377 } 378 379 static inline uint64_t do_sadalp_d(int64_t n, int64_t m) 380 { 381 int32_t n1 = n, n2 = n >> 32; 382 return m + n1 + n2; 383 } 384 385 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h) 386 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s) 387 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d) 388 389 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m) 390 { 391 uint8_t n1 = n, n2 = n >> 8; 392 return m + n1 + n2; 393 } 394 395 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m) 396 { 397 uint16_t n1 = n, n2 = n >> 16; 398 return m + n1 + n2; 399 } 400 401 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m) 402 { 403 uint32_t n1 = n, n2 = n >> 32; 404 return m + n1 + n2; 405 } 406 407 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h) 408 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s) 409 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d) 410 411 #define do_srshl_b(n, m) 
do_sqrshl_bhs(n, m, 8, true, NULL) 412 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL) 413 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL) 414 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL) 415 416 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b) 417 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h) 418 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s) 419 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d) 420 421 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL) 422 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL) 423 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL) 424 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL) 425 426 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b) 427 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h) 428 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s) 429 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d) 430 431 /* 432 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set. 433 * We pass in a pointer to a dummy saturation field to trigger 434 * the saturating arithmetic but discard the information about 435 * whether it has occurred. 436 */ 437 #define do_sqshl_b(n, m) \ 438 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); }) 439 #define do_sqshl_h(n, m) \ 440 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); }) 441 #define do_sqshl_s(n, m) \ 442 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); }) 443 #define do_sqshl_d(n, m) \ 444 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); }) 445 446 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b) 447 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h) 448 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s) 449 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d) 450 451 #define do_uqshl_b(n, m) \ 452 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 453 #define do_uqshl_h(n, m) \ 454 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 455 #define do_uqshl_s(n, m) \ 456 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); }) 457 #define do_uqshl_d(n, m) \ 458 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); }) 459 460 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b) 461 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h) 462 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s) 463 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d) 464 465 #define do_sqrshl_b(n, m) \ 466 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); }) 467 #define do_sqrshl_h(n, m) \ 468 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); }) 469 #define do_sqrshl_s(n, m) \ 470 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); }) 471 #define do_sqrshl_d(n, m) \ 472 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); }) 473 474 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b) 475 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h) 476 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s) 477 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d) 478 479 #undef do_sqrshl_d 480 481 #define do_uqrshl_b(n, m) \ 482 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); }) 483 #define do_uqrshl_h(n, m) \ 484 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); }) 485 #define do_uqrshl_s(n, m) \ 486 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); }) 487 #define do_uqrshl_d(n, m) \ 488 ({ uint32_t discard; do_uqrshl_d(n, 
m, true, &discard); }) 489 490 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b) 491 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h) 492 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s) 493 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d) 494 495 #undef do_uqrshl_d 496 497 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1) 498 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1)) 499 500 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS) 501 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS) 502 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS) 503 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D) 504 505 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS) 506 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS) 507 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS) 508 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D) 509 510 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1) 511 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1)) 512 513 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS) 514 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS) 515 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS) 516 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D) 517 518 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS) 519 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS) 520 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS) 521 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D) 522 523 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1) 524 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1)) 525 526 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS) 527 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS) 528 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS) 529 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D) 530 531 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS) 532 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS) 533 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS) 534 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D) 535 536 #define DO_SQADD_B(n, m) do_ssat_b((int64_t)n + m) 537 #define DO_SQADD_H(n, m) do_ssat_h((int64_t)n + m) 538 #define DO_SQADD_S(n, m) do_ssat_s((int64_t)n + m) 539 540 static inline int64_t do_sqadd_d(int64_t n, int64_t m) 541 { 542 int64_t r = n + m; 543 if (((r ^ n) & ~(n ^ m)) < 0) { 544 /* Signed overflow. */ 545 return r < 0 ? INT64_MAX : INT64_MIN; 546 } 547 return r; 548 } 549 550 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B) 551 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H) 552 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S) 553 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d) 554 555 #define DO_UQADD_B(n, m) do_usat_b((int64_t)n + m) 556 #define DO_UQADD_H(n, m) do_usat_h((int64_t)n + m) 557 #define DO_UQADD_S(n, m) do_usat_s((int64_t)n + m) 558 559 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m) 560 { 561 uint64_t r = n + m; 562 return r < n ? 
UINT64_MAX : r;
}

DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)

#define DO_SQSUB_B(n, m) do_ssat_b((int64_t)n - m)
#define DO_SQSUB_H(n, m) do_ssat_h((int64_t)n - m)
#define DO_SQSUB_S(n, m) do_ssat_s((int64_t)n - m)

static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = n - m;
    if (((r ^ n) & (n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}

DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)

#define DO_UQSUB_B(n, m) do_usat_b((int64_t)n - m)
#define DO_UQSUB_H(n, m) do_usat_h((int64_t)n - m)
#define DO_UQSUB_S(n, m) do_usat_s((int64_t)n - m)

static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    return n > m ? n - m : 0;
}

DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)

#define DO_SUQADD_B(n, m) do_ssat_b((int64_t)(int8_t)n + m)
#define DO_SUQADD_H(n, m) do_ssat_h((int64_t)(int16_t)n + m)
#define DO_SUQADD_S(n, m) do_ssat_s((int64_t)(int32_t)n + m)

static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow. */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative. */
            if (m > -n) {
                /* m > abs(n), so r is a very large positive. */
                return INT64_MAX;
            }
            /* Result is negative. */
        }
    } else {
        /* Both inputs are positive: check for overflow.  */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}

DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)

#define DO_USQADD_B(n, m) do_usat_b((int64_t)n + (int8_t)m)
#define DO_USQADD_H(n, m) do_usat_h((int64_t)n + (int16_t)m)
#define DO_USQADD_S(n, m) do_usat_s((int64_t)n + (int32_t)m)

static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        return n < -m ? 0 : r;
    }
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)

#undef DO_ZPZZ
#undef DO_ZPZZ_D

/*
 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements are from VN {I, I+1}.
 * If the slot I is odd, the elements are from VM {I-1, I}.
 * Load all of the input elements in each pair before overwriting output.
662 */ 663 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \ 664 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 665 { \ 666 intptr_t i, opr_sz = simd_oprsz(desc); \ 667 for (i = 0; i < opr_sz; ) { \ 668 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 669 do { \ 670 TYPE n0 = *(TYPE *)(vn + H(i)); \ 671 TYPE m0 = *(TYPE *)(vm + H(i)); \ 672 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 673 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 674 if (pg & 1) { \ 675 *(TYPE *)(vd + H(i)) = OP(n0, n1); \ 676 } \ 677 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 678 if (pg & 1) { \ 679 *(TYPE *)(vd + H(i)) = OP(m0, m1); \ 680 } \ 681 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 682 } while (i & 15); \ 683 } \ 684 } 685 686 /* Similarly, specialized for 64-bit operands. */ 687 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \ 688 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 689 { \ 690 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 691 TYPE *d = vd, *n = vn, *m = vm; \ 692 uint8_t *pg = vg; \ 693 for (i = 0; i < opr_sz; i += 2) { \ 694 TYPE n0 = n[i], n1 = n[i + 1]; \ 695 TYPE m0 = m[i], m1 = m[i + 1]; \ 696 if (pg[H1(i)] & 1) { \ 697 d[i] = OP(n0, n1); \ 698 } \ 699 if (pg[H1(i + 1)] & 1) { \ 700 d[i + 1] = OP(m0, m1); \ 701 } \ 702 } \ 703 } 704 705 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD) 706 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD) 707 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD) 708 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD) 709 710 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX) 711 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX) 712 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX) 713 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX) 714 715 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN) 716 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN) 717 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN) 718 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN) 719 720 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX) 721 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX) 722 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX) 723 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX) 724 725 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN) 726 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN) 727 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN) 728 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN) 729 730 #undef DO_ZPZZ_PAIR 731 #undef DO_ZPZZ_PAIR_D 732 733 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \ 734 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 735 float_status *status, uint32_t desc) \ 736 { \ 737 intptr_t i, opr_sz = simd_oprsz(desc); \ 738 for (i = 0; i < opr_sz; ) { \ 739 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 740 do { \ 741 TYPE n0 = *(TYPE *)(vn + H(i)); \ 742 TYPE m0 = *(TYPE *)(vm + H(i)); \ 743 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 744 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 745 if (pg & 1) { \ 746 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \ 747 } \ 748 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 749 if (pg & 1) { \ 750 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \ 751 } \ 752 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 753 } while (i & 15); \ 754 } \ 755 } 756 757 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add) 758 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add) 759 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add) 760 761 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, 
H1_2, float16_maxnum) 762 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum) 763 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum) 764 765 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum) 766 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum) 767 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum) 768 769 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max) 770 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max) 771 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max) 772 773 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min) 774 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min) 775 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min) 776 777 #undef DO_ZPZZ_PAIR_FP 778 779 /* Three-operand expander, controlled by a predicate, in which the 780 * third operand is "wide". That is, for D = N op M, the same 64-bit 781 * value of M is used with all of the narrower values of N. 782 */ 783 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \ 784 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 785 { \ 786 intptr_t i, opr_sz = simd_oprsz(desc); \ 787 for (i = 0; i < opr_sz; ) { \ 788 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \ 789 TYPEW mm = *(TYPEW *)(vm + i); \ 790 do { \ 791 if (pg & 1) { \ 792 TYPE nn = *(TYPE *)(vn + H(i)); \ 793 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 794 } \ 795 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 796 } while (i & 7); \ 797 } \ 798 } 799 800 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR) 801 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR) 802 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL) 803 804 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR) 805 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 806 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 807 808 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR) 809 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 810 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 811 812 #undef DO_ZPZW 813 814 /* Fully general two-operand expander, controlled by a predicate. 815 */ 816 #define DO_ZPZ(NAME, TYPE, H, OP) \ 817 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 818 { \ 819 intptr_t i, opr_sz = simd_oprsz(desc); \ 820 for (i = 0; i < opr_sz; ) { \ 821 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 822 do { \ 823 if (pg & 1) { \ 824 TYPE nn = *(TYPE *)(vn + H(i)); \ 825 *(TYPE *)(vd + H(i)) = OP(nn); \ 826 } \ 827 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 828 } while (i & 15); \ 829 } \ 830 } 831 832 /* Similarly, specialized for 64-bit operands. 
*/ 833 #define DO_ZPZ_D(NAME, TYPE, OP) \ 834 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 835 { \ 836 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 837 TYPE *d = vd, *n = vn; \ 838 uint8_t *pg = vg; \ 839 for (i = 0; i < opr_sz; i += 1) { \ 840 if (pg[H1(i)] & 1) { \ 841 TYPE nn = n[i]; \ 842 d[i] = OP(nn); \ 843 } \ 844 } \ 845 } 846 847 #define DO_CLS_B(N) (clrsb32(N) - 24) 848 #define DO_CLS_H(N) (clrsb32(N) - 16) 849 850 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B) 851 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H) 852 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32) 853 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64) 854 855 #define DO_CLZ_B(N) (clz32(N) - 24) 856 #define DO_CLZ_H(N) (clz32(N) - 16) 857 858 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B) 859 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H) 860 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32) 861 DO_ZPZ_D(sve_clz_d, uint64_t, clz64) 862 863 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8) 864 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16) 865 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32) 866 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64) 867 868 #define DO_CNOT(N) (N == 0) 869 870 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT) 871 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT) 872 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT) 873 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT) 874 875 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1)) 876 877 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS) 878 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS) 879 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS) 880 881 #define DO_AH_FABS_H(N) (float16_is_any_nan(N) ? (N) : DO_FABS(N)) 882 #define DO_AH_FABS_S(N) (float32_is_any_nan(N) ? (N) : DO_FABS(N)) 883 #define DO_AH_FABS_D(N) (float64_is_any_nan(N) ? (N) : DO_FABS(N)) 884 885 DO_ZPZ(sve_ah_fabs_h, uint16_t, H1_2, DO_AH_FABS_H) 886 DO_ZPZ(sve_ah_fabs_s, uint32_t, H1_4, DO_AH_FABS_S) 887 DO_ZPZ_D(sve_ah_fabs_d, uint64_t, DO_AH_FABS_D) 888 889 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1)) 890 891 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG) 892 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG) 893 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG) 894 895 #define DO_AH_FNEG_H(N) (float16_is_any_nan(N) ? (N) : DO_FNEG(N)) 896 #define DO_AH_FNEG_S(N) (float32_is_any_nan(N) ? (N) : DO_FNEG(N)) 897 #define DO_AH_FNEG_D(N) (float64_is_any_nan(N) ? (N) : DO_FNEG(N)) 898 899 DO_ZPZ(sve_ah_fneg_h, uint16_t, H1_2, DO_AH_FNEG_H) 900 DO_ZPZ(sve_ah_fneg_s, uint32_t, H1_4, DO_AH_FNEG_S) 901 DO_ZPZ_D(sve_ah_fneg_d, uint64_t, DO_AH_FNEG_D) 902 903 #define DO_NOT(N) (~N) 904 905 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT) 906 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT) 907 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT) 908 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT) 909 910 #define DO_SXTB(N) ((int8_t)N) 911 #define DO_SXTH(N) ((int16_t)N) 912 #define DO_SXTS(N) ((int32_t)N) 913 #define DO_UXTB(N) ((uint8_t)N) 914 #define DO_UXTH(N) ((uint16_t)N) 915 #define DO_UXTS(N) ((uint32_t)N) 916 917 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB) 918 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB) 919 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH) 920 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB) 921 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH) 922 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS) 923 924 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB) 925 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB) 926 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH) 927 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB) 928 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH) 929 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS) 930 931 #define DO_ABS(N) (N < 0 ? 
-N : N) 932 933 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS) 934 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS) 935 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS) 936 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS) 937 938 #define DO_NEG(N) (-N) 939 940 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG) 941 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG) 942 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG) 943 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG) 944 945 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16) 946 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32) 947 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64) 948 949 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32) 950 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64) 951 952 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64) 953 954 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc) 955 { 956 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 957 uint64_t *d = vd, *n = vn; 958 uint8_t *pg = vg; 959 960 for (i = 0; i < opr_sz; i += 2) { 961 if (pg[H1(i)] & 1) { 962 uint64_t n0 = n[i + 0]; 963 uint64_t n1 = n[i + 1]; 964 d[i + 0] = n1; 965 d[i + 1] = n0; 966 } 967 } 968 } 969 970 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8) 971 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16) 972 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32) 973 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64) 974 975 #define DO_SQABS(X) \ 976 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 977 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; }) 978 979 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS) 980 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS) 981 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS) 982 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS) 983 984 #define DO_SQNEG(X) \ 985 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 986 x_ == min_ ? -min_ - 1 : -x_; }) 987 988 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG) 989 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG) 990 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG) 991 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG) 992 993 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32) 994 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32) 995 996 /* Three-operand expander, unpredicated, in which the third operand is "wide". 997 */ 998 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \ 999 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1000 { \ 1001 intptr_t i, opr_sz = simd_oprsz(desc); \ 1002 for (i = 0; i < opr_sz; ) { \ 1003 TYPEW mm = *(TYPEW *)(vm + i); \ 1004 do { \ 1005 TYPE nn = *(TYPE *)(vn + H(i)); \ 1006 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 1007 i += sizeof(TYPE); \ 1008 } while (i & 7); \ 1009 } \ 1010 } 1011 1012 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR) 1013 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR) 1014 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL) 1015 1016 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR) 1017 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 1018 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 1019 1020 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR) 1021 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 1022 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 1023 1024 #undef DO_ZZW 1025 1026 #undef DO_CLS_B 1027 #undef DO_CLS_H 1028 #undef DO_CLZ_B 1029 #undef DO_CLZ_H 1030 #undef DO_CNOT 1031 #undef DO_FABS 1032 #undef DO_FNEG 1033 #undef DO_ABS 1034 #undef DO_NEG 1035 #undef DO_ZPZ 1036 #undef DO_ZPZ_D 1037 1038 /* 1039 * Three-operand expander, unpredicated, in which the two inputs are 1040 * selected from the top or bottom half of the wide column. 
 */
#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
    } \
}

DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

/* Note that the multiply cannot overflow, but the doubling can.
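 * E.g. for sve2_sqdmull_zzz_h the narrow inputs are int8_t, so the widened
 * product is at most 0x4000 (from -128 * -128); doubling that gives 0x8000,
 * which DO_SQADD_H saturates to INT16_MAX.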
*/ 1088 static inline int16_t do_sqdmull_h(int16_t n, int16_t m) 1089 { 1090 int16_t val = n * m; 1091 return DO_SQADD_H(val, val); 1092 } 1093 1094 static inline int32_t do_sqdmull_s(int32_t n, int32_t m) 1095 { 1096 int32_t val = n * m; 1097 return DO_SQADD_S(val, val); 1098 } 1099 1100 static inline int64_t do_sqdmull_d(int64_t n, int64_t m) 1101 { 1102 int64_t val = n * m; 1103 return do_sqadd_d(val, val); 1104 } 1105 1106 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h) 1107 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1108 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1109 1110 #undef DO_ZZZ_TB 1111 1112 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1113 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1114 { \ 1115 intptr_t i, opr_sz = simd_oprsz(desc); \ 1116 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1117 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1118 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 1119 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1120 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1121 } \ 1122 } 1123 1124 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1125 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1126 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1127 1128 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1129 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1130 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1131 1132 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1133 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1134 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1135 1136 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1137 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1138 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1139 1140 #undef DO_ZZZ_WTB 1141 1142 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \ 1143 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1144 { \ 1145 intptr_t i, opr_sz = simd_oprsz(desc); \ 1146 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \ 1147 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \ 1148 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1149 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \ 1150 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \ 1151 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \ 1152 } \ 1153 } 1154 1155 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR) 1156 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR) 1157 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR) 1158 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR) 1159 1160 #undef DO_ZZZ_NTB 1161 1162 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1163 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1164 { \ 1165 intptr_t i, opr_sz = simd_oprsz(desc); \ 1166 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \ 1167 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1168 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1169 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \ 1170 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1171 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \ 1172 } \ 1173 } 1174 1175 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1176 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1177 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1178 1179 
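
/*
 * Illustrative sketch only: the function below is not used by any helper
 * and its name is made up.  It shows what one element of the sve2_sabal_h
 * expansion computes when the "bottom" inputs are selected (sel1 == 0):
 * the even int8_t of each 16-bit column is widened, the absolute
 * difference taken, and the result accumulated into the 16-bit element.
 */
static inline int16_t example_sabal_h_bottom_elt(int8_t n_even, int8_t m_even,
                                                 int16_t acc)
{
    int16_t nn = n_even;         /* TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)) */
    int16_t mm = m_even;         /* TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)) */
    return DO_ABD(nn, mm) + acc; /* OP(nn, mm) + aa */
}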
DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 1180 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1181 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1182 1183 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1184 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1185 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1186 1187 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1188 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1189 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1190 1191 #define DO_NMUL(N, M) -(N * M) 1192 1193 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL) 1194 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL) 1195 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL) 1196 1197 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL) 1198 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL) 1199 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL) 1200 1201 #undef DO_ZZZW_ACC 1202 1203 #define DO_XTNB(NAME, TYPE, OP) \ 1204 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1205 { \ 1206 intptr_t i, opr_sz = simd_oprsz(desc); \ 1207 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1208 TYPE nn = *(TYPE *)(vn + i); \ 1209 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \ 1210 *(TYPE *)(vd + i) = nn; \ 1211 } \ 1212 } 1213 1214 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \ 1215 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1216 { \ 1217 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \ 1218 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1219 TYPE nn = *(TYPE *)(vn + i); \ 1220 *(TYPEN *)(vd + i + odd) = OP(nn); \ 1221 } \ 1222 } 1223 1224 DO_XTNB(sve2_sqxtnb_h, int16_t, do_ssat_b) 1225 DO_XTNB(sve2_sqxtnb_s, int32_t, do_ssat_h) 1226 DO_XTNB(sve2_sqxtnb_d, int64_t, do_ssat_s) 1227 1228 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, do_ssat_b) 1229 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, do_ssat_h) 1230 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, do_ssat_s) 1231 1232 DO_XTNB(sve2_uqxtnb_h, uint16_t, do_usat_b) 1233 DO_XTNB(sve2_uqxtnb_s, uint32_t, do_usat_h) 1234 DO_XTNB(sve2_uqxtnb_d, uint64_t, do_usat_s) 1235 1236 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, do_usat_b) 1237 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, do_usat_h) 1238 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, do_usat_s) 1239 1240 DO_XTNB(sve2_sqxtunb_h, int16_t, do_usat_b) 1241 DO_XTNB(sve2_sqxtunb_s, int32_t, do_usat_h) 1242 DO_XTNB(sve2_sqxtunb_d, int64_t, do_usat_s) 1243 1244 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, do_usat_b) 1245 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, do_usat_h) 1246 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, do_usat_s) 1247 1248 #undef DO_XTNB 1249 #undef DO_XTNT 1250 1251 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1252 { 1253 intptr_t i, opr_sz = simd_oprsz(desc); 1254 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1)); 1255 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1256 uint32_t *a = va, *n = vn; 1257 uint64_t *d = vd, *m = vm; 1258 1259 for (i = 0; i < opr_sz / 8; ++i) { 1260 uint32_t e1 = a[2 * i + H4(0)]; 1261 uint32_t e2 = n[2 * i + sel] ^ inv; 1262 uint64_t c = extract64(m[i], 32, 1); 1263 /* Compute and store the entire 33-bit result at once. 
*/ 1264 d[i] = c + e1 + e2; 1265 } 1266 } 1267 1268 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1269 { 1270 intptr_t i, opr_sz = simd_oprsz(desc); 1271 int sel = extract32(desc, SIMD_DATA_SHIFT, 1); 1272 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1273 uint64_t *d = vd, *a = va, *n = vn, *m = vm; 1274 1275 for (i = 0; i < opr_sz / 8; i += 2) { 1276 Int128 e1 = int128_make64(a[i]); 1277 Int128 e2 = int128_make64(n[i + sel] ^ inv); 1278 Int128 c = int128_make64(m[i + 1] & 1); 1279 Int128 r = int128_add(int128_add(e1, e2), c); 1280 d[i + 0] = int128_getlo(r); 1281 d[i + 1] = int128_gethi(r); 1282 } 1283 } 1284 1285 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \ 1286 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1287 { \ 1288 intptr_t i, opr_sz = simd_oprsz(desc); \ 1289 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1290 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1291 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1292 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1293 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1294 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1295 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \ 1296 } \ 1297 } 1298 1299 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1, 1300 do_sqdmull_h, DO_SQADD_H) 1301 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1302 do_sqdmull_s, DO_SQADD_S) 1303 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1304 do_sqdmull_d, do_sqadd_d) 1305 1306 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1, 1307 do_sqdmull_h, DO_SQSUB_H) 1308 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1309 do_sqdmull_s, DO_SQSUB_S) 1310 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1311 do_sqdmull_d, do_sqsub_d) 1312 1313 #undef DO_SQDMLAL 1314 1315 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \ 1316 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1317 { \ 1318 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1319 int rot = simd_data(desc); \ 1320 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1321 bool sub_r = rot == 1 || rot == 2; \ 1322 bool sub_i = rot >= 2; \ 1323 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1324 for (i = 0; i < opr_sz; i += 2) { \ 1325 TYPE elt1_a = n[H(i + sel_a)]; \ 1326 TYPE elt2_a = m[H(i + sel_a)]; \ 1327 TYPE elt2_b = m[H(i + sel_b)]; \ 1328 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \ 1329 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \ 1330 } \ 1331 } 1332 1333 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? 
-1 : 1)) 1334 1335 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA) 1336 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA) 1337 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA) 1338 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA) 1339 1340 #define DO_SQRDMLAH_B(N, M, A, S) \ 1341 do_sqrdmlah_b(N, M, A, S, true) 1342 #define DO_SQRDMLAH_H(N, M, A, S) \ 1343 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); }) 1344 #define DO_SQRDMLAH_S(N, M, A, S) \ 1345 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); }) 1346 #define DO_SQRDMLAH_D(N, M, A, S) \ 1347 do_sqrdmlah_d(N, M, A, S, true) 1348 1349 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B) 1350 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H) 1351 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S) 1352 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D) 1353 1354 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \ 1355 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1356 { \ 1357 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1358 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \ 1359 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \ 1360 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1361 bool sub_r = rot == 1 || rot == 2; \ 1362 bool sub_i = rot >= 2; \ 1363 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1364 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \ 1365 TYPE elt2_a = m[H(i + idx + sel_a)]; \ 1366 TYPE elt2_b = m[H(i + idx + sel_b)]; \ 1367 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \ 1368 TYPE elt1_a = n[H(i + j + sel_a)]; \ 1369 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \ 1370 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \ 1371 } \ 1372 } \ 1373 } 1374 1375 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA) 1376 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA) 1377 1378 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1379 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1380 1381 #undef DO_CMLA 1382 #undef DO_CMLA_FUNC 1383 #undef DO_CMLA_IDX_FUNC 1384 #undef DO_SQRDMLAH_B 1385 #undef DO_SQRDMLAH_H 1386 #undef DO_SQRDMLAH_S 1387 #undef DO_SQRDMLAH_D 1388 1389 /* Note N and M are 4 elements bundled into one unit. */ 1390 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a, 1391 int sel_a, int sel_b, int sub_i) 1392 { 1393 for (int i = 0; i <= 1; i++) { 1394 int32_t elt1_r = (int8_t)(n >> (16 * i)); 1395 int32_t elt1_i = (int8_t)(n >> (16 * i + 8)); 1396 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a)); 1397 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b)); 1398 1399 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1400 } 1401 return a; 1402 } 1403 1404 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a, 1405 int sel_a, int sel_b, int sub_i) 1406 { 1407 for (int i = 0; i <= 1; i++) { 1408 int64_t elt1_r = (int16_t)(n >> (32 * i + 0)); 1409 int64_t elt1_i = (int16_t)(n >> (32 * i + 16)); 1410 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a)); 1411 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b)); 1412 1413 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1414 } 1415 return a; 1416 } 1417 1418 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm, 1419 void *va, uint32_t desc) 1420 { 1421 int opr_sz = simd_oprsz(desc); 1422 int rot = simd_data(desc); 1423 int sel_a = rot & 1; 1424 int sel_b = sel_a ^ 1; 1425 int sub_i = (rot == 0 || rot == 3 ? 
-1 : 1); 1426 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1427 1428 for (int e = 0; e < opr_sz / 4; e++) { 1429 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1430 } 1431 } 1432 1433 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm, 1434 void *va, uint32_t desc) 1435 { 1436 int opr_sz = simd_oprsz(desc); 1437 int rot = simd_data(desc); 1438 int sel_a = rot & 1; 1439 int sel_b = sel_a ^ 1; 1440 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1441 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1442 1443 for (int e = 0; e < opr_sz / 8; e++) { 1444 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1445 } 1446 } 1447 1448 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm, 1449 void *va, uint32_t desc) 1450 { 1451 int opr_sz = simd_oprsz(desc); 1452 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1453 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2)); 1454 int sel_a = rot & 1; 1455 int sel_b = sel_a ^ 1; 1456 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1457 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1458 1459 for (int seg = 0; seg < opr_sz / 4; seg += 4) { 1460 uint32_t seg_m = m[seg + idx]; 1461 for (int e = 0; e < 4; e++) { 1462 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e], 1463 sel_a, sel_b, sub_i); 1464 } 1465 } 1466 } 1467 1468 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm, 1469 void *va, uint32_t desc) 1470 { 1471 int seg, opr_sz = simd_oprsz(desc); 1472 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1473 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 1474 int sel_a = rot & 1; 1475 int sel_b = sel_a ^ 1; 1476 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1477 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1478 1479 for (seg = 0; seg < opr_sz / 8; seg += 2) { 1480 uint64_t seg_m = m[seg + idx]; 1481 for (int e = 0; e < 2; e++) { 1482 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e], 1483 sel_a, sel_b, sub_i); 1484 } 1485 } 1486 } 1487 1488 #define DO_ZZXZ(NAME, TYPE, H, OP) \ 1489 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1490 { \ 1491 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \ 1492 intptr_t i, j, idx = simd_data(desc); \ 1493 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \ 1494 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1495 TYPE mm = m[i]; \ 1496 for (j = 0; j < segment; j++) { \ 1497 d[i + j] = OP(n[i + j], mm, a[i + j]); \ 1498 } \ 1499 } \ 1500 } 1501 1502 #define DO_SQRDMLAH_H(N, M, A) \ 1503 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); }) 1504 #define DO_SQRDMLAH_S(N, M, A) \ 1505 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); }) 1506 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true) 1507 1508 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1509 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1510 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D) 1511 1512 #define DO_SQRDMLSH_H(N, M, A) \ 1513 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); }) 1514 #define DO_SQRDMLSH_S(N, M, A) \ 1515 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); }) 1516 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true) 1517 1518 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H) 1519 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S) 1520 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D) 1521 1522 #undef DO_ZZXZ 1523 1524 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1525 void HELPER(NAME)(void *vd, void *vn, 
void *vm, void *va, uint32_t desc) \ 1526 { \ 1527 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1528 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1529 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1530 for (i = 0; i < oprsz; i += 16) { \ 1531 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1532 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1533 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1534 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \ 1535 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \ 1536 } \ 1537 } \ 1538 } 1539 1540 #define DO_MLA(N, M, A) (A + N * M) 1541 1542 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA) 1543 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA) 1544 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA) 1545 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA) 1546 1547 #define DO_MLS(N, M, A) (A - N * M) 1548 1549 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS) 1550 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS) 1551 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS) 1552 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS) 1553 1554 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M)) 1555 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M)) 1556 1557 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S) 1558 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D) 1559 1560 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M)) 1561 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M)) 1562 1563 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S) 1564 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D) 1565 1566 #undef DO_MLA 1567 #undef DO_MLS 1568 #undef DO_ZZXW 1569 1570 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1571 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1572 { \ 1573 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1574 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1575 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1576 for (i = 0; i < oprsz; i += 16) { \ 1577 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1578 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1579 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1580 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \ 1581 } \ 1582 } \ 1583 } 1584 1585 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1586 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1587 1588 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1589 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1590 1591 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1592 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1593 1594 #undef DO_ZZX 1595 1596 #define DO_BITPERM(NAME, TYPE, OP) \ 1597 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1598 { \ 1599 intptr_t i, opr_sz = simd_oprsz(desc); \ 1600 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1601 TYPE nn = *(TYPE *)(vn + i); \ 1602 TYPE mm = *(TYPE *)(vm + i); \ 1603 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \ 1604 } \ 1605 } 1606 1607 static uint64_t bitextract(uint64_t data, uint64_t mask, int n) 1608 { 1609 uint64_t res = 0; 1610 int db, rb = 0; 1611 1612 for (db = 0; db < n; ++db) { 1613 if ((mask >> db) & 1) { 1614 res |= ((data >> db) & 1) 
<< rb; 1615 ++rb; 1616 } 1617 } 1618 return res; 1619 } 1620 1621 DO_BITPERM(sve2_bext_b, uint8_t, bitextract) 1622 DO_BITPERM(sve2_bext_h, uint16_t, bitextract) 1623 DO_BITPERM(sve2_bext_s, uint32_t, bitextract) 1624 DO_BITPERM(sve2_bext_d, uint64_t, bitextract) 1625 1626 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n) 1627 { 1628 uint64_t res = 0; 1629 int rb, db = 0; 1630 1631 for (rb = 0; rb < n; ++rb) { 1632 if ((mask >> rb) & 1) { 1633 res |= ((data >> db) & 1) << rb; 1634 ++db; 1635 } 1636 } 1637 return res; 1638 } 1639 1640 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit) 1641 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit) 1642 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit) 1643 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit) 1644 1645 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n) 1646 { 1647 uint64_t resm = 0, resu = 0; 1648 int db, rbm = 0, rbu = 0; 1649 1650 for (db = 0; db < n; ++db) { 1651 uint64_t val = (data >> db) & 1; 1652 if ((mask >> db) & 1) { 1653 resm |= val << rbm++; 1654 } else { 1655 resu |= val << rbu++; 1656 } 1657 } 1658 1659 return resm | (resu << rbm); 1660 } 1661 1662 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup) 1663 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup) 1664 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup) 1665 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup) 1666 1667 #undef DO_BITPERM 1668 1669 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \ 1670 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1671 { \ 1672 intptr_t i, opr_sz = simd_oprsz(desc); \ 1673 int sub_r = simd_data(desc); \ 1674 if (sub_r) { \ 1675 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1676 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1677 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1678 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1679 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1680 acc_r = ADD_OP(acc_r, el2_i); \ 1681 acc_i = SUB_OP(acc_i, el2_r); \ 1682 *(TYPE *)(vd + H(i)) = acc_r; \ 1683 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1684 } \ 1685 } else { \ 1686 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1687 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1688 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1689 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1690 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1691 acc_r = SUB_OP(acc_r, el2_i); \ 1692 acc_i = ADD_OP(acc_i, el2_r); \ 1693 *(TYPE *)(vd + H(i)) = acc_r; \ 1694 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1695 } \ 1696 } \ 1697 } 1698 1699 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB) 1700 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB) 1701 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB) 1702 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB) 1703 1704 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B) 1705 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H) 1706 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S) 1707 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d) 1708 1709 #undef DO_CADD 1710 1711 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \ 1712 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1713 { \ 1714 intptr_t i, opr_sz = simd_oprsz(desc); \ 1715 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \ 1716 int shift = simd_data(desc) >> 1; \ 1717 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1718 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \ 1719 *(TYPEW *)(vd + HW(i)) = nn << shift; \ 1720 } \ 1721 } 1722 1723 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1) 1724 DO_ZZI_SHLL(sve2_sshll_s, int32_t, 
int16_t, H1_4, H1_2) 1725 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4) 1726 1727 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1) 1728 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2) 1729 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4) 1730 1731 #undef DO_ZZI_SHLL 1732 1733 /* Two-operand reduction expander, controlled by a predicate. 1734 * The difference between TYPERED and TYPERET has to do with 1735 * sign-extension. E.g. for SMAX, TYPERED must be signed, 1736 * but TYPERET must be unsigned so that e.g. a 32-bit value 1737 * is not sign-extended to the ABI uint64_t return type. 1738 */ 1739 /* ??? If we were to vectorize this by hand the reduction ordering 1740 * would change. For integer operands, this is perfectly fine. 1741 */ 1742 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \ 1743 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1744 { \ 1745 intptr_t i, opr_sz = simd_oprsz(desc); \ 1746 TYPERED ret = INIT; \ 1747 for (i = 0; i < opr_sz; ) { \ 1748 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 1749 do { \ 1750 if (pg & 1) { \ 1751 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \ 1752 ret = OP(ret, nn); \ 1753 } \ 1754 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \ 1755 } while (i & 15); \ 1756 } \ 1757 return (TYPERET)ret; \ 1758 } 1759 1760 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \ 1761 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1762 { \ 1763 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 1764 TYPEE *n = vn; \ 1765 uint8_t *pg = vg; \ 1766 TYPER ret = INIT; \ 1767 for (i = 0; i < opr_sz; i += 1) { \ 1768 if (pg[H1(i)] & 1) { \ 1769 TYPEE nn = n[i]; \ 1770 ret = OP(ret, nn); \ 1771 } \ 1772 } \ 1773 return ret; \ 1774 } 1775 1776 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR) 1777 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR) 1778 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR) 1779 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR) 1780 1781 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR) 1782 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR) 1783 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR) 1784 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR) 1785 1786 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND) 1787 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND) 1788 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND) 1789 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND) 1790 1791 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1792 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1793 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1794 1795 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1796 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1797 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1798 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD) 1799 1800 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX) 1801 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX) 1802 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX) 1803 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX) 1804 1805 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX) 1806 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX) 1807 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX) 
1808 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX) 1809 1810 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN) 1811 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN) 1812 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN) 1813 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN) 1814 1815 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN) 1816 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN) 1817 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN) 1818 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) 1819 1820 #undef DO_VPZ 1821 #undef DO_VPZ_D 1822 1823 #define DO_VPQ(NAME, TYPE, H, INIT, OP) \ 1824 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 1825 { \ 1826 TYPE tmp[16 / sizeof(TYPE)] = { [0 ... 16 / sizeof(TYPE) - 1] = INIT }; \ 1827 TYPE *n = vn; uint16_t *g = vg; \ 1828 uintptr_t oprsz = simd_oprsz(desc); \ 1829 uintptr_t nseg = oprsz / 16, nsegelt = 16 / sizeof(TYPE); \ 1830 for (uintptr_t s = 0; s < nseg; s++) { \ 1831 uint16_t pg = g[H2(s)]; \ 1832 for (uintptr_t e = 0; e < nsegelt; e++, pg >>= sizeof(TYPE)) { \ 1833 if (pg & 1) { \ 1834 tmp[e] = OP(tmp[H(e)], n[s * nsegelt + H(e)]); \ 1835 } \ 1836 } \ 1837 } \ 1838 memcpy(vd, tmp, 16); \ 1839 clear_tail(vd, 16, simd_maxsz(desc)); \ 1840 } 1841 1842 DO_VPQ(sve2p1_addqv_b, uint8_t, H1, 0, DO_ADD) 1843 DO_VPQ(sve2p1_addqv_h, uint16_t, H2, 0, DO_ADD) 1844 DO_VPQ(sve2p1_addqv_s, uint32_t, H4, 0, DO_ADD) 1845 DO_VPQ(sve2p1_addqv_d, uint64_t, H8, 0, DO_ADD) 1846 1847 DO_VPQ(sve2p1_smaxqv_b, int8_t, H1, INT8_MIN, DO_MAX) 1848 DO_VPQ(sve2p1_smaxqv_h, int16_t, H2, INT16_MIN, DO_MAX) 1849 DO_VPQ(sve2p1_smaxqv_s, int32_t, H4, INT32_MIN, DO_MAX) 1850 DO_VPQ(sve2p1_smaxqv_d, int64_t, H8, INT64_MIN, DO_MAX) 1851 1852 DO_VPQ(sve2p1_sminqv_b, int8_t, H1, INT8_MAX, DO_MIN) 1853 DO_VPQ(sve2p1_sminqv_h, int16_t, H2, INT16_MAX, DO_MIN) 1854 DO_VPQ(sve2p1_sminqv_s, int32_t, H4, INT32_MAX, DO_MIN) 1855 DO_VPQ(sve2p1_sminqv_d, int64_t, H8, INT64_MAX, DO_MIN) 1856 1857 DO_VPQ(sve2p1_umaxqv_b, uint8_t, H1, 0, DO_MAX) 1858 DO_VPQ(sve2p1_umaxqv_h, uint16_t, H2, 0, DO_MAX) 1859 DO_VPQ(sve2p1_umaxqv_s, uint32_t, H4, 0, DO_MAX) 1860 DO_VPQ(sve2p1_umaxqv_d, uint64_t, H8, 0, DO_MAX) 1861 1862 DO_VPQ(sve2p1_uminqv_b, uint8_t, H1, -1, DO_MIN) 1863 DO_VPQ(sve2p1_uminqv_h, uint16_t, H2, -1, DO_MIN) 1864 DO_VPQ(sve2p1_uminqv_s, uint32_t, H4, -1, DO_MIN) 1865 DO_VPQ(sve2p1_uminqv_d, uint64_t, H8, -1, DO_MIN) 1866 1867 #undef DO_VPQ 1868 1869 /* Two vector operand, one scalar operand, unpredicated. 
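 *
 * As a rough illustration (not the literal preprocessor output), the
 * DO_ZZI expansion for sve_subri_b below is approximately:
 *
 *   void helper_sve_subri_b(void *vd, void *vn, uint64_t s64, uint32_t desc)
 *   {
 *       intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(uint8_t);
 *       uint8_t s = s64, *d = vd, *n = vn;
 *       for (i = 0; i < opr_sz; ++i) {
 *           d[i] = s - n[i];     // DO_SUBR reverses the operands
 *       }
 *   }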
*/ 1870 #define DO_ZZI(NAME, TYPE, OP) \ 1871 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ 1872 { \ 1873 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1874 TYPE s = s64, *d = vd, *n = vn; \ 1875 for (i = 0; i < opr_sz; ++i) { \ 1876 d[i] = OP(n[i], s); \ 1877 } \ 1878 } 1879 1880 #define DO_SUBR(X, Y) (Y - X) 1881 1882 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) 1883 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) 1884 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) 1885 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) 1886 1887 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) 1888 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) 1889 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) 1890 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) 1891 1892 DO_ZZI(sve_smini_b, int8_t, DO_MIN) 1893 DO_ZZI(sve_smini_h, int16_t, DO_MIN) 1894 DO_ZZI(sve_smini_s, int32_t, DO_MIN) 1895 DO_ZZI(sve_smini_d, int64_t, DO_MIN) 1896 1897 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) 1898 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) 1899 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) 1900 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) 1901 1902 DO_ZZI(sve_umini_b, uint8_t, DO_MIN) 1903 DO_ZZI(sve_umini_h, uint16_t, DO_MIN) 1904 DO_ZZI(sve_umini_s, uint32_t, DO_MIN) 1905 DO_ZZI(sve_umini_d, uint64_t, DO_MIN) 1906 1907 #undef DO_ZZI 1908 1909 #define DO_LOGIC_QV(NAME, SUFF, INIT, VOP, POP) \ 1910 void HELPER(NAME ## _ ## SUFF)(void *vd, void *vn, void *vg, uint32_t desc) \ 1911 { \ 1912 unsigned seg = simd_oprsz(desc) / 16; \ 1913 uint64_t r0 = INIT, r1 = INIT; \ 1914 for (unsigned s = 0; s < seg; s++) { \ 1915 uint64_t p0 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2))); \ 1916 uint64_t p1 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2 + 1))); \ 1917 uint64_t v0 = *(uint64_t *)(vn + s * 16); \ 1918 uint64_t v1 = *(uint64_t *)(vn + s * 16 + 8); \ 1919 v0 = POP(v0, p0), v1 = POP(v1, p1); \ 1920 r0 = VOP(r0, v0), r1 = VOP(r1, v1); \ 1921 } \ 1922 *(uint64_t *)(vd + 0) = r0; \ 1923 *(uint64_t *)(vd + 8) = r1; \ 1924 clear_tail(vd, 16, simd_maxsz(desc)); \ 1925 } 1926 1927 DO_LOGIC_QV(sve2p1_orqv, b, 0, DO_ORR, DO_AND) 1928 DO_LOGIC_QV(sve2p1_orqv, h, 0, DO_ORR, DO_AND) 1929 DO_LOGIC_QV(sve2p1_orqv, s, 0, DO_ORR, DO_AND) 1930 DO_LOGIC_QV(sve2p1_orqv, d, 0, DO_ORR, DO_AND) 1931 1932 DO_LOGIC_QV(sve2p1_eorqv, b, 0, DO_EOR, DO_AND) 1933 DO_LOGIC_QV(sve2p1_eorqv, h, 0, DO_EOR, DO_AND) 1934 DO_LOGIC_QV(sve2p1_eorqv, s, 0, DO_EOR, DO_AND) 1935 DO_LOGIC_QV(sve2p1_eorqv, d, 0, DO_EOR, DO_AND) 1936 1937 DO_LOGIC_QV(sve2p1_andqv, b, -1, DO_AND, DO_ORC) 1938 DO_LOGIC_QV(sve2p1_andqv, h, -1, DO_AND, DO_ORC) 1939 DO_LOGIC_QV(sve2p1_andqv, s, -1, DO_AND, DO_ORC) 1940 DO_LOGIC_QV(sve2p1_andqv, d, -1, DO_AND, DO_ORC) 1941 1942 #undef DO_LOGIC_QV 1943 1944 #undef DO_AND 1945 #undef DO_ORR 1946 #undef DO_EOR 1947 #undef DO_BIC 1948 #undef DO_ORC 1949 #undef DO_ADD 1950 #undef DO_SUB 1951 #undef DO_MAX 1952 #undef DO_MIN 1953 #undef DO_ABD 1954 #undef DO_MUL 1955 #undef DO_DIV 1956 #undef DO_ASR 1957 #undef DO_LSR 1958 #undef DO_LSL 1959 #undef DO_SUBR 1960 1961 /* Similar to the ARM LastActiveElement pseudocode function, except the 1962 result is multiplied by the element size. This includes the not found 1963 indication; e.g. not found for esz=3 is -8. 
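
   As a worked example: with esz=2 (word elements) and a single predicate
   word in which only bit 4 is set, the last active element is element 1,
   so the function returns 1 * 4 = 4, the byte offset of that element.
   With no bits set under the element mask, the not-found value for esz=2
   is (-1 << 2) = -4.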
*/ 1964 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) 1965 { 1966 uint64_t mask = pred_esz_masks[esz]; 1967 intptr_t i = words; 1968 1969 do { 1970 uint64_t this_g = g[--i] & mask; 1971 if (this_g) { 1972 return i * 64 + (63 - clz64(this_g)); 1973 } 1974 } while (i > 0); 1975 return (intptr_t)-1 << esz; 1976 } 1977 1978 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc) 1979 { 1980 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1981 uint32_t flags = PREDTEST_INIT; 1982 uint64_t *d = vd, *g = vg; 1983 intptr_t i = 0; 1984 1985 do { 1986 uint64_t this_d = d[i]; 1987 uint64_t this_g = g[i]; 1988 1989 if (this_g) { 1990 if (!(flags & 4)) { 1991 /* Set in D the first bit of G. */ 1992 this_d |= this_g & -this_g; 1993 d[i] = this_d; 1994 } 1995 flags = iter_predtest_fwd(this_d, this_g, flags); 1996 } 1997 } while (++i < words); 1998 1999 return flags; 2000 } 2001 2002 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc) 2003 { 2004 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 2005 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 2006 uint32_t flags = PREDTEST_INIT; 2007 uint64_t *d = vd, *g = vg, esz_mask; 2008 intptr_t i, next; 2009 2010 next = last_active_element(vd, words, esz) + (1 << esz); 2011 esz_mask = pred_esz_masks[esz]; 2012 2013 /* Similar to the pseudocode for pnext, but scaled by ESZ 2014 so that we find the correct bit. */ 2015 if (next < words * 64) { 2016 uint64_t mask = -1; 2017 2018 if (next & 63) { 2019 mask = ~((1ull << (next & 63)) - 1); 2020 next &= -64; 2021 } 2022 do { 2023 uint64_t this_g = g[next / 64] & esz_mask & mask; 2024 if (this_g != 0) { 2025 next = (next & -64) + ctz64(this_g); 2026 break; 2027 } 2028 next += 64; 2029 mask = -1; 2030 } while (next < words * 64); 2031 } 2032 2033 i = 0; 2034 do { 2035 uint64_t this_d = 0; 2036 if (i == next / 64) { 2037 this_d = 1ull << (next & 63); 2038 } 2039 d[i] = this_d; 2040 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags); 2041 } while (++i < words); 2042 2043 return flags; 2044 } 2045 2046 /* 2047 * Copy Zn into Zd, and store zero into inactive elements. 2048 * If inv, store zeros into the active elements. 
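 *
 * The loops below widen each predicate byte to a 64-bit lane mask with
 * expand_pred_* and flip it with INV.  E.g. for the byte form, a
 * predicate byte of 0x05 expands to 0x0000000000ff00ff, so with inv=0
 * bytes 0 and 2 of the lane are kept and the others are zeroed, while
 * inv=1 keeps the complementary bytes instead.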
2049 */ 2050 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc) 2051 { 2052 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2053 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 2054 uint64_t *d = vd, *n = vn; 2055 uint8_t *pg = vg; 2056 2057 for (i = 0; i < opr_sz; i += 1) { 2058 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv); 2059 } 2060 } 2061 2062 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc) 2063 { 2064 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2065 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 2066 uint64_t *d = vd, *n = vn; 2067 uint8_t *pg = vg; 2068 2069 for (i = 0; i < opr_sz; i += 1) { 2070 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv); 2071 } 2072 } 2073 2074 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc) 2075 { 2076 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2077 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 2078 uint64_t *d = vd, *n = vn; 2079 uint8_t *pg = vg; 2080 2081 for (i = 0; i < opr_sz; i += 1) { 2082 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv); 2083 } 2084 } 2085 2086 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc) 2087 { 2088 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2089 uint64_t *d = vd, *n = vn; 2090 uint8_t *pg = vg; 2091 uint8_t inv = simd_data(desc); 2092 2093 for (i = 0; i < opr_sz; i += 1) { 2094 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1); 2095 } 2096 } 2097 2098 /* Three-operand expander, immediate operand, controlled by a predicate. 2099 */ 2100 #define DO_ZPZI(NAME, TYPE, H, OP) \ 2101 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2102 { \ 2103 intptr_t i, opr_sz = simd_oprsz(desc); \ 2104 TYPE imm = simd_data(desc); \ 2105 for (i = 0; i < opr_sz; ) { \ 2106 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2107 do { \ 2108 if (pg & 1) { \ 2109 TYPE nn = *(TYPE *)(vn + H(i)); \ 2110 *(TYPE *)(vd + H(i)) = OP(nn, imm); \ 2111 } \ 2112 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2113 } while (i & 15); \ 2114 } \ 2115 } 2116 2117 /* Similarly, specialized for 64-bit operands. */ 2118 #define DO_ZPZI_D(NAME, TYPE, OP) \ 2119 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2120 { \ 2121 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2122 TYPE *d = vd, *n = vn; \ 2123 TYPE imm = simd_data(desc); \ 2124 uint8_t *pg = vg; \ 2125 for (i = 0; i < opr_sz; i += 1) { \ 2126 if (pg[H1(i)] & 1) { \ 2127 TYPE nn = n[i]; \ 2128 d[i] = OP(nn, imm); \ 2129 } \ 2130 } \ 2131 } 2132 2133 #define DO_SHR(N, M) (N >> M) 2134 #define DO_SHL(N, M) (N << M) 2135 2136 /* Arithmetic shift right for division. This rounds negative numbers 2137 toward zero as per signed division. Therefore before shifting, 2138 when N is negative, add 2**M-1. */ 2139 #define DO_ASRD(N, M) ((N + (N < 0 ? 
((__typeof(N))1 << M) - 1 : 0)) >> M) 2140 2141 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) 2142 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) 2143 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) 2144 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR) 2145 2146 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR) 2147 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR) 2148 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR) 2149 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR) 2150 2151 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL) 2152 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL) 2153 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL) 2154 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL) 2155 2156 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD) 2157 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD) 2158 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD) 2159 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD) 2160 2161 /* SVE2 bitwise shift by immediate */ 2162 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b) 2163 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h) 2164 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s) 2165 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d) 2166 2167 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b) 2168 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h) 2169 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s) 2170 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d) 2171 2172 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr) 2173 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr) 2174 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr) 2175 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr) 2176 2177 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr) 2178 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr) 2179 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr) 2180 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr) 2181 2182 #define do_suqrshl_b(n, m) \ 2183 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 2184 #define do_suqrshl_h(n, m) \ 2185 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 2186 #define do_suqrshl_s(n, m) \ 2187 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); }) 2188 #define do_suqrshl_d(n, m) \ 2189 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); }) 2190 2191 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b) 2192 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h) 2193 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s) 2194 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d) 2195 2196 #undef DO_ASRD 2197 #undef DO_ZPZI 2198 #undef DO_ZPZI_D 2199 2200 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \ 2201 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2202 { \ 2203 intptr_t i, opr_sz = simd_oprsz(desc); \ 2204 int shift = simd_data(desc); \ 2205 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2206 TYPEW nn = *(TYPEW *)(vn + i); \ 2207 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \ 2208 } \ 2209 } 2210 2211 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 2212 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2213 { \ 2214 intptr_t i, opr_sz = simd_oprsz(desc); \ 2215 int shift = simd_data(desc); \ 2216 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2217 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2218 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \ 2219 } \ 2220 } 2221 2222 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR) 2223 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR) 2224 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR) 2225 2226 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR) 2227 DO_SHRNT(sve2_shrnt_s, uint32_t, 
uint16_t, H1_4, H1_2, DO_SHR) 2228 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR) 2229 2230 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr) 2231 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr) 2232 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr) 2233 2234 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) 2235 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) 2236 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) 2237 2238 #define DO_SQSHRUN_H(x, sh) do_usat_b((int64_t)(x) >> sh) 2239 #define DO_SQSHRUN_S(x, sh) do_usat_h((int64_t)(x) >> sh) 2240 #define DO_SQSHRUN_D(x, sh) do_usat_s((int64_t)(x) >> (sh < 64 ? sh : 63)) 2241 2242 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) 2243 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) 2244 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D) 2245 2246 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) 2247 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) 2248 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) 2249 2250 #define DO_SQRSHRUN_H(x, sh) do_usat_b(do_srshr(x, sh)) 2251 #define DO_SQRSHRUN_S(x, sh) do_usat_h(do_srshr(x, sh)) 2252 #define DO_SQRSHRUN_D(x, sh) do_usat_s(do_srshr(x, sh)) 2253 2254 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) 2255 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) 2256 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D) 2257 2258 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) 2259 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) 2260 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) 2261 2262 #define DO_SQSHRN_H(x, sh) do_ssat_b(x >> sh) 2263 #define DO_SQSHRN_S(x, sh) do_ssat_h(x >> sh) 2264 #define DO_SQSHRN_D(x, sh) do_ssat_s(x >> sh) 2265 2266 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) 2267 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) 2268 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D) 2269 2270 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) 2271 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) 2272 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) 2273 2274 #define DO_SQRSHRN_H(x, sh) do_ssat_b(do_srshr(x, sh)) 2275 #define DO_SQRSHRN_S(x, sh) do_ssat_h(do_srshr(x, sh)) 2276 #define DO_SQRSHRN_D(x, sh) do_ssat_s(do_srshr(x, sh)) 2277 2278 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) 2279 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) 2280 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D) 2281 2282 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H) 2283 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S) 2284 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D) 2285 2286 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX) 2287 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX) 2288 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX) 2289 2290 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H) 2291 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S) 2292 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D) 2293 2294 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H) 2295 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S) 2296 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D) 2297 2298 #define 
DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX) 2299 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX) 2300 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX) 2301 2302 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H) 2303 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S) 2304 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D) 2305 2306 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H) 2307 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S) 2308 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D) 2309 2310 #undef DO_SHRNB 2311 #undef DO_SHRNT 2312 2313 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \ 2314 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2315 { \ 2316 intptr_t i, opr_sz = simd_oprsz(desc); \ 2317 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2318 TYPEW nn = *(TYPEW *)(vn + i); \ 2319 TYPEW mm = *(TYPEW *)(vm + i); \ 2320 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \ 2321 } \ 2322 } 2323 2324 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \ 2325 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2326 { \ 2327 intptr_t i, opr_sz = simd_oprsz(desc); \ 2328 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2329 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2330 TYPEW mm = *(TYPEW *)(vm + HW(i)); \ 2331 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \ 2332 } \ 2333 } 2334 2335 #define DO_ADDHN(N, M, SH) ((N + M) >> SH) 2336 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH) 2337 #define DO_SUBHN(N, M, SH) ((N - M) >> SH) 2338 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH) 2339 2340 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN) 2341 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN) 2342 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN) 2343 2344 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN) 2345 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN) 2346 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN) 2347 2348 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN) 2349 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN) 2350 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN) 2351 2352 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN) 2353 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN) 2354 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN) 2355 2356 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN) 2357 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN) 2358 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN) 2359 2360 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN) 2361 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN) 2362 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN) 2363 2364 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN) 2365 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN) 2366 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN) 2367 2368 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN) 2369 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN) 2370 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN) 2371 2372 #undef DO_RSUBHN 2373 #undef DO_SUBHN 2374 #undef DO_RADDHN 2375 #undef 
DO_ADDHN 2376 2377 #undef DO_BINOPNB 2378 2379 /* Fully general four-operand expander, controlled by a predicate. 2380 */ 2381 #define DO_ZPZZZ(NAME, TYPE, H, OP) \ 2382 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2383 void *vg, uint32_t desc) \ 2384 { \ 2385 intptr_t i, opr_sz = simd_oprsz(desc); \ 2386 for (i = 0; i < opr_sz; ) { \ 2387 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2388 do { \ 2389 if (pg & 1) { \ 2390 TYPE nn = *(TYPE *)(vn + H(i)); \ 2391 TYPE mm = *(TYPE *)(vm + H(i)); \ 2392 TYPE aa = *(TYPE *)(va + H(i)); \ 2393 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \ 2394 } \ 2395 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2396 } while (i & 15); \ 2397 } \ 2398 } 2399 2400 /* Similarly, specialized for 64-bit operands. */ 2401 #define DO_ZPZZZ_D(NAME, TYPE, OP) \ 2402 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2403 void *vg, uint32_t desc) \ 2404 { \ 2405 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2406 TYPE *d = vd, *a = va, *n = vn, *m = vm; \ 2407 uint8_t *pg = vg; \ 2408 for (i = 0; i < opr_sz; i += 1) { \ 2409 if (pg[H1(i)] & 1) { \ 2410 TYPE aa = a[i], nn = n[i], mm = m[i]; \ 2411 d[i] = OP(aa, nn, mm); \ 2412 } \ 2413 } \ 2414 } 2415 2416 #define DO_MLA(A, N, M) (A + N * M) 2417 #define DO_MLS(A, N, M) (A - N * M) 2418 2419 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) 2420 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS) 2421 2422 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA) 2423 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS) 2424 2425 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA) 2426 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS) 2427 2428 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA) 2429 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS) 2430 2431 #undef DO_MLA 2432 #undef DO_MLS 2433 #undef DO_ZPZZZ 2434 #undef DO_ZPZZZ_D 2435 2436 void HELPER(sve_index_b)(void *vd, uint32_t start, 2437 uint32_t incr, uint32_t desc) 2438 { 2439 intptr_t i, opr_sz = simd_oprsz(desc); 2440 uint8_t *d = vd; 2441 for (i = 0; i < opr_sz; i += 1) { 2442 d[H1(i)] = start + i * incr; 2443 } 2444 } 2445 2446 void HELPER(sve_index_h)(void *vd, uint32_t start, 2447 uint32_t incr, uint32_t desc) 2448 { 2449 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2450 uint16_t *d = vd; 2451 for (i = 0; i < opr_sz; i += 1) { 2452 d[H2(i)] = start + i * incr; 2453 } 2454 } 2455 2456 void HELPER(sve_index_s)(void *vd, uint32_t start, 2457 uint32_t incr, uint32_t desc) 2458 { 2459 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2460 uint32_t *d = vd; 2461 for (i = 0; i < opr_sz; i += 1) { 2462 d[H4(i)] = start + i * incr; 2463 } 2464 } 2465 2466 void HELPER(sve_index_d)(void *vd, uint64_t start, 2467 uint64_t incr, uint32_t desc) 2468 { 2469 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2470 uint64_t *d = vd; 2471 for (i = 0; i < opr_sz; i += 1) { 2472 d[i] = start + i * incr; 2473 } 2474 } 2475 2476 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc) 2477 { 2478 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2479 uint32_t sh = simd_data(desc); 2480 uint32_t *d = vd, *n = vn, *m = vm; 2481 for (i = 0; i < opr_sz; i += 1) { 2482 d[i] = n[i] + (m[i] << sh); 2483 } 2484 } 2485 2486 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc) 2487 { 2488 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2489 uint64_t sh = simd_data(desc); 2490 uint64_t *d = vd, *n = vn, *m = vm; 2491 for (i = 0; i < opr_sz; i += 1) { 2492 d[i] = n[i] + (m[i] << sh); 2493 } 2494 } 2495 2496 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc) 2497 { 2498 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2499 uint64_t sh 
= simd_data(desc); 2500 uint64_t *d = vd, *n = vn, *m = vm; 2501 for (i = 0; i < opr_sz; i += 1) { 2502 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh); 2503 } 2504 } 2505 2506 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc) 2507 { 2508 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2509 uint64_t sh = simd_data(desc); 2510 uint64_t *d = vd, *n = vn, *m = vm; 2511 for (i = 0; i < opr_sz; i += 1) { 2512 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh); 2513 } 2514 } 2515 2516 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc) 2517 { 2518 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2519 static const uint16_t coeff[] = { 2520 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8, 2521 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189, 2522 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295, 2523 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4, 2524 }; 2525 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2526 uint16_t *d = vd, *n = vn; 2527 2528 for (i = 0; i < opr_sz; i++) { 2529 uint16_t nn = n[i]; 2530 intptr_t idx = extract32(nn, 0, 5); 2531 uint16_t exp = extract32(nn, 5, 5); 2532 d[i] = coeff[idx] | (exp << 10); 2533 } 2534 } 2535 2536 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc) 2537 { 2538 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2539 static const uint32_t coeff[] = { 2540 0x000000, 0x0164d2, 0x02cd87, 0x043a29, 2541 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5, 2542 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc, 2543 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d, 2544 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda, 2545 0x1ef532, 0x20b051, 0x227043, 0x243516, 2546 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a, 2547 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4, 2548 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b, 2549 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd, 2550 0x45672a, 0x478d75, 0x49b9be, 0x4bec15, 2551 0x4e248c, 0x506334, 0x52a81e, 0x54f35b, 2552 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5, 2553 0x60ccdf, 0x633f89, 0x65b907, 0x68396a, 2554 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177, 2555 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c, 2556 }; 2557 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2558 uint32_t *d = vd, *n = vn; 2559 2560 for (i = 0; i < opr_sz; i++) { 2561 uint32_t nn = n[i]; 2562 intptr_t idx = extract32(nn, 0, 6); 2563 uint32_t exp = extract32(nn, 6, 8); 2564 d[i] = coeff[idx] | (exp << 23); 2565 } 2566 } 2567 2568 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc) 2569 { 2570 /* These constants are cut-and-paste directly from the ARM pseudocode. 
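     * For reference only: each entry is the 52-bit fraction field of
     * 2**(i/64) as an IEEE double (e.g. index 32 is 0x6A09E667F3BCD, the
     * fraction of sqrt(2)); the loop below merely ORs in the exponent
     * taken from bits [16:6] of the input.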
*/ 2571 static const uint64_t coeff[] = { 2572 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull, 2573 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull, 2574 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull, 2575 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull, 2576 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull, 2577 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull, 2578 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull, 2579 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull, 2580 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull, 2581 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull, 2582 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull, 2583 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull, 2584 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull, 2585 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull, 2586 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull, 2587 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull, 2588 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull, 2589 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull, 2590 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull, 2591 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull, 2592 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull, 2593 0xFA7C1819E90D8ull, 2594 }; 2595 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2596 uint64_t *d = vd, *n = vn; 2597 2598 for (i = 0; i < opr_sz; i++) { 2599 uint64_t nn = n[i]; 2600 intptr_t idx = extract32(nn, 0, 6); 2601 uint64_t exp = extract32(nn, 6, 11); 2602 d[i] = coeff[idx] | (exp << 52); 2603 } 2604 } 2605 2606 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc) 2607 { 2608 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2609 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2610 uint16_t *d = vd, *n = vn, *m = vm; 2611 for (i = 0; i < opr_sz; i += 1) { 2612 uint16_t nn = n[i]; 2613 uint16_t mm = m[i]; 2614 if (mm & 1) { 2615 nn = float16_one; 2616 } 2617 if (mm & 2) { 2618 nn = float16_maybe_ah_chs(nn, fpcr_ah); 2619 } 2620 d[i] = nn; 2621 } 2622 } 2623 2624 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc) 2625 { 2626 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2627 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2628 uint32_t *d = vd, *n = vn, *m = vm; 2629 for (i = 0; i < opr_sz; i += 1) { 2630 uint32_t nn = n[i]; 2631 uint32_t mm = m[i]; 2632 if (mm & 1) { 2633 nn = float32_one; 2634 } 2635 if (mm & 2) { 2636 nn = float32_maybe_ah_chs(nn, fpcr_ah); 2637 } 2638 d[i] = nn; 2639 } 2640 } 2641 2642 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc) 2643 { 2644 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2645 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2646 uint64_t *d = vd, *n = vn, *m = vm; 2647 for (i = 0; i < opr_sz; i += 1) { 2648 uint64_t nn = n[i]; 2649 uint64_t mm = m[i]; 2650 if (mm & 1) { 2651 nn = float64_one; 2652 } 2653 if (mm & 2) { 2654 nn = float64_maybe_ah_chs(nn, fpcr_ah); 2655 } 2656 d[i] = nn; 2657 } 2658 } 2659 2660 /* 2661 * Signed saturating addition with scalar operand. 
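 *
 * E.g. for the byte form below, adding a scalar of 100 to an element
 * holding 100 overflows int8_t and the result saturates to INT8_MAX
 * (127); likewise (-100) + (-100) saturates to INT8_MIN (-128).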
2662 */ 2663 2664 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2665 { 2666 intptr_t i, oprsz = simd_oprsz(desc); 2667 2668 for (i = 0; i < oprsz; i += sizeof(int8_t)) { 2669 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i)); 2670 } 2671 } 2672 2673 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2674 { 2675 intptr_t i, oprsz = simd_oprsz(desc); 2676 2677 for (i = 0; i < oprsz; i += sizeof(int16_t)) { 2678 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i)); 2679 } 2680 } 2681 2682 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2683 { 2684 intptr_t i, oprsz = simd_oprsz(desc); 2685 2686 for (i = 0; i < oprsz; i += sizeof(int32_t)) { 2687 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i)); 2688 } 2689 } 2690 2691 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc) 2692 { 2693 intptr_t i, oprsz = simd_oprsz(desc); 2694 2695 for (i = 0; i < oprsz; i += sizeof(int64_t)) { 2696 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i)); 2697 } 2698 } 2699 2700 /* 2701 * Unsigned saturating addition with scalar operand. 2702 */ 2703 2704 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2705 { 2706 intptr_t i, oprsz = simd_oprsz(desc); 2707 2708 for (i = 0; i < oprsz; i += sizeof(uint8_t)) { 2709 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i)); 2710 } 2711 } 2712 2713 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2714 { 2715 intptr_t i, oprsz = simd_oprsz(desc); 2716 2717 for (i = 0; i < oprsz; i += sizeof(uint16_t)) { 2718 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i)); 2719 } 2720 } 2721 2722 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2723 { 2724 intptr_t i, oprsz = simd_oprsz(desc); 2725 2726 for (i = 0; i < oprsz; i += sizeof(uint32_t)) { 2727 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i)); 2728 } 2729 } 2730 2731 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2732 { 2733 intptr_t i, oprsz = simd_oprsz(desc); 2734 2735 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2736 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i)); 2737 } 2738 } 2739 2740 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2741 { 2742 intptr_t i, oprsz = simd_oprsz(desc); 2743 2744 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2745 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b); 2746 } 2747 } 2748 2749 /* Two operand predicated copy immediate with merge. All valid immediates 2750 * can fit within 17 signed bits in the simd_data field. 
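 *
 * The byte/half/word forms below all follow the same branch-free
 * pattern: replicate the immediate across a 64-bit lane with dup_const,
 * expand the predicate byte to a per-element bit mask, then merge as
 *
 *   d[i] = (mm & pp) | (nn & ~pp);
 *
 * The doubleword form instead tests each predicate bit directly.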
2751 */ 2752 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg, 2753 uint64_t mm, uint32_t desc) 2754 { 2755 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2756 uint64_t *d = vd, *n = vn; 2757 uint8_t *pg = vg; 2758 2759 mm = dup_const(MO_8, mm); 2760 for (i = 0; i < opr_sz; i += 1) { 2761 uint64_t nn = n[i]; 2762 uint64_t pp = expand_pred_b(pg[H1(i)]); 2763 d[i] = (mm & pp) | (nn & ~pp); 2764 } 2765 } 2766 2767 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg, 2768 uint64_t mm, uint32_t desc) 2769 { 2770 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2771 uint64_t *d = vd, *n = vn; 2772 uint8_t *pg = vg; 2773 2774 mm = dup_const(MO_16, mm); 2775 for (i = 0; i < opr_sz; i += 1) { 2776 uint64_t nn = n[i]; 2777 uint64_t pp = expand_pred_h(pg[H1(i)]); 2778 d[i] = (mm & pp) | (nn & ~pp); 2779 } 2780 } 2781 2782 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg, 2783 uint64_t mm, uint32_t desc) 2784 { 2785 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2786 uint64_t *d = vd, *n = vn; 2787 uint8_t *pg = vg; 2788 2789 mm = dup_const(MO_32, mm); 2790 for (i = 0; i < opr_sz; i += 1) { 2791 uint64_t nn = n[i]; 2792 uint64_t pp = expand_pred_s(pg[H1(i)]); 2793 d[i] = (mm & pp) | (nn & ~pp); 2794 } 2795 } 2796 2797 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg, 2798 uint64_t mm, uint32_t desc) 2799 { 2800 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2801 uint64_t *d = vd, *n = vn; 2802 uint8_t *pg = vg; 2803 2804 for (i = 0; i < opr_sz; i += 1) { 2805 uint64_t nn = n[i]; 2806 d[i] = (pg[H1(i)] & 1 ? mm : nn); 2807 } 2808 } 2809 2810 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc) 2811 { 2812 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2813 uint64_t *d = vd; 2814 uint8_t *pg = vg; 2815 2816 val = dup_const(MO_8, val); 2817 for (i = 0; i < opr_sz; i += 1) { 2818 d[i] = val & expand_pred_b(pg[H1(i)]); 2819 } 2820 } 2821 2822 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc) 2823 { 2824 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2825 uint64_t *d = vd; 2826 uint8_t *pg = vg; 2827 2828 val = dup_const(MO_16, val); 2829 for (i = 0; i < opr_sz; i += 1) { 2830 d[i] = val & expand_pred_h(pg[H1(i)]); 2831 } 2832 } 2833 2834 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc) 2835 { 2836 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2837 uint64_t *d = vd; 2838 uint8_t *pg = vg; 2839 2840 val = dup_const(MO_32, val); 2841 for (i = 0; i < opr_sz; i += 1) { 2842 d[i] = val & expand_pred_s(pg[H1(i)]); 2843 } 2844 } 2845 2846 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc) 2847 { 2848 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2849 uint64_t *d = vd; 2850 uint8_t *pg = vg; 2851 2852 for (i = 0; i < opr_sz; i += 1) { 2853 d[i] = (pg[H1(i)] & 1 ? val : 0); 2854 } 2855 } 2856 2857 /* Big-endian hosts need to frob the byte indices. If the copy 2858 * happens to be 8-byte aligned, then no frobbing necessary. 
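 *
 * The combined low bits of both pointers and the length, (d | s | n) & 7,
 * select the widest unit (8, 4, 2 or 1 bytes) for which the H1_x index
 * adjustments stay consistent; overlapping copies are handled by choosing
 * a forward or backward loop, as memmove would.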
2859 */ 2860 static void swap_memmove(void *vd, void *vs, size_t n) 2861 { 2862 uintptr_t d = (uintptr_t)vd; 2863 uintptr_t s = (uintptr_t)vs; 2864 uintptr_t o = (d | s | n) & 7; 2865 size_t i; 2866 2867 #if !HOST_BIG_ENDIAN 2868 o = 0; 2869 #endif 2870 switch (o) { 2871 case 0: 2872 memmove(vd, vs, n); 2873 break; 2874 2875 case 4: 2876 if (d < s || d >= s + n) { 2877 for (i = 0; i < n; i += 4) { 2878 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2879 } 2880 } else { 2881 for (i = n; i > 0; ) { 2882 i -= 4; 2883 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2884 } 2885 } 2886 break; 2887 2888 case 2: 2889 case 6: 2890 if (d < s || d >= s + n) { 2891 for (i = 0; i < n; i += 2) { 2892 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2893 } 2894 } else { 2895 for (i = n; i > 0; ) { 2896 i -= 2; 2897 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2898 } 2899 } 2900 break; 2901 2902 default: 2903 if (d < s || d >= s + n) { 2904 for (i = 0; i < n; i++) { 2905 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2906 } 2907 } else { 2908 for (i = n; i > 0; ) { 2909 i -= 1; 2910 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2911 } 2912 } 2913 break; 2914 } 2915 } 2916 2917 /* Similarly for memset of 0. */ 2918 static void swap_memzero(void *vd, size_t n) 2919 { 2920 uintptr_t d = (uintptr_t)vd; 2921 uintptr_t o = (d | n) & 7; 2922 size_t i; 2923 2924 /* Usually, the first bit of a predicate is set, so N is 0. */ 2925 if (likely(n == 0)) { 2926 return; 2927 } 2928 2929 #if !HOST_BIG_ENDIAN 2930 o = 0; 2931 #endif 2932 switch (o) { 2933 case 0: 2934 memset(vd, 0, n); 2935 break; 2936 2937 case 4: 2938 for (i = 0; i < n; i += 4) { 2939 *(uint32_t *)H1_4(d + i) = 0; 2940 } 2941 break; 2942 2943 case 2: 2944 case 6: 2945 for (i = 0; i < n; i += 2) { 2946 *(uint16_t *)H1_2(d + i) = 0; 2947 } 2948 break; 2949 2950 default: 2951 for (i = 0; i < n; i++) { 2952 *(uint8_t *)H1(d + i) = 0; 2953 } 2954 break; 2955 } 2956 } 2957 2958 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) 2959 { 2960 intptr_t opr_sz = simd_oprsz(desc); 2961 size_t n_ofs = simd_data(desc); 2962 size_t n_siz = opr_sz - n_ofs; 2963 2964 if (vd != vm) { 2965 swap_memmove(vd, vn + n_ofs, n_siz); 2966 swap_memmove(vd + n_siz, vm, n_ofs); 2967 } else if (vd != vn) { 2968 swap_memmove(vd + n_siz, vd, n_ofs); 2969 swap_memmove(vd, vn + n_ofs, n_siz); 2970 } else { 2971 /* vd == vn == vm. Need temp space. 
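         * The leading N_OFS bytes of the shared register would be
         * overwritten by the in-place downward shift, so save them to a
         * stack temporary first and append them afterward.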
*/ 2972 ARMVectorReg tmp; 2973 swap_memmove(&tmp, vm, n_ofs); 2974 swap_memmove(vd, vd + n_ofs, n_siz); 2975 memcpy(vd + n_siz, &tmp, n_ofs); 2976 } 2977 } 2978 2979 #define DO_INSR(NAME, TYPE, H) \ 2980 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ 2981 { \ 2982 intptr_t opr_sz = simd_oprsz(desc); \ 2983 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ 2984 *(TYPE *)(vd + H(0)) = val; \ 2985 } 2986 2987 DO_INSR(sve_insr_b, uint8_t, H1) 2988 DO_INSR(sve_insr_h, uint16_t, H1_2) 2989 DO_INSR(sve_insr_s, uint32_t, H1_4) 2990 DO_INSR(sve_insr_d, uint64_t, H1_8) 2991 2992 #undef DO_INSR 2993 2994 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) 2995 { 2996 intptr_t i, j, opr_sz = simd_oprsz(desc); 2997 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2998 uint64_t f = *(uint64_t *)(vn + i); 2999 uint64_t b = *(uint64_t *)(vn + j); 3000 *(uint64_t *)(vd + i) = bswap64(b); 3001 *(uint64_t *)(vd + j) = bswap64(f); 3002 } 3003 } 3004 3005 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) 3006 { 3007 intptr_t i, j, opr_sz = simd_oprsz(desc); 3008 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 3009 uint64_t f = *(uint64_t *)(vn + i); 3010 uint64_t b = *(uint64_t *)(vn + j); 3011 *(uint64_t *)(vd + i) = hswap64(b); 3012 *(uint64_t *)(vd + j) = hswap64(f); 3013 } 3014 } 3015 3016 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) 3017 { 3018 intptr_t i, j, opr_sz = simd_oprsz(desc); 3019 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 3020 uint64_t f = *(uint64_t *)(vn + i); 3021 uint64_t b = *(uint64_t *)(vn + j); 3022 *(uint64_t *)(vd + i) = rol64(b, 32); 3023 *(uint64_t *)(vd + j) = rol64(f, 32); 3024 } 3025 } 3026 3027 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) 3028 { 3029 intptr_t i, j, opr_sz = simd_oprsz(desc); 3030 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 3031 uint64_t f = *(uint64_t *)(vn + i); 3032 uint64_t b = *(uint64_t *)(vn + j); 3033 *(uint64_t *)(vd + i) = b; 3034 *(uint64_t *)(vd + j) = f; 3035 } 3036 } 3037 3038 /* 3039 * TODO: This could use half_shuffle64 and similar bit tricks to 3040 * expand blocks of bits at once. 3041 */ 3042 #define DO_PMOV_PV(NAME, ESIZE) \ 3043 void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ 3044 { \ 3045 unsigned vl = simd_oprsz(desc); \ 3046 unsigned idx = simd_data(desc); \ 3047 unsigned elements = vl / ESIZE; \ 3048 ARMPredicateReg *d = vd; \ 3049 ARMVectorReg *s = vs; \ 3050 memset(d, 0, sizeof(*d)); \ 3051 for (unsigned e = 0; e < elements; ++e) { \ 3052 depositn(d->p, e * ESIZE, 1, extractn(s->d, elements * idx + e, 1)); \ 3053 } \ 3054 } 3055 3056 DO_PMOV_PV(pmov_pv_h, 2) 3057 DO_PMOV_PV(pmov_pv_s, 4) 3058 DO_PMOV_PV(pmov_pv_d, 8) 3059 3060 #undef DO_PMOV_PV 3061 3062 /* 3063 * TODO: This could use half_unshuffle64 and similar bit tricks to 3064 * compress blocks of bits at once. 
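 *
 * Functionally this is the inverse of the PV form above: predicate bit
 * e * ESIZE (one bit per ESIZE-byte element) is copied to bit
 * (elements * idx + e) of the destination vector, packing the strided
 * predicate bits into contiguous bits of part IDX, one bit per
 * iteration.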
3065 */ 3066 #define DO_PMOV_VP(NAME, ESIZE) \ 3067 void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ 3068 { \ 3069 unsigned vl = simd_oprsz(desc); \ 3070 unsigned idx = simd_data(desc); \ 3071 unsigned elements = vl / ESIZE; \ 3072 ARMVectorReg *d = vd; \ 3073 ARMPredicateReg *s = vs; \ 3074 if (idx == 0) { \ 3075 memset(d, 0, vl); \ 3076 } \ 3077 for (unsigned e = 0; e < elements; ++e) { \ 3078 depositn(d->d, elements * idx + e, 1, extractn(s->p, e * ESIZE, 1)); \ 3079 } \ 3080 } 3081 3082 DO_PMOV_VP(pmov_vp_h, 2) 3083 DO_PMOV_VP(pmov_vp_s, 4) 3084 DO_PMOV_VP(pmov_vp_d, 8) 3085 3086 #undef DO_PMOV_VP 3087 3088 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); 3089 3090 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, 3091 bool is_tbx, tb_impl_fn *fn) 3092 { 3093 ARMVectorReg scratch; 3094 uintptr_t oprsz = simd_oprsz(desc); 3095 3096 if (unlikely(vd == vn)) { 3097 vn = memcpy(&scratch, vn, oprsz); 3098 } 3099 3100 fn(vd, vn, NULL, vm, oprsz, is_tbx); 3101 } 3102 3103 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm, 3104 uint32_t desc, bool is_tbx, tb_impl_fn *fn) 3105 { 3106 ARMVectorReg scratch; 3107 uintptr_t oprsz = simd_oprsz(desc); 3108 3109 if (unlikely(vd == vn0)) { 3110 vn0 = memcpy(&scratch, vn0, oprsz); 3111 if (vd == vn1) { 3112 vn1 = vn0; 3113 } 3114 } else if (unlikely(vd == vn1)) { 3115 vn1 = memcpy(&scratch, vn1, oprsz); 3116 } 3117 3118 fn(vd, vn0, vn1, vm, oprsz, is_tbx); 3119 } 3120 3121 #define DO_TB(SUFF, TYPE, H) \ 3122 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \ 3123 void *vm, uintptr_t oprsz, bool is_tbx) \ 3124 { \ 3125 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \ 3126 uintptr_t i, nelem = oprsz / sizeof(TYPE); \ 3127 for (i = 0; i < nelem; ++i) { \ 3128 TYPE index = indexes[H1(i)], val = 0; \ 3129 if (index < nelem) { \ 3130 val = tbl0[H(index)]; \ 3131 } else { \ 3132 index -= nelem; \ 3133 if (tbl1 && index < nelem) { \ 3134 val = tbl1[H(index)]; \ 3135 } else if (is_tbx) { \ 3136 continue; \ 3137 } \ 3138 } \ 3139 d[H(i)] = val; \ 3140 } \ 3141 } \ 3142 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3143 { \ 3144 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \ 3145 } \ 3146 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \ 3147 void *vm, uint32_t desc) \ 3148 { \ 3149 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \ 3150 } \ 3151 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3152 { \ 3153 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \ 3154 } 3155 3156 DO_TB(b, uint8_t, H1) 3157 DO_TB(h, uint16_t, H2) 3158 DO_TB(s, uint32_t, H4) 3159 DO_TB(d, uint64_t, H8) 3160 3161 #undef DO_TB 3162 3163 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ 3164 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 3165 { \ 3166 intptr_t i, opr_sz = simd_oprsz(desc); \ 3167 TYPED *d = vd; \ 3168 TYPES *n = vn; \ 3169 ARMVectorReg tmp; \ 3170 if (unlikely(vn - vd < opr_sz)) { \ 3171 n = memcpy(&tmp, n, opr_sz / 2); \ 3172 } \ 3173 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ 3174 d[HD(i)] = n[HS(i)]; \ 3175 } \ 3176 } 3177 3178 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) 3179 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) 3180 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4) 3181 3182 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) 3183 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) 3184 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4) 3185 3186 #undef DO_UNPK 3187 3188 /* Mask of bits included 
in the even numbered predicates of width esz. 3189 * We also use this for expand_bits/compress_bits, and so extend the 3190 * same pattern out to 16-bit units. 3191 */ 3192 static const uint64_t even_bit_esz_masks[5] = { 3193 0x5555555555555555ull, 3194 0x3333333333333333ull, 3195 0x0f0f0f0f0f0f0f0full, 3196 0x00ff00ff00ff00ffull, 3197 0x0000ffff0000ffffull, 3198 }; 3199 3200 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits. 3201 * For N==0, this corresponds to the operation that in qemu/bitops.h 3202 * we call half_shuffle64; this algorithm is from Hacker's Delight, 3203 * section 7-2 Shuffling Bits. 3204 */ 3205 static uint64_t expand_bits(uint64_t x, int n) 3206 { 3207 int i; 3208 3209 x &= 0xffffffffu; 3210 for (i = 4; i >= n; i--) { 3211 int sh = 1 << i; 3212 x = ((x << sh) | x) & even_bit_esz_masks[i]; 3213 } 3214 return x; 3215 } 3216 3217 /* Compress units of 2**(N+1) bits to units of 2**N bits. 3218 * For N==0, this corresponds to the operation that in qemu/bitops.h 3219 * we call half_unshuffle64; this algorithm is from Hacker's Delight, 3220 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. 3221 */ 3222 static uint64_t compress_bits(uint64_t x, int n) 3223 { 3224 int i; 3225 3226 for (i = n; i <= 4; i++) { 3227 int sh = 1 << i; 3228 x &= even_bit_esz_masks[i]; 3229 x = (x >> sh) | x; 3230 } 3231 return x & 0xffffffffu; 3232 } 3233 3234 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3235 { 3236 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3237 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3238 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3239 int esize = 1 << esz; 3240 uint64_t *d = vd; 3241 intptr_t i; 3242 3243 if (oprsz <= 8) { 3244 uint64_t nn = *(uint64_t *)vn; 3245 uint64_t mm = *(uint64_t *)vm; 3246 int half = 4 * oprsz; 3247 3248 nn = extract64(nn, high * half, half); 3249 mm = extract64(mm, high * half, half); 3250 nn = expand_bits(nn, esz); 3251 mm = expand_bits(mm, esz); 3252 d[0] = nn | (mm << esize); 3253 } else { 3254 ARMPredicateReg tmp; 3255 3256 /* We produce output faster than we consume input. 3257 Therefore we must be mindful of possible overlap. 
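
           Each word of output packs together half a word from N and half
           a word from M, so the output pointer advances twice as fast as
           either input pointer; if VD aliased VN or VM, the not-yet-read
           upper part of the input would be clobbered, hence the copy into
           TMP below.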
*/ 3258 if (vd == vn) { 3259 vn = memcpy(&tmp, vn, oprsz); 3260 if (vd == vm) { 3261 vm = vn; 3262 } 3263 } else if (vd == vm) { 3264 vm = memcpy(&tmp, vm, oprsz); 3265 } 3266 if (high) { 3267 high = oprsz >> 1; 3268 } 3269 3270 if ((oprsz & 7) == 0) { 3271 uint32_t *n = vn, *m = vm; 3272 high >>= 2; 3273 3274 for (i = 0; i < oprsz / 8; i++) { 3275 uint64_t nn = n[H4(high + i)]; 3276 uint64_t mm = m[H4(high + i)]; 3277 3278 nn = expand_bits(nn, esz); 3279 mm = expand_bits(mm, esz); 3280 d[i] = nn | (mm << esize); 3281 } 3282 } else { 3283 uint8_t *n = vn, *m = vm; 3284 uint16_t *d16 = vd; 3285 3286 for (i = 0; i < oprsz / 2; i++) { 3287 uint16_t nn = n[H1(high + i)]; 3288 uint16_t mm = m[H1(high + i)]; 3289 3290 nn = expand_bits(nn, esz); 3291 mm = expand_bits(mm, esz); 3292 d16[H2(i)] = nn | (mm << esize); 3293 } 3294 } 3295 } 3296 } 3297 3298 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3299 { 3300 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3301 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3302 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz; 3303 uint64_t *d = vd, *n = vn, *m = vm; 3304 uint64_t l, h; 3305 intptr_t i; 3306 3307 if (oprsz <= 8) { 3308 l = compress_bits(n[0] >> odd, esz); 3309 h = compress_bits(m[0] >> odd, esz); 3310 d[0] = l | (h << (4 * oprsz)); 3311 } else { 3312 ARMPredicateReg tmp_m; 3313 intptr_t oprsz_16 = oprsz / 16; 3314 3315 if ((vm - vd) < (uintptr_t)oprsz) { 3316 m = memcpy(&tmp_m, vm, oprsz); 3317 } 3318 3319 for (i = 0; i < oprsz_16; i++) { 3320 l = n[2 * i + 0]; 3321 h = n[2 * i + 1]; 3322 l = compress_bits(l >> odd, esz); 3323 h = compress_bits(h >> odd, esz); 3324 d[i] = l | (h << 32); 3325 } 3326 3327 /* 3328 * For VL which is not a multiple of 512, the results from M do not 3329 * align nicely with the uint64_t for D. Put the aligned results 3330 * from M into TMP_M and then copy it into place afterward. 
3331 */ 3332 if (oprsz & 15) { 3333 int final_shift = (oprsz & 15) * 2; 3334 3335 l = n[2 * i + 0]; 3336 h = n[2 * i + 1]; 3337 l = compress_bits(l >> odd, esz); 3338 h = compress_bits(h >> odd, esz); 3339 d[i] = l | (h << final_shift); 3340 3341 for (i = 0; i < oprsz_16; i++) { 3342 l = m[2 * i + 0]; 3343 h = m[2 * i + 1]; 3344 l = compress_bits(l >> odd, esz); 3345 h = compress_bits(h >> odd, esz); 3346 tmp_m.p[i] = l | (h << 32); 3347 } 3348 l = m[2 * i + 0]; 3349 h = m[2 * i + 1]; 3350 l = compress_bits(l >> odd, esz); 3351 h = compress_bits(h >> odd, esz); 3352 tmp_m.p[i] = l | (h << final_shift); 3353 3354 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); 3355 } else { 3356 for (i = 0; i < oprsz_16; i++) { 3357 l = m[2 * i + 0]; 3358 h = m[2 * i + 1]; 3359 l = compress_bits(l >> odd, esz); 3360 h = compress_bits(h >> odd, esz); 3361 d[oprsz_16 + i] = l | (h << 32); 3362 } 3363 } 3364 } 3365 } 3366 3367 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3368 { 3369 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3370 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3371 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA); 3372 uint64_t *d = vd, *n = vn, *m = vm; 3373 uint64_t mask; 3374 int shr, shl; 3375 intptr_t i; 3376 3377 shl = 1 << esz; 3378 shr = 0; 3379 mask = even_bit_esz_masks[esz]; 3380 if (odd) { 3381 mask <<= shl; 3382 shr = shl; 3383 shl = 0; 3384 } 3385 3386 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { 3387 uint64_t nn = (n[i] & mask) >> shr; 3388 uint64_t mm = (m[i] & mask) << shl; 3389 d[i] = nn + mm; 3390 } 3391 } 3392 3393 /* Reverse units of 2**N bits. */ 3394 static uint64_t reverse_bits_64(uint64_t x, int n) 3395 { 3396 int i, sh; 3397 3398 x = bswap64(x); 3399 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3400 uint64_t mask = even_bit_esz_masks[i]; 3401 x = ((x & mask) << sh) | ((x >> sh) & mask); 3402 } 3403 return x; 3404 } 3405 3406 static uint8_t reverse_bits_8(uint8_t x, int n) 3407 { 3408 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; 3409 int i, sh; 3410 3411 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3412 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); 3413 } 3414 return x; 3415 } 3416 3417 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) 3418 { 3419 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3420 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3421 intptr_t i, oprsz_2 = oprsz / 2; 3422 3423 if (oprsz <= 8) { 3424 uint64_t l = *(uint64_t *)vn; 3425 l = reverse_bits_64(l << (64 - 8 * oprsz), esz); 3426 *(uint64_t *)vd = l; 3427 } else if ((oprsz & 15) == 0) { 3428 for (i = 0; i < oprsz_2; i += 8) { 3429 intptr_t ih = oprsz - 8 - i; 3430 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); 3431 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); 3432 *(uint64_t *)(vd + i) = h; 3433 *(uint64_t *)(vd + ih) = l; 3434 } 3435 } else { 3436 for (i = 0; i < oprsz_2; i += 1) { 3437 intptr_t il = H1(i); 3438 intptr_t ih = H1(oprsz - 1 - i); 3439 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); 3440 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); 3441 *(uint8_t *)(vd + il) = h; 3442 *(uint8_t *)(vd + ih) = l; 3443 } 3444 } 3445 } 3446 3447 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) 3448 { 3449 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3450 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3451 uint64_t *d = vd; 3452 intptr_t i; 3453 3454 if (oprsz <= 8) { 3455 uint64_t nn = *(uint64_t *)vn; 3456 int half = 4 * oprsz; 3457 3458 nn = 
extract64(nn, high * half, half); 3459 nn = expand_bits(nn, 0); 3460 d[0] = nn; 3461 } else { 3462 ARMPredicateReg tmp_n; 3463 3464 /* We produce output faster than we consume input. 3465 Therefore we must be mindful of possible overlap. */ 3466 if ((vn - vd) < (uintptr_t)oprsz) { 3467 vn = memcpy(&tmp_n, vn, oprsz); 3468 } 3469 if (high) { 3470 high = oprsz >> 1; 3471 } 3472 3473 if ((oprsz & 7) == 0) { 3474 uint32_t *n = vn; 3475 high >>= 2; 3476 3477 for (i = 0; i < oprsz / 8; i++) { 3478 uint64_t nn = n[H4(high + i)]; 3479 d[i] = expand_bits(nn, 0); 3480 } 3481 } else { 3482 uint16_t *d16 = vd; 3483 uint8_t *n = vn; 3484 3485 for (i = 0; i < oprsz / 2; i++) { 3486 uint16_t nn = n[H1(high + i)]; 3487 d16[H2(i)] = expand_bits(nn, 0); 3488 } 3489 } 3490 } 3491 } 3492 3493 #define DO_ZIP(NAME, TYPE, H) \ 3494 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3495 { \ 3496 intptr_t oprsz = simd_oprsz(desc); \ 3497 intptr_t odd_ofs = simd_data(desc); \ 3498 intptr_t i, oprsz_2 = oprsz / 2; \ 3499 ARMVectorReg tmp_n, tmp_m; \ 3500 /* We produce output faster than we consume input. \ 3501 Therefore we must be mindful of possible overlap. */ \ 3502 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ 3503 vn = memcpy(&tmp_n, vn, oprsz); \ 3504 } \ 3505 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3506 vm = memcpy(&tmp_m, vm, oprsz); \ 3507 } \ 3508 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ 3509 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \ 3510 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \ 3511 *(TYPE *)(vm + odd_ofs + H(i)); \ 3512 } \ 3513 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3514 memset(vd + oprsz - 16, 0, 16); \ 3515 } \ 3516 } 3517 3518 DO_ZIP(sve_zip_b, uint8_t, H1) 3519 DO_ZIP(sve_zip_h, uint16_t, H1_2) 3520 DO_ZIP(sve_zip_s, uint32_t, H1_4) 3521 DO_ZIP(sve_zip_d, uint64_t, H1_8) 3522 DO_ZIP(sve2_zip_q, Int128, ) 3523 3524 #define DO_UZP(NAME, TYPE, H) \ 3525 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3526 { \ 3527 intptr_t oprsz = simd_oprsz(desc); \ 3528 intptr_t odd_ofs = simd_data(desc); \ 3529 intptr_t i, p; \ 3530 ARMVectorReg tmp_m; \ 3531 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3532 vm = memcpy(&tmp_m, vm, oprsz); \ 3533 } \ 3534 i = 0, p = odd_ofs; \ 3535 do { \ 3536 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \ 3537 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3538 } while (p < oprsz); \ 3539 p -= oprsz; \ 3540 do { \ 3541 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \ 3542 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3543 } while (p < oprsz); \ 3544 tcg_debug_assert(i == oprsz); \ 3545 } 3546 3547 DO_UZP(sve_uzp_b, uint8_t, H1) 3548 DO_UZP(sve_uzp_h, uint16_t, H1_2) 3549 DO_UZP(sve_uzp_s, uint32_t, H1_4) 3550 DO_UZP(sve_uzp_d, uint64_t, H1_8) 3551 DO_UZP(sve2_uzp_q, Int128, ) 3552 3553 typedef void perseg_zzz_fn(void *vd, void *vn, void *vm, uint32_t desc); 3554 3555 static void do_perseg_zzz(void *vd, void *vn, void *vm, 3556 uint32_t desc, perseg_zzz_fn *fn) 3557 { 3558 intptr_t oprsz = simd_oprsz(desc); 3559 3560 desc = simd_desc(16, 16, simd_data(desc)); 3561 for (intptr_t i = 0; i < oprsz; i += 16) { 3562 fn(vd + i, vn + i, vm + i, desc); 3563 } 3564 } 3565 3566 #define DO_PERSEG_ZZZ(NAME, FUNC) \ 3567 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3568 { do_perseg_zzz(vd, vn, vm, desc, FUNC); } 3569 3570 DO_PERSEG_ZZZ(sve2p1_uzpq_b, helper_sve_uzp_b) 3571 DO_PERSEG_ZZZ(sve2p1_uzpq_h, helper_sve_uzp_h) 3572 DO_PERSEG_ZZZ(sve2p1_uzpq_s, helper_sve_uzp_s) 3573 
DO_PERSEG_ZZZ(sve2p1_uzpq_d, helper_sve_uzp_d) 3574 3575 DO_PERSEG_ZZZ(sve2p1_zipq_b, helper_sve_zip_b) 3576 DO_PERSEG_ZZZ(sve2p1_zipq_h, helper_sve_zip_h) 3577 DO_PERSEG_ZZZ(sve2p1_zipq_s, helper_sve_zip_s) 3578 DO_PERSEG_ZZZ(sve2p1_zipq_d, helper_sve_zip_d) 3579 3580 DO_PERSEG_ZZZ(sve2p1_tblq_b, helper_sve_tbl_b) 3581 DO_PERSEG_ZZZ(sve2p1_tblq_h, helper_sve_tbl_h) 3582 DO_PERSEG_ZZZ(sve2p1_tblq_s, helper_sve_tbl_s) 3583 DO_PERSEG_ZZZ(sve2p1_tblq_d, helper_sve_tbl_d) 3584 3585 DO_PERSEG_ZZZ(sve2p1_tbxq_b, helper_sve2_tbx_b) 3586 DO_PERSEG_ZZZ(sve2p1_tbxq_h, helper_sve2_tbx_h) 3587 DO_PERSEG_ZZZ(sve2p1_tbxq_s, helper_sve2_tbx_s) 3588 DO_PERSEG_ZZZ(sve2p1_tbxq_d, helper_sve2_tbx_d) 3589 3590 #undef DO_PERSEG_ZZZ 3591 3592 #define DO_TRN(NAME, TYPE, H) \ 3593 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3594 { \ 3595 intptr_t oprsz = simd_oprsz(desc); \ 3596 intptr_t odd_ofs = simd_data(desc); \ 3597 intptr_t i; \ 3598 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ 3599 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ 3600 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ 3601 *(TYPE *)(vd + H(i + 0)) = ae; \ 3602 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ 3603 } \ 3604 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3605 memset(vd + oprsz - 16, 0, 16); \ 3606 } \ 3607 } 3608 3609 DO_TRN(sve_trn_b, uint8_t, H1) 3610 DO_TRN(sve_trn_h, uint16_t, H1_2) 3611 DO_TRN(sve_trn_s, uint32_t, H1_4) 3612 DO_TRN(sve_trn_d, uint64_t, H1_8) 3613 DO_TRN(sve2_trn_q, Int128, ) 3614 3615 #undef DO_ZIP 3616 #undef DO_UZP 3617 #undef DO_TRN 3618 3619 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) 3620 { 3621 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; 3622 uint32_t *d = vd, *n = vn; 3623 uint8_t *pg = vg; 3624 3625 for (i = j = 0; i < opr_sz; i++) { 3626 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { 3627 d[H4(j)] = n[H4(i)]; 3628 j++; 3629 } 3630 } 3631 for (; j < opr_sz; j++) { 3632 d[H4(j)] = 0; 3633 } 3634 } 3635 3636 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) 3637 { 3638 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; 3639 uint64_t *d = vd, *n = vn; 3640 uint8_t *pg = vg; 3641 3642 for (i = j = 0; i < opr_sz; i++) { 3643 if (pg[H1(i)] & 1) { 3644 d[j] = n[i]; 3645 j++; 3646 } 3647 } 3648 for (; j < opr_sz; j++) { 3649 d[j] = 0; 3650 } 3651 } 3652 3653 /* Similar to the ARM LastActiveElement pseudocode function, except the 3654 * result is multiplied by the element size. This includes the not found 3655 * indication; e.g. not found for esz=3 is -8. 3656 */ 3657 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) 3658 { 3659 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 3660 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3661 3662 return last_active_element(vg, words, esz); 3663 } 3664 3665 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) 3666 { 3667 intptr_t opr_sz = simd_oprsz(desc) / 8; 3668 int esz = simd_data(desc); 3669 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; 3670 intptr_t i, first_i, last_i; 3671 ARMVectorReg tmp; 3672 3673 first_i = last_i = 0; 3674 first_g = last_g = 0; 3675 3676 /* Find the extent of the active elements within VG. 
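 * (The loop below walks the predicate words from the highest down,
 * remembering the lowest and highest non-zero guard words; the exact
 * bit positions are then recovered with ctz64/clz64 once the scan
 * is complete.)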
*/ 3677 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { 3678 pg = *(uint64_t *)(vg + i) & mask; 3679 if (pg) { 3680 if (last_g == 0) { 3681 last_g = pg; 3682 last_i = i; 3683 } 3684 first_g = pg; 3685 first_i = i; 3686 } 3687 } 3688 3689 len = 0; 3690 if (first_g != 0) { 3691 first_i = first_i * 8 + ctz64(first_g); 3692 last_i = last_i * 8 + 63 - clz64(last_g); 3693 len = last_i - first_i + (1 << esz); 3694 if (vd == vm) { 3695 vm = memcpy(&tmp, vm, opr_sz * 8); 3696 } 3697 swap_memmove(vd, vn + first_i, len); 3698 } 3699 swap_memmove(vd + len, vm, opr_sz * 8 - len); 3700 } 3701 3702 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, 3703 void *vg, uint32_t desc) 3704 { 3705 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3706 uint64_t *d = vd, *n = vn, *m = vm; 3707 uint8_t *pg = vg; 3708 3709 for (i = 0; i < opr_sz; i += 1) { 3710 uint64_t nn = n[i], mm = m[i]; 3711 uint64_t pp = expand_pred_b(pg[H1(i)]); 3712 d[i] = (nn & pp) | (mm & ~pp); 3713 } 3714 } 3715 3716 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, 3717 void *vg, uint32_t desc) 3718 { 3719 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3720 uint64_t *d = vd, *n = vn, *m = vm; 3721 uint8_t *pg = vg; 3722 3723 for (i = 0; i < opr_sz; i += 1) { 3724 uint64_t nn = n[i], mm = m[i]; 3725 uint64_t pp = expand_pred_h(pg[H1(i)]); 3726 d[i] = (nn & pp) | (mm & ~pp); 3727 } 3728 } 3729 3730 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, 3731 void *vg, uint32_t desc) 3732 { 3733 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3734 uint64_t *d = vd, *n = vn, *m = vm; 3735 uint8_t *pg = vg; 3736 3737 for (i = 0; i < opr_sz; i += 1) { 3738 uint64_t nn = n[i], mm = m[i]; 3739 uint64_t pp = expand_pred_s(pg[H1(i)]); 3740 d[i] = (nn & pp) | (mm & ~pp); 3741 } 3742 } 3743 3744 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, 3745 void *vg, uint32_t desc) 3746 { 3747 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3748 uint64_t *d = vd, *n = vn, *m = vm; 3749 uint8_t *pg = vg; 3750 3751 for (i = 0; i < opr_sz; i += 1) { 3752 uint64_t nn = n[i], mm = m[i]; 3753 d[i] = (pg[H1(i)] & 1 ? nn : mm); 3754 } 3755 } 3756 3757 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm, 3758 void *vg, uint32_t desc) 3759 { 3760 intptr_t i, opr_sz = simd_oprsz(desc) / 16; 3761 Int128 *d = vd, *n = vn, *m = vm; 3762 uint16_t *pg = vg; 3763 3764 for (i = 0; i < opr_sz; i += 1) { 3765 d[i] = (pg[H2(i)] & 1 ? n : m)[i]; 3766 } 3767 } 3768 3769 /* Two operand comparison controlled by a predicate. 3770 * ??? It is very tempting to want to be able to expand this inline 3771 * with x86 instructions, e.g. 3772 * 3773 * vcmpeqw zm, zn, %ymm0 3774 * vpmovmskb %ymm0, %eax 3775 * and $0x5555, %eax 3776 * and pg, %eax 3777 * 3778 * or even aarch64, e.g. 3779 * 3780 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 3781 * cmeq v0.8h, zn, zm 3782 * and v0.8h, v0.8h, mask 3783 * addv h0, v0.8h 3784 * and v0.8b, pg 3785 * 3786 * However, coming up with an abstraction that allows vector inputs and 3787 * a scalar output, and also handles the byte-ordering of sub-uint64_t 3788 * scalar outputs, is tricky. 
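 *
 * As a purely illustrative sketch of the bit packing performed by the
 * portable loop below (cmp16_chunk is hypothetical and not part of this
 * file; host byte-ordering via the H macros is ignored), for one 64-bit
 * chunk of 16-bit elements:
 *
 *     static uint64_t cmp16_chunk(const uint16_t *n, const uint16_t *m)
 *     {
 *         uint64_t out = 0;
 *         for (int e = 3; e >= 0; e--) {
 *             out <<= sizeof(uint16_t);         // 2 bits per element slot
 *             out |= (uint64_t)(n[e] == m[e]);  // result lands at bit 2*e
 *         }
 *         return out;                           // then AND with PG & MASK
 *     }
 *
 * which matches the predicate layout enforced by the 0x5555... MASK of
 * the _H instantiation.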
3789 */ 3790 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ 3791 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3792 { \ 3793 intptr_t opr_sz = simd_oprsz(desc); \ 3794 uint32_t flags = PREDTEST_INIT; \ 3795 intptr_t i = opr_sz; \ 3796 do { \ 3797 uint64_t out = 0, pg; \ 3798 do { \ 3799 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3800 TYPE nn = *(TYPE *)(vn + H(i)); \ 3801 TYPE mm = *(TYPE *)(vm + H(i)); \ 3802 out |= nn OP mm; \ 3803 } while (i & 63); \ 3804 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3805 out &= pg; \ 3806 *(uint64_t *)(vd + (i >> 3)) = out; \ 3807 flags = iter_predtest_bwd(out, pg, flags); \ 3808 } while (i > 0); \ 3809 return flags; \ 3810 } 3811 3812 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ 3813 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3814 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ 3815 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3816 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ 3817 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3818 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ 3819 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3820 3821 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) 3822 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) 3823 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) 3824 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) 3825 3826 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) 3827 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) 3828 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) 3829 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) 3830 3831 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) 3832 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) 3833 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) 3834 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) 3835 3836 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) 3837 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) 3838 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) 3839 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) 3840 3841 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) 3842 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) 3843 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) 3844 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) 3845 3846 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) 3847 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) 3848 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) 3849 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) 3850 3851 #undef DO_CMP_PPZZ_B 3852 #undef DO_CMP_PPZZ_H 3853 #undef DO_CMP_PPZZ_S 3854 #undef DO_CMP_PPZZ_D 3855 #undef DO_CMP_PPZZ 3856 3857 /* Similar, but the second source is "wide". 
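 * (Here "wide" means that ZM holds 64-bit elements: each 64-bit value of
 * ZM is loaded once and compared against every narrower element of ZN
 * that lives in the same 64-bit slot, as in the nested loops below.)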
*/ 3858 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ 3859 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3860 { \ 3861 intptr_t opr_sz = simd_oprsz(desc); \ 3862 uint32_t flags = PREDTEST_INIT; \ 3863 intptr_t i = opr_sz; \ 3864 do { \ 3865 uint64_t out = 0, pg; \ 3866 do { \ 3867 TYPEW mm = *(TYPEW *)(vm + i - 8); \ 3868 do { \ 3869 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3870 TYPE nn = *(TYPE *)(vn + H(i)); \ 3871 out |= nn OP mm; \ 3872 } while (i & 7); \ 3873 } while (i & 63); \ 3874 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3875 out &= pg; \ 3876 *(uint64_t *)(vd + (i >> 3)) = out; \ 3877 flags = iter_predtest_bwd(out, pg, flags); \ 3878 } while (i > 0); \ 3879 return flags; \ 3880 } 3881 3882 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ 3883 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) 3884 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ 3885 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) 3886 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ 3887 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) 3888 3889 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==) 3890 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==) 3891 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==) 3892 3893 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=) 3894 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=) 3895 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=) 3896 3897 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) 3898 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) 3899 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) 3900 3901 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) 3902 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) 3903 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) 3904 3905 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) 3906 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) 3907 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) 3908 3909 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) 3910 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) 3911 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) 3912 3913 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) 3914 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) 3915 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) 3916 3917 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) 3918 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) 3919 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) 3920 3921 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) 3922 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) 3923 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) 3924 3925 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) 3926 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) 3927 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) 3928 3929 #undef DO_CMP_PPZW_B 3930 #undef DO_CMP_PPZW_H 3931 #undef DO_CMP_PPZW_S 3932 #undef DO_CMP_PPZW 3933 3934 /* Similar, but the second source is immediate. 
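 * (The immediate operand comes from simd_data(desc) and is converted to
 * TYPE, so the single macro below covers both the signed and unsigned
 * instantiations.)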
*/ 3935 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ 3936 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 3937 { \ 3938 intptr_t opr_sz = simd_oprsz(desc); \ 3939 uint32_t flags = PREDTEST_INIT; \ 3940 TYPE mm = simd_data(desc); \ 3941 intptr_t i = opr_sz; \ 3942 do { \ 3943 uint64_t out = 0, pg; \ 3944 do { \ 3945 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3946 TYPE nn = *(TYPE *)(vn + H(i)); \ 3947 out |= nn OP mm; \ 3948 } while (i & 63); \ 3949 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3950 out &= pg; \ 3951 *(uint64_t *)(vd + (i >> 3)) = out; \ 3952 flags = iter_predtest_bwd(out, pg, flags); \ 3953 } while (i > 0); \ 3954 return flags; \ 3955 } 3956 3957 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \ 3958 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3959 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \ 3960 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3961 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \ 3962 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3963 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \ 3964 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3965 3966 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) 3967 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) 3968 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) 3969 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) 3970 3971 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) 3972 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) 3973 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) 3974 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) 3975 3976 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) 3977 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) 3978 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) 3979 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) 3980 3981 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) 3982 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) 3983 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) 3984 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) 3985 3986 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) 3987 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) 3988 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) 3989 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) 3990 3991 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) 3992 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) 3993 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) 3994 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) 3995 3996 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) 3997 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) 3998 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) 3999 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) 4000 4001 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) 4002 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) 4003 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) 4004 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) 4005 4006 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) 4007 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) 4008 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) 4009 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) 4010 4011 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) 4012 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) 4013 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) 4014 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) 4015 4016 #undef DO_CMP_PPZI_B 4017 #undef DO_CMP_PPZI_H 4018 #undef DO_CMP_PPZI_S 4019 #undef DO_CMP_PPZI_D 4020 #undef DO_CMP_PPZI 4021 4022 /* Similar to the ARM LastActive pseudocode function. 
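 * (It returns whether the highest active element, as selected by the
 * highest set bit of VG, is true in VD: pow2floor isolates that guard
 * bit within the highest non-zero guard word.)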
*/ 4023 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz) 4024 { 4025 intptr_t i; 4026 4027 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) { 4028 uint64_t pg = *(uint64_t *)(vg + i); 4029 if (pg) { 4030 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0; 4031 } 4032 } 4033 return 0; 4034 } 4035 4036 /* Compute a mask into RETB that is true for all G, up to and including 4037 * (if after) or excluding (if !after) the first G & N. 4038 * Return true if BRK found. 4039 */ 4040 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g, 4041 bool brk, bool after) 4042 { 4043 uint64_t b; 4044 4045 if (brk) { 4046 b = 0; 4047 } else if ((g & n) == 0) { 4048 /* For all G, no N are set; break not found. */ 4049 b = g; 4050 } else { 4051 /* Break somewhere in N. Locate it. */ 4052 b = g & n; /* guard true, pred true */ 4053 b = b & -b; /* first such */ 4054 if (after) { 4055 b = b | (b - 1); /* break after same */ 4056 } else { 4057 b = b - 1; /* break before same */ 4058 } 4059 brk = true; 4060 } 4061 4062 *retb = b; 4063 return brk; 4064 } 4065 4066 /* Compute a zeroing BRK. */ 4067 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g, 4068 intptr_t oprsz, bool after) 4069 { 4070 bool brk = false; 4071 intptr_t i; 4072 4073 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 4074 uint64_t this_b, this_g = g[i]; 4075 4076 brk = compute_brk(&this_b, n[i], this_g, brk, after); 4077 d[i] = this_b & this_g; 4078 } 4079 } 4080 4081 /* Likewise, but also compute flags. */ 4082 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g, 4083 intptr_t oprsz, bool after) 4084 { 4085 uint32_t flags = PREDTEST_INIT; 4086 bool brk = false; 4087 intptr_t i; 4088 4089 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 4090 uint64_t this_b, this_d, this_g = g[i]; 4091 4092 brk = compute_brk(&this_b, n[i], this_g, brk, after); 4093 d[i] = this_d = this_b & this_g; 4094 flags = iter_predtest_fwd(this_d, this_g, flags); 4095 } 4096 return flags; 4097 } 4098 4099 /* Compute a merging BRK. */ 4100 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g, 4101 intptr_t oprsz, bool after) 4102 { 4103 bool brk = false; 4104 intptr_t i; 4105 4106 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 4107 uint64_t this_b, this_g = g[i]; 4108 4109 brk = compute_brk(&this_b, n[i], this_g, brk, after); 4110 d[i] = (this_b & this_g) | (d[i] & ~this_g); 4111 } 4112 } 4113 4114 /* Likewise, but also compute flags. 
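 * (Identical to compute_brk_m above, except that each merged result word
 * is also fed through iter_predtest_fwd to accumulate the NZCV flags.)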
*/
4115 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
4116 intptr_t oprsz, bool after)
4117 {
4118 uint32_t flags = PREDTEST_INIT;
4119 bool brk = false;
4120 intptr_t i;
4121
4122 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
4123 uint64_t this_b, this_d = d[i], this_g = g[i];
4124
4125 brk = compute_brk(&this_b, n[i], this_g, brk, after);
4126 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
4127 flags = iter_predtest_fwd(this_d, this_g, flags);
4128 }
4129 return flags;
4130 }
4131
4132 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
4133 uint32_t pred_desc)
4134 {
4135 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4136 if (last_active_pred(vn, vg, oprsz)) {
4137 compute_brk_z(vd, vm, vg, oprsz, true);
4138 } else {
4139 memset(vd, 0, sizeof(ARMPredicateReg));
4140 }
4141 }
4142
4143 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
4144 uint32_t pred_desc)
4145 {
4146 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4147 if (last_active_pred(vn, vg, oprsz)) {
4148 return compute_brks_z(vd, vm, vg, oprsz, true);
4149 } else {
4150 memset(vd, 0, sizeof(ARMPredicateReg));
4151 return PREDTEST_INIT;
4152 }
4153 }
4154
4155 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
4156 uint32_t pred_desc)
4157 {
4158 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4159 if (last_active_pred(vn, vg, oprsz)) {
4160 compute_brk_z(vd, vm, vg, oprsz, false);
4161 } else {
4162 memset(vd, 0, sizeof(ARMPredicateReg));
4163 }
4164 }
4165
4166 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4167 uint32_t pred_desc)
4168 {
4169 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4170 if (last_active_pred(vn, vg, oprsz)) {
4171 return compute_brks_z(vd, vm, vg, oprsz, false);
4172 } else {
4173 memset(vd, 0, sizeof(ARMPredicateReg));
4174 return PREDTEST_INIT;
4175 }
4176 }
4177
4178 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4179 {
4180 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4181 compute_brk_z(vd, vn, vg, oprsz, true);
4182 }
4183
4184 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4185 {
4186 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4187 return compute_brks_z(vd, vn, vg, oprsz, true);
4188 }
4189
4190 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4191 {
4192 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4193 compute_brk_z(vd, vn, vg, oprsz, false);
4194 }
4195
4196 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4197 {
4198 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4199 return compute_brks_z(vd, vn, vg, oprsz, false);
4200 }
4201
4202 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4203 {
4204 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4205 compute_brk_m(vd, vn, vg, oprsz, true);
4206 }
4207
4208 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4209 {
4210 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4211 return compute_brks_m(vd, vn, vg, oprsz, true);
4212 }
4213
4214 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4215 {
4216 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4217 compute_brk_m(vd, vn, vg, oprsz, false);
4218 }
4219
4220 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4221 {
4222 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4223 return
compute_brks_m(vd, vn, vg, oprsz, false); 4224 } 4225 4226 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4227 { 4228 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4229 if (!last_active_pred(vn, vg, oprsz)) { 4230 memset(vd, 0, sizeof(ARMPredicateReg)); 4231 } 4232 } 4233 4234 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4235 { 4236 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4237 if (last_active_pred(vn, vg, oprsz)) { 4238 ARMPredicateReg *d = vd; 4239 uint32_t flags = PREDTEST_INIT; 4240 intptr_t i; 4241 4242 /* As if PredTest(Ones(PL), D, MO_8). */ 4243 for (i = 0; i < oprsz / 8; i++) { 4244 flags = iter_predtest_fwd(d->p[i], -1, flags); 4245 } 4246 if (oprsz & 7) { 4247 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); 4248 flags = iter_predtest_fwd(d->p[i], mask, flags); 4249 } 4250 return flags; 4251 } 4252 memset(vd, 0, sizeof(ARMPredicateReg)); 4253 return PREDTEST_INIT; 4254 } 4255 4256 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) 4257 { 4258 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 4259 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4260 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; 4261 intptr_t i; 4262 4263 for (i = 0; i < words; ++i) { 4264 uint64_t t = n[i] & g[i] & mask; 4265 sum += ctpop64(t); 4266 } 4267 return sum; 4268 } 4269 4270 uint64_t HELPER(sve2p1_cntp_c)(uint32_t png, uint32_t desc) 4271 { 4272 int pl = FIELD_EX32(desc, PREDDESC, OPRSZ); 4273 int vl = pl * 8; 4274 unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ); 4275 int lg2_width = FIELD_EX32(desc, PREDDESC, DATA) + 1; 4276 DecodeCounter p = decode_counter(png, vl, v_esz); 4277 unsigned maxelem = (vl << lg2_width) >> v_esz; 4278 unsigned count = p.count; 4279 4280 if (p.invert) { 4281 if (count >= maxelem) { 4282 return 0; 4283 } 4284 count = maxelem - count; 4285 } else { 4286 count = MIN(count, maxelem); 4287 } 4288 return count >> p.lg2_stride; 4289 } 4290 4291 /* C.f. Arm pseudocode EncodePredCount */ 4292 static uint64_t encode_pred_count(uint32_t elements, uint32_t count, 4293 uint32_t esz, bool invert) 4294 { 4295 uint32_t pred; 4296 4297 if (count == 0) { 4298 return 0; 4299 } 4300 if (invert) { 4301 count = elements - count; 4302 } else if (count == elements) { 4303 count = 0; 4304 invert = true; 4305 } 4306 4307 pred = (count << 1) | 1; 4308 pred <<= esz; 4309 pred |= invert << 15; 4310 4311 return pred; 4312 } 4313 4314 /* C.f. Arm pseudocode PredCountTest */ 4315 static uint32_t pred_count_test(uint32_t elements, uint32_t count, bool invert) 4316 { 4317 uint32_t flags; 4318 4319 if (count == 0) { 4320 flags = 1; /* !N, Z, C */ 4321 } else if (!invert) { 4322 flags = (1u << 31) | 2; /* N, !Z */ 4323 flags |= count != elements; /* C */ 4324 } else { 4325 flags = 2; /* !Z, !C */ 4326 flags |= (count == elements) << 31; /* N */ 4327 } 4328 return flags; 4329 } 4330 4331 /* D must be cleared on entry. */ 4332 static void do_whilel(ARMPredicateReg *d, uint64_t esz_mask, 4333 uint32_t count, uint32_t oprbits) 4334 { 4335 tcg_debug_assert(count <= oprbits); 4336 if (count) { 4337 uint32_t i; 4338 4339 /* Set all of the requested bits. 
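 * For example (values chosen for illustration only): with esz = MO_32,
 * so that esz_mask = 0x1111111111111111ull, a caller wanting three
 * active 32-bit elements passes count = 3 << 2 = 12, and the code below
 * produces d->p[0] = MAKE_64BIT_MASK(0, 12) & esz_mask = 0x111, i.e.
 * predicate bits 0, 4 and 8 set.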
*/ 4340 for (i = 0; i < count / 64; ++i) { 4341 d->p[i] = esz_mask; 4342 } 4343 if (count & 63) { 4344 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; 4345 } 4346 } 4347 } 4348 4349 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) 4350 { 4351 uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4352 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4353 uint32_t oprbits = oprsz * 8; 4354 uint64_t esz_mask = pred_esz_masks[esz]; 4355 ARMPredicateReg *d = vd; 4356 4357 count <<= esz; 4358 memset(d, 0, sizeof(*d)); 4359 do_whilel(d, esz_mask, count, oprbits); 4360 return pred_count_test(oprbits, count, false); 4361 } 4362 4363 uint32_t HELPER(sve_while2l)(void *vd, uint32_t count, uint32_t pred_desc) 4364 { 4365 uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4366 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4367 uint32_t oprbits = oprsz * 8; 4368 uint64_t esz_mask = pred_esz_masks[esz]; 4369 ARMPredicateReg *d = vd; 4370 4371 count <<= esz; 4372 memset(d, 0, 2 * sizeof(*d)); 4373 if (count <= oprbits) { 4374 do_whilel(&d[0], esz_mask, count, oprbits); 4375 } else { 4376 do_whilel(&d[0], esz_mask, oprbits, oprbits); 4377 do_whilel(&d[1], esz_mask, count - oprbits, oprbits); 4378 } 4379 4380 return pred_count_test(2 * oprbits, count, false); 4381 } 4382 4383 uint32_t HELPER(sve_whilecl)(void *vd, uint32_t count, uint32_t pred_desc) 4384 { 4385 uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4386 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4387 uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA); 4388 uint32_t vl = pl * 8; 4389 uint32_t elements = (vl >> esz) << scale; 4390 ARMPredicateReg *d = vd; 4391 4392 *d = (ARMPredicateReg) { 4393 .p[0] = encode_pred_count(elements, count, esz, false) 4394 }; 4395 return pred_count_test(elements, count, false); 4396 } 4397 4398 /* D must be cleared on entry. 
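 * Unlike do_whilel above, the active bits are placed at the top of the
 * predicate: bits [oprbits - count, oprbits) are set, masked by esz_mask.
 * The first word written may be partial at its low end, and the top word
 * is truncated to oprbits when oprbits is not a multiple of 64.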
*/ 4399 static void do_whileg(ARMPredicateReg *d, uint64_t esz_mask, 4400 uint32_t count, uint32_t oprbits) 4401 { 4402 tcg_debug_assert(count <= oprbits); 4403 if (count) { 4404 uint32_t i, invcount = oprbits - count; 4405 uint64_t bits = esz_mask & MAKE_64BIT_MASK(invcount & 63, 64); 4406 4407 for (i = invcount / 64; i < oprbits / 64; ++i) { 4408 d->p[i] = bits; 4409 bits = esz_mask; 4410 } 4411 if (oprbits & 63) { 4412 d->p[i] = bits & MAKE_64BIT_MASK(0, oprbits & 63); 4413 } 4414 } 4415 } 4416 4417 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) 4418 { 4419 uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4420 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4421 uint32_t oprbits = oprsz * 8; 4422 uint64_t esz_mask = pred_esz_masks[esz]; 4423 ARMPredicateReg *d = vd; 4424 4425 count <<= esz; 4426 memset(d, 0, sizeof(*d)); 4427 do_whileg(d, esz_mask, count, oprbits); 4428 return pred_count_test(oprbits, count, true); 4429 } 4430 4431 uint32_t HELPER(sve_while2g)(void *vd, uint32_t count, uint32_t pred_desc) 4432 { 4433 uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4434 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4435 uint32_t oprbits = oprsz * 8; 4436 uint64_t esz_mask = pred_esz_masks[esz]; 4437 ARMPredicateReg *d = vd; 4438 4439 count <<= esz; 4440 memset(d, 0, 2 * sizeof(*d)); 4441 if (count <= oprbits) { 4442 do_whileg(&d[1], esz_mask, count, oprbits); 4443 } else { 4444 do_whilel(&d[1], esz_mask, oprbits, oprbits); 4445 do_whileg(&d[0], esz_mask, count - oprbits, oprbits); 4446 } 4447 4448 return pred_count_test(2 * oprbits, count, true); 4449 } 4450 4451 uint32_t HELPER(sve_whilecg)(void *vd, uint32_t count, uint32_t pred_desc) 4452 { 4453 uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4454 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4455 uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA); 4456 uint32_t vl = pl * 8; 4457 uint32_t elements = (vl >> esz) << scale; 4458 ARMPredicateReg *d = vd; 4459 4460 *d = (ARMPredicateReg) { 4461 .p[0] = encode_pred_count(elements, count, esz, true) 4462 }; 4463 return pred_count_test(elements, count, true); 4464 } 4465 4466 /* Recursive reduction on a function; 4467 * C.f. the ARM ARM function ReducePredicated. 4468 * 4469 * While it would be possible to write this without the DATA temporary, 4470 * it is much simpler to process the predicate register this way. 4471 * The recursion is bounded to depth 7 (128 fp16 elements), so there's 4472 * little to gain with a more complex non-recursive form. 4473 */ 4474 #define DO_REDUCE(NAME, SUF, TYPE, H, FUNC, IDENT) \ 4475 static TYPE FUNC##_reduce(TYPE *data, float_status *status, uintptr_t n) \ 4476 { \ 4477 if (n == 1) { \ 4478 return *data; \ 4479 } else { \ 4480 uintptr_t half = n / 2; \ 4481 TYPE lo = FUNC##_reduce(data, status, half); \ 4482 TYPE hi = FUNC##_reduce(data + half, status, half); \ 4483 return FUNC(lo, hi, status); \ 4484 } \ 4485 } \ 4486 uint64_t helper_sve_##NAME##v_##SUF(void *vn, void *vg, \ 4487 float_status *status, uint32_t desc) \ 4488 { \ 4489 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ 4490 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ 4491 TYPE ident = IDENT; \ 4492 for (i = 0; i < oprsz; ) { \ 4493 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 4494 do { \ 4495 TYPE nn = *(TYPE *)(vn + H(i)); \ 4496 *(TYPE *)((void *)data + i) = (pg & 1 ? 
nn : ident); \ 4497 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 4498 } while (i & 15); \ 4499 } \ 4500 for (; i < maxsz; i += sizeof(TYPE)) { \ 4501 *(TYPE *)((void *)data + i) = ident; \ 4502 } \ 4503 return FUNC##_reduce(data, status, maxsz / sizeof(TYPE)); \ 4504 } \ 4505 void helper_sve2p1_##NAME##qv_##SUF(void *vd, void *vn, void *vg, \ 4506 float_status *status, uint32_t desc) \ 4507 { \ 4508 unsigned oprsz = simd_oprsz(desc), segments = oprsz / 16; \ 4509 TYPE ident = IDENT; \ 4510 for (unsigned e = 0; e < 16; e += sizeof(TYPE)) { \ 4511 TYPE data[ARM_MAX_VQ]; \ 4512 for (unsigned s = 0; s < segments; s++) { \ 4513 uint16_t pg = *(uint16_t *)(vg + H1_2(s * 2)); \ 4514 TYPE nn = *(TYPE *)(vn + (s * 16 + H(e))); \ 4515 data[s] = (pg >> e) & 1 ? nn : ident; \ 4516 } \ 4517 *(TYPE *)(vd + H(e)) = FUNC##_reduce(data, status, segments); \ 4518 } \ 4519 clear_tail(vd, 16, simd_maxsz(desc)); \ 4520 } 4521 4522 DO_REDUCE(fadd,h, float16, H1_2, float16_add, float16_zero) 4523 DO_REDUCE(fadd,s, float32, H1_4, float32_add, float32_zero) 4524 DO_REDUCE(fadd,d, float64, H1_8, float64_add, float64_zero) 4525 4526 /* 4527 * We can't avoid the function call for the default NaN value, because 4528 * it changes when FPCR.AH is set. 4529 */ 4530 DO_REDUCE(fminnm,h, float16, H1_2, float16_minnum, float16_default_nan(status)) 4531 DO_REDUCE(fminnm,s, float32, H1_4, float32_minnum, float32_default_nan(status)) 4532 DO_REDUCE(fminnm,d, float64, H1_8, float64_minnum, float64_default_nan(status)) 4533 4534 DO_REDUCE(fmaxnm,h, float16, H1_2, float16_maxnum, float16_default_nan(status)) 4535 DO_REDUCE(fmaxnm,s, float32, H1_4, float32_maxnum, float32_default_nan(status)) 4536 DO_REDUCE(fmaxnm,d, float64, H1_8, float64_maxnum, float64_default_nan(status)) 4537 4538 DO_REDUCE(fmin,h, float16, H1_2, float16_min, float16_infinity) 4539 DO_REDUCE(fmin,s, float32, H1_4, float32_min, float32_infinity) 4540 DO_REDUCE(fmin,d, float64, H1_8, float64_min, float64_infinity) 4541 4542 DO_REDUCE(fmax,h, float16, H1_2, float16_max, float16_chs(float16_infinity)) 4543 DO_REDUCE(fmax,s, float32, H1_4, float32_max, float32_chs(float32_infinity)) 4544 DO_REDUCE(fmax,d, float64, H1_8, float64_max, float64_chs(float64_infinity)) 4545 4546 DO_REDUCE(ah_fmin,h, float16, H1_2, helper_vfp_ah_minh, float16_infinity) 4547 DO_REDUCE(ah_fmin,s, float32, H1_4, helper_vfp_ah_mins, float32_infinity) 4548 DO_REDUCE(ah_fmin,d, float64, H1_8, helper_vfp_ah_mind, float64_infinity) 4549 4550 DO_REDUCE(ah_fmax,h, float16, H1_2, helper_vfp_ah_maxh, 4551 float16_chs(float16_infinity)) 4552 DO_REDUCE(ah_fmax,s, float32, H1_4, helper_vfp_ah_maxs, 4553 float32_chs(float32_infinity)) 4554 DO_REDUCE(ah_fmax,d, float64, H1_8, helper_vfp_ah_maxd, 4555 float64_chs(float64_infinity)) 4556 4557 #undef DO_REDUCE 4558 4559 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg, 4560 float_status *status, uint32_t desc) 4561 { 4562 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4563 float16 result = nn; 4564 4565 do { 4566 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4567 do { 4568 if (pg & 1) { 4569 float16 mm = *(float16 *)(vm + H1_2(i)); 4570 result = float16_add(result, mm, status); 4571 } 4572 i += sizeof(float16), pg >>= sizeof(float16); 4573 } while (i & 15); 4574 } while (i < opr_sz); 4575 4576 return result; 4577 } 4578 4579 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg, 4580 float_status *status, uint32_t desc) 4581 { 4582 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4583 float32 result = nn; 4584 4585 do { 4586 uint16_t pg = *(uint16_t 
*)(vg + H1_2(i >> 3)); 4587 do { 4588 if (pg & 1) { 4589 float32 mm = *(float32 *)(vm + H1_2(i)); 4590 result = float32_add(result, mm, status); 4591 } 4592 i += sizeof(float32), pg >>= sizeof(float32); 4593 } while (i & 15); 4594 } while (i < opr_sz); 4595 4596 return result; 4597 } 4598 4599 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg, 4600 float_status *status, uint32_t desc) 4601 { 4602 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8; 4603 uint64_t *m = vm; 4604 uint8_t *pg = vg; 4605 4606 for (i = 0; i < opr_sz; i++) { 4607 if (pg[H1(i)] & 1) { 4608 nn = float64_add(nn, m[i], status); 4609 } 4610 } 4611 4612 return nn; 4613 } 4614 4615 /* Fully general three-operand expander, controlled by a predicate, 4616 * With the extra float_status parameter. 4617 */ 4618 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \ 4619 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4620 float_status *status, uint32_t desc) \ 4621 { \ 4622 intptr_t i = simd_oprsz(desc); \ 4623 uint64_t *g = vg; \ 4624 do { \ 4625 uint64_t pg = g[(i - 1) >> 6]; \ 4626 do { \ 4627 i -= sizeof(TYPE); \ 4628 if (likely((pg >> (i & 63)) & 1)) { \ 4629 TYPE nn = *(TYPE *)(vn + H(i)); \ 4630 TYPE mm = *(TYPE *)(vm + H(i)); \ 4631 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4632 } \ 4633 } while (i & 63); \ 4634 } while (i != 0); \ 4635 } 4636 4637 DO_ZPZZ_FP(sve_fadd_b16, uint16_t, H1_2, bfloat16_add) 4638 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) 4639 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) 4640 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) 4641 4642 DO_ZPZZ_FP(sve_fsub_b16, uint16_t, H1_2, bfloat16_sub) 4643 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) 4644 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) 4645 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) 4646 4647 DO_ZPZZ_FP(sve_fmul_b16, uint16_t, H1_2, bfloat16_mul) 4648 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) 4649 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) 4650 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) 4651 4652 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) 4653 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) 4654 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) 4655 4656 DO_ZPZZ_FP(sve_fmin_b16, uint16_t, H1_2, bfloat16_min) 4657 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) 4658 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) 4659 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) 4660 4661 DO_ZPZZ_FP(sve_fmax_b16, uint16_t, H1_2, bfloat16_max) 4662 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) 4663 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) 4664 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) 4665 4666 DO_ZPZZ_FP(sve_ah_fmin_b16, uint16_t, H1_2, helper_sme2_ah_fmin_b16) 4667 DO_ZPZZ_FP(sve_ah_fmin_h, uint16_t, H1_2, helper_vfp_ah_minh) 4668 DO_ZPZZ_FP(sve_ah_fmin_s, uint32_t, H1_4, helper_vfp_ah_mins) 4669 DO_ZPZZ_FP(sve_ah_fmin_d, uint64_t, H1_8, helper_vfp_ah_mind) 4670 4671 DO_ZPZZ_FP(sve_ah_fmax_b16, uint16_t, H1_2, helper_sme2_ah_fmax_b16) 4672 DO_ZPZZ_FP(sve_ah_fmax_h, uint16_t, H1_2, helper_vfp_ah_maxh) 4673 DO_ZPZZ_FP(sve_ah_fmax_s, uint32_t, H1_4, helper_vfp_ah_maxs) 4674 DO_ZPZZ_FP(sve_ah_fmax_d, uint64_t, H1_8, helper_vfp_ah_maxd) 4675 4676 DO_ZPZZ_FP(sve_fminnum_b16, uint16_t, H1_2, bfloat16_minnum) 4677 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) 4678 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) 4679 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) 4680 4681 DO_ZPZZ_FP(sve_fmaxnum_b16, 
uint16_t, H1_2, bfloat16_maxnum) 4682 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) 4683 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) 4684 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) 4685 4686 static inline float16 abd_h(float16 a, float16 b, float_status *s) 4687 { 4688 return float16_abs(float16_sub(a, b, s)); 4689 } 4690 4691 static inline float32 abd_s(float32 a, float32 b, float_status *s) 4692 { 4693 return float32_abs(float32_sub(a, b, s)); 4694 } 4695 4696 static inline float64 abd_d(float64 a, float64 b, float_status *s) 4697 { 4698 return float64_abs(float64_sub(a, b, s)); 4699 } 4700 4701 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */ 4702 static float16 ah_abd_h(float16 op1, float16 op2, float_status *stat) 4703 { 4704 float16 r = float16_sub(op1, op2, stat); 4705 return float16_is_any_nan(r) ? r : float16_abs(r); 4706 } 4707 4708 static float32 ah_abd_s(float32 op1, float32 op2, float_status *stat) 4709 { 4710 float32 r = float32_sub(op1, op2, stat); 4711 return float32_is_any_nan(r) ? r : float32_abs(r); 4712 } 4713 4714 static float64 ah_abd_d(float64 op1, float64 op2, float_status *stat) 4715 { 4716 float64 r = float64_sub(op1, op2, stat); 4717 return float64_is_any_nan(r) ? r : float64_abs(r); 4718 } 4719 4720 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) 4721 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) 4722 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) 4723 DO_ZPZZ_FP(sve_ah_fabd_h, uint16_t, H1_2, ah_abd_h) 4724 DO_ZPZZ_FP(sve_ah_fabd_s, uint32_t, H1_4, ah_abd_s) 4725 DO_ZPZZ_FP(sve_ah_fabd_d, uint64_t, H1_8, ah_abd_d) 4726 4727 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) 4728 { 4729 int b_int = MIN(MAX(b, INT_MIN), INT_MAX); 4730 return float64_scalbn(a, b_int, s); 4731 } 4732 4733 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) 4734 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) 4735 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d) 4736 4737 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) 4738 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) 4739 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd) 4740 4741 #undef DO_ZPZZ_FP 4742 4743 /* Three-operand expander, with one scalar operand, controlled by 4744 * a predicate, with the extra float_status parameter. 
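 * The scalar is converted to TYPE once, outside the loop, and used as the
 * second operand for every active element; the iteration order (top of
 * the vector downward, 64 predicate bits at a time) matches DO_ZPZZ_FP
 * above.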
4745 */ 4746 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \ 4747 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ 4748 float_status *status, uint32_t desc) \ 4749 { \ 4750 intptr_t i = simd_oprsz(desc); \ 4751 uint64_t *g = vg; \ 4752 TYPE mm = scalar; \ 4753 do { \ 4754 uint64_t pg = g[(i - 1) >> 6]; \ 4755 do { \ 4756 i -= sizeof(TYPE); \ 4757 if (likely((pg >> (i & 63)) & 1)) { \ 4758 TYPE nn = *(TYPE *)(vn + H(i)); \ 4759 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4760 } \ 4761 } while (i & 63); \ 4762 } while (i != 0); \ 4763 } 4764 4765 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) 4766 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) 4767 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add) 4768 4769 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) 4770 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) 4771 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub) 4772 4773 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) 4774 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) 4775 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul) 4776 4777 static inline float16 subr_h(float16 a, float16 b, float_status *s) 4778 { 4779 return float16_sub(b, a, s); 4780 } 4781 4782 static inline float32 subr_s(float32 a, float32 b, float_status *s) 4783 { 4784 return float32_sub(b, a, s); 4785 } 4786 4787 static inline float64 subr_d(float64 a, float64 b, float_status *s) 4788 { 4789 return float64_sub(b, a, s); 4790 } 4791 4792 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) 4793 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) 4794 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d) 4795 4796 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) 4797 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) 4798 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum) 4799 4800 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) 4801 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) 4802 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum) 4803 4804 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) 4805 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) 4806 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max) 4807 4808 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) 4809 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) 4810 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) 4811 4812 DO_ZPZS_FP(sve_ah_fmaxs_h, float16, H1_2, helper_vfp_ah_maxh) 4813 DO_ZPZS_FP(sve_ah_fmaxs_s, float32, H1_4, helper_vfp_ah_maxs) 4814 DO_ZPZS_FP(sve_ah_fmaxs_d, float64, H1_8, helper_vfp_ah_maxd) 4815 4816 DO_ZPZS_FP(sve_ah_fmins_h, float16, H1_2, helper_vfp_ah_minh) 4817 DO_ZPZS_FP(sve_ah_fmins_s, float32, H1_4, helper_vfp_ah_mins) 4818 DO_ZPZS_FP(sve_ah_fmins_d, float64, H1_8, helper_vfp_ah_mind) 4819 4820 /* Fully general two-operand expander, controlled by a predicate, 4821 * With the extra float_status parameter. 4822 */ 4823 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \ 4824 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 4825 float_status *status, uint32_t desc) \ 4826 { \ 4827 intptr_t i = simd_oprsz(desc); \ 4828 uint64_t *g = vg; \ 4829 do { \ 4830 uint64_t pg = g[(i - 1) >> 6]; \ 4831 do { \ 4832 i -= sizeof(TYPE); \ 4833 if (likely((pg >> (i & 63)) & 1)) { \ 4834 TYPE nn = *(TYPE *)(vn + H(i)); \ 4835 *(TYPE *)(vd + H(i)) = OP(nn, status); \ 4836 } \ 4837 } while (i & 63); \ 4838 } while (i != 0); \ 4839 } 4840 4841 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore 4842 * FZ16. 
When converting from fp16, this affects flushing input denormals; 4843 * when converting to fp16, this affects flushing output denormals. 4844 */ 4845 float32 sve_f16_to_f32(float16 f, float_status *fpst) 4846 { 4847 bool save = get_flush_inputs_to_zero(fpst); 4848 float32 ret; 4849 4850 set_flush_inputs_to_zero(false, fpst); 4851 ret = float16_to_float32(f, true, fpst); 4852 set_flush_inputs_to_zero(save, fpst); 4853 return ret; 4854 } 4855 4856 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) 4857 { 4858 bool save = get_flush_inputs_to_zero(fpst); 4859 float64 ret; 4860 4861 set_flush_inputs_to_zero(false, fpst); 4862 ret = float16_to_float64(f, true, fpst); 4863 set_flush_inputs_to_zero(save, fpst); 4864 return ret; 4865 } 4866 4867 float16 sve_f32_to_f16(float32 f, float_status *fpst) 4868 { 4869 bool save = get_flush_to_zero(fpst); 4870 float16 ret; 4871 4872 set_flush_to_zero(false, fpst); 4873 ret = float32_to_float16(f, true, fpst); 4874 set_flush_to_zero(save, fpst); 4875 return ret; 4876 } 4877 4878 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst) 4879 { 4880 bool save = get_flush_to_zero(fpst); 4881 float16 ret; 4882 4883 set_flush_to_zero(false, fpst); 4884 ret = float64_to_float16(f, true, fpst); 4885 set_flush_to_zero(save, fpst); 4886 return ret; 4887 } 4888 4889 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s) 4890 { 4891 if (float16_is_any_nan(f)) { 4892 float_raise(float_flag_invalid, s); 4893 return 0; 4894 } 4895 return float16_to_int16_round_to_zero(f, s); 4896 } 4897 4898 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s) 4899 { 4900 if (float16_is_any_nan(f)) { 4901 float_raise(float_flag_invalid, s); 4902 return 0; 4903 } 4904 return float16_to_int64_round_to_zero(f, s); 4905 } 4906 4907 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s) 4908 { 4909 if (float32_is_any_nan(f)) { 4910 float_raise(float_flag_invalid, s); 4911 return 0; 4912 } 4913 return float32_to_int64_round_to_zero(f, s); 4914 } 4915 4916 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s) 4917 { 4918 if (float64_is_any_nan(f)) { 4919 float_raise(float_flag_invalid, s); 4920 return 0; 4921 } 4922 return float64_to_int64_round_to_zero(f, s); 4923 } 4924 4925 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s) 4926 { 4927 if (float16_is_any_nan(f)) { 4928 float_raise(float_flag_invalid, s); 4929 return 0; 4930 } 4931 return float16_to_uint16_round_to_zero(f, s); 4932 } 4933 4934 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s) 4935 { 4936 if (float16_is_any_nan(f)) { 4937 float_raise(float_flag_invalid, s); 4938 return 0; 4939 } 4940 return float16_to_uint64_round_to_zero(f, s); 4941 } 4942 4943 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s) 4944 { 4945 if (float32_is_any_nan(f)) { 4946 float_raise(float_flag_invalid, s); 4947 return 0; 4948 } 4949 return float32_to_uint64_round_to_zero(f, s); 4950 } 4951 4952 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) 4953 { 4954 if (float64_is_any_nan(f)) { 4955 float_raise(float_flag_invalid, s); 4956 return 0; 4957 } 4958 return float64_to_uint64_round_to_zero(f, s); 4959 } 4960 4961 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) 4962 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) 4963 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16) 4964 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16) 
4965 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64) 4966 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32) 4967 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64) 4968 4969 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) 4970 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) 4971 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) 4972 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz) 4973 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz) 4974 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd) 4975 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz) 4976 4977 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) 4978 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) 4979 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) 4980 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz) 4981 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz) 4982 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd) 4983 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz) 4984 4985 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) 4986 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) 4987 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd) 4988 4989 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) 4990 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) 4991 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int) 4992 4993 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) 4994 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) 4995 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64) 4996 4997 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) 4998 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) 4999 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt) 5000 5001 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) 5002 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) 5003 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) 5004 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64) 5005 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16) 5006 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32) 5007 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64) 5008 5009 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) 5010 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) 5011 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) 5012 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64) 5013 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16) 5014 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32) 5015 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64) 5016 5017 static int16_t do_float16_logb_as_int(float16 a, float_status *s) 5018 { 5019 /* Extract frac to the top of the uint32_t. 
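 * Worked examples for the function below (float16 encodings):
 * a = 0x3c00 (1.0) has exp = 15, so the normal path returns 15 - 15 = 0;
 * a = 0x0001 (the smallest denormal, 2**-24) has exp = 0 and
 * frac = 1 << 22, so the result is -15 - clz32(frac) = -15 - 9 = -24.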
*/ 5020 uint32_t frac = (uint32_t)a << (16 + 6); 5021 int16_t exp = extract32(a, 10, 5); 5022 5023 if (unlikely(exp == 0)) { 5024 if (frac != 0) { 5025 if (!get_flush_inputs_to_zero(s)) { 5026 /* denormal: bias - fractional_zeros */ 5027 return -15 - clz32(frac); 5028 } 5029 /* flush to zero */ 5030 float_raise(float_flag_input_denormal_flushed, s); 5031 } 5032 } else if (unlikely(exp == 0x1f)) { 5033 if (frac == 0) { 5034 return INT16_MAX; /* infinity */ 5035 } 5036 } else { 5037 /* normal: exp - bias */ 5038 return exp - 15; 5039 } 5040 /* nan or zero */ 5041 float_raise(float_flag_invalid, s); 5042 return INT16_MIN; 5043 } 5044 5045 static int32_t do_float32_logb_as_int(float32 a, float_status *s) 5046 { 5047 /* Extract frac to the top of the uint32_t. */ 5048 uint32_t frac = a << 9; 5049 int32_t exp = extract32(a, 23, 8); 5050 5051 if (unlikely(exp == 0)) { 5052 if (frac != 0) { 5053 if (!get_flush_inputs_to_zero(s)) { 5054 /* denormal: bias - fractional_zeros */ 5055 return -127 - clz32(frac); 5056 } 5057 /* flush to zero */ 5058 float_raise(float_flag_input_denormal_flushed, s); 5059 } 5060 } else if (unlikely(exp == 0xff)) { 5061 if (frac == 0) { 5062 return INT32_MAX; /* infinity */ 5063 } 5064 } else { 5065 /* normal: exp - bias */ 5066 return exp - 127; 5067 } 5068 /* nan or zero */ 5069 float_raise(float_flag_invalid, s); 5070 return INT32_MIN; 5071 } 5072 5073 static int64_t do_float64_logb_as_int(float64 a, float_status *s) 5074 { 5075 /* Extract frac to the top of the uint64_t. */ 5076 uint64_t frac = a << 12; 5077 int64_t exp = extract64(a, 52, 11); 5078 5079 if (unlikely(exp == 0)) { 5080 if (frac != 0) { 5081 if (!get_flush_inputs_to_zero(s)) { 5082 /* denormal: bias - fractional_zeros */ 5083 return -1023 - clz64(frac); 5084 } 5085 /* flush to zero */ 5086 float_raise(float_flag_input_denormal_flushed, s); 5087 } 5088 } else if (unlikely(exp == 0x7ff)) { 5089 if (frac == 0) { 5090 return INT64_MAX; /* infinity */ 5091 } 5092 } else { 5093 /* normal: exp - bias */ 5094 return exp - 1023; 5095 } 5096 /* nan or zero */ 5097 float_raise(float_flag_invalid, s); 5098 return INT64_MIN; 5099 } 5100 5101 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int) 5102 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int) 5103 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) 5104 5105 #undef DO_ZPZ_FP 5106 5107 static void do_fmla_zpzzz_b16(void *vd, void *vn, void *vm, void *va, void *vg, 5108 float_status *status, uint32_t desc, 5109 uint16_t neg1, uint16_t neg3, int flags) 5110 { 5111 intptr_t i = simd_oprsz(desc); 5112 uint64_t *g = vg; 5113 5114 do { 5115 uint64_t pg = g[(i - 1) >> 6]; 5116 do { 5117 i -= 2; 5118 if (likely((pg >> (i & 63)) & 1)) { 5119 float16 e1, e2, e3, r; 5120 5121 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; 5122 e2 = *(uint16_t *)(vm + H1_2(i)); 5123 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; 5124 r = bfloat16_muladd(e1, e2, e3, flags, status); 5125 *(uint16_t *)(vd + H1_2(i)) = r; 5126 } 5127 } while (i & 63); 5128 } while (i != 0); 5129 } 5130 5131 void HELPER(sve_fmla_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, 5132 void *vg, float_status *status, uint32_t desc) 5133 { 5134 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 5135 } 5136 5137 void HELPER(sve_fmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, 5138 void *vg, float_status *status, uint32_t desc) 5139 { 5140 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0); 5141 } 5142 5143 void HELPER(sve_fnmla_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, 
5144 void *vg, float_status *status, uint32_t desc) 5145 { 5146 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0); 5147 } 5148 5149 void HELPER(sve_fnmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, 5150 void *vg, float_status *status, uint32_t desc) 5151 { 5152 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0); 5153 } 5154 5155 void HELPER(sve_ah_fmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, 5156 void *vg, float_status *status, uint32_t desc) 5157 { 5158 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0, 5159 float_muladd_negate_product); 5160 } 5161 5162 void HELPER(sve_ah_fnmla_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, 5163 void *vg, float_status *status, uint32_t desc) 5164 { 5165 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0, 5166 float_muladd_negate_product | float_muladd_negate_c); 5167 } 5168 5169 void HELPER(sve_ah_fnmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, 5170 void *vg, float_status *status, uint32_t desc) 5171 { 5172 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0, 5173 float_muladd_negate_c); 5174 } 5175 5176 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, 5177 float_status *status, uint32_t desc, 5178 uint16_t neg1, uint16_t neg3, int flags) 5179 { 5180 intptr_t i = simd_oprsz(desc); 5181 uint64_t *g = vg; 5182 5183 do { 5184 uint64_t pg = g[(i - 1) >> 6]; 5185 do { 5186 i -= 2; 5187 if (likely((pg >> (i & 63)) & 1)) { 5188 float16 e1, e2, e3, r; 5189 5190 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; 5191 e2 = *(uint16_t *)(vm + H1_2(i)); 5192 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; 5193 r = float16_muladd(e1, e2, e3, flags, status); 5194 *(uint16_t *)(vd + H1_2(i)) = r; 5195 } 5196 } while (i & 63); 5197 } while (i != 0); 5198 } 5199 5200 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5201 void *vg, float_status *status, uint32_t desc) 5202 { 5203 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 5204 } 5205 5206 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5207 void *vg, float_status *status, uint32_t desc) 5208 { 5209 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0); 5210 } 5211 5212 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5213 void *vg, float_status *status, uint32_t desc) 5214 { 5215 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0); 5216 } 5217 5218 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5219 void *vg, float_status *status, uint32_t desc) 5220 { 5221 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0); 5222 } 5223 5224 void HELPER(sve_ah_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5225 void *vg, float_status *status, uint32_t desc) 5226 { 5227 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 5228 float_muladd_negate_product); 5229 } 5230 5231 void HELPER(sve_ah_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5232 void *vg, float_status *status, uint32_t desc) 5233 { 5234 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 5235 float_muladd_negate_product | float_muladd_negate_c); 5236 } 5237 5238 void HELPER(sve_ah_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5239 void *vg, float_status *status, uint32_t desc) 5240 { 5241 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 5242 float_muladd_negate_c); 5243 } 5244 5245 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, 5246 float_status *status, uint32_t desc, 
5247 uint32_t neg1, uint32_t neg3, int flags) 5248 { 5249 intptr_t i = simd_oprsz(desc); 5250 uint64_t *g = vg; 5251 5252 do { 5253 uint64_t pg = g[(i - 1) >> 6]; 5254 do { 5255 i -= 4; 5256 if (likely((pg >> (i & 63)) & 1)) { 5257 float32 e1, e2, e3, r; 5258 5259 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1; 5260 e2 = *(uint32_t *)(vm + H1_4(i)); 5261 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; 5262 r = float32_muladd(e1, e2, e3, flags, status); 5263 *(uint32_t *)(vd + H1_4(i)) = r; 5264 } 5265 } while (i & 63); 5266 } while (i != 0); 5267 } 5268 5269 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5270 void *vg, float_status *status, uint32_t desc) 5271 { 5272 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 5273 } 5274 5275 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5276 void *vg, float_status *status, uint32_t desc) 5277 { 5278 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0, 0); 5279 } 5280 5281 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5282 void *vg, float_status *status, uint32_t desc) 5283 { 5284 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000, 0); 5285 } 5286 5287 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5288 void *vg, float_status *status, uint32_t desc) 5289 { 5290 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000, 0); 5291 } 5292 5293 void HELPER(sve_ah_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5294 void *vg, float_status *status, uint32_t desc) 5295 { 5296 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 5297 float_muladd_negate_product); 5298 } 5299 5300 void HELPER(sve_ah_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5301 void *vg, float_status *status, uint32_t desc) 5302 { 5303 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 5304 float_muladd_negate_product | float_muladd_negate_c); 5305 } 5306 5307 void HELPER(sve_ah_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5308 void *vg, float_status *status, uint32_t desc) 5309 { 5310 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 5311 float_muladd_negate_c); 5312 } 5313 5314 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, 5315 float_status *status, uint32_t desc, 5316 uint64_t neg1, uint64_t neg3, int flags) 5317 { 5318 intptr_t i = simd_oprsz(desc); 5319 uint64_t *g = vg; 5320 5321 do { 5322 uint64_t pg = g[(i - 1) >> 6]; 5323 do { 5324 i -= 8; 5325 if (likely((pg >> (i & 63)) & 1)) { 5326 float64 e1, e2, e3, r; 5327 5328 e1 = *(uint64_t *)(vn + i) ^ neg1; 5329 e2 = *(uint64_t *)(vm + i); 5330 e3 = *(uint64_t *)(va + i) ^ neg3; 5331 r = float64_muladd(e1, e2, e3, flags, status); 5332 *(uint64_t *)(vd + i) = r; 5333 } 5334 } while (i & 63); 5335 } while (i != 0); 5336 } 5337 5338 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5339 void *vg, float_status *status, uint32_t desc) 5340 { 5341 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 5342 } 5343 5344 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5345 void *vg, float_status *status, uint32_t desc) 5346 { 5347 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0, 0); 5348 } 5349 5350 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5351 void *vg, float_status *status, uint32_t desc) 5352 { 5353 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN, 0); 5354 } 5355 5356 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, 
void *va, 5357 void *vg, float_status *status, uint32_t desc) 5358 { 5359 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN, 0); 5360 } 5361 5362 void HELPER(sve_ah_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5363 void *vg, float_status *status, uint32_t desc) 5364 { 5365 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5366 float_muladd_negate_product); 5367 } 5368 5369 void HELPER(sve_ah_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5370 void *vg, float_status *status, uint32_t desc) 5371 { 5372 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5373 float_muladd_negate_product | float_muladd_negate_c); 5374 } 5375 5376 void HELPER(sve_ah_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5377 void *vg, float_status *status, uint32_t desc) 5378 { 5379 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5380 float_muladd_negate_c); 5381 } 5382 5383 /* Two operand floating-point comparison controlled by a predicate. 5384 * Unlike the integer version, we are not allowed to optimistically 5385 * compare operands, since the comparison may have side effects wrt 5386 * the FPSR. 5387 */ 5388 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \ 5389 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 5390 float_status *status, uint32_t desc) \ 5391 { \ 5392 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 5393 uint64_t *d = vd, *g = vg; \ 5394 do { \ 5395 uint64_t out = 0, pg = g[j]; \ 5396 do { \ 5397 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 5398 if (likely((pg >> (i & 63)) & 1)) { \ 5399 TYPE nn = *(TYPE *)(vn + H(i)); \ 5400 TYPE mm = *(TYPE *)(vm + H(i)); \ 5401 out |= OP(TYPE, nn, mm, status); \ 5402 } \ 5403 } while (i & 63); \ 5404 d[j--] = out; \ 5405 } while (i > 0); \ 5406 } 5407 5408 #define DO_FPCMP_PPZZ_H(NAME, OP) \ 5409 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP) 5410 #define DO_FPCMP_PPZZ_S(NAME, OP) \ 5411 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) 5412 #define DO_FPCMP_PPZZ_D(NAME, OP) \ 5413 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP) 5414 5415 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \ 5416 DO_FPCMP_PPZZ_H(NAME, OP) \ 5417 DO_FPCMP_PPZZ_S(NAME, OP) \ 5418 DO_FPCMP_PPZZ_D(NAME, OP) 5419 5420 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0 5421 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0 5422 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0 5423 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0 5424 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0 5425 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0 5426 #define DO_FCMUO(TYPE, X, Y, ST) \ 5427 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered 5428 #define DO_FACGE(TYPE, X, Y, ST) \ 5429 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0 5430 #define DO_FACGT(TYPE, X, Y, ST) \ 5431 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0 5432 5433 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) 5434 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT) 5435 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ) 5436 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE) 5437 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO) 5438 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE) 5439 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT) 5440 5441 #undef DO_FPCMP_PPZZ_ALL 5442 #undef DO_FPCMP_PPZZ_D 5443 #undef DO_FPCMP_PPZZ_S 5444 #undef DO_FPCMP_PPZZ_H 5445 #undef DO_FPCMP_PPZZ 5446 5447 /* One operand floating-point comparison against zero, controlled 5448 * by a predicate. 
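 *
 * This reuses the two-operand DO_FCM* operators above with the M
 * argument fixed at zero; e.g. DO_FCMGE(float16, nn, 0, status)
 * expands to float16_compare(0, nn, status) <= 0, i.e. "nn >= +0.0",
 * with the signalling behaviour of the non-quiet compare.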
5449 */ 5450 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \ 5451 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 5452 float_status *status, uint32_t desc) \ 5453 { \ 5454 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 5455 uint64_t *d = vd, *g = vg; \ 5456 do { \ 5457 uint64_t out = 0, pg = g[j]; \ 5458 do { \ 5459 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 5460 if ((pg >> (i & 63)) & 1) { \ 5461 TYPE nn = *(TYPE *)(vn + H(i)); \ 5462 out |= OP(TYPE, nn, 0, status); \ 5463 } \ 5464 } while (i & 63); \ 5465 d[j--] = out; \ 5466 } while (i > 0); \ 5467 } 5468 5469 #define DO_FPCMP_PPZ0_H(NAME, OP) \ 5470 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP) 5471 #define DO_FPCMP_PPZ0_S(NAME, OP) \ 5472 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) 5473 #define DO_FPCMP_PPZ0_D(NAME, OP) \ 5474 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP) 5475 5476 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \ 5477 DO_FPCMP_PPZ0_H(NAME, OP) \ 5478 DO_FPCMP_PPZ0_S(NAME, OP) \ 5479 DO_FPCMP_PPZ0_D(NAME, OP) 5480 5481 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE) 5482 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT) 5483 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE) 5484 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT) 5485 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ) 5486 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE) 5487 5488 /* FP Trig Multiply-Add. */ 5489 5490 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, 5491 float_status *s, uint32_t desc) 5492 { 5493 static const float16 coeff[16] = { 5494 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 5495 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 5496 }; 5497 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); 5498 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5499 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5500 float16 *d = vd, *n = vn, *m = vm; 5501 5502 for (i = 0; i < opr_sz; i++) { 5503 float16 mm = m[i]; 5504 intptr_t xx = x; 5505 int flags = 0; 5506 5507 if (float16_is_neg(mm)) { 5508 if (fpcr_ah) { 5509 flags = float_muladd_negate_product; 5510 } else { 5511 mm = float16_abs(mm); 5512 } 5513 xx += 8; 5514 } 5515 d[i] = float16_muladd(n[i], mm, coeff[xx], flags, s); 5516 } 5517 } 5518 5519 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, 5520 float_status *s, uint32_t desc) 5521 { 5522 static const float32 coeff[16] = { 5523 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9, 5524 0x36369d6d, 0x00000000, 0x00000000, 0x00000000, 5525 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705, 5526 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, 5527 }; 5528 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); 5529 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5530 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5531 float32 *d = vd, *n = vn, *m = vm; 5532 5533 for (i = 0; i < opr_sz; i++) { 5534 float32 mm = m[i]; 5535 intptr_t xx = x; 5536 int flags = 0; 5537 5538 if (float32_is_neg(mm)) { 5539 if (fpcr_ah) { 5540 flags = float_muladd_negate_product; 5541 } else { 5542 mm = float32_abs(mm); 5543 } 5544 xx += 8; 5545 } 5546 d[i] = float32_muladd(n[i], mm, coeff[xx], flags, s); 5547 } 5548 } 5549 5550 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, 5551 float_status *s, uint32_t desc) 5552 { 5553 static const float64 coeff[16] = { 5554 0x3ff0000000000000ull, 0xbfc5555555555543ull, 5555 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull, 5556 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull, 5557 0x3de5d8408868552full, 0x0000000000000000ull, 5558 0x3ff0000000000000ull, 0xbfe0000000000000ull, 5559 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull, 5560 
0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull, 5561 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, 5562 }; 5563 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); 5564 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5565 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5566 float64 *d = vd, *n = vn, *m = vm; 5567 5568 for (i = 0; i < opr_sz; i++) { 5569 float64 mm = m[i]; 5570 intptr_t xx = x; 5571 int flags = 0; 5572 5573 if (float64_is_neg(mm)) { 5574 if (fpcr_ah) { 5575 flags = float_muladd_negate_product; 5576 } else { 5577 mm = float64_abs(mm); 5578 } 5579 xx += 8; 5580 } 5581 d[i] = float64_muladd(n[i], mm, coeff[xx], flags, s); 5582 } 5583 } 5584 5585 /* 5586 * FP Complex Add 5587 */ 5588 5589 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, 5590 float_status *s, uint32_t desc) 5591 { 5592 intptr_t j, i = simd_oprsz(desc); 5593 uint64_t *g = vg; 5594 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5595 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5596 5597 do { 5598 uint64_t pg = g[(i - 1) >> 6]; 5599 do { 5600 float16 e0, e1, e2, e3; 5601 5602 /* I holds the real index; J holds the imag index. */ 5603 j = i - sizeof(float16); 5604 i -= 2 * sizeof(float16); 5605 5606 e0 = *(float16 *)(vn + H1_2(i)); 5607 e1 = *(float16 *)(vm + H1_2(j)); 5608 e2 = *(float16 *)(vn + H1_2(j)); 5609 e3 = *(float16 *)(vm + H1_2(i)); 5610 5611 if (rot) { 5612 e3 = float16_maybe_ah_chs(e3, fpcr_ah); 5613 } else { 5614 e1 = float16_maybe_ah_chs(e1, fpcr_ah); 5615 } 5616 5617 if (likely((pg >> (i & 63)) & 1)) { 5618 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s); 5619 } 5620 if (likely((pg >> (j & 63)) & 1)) { 5621 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s); 5622 } 5623 } while (i & 63); 5624 } while (i != 0); 5625 } 5626 5627 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, 5628 float_status *s, uint32_t desc) 5629 { 5630 intptr_t j, i = simd_oprsz(desc); 5631 uint64_t *g = vg; 5632 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5633 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5634 5635 do { 5636 uint64_t pg = g[(i - 1) >> 6]; 5637 do { 5638 float32 e0, e1, e2, e3; 5639 5640 /* I holds the real index; J holds the imag index. */ 5641 j = i - sizeof(float32); 5642 i -= 2 * sizeof(float32); 5643 5644 e0 = *(float32 *)(vn + H1_2(i)); 5645 e1 = *(float32 *)(vm + H1_2(j)); 5646 e2 = *(float32 *)(vn + H1_2(j)); 5647 e3 = *(float32 *)(vm + H1_2(i)); 5648 5649 if (rot) { 5650 e3 = float32_maybe_ah_chs(e3, fpcr_ah); 5651 } else { 5652 e1 = float32_maybe_ah_chs(e1, fpcr_ah); 5653 } 5654 5655 if (likely((pg >> (i & 63)) & 1)) { 5656 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, s); 5657 } 5658 if (likely((pg >> (j & 63)) & 1)) { 5659 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, s); 5660 } 5661 } while (i & 63); 5662 } while (i != 0); 5663 } 5664 5665 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, 5666 float_status *s, uint32_t desc) 5667 { 5668 intptr_t j, i = simd_oprsz(desc); 5669 uint64_t *g = vg; 5670 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5671 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5672 5673 do { 5674 uint64_t pg = g[(i - 1) >> 6]; 5675 do { 5676 float64 e0, e1, e2, e3; 5677 5678 /* I holds the real index; J holds the imag index. 
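 * The even (I) result is e0 + e1 (n.real plus m.imag) and the odd (J)
 * result is e2 + e3 (n.imag plus m.real); rot selects which of the two
 * m operands is negated below, distinguishing the 90-degree from the
 * 270-degree rotation of FCADD.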
*/ 5679 j = i - sizeof(float64); 5680 i -= 2 * sizeof(float64); 5681 5682 e0 = *(float64 *)(vn + H1_2(i)); 5683 e1 = *(float64 *)(vm + H1_2(j)); 5684 e2 = *(float64 *)(vn + H1_2(j)); 5685 e3 = *(float64 *)(vm + H1_2(i)); 5686 5687 if (rot) { 5688 e3 = float64_maybe_ah_chs(e3, fpcr_ah); 5689 } else { 5690 e1 = float64_maybe_ah_chs(e1, fpcr_ah); 5691 } 5692 5693 if (likely((pg >> (i & 63)) & 1)) { 5694 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, s); 5695 } 5696 if (likely((pg >> (j & 63)) & 1)) { 5697 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, s); 5698 } 5699 } while (i & 63); 5700 } while (i != 0); 5701 } 5702 5703 /* 5704 * FP Complex Multiply 5705 */ 5706 5707 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5708 void *vg, float_status *status, uint32_t desc) 5709 { 5710 intptr_t j, i = simd_oprsz(desc); 5711 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5712 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5713 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5714 uint32_t negf_real = flip ^ negf_imag; 5715 float16 negx_imag, negx_real; 5716 uint64_t *g = vg; 5717 5718 /* With AH=0, use negx; with AH=1 use negf. */ 5719 negx_real = (negf_real & ~fpcr_ah) << 15; 5720 negx_imag = (negf_imag & ~fpcr_ah) << 15; 5721 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5722 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5723 5724 do { 5725 uint64_t pg = g[(i - 1) >> 6]; 5726 do { 5727 float16 e1, e2, e3, e4, nr, ni, mr, mi, d; 5728 5729 /* I holds the real index; J holds the imag index. */ 5730 j = i - sizeof(float16); 5731 i -= 2 * sizeof(float16); 5732 5733 nr = *(float16 *)(vn + H1_2(i)); 5734 ni = *(float16 *)(vn + H1_2(j)); 5735 mr = *(float16 *)(vm + H1_2(i)); 5736 mi = *(float16 *)(vm + H1_2(j)); 5737 5738 e2 = (flip ? ni : nr); 5739 e1 = (flip ? mi : mr) ^ negx_real; 5740 e4 = e2; 5741 e3 = (flip ? mr : mi) ^ negx_imag; 5742 5743 if (likely((pg >> (i & 63)) & 1)) { 5744 d = *(float16 *)(va + H1_2(i)); 5745 d = float16_muladd(e2, e1, d, negf_real, status); 5746 *(float16 *)(vd + H1_2(i)) = d; 5747 } 5748 if (likely((pg >> (j & 63)) & 1)) { 5749 d = *(float16 *)(va + H1_2(j)); 5750 d = float16_muladd(e4, e3, d, negf_imag, status); 5751 *(float16 *)(vd + H1_2(j)) = d; 5752 } 5753 } while (i & 63); 5754 } while (i != 0); 5755 } 5756 5757 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5758 void *vg, float_status *status, uint32_t desc) 5759 { 5760 intptr_t j, i = simd_oprsz(desc); 5761 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5762 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5763 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5764 uint32_t negf_real = flip ^ negf_imag; 5765 float32 negx_imag, negx_real; 5766 uint64_t *g = vg; 5767 5768 /* With AH=0, use negx; with AH=1 use negf. */ 5769 negx_real = (negf_real & ~fpcr_ah) << 31; 5770 negx_imag = (negf_imag & ~fpcr_ah) << 31; 5771 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5772 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5773 5774 do { 5775 uint64_t pg = g[(i - 1) >> 6]; 5776 do { 5777 float32 e1, e2, e3, e4, nr, ni, mr, mi, d; 5778 5779 /* I holds the real index; J holds the imag index. */ 5780 j = i - sizeof(float32); 5781 i -= 2 * sizeof(float32); 5782 5783 nr = *(float32 *)(vn + H1_2(i)); 5784 ni = *(float32 *)(vn + H1_2(j)); 5785 mr = *(float32 *)(vm + H1_2(i)); 5786 mi = *(float32 *)(vm + H1_2(j)); 5787 5788 e2 = (flip ? 
ni : nr); 5789 e1 = (flip ? mi : mr) ^ negx_real; 5790 e4 = e2; 5791 e3 = (flip ? mr : mi) ^ negx_imag; 5792 5793 if (likely((pg >> (i & 63)) & 1)) { 5794 d = *(float32 *)(va + H1_2(i)); 5795 d = float32_muladd(e2, e1, d, negf_real, status); 5796 *(float32 *)(vd + H1_2(i)) = d; 5797 } 5798 if (likely((pg >> (j & 63)) & 1)) { 5799 d = *(float32 *)(va + H1_2(j)); 5800 d = float32_muladd(e4, e3, d, negf_imag, status); 5801 *(float32 *)(vd + H1_2(j)) = d; 5802 } 5803 } while (i & 63); 5804 } while (i != 0); 5805 } 5806 5807 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5808 void *vg, float_status *status, uint32_t desc) 5809 { 5810 intptr_t j, i = simd_oprsz(desc); 5811 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5812 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5813 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5814 uint32_t negf_real = flip ^ negf_imag; 5815 float64 negx_imag, negx_real; 5816 uint64_t *g = vg; 5817 5818 /* With AH=0, use negx; with AH=1 use negf. */ 5819 negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63; 5820 negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63; 5821 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5822 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5823 5824 do { 5825 uint64_t pg = g[(i - 1) >> 6]; 5826 do { 5827 float64 e1, e2, e3, e4, nr, ni, mr, mi, d; 5828 5829 /* I holds the real index; J holds the imag index. */ 5830 j = i - sizeof(float64); 5831 i -= 2 * sizeof(float64); 5832 5833 nr = *(float64 *)(vn + H1_2(i)); 5834 ni = *(float64 *)(vn + H1_2(j)); 5835 mr = *(float64 *)(vm + H1_2(i)); 5836 mi = *(float64 *)(vm + H1_2(j)); 5837 5838 e2 = (flip ? ni : nr); 5839 e1 = (flip ? mi : mr) ^ negx_real; 5840 e4 = e2; 5841 e3 = (flip ? mr : mi) ^ negx_imag; 5842 5843 if (likely((pg >> (i & 63)) & 1)) { 5844 d = *(float64 *)(va + H1_2(i)); 5845 d = float64_muladd(e2, e1, d, negf_real, status); 5846 *(float64 *)(vd + H1_2(i)) = d; 5847 } 5848 if (likely((pg >> (j & 63)) & 1)) { 5849 d = *(float64 *)(va + H1_2(j)); 5850 d = float64_muladd(e4, e3, d, negf_imag, status); 5851 *(float64 *)(vd + H1_2(j)) = d; 5852 } 5853 } while (i & 63); 5854 } while (i != 0); 5855 } 5856 5857 /* 5858 * Load contiguous data, protected by a governing predicate. 5859 */ 5860 5861 /* 5862 * Skip through a sequence of inactive elements in the guarding predicate @vg, 5863 * beginning at @reg_off bounded by @reg_max. Return the offset of the active 5864 * element >= @reg_off, or @reg_max if there were no active elements at all. 5865 */ 5866 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off, 5867 intptr_t reg_max, int esz) 5868 { 5869 uint64_t pg_mask = pred_esz_masks[esz]; 5870 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63); 5871 5872 /* In normal usage, the first element is active. */ 5873 if (likely(pg & 1)) { 5874 return reg_off; 5875 } 5876 5877 if (pg == 0) { 5878 reg_off &= -64; 5879 do { 5880 reg_off += 64; 5881 if (unlikely(reg_off >= reg_max)) { 5882 /* The entire predicate was false. */ 5883 return reg_max; 5884 } 5885 pg = vg[reg_off >> 6] & pg_mask; 5886 } while (pg == 0); 5887 } 5888 reg_off += ctz64(pg); 5889 5890 /* We should never see an out of range predicate bit set. */ 5891 tcg_debug_assert(reg_off < reg_max); 5892 return reg_off; 5893 } 5894 5895 /* 5896 * Resolve the guest virtual address to info->host and info->flags. 5897 * If @nofault, return false if the page is invalid, otherwise 5898 * exit via page fault exception. 
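 *
 * A single-element probe might look roughly like this (illustrative
 * sketch only; "ra" stands for the helper return address):
 *
 *     SVEHostPage info;
 *     if (sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
 *                        arm_env_mmu_index(env), ra)) {
 *         /* Valid page: info.host is usable unless TLB_MMIO is set
 *            in info.flags. */
 *     }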
5899 */ 5900 5901 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env, 5902 target_ulong addr, int mem_off, MMUAccessType access_type, 5903 int mmu_idx, uintptr_t retaddr) 5904 { 5905 int flags; 5906 5907 addr += mem_off; 5908 5909 /* 5910 * User-only currently always issues with TBI. See the comment 5911 * above useronly_clean_ptr. Usually we clean this top byte away 5912 * during translation, but we can't do that for e.g. vector + imm 5913 * addressing modes. 5914 * 5915 * We currently always enable TBI for user-only, and do not provide 5916 * a way to turn it off. So clean the pointer unconditionally here, 5917 * rather than look it up here, or pass it down from above. 5918 */ 5919 addr = useronly_clean_ptr(addr); 5920 5921 #ifdef CONFIG_USER_ONLY 5922 flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault, 5923 &info->host, retaddr); 5924 #else 5925 CPUTLBEntryFull *full; 5926 flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault, 5927 &info->host, &full, retaddr); 5928 #endif 5929 info->flags = flags; 5930 5931 if (flags & TLB_INVALID_MASK) { 5932 g_assert(nofault); 5933 return false; 5934 } 5935 5936 #ifdef CONFIG_USER_ONLY 5937 memset(&info->attrs, 0, sizeof(info->attrs)); 5938 /* Require both ANON and MTE; see allocation_tag_mem(). */ 5939 info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE); 5940 #else 5941 info->attrs = full->attrs; 5942 info->tagged = full->extra.arm.pte_attrs == 0xf0; 5943 #endif 5944 5945 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */ 5946 info->host -= mem_off; 5947 return true; 5948 } 5949 5950 /* 5951 * Find first active element on each page, and a loose bound for the 5952 * final element on each page. Identify any single element that spans 5953 * the page boundary. Return true if there are any active elements. 5954 */ 5955 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg, 5956 intptr_t reg_max, int esz, int msize) 5957 { 5958 const int esize = 1 << esz; 5959 const uint64_t pg_mask = pred_esz_masks[esz]; 5960 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; 5961 intptr_t mem_off_last, mem_off_split; 5962 intptr_t page_split, elt_split; 5963 intptr_t i; 5964 5965 /* Set all of the element indices to -1, and the TLB data to 0. */ 5966 memset(info, -1, offsetof(SVEContLdSt, page)); 5967 memset(info->page, 0, sizeof(info->page)); 5968 5969 /* Gross scan over the entire predicate to find bounds. */ 5970 i = 0; 5971 do { 5972 uint64_t pg = vg[i] & pg_mask; 5973 if (pg) { 5974 reg_off_last = i * 64 + 63 - clz64(pg); 5975 if (reg_off_first < 0) { 5976 reg_off_first = i * 64 + ctz64(pg); 5977 } 5978 } 5979 } while (++i * 64 < reg_max); 5980 5981 if (unlikely(reg_off_first < 0)) { 5982 /* No active elements, no pages touched. */ 5983 return false; 5984 } 5985 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max); 5986 5987 info->reg_off_first[0] = reg_off_first; 5988 info->mem_off_first[0] = (reg_off_first >> esz) * msize; 5989 mem_off_last = (reg_off_last >> esz) * msize; 5990 5991 page_split = -(addr | TARGET_PAGE_MASK); 5992 if (likely(mem_off_last + msize <= page_split)) { 5993 /* The entire operation fits within a single page. 
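 * For example, with 4KiB pages, an addr whose low bits are 0xf80
 * gives page_split == 0x80, so an operation whose last active byte
 * satisfies mem_off_last + msize <= 0x80 never touches page[1].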
*/ 5994 info->reg_off_last[0] = reg_off_last; 5995 return true; 5996 } 5997 5998 info->page_split = page_split; 5999 elt_split = page_split / msize; 6000 reg_off_split = elt_split << esz; 6001 mem_off_split = elt_split * msize; 6002 6003 /* 6004 * This is the last full element on the first page, but it is not 6005 * necessarily active. If there is no full element, i.e. the first 6006 * active element is the one that's split, this value remains -1. 6007 * It is useful as iteration bounds. 6008 */ 6009 if (elt_split != 0) { 6010 info->reg_off_last[0] = reg_off_split - esize; 6011 } 6012 6013 /* Determine if an unaligned element spans the pages. */ 6014 if (page_split % msize != 0) { 6015 /* It is helpful to know if the split element is active. */ 6016 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) { 6017 info->reg_off_split = reg_off_split; 6018 info->mem_off_split = mem_off_split; 6019 6020 if (reg_off_split == reg_off_last) { 6021 /* The page crossing element is last. */ 6022 return true; 6023 } 6024 } 6025 reg_off_split += esize; 6026 mem_off_split += msize; 6027 } 6028 6029 /* 6030 * We do want the first active element on the second page, because 6031 * this may affect the address reported in an exception. 6032 */ 6033 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz); 6034 tcg_debug_assert(reg_off_split <= reg_off_last); 6035 info->reg_off_first[1] = reg_off_split; 6036 info->mem_off_first[1] = (reg_off_split >> esz) * msize; 6037 info->reg_off_last[1] = reg_off_last; 6038 return true; 6039 } 6040 6041 /* 6042 * Resolve the guest virtual addresses to info->page[]. 6043 * Control the generation of page faults with @fault. Return false if 6044 * there is no work to do, which can only happen with @fault == FAULT_NO. 6045 */ 6046 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault, 6047 CPUARMState *env, target_ulong addr, 6048 MMUAccessType access_type, uintptr_t retaddr) 6049 { 6050 int mmu_idx = arm_env_mmu_index(env); 6051 int mem_off = info->mem_off_first[0]; 6052 bool nofault = fault == FAULT_NO; 6053 bool have_work = true; 6054 6055 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off, 6056 access_type, mmu_idx, retaddr)) { 6057 /* No work to be done. */ 6058 return false; 6059 } 6060 6061 if (likely(info->page_split < 0)) { 6062 /* The entire operation was on the one page. */ 6063 return true; 6064 } 6065 6066 /* 6067 * If the second page is invalid, then we want the fault address to be 6068 * the first byte on that page which is accessed. 6069 */ 6070 if (info->mem_off_split >= 0) { 6071 /* 6072 * There is an element split across the pages. The fault address 6073 * should be the first byte of the second page. 6074 */ 6075 mem_off = info->page_split; 6076 /* 6077 * If the split element is also the first active element 6078 * of the vector, then: For first-fault we should continue 6079 * to generate faults for the second page. For no-fault, 6080 * we have work only if the second page is valid. 6081 */ 6082 if (info->mem_off_first[0] < info->mem_off_split) { 6083 nofault = FAULT_FIRST; 6084 have_work = false; 6085 } 6086 } else { 6087 /* 6088 * There is no element split across the pages. The fault address 6089 * should be the first active element on the second page. 6090 */ 6091 mem_off = info->mem_off_first[1]; 6092 /* 6093 * There must have been one active element on the first page, 6094 * so we're out of first-fault territory. 
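 * Faults on the second page are therefore architectural only for
 * FAULT_ALL; for FAULT_FIRST and FAULT_NO they may be suppressed
 * and reported via FFR by the caller instead.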
6095          */
6096         nofault = fault != FAULT_ALL;
6097     }
6098 
6099     have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
6100                                 access_type, mmu_idx, retaddr);
6101     return have_work;
6102 }
6103 
6104 #ifndef CONFIG_USER_ONLY
6105 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
6106                                uint64_t *vg, target_ulong addr,
6107                                int esize, int msize, int wp_access,
6108                                uintptr_t retaddr)
6109 {
6110     intptr_t mem_off, reg_off, reg_last;
6111     int flags0 = info->page[0].flags;
6112     int flags1 = info->page[1].flags;
6113 
6114     if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
6115         return;
6116     }
6117 
6118     /* Indicate that watchpoints are handled. */
6119     info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
6120     info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
6121 
6122     if (flags0 & TLB_WATCHPOINT) {
6123         mem_off = info->mem_off_first[0];
6124         reg_off = info->reg_off_first[0];
6125         reg_last = info->reg_off_last[0];
6126 
6127         while (reg_off <= reg_last) {
6128             uint64_t pg = vg[reg_off >> 6];
6129             do {
6130                 if ((pg >> (reg_off & 63)) & 1) {
6131                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
6132                                          msize, info->page[0].attrs,
6133                                          wp_access, retaddr);
6134                 }
6135                 reg_off += esize;
6136                 mem_off += msize;
6137             } while (reg_off <= reg_last && (reg_off & 63));
6138         }
6139     }
6140 
6141     mem_off = info->mem_off_split;
6142     if (mem_off >= 0) {
6143         cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
6144                              info->page[0].attrs, wp_access, retaddr);
6145     }
6146 
6147     mem_off = info->mem_off_first[1];
6148     if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
6149         reg_off = info->reg_off_first[1];
6150         reg_last = info->reg_off_last[1];
6151 
6152         do {
6153             uint64_t pg = vg[reg_off >> 6];
6154             do {
6155                 if ((pg >> (reg_off & 63)) & 1) {
6156                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
6157                                          msize, info->page[1].attrs,
6158                                          wp_access, retaddr);
6159                 }
6160                 reg_off += esize;
6161                 mem_off += msize;
6162             } while (reg_off & 63);
6163         } while (reg_off <= reg_last);
6164     }
6165 }
6166 #endif
6167 
6168 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
6169                              uint64_t *vg, target_ulong addr, int esize,
6170                              int msize, uint32_t mtedesc, uintptr_t ra)
6171 {
6172     intptr_t mem_off, reg_off, reg_last;
6173 
6174     /* Process the page only if MemAttr == Tagged. */
6175     if (info->page[0].tagged) {
6176         mem_off = info->mem_off_first[0];
6177         reg_off = info->reg_off_first[0];
6178         reg_last = info->reg_off_split;
6179         if (reg_last < 0) {
6180             reg_last = info->reg_off_last[0];
6181         }
6182 
6183         do {
6184             uint64_t pg = vg[reg_off >> 6];
6185             do {
6186                 if ((pg >> (reg_off & 63)) & 1) {
6187                     mte_check(env, mtedesc, addr + mem_off, ra);
6188                 }
6189                 reg_off += esize;
6190                 mem_off += msize;
6191             } while (reg_off <= reg_last && (reg_off & 63));
6192         } while (reg_off <= reg_last);
6193     }
6194 
6195     mem_off = info->mem_off_first[1];
6196     if (mem_off >= 0 && info->page[1].tagged) {
6197         reg_off = info->reg_off_first[1];
6198         reg_last = info->reg_off_last[1];
6199 
6200         do {
6201             uint64_t pg = vg[reg_off >> 6];
6202             do {
6203                 if ((pg >> (reg_off & 63)) & 1) {
6204                     mte_check(env, mtedesc, addr + mem_off, ra);
6205                 }
6206                 reg_off += esize;
6207                 mem_off += msize;
6208             } while (reg_off & 63);
6209         } while (reg_off <= reg_last);
6210     }
6211 }
6212 
6213 /*
6214  * Common helper for all contiguous 1,2,3,4-register predicated loads.
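 *
 * The sequence is: find the active elements and any page split with
 * sve_cont_ldst_elements(), validate both pages up front with
 * FAULT_ALL via sve_cont_ldst_pages(), handle watchpoints and MTE
 * checks, then copy the data either through the per-element tlb path
 * (if any page is MMIO) or directly from host memory, finishing with
 * the element that straddles the page boundary, if any.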
6215 */ 6216 static inline QEMU_ALWAYS_INLINE 6217 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr, 6218 uint32_t desc, const uintptr_t retaddr, 6219 const int esz, const int msz, const int N, uint32_t mtedesc, 6220 sve_ldst1_host_fn *host_fn, 6221 sve_ldst1_tlb_fn *tlb_fn) 6222 { 6223 const unsigned rd = simd_data(desc); 6224 const intptr_t reg_max = simd_oprsz(desc); 6225 intptr_t reg_off, reg_last, mem_off; 6226 SVEContLdSt info; 6227 void *host; 6228 int flags, i; 6229 6230 /* Find the active elements. */ 6231 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 6232 /* The entire predicate was false; no load occurs. */ 6233 for (i = 0; i < N; ++i) { 6234 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 6235 } 6236 return; 6237 } 6238 6239 /* Probe the page(s). Exit with exception for any invalid page. */ 6240 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr); 6241 6242 /* Handle watchpoints for all active elements. */ 6243 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 6244 BP_MEM_READ, retaddr); 6245 6246 /* 6247 * Handle mte checks for all active elements. 6248 * Since TBI must be set for MTE, !mtedesc => !mte_active. 6249 */ 6250 if (mtedesc) { 6251 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 6252 mtedesc, retaddr); 6253 } 6254 6255 flags = info.page[0].flags | info.page[1].flags; 6256 if (unlikely(flags != 0)) { 6257 /* 6258 * At least one page includes MMIO. 6259 * Any bus operation can fail with cpu_transaction_failed, 6260 * which for ARM will raise SyncExternal. Perform the load 6261 * into scratch memory to preserve register state until the end. 6262 */ 6263 ARMVectorReg scratch[4] = { }; 6264 6265 mem_off = info.mem_off_first[0]; 6266 reg_off = info.reg_off_first[0]; 6267 reg_last = info.reg_off_last[1]; 6268 if (reg_last < 0) { 6269 reg_last = info.reg_off_split; 6270 if (reg_last < 0) { 6271 reg_last = info.reg_off_last[0]; 6272 } 6273 } 6274 6275 do { 6276 uint64_t pg = vg[reg_off >> 6]; 6277 do { 6278 if ((pg >> (reg_off & 63)) & 1) { 6279 for (i = 0; i < N; ++i) { 6280 tlb_fn(env, &scratch[i], reg_off, 6281 addr + mem_off + (i << msz), retaddr); 6282 } 6283 } 6284 reg_off += 1 << esz; 6285 mem_off += N << msz; 6286 } while (reg_off & 63); 6287 } while (reg_off <= reg_last); 6288 6289 for (i = 0; i < N; ++i) { 6290 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max); 6291 } 6292 return; 6293 } 6294 6295 /* The entire operation is in RAM, on valid pages. */ 6296 6297 for (i = 0; i < N; ++i) { 6298 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 6299 } 6300 6301 mem_off = info.mem_off_first[0]; 6302 reg_off = info.reg_off_first[0]; 6303 reg_last = info.reg_off_last[0]; 6304 host = info.page[0].host; 6305 6306 set_helper_retaddr(retaddr); 6307 6308 while (reg_off <= reg_last) { 6309 uint64_t pg = vg[reg_off >> 6]; 6310 do { 6311 if ((pg >> (reg_off & 63)) & 1) { 6312 for (i = 0; i < N; ++i) { 6313 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6314 host + mem_off + (i << msz)); 6315 } 6316 } 6317 reg_off += 1 << esz; 6318 mem_off += N << msz; 6319 } while (reg_off <= reg_last && (reg_off & 63)); 6320 } 6321 6322 clear_helper_retaddr(); 6323 6324 /* 6325 * Use the slow path to manage the cross-page misalignment. 6326 * But we know this is RAM and cannot trap. 
6327 */ 6328 mem_off = info.mem_off_split; 6329 if (unlikely(mem_off >= 0)) { 6330 reg_off = info.reg_off_split; 6331 for (i = 0; i < N; ++i) { 6332 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6333 addr + mem_off + (i << msz), retaddr); 6334 } 6335 } 6336 6337 mem_off = info.mem_off_first[1]; 6338 if (unlikely(mem_off >= 0)) { 6339 reg_off = info.reg_off_first[1]; 6340 reg_last = info.reg_off_last[1]; 6341 host = info.page[1].host; 6342 6343 set_helper_retaddr(retaddr); 6344 6345 do { 6346 uint64_t pg = vg[reg_off >> 6]; 6347 do { 6348 if ((pg >> (reg_off & 63)) & 1) { 6349 for (i = 0; i < N; ++i) { 6350 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6351 host + mem_off + (i << msz)); 6352 } 6353 } 6354 reg_off += 1 << esz; 6355 mem_off += N << msz; 6356 } while (reg_off & 63); 6357 } while (reg_off <= reg_last); 6358 6359 clear_helper_retaddr(); 6360 } 6361 } 6362 6363 static inline QEMU_ALWAYS_INLINE 6364 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6365 uint64_t desc, const uintptr_t ra, 6366 const int esz, const int msz, const int N, 6367 sve_ldst1_host_fn *host_fn, 6368 sve_ldst1_tlb_fn *tlb_fn) 6369 { 6370 uint32_t mtedesc = desc >> 32; 6371 int bit55 = extract64(addr, 55, 1); 6372 6373 /* Perform gross MTE suppression early. */ 6374 if (!tbi_check(mtedesc, bit55) || 6375 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6376 mtedesc = 0; 6377 } 6378 6379 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6380 } 6381 6382 #define DO_LD1_1(NAME, ESZ) \ 6383 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ 6384 target_ulong addr, uint64_t desc) \ 6385 { \ 6386 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ 6387 sve_##NAME##_host, sve_##NAME##_tlb); \ 6388 } \ 6389 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ 6390 target_ulong addr, uint64_t desc) \ 6391 { \ 6392 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ 6393 sve_##NAME##_host, sve_##NAME##_tlb); \ 6394 } 6395 6396 #define DO_LD1_2(NAME, ESZ, MSZ) \ 6397 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ 6398 target_ulong addr, uint64_t desc) \ 6399 { \ 6400 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 6401 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 6402 } \ 6403 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ 6404 target_ulong addr, uint64_t desc) \ 6405 { \ 6406 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 6407 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 6408 } \ 6409 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6410 target_ulong addr, uint64_t desc) \ 6411 { \ 6412 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 6413 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 6414 } \ 6415 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6416 target_ulong addr, uint64_t desc) \ 6417 { \ 6418 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 6419 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 6420 } 6421 6422 DO_LD1_1(ld1bb, MO_8) 6423 DO_LD1_1(ld1bhu, MO_16) 6424 DO_LD1_1(ld1bhs, MO_16) 6425 DO_LD1_1(ld1bsu, MO_32) 6426 DO_LD1_1(ld1bss, MO_32) 6427 DO_LD1_1(ld1bdu, MO_64) 6428 DO_LD1_1(ld1bds, MO_64) 6429 6430 DO_LD1_2(ld1hh, MO_16, MO_16) 6431 DO_LD1_2(ld1hsu, MO_32, MO_16) 6432 DO_LD1_2(ld1hss, MO_32, MO_16) 6433 DO_LD1_2(ld1hdu, MO_64, MO_16) 6434 DO_LD1_2(ld1hds, MO_64, MO_16) 6435 6436 DO_LD1_2(ld1ss, MO_32, MO_32) 6437 DO_LD1_2(ld1sdu, MO_64, MO_32) 6438 DO_LD1_2(ld1sds, MO_64, MO_32) 6439 6440 
DO_LD1_2(ld1dd, MO_64, MO_64) 6441 6442 DO_LD1_2(ld1squ, MO_128, MO_32) 6443 DO_LD1_2(ld1dqu, MO_128, MO_64) 6444 6445 #undef DO_LD1_1 6446 #undef DO_LD1_2 6447 6448 #define DO_LDN_1(N) \ 6449 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ 6450 target_ulong addr, uint64_t desc) \ 6451 { \ 6452 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ 6453 sve_ld1bb_host, sve_ld1bb_tlb); \ 6454 } \ 6455 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ 6456 target_ulong addr, uint64_t desc) \ 6457 { \ 6458 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ 6459 sve_ld1bb_host, sve_ld1bb_tlb); \ 6460 } 6461 6462 #define DO_LDN_2(N, SUFF, ESZ) \ 6463 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ 6464 target_ulong addr, uint64_t desc) \ 6465 { \ 6466 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 6467 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 6468 } \ 6469 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ 6470 target_ulong addr, uint64_t desc) \ 6471 { \ 6472 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 6473 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 6474 } \ 6475 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ 6476 target_ulong addr, uint64_t desc) \ 6477 { \ 6478 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 6479 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 6480 } \ 6481 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ 6482 target_ulong addr, uint64_t desc) \ 6483 { \ 6484 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 6485 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 6486 } 6487 6488 DO_LDN_1(2) 6489 DO_LDN_1(3) 6490 DO_LDN_1(4) 6491 6492 DO_LDN_2(2, hh, MO_16) 6493 DO_LDN_2(3, hh, MO_16) 6494 DO_LDN_2(4, hh, MO_16) 6495 6496 DO_LDN_2(2, ss, MO_32) 6497 DO_LDN_2(3, ss, MO_32) 6498 DO_LDN_2(4, ss, MO_32) 6499 6500 DO_LDN_2(2, dd, MO_64) 6501 DO_LDN_2(3, dd, MO_64) 6502 DO_LDN_2(4, dd, MO_64) 6503 6504 DO_LDN_2(2, qq, MO_128) 6505 DO_LDN_2(3, qq, MO_128) 6506 DO_LDN_2(4, qq, MO_128) 6507 6508 #undef DO_LDN_1 6509 #undef DO_LDN_2 6510 6511 /* 6512 * Load contiguous data, first-fault and no-fault. 6513 * 6514 * For user-only, we control the race between page_check_range and 6515 * another thread's munmap by using set/clear_helper_retaddr. Any 6516 * SEGV that occurs between those markers is assumed to be because 6517 * the guest page vanished. Keep that block as small as possible 6518 * so that unrelated QEMU bugs are not blamed on the guest. 6519 */ 6520 6521 /* Fault on byte I. All bits in FFR from I are cleared. The vector 6522 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE 6523 * option, which leaves subsequent data unchanged. 6524 */ 6525 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz) 6526 { 6527 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; 6528 6529 if (i & 63) { 6530 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63); 6531 i = ROUND_UP(i, 64); 6532 } 6533 for (; i < oprsz; i += 64) { 6534 ffr[i / 64] = 0; 6535 } 6536 } 6537 6538 /* 6539 * Common helper for all contiguous no-fault and first-fault loads. 
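 *
 * For FAULT_FIRST (LDFF1) the first active element is loaded normally
 * and may fault; a fault on any later element is suppressed and
 * reported instead by clearing FFR bits via record_fault().  For
 * FAULT_NO (LDNF1) no element may fault, so even the first active
 * element is only probed and may likewise be reported via FFR.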
6540  */
6541 static inline QEMU_ALWAYS_INLINE
6542 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
6543                    uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
6544                    const int esz, const int msz, const SVEContFault fault,
6545                    sve_ldst1_host_fn *host_fn,
6546                    sve_ldst1_tlb_fn *tlb_fn)
6547 {
6548     const unsigned rd = simd_data(desc);
6549     void *vd = &env->vfp.zregs[rd];
6550     const intptr_t reg_max = simd_oprsz(desc);
6551     intptr_t reg_off, mem_off, reg_last;
6552     SVEContLdSt info;
6553     int flags;
6554     void *host;
6555 
6556     /* Find the active elements. */
6557     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
6558         /* The entire predicate was false; no load occurs. */
6559         memset(vd, 0, reg_max);
6560         return;
6561     }
6562     reg_off = info.reg_off_first[0];
6563 
6564     /* Probe the page(s). */
6565     if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
6566         /* Fault on first element. */
6567         tcg_debug_assert(fault == FAULT_NO);
6568         memset(vd, 0, reg_max);
6569         goto do_fault;
6570     }
6571 
6572     mem_off = info.mem_off_first[0];
6573     flags = info.page[0].flags;
6574 
6575     /*
6576      * Disable MTE checking if the Tagged bit is not set. Since TBI must
6577      * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6578      */
6579     if (!info.page[0].tagged) {
6580         mtedesc = 0;
6581     }
6582 
6583     if (fault == FAULT_FIRST) {
6584         /* Trapping mte check for the first-fault element. */
6585         if (mtedesc) {
6586             mte_check(env, mtedesc, addr + mem_off, retaddr);
6587         }
6588 
6589         /*
6590          * Special handling of the first active element,
6591          * if it crosses a page boundary or is MMIO.
6592          */
6593         bool is_split = mem_off == info.mem_off_split;
6594         if (unlikely(flags != 0) || unlikely(is_split)) {
6595             /*
6596              * Use the slow path for cross-page handling.
6597              * Might trap for MMIO or watchpoints.
6598              */
6599             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6600 
6601             /* After any fault, zero the other elements. */
6602             swap_memzero(vd, reg_off);
6603             reg_off += 1 << esz;
6604             mem_off += 1 << msz;
6605             swap_memzero(vd + reg_off, reg_max - reg_off);
6606 
6607             if (is_split) {
6608                 goto second_page;
6609             }
6610         } else {
6611             memset(vd, 0, reg_max);
6612         }
6613     } else {
6614         memset(vd, 0, reg_max);
6615         if (unlikely(mem_off == info.mem_off_split)) {
6616             /* The first active element crosses a page boundary. */
6617             flags |= info.page[1].flags;
6618             if (unlikely(flags & TLB_MMIO)) {
6619                 /* Some page is MMIO, see below. */
6620                 goto do_fault;
6621             }
6622             if (unlikely(flags & TLB_WATCHPOINT) &&
6623                 (cpu_watchpoint_address_matches
6624                  (env_cpu(env), addr + mem_off, 1 << msz)
6625                  & BP_MEM_READ)) {
6626                 /* Watchpoint hit, see below. */
6627                 goto do_fault;
6628             }
6629             if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6630                 goto do_fault;
6631             }
6632             /*
6633              * Use the slow path for cross-page handling.
6634              * This is RAM, without a watchpoint, and will not trap.
6635              */
6636             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6637             goto second_page;
6638         }
6639     }
6640 
6641     /*
6642      * From this point on, all memory operations are MemSingleNF.
6643      *
6644      * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6645      * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6646      *
6647      * Unfortunately we do not have access to the memory attributes from the
6648      * PTE to tell Device memory from Normal memory. So we make a mostly
6649      * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6650 * This gives the right answer for the common cases of "Normal memory, 6651 * backed by host RAM" and "Device memory, backed by MMIO". 6652 * The architecture allows us to suppress an NF load and return 6653 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner 6654 * case of "Normal memory, backed by MMIO" is permitted. The case we 6655 * get wrong is "Device memory, backed by host RAM", for which we 6656 * should return (UNKNOWN, FAULT) for but do not. 6657 * 6658 * Similarly, CPU_BP breakpoints would raise exceptions, and so 6659 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and 6660 * architectural breakpoints the same. 6661 */ 6662 if (unlikely(flags & TLB_MMIO)) { 6663 goto do_fault; 6664 } 6665 6666 reg_last = info.reg_off_last[0]; 6667 host = info.page[0].host; 6668 6669 set_helper_retaddr(retaddr); 6670 6671 do { 6672 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3)); 6673 do { 6674 if ((pg >> (reg_off & 63)) & 1) { 6675 if (unlikely(flags & TLB_WATCHPOINT) && 6676 (cpu_watchpoint_address_matches 6677 (env_cpu(env), addr + mem_off, 1 << msz) 6678 & BP_MEM_READ)) { 6679 clear_helper_retaddr(); 6680 goto do_fault; 6681 } 6682 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6683 clear_helper_retaddr(); 6684 goto do_fault; 6685 } 6686 host_fn(vd, reg_off, host + mem_off); 6687 } 6688 reg_off += 1 << esz; 6689 mem_off += 1 << msz; 6690 } while (reg_off <= reg_last && (reg_off & 63)); 6691 } while (reg_off <= reg_last); 6692 6693 clear_helper_retaddr(); 6694 6695 /* 6696 * MemSingleNF is allowed to fail for any reason. We have special 6697 * code above to handle the first element crossing a page boundary. 6698 * As an implementation choice, decline to handle a cross-page element 6699 * in any other position. 6700 */ 6701 reg_off = info.reg_off_split; 6702 if (reg_off >= 0) { 6703 goto do_fault; 6704 } 6705 6706 second_page: 6707 reg_off = info.reg_off_first[1]; 6708 if (likely(reg_off < 0)) { 6709 /* No active elements on the second page. All done. */ 6710 return; 6711 } 6712 6713 /* 6714 * MemSingleNF is allowed to fail for any reason. As an implementation 6715 * choice, decline to handle elements on the second page. This should 6716 * be low frequency as the guest walks through memory -- the next 6717 * iteration of the guest's loop should be aligned on the page boundary, 6718 * and then all following iterations will stay aligned. 6719 */ 6720 6721 do_fault: 6722 record_fault(env, reg_off, reg_max); 6723 } 6724 6725 static inline QEMU_ALWAYS_INLINE 6726 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, 6727 uint64_t desc, const uintptr_t retaddr, 6728 const int esz, const int msz, const SVEContFault fault, 6729 sve_ldst1_host_fn *host_fn, 6730 sve_ldst1_tlb_fn *tlb_fn) 6731 { 6732 uint32_t mtedesc = desc >> 32; 6733 int bit55 = extract64(addr, 55, 1); 6734 6735 /* Perform gross MTE suppression early. 
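 * If TBI is disabled for this address (bit 55 selects which TBI
 * field applies), or the logical tag falls in an unchecked TCMA
 * region, no tag check can fire, so clear mtedesc and skip MTE
 * processing entirely.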
*/ 6736 if (!tbi_check(mtedesc, bit55) || 6737 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6738 mtedesc = 0; 6739 } 6740 6741 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc, 6742 esz, msz, fault, host_fn, tlb_fn); 6743 } 6744 6745 #define DO_LDFF1_LDNF1_1(PART, ESZ) \ 6746 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ 6747 target_ulong addr, uint64_t desc) \ 6748 { \ 6749 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ 6750 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6751 } \ 6752 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ 6753 target_ulong addr, uint64_t desc) \ 6754 { \ 6755 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ 6756 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6757 } \ 6758 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6759 target_ulong addr, uint64_t desc) \ 6760 { \ 6761 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ 6762 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6763 } \ 6764 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6765 target_ulong addr, uint64_t desc) \ 6766 { \ 6767 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ 6768 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6769 } 6770 6771 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ 6772 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ 6773 target_ulong addr, uint64_t desc) \ 6774 { \ 6775 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6776 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6777 } \ 6778 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ 6779 target_ulong addr, uint64_t desc) \ 6780 { \ 6781 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6782 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6783 } \ 6784 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ 6785 target_ulong addr, uint64_t desc) \ 6786 { \ 6787 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6788 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6789 } \ 6790 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ 6791 target_ulong addr, uint64_t desc) \ 6792 { \ 6793 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6794 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6795 } \ 6796 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6797 target_ulong addr, uint64_t desc) \ 6798 { \ 6799 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6800 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6801 } \ 6802 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6803 target_ulong addr, uint64_t desc) \ 6804 { \ 6805 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6806 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6807 } \ 6808 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6809 target_ulong addr, uint64_t desc) \ 6810 { \ 6811 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6812 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6813 } \ 6814 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6815 target_ulong addr, uint64_t desc) \ 6816 { \ 6817 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6818 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6819 } 6820 6821 DO_LDFF1_LDNF1_1(bb, MO_8) 6822 DO_LDFF1_LDNF1_1(bhu, 
MO_16) 6823 DO_LDFF1_LDNF1_1(bhs, MO_16) 6824 DO_LDFF1_LDNF1_1(bsu, MO_32) 6825 DO_LDFF1_LDNF1_1(bss, MO_32) 6826 DO_LDFF1_LDNF1_1(bdu, MO_64) 6827 DO_LDFF1_LDNF1_1(bds, MO_64) 6828 6829 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16) 6830 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16) 6831 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16) 6832 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16) 6833 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16) 6834 6835 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32) 6836 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32) 6837 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32) 6838 6839 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64) 6840 6841 #undef DO_LDFF1_LDNF1_1 6842 #undef DO_LDFF1_LDNF1_2 6843 6844 /* 6845 * Common helper for all contiguous 1,2,3,4-register predicated stores. 6846 */ 6847 6848 static inline QEMU_ALWAYS_INLINE 6849 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, 6850 uint32_t desc, const uintptr_t retaddr, 6851 const int esz, const int msz, const int N, uint32_t mtedesc, 6852 sve_ldst1_host_fn *host_fn, 6853 sve_ldst1_tlb_fn *tlb_fn) 6854 { 6855 const unsigned rd = simd_data(desc); 6856 const intptr_t reg_max = simd_oprsz(desc); 6857 intptr_t reg_off, reg_last, mem_off; 6858 SVEContLdSt info; 6859 void *host; 6860 int i, flags; 6861 6862 /* Find the active elements. */ 6863 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 6864 /* The entire predicate was false; no store occurs. */ 6865 return; 6866 } 6867 6868 /* Probe the page(s). Exit with exception for any invalid page. */ 6869 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr); 6870 6871 /* Handle watchpoints for all active elements. */ 6872 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 6873 BP_MEM_WRITE, retaddr); 6874 6875 /* 6876 * Handle mte checks for all active elements. 6877 * Since TBI must be set for MTE, !mtedesc => !mte_active. 6878 */ 6879 if (mtedesc) { 6880 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 6881 mtedesc, retaddr); 6882 } 6883 6884 flags = info.page[0].flags | info.page[1].flags; 6885 if (unlikely(flags != 0)) { 6886 /* 6887 * At least one page includes MMIO. 6888 * Any bus operation can fail with cpu_transaction_failed, 6889 * which for ARM will raise SyncExternal. We cannot avoid 6890 * this fault and will leave with the store incomplete. 
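 * (Unlike the load path above, there is no scratch buffer to roll
 * back: elements already stored before the failing access remain
 * written.)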
6891 */ 6892 mem_off = info.mem_off_first[0]; 6893 reg_off = info.reg_off_first[0]; 6894 reg_last = info.reg_off_last[1]; 6895 if (reg_last < 0) { 6896 reg_last = info.reg_off_split; 6897 if (reg_last < 0) { 6898 reg_last = info.reg_off_last[0]; 6899 } 6900 } 6901 6902 do { 6903 uint64_t pg = vg[reg_off >> 6]; 6904 do { 6905 if ((pg >> (reg_off & 63)) & 1) { 6906 for (i = 0; i < N; ++i) { 6907 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6908 addr + mem_off + (i << msz), retaddr); 6909 } 6910 } 6911 reg_off += 1 << esz; 6912 mem_off += N << msz; 6913 } while (reg_off & 63); 6914 } while (reg_off <= reg_last); 6915 return; 6916 } 6917 6918 mem_off = info.mem_off_first[0]; 6919 reg_off = info.reg_off_first[0]; 6920 reg_last = info.reg_off_last[0]; 6921 host = info.page[0].host; 6922 6923 set_helper_retaddr(retaddr); 6924 6925 while (reg_off <= reg_last) { 6926 uint64_t pg = vg[reg_off >> 6]; 6927 do { 6928 if ((pg >> (reg_off & 63)) & 1) { 6929 for (i = 0; i < N; ++i) { 6930 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6931 host + mem_off + (i << msz)); 6932 } 6933 } 6934 reg_off += 1 << esz; 6935 mem_off += N << msz; 6936 } while (reg_off <= reg_last && (reg_off & 63)); 6937 } 6938 6939 clear_helper_retaddr(); 6940 6941 /* 6942 * Use the slow path to manage the cross-page misalignment. 6943 * But we know this is RAM and cannot trap. 6944 */ 6945 mem_off = info.mem_off_split; 6946 if (unlikely(mem_off >= 0)) { 6947 reg_off = info.reg_off_split; 6948 for (i = 0; i < N; ++i) { 6949 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6950 addr + mem_off + (i << msz), retaddr); 6951 } 6952 } 6953 6954 mem_off = info.mem_off_first[1]; 6955 if (unlikely(mem_off >= 0)) { 6956 reg_off = info.reg_off_first[1]; 6957 reg_last = info.reg_off_last[1]; 6958 host = info.page[1].host; 6959 6960 set_helper_retaddr(retaddr); 6961 6962 do { 6963 uint64_t pg = vg[reg_off >> 6]; 6964 do { 6965 if ((pg >> (reg_off & 63)) & 1) { 6966 for (i = 0; i < N; ++i) { 6967 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6968 host + mem_off + (i << msz)); 6969 } 6970 } 6971 reg_off += 1 << esz; 6972 mem_off += N << msz; 6973 } while (reg_off & 63); 6974 } while (reg_off <= reg_last); 6975 6976 clear_helper_retaddr(); 6977 } 6978 } 6979 6980 static inline QEMU_ALWAYS_INLINE 6981 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6982 uint64_t desc, const uintptr_t ra, 6983 const int esz, const int msz, const int N, 6984 sve_ldst1_host_fn *host_fn, 6985 sve_ldst1_tlb_fn *tlb_fn) 6986 { 6987 uint32_t mtedesc = desc >> 32; 6988 int bit55 = extract64(addr, 55, 1); 6989 6990 /* Perform gross MTE suppression early. 
*/ 6991 if (!tbi_check(mtedesc, bit55) || 6992 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6993 mtedesc = 0; 6994 } 6995 6996 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6997 } 6998 6999 #define DO_STN_1(N, NAME, ESZ) \ 7000 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ 7001 target_ulong addr, uint64_t desc) \ 7002 { \ 7003 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ 7004 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 7005 } \ 7006 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ 7007 target_ulong addr, uint64_t desc) \ 7008 { \ 7009 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ 7010 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 7011 } 7012 7013 #define DO_STN_2(N, NAME, ESZ, MSZ) \ 7014 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ 7015 target_ulong addr, uint64_t desc) \ 7016 { \ 7017 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 7018 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 7019 } \ 7020 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ 7021 target_ulong addr, uint64_t desc) \ 7022 { \ 7023 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 7024 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 7025 } \ 7026 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 7027 target_ulong addr, uint64_t desc) \ 7028 { \ 7029 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 7030 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 7031 } \ 7032 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 7033 target_ulong addr, uint64_t desc) \ 7034 { \ 7035 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 7036 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 7037 } 7038 7039 DO_STN_1(1, bb, MO_8) 7040 DO_STN_1(1, bh, MO_16) 7041 DO_STN_1(1, bs, MO_32) 7042 DO_STN_1(1, bd, MO_64) 7043 DO_STN_1(2, bb, MO_8) 7044 DO_STN_1(3, bb, MO_8) 7045 DO_STN_1(4, bb, MO_8) 7046 7047 DO_STN_2(1, hh, MO_16, MO_16) 7048 DO_STN_2(1, hs, MO_32, MO_16) 7049 DO_STN_2(1, hd, MO_64, MO_16) 7050 DO_STN_2(2, hh, MO_16, MO_16) 7051 DO_STN_2(3, hh, MO_16, MO_16) 7052 DO_STN_2(4, hh, MO_16, MO_16) 7053 7054 DO_STN_2(1, ss, MO_32, MO_32) 7055 DO_STN_2(1, sd, MO_64, MO_32) 7056 DO_STN_2(2, ss, MO_32, MO_32) 7057 DO_STN_2(3, ss, MO_32, MO_32) 7058 DO_STN_2(4, ss, MO_32, MO_32) 7059 7060 DO_STN_2(1, dd, MO_64, MO_64) 7061 DO_STN_2(2, dd, MO_64, MO_64) 7062 DO_STN_2(3, dd, MO_64, MO_64) 7063 DO_STN_2(4, dd, MO_64, MO_64) 7064 7065 DO_STN_2(1, sq, MO_128, MO_32) 7066 DO_STN_2(1, dq, MO_128, MO_64) 7067 7068 DO_STN_2(2, qq, MO_128, MO_128) 7069 DO_STN_2(3, qq, MO_128, MO_128) 7070 DO_STN_2(4, qq, MO_128, MO_128) 7071 7072 #undef DO_STN_1 7073 #undef DO_STN_2 7074 7075 /* 7076 * Loads with a vector index. 7077 */ 7078 7079 /* 7080 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed. 
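 *
 * off_zsu_s/off_zss_s return a zero- or sign-extended 32-bit offset
 * element, off_zsu_d/off_zss_d the low 32 bits of a 64-bit element,
 * and off_zd_d the raw 64-bit element.  The gather helpers below then
 * form each address as
 *
 *     addr = base + (off_fn(vm, reg_off) << scale);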
7081 */ 7082 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs); 7083 7084 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs) 7085 { 7086 return *(uint32_t *)(reg + H1_4(reg_ofs)); 7087 } 7088 7089 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs) 7090 { 7091 return *(int32_t *)(reg + H1_4(reg_ofs)); 7092 } 7093 7094 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs) 7095 { 7096 return (uint32_t)*(uint64_t *)(reg + reg_ofs); 7097 } 7098 7099 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs) 7100 { 7101 return (int32_t)*(uint64_t *)(reg + reg_ofs); 7102 } 7103 7104 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs) 7105 { 7106 return *(uint64_t *)(reg + reg_ofs); 7107 } 7108 7109 static inline QEMU_ALWAYS_INLINE 7110 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7111 target_ulong base, uint32_t desc, uintptr_t retaddr, 7112 uint32_t mtedesc, int esize, int msize, 7113 zreg_off_fn *off_fn, 7114 sve_ldst1_host_fn *host_fn, 7115 sve_ldst1_tlb_fn *tlb_fn) 7116 { 7117 const int mmu_idx = arm_env_mmu_index(env); 7118 const intptr_t reg_max = simd_oprsz(desc); 7119 const int scale = simd_data(desc); 7120 ARMVectorReg scratch; 7121 intptr_t reg_off; 7122 SVEHostPage info, info2; 7123 7124 memset(&scratch, 0, reg_max); 7125 reg_off = 0; 7126 do { 7127 uint64_t pg = vg[reg_off >> 6]; 7128 do { 7129 if (likely(pg & 1)) { 7130 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 7131 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 7132 7133 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD, 7134 mmu_idx, retaddr); 7135 7136 if (likely(in_page >= msize)) { 7137 if (unlikely(info.flags & TLB_WATCHPOINT)) { 7138 cpu_check_watchpoint(env_cpu(env), addr, msize, 7139 info.attrs, BP_MEM_READ, retaddr); 7140 } 7141 if (mtedesc && info.tagged) { 7142 mte_check(env, mtedesc, addr, retaddr); 7143 } 7144 if (unlikely(info.flags & TLB_MMIO)) { 7145 tlb_fn(env, &scratch, reg_off, addr, retaddr); 7146 } else { 7147 set_helper_retaddr(retaddr); 7148 host_fn(&scratch, reg_off, info.host); 7149 clear_helper_retaddr(); 7150 } 7151 } else { 7152 /* Element crosses the page boundary. */ 7153 sve_probe_page(&info2, false, env, addr + in_page, 0, 7154 MMU_DATA_LOAD, mmu_idx, retaddr); 7155 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) { 7156 cpu_check_watchpoint(env_cpu(env), addr, 7157 msize, info.attrs, 7158 BP_MEM_READ, retaddr); 7159 } 7160 if (mtedesc && info.tagged) { 7161 mte_check(env, mtedesc, addr, retaddr); 7162 } 7163 tlb_fn(env, &scratch, reg_off, addr, retaddr); 7164 } 7165 } 7166 reg_off += esize; 7167 pg >>= esize; 7168 } while (reg_off & 63); 7169 } while (reg_off < reg_max); 7170 7171 /* Wait until all exceptions have been raised to write back. */ 7172 memcpy(vd, &scratch, reg_max); 7173 } 7174 7175 static inline QEMU_ALWAYS_INLINE 7176 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7177 target_ulong base, uint64_t desc, uintptr_t retaddr, 7178 int esize, int msize, zreg_off_fn *off_fn, 7179 sve_ldst1_host_fn *host_fn, 7180 sve_ldst1_tlb_fn *tlb_fn) 7181 { 7182 uint32_t mtedesc = desc >> 32; 7183 7184 /* 7185 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7186 * offset base entirely over the address space hole to change the 7187 * pointer tag, or change the bit55 selector. So we could here 7188 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
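     * Until that is done, mtedesc is passed through unchanged and
     * sve_ld1_z() simply performs an mte_check() on each active
     * element individually.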
7189 */ 7190 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 7191 esize, msize, off_fn, host_fn, tlb_fn); 7192 } 7193 7194 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ 7195 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7196 void *vm, target_ulong base, uint64_t desc) \ 7197 { \ 7198 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 7199 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7200 } \ 7201 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7202 void *vm, target_ulong base, uint64_t desc) \ 7203 { \ 7204 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 7205 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7206 } 7207 7208 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ 7209 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7210 void *vm, target_ulong base, uint64_t desc) \ 7211 { \ 7212 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 7213 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7214 } \ 7215 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7216 void *vm, target_ulong base, uint64_t desc) \ 7217 { \ 7218 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 7219 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7220 } 7221 7222 #define DO_LD1_ZPZ_Q(MEM, OFS, MSZ) \ 7223 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7224 void *vm, target_ulong base, uint64_t desc) \ 7225 { \ 7226 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 16, 1 << MSZ, \ 7227 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7228 } \ 7229 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7230 void *vm, target_ulong base, uint64_t desc) \ 7231 { \ 7232 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 16, 1 << MSZ, \ 7233 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7234 } 7235 7236 DO_LD1_ZPZ_S(bsu, zsu, MO_8) 7237 DO_LD1_ZPZ_S(bsu, zss, MO_8) 7238 DO_LD1_ZPZ_D(bdu, zsu, MO_8) 7239 DO_LD1_ZPZ_D(bdu, zss, MO_8) 7240 DO_LD1_ZPZ_D(bdu, zd, MO_8) 7241 7242 DO_LD1_ZPZ_S(bss, zsu, MO_8) 7243 DO_LD1_ZPZ_S(bss, zss, MO_8) 7244 DO_LD1_ZPZ_D(bds, zsu, MO_8) 7245 DO_LD1_ZPZ_D(bds, zss, MO_8) 7246 DO_LD1_ZPZ_D(bds, zd, MO_8) 7247 7248 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16) 7249 DO_LD1_ZPZ_S(hsu_le, zss, MO_16) 7250 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16) 7251 DO_LD1_ZPZ_D(hdu_le, zss, MO_16) 7252 DO_LD1_ZPZ_D(hdu_le, zd, MO_16) 7253 7254 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16) 7255 DO_LD1_ZPZ_S(hsu_be, zss, MO_16) 7256 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16) 7257 DO_LD1_ZPZ_D(hdu_be, zss, MO_16) 7258 DO_LD1_ZPZ_D(hdu_be, zd, MO_16) 7259 7260 DO_LD1_ZPZ_S(hss_le, zsu, MO_16) 7261 DO_LD1_ZPZ_S(hss_le, zss, MO_16) 7262 DO_LD1_ZPZ_D(hds_le, zsu, MO_16) 7263 DO_LD1_ZPZ_D(hds_le, zss, MO_16) 7264 DO_LD1_ZPZ_D(hds_le, zd, MO_16) 7265 7266 DO_LD1_ZPZ_S(hss_be, zsu, MO_16) 7267 DO_LD1_ZPZ_S(hss_be, zss, MO_16) 7268 DO_LD1_ZPZ_D(hds_be, zsu, MO_16) 7269 DO_LD1_ZPZ_D(hds_be, zss, MO_16) 7270 DO_LD1_ZPZ_D(hds_be, zd, MO_16) 7271 7272 DO_LD1_ZPZ_S(ss_le, zsu, MO_32) 7273 DO_LD1_ZPZ_S(ss_le, zss, MO_32) 7274 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32) 7275 DO_LD1_ZPZ_D(sdu_le, zss, MO_32) 7276 DO_LD1_ZPZ_D(sdu_le, zd, MO_32) 7277 7278 DO_LD1_ZPZ_S(ss_be, zsu, MO_32) 7279 DO_LD1_ZPZ_S(ss_be, zss, MO_32) 7280 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32) 7281 DO_LD1_ZPZ_D(sdu_be, zss, MO_32) 7282 DO_LD1_ZPZ_D(sdu_be, zd, MO_32) 7283 7284 DO_LD1_ZPZ_D(sds_le, zsu, MO_32) 7285 DO_LD1_ZPZ_D(sds_le, 
zss, MO_32) 7286 DO_LD1_ZPZ_D(sds_le, zd, MO_32) 7287 7288 DO_LD1_ZPZ_D(sds_be, zsu, MO_32) 7289 DO_LD1_ZPZ_D(sds_be, zss, MO_32) 7290 DO_LD1_ZPZ_D(sds_be, zd, MO_32) 7291 7292 DO_LD1_ZPZ_D(dd_le, zsu, MO_64) 7293 DO_LD1_ZPZ_D(dd_le, zss, MO_64) 7294 DO_LD1_ZPZ_D(dd_le, zd, MO_64) 7295 7296 DO_LD1_ZPZ_D(dd_be, zsu, MO_64) 7297 DO_LD1_ZPZ_D(dd_be, zss, MO_64) 7298 DO_LD1_ZPZ_D(dd_be, zd, MO_64) 7299 7300 DO_LD1_ZPZ_Q(qq_le, zd, MO_128) 7301 DO_LD1_ZPZ_Q(qq_be, zd, MO_128) 7302 7303 #undef DO_LD1_ZPZ_S 7304 #undef DO_LD1_ZPZ_D 7305 7306 /* First fault loads with a vector index. */ 7307 7308 /* 7309 * Common helpers for all gather first-faulting loads. 7310 */ 7311 7312 static inline QEMU_ALWAYS_INLINE 7313 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7314 target_ulong base, uint32_t desc, uintptr_t retaddr, 7315 uint32_t mtedesc, const int esz, const int msz, 7316 zreg_off_fn *off_fn, 7317 sve_ldst1_host_fn *host_fn, 7318 sve_ldst1_tlb_fn *tlb_fn) 7319 { 7320 const int mmu_idx = arm_env_mmu_index(env); 7321 const intptr_t reg_max = simd_oprsz(desc); 7322 const int scale = simd_data(desc); 7323 const int esize = 1 << esz; 7324 const int msize = 1 << msz; 7325 intptr_t reg_off; 7326 SVEHostPage info; 7327 target_ulong addr, in_page; 7328 ARMVectorReg scratch; 7329 7330 /* Skip to the first true predicate. */ 7331 reg_off = find_next_active(vg, 0, reg_max, esz); 7332 if (unlikely(reg_off >= reg_max)) { 7333 /* The entire predicate was false; no load occurs. */ 7334 memset(vd, 0, reg_max); 7335 return; 7336 } 7337 7338 /* Protect against overlap between vd and vm. */ 7339 if (unlikely(vd == vm)) { 7340 vm = memcpy(&scratch, vm, reg_max); 7341 } 7342 7343 /* 7344 * Probe the first element, allowing faults. 7345 */ 7346 addr = base + (off_fn(vm, reg_off) << scale); 7347 if (mtedesc) { 7348 mte_check(env, mtedesc, addr, retaddr); 7349 } 7350 tlb_fn(env, vd, reg_off, addr, retaddr); 7351 7352 /* After any fault, zero the other elements. */ 7353 swap_memzero(vd, reg_off); 7354 reg_off += esize; 7355 swap_memzero(vd + reg_off, reg_max - reg_off); 7356 7357 /* 7358 * Probe the remaining elements, not allowing faults. 7359 */ 7360 while (reg_off < reg_max) { 7361 uint64_t pg = vg[reg_off >> 6]; 7362 do { 7363 if (likely((pg >> (reg_off & 63)) & 1)) { 7364 addr = base + (off_fn(vm, reg_off) << scale); 7365 in_page = -(addr | TARGET_PAGE_MASK); 7366 7367 if (unlikely(in_page < msize)) { 7368 /* Stop if the element crosses a page boundary. 
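                     * Only the first active element of a first-fault
                     * load is allowed to take an exception; rather than
                     * probe the second page non-faulting, treat this
                     * element as having faulted and let record_fault()
                     * mark it and all later elements inactive in FFR.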
*/ 7369 goto fault; 7370 } 7371 7372 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD, 7373 mmu_idx, retaddr); 7374 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) { 7375 goto fault; 7376 } 7377 if (unlikely(info.flags & TLB_WATCHPOINT) && 7378 (cpu_watchpoint_address_matches 7379 (env_cpu(env), addr, msize) & BP_MEM_READ)) { 7380 goto fault; 7381 } 7382 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) { 7383 goto fault; 7384 } 7385 7386 set_helper_retaddr(retaddr); 7387 host_fn(vd, reg_off, info.host); 7388 clear_helper_retaddr(); 7389 } 7390 reg_off += esize; 7391 } while (reg_off & 63); 7392 } 7393 return; 7394 7395 fault: 7396 record_fault(env, reg_off, reg_max); 7397 } 7398 7399 static inline QEMU_ALWAYS_INLINE 7400 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7401 target_ulong base, uint64_t desc, uintptr_t retaddr, 7402 const int esz, const int msz, 7403 zreg_off_fn *off_fn, 7404 sve_ldst1_host_fn *host_fn, 7405 sve_ldst1_tlb_fn *tlb_fn) 7406 { 7407 uint32_t mtedesc = desc >> 32; 7408 7409 /* 7410 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7411 * offset base entirely over the address space hole to change the 7412 * pointer tag, or change the bit55 selector. So we could here 7413 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 7414 */ 7415 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 7416 esz, msz, off_fn, host_fn, tlb_fn); 7417 } 7418 7419 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ 7420 void HELPER(sve_ldff##MEM##_##OFS) \ 7421 (CPUARMState *env, void *vd, void *vg, \ 7422 void *vm, target_ulong base, uint64_t desc) \ 7423 { \ 7424 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \ 7425 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7426 } \ 7427 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 7428 (CPUARMState *env, void *vd, void *vg, \ 7429 void *vm, target_ulong base, uint64_t desc) \ 7430 { \ 7431 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ 7432 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7433 } 7434 7435 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ 7436 void HELPER(sve_ldff##MEM##_##OFS) \ 7437 (CPUARMState *env, void *vd, void *vg, \ 7438 void *vm, target_ulong base, uint64_t desc) \ 7439 { \ 7440 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \ 7441 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7442 } \ 7443 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 7444 (CPUARMState *env, void *vd, void *vg, \ 7445 void *vm, target_ulong base, uint64_t desc) \ 7446 { \ 7447 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ 7448 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7449 } 7450 7451 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) 7452 DO_LDFF1_ZPZ_S(bsu, zss, MO_8) 7453 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8) 7454 DO_LDFF1_ZPZ_D(bdu, zss, MO_8) 7455 DO_LDFF1_ZPZ_D(bdu, zd, MO_8) 7456 7457 DO_LDFF1_ZPZ_S(bss, zsu, MO_8) 7458 DO_LDFF1_ZPZ_S(bss, zss, MO_8) 7459 DO_LDFF1_ZPZ_D(bds, zsu, MO_8) 7460 DO_LDFF1_ZPZ_D(bds, zss, MO_8) 7461 DO_LDFF1_ZPZ_D(bds, zd, MO_8) 7462 7463 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16) 7464 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16) 7465 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16) 7466 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16) 7467 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16) 7468 7469 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16) 7470 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16) 7471 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16) 7472 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16) 7473 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16) 7474 7475 DO_LDFF1_ZPZ_S(hss_le, zsu, 
MO_16) 7476 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16) 7477 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16) 7478 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16) 7479 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16) 7480 7481 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16) 7482 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16) 7483 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16) 7484 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16) 7485 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16) 7486 7487 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32) 7488 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32) 7489 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32) 7490 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32) 7491 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32) 7492 7493 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32) 7494 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32) 7495 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32) 7496 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32) 7497 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32) 7498 7499 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32) 7500 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32) 7501 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32) 7502 7503 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32) 7504 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32) 7505 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32) 7506 7507 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64) 7508 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64) 7509 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64) 7510 7511 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64) 7512 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64) 7513 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64) 7514 7515 /* Stores with a vector index. */ 7516 7517 static inline QEMU_ALWAYS_INLINE 7518 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7519 target_ulong base, uint32_t desc, uintptr_t retaddr, 7520 uint32_t mtedesc, int esize, int msize, 7521 zreg_off_fn *off_fn, 7522 sve_ldst1_host_fn *host_fn, 7523 sve_ldst1_tlb_fn *tlb_fn) 7524 { 7525 const int mmu_idx = arm_env_mmu_index(env); 7526 const intptr_t reg_max = simd_oprsz(desc); 7527 const int scale = simd_data(desc); 7528 void *host[ARM_MAX_VQ * 4]; 7529 intptr_t reg_off, i; 7530 SVEHostPage info, info2; 7531 7532 /* 7533 * Probe all of the elements for host addresses and flags. 7534 */ 7535 i = reg_off = 0; 7536 do { 7537 uint64_t pg = vg[reg_off >> 6]; 7538 do { 7539 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 7540 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 7541 7542 host[i] = NULL; 7543 if (likely((pg >> (reg_off & 63)) & 1)) { 7544 if (likely(in_page >= msize)) { 7545 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE, 7546 mmu_idx, retaddr); 7547 if (!(info.flags & TLB_MMIO)) { 7548 host[i] = info.host; 7549 } 7550 } else { 7551 /* 7552 * Element crosses the page boundary. 7553 * Probe both pages, but do not record the host address, 7554 * so that we use the slow path. 7555 */ 7556 sve_probe_page(&info, false, env, addr, 0, 7557 MMU_DATA_STORE, mmu_idx, retaddr); 7558 sve_probe_page(&info2, false, env, addr + in_page, 0, 7559 MMU_DATA_STORE, mmu_idx, retaddr); 7560 info.flags |= info2.flags; 7561 } 7562 7563 if (unlikely(info.flags & TLB_WATCHPOINT)) { 7564 cpu_check_watchpoint(env_cpu(env), addr, msize, 7565 info.attrs, BP_MEM_WRITE, retaddr); 7566 } 7567 7568 if (mtedesc && info.tagged) { 7569 mte_check(env, mtedesc, addr, retaddr); 7570 } 7571 } 7572 i += 1; 7573 reg_off += esize; 7574 } while (reg_off & 63); 7575 } while (reg_off < reg_max); 7576 7577 /* 7578 * Now that we have recognized all exceptions except SyncExternal 7579 * (from TLB_MMIO), which we cannot avoid, perform all of the stores. 7580 * 7581 * Note for the common case of an element in RAM, not crossing a page 7582 * boundary, we have stored the host address in host[]. 
This doubles 7583 * as a first-level check against the predicate, since only enabled 7584 * elements have non-null host addresses. 7585 */ 7586 i = reg_off = 0; 7587 do { 7588 void *h = host[i]; 7589 if (likely(h != NULL)) { 7590 set_helper_retaddr(retaddr); 7591 host_fn(vd, reg_off, h); 7592 clear_helper_retaddr(); 7593 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) { 7594 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 7595 tlb_fn(env, vd, reg_off, addr, retaddr); 7596 } 7597 i += 1; 7598 reg_off += esize; 7599 } while (reg_off < reg_max); 7600 } 7601 7602 static inline QEMU_ALWAYS_INLINE 7603 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7604 target_ulong base, uint64_t desc, uintptr_t retaddr, 7605 int esize, int msize, zreg_off_fn *off_fn, 7606 sve_ldst1_host_fn *host_fn, 7607 sve_ldst1_tlb_fn *tlb_fn) 7608 { 7609 uint32_t mtedesc = desc >> 32; 7610 7611 /* 7612 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7613 * offset base entirely over the address space hole to change the 7614 * pointer tag, or change the bit55 selector. So we could here 7615 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 7616 */ 7617 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 7618 esize, msize, off_fn, host_fn, tlb_fn); 7619 } 7620 7621 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \ 7622 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7623 void *vm, target_ulong base, uint64_t desc) \ 7624 { \ 7625 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 7626 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7627 } \ 7628 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7629 void *vm, target_ulong base, uint64_t desc) \ 7630 { \ 7631 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 7632 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7633 } 7634 7635 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \ 7636 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7637 void *vm, target_ulong base, uint64_t desc) \ 7638 { \ 7639 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 7640 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7641 } \ 7642 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7643 void *vm, target_ulong base, uint64_t desc) \ 7644 { \ 7645 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 7646 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7647 } 7648 7649 #define DO_ST1_ZPZ_Q(MEM, OFS, MSZ) \ 7650 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7651 void *vm, target_ulong base, uint64_t desc) \ 7652 { \ 7653 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 16, 1 << MSZ, \ 7654 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7655 } \ 7656 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7657 void *vm, target_ulong base, uint64_t desc) \ 7658 { \ 7659 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 16, 1 << MSZ, \ 7660 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7661 } 7662 7663 DO_ST1_ZPZ_S(bs, zsu, MO_8) 7664 DO_ST1_ZPZ_S(hs_le, zsu, MO_16) 7665 DO_ST1_ZPZ_S(hs_be, zsu, MO_16) 7666 DO_ST1_ZPZ_S(ss_le, zsu, MO_32) 7667 DO_ST1_ZPZ_S(ss_be, zsu, MO_32) 7668 7669 DO_ST1_ZPZ_S(bs, zss, MO_8) 7670 DO_ST1_ZPZ_S(hs_le, zss, MO_16) 7671 DO_ST1_ZPZ_S(hs_be, zss, MO_16) 7672 DO_ST1_ZPZ_S(ss_le, zss, MO_32) 7673 DO_ST1_ZPZ_S(ss_be, zss, MO_32) 7674 7675 
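/*
 * Scatter stores with 64-bit data elements: the zsu/zss forms take
 * 32-bit unsigned/signed offsets held in 64-bit elements, and the zd
 * forms take full 64-bit offsets.
 */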
DO_ST1_ZPZ_D(bd, zsu, MO_8) 7676 DO_ST1_ZPZ_D(hd_le, zsu, MO_16) 7677 DO_ST1_ZPZ_D(hd_be, zsu, MO_16) 7678 DO_ST1_ZPZ_D(sd_le, zsu, MO_32) 7679 DO_ST1_ZPZ_D(sd_be, zsu, MO_32) 7680 DO_ST1_ZPZ_D(dd_le, zsu, MO_64) 7681 DO_ST1_ZPZ_D(dd_be, zsu, MO_64) 7682 7683 DO_ST1_ZPZ_D(bd, zss, MO_8) 7684 DO_ST1_ZPZ_D(hd_le, zss, MO_16) 7685 DO_ST1_ZPZ_D(hd_be, zss, MO_16) 7686 DO_ST1_ZPZ_D(sd_le, zss, MO_32) 7687 DO_ST1_ZPZ_D(sd_be, zss, MO_32) 7688 DO_ST1_ZPZ_D(dd_le, zss, MO_64) 7689 DO_ST1_ZPZ_D(dd_be, zss, MO_64) 7690 7691 DO_ST1_ZPZ_D(bd, zd, MO_8) 7692 DO_ST1_ZPZ_D(hd_le, zd, MO_16) 7693 DO_ST1_ZPZ_D(hd_be, zd, MO_16) 7694 DO_ST1_ZPZ_D(sd_le, zd, MO_32) 7695 DO_ST1_ZPZ_D(sd_be, zd, MO_32) 7696 DO_ST1_ZPZ_D(dd_le, zd, MO_64) 7697 DO_ST1_ZPZ_D(dd_be, zd, MO_64) 7698 7699 DO_ST1_ZPZ_Q(qq_le, zd, MO_128) 7700 DO_ST1_ZPZ_Q(qq_be, zd, MO_128) 7701 7702 #undef DO_ST1_ZPZ_S 7703 #undef DO_ST1_ZPZ_D 7704 7705 /* 7706 * SVE2.1 consecutive register load/store 7707 */ 7708 7709 static unsigned sve2p1_cont_ldst_elements(SVEContLdSt *info, vaddr addr, 7710 uint32_t png, intptr_t reg_max, 7711 int N, int v_esz) 7712 { 7713 const int esize = 1 << v_esz; 7714 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; 7715 DecodeCounter p = decode_counter(png, reg_max, v_esz); 7716 unsigned b_count = p.count << v_esz; 7717 unsigned b_stride = 1 << (v_esz + p.lg2_stride); 7718 intptr_t page_split; 7719 7720 /* Set all of the element indices to -1, and the TLB data to 0. */ 7721 memset(info, -1, offsetof(SVEContLdSt, page)); 7722 memset(info->page, 0, sizeof(info->page)); 7723 7724 if (p.invert) { 7725 if (b_count >= reg_max * N) { 7726 return 0; 7727 } 7728 reg_off_first = b_count; 7729 reg_off_last = reg_max * N - b_stride; 7730 } else { 7731 if (b_count == 0) { 7732 return 0; 7733 } 7734 reg_off_first = 0; 7735 reg_off_last = MIN(b_count - esize, reg_max * N - b_stride); 7736 } 7737 7738 info->reg_off_first[0] = reg_off_first; 7739 info->mem_off_first[0] = reg_off_first; 7740 7741 page_split = -(addr | TARGET_PAGE_MASK); 7742 if (reg_off_last + esize <= page_split || reg_off_first >= page_split) { 7743 /* The entire operation fits within a single page. */ 7744 info->reg_off_last[0] = reg_off_last; 7745 return b_stride; 7746 } 7747 7748 info->page_split = page_split; 7749 reg_off_split = ROUND_DOWN(page_split, esize); 7750 7751 /* 7752 * This is the last full element on the first page, but it is not 7753 * necessarily active. If there is no full element, i.e. the first 7754 * active element is the one that's split, this value remains -1. 7755 * It is useful as iteration bounds. 7756 */ 7757 if (reg_off_split != 0) { 7758 info->reg_off_last[0] = ROUND_DOWN(reg_off_split - esize, b_stride); 7759 } 7760 7761 /* Determine if an unaligned element spans the pages. */ 7762 if (page_split & (esize - 1)) { 7763 /* It is helpful to know if the split element is active. */ 7764 if ((reg_off_split & (b_stride - 1)) == 0) { 7765 info->reg_off_split = reg_off_split; 7766 info->mem_off_split = reg_off_split; 7767 } 7768 reg_off_split += esize; 7769 } 7770 7771 /* 7772 * We do want the first active element on the second page, because 7773 * this may affect the address reported in an exception. 
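     * That is, if the second page turns out to be invalid, the fault
     * must be reported against the first element that actually touches
     * it, so round up to the beat stride rather than to the element
     * size here.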
7774 */ 7775 reg_off_split = ROUND_UP(reg_off_split, b_stride); 7776 if (reg_off_split <= reg_off_last) { 7777 info->reg_off_first[1] = reg_off_split; 7778 info->mem_off_first[1] = reg_off_split; 7779 info->reg_off_last[1] = reg_off_last; 7780 } 7781 return b_stride; 7782 } 7783 7784 static void sve2p1_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, 7785 target_ulong addr, unsigned estride, 7786 int esize, int wp_access, uintptr_t ra) 7787 { 7788 #ifndef CONFIG_USER_ONLY 7789 intptr_t count_off, count_last; 7790 int flags0 = info->page[0].flags; 7791 int flags1 = info->page[1].flags; 7792 7793 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { 7794 return; 7795 } 7796 7797 /* Indicate that watchpoints are handled. */ 7798 info->page[0].flags = flags0 & ~TLB_WATCHPOINT; 7799 info->page[1].flags = flags1 & ~TLB_WATCHPOINT; 7800 7801 if (flags0 & TLB_WATCHPOINT) { 7802 count_off = info->reg_off_first[0]; 7803 count_last = info->reg_off_split; 7804 if (count_last < 0) { 7805 count_last = info->reg_off_last[0]; 7806 } 7807 do { 7808 cpu_check_watchpoint(env_cpu(env), addr + count_off, 7809 esize, info->page[0].attrs, wp_access, ra); 7810 count_off += estride; 7811 } while (count_off <= count_last); 7812 } 7813 7814 count_off = info->reg_off_first[1]; 7815 if ((flags1 & TLB_WATCHPOINT) && count_off >= 0) { 7816 count_last = info->reg_off_last[1]; 7817 do { 7818 cpu_check_watchpoint(env_cpu(env), addr + count_off, 7819 esize, info->page[1].attrs, 7820 wp_access, ra); 7821 count_off += estride; 7822 } while (count_off <= count_last); 7823 } 7824 #endif 7825 } 7826 7827 static void sve2p1_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env, 7828 target_ulong addr, unsigned estride, 7829 int esize, uint32_t mtedesc, 7830 uintptr_t ra) 7831 { 7832 intptr_t count_off, count_last; 7833 7834 /* 7835 * TODO: estride is always a small power of two, <= 8. 7836 * Manipulate the stride within the loops such that 7837 * - first iteration hits addr + off, as required, 7838 * - second iteration hits ALIGN_UP(addr, 16), 7839 * - other iterations advance addr by 16. 7840 * This will minimize the probing to once per MTE granule. 7841 */ 7842 7843 /* Process the page only if MemAttr == Tagged. */ 7844 if (info->page[0].tagged) { 7845 count_off = info->reg_off_first[0]; 7846 count_last = info->reg_off_split; 7847 if (count_last < 0) { 7848 count_last = info->reg_off_last[0]; 7849 } 7850 7851 do { 7852 mte_check(env, mtedesc, addr + count_off, ra); 7853 count_off += estride; 7854 } while (count_off <= count_last); 7855 } 7856 7857 count_off = info->reg_off_first[1]; 7858 if (count_off >= 0 && info->page[1].tagged) { 7859 count_last = info->reg_off_last[1]; 7860 do { 7861 mte_check(env, mtedesc, addr + count_off, ra); 7862 count_off += estride; 7863 } while (count_off <= count_last); 7864 } 7865 } 7866 7867 static inline QEMU_ALWAYS_INLINE 7868 void sve2p1_ld1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr, 7869 uint32_t png, uint64_t desc64, 7870 const uintptr_t ra, const MemOp esz, 7871 sve_ldst1_host_fn *host_fn, 7872 sve_ldst1_tlb_fn *tlb_fn) 7873 { 7874 uint32_t mtedesc = desc64 >> 32; 7875 uint32_t desc = desc64; 7876 const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 
4 : 2; 7877 const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4); 7878 const intptr_t reg_max = simd_oprsz(desc); 7879 const unsigned esize = 1 << esz; 7880 intptr_t count_off, count_last; 7881 intptr_t reg_off, reg_last, reg_n; 7882 SVEContLdSt info; 7883 unsigned estride, flags; 7884 void *host; 7885 7886 estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz); 7887 if (estride == 0) { 7888 /* The entire predicate was false; no load occurs. */ 7889 for (unsigned n = 0; n < N; n++) { 7890 memset(zd + n * rstride, 0, reg_max); 7891 } 7892 return; 7893 } 7894 7895 /* Probe the page(s). Exit with exception for any invalid page. */ 7896 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra); 7897 7898 /* Handle watchpoints for all active elements. */ 7899 sve2p1_cont_ldst_watchpoints(&info, env, addr, estride, 7900 esize, BP_MEM_READ, ra); 7901 7902 /* 7903 * Handle mte checks for all active elements. 7904 * Since TBI must be set for MTE, !mtedesc => !mte_active. 7905 */ 7906 if (mtedesc) { 7907 sve2p1_cont_ldst_mte_check(&info, env, estride, addr, 7908 esize, mtedesc, ra); 7909 } 7910 7911 flags = info.page[0].flags | info.page[1].flags; 7912 if (unlikely(flags != 0)) { 7913 /* 7914 * At least one page includes MMIO. 7915 * Any bus operation can fail with cpu_transaction_failed, 7916 * which for ARM will raise SyncExternal. Perform the load 7917 * into scratch memory to preserve register state until the end. 7918 */ 7919 ARMVectorReg scratch[4] = { }; 7920 7921 count_off = info.reg_off_first[0]; 7922 count_last = info.reg_off_last[1]; 7923 if (count_last < 0) { 7924 count_last = info.reg_off_split; 7925 if (count_last < 0) { 7926 count_last = info.reg_off_last[0]; 7927 } 7928 } 7929 reg_off = count_off % reg_max; 7930 reg_n = count_off / reg_max; 7931 7932 do { 7933 reg_last = MIN(count_last - count_off, reg_max - esize); 7934 do { 7935 tlb_fn(env, &scratch[reg_n], reg_off, addr + count_off, ra); 7936 reg_off += estride; 7937 count_off += estride; 7938 } while (reg_off <= reg_last); 7939 reg_off = 0; 7940 reg_n++; 7941 } while (count_off <= count_last); 7942 7943 for (unsigned n = 0; n < N; ++n) { 7944 memcpy(&zd[n * rstride], &scratch[n], reg_max); 7945 } 7946 return; 7947 } 7948 7949 /* The entire operation is in RAM, on valid pages. */ 7950 7951 for (unsigned n = 0; n < N; ++n) { 7952 memset(&zd[n * rstride], 0, reg_max); 7953 } 7954 7955 count_off = info.reg_off_first[0]; 7956 count_last = info.reg_off_last[0]; 7957 reg_off = count_off % reg_max; 7958 reg_n = count_off / reg_max; 7959 host = info.page[0].host; 7960 7961 set_helper_retaddr(ra); 7962 7963 do { 7964 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); 7965 do { 7966 host_fn(&zd[reg_n * rstride], reg_off, host + count_off); 7967 reg_off += estride; 7968 count_off += estride; 7969 } while (reg_off <= reg_last); 7970 reg_off = 0; 7971 reg_n++; 7972 } while (count_off <= count_last); 7973 7974 clear_helper_retaddr(); 7975 7976 /* 7977 * Use the slow path to manage the cross-page misalignment. 7978 * But we know this is RAM and cannot trap. 
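     * Both pages were probed above with FAULT_ALL and the MMIO case
     * has already returned, so the tlb_fn call below cannot fault; it
     * is used only because the element's bytes straddle the two host
     * mappings.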
7979 */ 7980 count_off = info.reg_off_split; 7981 if (unlikely(count_off >= 0)) { 7982 reg_off = count_off % reg_max; 7983 reg_n = count_off / reg_max; 7984 tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra); 7985 } 7986 7987 count_off = info.reg_off_first[1]; 7988 if (unlikely(count_off >= 0)) { 7989 count_last = info.reg_off_last[1]; 7990 reg_off = count_off % reg_max; 7991 reg_n = count_off / reg_max; 7992 host = info.page[1].host; 7993 7994 set_helper_retaddr(ra); 7995 7996 do { 7997 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); 7998 do { 7999 host_fn(&zd[reg_n * rstride], reg_off, host + count_off); 8000 reg_off += estride; 8001 count_off += estride; 8002 } while (reg_off <= reg_last); 8003 reg_off = 0; 8004 reg_n++; 8005 } while (count_off <= count_last); 8006 8007 clear_helper_retaddr(); 8008 } 8009 } 8010 8011 void HELPER(sve2p1_ld1bb_c)(CPUARMState *env, void *vd, target_ulong addr, 8012 uint32_t png, uint64_t desc) 8013 { 8014 sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), MO_8, 8015 sve_ld1bb_host, sve_ld1bb_tlb); 8016 } 8017 8018 #define DO_LD1_2(NAME, ESZ) \ 8019 void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd, \ 8020 target_ulong addr, uint32_t png, \ 8021 uint64_t desc) \ 8022 { \ 8023 sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ 8024 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 8025 } \ 8026 void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd, \ 8027 target_ulong addr, uint32_t png, \ 8028 uint64_t desc) \ 8029 { \ 8030 sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ 8031 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 8032 } 8033 8034 DO_LD1_2(ld1hh, MO_16) 8035 DO_LD1_2(ld1ss, MO_32) 8036 DO_LD1_2(ld1dd, MO_64) 8037 8038 #undef DO_LD1_2 8039 8040 static inline QEMU_ALWAYS_INLINE 8041 void sve2p1_st1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr, 8042 uint32_t png, uint64_t desc64, 8043 const uintptr_t ra, const int esz, 8044 sve_ldst1_host_fn *host_fn, 8045 sve_ldst1_tlb_fn *tlb_fn) 8046 { 8047 uint32_t mtedesc = desc64 >> 32; 8048 uint32_t desc = desc64; 8049 const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 4 : 2; 8050 const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4); 8051 const intptr_t reg_max = simd_oprsz(desc); 8052 const unsigned esize = 1 << esz; 8053 intptr_t count_off, count_last; 8054 intptr_t reg_off, reg_last, reg_n; 8055 SVEContLdSt info; 8056 unsigned estride, flags; 8057 void *host; 8058 8059 estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz); 8060 if (estride == 0) { 8061 /* The entire predicate was false; no store occurs. */ 8062 return; 8063 } 8064 8065 /* Probe the page(s). Exit with exception for any invalid page. */ 8066 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra); 8067 8068 /* Handle watchpoints for all active elements. */ 8069 sve2p1_cont_ldst_watchpoints(&info, env, addr, estride, 8070 esize, BP_MEM_WRITE, ra); 8071 8072 /* 8073 * Handle mte checks for all active elements. 8074 * Since TBI must be set for MTE, !mtedesc => !mte_active. 8075 */ 8076 if (mtedesc) { 8077 sve2p1_cont_ldst_mte_check(&info, env, estride, addr, 8078 esize, mtedesc, ra); 8079 } 8080 8081 flags = info.page[0].flags | info.page[1].flags; 8082 if (unlikely(flags != 0)) { 8083 /* 8084 * At least one page includes MMIO. 8085 * Any bus operation can fail with cpu_transaction_failed, 8086 * which for ARM will raise SyncExternal. Perform the load 8087 * into scratch memory to preserve register state until the end. 
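         * (For this store direction there is in fact nothing to stage:
         * the data is already in the Z registers, so each element is
         * simply written out with tlb_fn and a failing bus operation
         * leaves the store partially complete.)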
8088 */ 8089 count_off = info.reg_off_first[0]; 8090 count_last = info.reg_off_last[1]; 8091 if (count_last < 0) { 8092 count_last = info.reg_off_split; 8093 if (count_last < 0) { 8094 count_last = info.reg_off_last[0]; 8095 } 8096 } 8097 reg_off = count_off % reg_max; 8098 reg_n = count_off / reg_max; 8099 8100 do { 8101 reg_last = MIN(count_last - count_off, reg_max - esize); 8102 do { 8103 tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra); 8104 reg_off += estride; 8105 count_off += estride; 8106 } while (reg_off <= reg_last); 8107 reg_off = 0; 8108 reg_n++; 8109 } while (count_off <= count_last); 8110 return; 8111 } 8112 8113 /* The entire operation is in RAM, on valid pages. */ 8114 8115 count_off = info.reg_off_first[0]; 8116 count_last = info.reg_off_last[0]; 8117 reg_off = count_off % reg_max; 8118 reg_n = count_off / reg_max; 8119 host = info.page[0].host; 8120 8121 set_helper_retaddr(ra); 8122 8123 do { 8124 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); 8125 do { 8126 host_fn(&zd[reg_n * rstride], reg_off, host + count_off); 8127 reg_off += estride; 8128 count_off += estride; 8129 } while (reg_off <= reg_last); 8130 reg_off = 0; 8131 reg_n++; 8132 } while (count_off <= count_last); 8133 8134 clear_helper_retaddr(); 8135 8136 /* 8137 * Use the slow path to manage the cross-page misalignment. 8138 * But we know this is RAM and cannot trap. 8139 */ 8140 count_off = info.reg_off_split; 8141 if (unlikely(count_off >= 0)) { 8142 reg_off = count_off % reg_max; 8143 reg_n = count_off / reg_max; 8144 tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra); 8145 } 8146 8147 count_off = info.reg_off_first[1]; 8148 if (unlikely(count_off >= 0)) { 8149 count_last = info.reg_off_last[1]; 8150 reg_off = count_off % reg_max; 8151 reg_n = count_off / reg_max; 8152 host = info.page[1].host; 8153 8154 set_helper_retaddr(ra); 8155 8156 do { 8157 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); 8158 do { 8159 host_fn(&zd[reg_n * rstride], reg_off, host + count_off); 8160 reg_off += estride; 8161 count_off += estride; 8162 } while (reg_off <= reg_last); 8163 reg_off = 0; 8164 reg_n++; 8165 } while (count_off <= count_last); 8166 8167 clear_helper_retaddr(); 8168 } 8169 } 8170 8171 void HELPER(sve2p1_st1bb_c)(CPUARMState *env, void *vd, target_ulong addr, 8172 uint32_t png, uint64_t desc) 8173 { 8174 sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), MO_8, 8175 sve_st1bb_host, sve_st1bb_tlb); 8176 } 8177 8178 #define DO_ST1_2(NAME, ESZ) \ 8179 void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd, \ 8180 target_ulong addr, uint32_t png, \ 8181 uint64_t desc) \ 8182 { \ 8183 sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ 8184 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 8185 } \ 8186 void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd, \ 8187 target_ulong addr, uint32_t png, \ 8188 uint64_t desc) \ 8189 { \ 8190 sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ 8191 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 8192 } 8193 8194 DO_ST1_2(st1hh, MO_16) 8195 DO_ST1_2(st1ss, MO_32) 8196 DO_ST1_2(st1dd, MO_64) 8197 8198 #undef DO_ST1_2 8199 8200 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 8201 { 8202 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 8203 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 8204 8205 for (i = 0; i < opr_sz; ++i) { 8206 d[i] = n[i] ^ m[i] ^ k[i]; 8207 } 8208 } 8209 8210 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 8211 { 8212 intptr_t i, opr_sz = 
simd_oprsz(desc) / 8; 8213 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 8214 8215 for (i = 0; i < opr_sz; ++i) { 8216 d[i] = n[i] ^ (m[i] & ~k[i]); 8217 } 8218 } 8219 8220 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 8221 { 8222 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 8223 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 8224 8225 for (i = 0; i < opr_sz; ++i) { 8226 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]); 8227 } 8228 } 8229 8230 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 8231 { 8232 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 8233 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 8234 8235 for (i = 0; i < opr_sz; ++i) { 8236 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]); 8237 } 8238 } 8239 8240 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 8241 { 8242 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 8243 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 8244 8245 for (i = 0; i < opr_sz; ++i) { 8246 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i])); 8247 } 8248 } 8249 8250 /* 8251 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n. 8252 * See hasless(v,1) from 8253 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord 8254 */ 8255 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz) 8256 { 8257 int bits = 8 << esz; 8258 uint64_t ones = dup_const(esz, 1); 8259 uint64_t signs = ones << (bits - 1); 8260 uint64_t cmp0, cmp1; 8261 8262 cmp1 = dup_const(esz, n); 8263 cmp0 = cmp1 ^ m0; 8264 cmp1 = cmp1 ^ m1; 8265 cmp0 = (cmp0 - ones) & ~cmp0; 8266 cmp1 = (cmp1 - ones) & ~cmp1; 8267 return (cmp0 | cmp1) & signs; 8268 } 8269 8270 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg, 8271 uint32_t desc, int esz, bool nmatch) 8272 { 8273 uint16_t esz_mask = pred_esz_masks[esz]; 8274 intptr_t opr_sz = simd_oprsz(desc); 8275 uint32_t flags = PREDTEST_INIT; 8276 intptr_t i, j, k; 8277 8278 for (i = 0; i < opr_sz; i += 16) { 8279 uint64_t m0 = *(uint64_t *)(vm + i); 8280 uint64_t m1 = *(uint64_t *)(vm + i + 8); 8281 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask; 8282 uint16_t out = 0; 8283 8284 for (j = 0; j < 16; j += 8) { 8285 uint64_t n = *(uint64_t *)(vn + i + j); 8286 8287 for (k = 0; k < 8; k += 1 << esz) { 8288 if (pg & (1 << (j + k))) { 8289 bool o = do_match2(n >> (k * 8), m0, m1, esz); 8290 out |= (o ^ nmatch) << (j + k); 8291 } 8292 } 8293 } 8294 *(uint16_t *)(vd + H1_2(i >> 3)) = out; 8295 flags = iter_predtest_fwd(out, pg, flags); 8296 } 8297 return flags; 8298 } 8299 8300 #define DO_PPZZ_MATCH(NAME, ESZ, INV) \ 8301 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 8302 { \ 8303 return do_match(vd, vn, vm, vg, desc, ESZ, INV); \ 8304 } 8305 8306 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false) 8307 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false) 8308 8309 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true) 8310 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true) 8311 8312 #undef DO_PPZZ_MATCH 8313 8314 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg, 8315 uint32_t desc) 8316 { 8317 ARMVectorReg scratch; 8318 intptr_t i, j; 8319 intptr_t opr_sz = simd_oprsz(desc); 8320 uint32_t *d = vd, *n = vn, *m = vm; 8321 uint8_t *pg = vg; 8322 8323 if (d == n) { 8324 n = memcpy(&scratch, n, opr_sz); 8325 if (d == m) { 8326 m = n; 8327 } 8328 } else if (d == m) { 8329 m = memcpy(&scratch, m, opr_sz); 8330 } 8331 8332 for (i = 0; i < opr_sz; i += 4) { 8333 uint64_t count = 0; 8334 uint8_t pred; 8335 8336 pred = pg[H1(i >> 3)] >> 
(i & 7); 8337 if (pred & 1) { 8338 uint32_t nn = n[H4(i >> 2)]; 8339 8340 for (j = 0; j <= i; j += 4) { 8341 pred = pg[H1(j >> 3)] >> (j & 7); 8342 if ((pred & 1) && nn == m[H4(j >> 2)]) { 8343 ++count; 8344 } 8345 } 8346 } 8347 d[H4(i >> 2)] = count; 8348 } 8349 } 8350 8351 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg, 8352 uint32_t desc) 8353 { 8354 ARMVectorReg scratch; 8355 intptr_t i, j; 8356 intptr_t opr_sz = simd_oprsz(desc); 8357 uint64_t *d = vd, *n = vn, *m = vm; 8358 uint8_t *pg = vg; 8359 8360 if (d == n) { 8361 n = memcpy(&scratch, n, opr_sz); 8362 if (d == m) { 8363 m = n; 8364 } 8365 } else if (d == m) { 8366 m = memcpy(&scratch, m, opr_sz); 8367 } 8368 8369 for (i = 0; i < opr_sz / 8; ++i) { 8370 uint64_t count = 0; 8371 if (pg[H1(i)] & 1) { 8372 uint64_t nn = n[i]; 8373 for (j = 0; j <= i; ++j) { 8374 if ((pg[H1(j)] & 1) && nn == m[j]) { 8375 ++count; 8376 } 8377 } 8378 } 8379 d[i] = count; 8380 } 8381 } 8382 8383 /* 8384 * Returns the number of bytes in m0 and m1 that match n. 8385 * Unlike do_match2 we don't just need true/false, we need an exact count. 8386 * This requires two extra logical operations. 8387 */ 8388 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1) 8389 { 8390 const uint64_t mask = dup_const(MO_8, 0x7f); 8391 uint64_t cmp0, cmp1; 8392 8393 cmp1 = dup_const(MO_8, n); 8394 cmp0 = cmp1 ^ m0; 8395 cmp1 = cmp1 ^ m1; 8396 8397 /* 8398 * 1: clear msb of each byte to avoid carry to next byte (& mask) 8399 * 2: carry in to msb if byte != 0 (+ mask) 8400 * 3: set msb if cmp has msb set (| cmp) 8401 * 4: set ~msb to ignore them (| mask) 8402 * We now have 0xff for byte != 0 or 0x7f for byte == 0. 8403 * 5: invert, resulting in 0x80 if and only if byte == 0. 8404 */ 8405 cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask); 8406 cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask); 8407 8408 /* 8409 * Combine the two compares in a way that the bits do 8410 * not overlap, and so preserves the count of set bits. 8411 * If the host has an efficient instruction for ctpop, 8412 * then ctpop(x) + ctpop(y) has the same number of 8413 * operations as ctpop(x | (y >> 1)). If the host does 8414 * not have an efficient ctpop, then we only want to 8415 * use it once. 
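     *
     * For example, if three bytes of m0 and two bytes of m1 match n,
     * cmp0 has 0x80 set in three byte lanes and cmp1 in two; the shift
     * moves the latter to the 0x40 positions, so the OR has five
     * disjoint bits set and ctpop64 returns 5.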
8416 */ 8417 return ctpop64(cmp0 | (cmp1 >> 1)); 8418 } 8419 8420 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc) 8421 { 8422 intptr_t i, j; 8423 intptr_t opr_sz = simd_oprsz(desc); 8424 8425 for (i = 0; i < opr_sz; i += 16) { 8426 uint64_t n0 = *(uint64_t *)(vn + i); 8427 uint64_t m0 = *(uint64_t *)(vm + i); 8428 uint64_t n1 = *(uint64_t *)(vn + i + 8); 8429 uint64_t m1 = *(uint64_t *)(vm + i + 8); 8430 uint64_t out0 = 0; 8431 uint64_t out1 = 0; 8432 8433 for (j = 0; j < 64; j += 8) { 8434 uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1); 8435 uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1); 8436 out0 |= cnt0 << j; 8437 out1 |= cnt1 << j; 8438 } 8439 8440 *(uint64_t *)(vd + i) = out0; 8441 *(uint64_t *)(vd + i + 8) = out1; 8442 } 8443 } 8444 8445 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc) 8446 { 8447 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 8448 int shr = simd_data(desc); 8449 int shl = 8 - shr; 8450 uint64_t mask = dup_const(MO_8, 0xff >> shr); 8451 uint64_t *d = vd, *n = vn, *m = vm; 8452 8453 for (i = 0; i < opr_sz; ++i) { 8454 uint64_t t = n[i] ^ m[i]; 8455 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); 8456 } 8457 } 8458 8459 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc) 8460 { 8461 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 8462 int shr = simd_data(desc); 8463 int shl = 16 - shr; 8464 uint64_t mask = dup_const(MO_16, 0xffff >> shr); 8465 uint64_t *d = vd, *n = vn, *m = vm; 8466 8467 for (i = 0; i < opr_sz; ++i) { 8468 uint64_t t = n[i] ^ m[i]; 8469 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); 8470 } 8471 } 8472 8473 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc) 8474 { 8475 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 8476 int shr = simd_data(desc); 8477 uint32_t *d = vd, *n = vn, *m = vm; 8478 8479 for (i = 0; i < opr_sz; ++i) { 8480 d[i] = ror32(n[i] ^ m[i], shr); 8481 } 8482 } 8483 8484 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va, 8485 float_status *status, uint32_t desc) 8486 { 8487 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4); 8488 8489 for (s = 0; s < opr_sz; ++s) { 8490 float32 *n = vn + s * sizeof(float32) * 4; 8491 float32 *m = vm + s * sizeof(float32) * 4; 8492 float32 *a = va + s * sizeof(float32) * 4; 8493 float32 *d = vd + s * sizeof(float32) * 4; 8494 float32 n00 = n[H4(0)], n01 = n[H4(1)]; 8495 float32 n10 = n[H4(2)], n11 = n[H4(3)]; 8496 float32 m00 = m[H4(0)], m01 = m[H4(1)]; 8497 float32 m10 = m[H4(2)], m11 = m[H4(3)]; 8498 float32 p0, p1; 8499 8500 /* i = 0, j = 0 */ 8501 p0 = float32_mul(n00, m00, status); 8502 p1 = float32_mul(n01, m01, status); 8503 d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status); 8504 8505 /* i = 0, j = 1 */ 8506 p0 = float32_mul(n00, m10, status); 8507 p1 = float32_mul(n01, m11, status); 8508 d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status); 8509 8510 /* i = 1, j = 0 */ 8511 p0 = float32_mul(n10, m00, status); 8512 p1 = float32_mul(n11, m01, status); 8513 d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status); 8514 8515 /* i = 1, j = 1 */ 8516 p0 = float32_mul(n10, m10, status); 8517 p1 = float32_mul(n11, m11, status); 8518 d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status); 8519 } 8520 } 8521 8522 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va, 8523 float_status *status, uint32_t desc) 8524 { 8525 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4); 8526 8527 for (s = 0; s < opr_sz; ++s) { 8528 float64 *n = vn 
+ s * sizeof(float64) * 4; 8529 float64 *m = vm + s * sizeof(float64) * 4; 8530 float64 *a = va + s * sizeof(float64) * 4; 8531 float64 *d = vd + s * sizeof(float64) * 4; 8532 float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3]; 8533 float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3]; 8534 float64 p0, p1; 8535 8536 /* i = 0, j = 0 */ 8537 p0 = float64_mul(n00, m00, status); 8538 p1 = float64_mul(n01, m01, status); 8539 d[0] = float64_add(a[0], float64_add(p0, p1, status), status); 8540 8541 /* i = 0, j = 1 */ 8542 p0 = float64_mul(n00, m10, status); 8543 p1 = float64_mul(n01, m11, status); 8544 d[1] = float64_add(a[1], float64_add(p0, p1, status), status); 8545 8546 /* i = 1, j = 0 */ 8547 p0 = float64_mul(n10, m00, status); 8548 p1 = float64_mul(n11, m01, status); 8549 d[2] = float64_add(a[2], float64_add(p0, p1, status), status); 8550 8551 /* i = 1, j = 1 */ 8552 p0 = float64_mul(n10, m10, status); 8553 p1 = float64_mul(n11, m11, status); 8554 d[3] = float64_add(a[3], float64_add(p0, p1, status), status); 8555 } 8556 } 8557 8558 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 8559 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 8560 float_status *status, uint32_t desc) \ 8561 { \ 8562 intptr_t i = simd_oprsz(desc); \ 8563 uint64_t *g = vg; \ 8564 do { \ 8565 uint64_t pg = g[(i - 1) >> 6]; \ 8566 do { \ 8567 i -= sizeof(TYPEW); \ 8568 if (likely((pg >> (i & 63)) & 1)) { \ 8569 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 8570 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \ 8571 } \ 8572 } while (i & 63); \ 8573 } while (i != 0); \ 8574 } 8575 8576 DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16) 8577 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16) 8578 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32) 8579 8580 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 8581 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 8582 float_status *status, uint32_t desc) \ 8583 { \ 8584 intptr_t i = simd_oprsz(desc); \ 8585 uint64_t *g = vg; \ 8586 do { \ 8587 uint64_t pg = g[(i - 1) >> 6]; \ 8588 do { \ 8589 i -= sizeof(TYPEW); \ 8590 if (likely((pg >> (i & 63)) & 1)) { \ 8591 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \ 8592 *(TYPEW *)(vd + HW(i)) = OP(nn, status); \ 8593 } \ 8594 } while (i & 63); \ 8595 } while (i != 0); \ 8596 } 8597 8598 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32) 8599 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64) 8600 8601 #undef DO_FCVTLT 8602 #undef DO_FCVTNT 8603 8604 void HELPER(pext)(void *vd, uint32_t png, uint32_t desc) 8605 { 8606 int pl = FIELD_EX32(desc, PREDDESC, OPRSZ); 8607 int vl = pl * 8; 8608 unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ); 8609 int part = FIELD_EX32(desc, PREDDESC, DATA); 8610 DecodeCounter p = decode_counter(png, vl, v_esz); 8611 uint64_t mask = pred_esz_masks[v_esz + p.lg2_stride]; 8612 ARMPredicateReg *d = vd; 8613 8614 /* 8615 * Convert from element count to byte count and adjust 8616 * for the portion of the 4*VL counter to be extracted. 8617 */ 8618 int b_count = (p.count << v_esz) - vl * part; 8619 8620 memset(d, 0, sizeof(*d)); 8621 if (p.invert) { 8622 if (b_count <= 0) { 8623 do_whilel(vd, mask, vl, vl); 8624 } else if (b_count < vl) { 8625 do_whileg(vd, mask, vl - b_count, vl); 8626 } 8627 } else if (b_count > 0) { 8628 do_whilel(vd, mask, MIN(b_count, vl), vl); 8629 } 8630 } 8631
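/*
 * Worked example for HELPER(pext) above, with illustrative values only:
 * take pl = 4 (VL = 256 bits, so vl = 32 bytes), esz = MO_8, part = 1,
 * an element stride of 1, and a counter register that decodes to
 * count = 40 active elements with no inversion.  Then
 * b_count = 40 - 32 = 8, and the result is a predicate whose first
 * 8 byte elements are true -- the slice of the 4*VL counter that
 * belongs to the second vector.
 */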