1 /* 2 * ARM SVE Operations 3 * 4 * Copyright (c) 2018 Linaro, Ltd. 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "cpu.h" 22 #include "internals.h" 23 #include "exec/exec-all.h" 24 #include "exec/helper-proto.h" 25 #include "tcg/tcg-gvec-desc.h" 26 #include "fpu/softfloat.h" 27 #include "tcg/tcg.h" 28 #include "vec_internal.h" 29 #include "sve_ldst_internal.h" 30 #include "hw/core/tcg-cpu-ops.h" 31 32 33 /* Return a value for NZCV as per the ARM PredTest pseudofunction. 34 * 35 * The return value has bit 31 set if N is set, bit 1 set if Z is clear, 36 * and bit 0 set if C is set. Compare the definitions of these variables 37 * within CPUARMState. 38 */ 39 40 /* For no G bits set, NZCV = C. */ 41 #define PREDTEST_INIT 1 42 43 /* This is an iterative function, called for each Pd and Pg word 44 * moving forward. 45 */ 46 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags) 47 { 48 if (likely(g)) { 49 /* Compute N from first D & G. 50 Use bit 2 to signal first G bit seen. */ 51 if (!(flags & 4)) { 52 flags |= ((d & (g & -g)) != 0) << 31; 53 flags |= 4; 54 } 55 56 /* Accumulate Z from each D & G. */ 57 flags |= ((d & g) != 0) << 1; 58 59 /* Compute C from last !(D & G). Replace previous. */ 60 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0); 61 } 62 return flags; 63 } 64 65 /* This is an iterative function, called for each Pd and Pg word 66 * moving backward. 67 */ 68 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags) 69 { 70 if (likely(g)) { 71 /* Compute C from first (i.e last) !(D & G). 72 Use bit 2 to signal first G bit seen. */ 73 if (!(flags & 4)) { 74 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */ 75 flags |= (d & pow2floor(g)) == 0; 76 } 77 78 /* Accumulate Z from each D & G. */ 79 flags |= ((d & g) != 0) << 1; 80 81 /* Compute N from last (i.e first) D & G. Replace previous. */ 82 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0); 83 } 84 return flags; 85 } 86 87 /* The same for a single word predicate. */ 88 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g) 89 { 90 return iter_predtest_fwd(d, g, PREDTEST_INIT); 91 } 92 93 /* The same for a multi-word predicate. */ 94 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words) 95 { 96 uint32_t flags = PREDTEST_INIT; 97 uint64_t *d = vd, *g = vg; 98 uintptr_t i = 0; 99 100 do { 101 flags = iter_predtest_fwd(d[i], g[i], flags); 102 } while (++i < words); 103 104 return flags; 105 } 106 107 /* Similarly for single word elements. 
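 * (That is expand_pred_s, just below.)
 */

/*
 * Illustrative sketch -- not part of this file or of QEMU.  It restates
 * the PredTest computation above in a direct, bit-at-a-time form, only to
 * make explicit what the iterative helpers accumulate 64 bits at a time:
 * N comes from the first governed bit of D, Z is set when no governed bit
 * of D is set, and C is the inverse of the last governed bit of D, packed
 * as described in the comment above (N in bit 31, !Z in bit 1, C in bit 0).
 * The function name is invented here; it assumes only the <stdint.h> and
 * <stdbool.h> types already provided via qemu/osdep.h.
 */
static uint32_t predtest_reference(const uint64_t *d, const uint64_t *g,
                                   uintptr_t words)
{
    bool first = true, none = true, last_clear = true;
    uint32_t flags = 0;

    for (uintptr_t w = 0; w < words; ++w) {
        for (int b = 0; b < 64; ++b) {
            if ((g[w] >> b) & 1) {
                bool db = (d[w] >> b) & 1;
                if (first) {
                    flags |= db ? (1u << 31) : 0;  /* N: first governed bit */
                    first = false;
                }
                none &= !db;                  /* Z: no governed bit of D set */
                last_clear = !db;             /* C: last governed bit clear */
            }
        }
    }
    flags |= (!none) << 1;   /* bit 1 set if Z is clear */
    flags |= last_clear;     /* bit 0 is C; all-inactive gives PREDTEST_INIT */
    return flags;
}

/* Expanding predicate bits to a mask of 32-bit word elements: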
*/ 108 static inline uint64_t expand_pred_s(uint8_t byte) 109 { 110 static const uint64_t word[] = { 111 [0x01] = 0x00000000ffffffffull, 112 [0x10] = 0xffffffff00000000ull, 113 [0x11] = 0xffffffffffffffffull, 114 }; 115 return word[byte & 0x11]; 116 } 117 118 #define LOGICAL_PPPP(NAME, FUNC) \ 119 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 120 { \ 121 uintptr_t opr_sz = simd_oprsz(desc); \ 122 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \ 123 uintptr_t i; \ 124 for (i = 0; i < opr_sz / 8; ++i) { \ 125 d[i] = FUNC(n[i], m[i], g[i]); \ 126 } \ 127 } 128 129 #define DO_AND(N, M, G) (((N) & (M)) & (G)) 130 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G)) 131 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G)) 132 #define DO_ORR(N, M, G) (((N) | (M)) & (G)) 133 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G)) 134 #define DO_NOR(N, M, G) (~((N) | (M)) & (G)) 135 #define DO_NAND(N, M, G) (~((N) & (M)) & (G)) 136 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G))) 137 138 LOGICAL_PPPP(sve_and_pppp, DO_AND) 139 LOGICAL_PPPP(sve_bic_pppp, DO_BIC) 140 LOGICAL_PPPP(sve_eor_pppp, DO_EOR) 141 LOGICAL_PPPP(sve_sel_pppp, DO_SEL) 142 LOGICAL_PPPP(sve_orr_pppp, DO_ORR) 143 LOGICAL_PPPP(sve_orn_pppp, DO_ORN) 144 LOGICAL_PPPP(sve_nor_pppp, DO_NOR) 145 LOGICAL_PPPP(sve_nand_pppp, DO_NAND) 146 147 #undef DO_AND 148 #undef DO_BIC 149 #undef DO_EOR 150 #undef DO_ORR 151 #undef DO_ORN 152 #undef DO_NOR 153 #undef DO_NAND 154 #undef DO_SEL 155 #undef LOGICAL_PPPP 156 157 /* Fully general three-operand expander, controlled by a predicate. 158 * This is complicated by the host-endian storage of the register file. 159 */ 160 /* ??? I don't expect the compiler could ever vectorize this itself. 161 * With some tables we can convert bit masks to byte masks, and with 162 * extra care wrt byte/word ordering we could use gcc generic vectors 163 * and do 16 bytes at a time. 164 */ 165 #define DO_ZPZZ(NAME, TYPE, H, OP) \ 166 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 167 { \ 168 intptr_t i, opr_sz = simd_oprsz(desc); \ 169 for (i = 0; i < opr_sz; ) { \ 170 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 171 do { \ 172 if (pg & 1) { \ 173 TYPE nn = *(TYPE *)(vn + H(i)); \ 174 TYPE mm = *(TYPE *)(vm + H(i)); \ 175 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 176 } \ 177 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 178 } while (i & 15); \ 179 } \ 180 } 181 182 /* Similarly, specialized for 64-bit operands. */ 183 #define DO_ZPZZ_D(NAME, TYPE, OP) \ 184 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 185 { \ 186 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 187 TYPE *d = vd, *n = vn, *m = vm; \ 188 uint8_t *pg = vg; \ 189 for (i = 0; i < opr_sz; i += 1) { \ 190 if (pg[H1(i)] & 1) { \ 191 TYPE nn = n[i], mm = m[i]; \ 192 d[i] = OP(nn, mm); \ 193 } \ 194 } \ 195 } 196 197 #define DO_AND(N, M) (N & M) 198 #define DO_EOR(N, M) (N ^ M) 199 #define DO_ORR(N, M) (N | M) 200 #define DO_BIC(N, M) (N & ~M) 201 #define DO_ADD(N, M) (N + M) 202 #define DO_SUB(N, M) (N - M) 203 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) 204 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) 205 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N)) 206 #define DO_MUL(N, M) (N * M) 207 208 209 /* 210 * We must avoid the C undefined behaviour cases: division by 211 * zero and signed division of INT_MIN by -1. Both of these 212 * have architecturally defined required results for Arm. 
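 */

/*
 * Illustrative sketch -- not part of this file or of QEMU.  It spells out
 * the architecturally required results that the DO_SDIV macro below must
 * produce for the two cases that are undefined behaviour in C: division
 * by zero yields 0, and the INT_MIN / -1 overflow case wraps back to
 * INT_MIN (which is what the macro's "-N" produces in two's complement).
 * The function name is invented here and is written for 32-bit elements
 * only; INT32_MIN comes from <stdint.h>.
 */
static int32_t sdiv32_reference(int32_t n, int32_t m)
{
    if (m == 0) {
        return 0;                 /* SDIV by zero: result is zero */
    }
    if (n == INT32_MIN && m == -1) {
        return INT32_MIN;         /* overflow: result wraps to INT32_MIN */
    }
    return n / m;                 /* every other case is plain C division */
}

/*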
213 * We special case all signed divisions by -1 to avoid having 214 * to deduce the minimum integer for the type involved. 215 */ 216 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M) 217 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M) 218 219 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND) 220 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND) 221 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND) 222 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND) 223 224 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR) 225 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR) 226 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR) 227 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR) 228 229 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR) 230 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR) 231 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR) 232 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR) 233 234 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC) 235 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC) 236 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC) 237 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC) 238 239 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) 240 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD) 241 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD) 242 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD) 243 244 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB) 245 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB) 246 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB) 247 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB) 248 249 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX) 250 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX) 251 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX) 252 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX) 253 254 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX) 255 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX) 256 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX) 257 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX) 258 259 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN) 260 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN) 261 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN) 262 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN) 263 264 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN) 265 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN) 266 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN) 267 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN) 268 269 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD) 270 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD) 271 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD) 272 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD) 273 274 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD) 275 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD) 276 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD) 277 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD) 278 279 /* Because the computation type is at least twice as large as required, 280 these work for both signed and unsigned source types. 
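 */

/*
 * Illustrative sketch -- not part of this file or of QEMU.  It shows why
 * a single widening helper serves both SMULH and UMULH at the narrow
 * element sizes: the expander has already sign- or zero-extended the
 * operands according to the element TYPE, the 32-bit (or 64-bit) product
 * of values that fit in half that width is exact, and shifting out the
 * low half then yields the correct high half in either interpretation.
 * The function name is invented here; the 0xff byte pattern is read both
 * ways as a worked example.
 */
static bool mulh_b_example(void)
{
    /* Unsigned view of 0xff: 255 * 255 = 0xfe01, high byte 0xfe. */
    uint8_t u = (uint8_t)(((int32_t)0xff * (int32_t)0xff) >> 8);
    /* Signed view of 0xff: -1 * -1 = 1, high byte 0x00. */
    uint8_t s = (uint8_t)(((int32_t)(int8_t)0xff * (int32_t)(int8_t)0xff) >> 8);
    return u == 0xfe && s == 0x00;
}

/* The widening high-part multiply helpers: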
*/ 281 static inline uint8_t do_mulh_b(int32_t n, int32_t m) 282 { 283 return (n * m) >> 8; 284 } 285 286 static inline uint16_t do_mulh_h(int32_t n, int32_t m) 287 { 288 return (n * m) >> 16; 289 } 290 291 static inline uint32_t do_mulh_s(int64_t n, int64_t m) 292 { 293 return (n * m) >> 32; 294 } 295 296 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m) 297 { 298 uint64_t lo, hi; 299 muls64(&lo, &hi, n, m); 300 return hi; 301 } 302 303 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m) 304 { 305 uint64_t lo, hi; 306 mulu64(&lo, &hi, n, m); 307 return hi; 308 } 309 310 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL) 311 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL) 312 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL) 313 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL) 314 315 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b) 316 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h) 317 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s) 318 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d) 319 320 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b) 321 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h) 322 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s) 323 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d) 324 325 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV) 326 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV) 327 328 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV) 329 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV) 330 331 /* Note that all bits of the shift are significant 332 and not modulo the element size. */ 333 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1)) 334 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0) 335 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0) 336 337 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR) 338 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR) 339 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL) 340 341 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR) 342 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR) 343 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL) 344 345 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR) 346 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR) 347 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL) 348 349 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR) 350 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR) 351 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL) 352 353 static inline uint16_t do_sadalp_h(int16_t n, int16_t m) 354 { 355 int8_t n1 = n, n2 = n >> 8; 356 return m + n1 + n2; 357 } 358 359 static inline uint32_t do_sadalp_s(int32_t n, int32_t m) 360 { 361 int16_t n1 = n, n2 = n >> 16; 362 return m + n1 + n2; 363 } 364 365 static inline uint64_t do_sadalp_d(int64_t n, int64_t m) 366 { 367 int32_t n1 = n, n2 = n >> 32; 368 return m + n1 + n2; 369 } 370 371 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h) 372 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s) 373 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d) 374 375 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m) 376 { 377 uint8_t n1 = n, n2 = n >> 8; 378 return m + n1 + n2; 379 } 380 381 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m) 382 { 383 uint16_t n1 = n, n2 = n >> 16; 384 return m + n1 + n2; 385 } 386 387 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m) 388 { 389 uint32_t n1 = n, n2 = n >> 32; 390 return m + n1 + n2; 391 } 392 393 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h) 394 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s) 395 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d) 396 397 #define do_srshl_b(n, m) 
do_sqrshl_bhs(n, m, 8, true, NULL) 398 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL) 399 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL) 400 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL) 401 402 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b) 403 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h) 404 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s) 405 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d) 406 407 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL) 408 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL) 409 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL) 410 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL) 411 412 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b) 413 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h) 414 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s) 415 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d) 416 417 /* 418 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set. 419 * We pass in a pointer to a dummy saturation field to trigger 420 * the saturating arithmetic but discard the information about 421 * whether it has occurred. 422 */ 423 #define do_sqshl_b(n, m) \ 424 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); }) 425 #define do_sqshl_h(n, m) \ 426 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); }) 427 #define do_sqshl_s(n, m) \ 428 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); }) 429 #define do_sqshl_d(n, m) \ 430 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); }) 431 432 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b) 433 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h) 434 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s) 435 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d) 436 437 #define do_uqshl_b(n, m) \ 438 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 439 #define do_uqshl_h(n, m) \ 440 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 441 #define do_uqshl_s(n, m) \ 442 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); }) 443 #define do_uqshl_d(n, m) \ 444 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); }) 445 446 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b) 447 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h) 448 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s) 449 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d) 450 451 #define do_sqrshl_b(n, m) \ 452 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); }) 453 #define do_sqrshl_h(n, m) \ 454 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); }) 455 #define do_sqrshl_s(n, m) \ 456 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); }) 457 #define do_sqrshl_d(n, m) \ 458 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); }) 459 460 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b) 461 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h) 462 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s) 463 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d) 464 465 #undef do_sqrshl_d 466 467 #define do_uqrshl_b(n, m) \ 468 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); }) 469 #define do_uqrshl_h(n, m) \ 470 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); }) 471 #define do_uqrshl_s(n, m) \ 472 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); }) 473 #define do_uqrshl_d(n, m) \ 474 ({ uint32_t discard; do_uqrshl_d(n, 
m, true, &discard); }) 475 476 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b) 477 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h) 478 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s) 479 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d) 480 481 #undef do_uqrshl_d 482 483 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1) 484 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1)) 485 486 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS) 487 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS) 488 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS) 489 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D) 490 491 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS) 492 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS) 493 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS) 494 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D) 495 496 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1) 497 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1)) 498 499 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS) 500 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS) 501 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS) 502 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D) 503 504 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS) 505 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS) 506 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS) 507 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D) 508 509 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1) 510 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1)) 511 512 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS) 513 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS) 514 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS) 515 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D) 516 517 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS) 518 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS) 519 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS) 520 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D) 521 522 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max) 523 { 524 return val >= max ? max : val <= min ? min : val; 525 } 526 527 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX) 528 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX) 529 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX) 530 531 static inline int64_t do_sqadd_d(int64_t n, int64_t m) 532 { 533 int64_t r = n + m; 534 if (((r ^ n) & ~(n ^ m)) < 0) { 535 /* Signed overflow. */ 536 return r < 0 ? INT64_MAX : INT64_MIN; 537 } 538 return r; 539 } 540 541 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B) 542 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H) 543 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S) 544 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d) 545 546 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX) 547 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX) 548 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX) 549 550 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m) 551 { 552 uint64_t r = n + m; 553 return r < n ? 
UINT64_MAX : r; 554 } 555 556 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B) 557 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H) 558 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S) 559 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d) 560 561 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX) 562 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX) 563 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX) 564 565 static inline int64_t do_sqsub_d(int64_t n, int64_t m) 566 { 567 int64_t r = n - m; 568 if (((r ^ n) & (n ^ m)) < 0) { 569 /* Signed overflow. */ 570 return r < 0 ? INT64_MAX : INT64_MIN; 571 } 572 return r; 573 } 574 575 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B) 576 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H) 577 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S) 578 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d) 579 580 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX) 581 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX) 582 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX) 583 584 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m) 585 { 586 return n > m ? n - m : 0; 587 } 588 589 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B) 590 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H) 591 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S) 592 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d) 593 594 #define DO_SUQADD_B(n, m) \ 595 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX) 596 #define DO_SUQADD_H(n, m) \ 597 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX) 598 #define DO_SUQADD_S(n, m) \ 599 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX) 600 601 static inline int64_t do_suqadd_d(int64_t n, uint64_t m) 602 { 603 uint64_t r = n + m; 604 605 if (n < 0) { 606 /* Note that m - abs(n) cannot underflow. */ 607 if (r > INT64_MAX) { 608 /* Result is either very large positive or negative. */ 609 if (m > -n) { 610 /* m > abs(n), so r is a very large positive. */ 611 return INT64_MAX; 612 } 613 /* Result is negative. */ 614 } 615 } else { 616 /* Both inputs are positive: check for overflow. */ 617 if (r < m || r > INT64_MAX) { 618 return INT64_MAX; 619 } 620 } 621 return r; 622 } 623 624 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B) 625 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H) 626 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S) 627 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d) 628 629 #define DO_USQADD_B(n, m) \ 630 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX) 631 #define DO_USQADD_H(n, m) \ 632 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX) 633 #define DO_USQADD_S(n, m) \ 634 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX) 635 636 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m) 637 { 638 uint64_t r = n + m; 639 640 if (m < 0) { 641 return n < -m ? 0 : r; 642 } 643 return r < n ? UINT64_MAX : r; 644 } 645 646 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B) 647 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H) 648 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S) 649 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d) 650 651 #undef DO_ZPZZ 652 #undef DO_ZPZZ_D 653 654 /* 655 * Three operand expander, operating on element pairs. 656 * If the slot I is even, the elements are from VN {I, I+1}. 657 * If the slot I is odd, the elements are from VM {I-1, I}.
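 */

/*
 * Illustrative sketch -- not part of this file or of QEMU.  It writes out
 * the pairwise layout that DO_ZPZZ_PAIR below implements, for 32-bit
 * elements with every predicate bit active: even destination slots take a
 * pair from N, odd destination slots take the corresponding pair from M,
 * and both inputs of a pair are read before the (possibly overlapping)
 * destination is written.  The function name is invented here.
 */
static void addp_reference_u32(uint32_t *d, const uint32_t *n,
                               const uint32_t *m, size_t elems)
{
    for (size_t i = 0; i < elems; i += 2) {
        uint32_t lo = n[i] + n[i + 1];   /* even slot: pair from N */
        uint32_t hi = m[i] + m[i + 1];   /* odd slot: pair from M */
        d[i] = lo;
        d[i + 1] = hi;
    }
}

/*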
658 * Load all of the input elements in each pair before overwriting output. 659 */ 660 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \ 661 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 662 { \ 663 intptr_t i, opr_sz = simd_oprsz(desc); \ 664 for (i = 0; i < opr_sz; ) { \ 665 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 666 do { \ 667 TYPE n0 = *(TYPE *)(vn + H(i)); \ 668 TYPE m0 = *(TYPE *)(vm + H(i)); \ 669 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 670 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 671 if (pg & 1) { \ 672 *(TYPE *)(vd + H(i)) = OP(n0, n1); \ 673 } \ 674 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 675 if (pg & 1) { \ 676 *(TYPE *)(vd + H(i)) = OP(m0, m1); \ 677 } \ 678 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 679 } while (i & 15); \ 680 } \ 681 } 682 683 /* Similarly, specialized for 64-bit operands. */ 684 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \ 685 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 686 { \ 687 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 688 TYPE *d = vd, *n = vn, *m = vm; \ 689 uint8_t *pg = vg; \ 690 for (i = 0; i < opr_sz; i += 2) { \ 691 TYPE n0 = n[i], n1 = n[i + 1]; \ 692 TYPE m0 = m[i], m1 = m[i + 1]; \ 693 if (pg[H1(i)] & 1) { \ 694 d[i] = OP(n0, n1); \ 695 } \ 696 if (pg[H1(i + 1)] & 1) { \ 697 d[i + 1] = OP(m0, m1); \ 698 } \ 699 } \ 700 } 701 702 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD) 703 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD) 704 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD) 705 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD) 706 707 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX) 708 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX) 709 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX) 710 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX) 711 712 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN) 713 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN) 714 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN) 715 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN) 716 717 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX) 718 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX) 719 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX) 720 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX) 721 722 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN) 723 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN) 724 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN) 725 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN) 726 727 #undef DO_ZPZZ_PAIR 728 #undef DO_ZPZZ_PAIR_D 729 730 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \ 731 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 732 void *status, uint32_t desc) \ 733 { \ 734 intptr_t i, opr_sz = simd_oprsz(desc); \ 735 for (i = 0; i < opr_sz; ) { \ 736 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 737 do { \ 738 TYPE n0 = *(TYPE *)(vn + H(i)); \ 739 TYPE m0 = *(TYPE *)(vm + H(i)); \ 740 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 741 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 742 if (pg & 1) { \ 743 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \ 744 } \ 745 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 746 if (pg & 1) { \ 747 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \ 748 } \ 749 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 750 } while (i & 15); \ 751 } \ 752 } 753 754 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add) 755 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add) 756 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, 
H1_8, float64_add) 757 758 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum) 759 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum) 760 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum) 761 762 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum) 763 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum) 764 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum) 765 766 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max) 767 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max) 768 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max) 769 770 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min) 771 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min) 772 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min) 773 774 #undef DO_ZPZZ_PAIR_FP 775 776 /* Three-operand expander, controlled by a predicate, in which the 777 * third operand is "wide". That is, for D = N op M, the same 64-bit 778 * value of M is used with all of the narrower values of N. 779 */ 780 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \ 781 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 782 { \ 783 intptr_t i, opr_sz = simd_oprsz(desc); \ 784 for (i = 0; i < opr_sz; ) { \ 785 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \ 786 TYPEW mm = *(TYPEW *)(vm + i); \ 787 do { \ 788 if (pg & 1) { \ 789 TYPE nn = *(TYPE *)(vn + H(i)); \ 790 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 791 } \ 792 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 793 } while (i & 7); \ 794 } \ 795 } 796 797 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR) 798 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR) 799 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL) 800 801 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR) 802 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 803 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 804 805 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR) 806 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 807 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 808 809 #undef DO_ZPZW 810 811 /* Fully general two-operand expander, controlled by a predicate. 812 */ 813 #define DO_ZPZ(NAME, TYPE, H, OP) \ 814 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 815 { \ 816 intptr_t i, opr_sz = simd_oprsz(desc); \ 817 for (i = 0; i < opr_sz; ) { \ 818 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 819 do { \ 820 if (pg & 1) { \ 821 TYPE nn = *(TYPE *)(vn + H(i)); \ 822 *(TYPE *)(vd + H(i)) = OP(nn); \ 823 } \ 824 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 825 } while (i & 15); \ 826 } \ 827 } 828 829 /* Similarly, specialized for 64-bit operands. 
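 * (DO_ZPZ_D, just below.)
 */

/*
 * Illustrative sketch -- not part of this file or of QEMU.  It shows the
 * straight-line form of the predicate walk that DO_ZPZ (and the other
 * expanders above) perform 16 predicate bits at a time: an SVE predicate
 * holds one bit per byte of the vector, so the element at byte offset i
 * is governed by predicate byte i/8, bit i%8, and inactive elements of
 * the destination are left unchanged.  The function name is invented
 * here, uses negation as an arbitrary example operation, and ignores the
 * host-endian H() adjustments, i.e. it is written for a little-endian
 * register file layout.
 */
static void zpz_neg32_reference(uint32_t *d, const uint32_t *n,
                                const uint8_t *pg, size_t oprsz_bytes)
{
    for (size_t i = 0; i < oprsz_bytes; i += 4) {
        if ((pg[i / 8] >> (i % 8)) & 1) {  /* lowest of the element's bits */
            d[i / 4] = -n[i / 4];
        }
    }
}

/* The 64-bit specialization: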
*/ 830 #define DO_ZPZ_D(NAME, TYPE, OP) \ 831 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 832 { \ 833 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 834 TYPE *d = vd, *n = vn; \ 835 uint8_t *pg = vg; \ 836 for (i = 0; i < opr_sz; i += 1) { \ 837 if (pg[H1(i)] & 1) { \ 838 TYPE nn = n[i]; \ 839 d[i] = OP(nn); \ 840 } \ 841 } \ 842 } 843 844 #define DO_CLS_B(N) (clrsb32(N) - 24) 845 #define DO_CLS_H(N) (clrsb32(N) - 16) 846 847 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B) 848 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H) 849 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32) 850 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64) 851 852 #define DO_CLZ_B(N) (clz32(N) - 24) 853 #define DO_CLZ_H(N) (clz32(N) - 16) 854 855 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B) 856 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H) 857 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32) 858 DO_ZPZ_D(sve_clz_d, uint64_t, clz64) 859 860 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8) 861 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16) 862 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32) 863 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64) 864 865 #define DO_CNOT(N) (N == 0) 866 867 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT) 868 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT) 869 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT) 870 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT) 871 872 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1)) 873 874 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS) 875 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS) 876 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS) 877 878 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1)) 879 880 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG) 881 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG) 882 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG) 883 884 #define DO_NOT(N) (~N) 885 886 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT) 887 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT) 888 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT) 889 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT) 890 891 #define DO_SXTB(N) ((int8_t)N) 892 #define DO_SXTH(N) ((int16_t)N) 893 #define DO_SXTS(N) ((int32_t)N) 894 #define DO_UXTB(N) ((uint8_t)N) 895 #define DO_UXTH(N) ((uint16_t)N) 896 #define DO_UXTS(N) ((uint32_t)N) 897 898 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB) 899 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB) 900 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH) 901 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB) 902 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH) 903 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS) 904 905 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB) 906 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB) 907 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH) 908 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB) 909 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH) 910 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS) 911 912 #define DO_ABS(N) (N < 0 ? 
-N : N) 913 914 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS) 915 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS) 916 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS) 917 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS) 918 919 #define DO_NEG(N) (-N) 920 921 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG) 922 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG) 923 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG) 924 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG) 925 926 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16) 927 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32) 928 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64) 929 930 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32) 931 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64) 932 933 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64) 934 935 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc) 936 { 937 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 938 uint64_t *d = vd, *n = vn; 939 uint8_t *pg = vg; 940 941 for (i = 0; i < opr_sz; i += 2) { 942 if (pg[H1(i)] & 1) { 943 uint64_t n0 = n[i + 0]; 944 uint64_t n1 = n[i + 1]; 945 d[i + 0] = n1; 946 d[i + 1] = n0; 947 } 948 } 949 } 950 951 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8) 952 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16) 953 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32) 954 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64) 955 956 #define DO_SQABS(X) \ 957 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 958 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; }) 959 960 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS) 961 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS) 962 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS) 963 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS) 964 965 #define DO_SQNEG(X) \ 966 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 967 x_ == min_ ? -min_ - 1 : -x_; }) 968 969 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG) 970 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG) 971 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG) 972 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG) 973 974 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32) 975 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32) 976 977 /* Three-operand expander, unpredicated, in which the third operand is "wide". 978 */ 979 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \ 980 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 981 { \ 982 intptr_t i, opr_sz = simd_oprsz(desc); \ 983 for (i = 0; i < opr_sz; ) { \ 984 TYPEW mm = *(TYPEW *)(vm + i); \ 985 do { \ 986 TYPE nn = *(TYPE *)(vn + H(i)); \ 987 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 988 i += sizeof(TYPE); \ 989 } while (i & 7); \ 990 } \ 991 } 992 993 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR) 994 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR) 995 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL) 996 997 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR) 998 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 999 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 1000 1001 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR) 1002 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 1003 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 1004 1005 #undef DO_ZZW 1006 1007 #undef DO_CLS_B 1008 #undef DO_CLS_H 1009 #undef DO_CLZ_B 1010 #undef DO_CLZ_H 1011 #undef DO_CNOT 1012 #undef DO_FABS 1013 #undef DO_FNEG 1014 #undef DO_ABS 1015 #undef DO_NEG 1016 #undef DO_ZPZ 1017 #undef DO_ZPZ_D 1018 1019 /* 1020 * Three-operand expander, unpredicated, in which the two inputs are 1021 * selected from the top or bottom half of the wide column. 
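 */

/*
 * Illustrative sketch -- not part of this file or of QEMU.  It shows what
 * the two SIMD_DATA "sel" bits consumed by DO_ZZZ_TB below select: each
 * wide destination element overlays two narrow source elements, sel = 0
 * takes the even-numbered (bottom) narrow element and sel = 1 the
 * odd-numbered (top) one, which is how e.g. SADDLB and SADDLT share one
 * helper.  The function name is invented here and is written for the
 * 16-bit to 32-bit widening case on a little-endian layout.
 */
static void saddl_tb_reference(int32_t *d, const int16_t *n, const int16_t *m,
                               size_t wide_elems, int sel1, int sel2)
{
    for (size_t i = 0; i < wide_elems; ++i) {
        int32_t nn = n[2 * i + sel1];   /* bottom (0) or top (1) half of N */
        int32_t mm = m[2 * i + sel2];   /* bottom (0) or top (1) half of M */
        d[i] = nn + mm;
    }
}

/* The top/bottom-selecting expander: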
1022 */ 1023 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1024 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1025 { \ 1026 intptr_t i, opr_sz = simd_oprsz(desc); \ 1027 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1028 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1029 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1030 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1031 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1032 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1033 } \ 1034 } 1035 1036 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1037 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1038 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1039 1040 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1041 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1042 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1043 1044 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1045 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1046 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1047 1048 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1049 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1050 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1051 1052 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1053 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1054 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1055 1056 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 1057 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1058 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1059 1060 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1061 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1062 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1063 1064 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1065 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1066 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1067 1068 /* Note that the multiply cannot overflow, but the doubling can. 
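 */

/*
 * Illustrative sketch -- not part of this file or of QEMU.  For the
 * halfword helper below there is exactly one input pair whose doubling
 * overflows: with int8_t operands the raw product is at most
 * (-128) * (-128) = 16384, which fits in int16_t, but doubling it gives
 * 32768, which does not, so SQDMULLB/T must saturate that single case to
 * INT16_MAX.  The function name is invented here; it performs the same
 * saturation that DO_SQADD_H(val, val) performs above.
 */
static bool sqdmull_h_overflow_example(void)
{
    int16_t prod = (int16_t)((int8_t)-128 * (int8_t)-128);  /* 16384, exact */
    int32_t doubled = 2 * (int32_t)prod;                     /* 32768 */
    int16_t res = doubled > INT16_MAX ? INT16_MAX
                : doubled < INT16_MIN ? INT16_MIN : (int16_t)doubled;
    return res == INT16_MAX;
}

/* The saturating doubling-multiply helpers: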
*/ 1069 static inline int16_t do_sqdmull_h(int16_t n, int16_t m) 1070 { 1071 int16_t val = n * m; 1072 return DO_SQADD_H(val, val); 1073 } 1074 1075 static inline int32_t do_sqdmull_s(int32_t n, int32_t m) 1076 { 1077 int32_t val = n * m; 1078 return DO_SQADD_S(val, val); 1079 } 1080 1081 static inline int64_t do_sqdmull_d(int64_t n, int64_t m) 1082 { 1083 int64_t val = n * m; 1084 return do_sqadd_d(val, val); 1085 } 1086 1087 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h) 1088 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1089 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1090 1091 #undef DO_ZZZ_TB 1092 1093 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1094 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1095 { \ 1096 intptr_t i, opr_sz = simd_oprsz(desc); \ 1097 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1098 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1099 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 1100 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1101 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1102 } \ 1103 } 1104 1105 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1106 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1107 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1108 1109 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1110 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1111 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1112 1113 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1114 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1115 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1116 1117 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1118 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1119 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1120 1121 #undef DO_ZZZ_WTB 1122 1123 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \ 1124 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1125 { \ 1126 intptr_t i, opr_sz = simd_oprsz(desc); \ 1127 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \ 1128 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \ 1129 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1130 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \ 1131 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \ 1132 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \ 1133 } \ 1134 } 1135 1136 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR) 1137 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR) 1138 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR) 1139 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR) 1140 1141 #undef DO_ZZZ_NTB 1142 1143 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1144 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1145 { \ 1146 intptr_t i, opr_sz = simd_oprsz(desc); \ 1147 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \ 1148 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1149 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1150 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \ 1151 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1152 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \ 1153 } \ 1154 } 1155 1156 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1157 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1158 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1159 1160 
DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 1161 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1162 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1163 1164 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1165 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1166 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1167 1168 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1169 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1170 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1171 1172 #define DO_NMUL(N, M) -(N * M) 1173 1174 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL) 1175 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL) 1176 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL) 1177 1178 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL) 1179 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL) 1180 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL) 1181 1182 #undef DO_ZZZW_ACC 1183 1184 #define DO_XTNB(NAME, TYPE, OP) \ 1185 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1186 { \ 1187 intptr_t i, opr_sz = simd_oprsz(desc); \ 1188 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1189 TYPE nn = *(TYPE *)(vn + i); \ 1190 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \ 1191 *(TYPE *)(vd + i) = nn; \ 1192 } \ 1193 } 1194 1195 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \ 1196 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1197 { \ 1198 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \ 1199 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1200 TYPE nn = *(TYPE *)(vn + i); \ 1201 *(TYPEN *)(vd + i + odd) = OP(nn); \ 1202 } \ 1203 } 1204 1205 #define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX) 1206 #define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX) 1207 #define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX) 1208 1209 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H) 1210 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S) 1211 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D) 1212 1213 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H) 1214 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S) 1215 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D) 1216 1217 #define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX) 1218 #define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX) 1219 #define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX) 1220 1221 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H) 1222 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S) 1223 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D) 1224 1225 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H) 1226 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S) 1227 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D) 1228 1229 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H) 1230 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S) 1231 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D) 1232 1233 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H) 1234 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S) 1235 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D) 1236 1237 #undef DO_XTNB 1238 #undef DO_XTNT 1239 1240 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1241 { 1242 intptr_t i, opr_sz = simd_oprsz(desc); 1243 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1)); 1244 uint32_t 
inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1245 uint32_t *a = va, *n = vn; 1246 uint64_t *d = vd, *m = vm; 1247 1248 for (i = 0; i < opr_sz / 8; ++i) { 1249 uint32_t e1 = a[2 * i + H4(0)]; 1250 uint32_t e2 = n[2 * i + sel] ^ inv; 1251 uint64_t c = extract64(m[i], 32, 1); 1252 /* Compute and store the entire 33-bit result at once. */ 1253 d[i] = c + e1 + e2; 1254 } 1255 } 1256 1257 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1258 { 1259 intptr_t i, opr_sz = simd_oprsz(desc); 1260 int sel = extract32(desc, SIMD_DATA_SHIFT, 1); 1261 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1262 uint64_t *d = vd, *a = va, *n = vn, *m = vm; 1263 1264 for (i = 0; i < opr_sz / 8; i += 2) { 1265 Int128 e1 = int128_make64(a[i]); 1266 Int128 e2 = int128_make64(n[i + sel] ^ inv); 1267 Int128 c = int128_make64(m[i + 1] & 1); 1268 Int128 r = int128_add(int128_add(e1, e2), c); 1269 d[i + 0] = int128_getlo(r); 1270 d[i + 1] = int128_gethi(r); 1271 } 1272 } 1273 1274 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \ 1275 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1276 { \ 1277 intptr_t i, opr_sz = simd_oprsz(desc); \ 1278 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1279 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1280 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1281 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1282 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1283 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1284 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \ 1285 } \ 1286 } 1287 1288 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1, 1289 do_sqdmull_h, DO_SQADD_H) 1290 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1291 do_sqdmull_s, DO_SQADD_S) 1292 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1293 do_sqdmull_d, do_sqadd_d) 1294 1295 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1, 1296 do_sqdmull_h, DO_SQSUB_H) 1297 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1298 do_sqdmull_s, DO_SQSUB_S) 1299 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1300 do_sqdmull_d, do_sqsub_d) 1301 1302 #undef DO_SQDMLAL 1303 1304 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \ 1305 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1306 { \ 1307 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1308 int rot = simd_data(desc); \ 1309 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1310 bool sub_r = rot == 1 || rot == 2; \ 1311 bool sub_i = rot >= 2; \ 1312 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1313 for (i = 0; i < opr_sz; i += 2) { \ 1314 TYPE elt1_a = n[H(i + sel_a)]; \ 1315 TYPE elt2_a = m[H(i + sel_a)]; \ 1316 TYPE elt2_b = m[H(i + sel_b)]; \ 1317 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \ 1318 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \ 1319 } \ 1320 } 1321 1322 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? 
-1 : 1)) 1323 1324 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA) 1325 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA) 1326 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA) 1327 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA) 1328 1329 #define DO_SQRDMLAH_B(N, M, A, S) \ 1330 do_sqrdmlah_b(N, M, A, S, true) 1331 #define DO_SQRDMLAH_H(N, M, A, S) \ 1332 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); }) 1333 #define DO_SQRDMLAH_S(N, M, A, S) \ 1334 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); }) 1335 #define DO_SQRDMLAH_D(N, M, A, S) \ 1336 do_sqrdmlah_d(N, M, A, S, true) 1337 1338 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B) 1339 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H) 1340 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S) 1341 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D) 1342 1343 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \ 1344 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1345 { \ 1346 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1347 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \ 1348 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \ 1349 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1350 bool sub_r = rot == 1 || rot == 2; \ 1351 bool sub_i = rot >= 2; \ 1352 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1353 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \ 1354 TYPE elt2_a = m[H(i + idx + sel_a)]; \ 1355 TYPE elt2_b = m[H(i + idx + sel_b)]; \ 1356 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \ 1357 TYPE elt1_a = n[H(i + j + sel_a)]; \ 1358 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \ 1359 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \ 1360 } \ 1361 } \ 1362 } 1363 1364 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA) 1365 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA) 1366 1367 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1368 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1369 1370 #undef DO_CMLA 1371 #undef DO_CMLA_FUNC 1372 #undef DO_CMLA_IDX_FUNC 1373 #undef DO_SQRDMLAH_B 1374 #undef DO_SQRDMLAH_H 1375 #undef DO_SQRDMLAH_S 1376 #undef DO_SQRDMLAH_D 1377 1378 /* Note N and M are 4 elements bundled into one unit. */ 1379 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a, 1380 int sel_a, int sel_b, int sub_i) 1381 { 1382 for (int i = 0; i <= 1; i++) { 1383 int32_t elt1_r = (int8_t)(n >> (16 * i)); 1384 int32_t elt1_i = (int8_t)(n >> (16 * i + 8)); 1385 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a)); 1386 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b)); 1387 1388 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1389 } 1390 return a; 1391 } 1392 1393 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a, 1394 int sel_a, int sel_b, int sub_i) 1395 { 1396 for (int i = 0; i <= 1; i++) { 1397 int64_t elt1_r = (int16_t)(n >> (32 * i + 0)); 1398 int64_t elt1_i = (int16_t)(n >> (32 * i + 16)); 1399 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a)); 1400 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b)); 1401 1402 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1403 } 1404 return a; 1405 } 1406 1407 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm, 1408 void *va, uint32_t desc) 1409 { 1410 int opr_sz = simd_oprsz(desc); 1411 int rot = simd_data(desc); 1412 int sel_a = rot & 1; 1413 int sel_b = sel_a ^ 1; 1414 int sub_i = (rot == 0 || rot == 3 ? 
-1 : 1); 1415 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1416 1417 for (int e = 0; e < opr_sz / 4; e++) { 1418 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1419 } 1420 } 1421 1422 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm, 1423 void *va, uint32_t desc) 1424 { 1425 int opr_sz = simd_oprsz(desc); 1426 int rot = simd_data(desc); 1427 int sel_a = rot & 1; 1428 int sel_b = sel_a ^ 1; 1429 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1430 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1431 1432 for (int e = 0; e < opr_sz / 8; e++) { 1433 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1434 } 1435 } 1436 1437 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm, 1438 void *va, uint32_t desc) 1439 { 1440 int opr_sz = simd_oprsz(desc); 1441 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1442 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2)); 1443 int sel_a = rot & 1; 1444 int sel_b = sel_a ^ 1; 1445 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1446 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1447 1448 for (int seg = 0; seg < opr_sz / 4; seg += 4) { 1449 uint32_t seg_m = m[seg + idx]; 1450 for (int e = 0; e < 4; e++) { 1451 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e], 1452 sel_a, sel_b, sub_i); 1453 } 1454 } 1455 } 1456 1457 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm, 1458 void *va, uint32_t desc) 1459 { 1460 int seg, opr_sz = simd_oprsz(desc); 1461 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1462 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 1463 int sel_a = rot & 1; 1464 int sel_b = sel_a ^ 1; 1465 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1466 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1467 1468 for (seg = 0; seg < opr_sz / 8; seg += 2) { 1469 uint64_t seg_m = m[seg + idx]; 1470 for (int e = 0; e < 2; e++) { 1471 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e], 1472 sel_a, sel_b, sub_i); 1473 } 1474 } 1475 } 1476 1477 #define DO_ZZXZ(NAME, TYPE, H, OP) \ 1478 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1479 { \ 1480 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \ 1481 intptr_t i, j, idx = simd_data(desc); \ 1482 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \ 1483 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1484 TYPE mm = m[i]; \ 1485 for (j = 0; j < segment; j++) { \ 1486 d[i + j] = OP(n[i + j], mm, a[i + j]); \ 1487 } \ 1488 } \ 1489 } 1490 1491 #define DO_SQRDMLAH_H(N, M, A) \ 1492 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); }) 1493 #define DO_SQRDMLAH_S(N, M, A) \ 1494 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); }) 1495 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true) 1496 1497 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1498 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1499 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D) 1500 1501 #define DO_SQRDMLSH_H(N, M, A) \ 1502 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); }) 1503 #define DO_SQRDMLSH_S(N, M, A) \ 1504 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); }) 1505 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true) 1506 1507 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H) 1508 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S) 1509 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D) 1510 1511 #undef DO_ZZXZ 1512 1513 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1514 void HELPER(NAME)(void *vd, void *vn, 
void *vm, void *va, uint32_t desc) \ 1515 { \ 1516 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1517 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1518 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1519 for (i = 0; i < oprsz; i += 16) { \ 1520 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1521 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1522 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1523 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \ 1524 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \ 1525 } \ 1526 } \ 1527 } 1528 1529 #define DO_MLA(N, M, A) (A + N * M) 1530 1531 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA) 1532 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA) 1533 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA) 1534 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA) 1535 1536 #define DO_MLS(N, M, A) (A - N * M) 1537 1538 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS) 1539 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS) 1540 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS) 1541 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS) 1542 1543 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M)) 1544 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M)) 1545 1546 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S) 1547 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D) 1548 1549 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M)) 1550 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M)) 1551 1552 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S) 1553 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D) 1554 1555 #undef DO_MLA 1556 #undef DO_MLS 1557 #undef DO_ZZXW 1558 1559 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1560 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1561 { \ 1562 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1563 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1564 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1565 for (i = 0; i < oprsz; i += 16) { \ 1566 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1567 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1568 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1569 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \ 1570 } \ 1571 } \ 1572 } 1573 1574 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1575 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1576 1577 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1578 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1579 1580 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1581 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1582 1583 #undef DO_ZZX 1584 1585 #define DO_BITPERM(NAME, TYPE, OP) \ 1586 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1587 { \ 1588 intptr_t i, opr_sz = simd_oprsz(desc); \ 1589 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1590 TYPE nn = *(TYPE *)(vn + i); \ 1591 TYPE mm = *(TYPE *)(vm + i); \ 1592 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \ 1593 } \ 1594 } 1595 1596 static uint64_t bitextract(uint64_t data, uint64_t mask, int n) 1597 { 1598 uint64_t res = 0; 1599 int db, rb = 0; 1600 1601 for (db = 0; db < n; ++db) { 1602 if ((mask >> db) & 1) { 1603 res |= ((data >> db) & 1) 
<< rb; 1604 ++rb; 1605 } 1606 } 1607 return res; 1608 } 1609 1610 DO_BITPERM(sve2_bext_b, uint8_t, bitextract) 1611 DO_BITPERM(sve2_bext_h, uint16_t, bitextract) 1612 DO_BITPERM(sve2_bext_s, uint32_t, bitextract) 1613 DO_BITPERM(sve2_bext_d, uint64_t, bitextract) 1614 1615 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n) 1616 { 1617 uint64_t res = 0; 1618 int rb, db = 0; 1619 1620 for (rb = 0; rb < n; ++rb) { 1621 if ((mask >> rb) & 1) { 1622 res |= ((data >> db) & 1) << rb; 1623 ++db; 1624 } 1625 } 1626 return res; 1627 } 1628 1629 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit) 1630 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit) 1631 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit) 1632 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit) 1633 1634 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n) 1635 { 1636 uint64_t resm = 0, resu = 0; 1637 int db, rbm = 0, rbu = 0; 1638 1639 for (db = 0; db < n; ++db) { 1640 uint64_t val = (data >> db) & 1; 1641 if ((mask >> db) & 1) { 1642 resm |= val << rbm++; 1643 } else { 1644 resu |= val << rbu++; 1645 } 1646 } 1647 1648 return resm | (resu << rbm); 1649 } 1650 1651 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup) 1652 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup) 1653 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup) 1654 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup) 1655 1656 #undef DO_BITPERM 1657 1658 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \ 1659 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1660 { \ 1661 intptr_t i, opr_sz = simd_oprsz(desc); \ 1662 int sub_r = simd_data(desc); \ 1663 if (sub_r) { \ 1664 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1665 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1666 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1667 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1668 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1669 acc_r = ADD_OP(acc_r, el2_i); \ 1670 acc_i = SUB_OP(acc_i, el2_r); \ 1671 *(TYPE *)(vd + H(i)) = acc_r; \ 1672 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1673 } \ 1674 } else { \ 1675 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1676 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1677 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1678 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1679 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1680 acc_r = SUB_OP(acc_r, el2_i); \ 1681 acc_i = ADD_OP(acc_i, el2_r); \ 1682 *(TYPE *)(vd + H(i)) = acc_r; \ 1683 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1684 } \ 1685 } \ 1686 } 1687 1688 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB) 1689 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB) 1690 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB) 1691 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB) 1692 1693 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B) 1694 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H) 1695 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S) 1696 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d) 1697 1698 #undef DO_CADD 1699 1700 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \ 1701 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1702 { \ 1703 intptr_t i, opr_sz = simd_oprsz(desc); \ 1704 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \ 1705 int shift = simd_data(desc) >> 1; \ 1706 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1707 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \ 1708 *(TYPEW *)(vd + HW(i)) = nn << shift; \ 1709 } \ 1710 } 1711 1712 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1) 1713 DO_ZZI_SHLL(sve2_sshll_s, int32_t, 
int16_t, H1_4, H1_2) 1714 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4) 1715 1716 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1) 1717 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2) 1718 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4) 1719 1720 #undef DO_ZZI_SHLL 1721 1722 /* Two-operand reduction expander, controlled by a predicate. 1723 * The difference between TYPERED and TYPERET has to do with 1724 * sign-extension. E.g. for SMAX, TYPERED must be signed, 1725 * but TYPERET must be unsigned so that e.g. a 32-bit value 1726 * is not sign-extended to the ABI uint64_t return type. 1727 */ 1728 /* ??? If we were to vectorize this by hand the reduction ordering 1729 * would change. For integer operands, this is perfectly fine. 1730 */ 1731 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \ 1732 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1733 { \ 1734 intptr_t i, opr_sz = simd_oprsz(desc); \ 1735 TYPERED ret = INIT; \ 1736 for (i = 0; i < opr_sz; ) { \ 1737 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 1738 do { \ 1739 if (pg & 1) { \ 1740 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \ 1741 ret = OP(ret, nn); \ 1742 } \ 1743 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \ 1744 } while (i & 15); \ 1745 } \ 1746 return (TYPERET)ret; \ 1747 } 1748 1749 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \ 1750 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1751 { \ 1752 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 1753 TYPEE *n = vn; \ 1754 uint8_t *pg = vg; \ 1755 TYPER ret = INIT; \ 1756 for (i = 0; i < opr_sz; i += 1) { \ 1757 if (pg[H1(i)] & 1) { \ 1758 TYPEE nn = n[i]; \ 1759 ret = OP(ret, nn); \ 1760 } \ 1761 } \ 1762 return ret; \ 1763 } 1764 1765 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR) 1766 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR) 1767 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR) 1768 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR) 1769 1770 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR) 1771 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR) 1772 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR) 1773 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR) 1774 1775 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND) 1776 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND) 1777 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND) 1778 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND) 1779 1780 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1781 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1782 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1783 1784 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1785 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1786 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1787 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD) 1788 1789 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX) 1790 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX) 1791 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX) 1792 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX) 1793 1794 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX) 1795 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX) 1796 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX) 
1797 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX) 1798 1799 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN) 1800 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN) 1801 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN) 1802 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN) 1803 1804 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN) 1805 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN) 1806 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN) 1807 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) 1808 1809 #undef DO_VPZ 1810 #undef DO_VPZ_D 1811 1812 /* Two vector operand, one scalar operand, unpredicated. */ 1813 #define DO_ZZI(NAME, TYPE, OP) \ 1814 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ 1815 { \ 1816 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1817 TYPE s = s64, *d = vd, *n = vn; \ 1818 for (i = 0; i < opr_sz; ++i) { \ 1819 d[i] = OP(n[i], s); \ 1820 } \ 1821 } 1822 1823 #define DO_SUBR(X, Y) (Y - X) 1824 1825 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) 1826 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) 1827 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) 1828 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) 1829 1830 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) 1831 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) 1832 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) 1833 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) 1834 1835 DO_ZZI(sve_smini_b, int8_t, DO_MIN) 1836 DO_ZZI(sve_smini_h, int16_t, DO_MIN) 1837 DO_ZZI(sve_smini_s, int32_t, DO_MIN) 1838 DO_ZZI(sve_smini_d, int64_t, DO_MIN) 1839 1840 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) 1841 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) 1842 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) 1843 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) 1844 1845 DO_ZZI(sve_umini_b, uint8_t, DO_MIN) 1846 DO_ZZI(sve_umini_h, uint16_t, DO_MIN) 1847 DO_ZZI(sve_umini_s, uint32_t, DO_MIN) 1848 DO_ZZI(sve_umini_d, uint64_t, DO_MIN) 1849 1850 #undef DO_ZZI 1851 1852 #undef DO_AND 1853 #undef DO_ORR 1854 #undef DO_EOR 1855 #undef DO_BIC 1856 #undef DO_ADD 1857 #undef DO_SUB 1858 #undef DO_MAX 1859 #undef DO_MIN 1860 #undef DO_ABD 1861 #undef DO_MUL 1862 #undef DO_DIV 1863 #undef DO_ASR 1864 #undef DO_LSR 1865 #undef DO_LSL 1866 #undef DO_SUBR 1867 1868 /* Similar to the ARM LastActiveElement pseudocode function, except the 1869 result is multiplied by the element size. This includes the not found 1870 indication; e.g. not found for esz=3 is -8. */ 1871 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) 1872 { 1873 uint64_t mask = pred_esz_masks[esz]; 1874 intptr_t i = words; 1875 1876 do { 1877 uint64_t this_g = g[--i] & mask; 1878 if (this_g) { 1879 return i * 64 + (63 - clz64(this_g)); 1880 } 1881 } while (i > 0); 1882 return (intptr_t)-1 << esz; 1883 } 1884 1885 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc) 1886 { 1887 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1888 uint32_t flags = PREDTEST_INIT; 1889 uint64_t *d = vd, *g = vg; 1890 intptr_t i = 0; 1891 1892 do { 1893 uint64_t this_d = d[i]; 1894 uint64_t this_g = g[i]; 1895 1896 if (this_g) { 1897 if (!(flags & 4)) { 1898 /* Set in D the first bit of G. 
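 Only the lowest set bit of G is ORed in, and only for the first word of G that has any bits set, so the rest of D is left unchanged, which is the PFIRST behaviour.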
*/ 1899 this_d |= this_g & -this_g; 1900 d[i] = this_d; 1901 } 1902 flags = iter_predtest_fwd(this_d, this_g, flags); 1903 } 1904 } while (++i < words); 1905 1906 return flags; 1907 } 1908 1909 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc) 1910 { 1911 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1912 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 1913 uint32_t flags = PREDTEST_INIT; 1914 uint64_t *d = vd, *g = vg, esz_mask; 1915 intptr_t i, next; 1916 1917 next = last_active_element(vd, words, esz) + (1 << esz); 1918 esz_mask = pred_esz_masks[esz]; 1919 1920 /* Similar to the pseudocode for pnext, but scaled by ESZ 1921 so that we find the correct bit. */ 1922 if (next < words * 64) { 1923 uint64_t mask = -1; 1924 1925 if (next & 63) { 1926 mask = ~((1ull << (next & 63)) - 1); 1927 next &= -64; 1928 } 1929 do { 1930 uint64_t this_g = g[next / 64] & esz_mask & mask; 1931 if (this_g != 0) { 1932 next = (next & -64) + ctz64(this_g); 1933 break; 1934 } 1935 next += 64; 1936 mask = -1; 1937 } while (next < words * 64); 1938 } 1939 1940 i = 0; 1941 do { 1942 uint64_t this_d = 0; 1943 if (i == next / 64) { 1944 this_d = 1ull << (next & 63); 1945 } 1946 d[i] = this_d; 1947 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags); 1948 } while (++i < words); 1949 1950 return flags; 1951 } 1952 1953 /* 1954 * Copy Zn into Zd, and store zero into inactive elements. 1955 * If inv, store zeros into the active elements. 1956 */ 1957 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc) 1958 { 1959 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1960 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1961 uint64_t *d = vd, *n = vn; 1962 uint8_t *pg = vg; 1963 1964 for (i = 0; i < opr_sz; i += 1) { 1965 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv); 1966 } 1967 } 1968 1969 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc) 1970 { 1971 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1972 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1973 uint64_t *d = vd, *n = vn; 1974 uint8_t *pg = vg; 1975 1976 for (i = 0; i < opr_sz; i += 1) { 1977 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv); 1978 } 1979 } 1980 1981 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc) 1982 { 1983 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1984 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1985 uint64_t *d = vd, *n = vn; 1986 uint8_t *pg = vg; 1987 1988 for (i = 0; i < opr_sz; i += 1) { 1989 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv); 1990 } 1991 } 1992 1993 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc) 1994 { 1995 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1996 uint64_t *d = vd, *n = vn; 1997 uint8_t *pg = vg; 1998 uint8_t inv = simd_data(desc); 1999 2000 for (i = 0; i < opr_sz; i += 1) { 2001 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1); 2002 } 2003 } 2004 2005 /* Three-operand expander, immediate operand, controlled by a predicate. 2006 */ 2007 #define DO_ZPZI(NAME, TYPE, H, OP) \ 2008 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2009 { \ 2010 intptr_t i, opr_sz = simd_oprsz(desc); \ 2011 TYPE imm = simd_data(desc); \ 2012 for (i = 0; i < opr_sz; ) { \ 2013 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2014 do { \ 2015 if (pg & 1) { \ 2016 TYPE nn = *(TYPE *)(vn + H(i)); \ 2017 *(TYPE *)(vd + H(i)) = OP(nn, imm); \ 2018 } \ 2019 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2020 } while (i & 15); \ 2021 } \ 2022 } 2023 2024 /* Similarly, specialized for 64-bit operands. 
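 * With 64-bit elements each predicate byte governs exactly one element and
 * only bit 0 of that byte is significant, so the expander below can walk
 * plain arrays instead of decoding 16-byte predicate chunks as DO_ZPZI does.
 * As a sketch of one expansion, DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
 * becomes: for each i with (pg[H1(i)] & 1), d[i] = DO_ASRD(n[i], imm),
 * where imm comes from simd_data(desc).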
*/ 2025 #define DO_ZPZI_D(NAME, TYPE, OP) \ 2026 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2027 { \ 2028 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2029 TYPE *d = vd, *n = vn; \ 2030 TYPE imm = simd_data(desc); \ 2031 uint8_t *pg = vg; \ 2032 for (i = 0; i < opr_sz; i += 1) { \ 2033 if (pg[H1(i)] & 1) { \ 2034 TYPE nn = n[i]; \ 2035 d[i] = OP(nn, imm); \ 2036 } \ 2037 } \ 2038 } 2039 2040 #define DO_SHR(N, M) (N >> M) 2041 #define DO_SHL(N, M) (N << M) 2042 2043 /* Arithmetic shift right for division. This rounds negative numbers 2044 toward zero as per signed division. Therefore before shifting, 2045 when N is negative, add 2**M-1. */ 2046 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M) 2047 2048 static inline uint64_t do_urshr(uint64_t x, unsigned sh) 2049 { 2050 if (likely(sh < 64)) { 2051 return (x >> sh) + ((x >> (sh - 1)) & 1); 2052 } else if (sh == 64) { 2053 return x >> 63; 2054 } else { 2055 return 0; 2056 } 2057 } 2058 2059 static inline int64_t do_srshr(int64_t x, unsigned sh) 2060 { 2061 if (likely(sh < 64)) { 2062 return (x >> sh) + ((x >> (sh - 1)) & 1); 2063 } else { 2064 /* Rounding the sign bit always produces 0. */ 2065 return 0; 2066 } 2067 } 2068 2069 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) 2070 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) 2071 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) 2072 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR) 2073 2074 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR) 2075 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR) 2076 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR) 2077 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR) 2078 2079 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL) 2080 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL) 2081 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL) 2082 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL) 2083 2084 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD) 2085 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD) 2086 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD) 2087 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD) 2088 2089 /* SVE2 bitwise shift by immediate */ 2090 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b) 2091 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h) 2092 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s) 2093 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d) 2094 2095 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b) 2096 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h) 2097 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s) 2098 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d) 2099 2100 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr) 2101 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr) 2102 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr) 2103 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr) 2104 2105 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr) 2106 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr) 2107 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr) 2108 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr) 2109 2110 #define do_suqrshl_b(n, m) \ 2111 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 2112 #define do_suqrshl_h(n, m) \ 2113 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 2114 #define do_suqrshl_s(n, m) \ 2115 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); }) 2116 #define do_suqrshl_d(n, m) \ 2117 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); }) 2118 2119 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b) 2120 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h) 2121 
DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s) 2122 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d) 2123 2124 #undef DO_ASRD 2125 #undef DO_ZPZI 2126 #undef DO_ZPZI_D 2127 2128 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \ 2129 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2130 { \ 2131 intptr_t i, opr_sz = simd_oprsz(desc); \ 2132 int shift = simd_data(desc); \ 2133 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2134 TYPEW nn = *(TYPEW *)(vn + i); \ 2135 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \ 2136 } \ 2137 } 2138 2139 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 2140 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2141 { \ 2142 intptr_t i, opr_sz = simd_oprsz(desc); \ 2143 int shift = simd_data(desc); \ 2144 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2145 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2146 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \ 2147 } \ 2148 } 2149 2150 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR) 2151 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR) 2152 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR) 2153 2154 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR) 2155 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR) 2156 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR) 2157 2158 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr) 2159 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr) 2160 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr) 2161 2162 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) 2163 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) 2164 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) 2165 2166 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX) 2167 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX) 2168 #define DO_SQSHRUN_D(x, sh) \ 2169 do_sat_bhs((int64_t)(x) >> (sh < 64 ? 
sh : 63), 0, UINT32_MAX) 2170 2171 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) 2172 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) 2173 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D) 2174 2175 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) 2176 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) 2177 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) 2178 2179 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX) 2180 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX) 2181 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX) 2182 2183 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) 2184 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) 2185 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D) 2186 2187 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) 2188 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) 2189 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) 2190 2191 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX) 2192 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX) 2193 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX) 2194 2195 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) 2196 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) 2197 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D) 2198 2199 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) 2200 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) 2201 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) 2202 2203 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX) 2204 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX) 2205 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX) 2206 2207 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) 2208 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) 2209 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D) 2210 2211 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H) 2212 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S) 2213 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D) 2214 2215 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX) 2216 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX) 2217 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX) 2218 2219 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H) 2220 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S) 2221 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D) 2222 2223 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H) 2224 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S) 2225 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D) 2226 2227 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX) 2228 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX) 2229 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX) 2230 2231 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H) 2232 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S) 2233 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D) 2234 2235 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H) 2236 DO_SHRNT(sve2_uqrshrnt_s, 
uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S) 2237 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D) 2238 2239 #undef DO_SHRNB 2240 #undef DO_SHRNT 2241 2242 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \ 2243 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2244 { \ 2245 intptr_t i, opr_sz = simd_oprsz(desc); \ 2246 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2247 TYPEW nn = *(TYPEW *)(vn + i); \ 2248 TYPEW mm = *(TYPEW *)(vm + i); \ 2249 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \ 2250 } \ 2251 } 2252 2253 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \ 2254 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2255 { \ 2256 intptr_t i, opr_sz = simd_oprsz(desc); \ 2257 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2258 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2259 TYPEW mm = *(TYPEW *)(vm + HW(i)); \ 2260 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \ 2261 } \ 2262 } 2263 2264 #define DO_ADDHN(N, M, SH) ((N + M) >> SH) 2265 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH) 2266 #define DO_SUBHN(N, M, SH) ((N - M) >> SH) 2267 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH) 2268 2269 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN) 2270 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN) 2271 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN) 2272 2273 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN) 2274 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN) 2275 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN) 2276 2277 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN) 2278 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN) 2279 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN) 2280 2281 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN) 2282 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN) 2283 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN) 2284 2285 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN) 2286 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN) 2287 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN) 2288 2289 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN) 2290 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN) 2291 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN) 2292 2293 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN) 2294 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN) 2295 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN) 2296 2297 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN) 2298 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN) 2299 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN) 2300 2301 #undef DO_RSUBHN 2302 #undef DO_SUBHN 2303 #undef DO_RADDHN 2304 #undef DO_ADDHN 2305 2306 #undef DO_BINOPNB 2307 2308 /* Fully general four-operand expander, controlled by a predicate. 
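 * The addend is passed first, i.e. OP(aa, nn, mm).  As a sketch of one
 * expansion, DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) below computes, for
 * every byte lane whose predicate bit is set,
 *   *(uint8_t *)(vd + H1(i)) = aa + nn * mm;
 * i.e. a predicated multiply-accumulate.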
2309 */ 2310 #define DO_ZPZZZ(NAME, TYPE, H, OP) \ 2311 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2312 void *vg, uint32_t desc) \ 2313 { \ 2314 intptr_t i, opr_sz = simd_oprsz(desc); \ 2315 for (i = 0; i < opr_sz; ) { \ 2316 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2317 do { \ 2318 if (pg & 1) { \ 2319 TYPE nn = *(TYPE *)(vn + H(i)); \ 2320 TYPE mm = *(TYPE *)(vm + H(i)); \ 2321 TYPE aa = *(TYPE *)(va + H(i)); \ 2322 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \ 2323 } \ 2324 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2325 } while (i & 15); \ 2326 } \ 2327 } 2328 2329 /* Similarly, specialized for 64-bit operands. */ 2330 #define DO_ZPZZZ_D(NAME, TYPE, OP) \ 2331 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2332 void *vg, uint32_t desc) \ 2333 { \ 2334 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2335 TYPE *d = vd, *a = va, *n = vn, *m = vm; \ 2336 uint8_t *pg = vg; \ 2337 for (i = 0; i < opr_sz; i += 1) { \ 2338 if (pg[H1(i)] & 1) { \ 2339 TYPE aa = a[i], nn = n[i], mm = m[i]; \ 2340 d[i] = OP(aa, nn, mm); \ 2341 } \ 2342 } \ 2343 } 2344 2345 #define DO_MLA(A, N, M) (A + N * M) 2346 #define DO_MLS(A, N, M) (A - N * M) 2347 2348 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) 2349 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS) 2350 2351 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA) 2352 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS) 2353 2354 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA) 2355 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS) 2356 2357 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA) 2358 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS) 2359 2360 #undef DO_MLA 2361 #undef DO_MLS 2362 #undef DO_ZPZZZ 2363 #undef DO_ZPZZZ_D 2364 2365 void HELPER(sve_index_b)(void *vd, uint32_t start, 2366 uint32_t incr, uint32_t desc) 2367 { 2368 intptr_t i, opr_sz = simd_oprsz(desc); 2369 uint8_t *d = vd; 2370 for (i = 0; i < opr_sz; i += 1) { 2371 d[H1(i)] = start + i * incr; 2372 } 2373 } 2374 2375 void HELPER(sve_index_h)(void *vd, uint32_t start, 2376 uint32_t incr, uint32_t desc) 2377 { 2378 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2379 uint16_t *d = vd; 2380 for (i = 0; i < opr_sz; i += 1) { 2381 d[H2(i)] = start + i * incr; 2382 } 2383 } 2384 2385 void HELPER(sve_index_s)(void *vd, uint32_t start, 2386 uint32_t incr, uint32_t desc) 2387 { 2388 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2389 uint32_t *d = vd; 2390 for (i = 0; i < opr_sz; i += 1) { 2391 d[H4(i)] = start + i * incr; 2392 } 2393 } 2394 2395 void HELPER(sve_index_d)(void *vd, uint64_t start, 2396 uint64_t incr, uint32_t desc) 2397 { 2398 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2399 uint64_t *d = vd; 2400 for (i = 0; i < opr_sz; i += 1) { 2401 d[i] = start + i * incr; 2402 } 2403 } 2404 2405 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc) 2406 { 2407 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2408 uint32_t sh = simd_data(desc); 2409 uint32_t *d = vd, *n = vn, *m = vm; 2410 for (i = 0; i < opr_sz; i += 1) { 2411 d[i] = n[i] + (m[i] << sh); 2412 } 2413 } 2414 2415 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc) 2416 { 2417 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2418 uint64_t sh = simd_data(desc); 2419 uint64_t *d = vd, *n = vn, *m = vm; 2420 for (i = 0; i < opr_sz; i += 1) { 2421 d[i] = n[i] + (m[i] << sh); 2422 } 2423 } 2424 2425 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc) 2426 { 2427 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2428 uint64_t sh = simd_data(desc); 2429 uint64_t *d = vd, *n = vn, *m = vm; 2430 for (i = 0; i < opr_sz; i += 1) { 2431 d[i] = 
n[i] + ((uint64_t)(int32_t)m[i] << sh); 2432 } 2433 } 2434 2435 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc) 2436 { 2437 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2438 uint64_t sh = simd_data(desc); 2439 uint64_t *d = vd, *n = vn, *m = vm; 2440 for (i = 0; i < opr_sz; i += 1) { 2441 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh); 2442 } 2443 } 2444 2445 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc) 2446 { 2447 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2448 static const uint16_t coeff[] = { 2449 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8, 2450 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189, 2451 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295, 2452 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4, 2453 }; 2454 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2455 uint16_t *d = vd, *n = vn; 2456 2457 for (i = 0; i < opr_sz; i++) { 2458 uint16_t nn = n[i]; 2459 intptr_t idx = extract32(nn, 0, 5); 2460 uint16_t exp = extract32(nn, 5, 5); 2461 d[i] = coeff[idx] | (exp << 10); 2462 } 2463 } 2464 2465 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc) 2466 { 2467 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2468 static const uint32_t coeff[] = { 2469 0x000000, 0x0164d2, 0x02cd87, 0x043a29, 2470 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5, 2471 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc, 2472 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d, 2473 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda, 2474 0x1ef532, 0x20b051, 0x227043, 0x243516, 2475 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a, 2476 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4, 2477 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b, 2478 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd, 2479 0x45672a, 0x478d75, 0x49b9be, 0x4bec15, 2480 0x4e248c, 0x506334, 0x52a81e, 0x54f35b, 2481 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5, 2482 0x60ccdf, 0x633f89, 0x65b907, 0x68396a, 2483 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177, 2484 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c, 2485 }; 2486 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2487 uint32_t *d = vd, *n = vn; 2488 2489 for (i = 0; i < opr_sz; i++) { 2490 uint32_t nn = n[i]; 2491 intptr_t idx = extract32(nn, 0, 6); 2492 uint32_t exp = extract32(nn, 6, 8); 2493 d[i] = coeff[idx] | (exp << 23); 2494 } 2495 } 2496 2497 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc) 2498 { 2499 /* These constants are cut-and-paste directly from the ARM pseudocode. 
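 * Each entry holds the 52 fraction bits of 2^(i/64): FEXPA indexes the table
 * with the low 6 input bits and ORs the next 11 input bits on top as the
 * double-precision exponent, e.g. coeff[32] == 0x6A09E667F3BCD is the
 * fraction of sqrt(2).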
*/ 2500 static const uint64_t coeff[] = { 2501 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull, 2502 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull, 2503 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull, 2504 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull, 2505 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull, 2506 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull, 2507 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull, 2508 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull, 2509 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull, 2510 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull, 2511 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull, 2512 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull, 2513 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull, 2514 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull, 2515 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull, 2516 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull, 2517 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull, 2518 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull, 2519 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull, 2520 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull, 2521 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull, 2522 0xFA7C1819E90D8ull, 2523 }; 2524 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2525 uint64_t *d = vd, *n = vn; 2526 2527 for (i = 0; i < opr_sz; i++) { 2528 uint64_t nn = n[i]; 2529 intptr_t idx = extract32(nn, 0, 6); 2530 uint64_t exp = extract32(nn, 6, 11); 2531 d[i] = coeff[idx] | (exp << 52); 2532 } 2533 } 2534 2535 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc) 2536 { 2537 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2538 uint16_t *d = vd, *n = vn, *m = vm; 2539 for (i = 0; i < opr_sz; i += 1) { 2540 uint16_t nn = n[i]; 2541 uint16_t mm = m[i]; 2542 if (mm & 1) { 2543 nn = float16_one; 2544 } 2545 d[i] = nn ^ (mm & 2) << 14; 2546 } 2547 } 2548 2549 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc) 2550 { 2551 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2552 uint32_t *d = vd, *n = vn, *m = vm; 2553 for (i = 0; i < opr_sz; i += 1) { 2554 uint32_t nn = n[i]; 2555 uint32_t mm = m[i]; 2556 if (mm & 1) { 2557 nn = float32_one; 2558 } 2559 d[i] = nn ^ (mm & 2) << 30; 2560 } 2561 } 2562 2563 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc) 2564 { 2565 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2566 uint64_t *d = vd, *n = vn, *m = vm; 2567 for (i = 0; i < opr_sz; i += 1) { 2568 uint64_t nn = n[i]; 2569 uint64_t mm = m[i]; 2570 if (mm & 1) { 2571 nn = float64_one; 2572 } 2573 d[i] = nn ^ (mm & 2) << 62; 2574 } 2575 } 2576 2577 /* 2578 * Signed saturating addition with scalar operand. 
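 * The scalar B is signed and callers may pass it negative, so these helpers
 * also serve for saturating decrement; only the unsigned 64-bit case needs a
 * separate subtraction helper, since a negative B cannot be encoded there
 * (sve_uqsubi_d below).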
2579 */ 2580 2581 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2582 { 2583 intptr_t i, oprsz = simd_oprsz(desc); 2584 2585 for (i = 0; i < oprsz; i += sizeof(int8_t)) { 2586 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i)); 2587 } 2588 } 2589 2590 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2591 { 2592 intptr_t i, oprsz = simd_oprsz(desc); 2593 2594 for (i = 0; i < oprsz; i += sizeof(int16_t)) { 2595 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i)); 2596 } 2597 } 2598 2599 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2600 { 2601 intptr_t i, oprsz = simd_oprsz(desc); 2602 2603 for (i = 0; i < oprsz; i += sizeof(int32_t)) { 2604 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i)); 2605 } 2606 } 2607 2608 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc) 2609 { 2610 intptr_t i, oprsz = simd_oprsz(desc); 2611 2612 for (i = 0; i < oprsz; i += sizeof(int64_t)) { 2613 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i)); 2614 } 2615 } 2616 2617 /* 2618 * Unsigned saturating addition with scalar operand. 2619 */ 2620 2621 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2622 { 2623 intptr_t i, oprsz = simd_oprsz(desc); 2624 2625 for (i = 0; i < oprsz; i += sizeof(uint8_t)) { 2626 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i)); 2627 } 2628 } 2629 2630 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2631 { 2632 intptr_t i, oprsz = simd_oprsz(desc); 2633 2634 for (i = 0; i < oprsz; i += sizeof(uint16_t)) { 2635 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i)); 2636 } 2637 } 2638 2639 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2640 { 2641 intptr_t i, oprsz = simd_oprsz(desc); 2642 2643 for (i = 0; i < oprsz; i += sizeof(uint32_t)) { 2644 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i)); 2645 } 2646 } 2647 2648 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2649 { 2650 intptr_t i, oprsz = simd_oprsz(desc); 2651 2652 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2653 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i)); 2654 } 2655 } 2656 2657 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2658 { 2659 intptr_t i, oprsz = simd_oprsz(desc); 2660 2661 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2662 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b); 2663 } 2664 } 2665 2666 /* Two operand predicated copy immediate with merge. All valid immediates 2667 * can fit within 17 signed bits in the simd_data field. 
2668 */ 2669 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg, 2670 uint64_t mm, uint32_t desc) 2671 { 2672 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2673 uint64_t *d = vd, *n = vn; 2674 uint8_t *pg = vg; 2675 2676 mm = dup_const(MO_8, mm); 2677 for (i = 0; i < opr_sz; i += 1) { 2678 uint64_t nn = n[i]; 2679 uint64_t pp = expand_pred_b(pg[H1(i)]); 2680 d[i] = (mm & pp) | (nn & ~pp); 2681 } 2682 } 2683 2684 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg, 2685 uint64_t mm, uint32_t desc) 2686 { 2687 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2688 uint64_t *d = vd, *n = vn; 2689 uint8_t *pg = vg; 2690 2691 mm = dup_const(MO_16, mm); 2692 for (i = 0; i < opr_sz; i += 1) { 2693 uint64_t nn = n[i]; 2694 uint64_t pp = expand_pred_h(pg[H1(i)]); 2695 d[i] = (mm & pp) | (nn & ~pp); 2696 } 2697 } 2698 2699 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg, 2700 uint64_t mm, uint32_t desc) 2701 { 2702 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2703 uint64_t *d = vd, *n = vn; 2704 uint8_t *pg = vg; 2705 2706 mm = dup_const(MO_32, mm); 2707 for (i = 0; i < opr_sz; i += 1) { 2708 uint64_t nn = n[i]; 2709 uint64_t pp = expand_pred_s(pg[H1(i)]); 2710 d[i] = (mm & pp) | (nn & ~pp); 2711 } 2712 } 2713 2714 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg, 2715 uint64_t mm, uint32_t desc) 2716 { 2717 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2718 uint64_t *d = vd, *n = vn; 2719 uint8_t *pg = vg; 2720 2721 for (i = 0; i < opr_sz; i += 1) { 2722 uint64_t nn = n[i]; 2723 d[i] = (pg[H1(i)] & 1 ? mm : nn); 2724 } 2725 } 2726 2727 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc) 2728 { 2729 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2730 uint64_t *d = vd; 2731 uint8_t *pg = vg; 2732 2733 val = dup_const(MO_8, val); 2734 for (i = 0; i < opr_sz; i += 1) { 2735 d[i] = val & expand_pred_b(pg[H1(i)]); 2736 } 2737 } 2738 2739 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc) 2740 { 2741 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2742 uint64_t *d = vd; 2743 uint8_t *pg = vg; 2744 2745 val = dup_const(MO_16, val); 2746 for (i = 0; i < opr_sz; i += 1) { 2747 d[i] = val & expand_pred_h(pg[H1(i)]); 2748 } 2749 } 2750 2751 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc) 2752 { 2753 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2754 uint64_t *d = vd; 2755 uint8_t *pg = vg; 2756 2757 val = dup_const(MO_32, val); 2758 for (i = 0; i < opr_sz; i += 1) { 2759 d[i] = val & expand_pred_s(pg[H1(i)]); 2760 } 2761 } 2762 2763 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc) 2764 { 2765 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2766 uint64_t *d = vd; 2767 uint8_t *pg = vg; 2768 2769 for (i = 0; i < opr_sz; i += 1) { 2770 d[i] = (pg[H1(i)] & 1 ? val : 0); 2771 } 2772 } 2773 2774 /* Big-endian hosts need to frob the byte indices. If the copy 2775 * happens to be 8-byte aligned, then no frobbing necessary. 
2776 */ 2777 static void swap_memmove(void *vd, void *vs, size_t n) 2778 { 2779 uintptr_t d = (uintptr_t)vd; 2780 uintptr_t s = (uintptr_t)vs; 2781 uintptr_t o = (d | s | n) & 7; 2782 size_t i; 2783 2784 #if !HOST_BIG_ENDIAN 2785 o = 0; 2786 #endif 2787 switch (o) { 2788 case 0: 2789 memmove(vd, vs, n); 2790 break; 2791 2792 case 4: 2793 if (d < s || d >= s + n) { 2794 for (i = 0; i < n; i += 4) { 2795 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2796 } 2797 } else { 2798 for (i = n; i > 0; ) { 2799 i -= 4; 2800 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2801 } 2802 } 2803 break; 2804 2805 case 2: 2806 case 6: 2807 if (d < s || d >= s + n) { 2808 for (i = 0; i < n; i += 2) { 2809 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2810 } 2811 } else { 2812 for (i = n; i > 0; ) { 2813 i -= 2; 2814 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2815 } 2816 } 2817 break; 2818 2819 default: 2820 if (d < s || d >= s + n) { 2821 for (i = 0; i < n; i++) { 2822 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2823 } 2824 } else { 2825 for (i = n; i > 0; ) { 2826 i -= 1; 2827 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2828 } 2829 } 2830 break; 2831 } 2832 } 2833 2834 /* Similarly for memset of 0. */ 2835 static void swap_memzero(void *vd, size_t n) 2836 { 2837 uintptr_t d = (uintptr_t)vd; 2838 uintptr_t o = (d | n) & 7; 2839 size_t i; 2840 2841 /* Usually, the first bit of a predicate is set, so N is 0. */ 2842 if (likely(n == 0)) { 2843 return; 2844 } 2845 2846 #if !HOST_BIG_ENDIAN 2847 o = 0; 2848 #endif 2849 switch (o) { 2850 case 0: 2851 memset(vd, 0, n); 2852 break; 2853 2854 case 4: 2855 for (i = 0; i < n; i += 4) { 2856 *(uint32_t *)H1_4(d + i) = 0; 2857 } 2858 break; 2859 2860 case 2: 2861 case 6: 2862 for (i = 0; i < n; i += 2) { 2863 *(uint16_t *)H1_2(d + i) = 0; 2864 } 2865 break; 2866 2867 default: 2868 for (i = 0; i < n; i++) { 2869 *(uint8_t *)H1(d + i) = 0; 2870 } 2871 break; 2872 } 2873 } 2874 2875 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) 2876 { 2877 intptr_t opr_sz = simd_oprsz(desc); 2878 size_t n_ofs = simd_data(desc); 2879 size_t n_siz = opr_sz - n_ofs; 2880 2881 if (vd != vm) { 2882 swap_memmove(vd, vn + n_ofs, n_siz); 2883 swap_memmove(vd + n_siz, vm, n_ofs); 2884 } else if (vd != vn) { 2885 swap_memmove(vd + n_siz, vd, n_ofs); 2886 swap_memmove(vd, vn + n_ofs, n_siz); 2887 } else { 2888 /* vd == vn == vm. Need temp space. 
*/ 2889 ARMVectorReg tmp; 2890 swap_memmove(&tmp, vm, n_ofs); 2891 swap_memmove(vd, vd + n_ofs, n_siz); 2892 memcpy(vd + n_siz, &tmp, n_ofs); 2893 } 2894 } 2895 2896 #define DO_INSR(NAME, TYPE, H) \ 2897 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ 2898 { \ 2899 intptr_t opr_sz = simd_oprsz(desc); \ 2900 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ 2901 *(TYPE *)(vd + H(0)) = val; \ 2902 } 2903 2904 DO_INSR(sve_insr_b, uint8_t, H1) 2905 DO_INSR(sve_insr_h, uint16_t, H1_2) 2906 DO_INSR(sve_insr_s, uint32_t, H1_4) 2907 DO_INSR(sve_insr_d, uint64_t, H1_8) 2908 2909 #undef DO_INSR 2910 2911 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) 2912 { 2913 intptr_t i, j, opr_sz = simd_oprsz(desc); 2914 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2915 uint64_t f = *(uint64_t *)(vn + i); 2916 uint64_t b = *(uint64_t *)(vn + j); 2917 *(uint64_t *)(vd + i) = bswap64(b); 2918 *(uint64_t *)(vd + j) = bswap64(f); 2919 } 2920 } 2921 2922 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) 2923 { 2924 intptr_t i, j, opr_sz = simd_oprsz(desc); 2925 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2926 uint64_t f = *(uint64_t *)(vn + i); 2927 uint64_t b = *(uint64_t *)(vn + j); 2928 *(uint64_t *)(vd + i) = hswap64(b); 2929 *(uint64_t *)(vd + j) = hswap64(f); 2930 } 2931 } 2932 2933 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) 2934 { 2935 intptr_t i, j, opr_sz = simd_oprsz(desc); 2936 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2937 uint64_t f = *(uint64_t *)(vn + i); 2938 uint64_t b = *(uint64_t *)(vn + j); 2939 *(uint64_t *)(vd + i) = rol64(b, 32); 2940 *(uint64_t *)(vd + j) = rol64(f, 32); 2941 } 2942 } 2943 2944 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) 2945 { 2946 intptr_t i, j, opr_sz = simd_oprsz(desc); 2947 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2948 uint64_t f = *(uint64_t *)(vn + i); 2949 uint64_t b = *(uint64_t *)(vn + j); 2950 *(uint64_t *)(vd + i) = b; 2951 *(uint64_t *)(vd + j) = f; 2952 } 2953 } 2954 2955 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); 2956 2957 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, 2958 bool is_tbx, tb_impl_fn *fn) 2959 { 2960 ARMVectorReg scratch; 2961 uintptr_t oprsz = simd_oprsz(desc); 2962 2963 if (unlikely(vd == vn)) { 2964 vn = memcpy(&scratch, vn, oprsz); 2965 } 2966 2967 fn(vd, vn, NULL, vm, oprsz, is_tbx); 2968 } 2969 2970 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm, 2971 uint32_t desc, bool is_tbx, tb_impl_fn *fn) 2972 { 2973 ARMVectorReg scratch; 2974 uintptr_t oprsz = simd_oprsz(desc); 2975 2976 if (unlikely(vd == vn0)) { 2977 vn0 = memcpy(&scratch, vn0, oprsz); 2978 if (vd == vn1) { 2979 vn1 = vn0; 2980 } 2981 } else if (unlikely(vd == vn1)) { 2982 vn1 = memcpy(&scratch, vn1, oprsz); 2983 } 2984 2985 fn(vd, vn0, vn1, vm, oprsz, is_tbx); 2986 } 2987 2988 #define DO_TB(SUFF, TYPE, H) \ 2989 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \ 2990 void *vm, uintptr_t oprsz, bool is_tbx) \ 2991 { \ 2992 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \ 2993 uintptr_t i, nelem = oprsz / sizeof(TYPE); \ 2994 for (i = 0; i < nelem; ++i) { \ 2995 TYPE index = indexes[H1(i)], val = 0; \ 2996 if (index < nelem) { \ 2997 val = tbl0[H(index)]; \ 2998 } else { \ 2999 index -= nelem; \ 3000 if (tbl1 && index < nelem) { \ 3001 val = tbl1[H(index)]; \ 3002 } else if (is_tbx) { \ 3003 continue; \ 3004 } \ 3005 } \ 3006 
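/* TBL writes zero for an out-of-range index (val remains 0); TBX instead skips the store via the continue above, leaving that destination element unchanged. */ \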
d[H(i)] = val; \ 3007 } \ 3008 } \ 3009 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3010 { \ 3011 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \ 3012 } \ 3013 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \ 3014 void *vm, uint32_t desc) \ 3015 { \ 3016 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \ 3017 } \ 3018 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3019 { \ 3020 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \ 3021 } 3022 3023 DO_TB(b, uint8_t, H1) 3024 DO_TB(h, uint16_t, H2) 3025 DO_TB(s, uint32_t, H4) 3026 DO_TB(d, uint64_t, H8) 3027 3028 #undef DO_TB 3029 3030 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ 3031 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 3032 { \ 3033 intptr_t i, opr_sz = simd_oprsz(desc); \ 3034 TYPED *d = vd; \ 3035 TYPES *n = vn; \ 3036 ARMVectorReg tmp; \ 3037 if (unlikely(vn - vd < opr_sz)) { \ 3038 n = memcpy(&tmp, n, opr_sz / 2); \ 3039 } \ 3040 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ 3041 d[HD(i)] = n[HS(i)]; \ 3042 } \ 3043 } 3044 3045 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) 3046 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) 3047 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4) 3048 3049 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) 3050 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) 3051 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4) 3052 3053 #undef DO_UNPK 3054 3055 /* Mask of bits included in the even numbered predicates of width esz. 3056 * We also use this for expand_bits/compress_bits, and so extend the 3057 * same pattern out to 16-bit units. 3058 */ 3059 static const uint64_t even_bit_esz_masks[5] = { 3060 0x5555555555555555ull, 3061 0x3333333333333333ull, 3062 0x0f0f0f0f0f0f0f0full, 3063 0x00ff00ff00ff00ffull, 3064 0x0000ffff0000ffffull, 3065 }; 3066 3067 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits. 3068 * For N==0, this corresponds to the operation that in qemu/bitops.h 3069 * we call half_shuffle64; this algorithm is from Hacker's Delight, 3070 * section 7-2 Shuffling Bits. 3071 */ 3072 static uint64_t expand_bits(uint64_t x, int n) 3073 { 3074 int i; 3075 3076 x &= 0xffffffffu; 3077 for (i = 4; i >= n; i--) { 3078 int sh = 1 << i; 3079 x = ((x << sh) | x) & even_bit_esz_masks[i]; 3080 } 3081 return x; 3082 } 3083 3084 /* Compress units of 2**(N+1) bits to units of 2**N bits. 3085 * For N==0, this corresponds to the operation that in qemu/bitops.h 3086 * we call half_unshuffle64; this algorithm is from Hacker's Delight, 3087 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. 
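 * E.g. with N == 0, expand_bits(0b1011, 0) == 0b01000101 (a zero inserted
 * above every bit) and compress_bits(0b01000101, 0) == 0b1011 undoes it; the
 * zip/uzp predicate helpers below call these with N == esz to spread or
 * gather the predicate bits for elements of size 1 << esz.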
3088 */ 3089 static uint64_t compress_bits(uint64_t x, int n) 3090 { 3091 int i; 3092 3093 for (i = n; i <= 4; i++) { 3094 int sh = 1 << i; 3095 x &= even_bit_esz_masks[i]; 3096 x = (x >> sh) | x; 3097 } 3098 return x & 0xffffffffu; 3099 } 3100 3101 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3102 { 3103 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3104 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3105 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3106 int esize = 1 << esz; 3107 uint64_t *d = vd; 3108 intptr_t i; 3109 3110 if (oprsz <= 8) { 3111 uint64_t nn = *(uint64_t *)vn; 3112 uint64_t mm = *(uint64_t *)vm; 3113 int half = 4 * oprsz; 3114 3115 nn = extract64(nn, high * half, half); 3116 mm = extract64(mm, high * half, half); 3117 nn = expand_bits(nn, esz); 3118 mm = expand_bits(mm, esz); 3119 d[0] = nn | (mm << esize); 3120 } else { 3121 ARMPredicateReg tmp; 3122 3123 /* We produce output faster than we consume input. 3124 Therefore we must be mindful of possible overlap. */ 3125 if (vd == vn) { 3126 vn = memcpy(&tmp, vn, oprsz); 3127 if (vd == vm) { 3128 vm = vn; 3129 } 3130 } else if (vd == vm) { 3131 vm = memcpy(&tmp, vm, oprsz); 3132 } 3133 if (high) { 3134 high = oprsz >> 1; 3135 } 3136 3137 if ((oprsz & 7) == 0) { 3138 uint32_t *n = vn, *m = vm; 3139 high >>= 2; 3140 3141 for (i = 0; i < oprsz / 8; i++) { 3142 uint64_t nn = n[H4(high + i)]; 3143 uint64_t mm = m[H4(high + i)]; 3144 3145 nn = expand_bits(nn, esz); 3146 mm = expand_bits(mm, esz); 3147 d[i] = nn | (mm << esize); 3148 } 3149 } else { 3150 uint8_t *n = vn, *m = vm; 3151 uint16_t *d16 = vd; 3152 3153 for (i = 0; i < oprsz / 2; i++) { 3154 uint16_t nn = n[H1(high + i)]; 3155 uint16_t mm = m[H1(high + i)]; 3156 3157 nn = expand_bits(nn, esz); 3158 mm = expand_bits(mm, esz); 3159 d16[H2(i)] = nn | (mm << esize); 3160 } 3161 } 3162 } 3163 } 3164 3165 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3166 { 3167 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3168 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3169 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz; 3170 uint64_t *d = vd, *n = vn, *m = vm; 3171 uint64_t l, h; 3172 intptr_t i; 3173 3174 if (oprsz <= 8) { 3175 l = compress_bits(n[0] >> odd, esz); 3176 h = compress_bits(m[0] >> odd, esz); 3177 d[0] = l | (h << (4 * oprsz)); 3178 } else { 3179 ARMPredicateReg tmp_m; 3180 intptr_t oprsz_16 = oprsz / 16; 3181 3182 if ((vm - vd) < (uintptr_t)oprsz) { 3183 m = memcpy(&tmp_m, vm, oprsz); 3184 } 3185 3186 for (i = 0; i < oprsz_16; i++) { 3187 l = n[2 * i + 0]; 3188 h = n[2 * i + 1]; 3189 l = compress_bits(l >> odd, esz); 3190 h = compress_bits(h >> odd, esz); 3191 d[i] = l | (h << 32); 3192 } 3193 3194 /* 3195 * For VL which is not a multiple of 512, the results from M do not 3196 * align nicely with the uint64_t for D. Put the aligned results 3197 * from M into TMP_M and then copy it into place afterward. 
3198 */ 3199 if (oprsz & 15) { 3200 int final_shift = (oprsz & 15) * 2; 3201 3202 l = n[2 * i + 0]; 3203 h = n[2 * i + 1]; 3204 l = compress_bits(l >> odd, esz); 3205 h = compress_bits(h >> odd, esz); 3206 d[i] = l | (h << final_shift); 3207 3208 for (i = 0; i < oprsz_16; i++) { 3209 l = m[2 * i + 0]; 3210 h = m[2 * i + 1]; 3211 l = compress_bits(l >> odd, esz); 3212 h = compress_bits(h >> odd, esz); 3213 tmp_m.p[i] = l | (h << 32); 3214 } 3215 l = m[2 * i + 0]; 3216 h = m[2 * i + 1]; 3217 l = compress_bits(l >> odd, esz); 3218 h = compress_bits(h >> odd, esz); 3219 tmp_m.p[i] = l | (h << final_shift); 3220 3221 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); 3222 } else { 3223 for (i = 0; i < oprsz_16; i++) { 3224 l = m[2 * i + 0]; 3225 h = m[2 * i + 1]; 3226 l = compress_bits(l >> odd, esz); 3227 h = compress_bits(h >> odd, esz); 3228 d[oprsz_16 + i] = l | (h << 32); 3229 } 3230 } 3231 } 3232 } 3233 3234 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3235 { 3236 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3237 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3238 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA); 3239 uint64_t *d = vd, *n = vn, *m = vm; 3240 uint64_t mask; 3241 int shr, shl; 3242 intptr_t i; 3243 3244 shl = 1 << esz; 3245 shr = 0; 3246 mask = even_bit_esz_masks[esz]; 3247 if (odd) { 3248 mask <<= shl; 3249 shr = shl; 3250 shl = 0; 3251 } 3252 3253 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { 3254 uint64_t nn = (n[i] & mask) >> shr; 3255 uint64_t mm = (m[i] & mask) << shl; 3256 d[i] = nn + mm; 3257 } 3258 } 3259 3260 /* Reverse units of 2**N bits. */ 3261 static uint64_t reverse_bits_64(uint64_t x, int n) 3262 { 3263 int i, sh; 3264 3265 x = bswap64(x); 3266 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3267 uint64_t mask = even_bit_esz_masks[i]; 3268 x = ((x & mask) << sh) | ((x >> sh) & mask); 3269 } 3270 return x; 3271 } 3272 3273 static uint8_t reverse_bits_8(uint8_t x, int n) 3274 { 3275 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; 3276 int i, sh; 3277 3278 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3279 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); 3280 } 3281 return x; 3282 } 3283 3284 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) 3285 { 3286 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3287 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3288 intptr_t i, oprsz_2 = oprsz / 2; 3289 3290 if (oprsz <= 8) { 3291 uint64_t l = *(uint64_t *)vn; 3292 l = reverse_bits_64(l << (64 - 8 * oprsz), esz); 3293 *(uint64_t *)vd = l; 3294 } else if ((oprsz & 15) == 0) { 3295 for (i = 0; i < oprsz_2; i += 8) { 3296 intptr_t ih = oprsz - 8 - i; 3297 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); 3298 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); 3299 *(uint64_t *)(vd + i) = h; 3300 *(uint64_t *)(vd + ih) = l; 3301 } 3302 } else { 3303 for (i = 0; i < oprsz_2; i += 1) { 3304 intptr_t il = H1(i); 3305 intptr_t ih = H1(oprsz - 1 - i); 3306 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); 3307 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); 3308 *(uint8_t *)(vd + il) = h; 3309 *(uint8_t *)(vd + ih) = l; 3310 } 3311 } 3312 } 3313 3314 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) 3315 { 3316 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3317 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3318 uint64_t *d = vd; 3319 intptr_t i; 3320 3321 if (oprsz <= 8) { 3322 uint64_t nn = *(uint64_t *)vn; 3323 int half = 4 * oprsz; 3324 3325 nn = 
extract64(nn, high * half, half); 3326 nn = expand_bits(nn, 0); 3327 d[0] = nn; 3328 } else { 3329 ARMPredicateReg tmp_n; 3330 3331 /* We produce output faster than we consume input. 3332 Therefore we must be mindful of possible overlap. */ 3333 if ((vn - vd) < (uintptr_t)oprsz) { 3334 vn = memcpy(&tmp_n, vn, oprsz); 3335 } 3336 if (high) { 3337 high = oprsz >> 1; 3338 } 3339 3340 if ((oprsz & 7) == 0) { 3341 uint32_t *n = vn; 3342 high >>= 2; 3343 3344 for (i = 0; i < oprsz / 8; i++) { 3345 uint64_t nn = n[H4(high + i)]; 3346 d[i] = expand_bits(nn, 0); 3347 } 3348 } else { 3349 uint16_t *d16 = vd; 3350 uint8_t *n = vn; 3351 3352 for (i = 0; i < oprsz / 2; i++) { 3353 uint16_t nn = n[H1(high + i)]; 3354 d16[H2(i)] = expand_bits(nn, 0); 3355 } 3356 } 3357 } 3358 } 3359 3360 #define DO_ZIP(NAME, TYPE, H) \ 3361 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3362 { \ 3363 intptr_t oprsz = simd_oprsz(desc); \ 3364 intptr_t odd_ofs = simd_data(desc); \ 3365 intptr_t i, oprsz_2 = oprsz / 2; \ 3366 ARMVectorReg tmp_n, tmp_m; \ 3367 /* We produce output faster than we consume input. \ 3368 Therefore we must be mindful of possible overlap. */ \ 3369 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ 3370 vn = memcpy(&tmp_n, vn, oprsz); \ 3371 } \ 3372 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3373 vm = memcpy(&tmp_m, vm, oprsz); \ 3374 } \ 3375 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ 3376 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \ 3377 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \ 3378 *(TYPE *)(vm + odd_ofs + H(i)); \ 3379 } \ 3380 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3381 memset(vd + oprsz - 16, 0, 16); \ 3382 } \ 3383 } 3384 3385 DO_ZIP(sve_zip_b, uint8_t, H1) 3386 DO_ZIP(sve_zip_h, uint16_t, H1_2) 3387 DO_ZIP(sve_zip_s, uint32_t, H1_4) 3388 DO_ZIP(sve_zip_d, uint64_t, H1_8) 3389 DO_ZIP(sve2_zip_q, Int128, ) 3390 3391 #define DO_UZP(NAME, TYPE, H) \ 3392 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3393 { \ 3394 intptr_t oprsz = simd_oprsz(desc); \ 3395 intptr_t odd_ofs = simd_data(desc); \ 3396 intptr_t i, p; \ 3397 ARMVectorReg tmp_m; \ 3398 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3399 vm = memcpy(&tmp_m, vm, oprsz); \ 3400 } \ 3401 i = 0, p = odd_ofs; \ 3402 do { \ 3403 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \ 3404 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3405 } while (p < oprsz); \ 3406 p -= oprsz; \ 3407 do { \ 3408 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \ 3409 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3410 } while (p < oprsz); \ 3411 tcg_debug_assert(i == oprsz); \ 3412 } 3413 3414 DO_UZP(sve_uzp_b, uint8_t, H1) 3415 DO_UZP(sve_uzp_h, uint16_t, H1_2) 3416 DO_UZP(sve_uzp_s, uint32_t, H1_4) 3417 DO_UZP(sve_uzp_d, uint64_t, H1_8) 3418 DO_UZP(sve2_uzp_q, Int128, ) 3419 3420 #define DO_TRN(NAME, TYPE, H) \ 3421 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3422 { \ 3423 intptr_t oprsz = simd_oprsz(desc); \ 3424 intptr_t odd_ofs = simd_data(desc); \ 3425 intptr_t i; \ 3426 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ 3427 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ 3428 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ 3429 *(TYPE *)(vd + H(i + 0)) = ae; \ 3430 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ 3431 } \ 3432 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3433 memset(vd + oprsz - 16, 0, 16); \ 3434 } \ 3435 } 3436 3437 DO_TRN(sve_trn_b, uint8_t, H1) 3438 DO_TRN(sve_trn_h, uint16_t, H1_2) 3439 DO_TRN(sve_trn_s, uint32_t, H1_4) 3440 DO_TRN(sve_trn_d, 
uint64_t, H1_8) 3441 DO_TRN(sve2_trn_q, Int128, ) 3442 3443 #undef DO_ZIP 3444 #undef DO_UZP 3445 #undef DO_TRN 3446 3447 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) 3448 { 3449 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; 3450 uint32_t *d = vd, *n = vn; 3451 uint8_t *pg = vg; 3452 3453 for (i = j = 0; i < opr_sz; i++) { 3454 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { 3455 d[H4(j)] = n[H4(i)]; 3456 j++; 3457 } 3458 } 3459 for (; j < opr_sz; j++) { 3460 d[H4(j)] = 0; 3461 } 3462 } 3463 3464 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) 3465 { 3466 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; 3467 uint64_t *d = vd, *n = vn; 3468 uint8_t *pg = vg; 3469 3470 for (i = j = 0; i < opr_sz; i++) { 3471 if (pg[H1(i)] & 1) { 3472 d[j] = n[i]; 3473 j++; 3474 } 3475 } 3476 for (; j < opr_sz; j++) { 3477 d[j] = 0; 3478 } 3479 } 3480 3481 /* Similar to the ARM LastActiveElement pseudocode function, except the 3482 * result is multiplied by the element size. This includes the not found 3483 * indication; e.g. not found for esz=3 is -8. 3484 */ 3485 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) 3486 { 3487 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 3488 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3489 3490 return last_active_element(vg, words, esz); 3491 } 3492 3493 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) 3494 { 3495 intptr_t opr_sz = simd_oprsz(desc) / 8; 3496 int esz = simd_data(desc); 3497 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; 3498 intptr_t i, first_i, last_i; 3499 ARMVectorReg tmp; 3500 3501 first_i = last_i = 0; 3502 first_g = last_g = 0; 3503 3504 /* Find the extent of the active elements within VG. 
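 * Added note: the loop walks the predicate one 64-bit word at a time
 * from the top down, recording the lowest and highest words that
 * contain an active bit.  Since each predicate bit governs one byte of
 * the vector, the bit indices computed below double as byte offsets
 * into Zn, and LEN is therefore a byte count.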
*/ 3505 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { 3506 pg = *(uint64_t *)(vg + i) & mask; 3507 if (pg) { 3508 if (last_g == 0) { 3509 last_g = pg; 3510 last_i = i; 3511 } 3512 first_g = pg; 3513 first_i = i; 3514 } 3515 } 3516 3517 len = 0; 3518 if (first_g != 0) { 3519 first_i = first_i * 8 + ctz64(first_g); 3520 last_i = last_i * 8 + 63 - clz64(last_g); 3521 len = last_i - first_i + (1 << esz); 3522 if (vd == vm) { 3523 vm = memcpy(&tmp, vm, opr_sz * 8); 3524 } 3525 swap_memmove(vd, vn + first_i, len); 3526 } 3527 swap_memmove(vd + len, vm, opr_sz * 8 - len); 3528 } 3529 3530 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, 3531 void *vg, uint32_t desc) 3532 { 3533 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3534 uint64_t *d = vd, *n = vn, *m = vm; 3535 uint8_t *pg = vg; 3536 3537 for (i = 0; i < opr_sz; i += 1) { 3538 uint64_t nn = n[i], mm = m[i]; 3539 uint64_t pp = expand_pred_b(pg[H1(i)]); 3540 d[i] = (nn & pp) | (mm & ~pp); 3541 } 3542 } 3543 3544 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, 3545 void *vg, uint32_t desc) 3546 { 3547 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3548 uint64_t *d = vd, *n = vn, *m = vm; 3549 uint8_t *pg = vg; 3550 3551 for (i = 0; i < opr_sz; i += 1) { 3552 uint64_t nn = n[i], mm = m[i]; 3553 uint64_t pp = expand_pred_h(pg[H1(i)]); 3554 d[i] = (nn & pp) | (mm & ~pp); 3555 } 3556 } 3557 3558 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, 3559 void *vg, uint32_t desc) 3560 { 3561 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3562 uint64_t *d = vd, *n = vn, *m = vm; 3563 uint8_t *pg = vg; 3564 3565 for (i = 0; i < opr_sz; i += 1) { 3566 uint64_t nn = n[i], mm = m[i]; 3567 uint64_t pp = expand_pred_s(pg[H1(i)]); 3568 d[i] = (nn & pp) | (mm & ~pp); 3569 } 3570 } 3571 3572 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, 3573 void *vg, uint32_t desc) 3574 { 3575 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3576 uint64_t *d = vd, *n = vn, *m = vm; 3577 uint8_t *pg = vg; 3578 3579 for (i = 0; i < opr_sz; i += 1) { 3580 uint64_t nn = n[i], mm = m[i]; 3581 d[i] = (pg[H1(i)] & 1 ? nn : mm); 3582 } 3583 } 3584 3585 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm, 3586 void *vg, uint32_t desc) 3587 { 3588 intptr_t i, opr_sz = simd_oprsz(desc) / 16; 3589 Int128 *d = vd, *n = vn, *m = vm; 3590 uint16_t *pg = vg; 3591 3592 for (i = 0; i < opr_sz; i += 1) { 3593 d[i] = (pg[H2(i)] & 1 ? n : m)[i]; 3594 } 3595 } 3596 3597 /* Two operand comparison controlled by a predicate. 3598 * ??? It is very tempting to want to be able to expand this inline 3599 * with x86 instructions, e.g. 3600 * 3601 * vcmpeqw zm, zn, %ymm0 3602 * vpmovmskb %ymm0, %eax 3603 * and $0x5555, %eax 3604 * and pg, %eax 3605 * 3606 * or even aarch64, e.g. 3607 * 3608 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 3609 * cmeq v0.8h, zn, zm 3610 * and v0.8h, v0.8h, mask 3611 * addv h0, v0.8h 3612 * and v0.8b, pg 3613 * 3614 * However, coming up with an abstraction that allows vector inputs and 3615 * a scalar output, and also handles the byte-ordering of sub-uint64_t 3616 * scalar outputs, is tricky. 
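 *
 * Added note: as written, the element at byte offset I within each
 * 64-bit chunk deposits its result at bit I of OUT (OUT is shifted left
 * by sizeof(TYPE) per element), which is exactly the one-bit-per-byte
 * SVE predicate layout, before OUT is masked with the governing
 * predicate.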
3617 */ 3618 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ 3619 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3620 { \ 3621 intptr_t opr_sz = simd_oprsz(desc); \ 3622 uint32_t flags = PREDTEST_INIT; \ 3623 intptr_t i = opr_sz; \ 3624 do { \ 3625 uint64_t out = 0, pg; \ 3626 do { \ 3627 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3628 TYPE nn = *(TYPE *)(vn + H(i)); \ 3629 TYPE mm = *(TYPE *)(vm + H(i)); \ 3630 out |= nn OP mm; \ 3631 } while (i & 63); \ 3632 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3633 out &= pg; \ 3634 *(uint64_t *)(vd + (i >> 3)) = out; \ 3635 flags = iter_predtest_bwd(out, pg, flags); \ 3636 } while (i > 0); \ 3637 return flags; \ 3638 } 3639 3640 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ 3641 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3642 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ 3643 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3644 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ 3645 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3646 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ 3647 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3648 3649 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) 3650 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) 3651 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) 3652 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) 3653 3654 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) 3655 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) 3656 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) 3657 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) 3658 3659 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) 3660 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) 3661 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) 3662 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) 3663 3664 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) 3665 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) 3666 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) 3667 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) 3668 3669 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) 3670 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) 3671 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) 3672 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) 3673 3674 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) 3675 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) 3676 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) 3677 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) 3678 3679 #undef DO_CMP_PPZZ_B 3680 #undef DO_CMP_PPZZ_H 3681 #undef DO_CMP_PPZZ_S 3682 #undef DO_CMP_PPZZ_D 3683 #undef DO_CMP_PPZZ 3684 3685 /* Similar, but the second source is "wide". 
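 * Added note: each 64-bit element of Zm is compared against every
 * narrow element of Zn that lies in the same 64-bit slice; the middle
 * loop reloads MM once per 8 bytes and the inner loop steps through the
 * narrow elements of that slice.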
*/ 3686 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ 3687 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3688 { \ 3689 intptr_t opr_sz = simd_oprsz(desc); \ 3690 uint32_t flags = PREDTEST_INIT; \ 3691 intptr_t i = opr_sz; \ 3692 do { \ 3693 uint64_t out = 0, pg; \ 3694 do { \ 3695 TYPEW mm = *(TYPEW *)(vm + i - 8); \ 3696 do { \ 3697 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3698 TYPE nn = *(TYPE *)(vn + H(i)); \ 3699 out |= nn OP mm; \ 3700 } while (i & 7); \ 3701 } while (i & 63); \ 3702 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3703 out &= pg; \ 3704 *(uint64_t *)(vd + (i >> 3)) = out; \ 3705 flags = iter_predtest_bwd(out, pg, flags); \ 3706 } while (i > 0); \ 3707 return flags; \ 3708 } 3709 3710 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ 3711 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) 3712 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ 3713 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) 3714 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ 3715 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) 3716 3717 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==) 3718 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==) 3719 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==) 3720 3721 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=) 3722 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=) 3723 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=) 3724 3725 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) 3726 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) 3727 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) 3728 3729 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) 3730 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) 3731 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) 3732 3733 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) 3734 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) 3735 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) 3736 3737 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) 3738 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) 3739 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) 3740 3741 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) 3742 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) 3743 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) 3744 3745 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) 3746 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) 3747 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) 3748 3749 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) 3750 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) 3751 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) 3752 3753 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) 3754 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) 3755 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) 3756 3757 #undef DO_CMP_PPZW_B 3758 #undef DO_CMP_PPZW_H 3759 #undef DO_CMP_PPZW_S 3760 #undef DO_CMP_PPZW 3761 3762 /* Similar, but the second source is immediate. 
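 * Added note: the immediate is taken from simd_data(desc) and converted
 * to TYPE once, before the loop, so the same value MM is compared
 * against every element.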
*/ 3763 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ 3764 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 3765 { \ 3766 intptr_t opr_sz = simd_oprsz(desc); \ 3767 uint32_t flags = PREDTEST_INIT; \ 3768 TYPE mm = simd_data(desc); \ 3769 intptr_t i = opr_sz; \ 3770 do { \ 3771 uint64_t out = 0, pg; \ 3772 do { \ 3773 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3774 TYPE nn = *(TYPE *)(vn + H(i)); \ 3775 out |= nn OP mm; \ 3776 } while (i & 63); \ 3777 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3778 out &= pg; \ 3779 *(uint64_t *)(vd + (i >> 3)) = out; \ 3780 flags = iter_predtest_bwd(out, pg, flags); \ 3781 } while (i > 0); \ 3782 return flags; \ 3783 } 3784 3785 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \ 3786 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3787 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \ 3788 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3789 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \ 3790 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3791 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \ 3792 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3793 3794 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) 3795 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) 3796 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) 3797 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) 3798 3799 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) 3800 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) 3801 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) 3802 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) 3803 3804 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) 3805 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) 3806 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) 3807 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) 3808 3809 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) 3810 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) 3811 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) 3812 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) 3813 3814 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) 3815 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) 3816 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) 3817 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) 3818 3819 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) 3820 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) 3821 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) 3822 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) 3823 3824 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) 3825 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) 3826 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) 3827 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) 3828 3829 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) 3830 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) 3831 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) 3832 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) 3833 3834 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) 3835 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) 3836 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) 3837 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) 3838 3839 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) 3840 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) 3841 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) 3842 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) 3843 3844 #undef DO_CMP_PPZI_B 3845 #undef DO_CMP_PPZI_H 3846 #undef DO_CMP_PPZI_S 3847 #undef DO_CMP_PPZI_D 3848 #undef DO_CMP_PPZI 3849 3850 /* Similar to the ARM LastActive pseudocode function. 
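 * Added note: i.e. return the value of the D bit selected by the last
 * (highest numbered) active bit of G.  The scan runs from the top
 * predicate word down, and pow2floor isolates the most significant
 * guard bit of the first non-zero word.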
*/ 3851 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz) 3852 { 3853 intptr_t i; 3854 3855 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) { 3856 uint64_t pg = *(uint64_t *)(vg + i); 3857 if (pg) { 3858 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0; 3859 } 3860 } 3861 return 0; 3862 } 3863 3864 /* Compute a mask into RETB that is true for all G, up to and including 3865 * (if after) or excluding (if !after) the first G & N. 3866 * Return true if BRK found. 3867 */ 3868 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g, 3869 bool brk, bool after) 3870 { 3871 uint64_t b; 3872 3873 if (brk) { 3874 b = 0; 3875 } else if ((g & n) == 0) { 3876 /* For all G, no N are set; break not found. */ 3877 b = g; 3878 } else { 3879 /* Break somewhere in N. Locate it. */ 3880 b = g & n; /* guard true, pred true */ 3881 b = b & -b; /* first such */ 3882 if (after) { 3883 b = b | (b - 1); /* break after same */ 3884 } else { 3885 b = b - 1; /* break before same */ 3886 } 3887 brk = true; 3888 } 3889 3890 *retb = b; 3891 return brk; 3892 } 3893 3894 /* Compute a zeroing BRK. */ 3895 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g, 3896 intptr_t oprsz, bool after) 3897 { 3898 bool brk = false; 3899 intptr_t i; 3900 3901 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3902 uint64_t this_b, this_g = g[i]; 3903 3904 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3905 d[i] = this_b & this_g; 3906 } 3907 } 3908 3909 /* Likewise, but also compute flags. */ 3910 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g, 3911 intptr_t oprsz, bool after) 3912 { 3913 uint32_t flags = PREDTEST_INIT; 3914 bool brk = false; 3915 intptr_t i; 3916 3917 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3918 uint64_t this_b, this_d, this_g = g[i]; 3919 3920 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3921 d[i] = this_d = this_b & this_g; 3922 flags = iter_predtest_fwd(this_d, this_g, flags); 3923 } 3924 return flags; 3925 } 3926 3927 /* Compute a merging BRK. */ 3928 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g, 3929 intptr_t oprsz, bool after) 3930 { 3931 bool brk = false; 3932 intptr_t i; 3933 3934 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3935 uint64_t this_b, this_g = g[i]; 3936 3937 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3938 d[i] = (this_b & this_g) | (d[i] & ~this_g); 3939 } 3940 } 3941 3942 /* Likewise, but also compute flags. */ 3943 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g, 3944 intptr_t oprsz, bool after) 3945 { 3946 uint32_t flags = PREDTEST_INIT; 3947 bool brk = false; 3948 intptr_t i; 3949 3950 for (i = 0; i < oprsz / 8; ++i) { 3951 uint64_t this_b, this_d = d[i], this_g = g[i]; 3952 3953 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3954 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g); 3955 flags = iter_predtest_fwd(this_d, this_g, flags); 3956 } 3957 return flags; 3958 } 3959 3960 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz) 3961 { 3962 /* It is quicker to zero the whole predicate than loop on OPRSZ. 3963 * The compiler should turn this into 4 64-bit integer stores. 
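 * (sizeof(ARMPredicateReg) is 32 bytes: one predicate bit per byte of
 * the 2048-bit maximum vector length gives 256 bits, i.e. exactly four
 * uint64_t stores.)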
3964 */ 3965 memset(d, 0, sizeof(ARMPredicateReg)); 3966 return PREDTEST_INIT; 3967 } 3968 3969 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, 3970 uint32_t pred_desc) 3971 { 3972 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3973 if (last_active_pred(vn, vg, oprsz)) { 3974 compute_brk_z(vd, vm, vg, oprsz, true); 3975 } else { 3976 do_zero(vd, oprsz); 3977 } 3978 } 3979 3980 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, 3981 uint32_t pred_desc) 3982 { 3983 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3984 if (last_active_pred(vn, vg, oprsz)) { 3985 return compute_brks_z(vd, vm, vg, oprsz, true); 3986 } else { 3987 return do_zero(vd, oprsz); 3988 } 3989 } 3990 3991 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, 3992 uint32_t pred_desc) 3993 { 3994 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3995 if (last_active_pred(vn, vg, oprsz)) { 3996 compute_brk_z(vd, vm, vg, oprsz, false); 3997 } else { 3998 do_zero(vd, oprsz); 3999 } 4000 } 4001 4002 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, 4003 uint32_t pred_desc) 4004 { 4005 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4006 if (last_active_pred(vn, vg, oprsz)) { 4007 return compute_brks_z(vd, vm, vg, oprsz, false); 4008 } else { 4009 return do_zero(vd, oprsz); 4010 } 4011 } 4012 4013 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4014 { 4015 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4016 compute_brk_z(vd, vn, vg, oprsz, true); 4017 } 4018 4019 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4020 { 4021 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4022 return compute_brks_z(vd, vn, vg, oprsz, true); 4023 } 4024 4025 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4026 { 4027 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4028 compute_brk_z(vd, vn, vg, oprsz, false); 4029 } 4030 4031 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4032 { 4033 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4034 return compute_brks_z(vd, vn, vg, oprsz, false); 4035 } 4036 4037 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4038 { 4039 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4040 compute_brk_m(vd, vn, vg, oprsz, true); 4041 } 4042 4043 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4044 { 4045 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4046 return compute_brks_m(vd, vn, vg, oprsz, true); 4047 } 4048 4049 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4050 { 4051 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4052 compute_brk_m(vd, vn, vg, oprsz, false); 4053 } 4054 4055 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4056 { 4057 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4058 return compute_brks_m(vd, vn, vg, oprsz, false); 4059 } 4060 4061 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4062 { 4063 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4064 if (!last_active_pred(vn, vg, oprsz)) { 4065 do_zero(vd, oprsz); 4066 } 4067 } 4068 4069 /* As if PredTest(Ones(PL), D, esz). 
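 * Added note: that is, compute NZCV for D as though the governing
 * predicate had every element active; ESZ_MASK is that all-true
 * predicate, with one bit set per element of the given size.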
*/ 4070 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz, 4071 uint64_t esz_mask) 4072 { 4073 uint32_t flags = PREDTEST_INIT; 4074 intptr_t i; 4075 4076 for (i = 0; i < oprsz / 8; i++) { 4077 flags = iter_predtest_fwd(d->p[i], esz_mask, flags); 4078 } 4079 if (oprsz & 7) { 4080 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); 4081 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags); 4082 } 4083 return flags; 4084 } 4085 4086 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4087 { 4088 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4089 if (last_active_pred(vn, vg, oprsz)) { 4090 return predtest_ones(vd, oprsz, -1); 4091 } else { 4092 return do_zero(vd, oprsz); 4093 } 4094 } 4095 4096 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) 4097 { 4098 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 4099 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4100 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; 4101 intptr_t i; 4102 4103 for (i = 0; i < words; ++i) { 4104 uint64_t t = n[i] & g[i] & mask; 4105 sum += ctpop64(t); 4106 } 4107 return sum; 4108 } 4109 4110 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) 4111 { 4112 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4113 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4114 uint64_t esz_mask = pred_esz_masks[esz]; 4115 ARMPredicateReg *d = vd; 4116 uint32_t flags; 4117 intptr_t i; 4118 4119 /* Begin with a zero predicate register. */ 4120 flags = do_zero(d, oprsz); 4121 if (count == 0) { 4122 return flags; 4123 } 4124 4125 /* Set all of the requested bits. */ 4126 for (i = 0; i < count / 64; ++i) { 4127 d->p[i] = esz_mask; 4128 } 4129 if (count & 63) { 4130 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; 4131 } 4132 4133 return predtest_ones(d, oprsz, esz_mask); 4134 } 4135 4136 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) 4137 { 4138 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4139 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4140 uint64_t esz_mask = pred_esz_masks[esz]; 4141 ARMPredicateReg *d = vd; 4142 intptr_t i, invcount, oprbits; 4143 uint64_t bits; 4144 4145 if (count == 0) { 4146 return do_zero(d, oprsz); 4147 } 4148 4149 oprbits = oprsz * 8; 4150 tcg_debug_assert(count <= oprbits); 4151 4152 bits = esz_mask; 4153 if (oprbits & 63) { 4154 bits &= MAKE_64BIT_MASK(0, oprbits & 63); 4155 } 4156 4157 invcount = oprbits - count; 4158 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) { 4159 d->p[i] = bits; 4160 bits = esz_mask; 4161 } 4162 4163 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64); 4164 4165 while (--i >= 0) { 4166 d->p[i] = 0; 4167 } 4168 4169 return predtest_ones(d, oprsz, esz_mask); 4170 } 4171 4172 /* Recursive reduction on a function; 4173 * C.f. the ARM ARM function ReducePredicated. 4174 * 4175 * While it would be possible to write this without the DATA temporary, 4176 * it is much simpler to process the predicate register this way. 4177 * The recursion is bounded to depth 7 (128 fp16 elements), so there's 4178 * little to gain with a more complex non-recursive form. 
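 *
 * Added note: inactive elements and the tail up to MAXSZ are filled
 * with IDENT, so the shape of the tree does not depend on the
 * predicate.  The recursion assumes MAXSZ / sizeof(TYPE) is a power of
 * two so that each split is exact, e.g. for 8 elements:
 *   ((e0 op e1) op (e2 op e3)) op ((e4 op e5) op (e6 op e7))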
4179 */ 4180 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \ 4181 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \ 4182 { \ 4183 if (n == 1) { \ 4184 return *data; \ 4185 } else { \ 4186 uintptr_t half = n / 2; \ 4187 TYPE lo = NAME##_reduce(data, status, half); \ 4188 TYPE hi = NAME##_reduce(data + half, status, half); \ 4189 return TYPE##_##FUNC(lo, hi, status); \ 4190 } \ 4191 } \ 4192 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \ 4193 { \ 4194 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ 4195 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ 4196 for (i = 0; i < oprsz; ) { \ 4197 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 4198 do { \ 4199 TYPE nn = *(TYPE *)(vn + H(i)); \ 4200 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \ 4201 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 4202 } while (i & 15); \ 4203 } \ 4204 for (; i < maxsz; i += sizeof(TYPE)) { \ 4205 *(TYPE *)((void *)data + i) = IDENT; \ 4206 } \ 4207 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \ 4208 } 4209 4210 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero) 4211 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero) 4212 DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero) 4213 4214 /* Identity is floatN_default_nan, without the function call. */ 4215 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00) 4216 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000) 4217 DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL) 4218 4219 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00) 4220 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000) 4221 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL) 4222 4223 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity) 4224 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity) 4225 DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity) 4226 4227 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity)) 4228 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity)) 4229 DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity)) 4230 4231 #undef DO_REDUCE 4232 4233 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg, 4234 void *status, uint32_t desc) 4235 { 4236 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4237 float16 result = nn; 4238 4239 do { 4240 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4241 do { 4242 if (pg & 1) { 4243 float16 mm = *(float16 *)(vm + H1_2(i)); 4244 result = float16_add(result, mm, status); 4245 } 4246 i += sizeof(float16), pg >>= sizeof(float16); 4247 } while (i & 15); 4248 } while (i < opr_sz); 4249 4250 return result; 4251 } 4252 4253 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg, 4254 void *status, uint32_t desc) 4255 { 4256 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4257 float32 result = nn; 4258 4259 do { 4260 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4261 do { 4262 if (pg & 1) { 4263 float32 mm = *(float32 *)(vm + H1_2(i)); 4264 result = float32_add(result, mm, status); 4265 } 4266 i += sizeof(float32), pg >>= sizeof(float32); 4267 } while (i & 15); 4268 } while (i < opr_sz); 4269 4270 return result; 4271 } 4272 4273 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg, 4274 void *status, uint32_t desc) 4275 { 4276 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8; 4277 uint64_t *m = vm; 4278 uint8_t *pg = vg; 4279 4280 for (i = 0; i < opr_sz; i++) { 4281 if (pg[H1(i)] & 1) { 4282 nn = float64_add(nn, 
m[i], status); 4283 } 4284 } 4285 4286 return nn; 4287 } 4288 4289 /* Fully general three-operand expander, controlled by a predicate, 4290 * With the extra float_status parameter. 4291 */ 4292 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \ 4293 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4294 void *status, uint32_t desc) \ 4295 { \ 4296 intptr_t i = simd_oprsz(desc); \ 4297 uint64_t *g = vg; \ 4298 do { \ 4299 uint64_t pg = g[(i - 1) >> 6]; \ 4300 do { \ 4301 i -= sizeof(TYPE); \ 4302 if (likely((pg >> (i & 63)) & 1)) { \ 4303 TYPE nn = *(TYPE *)(vn + H(i)); \ 4304 TYPE mm = *(TYPE *)(vm + H(i)); \ 4305 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4306 } \ 4307 } while (i & 63); \ 4308 } while (i != 0); \ 4309 } 4310 4311 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) 4312 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) 4313 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) 4314 4315 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) 4316 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) 4317 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) 4318 4319 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) 4320 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) 4321 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) 4322 4323 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) 4324 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) 4325 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) 4326 4327 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) 4328 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) 4329 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) 4330 4331 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) 4332 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) 4333 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) 4334 4335 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) 4336 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) 4337 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) 4338 4339 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) 4340 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) 4341 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) 4342 4343 static inline float16 abd_h(float16 a, float16 b, float_status *s) 4344 { 4345 return float16_abs(float16_sub(a, b, s)); 4346 } 4347 4348 static inline float32 abd_s(float32 a, float32 b, float_status *s) 4349 { 4350 return float32_abs(float32_sub(a, b, s)); 4351 } 4352 4353 static inline float64 abd_d(float64 a, float64 b, float_status *s) 4354 { 4355 return float64_abs(float64_sub(a, b, s)); 4356 } 4357 4358 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) 4359 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) 4360 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) 4361 4362 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) 4363 { 4364 int b_int = MIN(MAX(b, INT_MIN), INT_MAX); 4365 return float64_scalbn(a, b_int, s); 4366 } 4367 4368 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) 4369 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) 4370 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d) 4371 4372 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) 4373 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) 4374 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd) 4375 4376 #undef DO_ZPZZ_FP 4377 4378 /* Three-operand expander, with one scalar operand, controlled by 4379 * a predicate, with the extra float_status parameter. 
4380 */ 4381 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \ 4382 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ 4383 void *status, uint32_t desc) \ 4384 { \ 4385 intptr_t i = simd_oprsz(desc); \ 4386 uint64_t *g = vg; \ 4387 TYPE mm = scalar; \ 4388 do { \ 4389 uint64_t pg = g[(i - 1) >> 6]; \ 4390 do { \ 4391 i -= sizeof(TYPE); \ 4392 if (likely((pg >> (i & 63)) & 1)) { \ 4393 TYPE nn = *(TYPE *)(vn + H(i)); \ 4394 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4395 } \ 4396 } while (i & 63); \ 4397 } while (i != 0); \ 4398 } 4399 4400 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) 4401 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) 4402 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add) 4403 4404 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) 4405 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) 4406 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub) 4407 4408 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) 4409 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) 4410 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul) 4411 4412 static inline float16 subr_h(float16 a, float16 b, float_status *s) 4413 { 4414 return float16_sub(b, a, s); 4415 } 4416 4417 static inline float32 subr_s(float32 a, float32 b, float_status *s) 4418 { 4419 return float32_sub(b, a, s); 4420 } 4421 4422 static inline float64 subr_d(float64 a, float64 b, float_status *s) 4423 { 4424 return float64_sub(b, a, s); 4425 } 4426 4427 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) 4428 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) 4429 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d) 4430 4431 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) 4432 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) 4433 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum) 4434 4435 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) 4436 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) 4437 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum) 4438 4439 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) 4440 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) 4441 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max) 4442 4443 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) 4444 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) 4445 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) 4446 4447 /* Fully general two-operand expander, controlled by a predicate, 4448 * With the extra float_status parameter. 4449 */ 4450 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \ 4451 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ 4452 { \ 4453 intptr_t i = simd_oprsz(desc); \ 4454 uint64_t *g = vg; \ 4455 do { \ 4456 uint64_t pg = g[(i - 1) >> 6]; \ 4457 do { \ 4458 i -= sizeof(TYPE); \ 4459 if (likely((pg >> (i & 63)) & 1)) { \ 4460 TYPE nn = *(TYPE *)(vn + H(i)); \ 4461 *(TYPE *)(vd + H(i)) = OP(nn, status); \ 4462 } \ 4463 } while (i & 63); \ 4464 } while (i != 0); \ 4465 } 4466 4467 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore 4468 * FZ16. When converting from fp16, this affects flushing input denormals; 4469 * when converting to fp16, this affects flushing output denormals. 
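 *
 * Added note: the helpers below implement this by saving the relevant
 * flush control (flush_inputs_to_zero or flush_to_zero), clearing it
 * around the softfloat conversion, and then restoring it.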
4470 */ 4471 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst) 4472 { 4473 bool save = get_flush_inputs_to_zero(fpst); 4474 float32 ret; 4475 4476 set_flush_inputs_to_zero(false, fpst); 4477 ret = float16_to_float32(f, true, fpst); 4478 set_flush_inputs_to_zero(save, fpst); 4479 return ret; 4480 } 4481 4482 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) 4483 { 4484 bool save = get_flush_inputs_to_zero(fpst); 4485 float64 ret; 4486 4487 set_flush_inputs_to_zero(false, fpst); 4488 ret = float16_to_float64(f, true, fpst); 4489 set_flush_inputs_to_zero(save, fpst); 4490 return ret; 4491 } 4492 4493 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst) 4494 { 4495 bool save = get_flush_to_zero(fpst); 4496 float16 ret; 4497 4498 set_flush_to_zero(false, fpst); 4499 ret = float32_to_float16(f, true, fpst); 4500 set_flush_to_zero(save, fpst); 4501 return ret; 4502 } 4503 4504 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst) 4505 { 4506 bool save = get_flush_to_zero(fpst); 4507 float16 ret; 4508 4509 set_flush_to_zero(false, fpst); 4510 ret = float64_to_float16(f, true, fpst); 4511 set_flush_to_zero(save, fpst); 4512 return ret; 4513 } 4514 4515 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s) 4516 { 4517 if (float16_is_any_nan(f)) { 4518 float_raise(float_flag_invalid, s); 4519 return 0; 4520 } 4521 return float16_to_int16_round_to_zero(f, s); 4522 } 4523 4524 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s) 4525 { 4526 if (float16_is_any_nan(f)) { 4527 float_raise(float_flag_invalid, s); 4528 return 0; 4529 } 4530 return float16_to_int64_round_to_zero(f, s); 4531 } 4532 4533 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s) 4534 { 4535 if (float32_is_any_nan(f)) { 4536 float_raise(float_flag_invalid, s); 4537 return 0; 4538 } 4539 return float32_to_int64_round_to_zero(f, s); 4540 } 4541 4542 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s) 4543 { 4544 if (float64_is_any_nan(f)) { 4545 float_raise(float_flag_invalid, s); 4546 return 0; 4547 } 4548 return float64_to_int64_round_to_zero(f, s); 4549 } 4550 4551 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s) 4552 { 4553 if (float16_is_any_nan(f)) { 4554 float_raise(float_flag_invalid, s); 4555 return 0; 4556 } 4557 return float16_to_uint16_round_to_zero(f, s); 4558 } 4559 4560 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s) 4561 { 4562 if (float16_is_any_nan(f)) { 4563 float_raise(float_flag_invalid, s); 4564 return 0; 4565 } 4566 return float16_to_uint64_round_to_zero(f, s); 4567 } 4568 4569 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s) 4570 { 4571 if (float32_is_any_nan(f)) { 4572 float_raise(float_flag_invalid, s); 4573 return 0; 4574 } 4575 return float32_to_uint64_round_to_zero(f, s); 4576 } 4577 4578 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) 4579 { 4580 if (float64_is_any_nan(f)) { 4581 float_raise(float_flag_invalid, s); 4582 return 0; 4583 } 4584 return float64_to_uint64_round_to_zero(f, s); 4585 } 4586 4587 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) 4588 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) 4589 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16) 4590 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16) 4591 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64) 4592 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, 
float64_to_float32) 4593 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64) 4594 4595 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) 4596 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) 4597 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) 4598 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz) 4599 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz) 4600 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd) 4601 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz) 4602 4603 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) 4604 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) 4605 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) 4606 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz) 4607 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz) 4608 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd) 4609 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz) 4610 4611 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) 4612 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) 4613 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd) 4614 4615 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) 4616 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) 4617 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int) 4618 4619 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) 4620 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) 4621 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64) 4622 4623 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) 4624 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) 4625 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt) 4626 4627 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) 4628 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) 4629 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) 4630 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64) 4631 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16) 4632 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32) 4633 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64) 4634 4635 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) 4636 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) 4637 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) 4638 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64) 4639 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16) 4640 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32) 4641 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64) 4642 4643 static int16_t do_float16_logb_as_int(float16 a, float_status *s) 4644 { 4645 /* Extract frac to the top of the uint32_t. 
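 * Added note: the fp16 fraction occupies bits [9:0], so shifting left
 * by 16 + 6 places its most significant bit at bit 31; for a denormal
 * input, clz32(frac) is then the number of leading fractional zeros and
 * -15 - clz32(frac) below is the unbiased exponent.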
*/ 4646 uint32_t frac = (uint32_t)a << (16 + 6); 4647 int16_t exp = extract32(a, 10, 5); 4648 4649 if (unlikely(exp == 0)) { 4650 if (frac != 0) { 4651 if (!get_flush_inputs_to_zero(s)) { 4652 /* denormal: bias - fractional_zeros */ 4653 return -15 - clz32(frac); 4654 } 4655 /* flush to zero */ 4656 float_raise(float_flag_input_denormal, s); 4657 } 4658 } else if (unlikely(exp == 0x1f)) { 4659 if (frac == 0) { 4660 return INT16_MAX; /* infinity */ 4661 } 4662 } else { 4663 /* normal: exp - bias */ 4664 return exp - 15; 4665 } 4666 /* nan or zero */ 4667 float_raise(float_flag_invalid, s); 4668 return INT16_MIN; 4669 } 4670 4671 static int32_t do_float32_logb_as_int(float32 a, float_status *s) 4672 { 4673 /* Extract frac to the top of the uint32_t. */ 4674 uint32_t frac = a << 9; 4675 int32_t exp = extract32(a, 23, 8); 4676 4677 if (unlikely(exp == 0)) { 4678 if (frac != 0) { 4679 if (!get_flush_inputs_to_zero(s)) { 4680 /* denormal: bias - fractional_zeros */ 4681 return -127 - clz32(frac); 4682 } 4683 /* flush to zero */ 4684 float_raise(float_flag_input_denormal, s); 4685 } 4686 } else if (unlikely(exp == 0xff)) { 4687 if (frac == 0) { 4688 return INT32_MAX; /* infinity */ 4689 } 4690 } else { 4691 /* normal: exp - bias */ 4692 return exp - 127; 4693 } 4694 /* nan or zero */ 4695 float_raise(float_flag_invalid, s); 4696 return INT32_MIN; 4697 } 4698 4699 static int64_t do_float64_logb_as_int(float64 a, float_status *s) 4700 { 4701 /* Extract frac to the top of the uint64_t. */ 4702 uint64_t frac = a << 12; 4703 int64_t exp = extract64(a, 52, 11); 4704 4705 if (unlikely(exp == 0)) { 4706 if (frac != 0) { 4707 if (!get_flush_inputs_to_zero(s)) { 4708 /* denormal: bias - fractional_zeros */ 4709 return -1023 - clz64(frac); 4710 } 4711 /* flush to zero */ 4712 float_raise(float_flag_input_denormal, s); 4713 } 4714 } else if (unlikely(exp == 0x7ff)) { 4715 if (frac == 0) { 4716 return INT64_MAX; /* infinity */ 4717 } 4718 } else { 4719 /* normal: exp - bias */ 4720 return exp - 1023; 4721 } 4722 /* nan or zero */ 4723 float_raise(float_flag_invalid, s); 4724 return INT64_MIN; 4725 } 4726 4727 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int) 4728 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int) 4729 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) 4730 4731 #undef DO_ZPZ_FP 4732 4733 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, 4734 float_status *status, uint32_t desc, 4735 uint16_t neg1, uint16_t neg3) 4736 { 4737 intptr_t i = simd_oprsz(desc); 4738 uint64_t *g = vg; 4739 4740 do { 4741 uint64_t pg = g[(i - 1) >> 6]; 4742 do { 4743 i -= 2; 4744 if (likely((pg >> (i & 63)) & 1)) { 4745 float16 e1, e2, e3, r; 4746 4747 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; 4748 e2 = *(uint16_t *)(vm + H1_2(i)); 4749 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; 4750 r = float16_muladd(e1, e2, e3, 0, status); 4751 *(uint16_t *)(vd + H1_2(i)) = r; 4752 } 4753 } while (i & 63); 4754 } while (i != 0); 4755 } 4756 4757 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4758 void *vg, void *status, uint32_t desc) 4759 { 4760 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0); 4761 } 4762 4763 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4764 void *vg, void *status, uint32_t desc) 4765 { 4766 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0); 4767 } 4768 4769 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4770 void *vg, void *status, uint32_t desc) 4771 { 4772 
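    /* FNMLA negates both the product and the addend: xor with 0x8000
     * flips the fp16 sign bit of Zn (and hence of Zn * Zm) and of Za.  */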
do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000); 4773 } 4774 4775 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4776 void *vg, void *status, uint32_t desc) 4777 { 4778 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000); 4779 } 4780 4781 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, 4782 float_status *status, uint32_t desc, 4783 uint32_t neg1, uint32_t neg3) 4784 { 4785 intptr_t i = simd_oprsz(desc); 4786 uint64_t *g = vg; 4787 4788 do { 4789 uint64_t pg = g[(i - 1) >> 6]; 4790 do { 4791 i -= 4; 4792 if (likely((pg >> (i & 63)) & 1)) { 4793 float32 e1, e2, e3, r; 4794 4795 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1; 4796 e2 = *(uint32_t *)(vm + H1_4(i)); 4797 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; 4798 r = float32_muladd(e1, e2, e3, 0, status); 4799 *(uint32_t *)(vd + H1_4(i)) = r; 4800 } 4801 } while (i & 63); 4802 } while (i != 0); 4803 } 4804 4805 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4806 void *vg, void *status, uint32_t desc) 4807 { 4808 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0); 4809 } 4810 4811 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4812 void *vg, void *status, uint32_t desc) 4813 { 4814 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0); 4815 } 4816 4817 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4818 void *vg, void *status, uint32_t desc) 4819 { 4820 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000); 4821 } 4822 4823 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4824 void *vg, void *status, uint32_t desc) 4825 { 4826 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000); 4827 } 4828 4829 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, 4830 float_status *status, uint32_t desc, 4831 uint64_t neg1, uint64_t neg3) 4832 { 4833 intptr_t i = simd_oprsz(desc); 4834 uint64_t *g = vg; 4835 4836 do { 4837 uint64_t pg = g[(i - 1) >> 6]; 4838 do { 4839 i -= 8; 4840 if (likely((pg >> (i & 63)) & 1)) { 4841 float64 e1, e2, e3, r; 4842 4843 e1 = *(uint64_t *)(vn + i) ^ neg1; 4844 e2 = *(uint64_t *)(vm + i); 4845 e3 = *(uint64_t *)(va + i) ^ neg3; 4846 r = float64_muladd(e1, e2, e3, 0, status); 4847 *(uint64_t *)(vd + i) = r; 4848 } 4849 } while (i & 63); 4850 } while (i != 0); 4851 } 4852 4853 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4854 void *vg, void *status, uint32_t desc) 4855 { 4856 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0); 4857 } 4858 4859 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4860 void *vg, void *status, uint32_t desc) 4861 { 4862 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0); 4863 } 4864 4865 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4866 void *vg, void *status, uint32_t desc) 4867 { 4868 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN); 4869 } 4870 4871 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4872 void *vg, void *status, uint32_t desc) 4873 { 4874 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN); 4875 } 4876 4877 /* Two operand floating-point comparison controlled by a predicate. 4878 * Unlike the integer version, we are not allowed to optimistically 4879 * compare operands, since the comparison may have side effects wrt 4880 * the FPSR. 
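 * For example, a signalling NaN in an inactive element must not raise
 * the Invalid Operation exception, so OP is evaluated only for active
 * elements.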
4881 */ 4882 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \ 4883 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4884 void *status, uint32_t desc) \ 4885 { \ 4886 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 4887 uint64_t *d = vd, *g = vg; \ 4888 do { \ 4889 uint64_t out = 0, pg = g[j]; \ 4890 do { \ 4891 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 4892 if (likely((pg >> (i & 63)) & 1)) { \ 4893 TYPE nn = *(TYPE *)(vn + H(i)); \ 4894 TYPE mm = *(TYPE *)(vm + H(i)); \ 4895 out |= OP(TYPE, nn, mm, status); \ 4896 } \ 4897 } while (i & 63); \ 4898 d[j--] = out; \ 4899 } while (i > 0); \ 4900 } 4901 4902 #define DO_FPCMP_PPZZ_H(NAME, OP) \ 4903 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP) 4904 #define DO_FPCMP_PPZZ_S(NAME, OP) \ 4905 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) 4906 #define DO_FPCMP_PPZZ_D(NAME, OP) \ 4907 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP) 4908 4909 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \ 4910 DO_FPCMP_PPZZ_H(NAME, OP) \ 4911 DO_FPCMP_PPZZ_S(NAME, OP) \ 4912 DO_FPCMP_PPZZ_D(NAME, OP) 4913 4914 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0 4915 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0 4916 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0 4917 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0 4918 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0 4919 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0 4920 #define DO_FCMUO(TYPE, X, Y, ST) \ 4921 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered 4922 #define DO_FACGE(TYPE, X, Y, ST) \ 4923 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0 4924 #define DO_FACGT(TYPE, X, Y, ST) \ 4925 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0 4926 4927 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) 4928 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT) 4929 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ) 4930 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE) 4931 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO) 4932 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE) 4933 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT) 4934 4935 #undef DO_FPCMP_PPZZ_ALL 4936 #undef DO_FPCMP_PPZZ_D 4937 #undef DO_FPCMP_PPZZ_S 4938 #undef DO_FPCMP_PPZZ_H 4939 #undef DO_FPCMP_PPZZ 4940 4941 /* One operand floating-point comparison against zero, controlled 4942 * by a predicate. 
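 * Added note: the literal 0 passed as the second operand is the +0.0
 * encoding for each TYPE used here, so the comparison macros above are
 * reused unchanged.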
4943 */ 4944 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \ 4945 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 4946 void *status, uint32_t desc) \ 4947 { \ 4948 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 4949 uint64_t *d = vd, *g = vg; \ 4950 do { \ 4951 uint64_t out = 0, pg = g[j]; \ 4952 do { \ 4953 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 4954 if ((pg >> (i & 63)) & 1) { \ 4955 TYPE nn = *(TYPE *)(vn + H(i)); \ 4956 out |= OP(TYPE, nn, 0, status); \ 4957 } \ 4958 } while (i & 63); \ 4959 d[j--] = out; \ 4960 } while (i > 0); \ 4961 } 4962 4963 #define DO_FPCMP_PPZ0_H(NAME, OP) \ 4964 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP) 4965 #define DO_FPCMP_PPZ0_S(NAME, OP) \ 4966 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) 4967 #define DO_FPCMP_PPZ0_D(NAME, OP) \ 4968 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP) 4969 4970 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \ 4971 DO_FPCMP_PPZ0_H(NAME, OP) \ 4972 DO_FPCMP_PPZ0_S(NAME, OP) \ 4973 DO_FPCMP_PPZ0_D(NAME, OP) 4974 4975 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE) 4976 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT) 4977 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE) 4978 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT) 4979 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ) 4980 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE) 4981 4982 /* FP Trig Multiply-Add. */ 4983 4984 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) 4985 { 4986 static const float16 coeff[16] = { 4987 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 4988 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 4989 }; 4990 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); 4991 intptr_t x = simd_data(desc); 4992 float16 *d = vd, *n = vn, *m = vm; 4993 for (i = 0; i < opr_sz; i++) { 4994 float16 mm = m[i]; 4995 intptr_t xx = x; 4996 if (float16_is_neg(mm)) { 4997 mm = float16_abs(mm); 4998 xx += 8; 4999 } 5000 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs); 5001 } 5002 } 5003 5004 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) 5005 { 5006 static const float32 coeff[16] = { 5007 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9, 5008 0x36369d6d, 0x00000000, 0x00000000, 0x00000000, 5009 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705, 5010 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, 5011 }; 5012 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); 5013 intptr_t x = simd_data(desc); 5014 float32 *d = vd, *n = vn, *m = vm; 5015 for (i = 0; i < opr_sz; i++) { 5016 float32 mm = m[i]; 5017 intptr_t xx = x; 5018 if (float32_is_neg(mm)) { 5019 mm = float32_abs(mm); 5020 xx += 8; 5021 } 5022 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs); 5023 } 5024 } 5025 5026 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) 5027 { 5028 static const float64 coeff[16] = { 5029 0x3ff0000000000000ull, 0xbfc5555555555543ull, 5030 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull, 5031 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull, 5032 0x3de5d8408868552full, 0x0000000000000000ull, 5033 0x3ff0000000000000ull, 0xbfe0000000000000ull, 5034 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull, 5035 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull, 5036 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, 5037 }; 5038 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); 5039 intptr_t x = simd_data(desc); 5040 float64 *d = vd, *n = vn, *m = vm; 5041 for (i = 0; i < opr_sz; i++) { 5042 float64 mm = m[i]; 5043 intptr_t xx = x; 5044 if (float64_is_neg(mm)) { 5045 mm = float64_abs(mm); 5046 xx += 8; 5047 } 5048 d[i] = float64_muladd(n[i], mm, 
coeff[xx], 0, vs); 5049 } 5050 } 5051 5052 /* 5053 * FP Complex Add 5054 */ 5055 5056 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, 5057 void *vs, uint32_t desc) 5058 { 5059 intptr_t j, i = simd_oprsz(desc); 5060 uint64_t *g = vg; 5061 float16 neg_imag = float16_set_sign(0, simd_data(desc)); 5062 float16 neg_real = float16_chs(neg_imag); 5063 5064 do { 5065 uint64_t pg = g[(i - 1) >> 6]; 5066 do { 5067 float16 e0, e1, e2, e3; 5068 5069 /* I holds the real index; J holds the imag index. */ 5070 j = i - sizeof(float16); 5071 i -= 2 * sizeof(float16); 5072 5073 e0 = *(float16 *)(vn + H1_2(i)); 5074 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real; 5075 e2 = *(float16 *)(vn + H1_2(j)); 5076 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag; 5077 5078 if (likely((pg >> (i & 63)) & 1)) { 5079 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs); 5080 } 5081 if (likely((pg >> (j & 63)) & 1)) { 5082 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs); 5083 } 5084 } while (i & 63); 5085 } while (i != 0); 5086 } 5087 5088 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, 5089 void *vs, uint32_t desc) 5090 { 5091 intptr_t j, i = simd_oprsz(desc); 5092 uint64_t *g = vg; 5093 float32 neg_imag = float32_set_sign(0, simd_data(desc)); 5094 float32 neg_real = float32_chs(neg_imag); 5095 5096 do { 5097 uint64_t pg = g[(i - 1) >> 6]; 5098 do { 5099 float32 e0, e1, e2, e3; 5100 5101 /* I holds the real index; J holds the imag index. */ 5102 j = i - sizeof(float32); 5103 i -= 2 * sizeof(float32); 5104 5105 e0 = *(float32 *)(vn + H1_2(i)); 5106 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real; 5107 e2 = *(float32 *)(vn + H1_2(j)); 5108 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag; 5109 5110 if (likely((pg >> (i & 63)) & 1)) { 5111 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs); 5112 } 5113 if (likely((pg >> (j & 63)) & 1)) { 5114 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs); 5115 } 5116 } while (i & 63); 5117 } while (i != 0); 5118 } 5119 5120 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, 5121 void *vs, uint32_t desc) 5122 { 5123 intptr_t j, i = simd_oprsz(desc); 5124 uint64_t *g = vg; 5125 float64 neg_imag = float64_set_sign(0, simd_data(desc)); 5126 float64 neg_real = float64_chs(neg_imag); 5127 5128 do { 5129 uint64_t pg = g[(i - 1) >> 6]; 5130 do { 5131 float64 e0, e1, e2, e3; 5132 5133 /* I holds the real index; J holds the imag index. */ 5134 j = i - sizeof(float64); 5135 i -= 2 * sizeof(float64); 5136 5137 e0 = *(float64 *)(vn + H1_2(i)); 5138 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real; 5139 e2 = *(float64 *)(vn + H1_2(j)); 5140 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag; 5141 5142 if (likely((pg >> (i & 63)) & 1)) { 5143 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs); 5144 } 5145 if (likely((pg >> (j & 63)) & 1)) { 5146 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs); 5147 } 5148 } while (i & 63); 5149 } while (i != 0); 5150 } 5151 5152 /* 5153 * FP Complex Multiply 5154 */ 5155 5156 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5157 void *vg, void *status, uint32_t desc) 5158 { 5159 intptr_t j, i = simd_oprsz(desc); 5160 unsigned rot = simd_data(desc); 5161 bool flip = rot & 1; 5162 float16 neg_imag, neg_real; 5163 uint64_t *g = vg; 5164 5165 neg_imag = float16_set_sign(0, (rot & 2) != 0); 5166 neg_real = float16_set_sign(0, rot == 1 || rot == 2); 5167 5168 do { 5169 uint64_t pg = g[(i - 1) >> 6]; 5170 do { 5171 float16 e1, e2, e3, e4, nr, ni, mr, mi, d; 5172 5173 /* I holds the real index; J holds the imag index. 
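 * FLIP selects whether the real or the imaginary half of each Zn pair
 * feeds both partial products for this rotation, and NEG_REAL / NEG_IMAG
 * give the signs applied to the Zm operands of the real and imaginary
 * accumulations respectively.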
*/ 5174 j = i - sizeof(float16); 5175 i -= 2 * sizeof(float16); 5176 5177 nr = *(float16 *)(vn + H1_2(i)); 5178 ni = *(float16 *)(vn + H1_2(j)); 5179 mr = *(float16 *)(vm + H1_2(i)); 5180 mi = *(float16 *)(vm + H1_2(j)); 5181 5182 e2 = (flip ? ni : nr); 5183 e1 = (flip ? mi : mr) ^ neg_real; 5184 e4 = e2; 5185 e3 = (flip ? mr : mi) ^ neg_imag; 5186 5187 if (likely((pg >> (i & 63)) & 1)) { 5188 d = *(float16 *)(va + H1_2(i)); 5189 d = float16_muladd(e2, e1, d, 0, status); 5190 *(float16 *)(vd + H1_2(i)) = d; 5191 } 5192 if (likely((pg >> (j & 63)) & 1)) { 5193 d = *(float16 *)(va + H1_2(j)); 5194 d = float16_muladd(e4, e3, d, 0, status); 5195 *(float16 *)(vd + H1_2(j)) = d; 5196 } 5197 } while (i & 63); 5198 } while (i != 0); 5199 } 5200 5201 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5202 void *vg, void *status, uint32_t desc) 5203 { 5204 intptr_t j, i = simd_oprsz(desc); 5205 unsigned rot = simd_data(desc); 5206 bool flip = rot & 1; 5207 float32 neg_imag, neg_real; 5208 uint64_t *g = vg; 5209 5210 neg_imag = float32_set_sign(0, (rot & 2) != 0); 5211 neg_real = float32_set_sign(0, rot == 1 || rot == 2); 5212 5213 do { 5214 uint64_t pg = g[(i - 1) >> 6]; 5215 do { 5216 float32 e1, e2, e3, e4, nr, ni, mr, mi, d; 5217 5218 /* I holds the real index; J holds the imag index. */ 5219 j = i - sizeof(float32); 5220 i -= 2 * sizeof(float32); 5221 5222 nr = *(float32 *)(vn + H1_2(i)); 5223 ni = *(float32 *)(vn + H1_2(j)); 5224 mr = *(float32 *)(vm + H1_2(i)); 5225 mi = *(float32 *)(vm + H1_2(j)); 5226 5227 e2 = (flip ? ni : nr); 5228 e1 = (flip ? mi : mr) ^ neg_real; 5229 e4 = e2; 5230 e3 = (flip ? mr : mi) ^ neg_imag; 5231 5232 if (likely((pg >> (i & 63)) & 1)) { 5233 d = *(float32 *)(va + H1_2(i)); 5234 d = float32_muladd(e2, e1, d, 0, status); 5235 *(float32 *)(vd + H1_2(i)) = d; 5236 } 5237 if (likely((pg >> (j & 63)) & 1)) { 5238 d = *(float32 *)(va + H1_2(j)); 5239 d = float32_muladd(e4, e3, d, 0, status); 5240 *(float32 *)(vd + H1_2(j)) = d; 5241 } 5242 } while (i & 63); 5243 } while (i != 0); 5244 } 5245 5246 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5247 void *vg, void *status, uint32_t desc) 5248 { 5249 intptr_t j, i = simd_oprsz(desc); 5250 unsigned rot = simd_data(desc); 5251 bool flip = rot & 1; 5252 float64 neg_imag, neg_real; 5253 uint64_t *g = vg; 5254 5255 neg_imag = float64_set_sign(0, (rot & 2) != 0); 5256 neg_real = float64_set_sign(0, rot == 1 || rot == 2); 5257 5258 do { 5259 uint64_t pg = g[(i - 1) >> 6]; 5260 do { 5261 float64 e1, e2, e3, e4, nr, ni, mr, mi, d; 5262 5263 /* I holds the real index; J holds the imag index. */ 5264 j = i - sizeof(float64); 5265 i -= 2 * sizeof(float64); 5266 5267 nr = *(float64 *)(vn + H1_2(i)); 5268 ni = *(float64 *)(vn + H1_2(j)); 5269 mr = *(float64 *)(vm + H1_2(i)); 5270 mi = *(float64 *)(vm + H1_2(j)); 5271 5272 e2 = (flip ? ni : nr); 5273 e1 = (flip ? mi : mr) ^ neg_real; 5274 e4 = e2; 5275 e3 = (flip ? mr : mi) ^ neg_imag; 5276 5277 if (likely((pg >> (i & 63)) & 1)) { 5278 d = *(float64 *)(va + H1_2(i)); 5279 d = float64_muladd(e2, e1, d, 0, status); 5280 *(float64 *)(vd + H1_2(i)) = d; 5281 } 5282 if (likely((pg >> (j & 63)) & 1)) { 5283 d = *(float64 *)(va + H1_2(j)); 5284 d = float64_muladd(e4, e3, d, 0, status); 5285 *(float64 *)(vd + H1_2(j)) = d; 5286 } 5287 } while (i & 63); 5288 } while (i != 0); 5289 } 5290 5291 /* 5292 * Load contiguous data, protected by a governing predicate. 
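 *
 * Added note: the code below proceeds in two stages: find_next_active
 * and sve_cont_ldst_elements locate the active elements (and any
 * element that crosses a page boundary), while sve_probe_page and
 * sve_cont_ldst_pages resolve the guest pages those elements touch.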
5293 */ 5294 5295 /* 5296 * Skip through a sequence of inactive elements in the guarding predicate @vg, 5297 * beginning at @reg_off bounded by @reg_max. Return the offset of the active 5298 * element >= @reg_off, or @reg_max if there were no active elements at all. 5299 */ 5300 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off, 5301 intptr_t reg_max, int esz) 5302 { 5303 uint64_t pg_mask = pred_esz_masks[esz]; 5304 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63); 5305 5306 /* In normal usage, the first element is active. */ 5307 if (likely(pg & 1)) { 5308 return reg_off; 5309 } 5310 5311 if (pg == 0) { 5312 reg_off &= -64; 5313 do { 5314 reg_off += 64; 5315 if (unlikely(reg_off >= reg_max)) { 5316 /* The entire predicate was false. */ 5317 return reg_max; 5318 } 5319 pg = vg[reg_off >> 6] & pg_mask; 5320 } while (pg == 0); 5321 } 5322 reg_off += ctz64(pg); 5323 5324 /* We should never see an out of range predicate bit set. */ 5325 tcg_debug_assert(reg_off < reg_max); 5326 return reg_off; 5327 } 5328 5329 /* 5330 * Resolve the guest virtual address to info->host and info->flags. 5331 * If @nofault, return false if the page is invalid, otherwise 5332 * exit via page fault exception. 5333 */ 5334 5335 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env, 5336 target_ulong addr, int mem_off, MMUAccessType access_type, 5337 int mmu_idx, uintptr_t retaddr) 5338 { 5339 int flags; 5340 5341 addr += mem_off; 5342 5343 /* 5344 * User-only currently always issues with TBI. See the comment 5345 * above useronly_clean_ptr. Usually we clean this top byte away 5346 * during translation, but we can't do that for e.g. vector + imm 5347 * addressing modes. 5348 * 5349 * We currently always enable TBI for user-only, and do not provide 5350 * a way to turn it off. So clean the pointer unconditionally here, 5351 * rather than look it up here, or pass it down from above. 5352 */ 5353 addr = useronly_clean_ptr(addr); 5354 5355 #ifdef CONFIG_USER_ONLY 5356 flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault, 5357 &info->host, retaddr); 5358 #else 5359 CPUTLBEntryFull *full; 5360 flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault, 5361 &info->host, &full, retaddr); 5362 #endif 5363 info->flags = flags; 5364 5365 if (flags & TLB_INVALID_MASK) { 5366 g_assert(nofault); 5367 return false; 5368 } 5369 5370 #ifdef CONFIG_USER_ONLY 5371 memset(&info->attrs, 0, sizeof(info->attrs)); 5372 /* Require both ANON and MTE; see allocation_tag_mem(). */ 5373 info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE); 5374 #else 5375 info->attrs = full->attrs; 5376 info->tagged = full->extra.arm.pte_attrs == 0xf0; 5377 #endif 5378 5379 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */ 5380 info->host -= mem_off; 5381 return true; 5382 } 5383 5384 /* 5385 * Find first active element on each page, and a loose bound for the 5386 * final element on each page. Identify any single element that spans 5387 * the page boundary. Return true if there are any active elements. 5388 */ 5389 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg, 5390 intptr_t reg_max, int esz, int msize) 5391 { 5392 const int esize = 1 << esz; 5393 const uint64_t pg_mask = pred_esz_masks[esz]; 5394 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; 5395 intptr_t mem_off_last, mem_off_split; 5396 intptr_t page_split, elt_split; 5397 intptr_t i; 5398 5399 /* Set all of the element indices to -1, and the TLB data to 0. 
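     * The reg_off_* and mem_off_* fields are all intptr_t, so a byte-wise
     * memset to -1 marks every index as "not present"; callers test them
     * with < 0.  The page[] array is cleared separately below because it
     * must start as all-zero TLB data rather than -1.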
*/ 5400 memset(info, -1, offsetof(SVEContLdSt, page)); 5401 memset(info->page, 0, sizeof(info->page)); 5402 5403 /* Gross scan over the entire predicate to find bounds. */ 5404 i = 0; 5405 do { 5406 uint64_t pg = vg[i] & pg_mask; 5407 if (pg) { 5408 reg_off_last = i * 64 + 63 - clz64(pg); 5409 if (reg_off_first < 0) { 5410 reg_off_first = i * 64 + ctz64(pg); 5411 } 5412 } 5413 } while (++i * 64 < reg_max); 5414 5415 if (unlikely(reg_off_first < 0)) { 5416 /* No active elements, no pages touched. */ 5417 return false; 5418 } 5419 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max); 5420 5421 info->reg_off_first[0] = reg_off_first; 5422 info->mem_off_first[0] = (reg_off_first >> esz) * msize; 5423 mem_off_last = (reg_off_last >> esz) * msize; 5424 5425 page_split = -(addr | TARGET_PAGE_MASK); 5426 if (likely(mem_off_last + msize <= page_split)) { 5427 /* The entire operation fits within a single page. */ 5428 info->reg_off_last[0] = reg_off_last; 5429 return true; 5430 } 5431 5432 info->page_split = page_split; 5433 elt_split = page_split / msize; 5434 reg_off_split = elt_split << esz; 5435 mem_off_split = elt_split * msize; 5436 5437 /* 5438 * This is the last full element on the first page, but it is not 5439 * necessarily active. If there is no full element, i.e. the first 5440 * active element is the one that's split, this value remains -1. 5441 * It is useful as iteration bounds. 5442 */ 5443 if (elt_split != 0) { 5444 info->reg_off_last[0] = reg_off_split - esize; 5445 } 5446 5447 /* Determine if an unaligned element spans the pages. */ 5448 if (page_split % msize != 0) { 5449 /* It is helpful to know if the split element is active. */ 5450 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) { 5451 info->reg_off_split = reg_off_split; 5452 info->mem_off_split = mem_off_split; 5453 5454 if (reg_off_split == reg_off_last) { 5455 /* The page crossing element is last. */ 5456 return true; 5457 } 5458 } 5459 reg_off_split += esize; 5460 mem_off_split += msize; 5461 } 5462 5463 /* 5464 * We do want the first active element on the second page, because 5465 * this may affect the address reported in an exception. 5466 */ 5467 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz); 5468 tcg_debug_assert(reg_off_split <= reg_off_last); 5469 info->reg_off_first[1] = reg_off_split; 5470 info->mem_off_first[1] = (reg_off_split >> esz) * msize; 5471 info->reg_off_last[1] = reg_off_last; 5472 return true; 5473 } 5474 5475 /* 5476 * Resolve the guest virtual addresses to info->page[]. 5477 * Control the generation of page faults with @fault. Return false if 5478 * there is no work to do, which can only happen with @fault == FAULT_NO. 5479 */ 5480 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault, 5481 CPUARMState *env, target_ulong addr, 5482 MMUAccessType access_type, uintptr_t retaddr) 5483 { 5484 int mmu_idx = cpu_mmu_index(env, false); 5485 int mem_off = info->mem_off_first[0]; 5486 bool nofault = fault == FAULT_NO; 5487 bool have_work = true; 5488 5489 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off, 5490 access_type, mmu_idx, retaddr)) { 5491 /* No work to be done. */ 5492 return false; 5493 } 5494 5495 if (likely(info->page_split < 0)) { 5496 /* The entire operation was on the one page. */ 5497 return true; 5498 } 5499 5500 /* 5501 * If the second page is invalid, then we want the fault address to be 5502 * the first byte on that page which is accessed. 
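     * Note that mem_off is a byte offset relative to @addr; sve_probe_page()
     * adds it before translating, so addr + mem_off is the address that any
     * fault on the second page will report.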
5503 */ 5504 if (info->mem_off_split >= 0) { 5505 /* 5506 * There is an element split across the pages. The fault address 5507 * should be the first byte of the second page. 5508 */ 5509 mem_off = info->page_split; 5510 /* 5511 * If the split element is also the first active element 5512 * of the vector, then: For first-fault we should continue 5513 * to generate faults for the second page. For no-fault, 5514 * we have work only if the second page is valid. 5515 */ 5516 if (info->mem_off_first[0] < info->mem_off_split) { 5517 nofault = FAULT_FIRST; 5518 have_work = false; 5519 } 5520 } else { 5521 /* 5522 * There is no element split across the pages. The fault address 5523 * should be the first active element on the second page. 5524 */ 5525 mem_off = info->mem_off_first[1]; 5526 /* 5527 * There must have been one active element on the first page, 5528 * so we're out of first-fault territory. 5529 */ 5530 nofault = fault != FAULT_ALL; 5531 } 5532 5533 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off, 5534 access_type, mmu_idx, retaddr); 5535 return have_work; 5536 } 5537 5538 #ifndef CONFIG_USER_ONLY 5539 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, 5540 uint64_t *vg, target_ulong addr, 5541 int esize, int msize, int wp_access, 5542 uintptr_t retaddr) 5543 { 5544 intptr_t mem_off, reg_off, reg_last; 5545 int flags0 = info->page[0].flags; 5546 int flags1 = info->page[1].flags; 5547 5548 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { 5549 return; 5550 } 5551 5552 /* Indicate that watchpoints are handled. */ 5553 info->page[0].flags = flags0 & ~TLB_WATCHPOINT; 5554 info->page[1].flags = flags1 & ~TLB_WATCHPOINT; 5555 5556 if (flags0 & TLB_WATCHPOINT) { 5557 mem_off = info->mem_off_first[0]; 5558 reg_off = info->reg_off_first[0]; 5559 reg_last = info->reg_off_last[0]; 5560 5561 while (reg_off <= reg_last) { 5562 uint64_t pg = vg[reg_off >> 6]; 5563 do { 5564 if ((pg >> (reg_off & 63)) & 1) { 5565 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 5566 msize, info->page[0].attrs, 5567 wp_access, retaddr); 5568 } 5569 reg_off += esize; 5570 mem_off += msize; 5571 } while (reg_off <= reg_last && (reg_off & 63)); 5572 } 5573 } 5574 5575 mem_off = info->mem_off_split; 5576 if (mem_off >= 0) { 5577 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize, 5578 info->page[0].attrs, wp_access, retaddr); 5579 } 5580 5581 mem_off = info->mem_off_first[1]; 5582 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) { 5583 reg_off = info->reg_off_first[1]; 5584 reg_last = info->reg_off_last[1]; 5585 5586 do { 5587 uint64_t pg = vg[reg_off >> 6]; 5588 do { 5589 if ((pg >> (reg_off & 63)) & 1) { 5590 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 5591 msize, info->page[1].attrs, 5592 wp_access, retaddr); 5593 } 5594 reg_off += esize; 5595 mem_off += msize; 5596 } while (reg_off & 63); 5597 } while (reg_off <= reg_last); 5598 } 5599 } 5600 #endif 5601 5602 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env, 5603 uint64_t *vg, target_ulong addr, int esize, 5604 int msize, uint32_t mtedesc, uintptr_t ra) 5605 { 5606 intptr_t mem_off, reg_off, reg_last; 5607 5608 /* Process the page only if MemAttr == Tagged. 
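     * The tagged flag was filled in by sve_probe_page(): for user-only it
     * requires both PAGE_ANON and PAGE_MTE; for system emulation it checks
     * that the page attributes are the Tagged Normal encoding (0xf0).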
 */
    if (info->page[0].tagged) {
        mem_off = info->mem_off_first[0];
        reg_off = info->reg_off_first[0];
        reg_last = info->reg_off_split;
        if (reg_last < 0) {
            reg_last = info->reg_off_last[0];
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    mte_check(env, mtedesc, addr, ra);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off <= reg_last && (reg_off & 63));
        } while (reg_off <= reg_last);
    }

    mem_off = info->mem_off_first[1];
    if (mem_off >= 0 && info->page[1].tagged) {
        reg_off = info->reg_off_first[1];
        reg_last = info->reg_off_last[1];

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    mte_check(env, mtedesc, addr, ra);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}

/*
 * Common helper for all contiguous 1,2,3,4-register predicated loads.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int flags, i;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no load occurs. */
        for (i = 0; i < N; ++i) {
            memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
        }
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_READ, retaddr);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
                                mtedesc, retaddr);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  Perform the load
         * into scratch memory to preserve register state until the end.
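         * The destination registers are only committed (the memcpy at the
         * end of this block) once every element has been read without
         * raising an exception, so a mid-vector external abort leaves them
         * unmodified.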
5696 */ 5697 ARMVectorReg scratch[4] = { }; 5698 5699 mem_off = info.mem_off_first[0]; 5700 reg_off = info.reg_off_first[0]; 5701 reg_last = info.reg_off_last[1]; 5702 if (reg_last < 0) { 5703 reg_last = info.reg_off_split; 5704 if (reg_last < 0) { 5705 reg_last = info.reg_off_last[0]; 5706 } 5707 } 5708 5709 do { 5710 uint64_t pg = vg[reg_off >> 6]; 5711 do { 5712 if ((pg >> (reg_off & 63)) & 1) { 5713 for (i = 0; i < N; ++i) { 5714 tlb_fn(env, &scratch[i], reg_off, 5715 addr + mem_off + (i << msz), retaddr); 5716 } 5717 } 5718 reg_off += 1 << esz; 5719 mem_off += N << msz; 5720 } while (reg_off & 63); 5721 } while (reg_off <= reg_last); 5722 5723 for (i = 0; i < N; ++i) { 5724 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max); 5725 } 5726 return; 5727 } 5728 5729 /* The entire operation is in RAM, on valid pages. */ 5730 5731 for (i = 0; i < N; ++i) { 5732 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 5733 } 5734 5735 mem_off = info.mem_off_first[0]; 5736 reg_off = info.reg_off_first[0]; 5737 reg_last = info.reg_off_last[0]; 5738 host = info.page[0].host; 5739 5740 while (reg_off <= reg_last) { 5741 uint64_t pg = vg[reg_off >> 6]; 5742 do { 5743 if ((pg >> (reg_off & 63)) & 1) { 5744 for (i = 0; i < N; ++i) { 5745 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5746 host + mem_off + (i << msz)); 5747 } 5748 } 5749 reg_off += 1 << esz; 5750 mem_off += N << msz; 5751 } while (reg_off <= reg_last && (reg_off & 63)); 5752 } 5753 5754 /* 5755 * Use the slow path to manage the cross-page misalignment. 5756 * But we know this is RAM and cannot trap. 5757 */ 5758 mem_off = info.mem_off_split; 5759 if (unlikely(mem_off >= 0)) { 5760 reg_off = info.reg_off_split; 5761 for (i = 0; i < N; ++i) { 5762 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 5763 addr + mem_off + (i << msz), retaddr); 5764 } 5765 } 5766 5767 mem_off = info.mem_off_first[1]; 5768 if (unlikely(mem_off >= 0)) { 5769 reg_off = info.reg_off_first[1]; 5770 reg_last = info.reg_off_last[1]; 5771 host = info.page[1].host; 5772 5773 do { 5774 uint64_t pg = vg[reg_off >> 6]; 5775 do { 5776 if ((pg >> (reg_off & 63)) & 1) { 5777 for (i = 0; i < N; ++i) { 5778 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5779 host + mem_off + (i << msz)); 5780 } 5781 } 5782 reg_off += 1 << esz; 5783 mem_off += N << msz; 5784 } while (reg_off & 63); 5785 } while (reg_off <= reg_last); 5786 } 5787 } 5788 5789 static inline QEMU_ALWAYS_INLINE 5790 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 5791 uint32_t desc, const uintptr_t ra, 5792 const int esz, const int msz, const int N, 5793 sve_ldst1_host_fn *host_fn, 5794 sve_ldst1_tlb_fn *tlb_fn) 5795 { 5796 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 5797 int bit55 = extract64(addr, 55, 1); 5798 5799 /* Remove mtedesc from the normal sve descriptor. */ 5800 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 5801 5802 /* Perform gross MTE suppression early. 
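     * If TBI is disabled for this half of the address space there is no
     * tag byte to check, and if TCMA applies and the pointer carries the
     * matching "unchecked" logical tag the access is not checked at all;
     * in either case clearing mtedesc skips the per-element checks.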
*/ 5803 if (!tbi_check(desc, bit55) || 5804 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { 5805 mtedesc = 0; 5806 } 5807 5808 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 5809 } 5810 5811 #define DO_LD1_1(NAME, ESZ) \ 5812 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ 5813 target_ulong addr, uint32_t desc) \ 5814 { \ 5815 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ 5816 sve_##NAME##_host, sve_##NAME##_tlb); \ 5817 } \ 5818 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ 5819 target_ulong addr, uint32_t desc) \ 5820 { \ 5821 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ 5822 sve_##NAME##_host, sve_##NAME##_tlb); \ 5823 } 5824 5825 #define DO_LD1_2(NAME, ESZ, MSZ) \ 5826 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ 5827 target_ulong addr, uint32_t desc) \ 5828 { \ 5829 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 5830 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 5831 } \ 5832 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ 5833 target_ulong addr, uint32_t desc) \ 5834 { \ 5835 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 5836 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 5837 } \ 5838 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 5839 target_ulong addr, uint32_t desc) \ 5840 { \ 5841 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 5842 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 5843 } \ 5844 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 5845 target_ulong addr, uint32_t desc) \ 5846 { \ 5847 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 5848 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 5849 } 5850 5851 DO_LD1_1(ld1bb, MO_8) 5852 DO_LD1_1(ld1bhu, MO_16) 5853 DO_LD1_1(ld1bhs, MO_16) 5854 DO_LD1_1(ld1bsu, MO_32) 5855 DO_LD1_1(ld1bss, MO_32) 5856 DO_LD1_1(ld1bdu, MO_64) 5857 DO_LD1_1(ld1bds, MO_64) 5858 5859 DO_LD1_2(ld1hh, MO_16, MO_16) 5860 DO_LD1_2(ld1hsu, MO_32, MO_16) 5861 DO_LD1_2(ld1hss, MO_32, MO_16) 5862 DO_LD1_2(ld1hdu, MO_64, MO_16) 5863 DO_LD1_2(ld1hds, MO_64, MO_16) 5864 5865 DO_LD1_2(ld1ss, MO_32, MO_32) 5866 DO_LD1_2(ld1sdu, MO_64, MO_32) 5867 DO_LD1_2(ld1sds, MO_64, MO_32) 5868 5869 DO_LD1_2(ld1dd, MO_64, MO_64) 5870 5871 #undef DO_LD1_1 5872 #undef DO_LD1_2 5873 5874 #define DO_LDN_1(N) \ 5875 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ 5876 target_ulong addr, uint32_t desc) \ 5877 { \ 5878 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ 5879 sve_ld1bb_host, sve_ld1bb_tlb); \ 5880 } \ 5881 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ 5882 target_ulong addr, uint32_t desc) \ 5883 { \ 5884 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ 5885 sve_ld1bb_host, sve_ld1bb_tlb); \ 5886 } 5887 5888 #define DO_LDN_2(N, SUFF, ESZ) \ 5889 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ 5890 target_ulong addr, uint32_t desc) \ 5891 { \ 5892 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 5893 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 5894 } \ 5895 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ 5896 target_ulong addr, uint32_t desc) \ 5897 { \ 5898 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 5899 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 5900 } \ 5901 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ 5902 target_ulong addr, uint32_t desc) \ 5903 { \ 5904 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 5905 
sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 5906 } \ 5907 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ 5908 target_ulong addr, uint32_t desc) \ 5909 { \ 5910 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 5911 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 5912 } 5913 5914 DO_LDN_1(2) 5915 DO_LDN_1(3) 5916 DO_LDN_1(4) 5917 5918 DO_LDN_2(2, hh, MO_16) 5919 DO_LDN_2(3, hh, MO_16) 5920 DO_LDN_2(4, hh, MO_16) 5921 5922 DO_LDN_2(2, ss, MO_32) 5923 DO_LDN_2(3, ss, MO_32) 5924 DO_LDN_2(4, ss, MO_32) 5925 5926 DO_LDN_2(2, dd, MO_64) 5927 DO_LDN_2(3, dd, MO_64) 5928 DO_LDN_2(4, dd, MO_64) 5929 5930 #undef DO_LDN_1 5931 #undef DO_LDN_2 5932 5933 /* 5934 * Load contiguous data, first-fault and no-fault. 5935 * 5936 * For user-only, one could argue that we should hold the mmap_lock during 5937 * the operation so that there is no race between page_check_range and the 5938 * load operation. However, unmapping pages out from under a running thread 5939 * is extraordinarily unlikely. This theoretical race condition also affects 5940 * linux-user/ in its get_user/put_user macros. 5941 * 5942 * TODO: Construct some helpers, written in assembly, that interact with 5943 * host_signal_handler to produce memory ops which can properly report errors 5944 * without racing. 5945 */ 5946 5947 /* Fault on byte I. All bits in FFR from I are cleared. The vector 5948 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE 5949 * option, which leaves subsequent data unchanged. 5950 */ 5951 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz) 5952 { 5953 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; 5954 5955 if (i & 63) { 5956 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63); 5957 i = ROUND_UP(i, 64); 5958 } 5959 for (; i < oprsz; i += 64) { 5960 ffr[i / 64] = 0; 5961 } 5962 } 5963 5964 /* 5965 * Common helper for all contiguous no-fault and first-fault loads. 5966 */ 5967 static inline QEMU_ALWAYS_INLINE 5968 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, 5969 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc, 5970 const int esz, const int msz, const SVEContFault fault, 5971 sve_ldst1_host_fn *host_fn, 5972 sve_ldst1_tlb_fn *tlb_fn) 5973 { 5974 const unsigned rd = simd_data(desc); 5975 void *vd = &env->vfp.zregs[rd]; 5976 const intptr_t reg_max = simd_oprsz(desc); 5977 intptr_t reg_off, mem_off, reg_last; 5978 SVEContLdSt info; 5979 int flags; 5980 void *host; 5981 5982 /* Find the active elements. */ 5983 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) { 5984 /* The entire predicate was false; no load occurs. */ 5985 memset(vd, 0, reg_max); 5986 return; 5987 } 5988 reg_off = info.reg_off_first[0]; 5989 5990 /* Probe the page(s). */ 5991 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) { 5992 /* Fault on first element. */ 5993 tcg_debug_assert(fault == FAULT_NO); 5994 memset(vd, 0, reg_max); 5995 goto do_fault; 5996 } 5997 5998 mem_off = info.mem_off_first[0]; 5999 flags = info.page[0].flags; 6000 6001 /* 6002 * Disable MTE checking if the Tagged bit is not set. Since TBI must 6003 * be set within MTEDESC for MTE, !mtedesc => !mte_active. 6004 */ 6005 if (!info.page[0].tagged) { 6006 mtedesc = 0; 6007 } 6008 6009 if (fault == FAULT_FIRST) { 6010 /* Trapping mte check for the first-fault element. 
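         * The first active element of a first-fault load behaves like a
         * normal load, so this check may raise a tag check fault; all later
         * elements use the non-faulting mte_probe() and record into FFR
         * instead.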
*/ 6011 if (mtedesc) { 6012 mte_check(env, mtedesc, addr + mem_off, retaddr); 6013 } 6014 6015 /* 6016 * Special handling of the first active element, 6017 * if it crosses a page boundary or is MMIO. 6018 */ 6019 bool is_split = mem_off == info.mem_off_split; 6020 if (unlikely(flags != 0) || unlikely(is_split)) { 6021 /* 6022 * Use the slow path for cross-page handling. 6023 * Might trap for MMIO or watchpoints. 6024 */ 6025 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6026 6027 /* After any fault, zero the other elements. */ 6028 swap_memzero(vd, reg_off); 6029 reg_off += 1 << esz; 6030 mem_off += 1 << msz; 6031 swap_memzero(vd + reg_off, reg_max - reg_off); 6032 6033 if (is_split) { 6034 goto second_page; 6035 } 6036 } else { 6037 memset(vd, 0, reg_max); 6038 } 6039 } else { 6040 memset(vd, 0, reg_max); 6041 if (unlikely(mem_off == info.mem_off_split)) { 6042 /* The first active element crosses a page boundary. */ 6043 flags |= info.page[1].flags; 6044 if (unlikely(flags & TLB_MMIO)) { 6045 /* Some page is MMIO, see below. */ 6046 goto do_fault; 6047 } 6048 if (unlikely(flags & TLB_WATCHPOINT) && 6049 (cpu_watchpoint_address_matches 6050 (env_cpu(env), addr + mem_off, 1 << msz) 6051 & BP_MEM_READ)) { 6052 /* Watchpoint hit, see below. */ 6053 goto do_fault; 6054 } 6055 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6056 goto do_fault; 6057 } 6058 /* 6059 * Use the slow path for cross-page handling. 6060 * This is RAM, without a watchpoint, and will not trap. 6061 */ 6062 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6063 goto second_page; 6064 } 6065 } 6066 6067 /* 6068 * From this point on, all memory operations are MemSingleNF. 6069 * 6070 * Per the MemSingleNF pseudocode, a no-fault load from Device memory 6071 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead. 6072 * 6073 * Unfortuately we do not have access to the memory attributes from the 6074 * PTE to tell Device memory from Normal memory. So we make a mostly 6075 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO. 6076 * This gives the right answer for the common cases of "Normal memory, 6077 * backed by host RAM" and "Device memory, backed by MMIO". 6078 * The architecture allows us to suppress an NF load and return 6079 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner 6080 * case of "Normal memory, backed by MMIO" is permitted. The case we 6081 * get wrong is "Device memory, backed by host RAM", for which we 6082 * should return (UNKNOWN, FAULT) for but do not. 6083 * 6084 * Similarly, CPU_BP breakpoints would raise exceptions, and so 6085 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and 6086 * architectural breakpoints the same. 6087 */ 6088 if (unlikely(flags & TLB_MMIO)) { 6089 goto do_fault; 6090 } 6091 6092 reg_last = info.reg_off_last[0]; 6093 host = info.page[0].host; 6094 6095 do { 6096 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3)); 6097 do { 6098 if ((pg >> (reg_off & 63)) & 1) { 6099 if (unlikely(flags & TLB_WATCHPOINT) && 6100 (cpu_watchpoint_address_matches 6101 (env_cpu(env), addr + mem_off, 1 << msz) 6102 & BP_MEM_READ)) { 6103 goto do_fault; 6104 } 6105 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6106 goto do_fault; 6107 } 6108 host_fn(vd, reg_off, host + mem_off); 6109 } 6110 reg_off += 1 << esz; 6111 mem_off += 1 << msz; 6112 } while (reg_off <= reg_last && (reg_off & 63)); 6113 } while (reg_off <= reg_last); 6114 6115 /* 6116 * MemSingleNF is allowed to fail for any reason. 
We have special 6117 * code above to handle the first element crossing a page boundary. 6118 * As an implementation choice, decline to handle a cross-page element 6119 * in any other position. 6120 */ 6121 reg_off = info.reg_off_split; 6122 if (reg_off >= 0) { 6123 goto do_fault; 6124 } 6125 6126 second_page: 6127 reg_off = info.reg_off_first[1]; 6128 if (likely(reg_off < 0)) { 6129 /* No active elements on the second page. All done. */ 6130 return; 6131 } 6132 6133 /* 6134 * MemSingleNF is allowed to fail for any reason. As an implementation 6135 * choice, decline to handle elements on the second page. This should 6136 * be low frequency as the guest walks through memory -- the next 6137 * iteration of the guest's loop should be aligned on the page boundary, 6138 * and then all following iterations will stay aligned. 6139 */ 6140 6141 do_fault: 6142 record_fault(env, reg_off, reg_max); 6143 } 6144 6145 static inline QEMU_ALWAYS_INLINE 6146 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, 6147 uint32_t desc, const uintptr_t retaddr, 6148 const int esz, const int msz, const SVEContFault fault, 6149 sve_ldst1_host_fn *host_fn, 6150 sve_ldst1_tlb_fn *tlb_fn) 6151 { 6152 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6153 int bit55 = extract64(addr, 55, 1); 6154 6155 /* Remove mtedesc from the normal sve descriptor. */ 6156 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6157 6158 /* Perform gross MTE suppression early. */ 6159 if (!tbi_check(desc, bit55) || 6160 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { 6161 mtedesc = 0; 6162 } 6163 6164 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc, 6165 esz, msz, fault, host_fn, tlb_fn); 6166 } 6167 6168 #define DO_LDFF1_LDNF1_1(PART, ESZ) \ 6169 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ 6170 target_ulong addr, uint32_t desc) \ 6171 { \ 6172 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ 6173 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6174 } \ 6175 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ 6176 target_ulong addr, uint32_t desc) \ 6177 { \ 6178 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ 6179 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6180 } \ 6181 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6182 target_ulong addr, uint32_t desc) \ 6183 { \ 6184 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ 6185 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6186 } \ 6187 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6188 target_ulong addr, uint32_t desc) \ 6189 { \ 6190 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ 6191 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6192 } 6193 6194 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ 6195 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ 6196 target_ulong addr, uint32_t desc) \ 6197 { \ 6198 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6199 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6200 } \ 6201 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ 6202 target_ulong addr, uint32_t desc) \ 6203 { \ 6204 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6205 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6206 } \ 6207 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ 6208 target_ulong addr, uint32_t desc) \ 6209 { \ 6210 sve_ldnfff1_r(env, vg, addr, desc, 
GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6211 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6212 } \ 6213 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ 6214 target_ulong addr, uint32_t desc) \ 6215 { \ 6216 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6217 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6218 } \ 6219 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6220 target_ulong addr, uint32_t desc) \ 6221 { \ 6222 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6223 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6224 } \ 6225 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6226 target_ulong addr, uint32_t desc) \ 6227 { \ 6228 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6229 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6230 } \ 6231 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6232 target_ulong addr, uint32_t desc) \ 6233 { \ 6234 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6235 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6236 } \ 6237 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6238 target_ulong addr, uint32_t desc) \ 6239 { \ 6240 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6241 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6242 } 6243 6244 DO_LDFF1_LDNF1_1(bb, MO_8) 6245 DO_LDFF1_LDNF1_1(bhu, MO_16) 6246 DO_LDFF1_LDNF1_1(bhs, MO_16) 6247 DO_LDFF1_LDNF1_1(bsu, MO_32) 6248 DO_LDFF1_LDNF1_1(bss, MO_32) 6249 DO_LDFF1_LDNF1_1(bdu, MO_64) 6250 DO_LDFF1_LDNF1_1(bds, MO_64) 6251 6252 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16) 6253 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16) 6254 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16) 6255 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16) 6256 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16) 6257 6258 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32) 6259 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32) 6260 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32) 6261 6262 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64) 6263 6264 #undef DO_LDFF1_LDNF1_1 6265 #undef DO_LDFF1_LDNF1_2 6266 6267 /* 6268 * Common helper for all contiguous 1,2,3,4-register predicated stores. 6269 */ 6270 6271 static inline QEMU_ALWAYS_INLINE 6272 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, 6273 uint32_t desc, const uintptr_t retaddr, 6274 const int esz, const int msz, const int N, uint32_t mtedesc, 6275 sve_ldst1_host_fn *host_fn, 6276 sve_ldst1_tlb_fn *tlb_fn) 6277 { 6278 const unsigned rd = simd_data(desc); 6279 const intptr_t reg_max = simd_oprsz(desc); 6280 intptr_t reg_off, reg_last, mem_off; 6281 SVEContLdSt info; 6282 void *host; 6283 int i, flags; 6284 6285 /* Find the active elements. */ 6286 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 6287 /* The entire predicate was false; no store occurs. */ 6288 return; 6289 } 6290 6291 /* Probe the page(s). Exit with exception for any invalid page. */ 6292 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr); 6293 6294 /* Handle watchpoints for all active elements. */ 6295 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 6296 BP_MEM_WRITE, retaddr); 6297 6298 /* 6299 * Handle mte checks for all active elements. 6300 * Since TBI must be set for MTE, !mtedesc => !mte_active. 
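     * As with the loads, these checks run before the store loops below,
     * so a synchronous tag check fault is taken before any bytes have
     * been written.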
6301 */ 6302 if (mtedesc) { 6303 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 6304 mtedesc, retaddr); 6305 } 6306 6307 flags = info.page[0].flags | info.page[1].flags; 6308 if (unlikely(flags != 0)) { 6309 #ifdef CONFIG_USER_ONLY 6310 g_assert_not_reached(); 6311 #else 6312 /* 6313 * At least one page includes MMIO. 6314 * Any bus operation can fail with cpu_transaction_failed, 6315 * which for ARM will raise SyncExternal. We cannot avoid 6316 * this fault and will leave with the store incomplete. 6317 */ 6318 mem_off = info.mem_off_first[0]; 6319 reg_off = info.reg_off_first[0]; 6320 reg_last = info.reg_off_last[1]; 6321 if (reg_last < 0) { 6322 reg_last = info.reg_off_split; 6323 if (reg_last < 0) { 6324 reg_last = info.reg_off_last[0]; 6325 } 6326 } 6327 6328 do { 6329 uint64_t pg = vg[reg_off >> 6]; 6330 do { 6331 if ((pg >> (reg_off & 63)) & 1) { 6332 for (i = 0; i < N; ++i) { 6333 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6334 addr + mem_off + (i << msz), retaddr); 6335 } 6336 } 6337 reg_off += 1 << esz; 6338 mem_off += N << msz; 6339 } while (reg_off & 63); 6340 } while (reg_off <= reg_last); 6341 return; 6342 #endif 6343 } 6344 6345 mem_off = info.mem_off_first[0]; 6346 reg_off = info.reg_off_first[0]; 6347 reg_last = info.reg_off_last[0]; 6348 host = info.page[0].host; 6349 6350 while (reg_off <= reg_last) { 6351 uint64_t pg = vg[reg_off >> 6]; 6352 do { 6353 if ((pg >> (reg_off & 63)) & 1) { 6354 for (i = 0; i < N; ++i) { 6355 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6356 host + mem_off + (i << msz)); 6357 } 6358 } 6359 reg_off += 1 << esz; 6360 mem_off += N << msz; 6361 } while (reg_off <= reg_last && (reg_off & 63)); 6362 } 6363 6364 /* 6365 * Use the slow path to manage the cross-page misalignment. 6366 * But we know this is RAM and cannot trap. 6367 */ 6368 mem_off = info.mem_off_split; 6369 if (unlikely(mem_off >= 0)) { 6370 reg_off = info.reg_off_split; 6371 for (i = 0; i < N; ++i) { 6372 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6373 addr + mem_off + (i << msz), retaddr); 6374 } 6375 } 6376 6377 mem_off = info.mem_off_first[1]; 6378 if (unlikely(mem_off >= 0)) { 6379 reg_off = info.reg_off_first[1]; 6380 reg_last = info.reg_off_last[1]; 6381 host = info.page[1].host; 6382 6383 do { 6384 uint64_t pg = vg[reg_off >> 6]; 6385 do { 6386 if ((pg >> (reg_off & 63)) & 1) { 6387 for (i = 0; i < N; ++i) { 6388 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6389 host + mem_off + (i << msz)); 6390 } 6391 } 6392 reg_off += 1 << esz; 6393 mem_off += N << msz; 6394 } while (reg_off & 63); 6395 } while (reg_off <= reg_last); 6396 } 6397 } 6398 6399 static inline QEMU_ALWAYS_INLINE 6400 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6401 uint32_t desc, const uintptr_t ra, 6402 const int esz, const int msz, const int N, 6403 sve_ldst1_host_fn *host_fn, 6404 sve_ldst1_tlb_fn *tlb_fn) 6405 { 6406 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6407 int bit55 = extract64(addr, 55, 1); 6408 6409 /* Remove mtedesc from the normal sve descriptor. */ 6410 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6411 6412 /* Perform gross MTE suppression early. 
*/ 6413 if (!tbi_check(desc, bit55) || 6414 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { 6415 mtedesc = 0; 6416 } 6417 6418 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6419 } 6420 6421 #define DO_STN_1(N, NAME, ESZ) \ 6422 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ 6423 target_ulong addr, uint32_t desc) \ 6424 { \ 6425 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ 6426 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6427 } \ 6428 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ 6429 target_ulong addr, uint32_t desc) \ 6430 { \ 6431 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ 6432 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6433 } 6434 6435 #define DO_STN_2(N, NAME, ESZ, MSZ) \ 6436 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ 6437 target_ulong addr, uint32_t desc) \ 6438 { \ 6439 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6440 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6441 } \ 6442 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ 6443 target_ulong addr, uint32_t desc) \ 6444 { \ 6445 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6446 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6447 } \ 6448 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6449 target_ulong addr, uint32_t desc) \ 6450 { \ 6451 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6452 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6453 } \ 6454 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6455 target_ulong addr, uint32_t desc) \ 6456 { \ 6457 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6458 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6459 } 6460 6461 DO_STN_1(1, bb, MO_8) 6462 DO_STN_1(1, bh, MO_16) 6463 DO_STN_1(1, bs, MO_32) 6464 DO_STN_1(1, bd, MO_64) 6465 DO_STN_1(2, bb, MO_8) 6466 DO_STN_1(3, bb, MO_8) 6467 DO_STN_1(4, bb, MO_8) 6468 6469 DO_STN_2(1, hh, MO_16, MO_16) 6470 DO_STN_2(1, hs, MO_32, MO_16) 6471 DO_STN_2(1, hd, MO_64, MO_16) 6472 DO_STN_2(2, hh, MO_16, MO_16) 6473 DO_STN_2(3, hh, MO_16, MO_16) 6474 DO_STN_2(4, hh, MO_16, MO_16) 6475 6476 DO_STN_2(1, ss, MO_32, MO_32) 6477 DO_STN_2(1, sd, MO_64, MO_32) 6478 DO_STN_2(2, ss, MO_32, MO_32) 6479 DO_STN_2(3, ss, MO_32, MO_32) 6480 DO_STN_2(4, ss, MO_32, MO_32) 6481 6482 DO_STN_2(1, dd, MO_64, MO_64) 6483 DO_STN_2(2, dd, MO_64, MO_64) 6484 DO_STN_2(3, dd, MO_64, MO_64) 6485 DO_STN_2(4, dd, MO_64, MO_64) 6486 6487 #undef DO_STN_1 6488 #undef DO_STN_2 6489 6490 /* 6491 * Loads with a vector index. 6492 */ 6493 6494 /* 6495 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed. 
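 * The off_* accessors below decode the index vector: off_zsu_s/off_zss_s
 * read 32-bit elements (zero- or sign-extended), off_zsu_d/off_zss_d use
 * only the low 32 bits of a 64-bit element, and off_zd_d uses the whole
 * 64-bit element.  For example, with off_zss_d an element whose low 32 bits
 * are 0xffffffff yields an offset of -1, before the scale shift is applied.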
6496 */ 6497 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs); 6498 6499 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs) 6500 { 6501 return *(uint32_t *)(reg + H1_4(reg_ofs)); 6502 } 6503 6504 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs) 6505 { 6506 return *(int32_t *)(reg + H1_4(reg_ofs)); 6507 } 6508 6509 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs) 6510 { 6511 return (uint32_t)*(uint64_t *)(reg + reg_ofs); 6512 } 6513 6514 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs) 6515 { 6516 return (int32_t)*(uint64_t *)(reg + reg_ofs); 6517 } 6518 6519 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs) 6520 { 6521 return *(uint64_t *)(reg + reg_ofs); 6522 } 6523 6524 static inline QEMU_ALWAYS_INLINE 6525 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6526 target_ulong base, uint32_t desc, uintptr_t retaddr, 6527 uint32_t mtedesc, int esize, int msize, 6528 zreg_off_fn *off_fn, 6529 sve_ldst1_host_fn *host_fn, 6530 sve_ldst1_tlb_fn *tlb_fn) 6531 { 6532 const int mmu_idx = cpu_mmu_index(env, false); 6533 const intptr_t reg_max = simd_oprsz(desc); 6534 const int scale = simd_data(desc); 6535 ARMVectorReg scratch; 6536 intptr_t reg_off; 6537 SVEHostPage info, info2; 6538 6539 memset(&scratch, 0, reg_max); 6540 reg_off = 0; 6541 do { 6542 uint64_t pg = vg[reg_off >> 6]; 6543 do { 6544 if (likely(pg & 1)) { 6545 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6546 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 6547 6548 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD, 6549 mmu_idx, retaddr); 6550 6551 if (likely(in_page >= msize)) { 6552 if (unlikely(info.flags & TLB_WATCHPOINT)) { 6553 cpu_check_watchpoint(env_cpu(env), addr, msize, 6554 info.attrs, BP_MEM_READ, retaddr); 6555 } 6556 if (mtedesc && info.tagged) { 6557 mte_check(env, mtedesc, addr, retaddr); 6558 } 6559 if (unlikely(info.flags & TLB_MMIO)) { 6560 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6561 } else { 6562 host_fn(&scratch, reg_off, info.host); 6563 } 6564 } else { 6565 /* Element crosses the page boundary. */ 6566 sve_probe_page(&info2, false, env, addr + in_page, 0, 6567 MMU_DATA_LOAD, mmu_idx, retaddr); 6568 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) { 6569 cpu_check_watchpoint(env_cpu(env), addr, 6570 msize, info.attrs, 6571 BP_MEM_READ, retaddr); 6572 } 6573 if (mtedesc && info.tagged) { 6574 mte_check(env, mtedesc, addr, retaddr); 6575 } 6576 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6577 } 6578 } 6579 reg_off += esize; 6580 pg >>= esize; 6581 } while (reg_off & 63); 6582 } while (reg_off < reg_max); 6583 6584 /* Wait until all exceptions have been raised to write back. */ 6585 memcpy(vd, &scratch, reg_max); 6586 } 6587 6588 static inline QEMU_ALWAYS_INLINE 6589 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6590 target_ulong base, uint32_t desc, uintptr_t retaddr, 6591 int esize, int msize, zreg_off_fn *off_fn, 6592 sve_ldst1_host_fn *host_fn, 6593 sve_ldst1_tlb_fn *tlb_fn) 6594 { 6595 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6596 /* Remove mtedesc from the normal sve descriptor. */ 6597 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6598 6599 /* 6600 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 6601 * offset base entirely over the address space hole to change the 6602 * pointer tag, or change the bit55 selector. So we could here 6603 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
6604 */ 6605 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 6606 esize, msize, off_fn, host_fn, tlb_fn); 6607 } 6608 6609 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ 6610 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6611 void *vm, target_ulong base, uint32_t desc) \ 6612 { \ 6613 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 6614 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6615 } \ 6616 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6617 void *vm, target_ulong base, uint32_t desc) \ 6618 { \ 6619 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 6620 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6621 } 6622 6623 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ 6624 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6625 void *vm, target_ulong base, uint32_t desc) \ 6626 { \ 6627 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 6628 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6629 } \ 6630 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6631 void *vm, target_ulong base, uint32_t desc) \ 6632 { \ 6633 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 6634 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6635 } 6636 6637 DO_LD1_ZPZ_S(bsu, zsu, MO_8) 6638 DO_LD1_ZPZ_S(bsu, zss, MO_8) 6639 DO_LD1_ZPZ_D(bdu, zsu, MO_8) 6640 DO_LD1_ZPZ_D(bdu, zss, MO_8) 6641 DO_LD1_ZPZ_D(bdu, zd, MO_8) 6642 6643 DO_LD1_ZPZ_S(bss, zsu, MO_8) 6644 DO_LD1_ZPZ_S(bss, zss, MO_8) 6645 DO_LD1_ZPZ_D(bds, zsu, MO_8) 6646 DO_LD1_ZPZ_D(bds, zss, MO_8) 6647 DO_LD1_ZPZ_D(bds, zd, MO_8) 6648 6649 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16) 6650 DO_LD1_ZPZ_S(hsu_le, zss, MO_16) 6651 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16) 6652 DO_LD1_ZPZ_D(hdu_le, zss, MO_16) 6653 DO_LD1_ZPZ_D(hdu_le, zd, MO_16) 6654 6655 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16) 6656 DO_LD1_ZPZ_S(hsu_be, zss, MO_16) 6657 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16) 6658 DO_LD1_ZPZ_D(hdu_be, zss, MO_16) 6659 DO_LD1_ZPZ_D(hdu_be, zd, MO_16) 6660 6661 DO_LD1_ZPZ_S(hss_le, zsu, MO_16) 6662 DO_LD1_ZPZ_S(hss_le, zss, MO_16) 6663 DO_LD1_ZPZ_D(hds_le, zsu, MO_16) 6664 DO_LD1_ZPZ_D(hds_le, zss, MO_16) 6665 DO_LD1_ZPZ_D(hds_le, zd, MO_16) 6666 6667 DO_LD1_ZPZ_S(hss_be, zsu, MO_16) 6668 DO_LD1_ZPZ_S(hss_be, zss, MO_16) 6669 DO_LD1_ZPZ_D(hds_be, zsu, MO_16) 6670 DO_LD1_ZPZ_D(hds_be, zss, MO_16) 6671 DO_LD1_ZPZ_D(hds_be, zd, MO_16) 6672 6673 DO_LD1_ZPZ_S(ss_le, zsu, MO_32) 6674 DO_LD1_ZPZ_S(ss_le, zss, MO_32) 6675 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32) 6676 DO_LD1_ZPZ_D(sdu_le, zss, MO_32) 6677 DO_LD1_ZPZ_D(sdu_le, zd, MO_32) 6678 6679 DO_LD1_ZPZ_S(ss_be, zsu, MO_32) 6680 DO_LD1_ZPZ_S(ss_be, zss, MO_32) 6681 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32) 6682 DO_LD1_ZPZ_D(sdu_be, zss, MO_32) 6683 DO_LD1_ZPZ_D(sdu_be, zd, MO_32) 6684 6685 DO_LD1_ZPZ_D(sds_le, zsu, MO_32) 6686 DO_LD1_ZPZ_D(sds_le, zss, MO_32) 6687 DO_LD1_ZPZ_D(sds_le, zd, MO_32) 6688 6689 DO_LD1_ZPZ_D(sds_be, zsu, MO_32) 6690 DO_LD1_ZPZ_D(sds_be, zss, MO_32) 6691 DO_LD1_ZPZ_D(sds_be, zd, MO_32) 6692 6693 DO_LD1_ZPZ_D(dd_le, zsu, MO_64) 6694 DO_LD1_ZPZ_D(dd_le, zss, MO_64) 6695 DO_LD1_ZPZ_D(dd_le, zd, MO_64) 6696 6697 DO_LD1_ZPZ_D(dd_be, zsu, MO_64) 6698 DO_LD1_ZPZ_D(dd_be, zss, MO_64) 6699 DO_LD1_ZPZ_D(dd_be, zd, MO_64) 6700 6701 #undef DO_LD1_ZPZ_S 6702 #undef DO_LD1_ZPZ_D 6703 6704 /* First fault loads with a vector index. */ 6705 6706 /* 6707 * Common helpers for all gather first-faulting loads. 
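 * The strategy: the first active element is loaded with a normal, possibly
 * faulting access; every remaining element is probed with nofault set, and
 * any element that cannot safely be loaded (invalid page, MMIO, watchpoint
 * hit, failed MTE probe, or an element crossing a page boundary) stops the
 * load and records the position in FFR via record_fault().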
6708 */ 6709 6710 static inline QEMU_ALWAYS_INLINE 6711 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6712 target_ulong base, uint32_t desc, uintptr_t retaddr, 6713 uint32_t mtedesc, const int esz, const int msz, 6714 zreg_off_fn *off_fn, 6715 sve_ldst1_host_fn *host_fn, 6716 sve_ldst1_tlb_fn *tlb_fn) 6717 { 6718 const int mmu_idx = cpu_mmu_index(env, false); 6719 const intptr_t reg_max = simd_oprsz(desc); 6720 const int scale = simd_data(desc); 6721 const int esize = 1 << esz; 6722 const int msize = 1 << msz; 6723 intptr_t reg_off; 6724 SVEHostPage info; 6725 target_ulong addr, in_page; 6726 ARMVectorReg scratch; 6727 6728 /* Skip to the first true predicate. */ 6729 reg_off = find_next_active(vg, 0, reg_max, esz); 6730 if (unlikely(reg_off >= reg_max)) { 6731 /* The entire predicate was false; no load occurs. */ 6732 memset(vd, 0, reg_max); 6733 return; 6734 } 6735 6736 /* Protect against overlap between vd and vm. */ 6737 if (unlikely(vd == vm)) { 6738 vm = memcpy(&scratch, vm, reg_max); 6739 } 6740 6741 /* 6742 * Probe the first element, allowing faults. 6743 */ 6744 addr = base + (off_fn(vm, reg_off) << scale); 6745 if (mtedesc) { 6746 mte_check(env, mtedesc, addr, retaddr); 6747 } 6748 tlb_fn(env, vd, reg_off, addr, retaddr); 6749 6750 /* After any fault, zero the other elements. */ 6751 swap_memzero(vd, reg_off); 6752 reg_off += esize; 6753 swap_memzero(vd + reg_off, reg_max - reg_off); 6754 6755 /* 6756 * Probe the remaining elements, not allowing faults. 6757 */ 6758 while (reg_off < reg_max) { 6759 uint64_t pg = vg[reg_off >> 6]; 6760 do { 6761 if (likely((pg >> (reg_off & 63)) & 1)) { 6762 addr = base + (off_fn(vm, reg_off) << scale); 6763 in_page = -(addr | TARGET_PAGE_MASK); 6764 6765 if (unlikely(in_page < msize)) { 6766 /* Stop if the element crosses a page boundary. */ 6767 goto fault; 6768 } 6769 6770 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD, 6771 mmu_idx, retaddr); 6772 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) { 6773 goto fault; 6774 } 6775 if (unlikely(info.flags & TLB_WATCHPOINT) && 6776 (cpu_watchpoint_address_matches 6777 (env_cpu(env), addr, msize) & BP_MEM_READ)) { 6778 goto fault; 6779 } 6780 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) { 6781 goto fault; 6782 } 6783 6784 host_fn(vd, reg_off, info.host); 6785 } 6786 reg_off += esize; 6787 } while (reg_off & 63); 6788 } 6789 return; 6790 6791 fault: 6792 record_fault(env, reg_off, reg_max); 6793 } 6794 6795 static inline QEMU_ALWAYS_INLINE 6796 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6797 target_ulong base, uint32_t desc, uintptr_t retaddr, 6798 const int esz, const int msz, 6799 zreg_off_fn *off_fn, 6800 sve_ldst1_host_fn *host_fn, 6801 sve_ldst1_tlb_fn *tlb_fn) 6802 { 6803 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6804 /* Remove mtedesc from the normal sve descriptor. */ 6805 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6806 6807 /* 6808 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 6809 * offset base entirely over the address space hole to change the 6810 * pointer tag, or change the bit55 selector. So we could here 6811 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
6812 */ 6813 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 6814 esz, msz, off_fn, host_fn, tlb_fn); 6815 } 6816 6817 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ 6818 void HELPER(sve_ldff##MEM##_##OFS) \ 6819 (CPUARMState *env, void *vd, void *vg, \ 6820 void *vm, target_ulong base, uint32_t desc) \ 6821 { \ 6822 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \ 6823 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6824 } \ 6825 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 6826 (CPUARMState *env, void *vd, void *vg, \ 6827 void *vm, target_ulong base, uint32_t desc) \ 6828 { \ 6829 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ 6830 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6831 } 6832 6833 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ 6834 void HELPER(sve_ldff##MEM##_##OFS) \ 6835 (CPUARMState *env, void *vd, void *vg, \ 6836 void *vm, target_ulong base, uint32_t desc) \ 6837 { \ 6838 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \ 6839 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6840 } \ 6841 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 6842 (CPUARMState *env, void *vd, void *vg, \ 6843 void *vm, target_ulong base, uint32_t desc) \ 6844 { \ 6845 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ 6846 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6847 } 6848 6849 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) 6850 DO_LDFF1_ZPZ_S(bsu, zss, MO_8) 6851 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8) 6852 DO_LDFF1_ZPZ_D(bdu, zss, MO_8) 6853 DO_LDFF1_ZPZ_D(bdu, zd, MO_8) 6854 6855 DO_LDFF1_ZPZ_S(bss, zsu, MO_8) 6856 DO_LDFF1_ZPZ_S(bss, zss, MO_8) 6857 DO_LDFF1_ZPZ_D(bds, zsu, MO_8) 6858 DO_LDFF1_ZPZ_D(bds, zss, MO_8) 6859 DO_LDFF1_ZPZ_D(bds, zd, MO_8) 6860 6861 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16) 6862 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16) 6863 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16) 6864 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16) 6865 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16) 6866 6867 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16) 6868 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16) 6869 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16) 6870 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16) 6871 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16) 6872 6873 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16) 6874 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16) 6875 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16) 6876 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16) 6877 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16) 6878 6879 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16) 6880 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16) 6881 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16) 6882 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16) 6883 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16) 6884 6885 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32) 6886 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32) 6887 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32) 6888 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32) 6889 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32) 6890 6891 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32) 6892 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32) 6893 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32) 6894 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32) 6895 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32) 6896 6897 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32) 6898 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32) 6899 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32) 6900 6901 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32) 6902 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32) 6903 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32) 6904 6905 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64) 6906 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64) 6907 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64) 6908 6909 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64) 6910 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64) 6911 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64) 6912 6913 /* Stores with a vector index. 
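 * These are done in two passes: the first pass probes every active element,
 * taking any MMU fault, watchpoint or MTE fault before memory is modified,
 * and records the host address of elements that can use the fast path; the
 * second pass performs the stores, falling back to the TLB helper for MMIO
 * and page-crossing elements.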
*/ 6914 6915 static inline QEMU_ALWAYS_INLINE 6916 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6917 target_ulong base, uint32_t desc, uintptr_t retaddr, 6918 uint32_t mtedesc, int esize, int msize, 6919 zreg_off_fn *off_fn, 6920 sve_ldst1_host_fn *host_fn, 6921 sve_ldst1_tlb_fn *tlb_fn) 6922 { 6923 const int mmu_idx = cpu_mmu_index(env, false); 6924 const intptr_t reg_max = simd_oprsz(desc); 6925 const int scale = simd_data(desc); 6926 void *host[ARM_MAX_VQ * 4]; 6927 intptr_t reg_off, i; 6928 SVEHostPage info, info2; 6929 6930 /* 6931 * Probe all of the elements for host addresses and flags. 6932 */ 6933 i = reg_off = 0; 6934 do { 6935 uint64_t pg = vg[reg_off >> 6]; 6936 do { 6937 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6938 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 6939 6940 host[i] = NULL; 6941 if (likely((pg >> (reg_off & 63)) & 1)) { 6942 if (likely(in_page >= msize)) { 6943 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE, 6944 mmu_idx, retaddr); 6945 if (!(info.flags & TLB_MMIO)) { 6946 host[i] = info.host; 6947 } 6948 } else { 6949 /* 6950 * Element crosses the page boundary. 6951 * Probe both pages, but do not record the host address, 6952 * so that we use the slow path. 6953 */ 6954 sve_probe_page(&info, false, env, addr, 0, 6955 MMU_DATA_STORE, mmu_idx, retaddr); 6956 sve_probe_page(&info2, false, env, addr + in_page, 0, 6957 MMU_DATA_STORE, mmu_idx, retaddr); 6958 info.flags |= info2.flags; 6959 } 6960 6961 if (unlikely(info.flags & TLB_WATCHPOINT)) { 6962 cpu_check_watchpoint(env_cpu(env), addr, msize, 6963 info.attrs, BP_MEM_WRITE, retaddr); 6964 } 6965 6966 if (mtedesc && info.tagged) { 6967 mte_check(env, mtedesc, addr, retaddr); 6968 } 6969 } 6970 i += 1; 6971 reg_off += esize; 6972 } while (reg_off & 63); 6973 } while (reg_off < reg_max); 6974 6975 /* 6976 * Now that we have recognized all exceptions except SyncExternal 6977 * (from TLB_MMIO), which we cannot avoid, perform all of the stores. 6978 * 6979 * Note for the common case of an element in RAM, not crossing a page 6980 * boundary, we have stored the host address in host[]. This doubles 6981 * as a first-level check against the predicate, since only enabled 6982 * elements have non-null host addresses. 6983 */ 6984 i = reg_off = 0; 6985 do { 6986 void *h = host[i]; 6987 if (likely(h != NULL)) { 6988 host_fn(vd, reg_off, h); 6989 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) { 6990 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6991 tlb_fn(env, vd, reg_off, addr, retaddr); 6992 } 6993 i += 1; 6994 reg_off += esize; 6995 } while (reg_off < reg_max); 6996 } 6997 6998 static inline QEMU_ALWAYS_INLINE 6999 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7000 target_ulong base, uint32_t desc, uintptr_t retaddr, 7001 int esize, int msize, zreg_off_fn *off_fn, 7002 sve_ldst1_host_fn *host_fn, 7003 sve_ldst1_tlb_fn *tlb_fn) 7004 { 7005 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7006 /* Remove mtedesc from the normal sve descriptor. */ 7007 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7008 7009 /* 7010 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7011 * offset base entirely over the address space hole to change the 7012 * pointer tag, or change the bit55 selector. So we could here 7013 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
7014 */ 7015 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 7016 esize, msize, off_fn, host_fn, tlb_fn); 7017 } 7018 7019 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \ 7020 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7021 void *vm, target_ulong base, uint32_t desc) \ 7022 { \ 7023 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 7024 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7025 } \ 7026 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7027 void *vm, target_ulong base, uint32_t desc) \ 7028 { \ 7029 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 7030 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7031 } 7032 7033 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \ 7034 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7035 void *vm, target_ulong base, uint32_t desc) \ 7036 { \ 7037 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 7038 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7039 } \ 7040 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7041 void *vm, target_ulong base, uint32_t desc) \ 7042 { \ 7043 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 7044 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7045 } 7046 7047 DO_ST1_ZPZ_S(bs, zsu, MO_8) 7048 DO_ST1_ZPZ_S(hs_le, zsu, MO_16) 7049 DO_ST1_ZPZ_S(hs_be, zsu, MO_16) 7050 DO_ST1_ZPZ_S(ss_le, zsu, MO_32) 7051 DO_ST1_ZPZ_S(ss_be, zsu, MO_32) 7052 7053 DO_ST1_ZPZ_S(bs, zss, MO_8) 7054 DO_ST1_ZPZ_S(hs_le, zss, MO_16) 7055 DO_ST1_ZPZ_S(hs_be, zss, MO_16) 7056 DO_ST1_ZPZ_S(ss_le, zss, MO_32) 7057 DO_ST1_ZPZ_S(ss_be, zss, MO_32) 7058 7059 DO_ST1_ZPZ_D(bd, zsu, MO_8) 7060 DO_ST1_ZPZ_D(hd_le, zsu, MO_16) 7061 DO_ST1_ZPZ_D(hd_be, zsu, MO_16) 7062 DO_ST1_ZPZ_D(sd_le, zsu, MO_32) 7063 DO_ST1_ZPZ_D(sd_be, zsu, MO_32) 7064 DO_ST1_ZPZ_D(dd_le, zsu, MO_64) 7065 DO_ST1_ZPZ_D(dd_be, zsu, MO_64) 7066 7067 DO_ST1_ZPZ_D(bd, zss, MO_8) 7068 DO_ST1_ZPZ_D(hd_le, zss, MO_16) 7069 DO_ST1_ZPZ_D(hd_be, zss, MO_16) 7070 DO_ST1_ZPZ_D(sd_le, zss, MO_32) 7071 DO_ST1_ZPZ_D(sd_be, zss, MO_32) 7072 DO_ST1_ZPZ_D(dd_le, zss, MO_64) 7073 DO_ST1_ZPZ_D(dd_be, zss, MO_64) 7074 7075 DO_ST1_ZPZ_D(bd, zd, MO_8) 7076 DO_ST1_ZPZ_D(hd_le, zd, MO_16) 7077 DO_ST1_ZPZ_D(hd_be, zd, MO_16) 7078 DO_ST1_ZPZ_D(sd_le, zd, MO_32) 7079 DO_ST1_ZPZ_D(sd_be, zd, MO_32) 7080 DO_ST1_ZPZ_D(dd_le, zd, MO_64) 7081 DO_ST1_ZPZ_D(dd_be, zd, MO_64) 7082 7083 #undef DO_ST1_ZPZ_S 7084 #undef DO_ST1_ZPZ_D 7085 7086 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7087 { 7088 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7089 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7090 7091 for (i = 0; i < opr_sz; ++i) { 7092 d[i] = n[i] ^ m[i] ^ k[i]; 7093 } 7094 } 7095 7096 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7097 { 7098 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7099 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7100 7101 for (i = 0; i < opr_sz; ++i) { 7102 d[i] = n[i] ^ (m[i] & ~k[i]); 7103 } 7104 } 7105 7106 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7107 { 7108 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7109 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7110 7111 for (i = 0; i < opr_sz; ++i) { 7112 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]); 7113 } 7114 } 7115 7116 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7117 { 7118 intptr_t i, opr_sz = simd_oprsz(desc) / 
void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
    }
}

void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
    }
}

void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
    }
}

/*
 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
 * See hasless(v,1) from
 *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    int bits = 8 << esz;
    uint64_t ones = dup_const(esz, 1);
    uint64_t signs = ones << (bits - 1);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(esz, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;
    cmp0 = (cmp0 - ones) & ~cmp0;
    cmp1 = (cmp1 - ones) & ~cmp1;
    return (cmp0 | cmp1) & signs;
}
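
/*
 * A worked example of the zero-in-word trick above, assuming
 * esz == MO_8 and n == 0x42: after the XOR, any byte of m0/m1 equal
 * to 0x42 becomes 0x00.  For a zero byte, (x - 0x01) borrows into the
 * sign bit while ~x leaves it set, so (x - ones) & ~x has that byte's
 * sign bit set; for a nonzero byte such as 0x01 or 0x80 it stays
 * clear.  A borrow can spill a spurious sign bit into the next higher
 * byte, but only when a lower byte already matched, so the true/false
 * result is unaffected; the exact-count variant used by HISTSEG below
 * (do_histseg_cnt) uses a different formulation for that reason.
 */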
static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
                                uint32_t desc, int esz, bool nmatch)
{
    uint16_t esz_mask = pred_esz_masks[esz];
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t flags = PREDTEST_INIT;
    intptr_t i, j, k;

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
        uint16_t out = 0;

        for (j = 0; j < 16; j += 8) {
            uint64_t n = *(uint64_t *)(vn + i + j);

            for (k = 0; k < 8; k += 1 << esz) {
                if (pg & (1 << (j + k))) {
                    bool o = do_match2(n >> (k * 8), m0, m1, esz);
                    out |= (o ^ nmatch) << (j + k);
                }
            }
        }
        *(uint16_t *)(vd + H1_2(i >> 3)) = out;
        flags = iter_predtest_fwd(out, pg, flags);
    }
    return flags;
}

#define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                             \
    return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
}

DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)

DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)

#undef DO_PPZZ_MATCH
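
/*
 * HISTCNT: for each active element i, count the active elements
 * j <= i (including i itself) for which Zm[j] == Zn[i]; inactive
 * destination elements are zeroed.  As an illustrative example, with
 * all elements active, Zn == Zm, and element values {3, 3, 5, 3},
 * the counts written are {1, 2, 1, 3}.
 */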
void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz; i += 4) {
        uint64_t count = 0;
        uint8_t pred;

        pred = pg[H1(i >> 3)] >> (i & 7);
        if (pred & 1) {
            uint32_t nn = n[H4(i >> 2)];

            for (j = 0; j <= i; j += 4) {
                pred = pg[H1(j >> 3)] >> (j & 7);
                if ((pred & 1) && nn == m[H4(j >> 2)]) {
                    ++count;
                }
            }
        }
        d[H4(i >> 2)] = count;
    }
}

void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t count = 0;
        if (pg[H1(i)] & 1) {
            uint64_t nn = n[i];
            for (j = 0; j <= i; ++j) {
                if ((pg[H1(j)] & 1) && nn == m[j]) {
                    ++count;
                }
            }
        }
        d[i] = count;
    }
}

/*
 * Returns the number of bytes in m0 and m1 that match n.
 * Unlike do_match2 we don't just need true/false, we need an exact count.
 * This requires two extra logical operations.
 */
static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
{
    const uint64_t mask = dup_const(MO_8, 0x7f);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(MO_8, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;

    /*
     * 1: clear msb of each byte to avoid carry to next byte (& mask)
     * 2: carry in to msb if byte != 0 (+ mask)
     * 3: set msb if cmp has msb set (| cmp)
     * 4: set ~msb to ignore them (| mask)
     * We now have 0xff for byte != 0 or 0x7f for byte == 0.
     * 5: invert, resulting in 0x80 if and only if byte == 0.
     */
    cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
    cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);

    /*
     * Combine the two compares in a way that the bits do
     * not overlap, and so preserves the count of set bits.
     * If the host has an efficient instruction for ctpop,
     * then ctpop(x) + ctpop(y) has the same number of
     * operations as ctpop(x | (y >> 1)).  If the host does
     * not have an efficient ctpop, then we only want to
     * use it once.
     */
    return ctpop64(cmp0 | (cmp1 >> 1));
}

void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t n0 = *(uint64_t *)(vn + i);
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t n1 = *(uint64_t *)(vn + i + 8);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint64_t out0 = 0;
        uint64_t out1 = 0;

        for (j = 0; j < 64; j += 8) {
            uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
            uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
            out0 |= cnt0 << j;
            out1 |= cnt1 << j;
        }

        *(uint64_t *)(vd + i) = out0;
        *(uint64_t *)(vd + i + 8) = out1;
    }
}

void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 8 - shr;
    uint64_t mask = dup_const(MO_8, 0xff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 16 - shr;
    uint64_t mask = dup_const(MO_16, 0xffff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    int shr = simd_data(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror32(n[i] ^ m[i], shr);
    }
}

void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float32 *n = vn + s * sizeof(float32) * 4;
        float32 *m = vm + s * sizeof(float32) * 4;
        float32 *a = va + s * sizeof(float32) * 4;
        float32 *d = vd + s * sizeof(float32) * 4;
        float32 n00 = n[H4(0)], n01 = n[H4(1)];
        float32 n10 = n[H4(2)], n11 = n[H4(3)];
        float32 m00 = m[H4(0)], m01 = m[H4(1)];
        float32 m10 = m[H4(2)], m11 = m[H4(3)];
        float32 p0, p1;

        /* i = 0, j = 0 */
        p0 = float32_mul(n00, m00, status);
        p1 = float32_mul(n01, m01, status);
        d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float32_mul(n00, m10, status);
        p1 = float32_mul(n01, m11, status);
        d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float32_mul(n10, m00, status);
        p1 = float32_mul(n11, m01, status);
        d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float32_mul(n10, m10, status);
        p1 = float32_mul(n11, m11, status);
        d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
    }
}

void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float64 *n = vn + s * sizeof(float64) * 4;
        float64 *m = vm + s * sizeof(float64) * 4;
        float64 *a = va + s * sizeof(float64) * 4;
        float64 *d = vd + s * sizeof(float64) * 4;
        float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
        float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
        float64 p0, p1;

        /* i = 0, j = 0 */
        p0 = float64_mul(n00, m00, status);
        p1 = float64_mul(n01, m01, status);
        d[0] = float64_add(a[0], float64_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float64_mul(n00, m10, status);
        p1 = float64_mul(n01, m11, status);
        d[1] = float64_add(a[1], float64_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float64_mul(n10, m00, status);
        p1 = float64_mul(n11, m01, status);
        d[2] = float64_add(a[2], float64_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float64_mul(n10, m10, status);
        p1 = float64_mul(n11, m11, status);
        d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
    }
}
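
/*
 * The two conversion expanders below pair up as follows: FCVTNT
 * narrows each active wide source element and stores the result only
 * into the high (top) half of the corresponding wide destination
 * slot, while FCVTLT widens the narrow value taken from the high half
 * of each wide source slot into a full wide element.  Both loops walk
 * the vector backwards, consuming one 64-bit predicate word per
 * 64 bytes of vector data.
 */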
#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPEW);                                               \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
                *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}

DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)

#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPEW);                                               \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
                *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}

DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)

#undef DO_FCVTLT
#undef DO_FCVTNT