/*
 * ARM SVE Operations
 *
 * Copyright (c) 2018 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "internals.h"
#include "exec/exec-all.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "tcg/tcg.h"
#include "vec_internal.h"
#include "sve_ldst_internal.h"


/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C.  */
#define PREDTEST_INIT  1

/* This is an iterative function, called for each Pd and Pg word
 * moving forward.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G).  Replace previous.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}

/* This is an iterative function, called for each Pd and Pg word
 * moving backward.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from first (i.e last) !(D & G).
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from last (i.e first) D & G.  Replace previous.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}

/* The same for a single word predicate.  */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}

/* The same for a multi-word predicate.  */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}
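/*
 * Illustrative sketch (not part of the original helpers; the function
 * name below is local to this sketch): how the packed flags produced
 * by the PredTest helpers above map onto N, Z and C.  For example,
 * with g = 0x0101 (two active byte elements) and d = 0x0100, the first
 * active element (bit 0) is false so N = 0, some active element is
 * true so Z = 0, and the last active element (bit 8) is true so C = 0;
 * iter_predtest_fwd(d, g, PREDTEST_INIT) returns 0x6 for this input.
 */
static inline void example_decode_predtest(uint32_t flags,
                                           bool *n, bool *z, bool *c)
{
    *n = flags >> 31;       /* bit 31: N */
    *z = !(flags & 2);      /* bit 1 set means Z is clear */
    *c = flags & 1;         /* bit 0: C */
}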
/* Similarly for single word elements.  */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x00000000ffffffffull,
        [0x10] = 0xffffffff00000000ull,
        [0x11] = 0xffffffffffffffffull,
    };
    return word[byte & 0x11];
}

#define LOGICAL_PPPP(NAME, FUNC)                                          \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    uintptr_t opr_sz = simd_oprsz(desc);                                  \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
    uintptr_t i;                                                          \
    for (i = 0; i < opr_sz / 8; ++i) {                                    \
        d[i] = FUNC(n[i], m[i], g[i]);                                    \
    }                                                                     \
}

#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP

/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            if (pg & 1) {                                                 \
                TYPE nn = *(TYPE *)(vn + H(i));                           \
                TYPE mm = *(TYPE *)(vm + H(i));                           \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                        \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                            \
    TYPE *d = vd, *n = vn, *m = vm;                                       \
    uint8_t *pg = vg;                                                     \
    for (i = 0; i < opr_sz; i += 1) {                                     \
        if (pg[H1(i)] & 1) {                                              \
            TYPE nn = n[i], mm = m[i];                                    \
            d[i] = OP(nn, mm);                                            \
        }                                                                 \
    }                                                                     \
}
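/*
 * Illustrative sketch (an assumed example, not one of the generated
 * helpers): roughly what DO_ZPZZ produces for a predicated 32-bit add.
 * The predicate is read 16 bits at a time, and because predicate bits
 * are allocated one per byte of the vector, the bit governing each
 * element is reached by shifting pg right by sizeof(TYPE) per element.
 */
static inline void example_zpzz_add_s(void *vd, void *vn, void *vm,
                                      void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; ) {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                uint32_t nn = *(uint32_t *)(vn + H1_4(i));
                uint32_t mm = *(uint32_t *)(vm + H1_4(i));
                *(uint32_t *)(vd + H1_4(i)) = nn + mm;
            }
            /* Step one 32-bit element and its 4 predicate bits. */
            i += 4, pg >>= 4;
        } while (i & 15);
    }
}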
#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)


/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1.  Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)

DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)

/* Because the computation type is at least twice as large as required,
   these work for both signed and unsigned source types.
*/ 280 static inline uint8_t do_mulh_b(int32_t n, int32_t m) 281 { 282 return (n * m) >> 8; 283 } 284 285 static inline uint16_t do_mulh_h(int32_t n, int32_t m) 286 { 287 return (n * m) >> 16; 288 } 289 290 static inline uint32_t do_mulh_s(int64_t n, int64_t m) 291 { 292 return (n * m) >> 32; 293 } 294 295 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m) 296 { 297 uint64_t lo, hi; 298 muls64(&lo, &hi, n, m); 299 return hi; 300 } 301 302 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m) 303 { 304 uint64_t lo, hi; 305 mulu64(&lo, &hi, n, m); 306 return hi; 307 } 308 309 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL) 310 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL) 311 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL) 312 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL) 313 314 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b) 315 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h) 316 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s) 317 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d) 318 319 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b) 320 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h) 321 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s) 322 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d) 323 324 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV) 325 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV) 326 327 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV) 328 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV) 329 330 /* Note that all bits of the shift are significant 331 and not modulo the element size. */ 332 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1)) 333 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0) 334 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0) 335 336 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR) 337 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR) 338 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL) 339 340 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR) 341 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR) 342 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL) 343 344 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR) 345 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR) 346 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL) 347 348 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR) 349 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR) 350 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL) 351 352 static inline uint16_t do_sadalp_h(int16_t n, int16_t m) 353 { 354 int8_t n1 = n, n2 = n >> 8; 355 return m + n1 + n2; 356 } 357 358 static inline uint32_t do_sadalp_s(int32_t n, int32_t m) 359 { 360 int16_t n1 = n, n2 = n >> 16; 361 return m + n1 + n2; 362 } 363 364 static inline uint64_t do_sadalp_d(int64_t n, int64_t m) 365 { 366 int32_t n1 = n, n2 = n >> 32; 367 return m + n1 + n2; 368 } 369 370 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h) 371 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s) 372 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d) 373 374 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m) 375 { 376 uint8_t n1 = n, n2 = n >> 8; 377 return m + n1 + n2; 378 } 379 380 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m) 381 { 382 uint16_t n1 = n, n2 = n >> 16; 383 return m + n1 + n2; 384 } 385 386 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m) 387 { 388 uint32_t n1 = n, n2 = n >> 32; 389 return m + n1 + n2; 390 } 391 392 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h) 393 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s) 394 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d) 395 396 #define do_srshl_b(n, m) 
do_sqrshl_bhs(n, m, 8, true, NULL) 397 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL) 398 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL) 399 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL) 400 401 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b) 402 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h) 403 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s) 404 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d) 405 406 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL) 407 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL) 408 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL) 409 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL) 410 411 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b) 412 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h) 413 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s) 414 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d) 415 416 /* 417 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set. 418 * We pass in a pointer to a dummy saturation field to trigger 419 * the saturating arithmetic but discard the information about 420 * whether it has occurred. 421 */ 422 #define do_sqshl_b(n, m) \ 423 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); }) 424 #define do_sqshl_h(n, m) \ 425 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); }) 426 #define do_sqshl_s(n, m) \ 427 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); }) 428 #define do_sqshl_d(n, m) \ 429 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); }) 430 431 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b) 432 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h) 433 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s) 434 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d) 435 436 #define do_uqshl_b(n, m) \ 437 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 438 #define do_uqshl_h(n, m) \ 439 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 440 #define do_uqshl_s(n, m) \ 441 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); }) 442 #define do_uqshl_d(n, m) \ 443 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); }) 444 445 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b) 446 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h) 447 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s) 448 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d) 449 450 #define do_sqrshl_b(n, m) \ 451 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); }) 452 #define do_sqrshl_h(n, m) \ 453 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); }) 454 #define do_sqrshl_s(n, m) \ 455 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); }) 456 #define do_sqrshl_d(n, m) \ 457 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); }) 458 459 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b) 460 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h) 461 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s) 462 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d) 463 464 #undef do_sqrshl_d 465 466 #define do_uqrshl_b(n, m) \ 467 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); }) 468 #define do_uqrshl_h(n, m) \ 469 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); }) 470 #define do_uqrshl_s(n, m) \ 471 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); }) 472 #define do_uqrshl_d(n, m) \ 473 ({ uint32_t discard; do_uqrshl_d(n, 
m, true, &discard); }) 474 475 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b) 476 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h) 477 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s) 478 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d) 479 480 #undef do_uqrshl_d 481 482 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1) 483 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1)) 484 485 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS) 486 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS) 487 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS) 488 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D) 489 490 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS) 491 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS) 492 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS) 493 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D) 494 495 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1) 496 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1)) 497 498 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS) 499 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS) 500 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS) 501 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D) 502 503 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS) 504 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS) 505 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS) 506 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D) 507 508 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1) 509 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1)) 510 511 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS) 512 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS) 513 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS) 514 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D) 515 516 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS) 517 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS) 518 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS) 519 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D) 520 521 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max) 522 { 523 return val >= max ? max : val <= min ? min : val; 524 } 525 526 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX) 527 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX) 528 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX) 529 530 static inline int64_t do_sqadd_d(int64_t n, int64_t m) 531 { 532 int64_t r = n + m; 533 if (((r ^ n) & ~(n ^ m)) < 0) { 534 /* Signed overflow. */ 535 return r < 0 ? INT64_MAX : INT64_MIN; 536 } 537 return r; 538 } 539 540 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B) 541 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H) 542 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S) 543 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d) 544 545 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX) 546 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX) 547 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX) 548 549 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m) 550 { 551 uint64_t r = n + m; 552 return r < n ? 
                     UINT64_MAX : r;
}

DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)

#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = n - m;
    if (((r ^ n) & (n ^ m)) < 0) {
        /* Signed overflow. */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}

DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)

#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)

static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    return n > m ? n - m : 0;
}

DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)

#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow. */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative. */
            if (m > -n) {
                /* m > abs(n), so r is a very large positive. */
                return INT64_MAX;
            }
            /* Result is negative. */
        }
    } else {
        /* Both inputs are positive: check for overflow. */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}

DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)

#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)

static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        return n < -m ? 0 : r;
    }
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)

#undef DO_ZPZZ
#undef DO_ZPZZ_D
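/*
 * Illustrative sketch (not used by the helpers above; the function name
 * is local to this sketch): the signed overflow tests in do_sqadd_d and
 * do_sqsub_d work on sign bits only.  For addition, overflow can occur
 * only when both operands have the same sign and the result's sign
 * differs, hence ((r ^ n) & ~(n ^ m)) < 0; for subtraction the operands
 * must differ in sign, hence ((r ^ n) & (n ^ m)) < 0.  The same pattern
 * applied to 32-bit values, saturating as do_sat_bhs would:
 */
static inline int32_t example_sqadd_s(int32_t n, int32_t m)
{
    /* Wraparound add done in unsigned arithmetic for the example. */
    int32_t r = (int32_t)((uint32_t)n + (uint32_t)m);
    if (((r ^ n) & ~(n ^ m)) < 0) {
        /* Signed overflow: saturate toward the sign of the operands. */
        return r < 0 ? INT32_MAX : INT32_MIN;
    }
    return r;
}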
/*
 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements are from VN {I, I+1}.
 * If the slot I is odd, the elements are from VM {I-1, I}.
 * Load all of the input elements in each pair before overwriting output.
 */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            TYPE n0 = *(TYPE *)(vn + H(i));                               \
            TYPE m0 = *(TYPE *)(vm + H(i));                               \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));                \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));                \
            if (pg & 1) {                                                 \
                *(TYPE *)(vd + H(i)) = OP(n0, n1);                        \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
            if (pg & 1) {                                                 \
                *(TYPE *)(vd + H(i)) = OP(m0, m1);                        \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP)                                    \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                            \
    TYPE *d = vd, *n = vn, *m = vm;                                       \
    uint8_t *pg = vg;                                                     \
    for (i = 0; i < opr_sz; i += 2) {                                     \
        TYPE n0 = n[i], n1 = n[i + 1];                                    \
        TYPE m0 = m[i], m1 = m[i + 1];                                    \
        if (pg[H1(i)] & 1) {                                              \
            d[i] = OP(n0, n1);                                            \
        }                                                                 \
        if (pg[H1(i + 1)] & 1) {                                          \
            d[i + 1] = OP(m0, m1);                                        \
        }                                                                 \
    }                                                                     \
}

DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)

#undef DO_ZPZZ_PAIR
#undef DO_ZPZZ_PAIR_D

#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,                 \
                  void *status, uint32_t desc)                            \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            TYPE n0 = *(TYPE *)(vn + H(i));                               \
            TYPE m0 = *(TYPE *)(vm + H(i));                               \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));                \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));                \
            if (pg & 1) {                                                 \
                *(TYPE *)(vd + H(i)) = OP(n0, n1, status);                \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
            if (pg & 1) {                                                 \
                *(TYPE *)(vd + H(i)) = OP(m0, m1, status);                \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
}

DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64,
H1_8, float64_add) 756 757 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum) 758 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum) 759 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum) 760 761 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum) 762 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum) 763 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum) 764 765 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max) 766 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max) 767 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max) 768 769 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min) 770 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min) 771 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min) 772 773 #undef DO_ZPZZ_PAIR_FP 774 775 /* Three-operand expander, controlled by a predicate, in which the 776 * third operand is "wide". That is, for D = N op M, the same 64-bit 777 * value of M is used with all of the narrower values of N. 778 */ 779 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \ 780 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 781 { \ 782 intptr_t i, opr_sz = simd_oprsz(desc); \ 783 for (i = 0; i < opr_sz; ) { \ 784 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \ 785 TYPEW mm = *(TYPEW *)(vm + i); \ 786 do { \ 787 if (pg & 1) { \ 788 TYPE nn = *(TYPE *)(vn + H(i)); \ 789 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 790 } \ 791 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 792 } while (i & 7); \ 793 } \ 794 } 795 796 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR) 797 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR) 798 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL) 799 800 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR) 801 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 802 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 803 804 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR) 805 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 806 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 807 808 #undef DO_ZPZW 809 810 /* Fully general two-operand expander, controlled by a predicate. 811 */ 812 #define DO_ZPZ(NAME, TYPE, H, OP) \ 813 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 814 { \ 815 intptr_t i, opr_sz = simd_oprsz(desc); \ 816 for (i = 0; i < opr_sz; ) { \ 817 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 818 do { \ 819 if (pg & 1) { \ 820 TYPE nn = *(TYPE *)(vn + H(i)); \ 821 *(TYPE *)(vd + H(i)) = OP(nn); \ 822 } \ 823 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 824 } while (i & 15); \ 825 } \ 826 } 827 828 /* Similarly, specialized for 64-bit operands. 
*/ 829 #define DO_ZPZ_D(NAME, TYPE, OP) \ 830 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 831 { \ 832 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 833 TYPE *d = vd, *n = vn; \ 834 uint8_t *pg = vg; \ 835 for (i = 0; i < opr_sz; i += 1) { \ 836 if (pg[H1(i)] & 1) { \ 837 TYPE nn = n[i]; \ 838 d[i] = OP(nn); \ 839 } \ 840 } \ 841 } 842 843 #define DO_CLS_B(N) (clrsb32(N) - 24) 844 #define DO_CLS_H(N) (clrsb32(N) - 16) 845 846 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B) 847 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H) 848 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32) 849 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64) 850 851 #define DO_CLZ_B(N) (clz32(N) - 24) 852 #define DO_CLZ_H(N) (clz32(N) - 16) 853 854 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B) 855 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H) 856 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32) 857 DO_ZPZ_D(sve_clz_d, uint64_t, clz64) 858 859 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8) 860 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16) 861 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32) 862 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64) 863 864 #define DO_CNOT(N) (N == 0) 865 866 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT) 867 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT) 868 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT) 869 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT) 870 871 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1)) 872 873 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS) 874 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS) 875 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS) 876 877 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1)) 878 879 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG) 880 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG) 881 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG) 882 883 #define DO_NOT(N) (~N) 884 885 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT) 886 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT) 887 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT) 888 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT) 889 890 #define DO_SXTB(N) ((int8_t)N) 891 #define DO_SXTH(N) ((int16_t)N) 892 #define DO_SXTS(N) ((int32_t)N) 893 #define DO_UXTB(N) ((uint8_t)N) 894 #define DO_UXTH(N) ((uint16_t)N) 895 #define DO_UXTS(N) ((uint32_t)N) 896 897 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB) 898 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB) 899 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH) 900 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB) 901 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH) 902 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS) 903 904 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB) 905 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB) 906 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH) 907 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB) 908 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH) 909 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS) 910 911 #define DO_ABS(N) (N < 0 ? 
-N : N) 912 913 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS) 914 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS) 915 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS) 916 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS) 917 918 #define DO_NEG(N) (-N) 919 920 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG) 921 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG) 922 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG) 923 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG) 924 925 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16) 926 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32) 927 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64) 928 929 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32) 930 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64) 931 932 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64) 933 934 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc) 935 { 936 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 937 uint64_t *d = vd, *n = vn; 938 uint8_t *pg = vg; 939 940 for (i = 0; i < opr_sz; i += 2) { 941 if (pg[H1(i)] & 1) { 942 uint64_t n0 = n[i + 0]; 943 uint64_t n1 = n[i + 1]; 944 d[i + 0] = n1; 945 d[i + 1] = n0; 946 } 947 } 948 } 949 950 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8) 951 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16) 952 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32) 953 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64) 954 955 #define DO_SQABS(X) \ 956 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 957 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; }) 958 959 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS) 960 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS) 961 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS) 962 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS) 963 964 #define DO_SQNEG(X) \ 965 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 966 x_ == min_ ? -min_ - 1 : -x_; }) 967 968 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG) 969 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG) 970 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG) 971 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG) 972 973 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32) 974 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32) 975 976 /* Three-operand expander, unpredicated, in which the third operand is "wide". 977 */ 978 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \ 979 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 980 { \ 981 intptr_t i, opr_sz = simd_oprsz(desc); \ 982 for (i = 0; i < opr_sz; ) { \ 983 TYPEW mm = *(TYPEW *)(vm + i); \ 984 do { \ 985 TYPE nn = *(TYPE *)(vn + H(i)); \ 986 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 987 i += sizeof(TYPE); \ 988 } while (i & 7); \ 989 } \ 990 } 991 992 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR) 993 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR) 994 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL) 995 996 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR) 997 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 998 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 999 1000 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR) 1001 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 1002 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 1003 1004 #undef DO_ZZW 1005 1006 #undef DO_CLS_B 1007 #undef DO_CLS_H 1008 #undef DO_CLZ_B 1009 #undef DO_CLZ_H 1010 #undef DO_CNOT 1011 #undef DO_FABS 1012 #undef DO_FNEG 1013 #undef DO_ABS 1014 #undef DO_NEG 1015 #undef DO_ZPZ 1016 #undef DO_ZPZ_D 1017 1018 /* 1019 * Three-operand expander, unpredicated, in which the two inputs are 1020 * selected from the top or bottom half of the wide column. 
1021 */ 1022 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1023 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1024 { \ 1025 intptr_t i, opr_sz = simd_oprsz(desc); \ 1026 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1027 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1028 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1029 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1030 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1031 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1032 } \ 1033 } 1034 1035 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1036 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1037 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1038 1039 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1040 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1041 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1042 1043 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1044 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1045 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1046 1047 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1048 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1049 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1050 1051 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1052 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1053 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1054 1055 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 1056 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1057 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1058 1059 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1060 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1061 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1062 1063 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1064 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1065 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1066 1067 /* Note that the multiply cannot overflow, but the doubling can. 
*/ 1068 static inline int16_t do_sqdmull_h(int16_t n, int16_t m) 1069 { 1070 int16_t val = n * m; 1071 return DO_SQADD_H(val, val); 1072 } 1073 1074 static inline int32_t do_sqdmull_s(int32_t n, int32_t m) 1075 { 1076 int32_t val = n * m; 1077 return DO_SQADD_S(val, val); 1078 } 1079 1080 static inline int64_t do_sqdmull_d(int64_t n, int64_t m) 1081 { 1082 int64_t val = n * m; 1083 return do_sqadd_d(val, val); 1084 } 1085 1086 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h) 1087 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1088 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1089 1090 #undef DO_ZZZ_TB 1091 1092 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1093 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1094 { \ 1095 intptr_t i, opr_sz = simd_oprsz(desc); \ 1096 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1097 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1098 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 1099 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1100 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1101 } \ 1102 } 1103 1104 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1105 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1106 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1107 1108 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1109 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1110 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1111 1112 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1113 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1114 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1115 1116 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1117 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1118 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1119 1120 #undef DO_ZZZ_WTB 1121 1122 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \ 1123 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1124 { \ 1125 intptr_t i, opr_sz = simd_oprsz(desc); \ 1126 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \ 1127 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \ 1128 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1129 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \ 1130 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \ 1131 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \ 1132 } \ 1133 } 1134 1135 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR) 1136 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR) 1137 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR) 1138 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR) 1139 1140 #undef DO_ZZZ_NTB 1141 1142 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1143 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1144 { \ 1145 intptr_t i, opr_sz = simd_oprsz(desc); \ 1146 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \ 1147 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1148 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1149 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \ 1150 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1151 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \ 1152 } \ 1153 } 1154 1155 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1156 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1157 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1158 1159 
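/*
 * Worked example for the do_sqdmull_* note above ("the multiply cannot
 * overflow, but the doubling can").  This is an illustrative sketch,
 * not used by the helpers, and the function name is local to it.
 * For the 8-to-16-bit case the product is at most (-128) * (-128) =
 * 16384, which fits in int16_t, but doubling that one extreme value
 * gives 32768, which does not, so DO_SQADD_H(val, val) saturates it
 * to INT16_MAX.
 */
static inline int16_t example_sqdmull_h_extreme(void)
{
    int16_t val = (int16_t)(-128 * -128);   /* 16384: the multiply fits */
    return DO_SQADD_H(val, val);            /* doubling saturates to 32767 */
}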
DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

#define DO_NMUL(N, M)  -(N * M)

DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)

DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)

#undef DO_ZZZW_ACC

#define DO_XTNB(NAME, TYPE, OP)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
{                                                            \
    intptr_t i, opr_sz = simd_oprsz(desc);                   \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {             \
        TYPE nn = *(TYPE *)(vn + i);                         \
        nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);  \
        *(TYPE *)(vd + i) = nn;                              \
    }                                                        \
}

#define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                           \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                \
{                                                                   \
    intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN));  \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                    \
        TYPE nn = *(TYPE *)(vn + i);                                \
        *(TYPEN *)(vd + i + odd) = OP(nn);                          \
    }                                                               \
}

#define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
#define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
#define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)

DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)

DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)

#define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
#define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
#define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)

DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)

DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)

DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)

DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)

#undef DO_XTNB
#undef DO_XTNT

void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
    uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t *a = va, *n = vn;
    uint64_t *d = vd, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint32_t e1 = a[2 * i + H4(0)];
        uint32_t e2 = n[2 * i + sel] ^ inv;
        uint64_t c = extract64(m[i], 32, 1);
        /* Compute and store the entire 33-bit result at once. */
        d[i] = c + e1 + e2;
    }
}
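/*
 * Illustrative sketch for the "entire 33-bit result" trick above (not
 * used by the helpers; the function name is local to this sketch):
 * adding two 32-bit values plus a carry-in produces at most a 33-bit
 * value, so one 64-bit addition yields the 32-bit sum in bits [31:0]
 * and the carry-out in bit 32, which is exactly where the next
 * ADCLB/ADCLT reads its carry-in from.
 */
static inline uint64_t example_adc32(uint32_t e1, uint32_t e2, bool carry_in)
{
    uint64_t r = (uint64_t)carry_in + e1 + e2;
    /* (uint32_t)r is the 32-bit sum; (r >> 32) & 1 is the carry-out. */
    return r;
}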
void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t *d = vd, *a = va, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        Int128 e1 = int128_make64(a[i]);
        Int128 e2 = int128_make64(n[i + sel] ^ inv);
        Int128 c = int128_make64(m[i + 1] & 1);
        Int128 r = int128_add(int128_add(e1, e2), c);
        d[i + 0] = int128_getlo(r);
        d[i + 1] = int128_gethi(r);
    }
}

#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP)           \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);       \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN);   \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                         \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                         \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                         \
        TYPEW aa = *(TYPEW *)(va + HW(i));                                \
        *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));             \
    }                                                                     \
}

DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQADD_H)
DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQADD_S)
DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqadd_d)

DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQSUB_H)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQSUB_S)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqsub_d)

#undef DO_SQDMLAL

#define DO_CMLA_FUNC(NAME, TYPE, H, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);                 \
    int rot = simd_data(desc);                                            \
    int sel_a = rot & 1, sel_b = sel_a ^ 1;                               \
    bool sub_r = rot == 1 || rot == 2;                                    \
    bool sub_i = rot >= 2;                                                \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                              \
    for (i = 0; i < opr_sz; i += 2) {                                     \
        TYPE elt1_a = n[H(i + sel_a)];                                    \
        TYPE elt2_a = m[H(i + sel_a)];                                    \
        TYPE elt2_b = m[H(i + sel_b)];                                    \
        d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r);                     \
        d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i);             \
    }                                                                     \
}

#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
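/*
 * Illustrative sketch of the rotation encoding used by DO_CMLA_FUNC
 * above (not one of the generated helpers; the function name is local
 * to this sketch).  rot = 0..3 encodes the CMLA rotation in 90-degree
 * steps: the low bit selects whether the real or imaginary part of N
 * is used, and sub_r/sub_i choose the signs.  E.g. rot == 1 computes
 * d_r = a_r - n_i * m_i and d_i = a_i + n_i * m_r, i.e. the
 * "+90 degrees" half of a complex multiply-accumulate.
 */
static inline void example_cmla_pair(int rot,
                                     const int32_t n[2], const int32_t m[2],
                                     const int32_t a[2], int32_t d[2])
{
    int sel_a = rot & 1, sel_b = sel_a ^ 1;
    bool sub_r = rot == 1 || rot == 2;
    bool sub_i = rot >= 2;
    int32_t prod_r = n[sel_a] * m[sel_a];
    int32_t prod_i = n[sel_a] * m[sel_b];

    d[0] = a[0] + (sub_r ? -prod_r : prod_r);
    d[1] = a[1] + (sub_i ? -prod_i : prod_i);
}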
DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)

#define DO_SQRDMLAH_B(N, M, A, S) \
    do_sqrdmlah_b(N, M, A, S, true)
#define DO_SQRDMLAH_H(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A, S) \
    do_sqrdmlah_d(N, M, A, S, true)

DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)

#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)      \
{                                                                             \
    intptr_t i, j, oprsz = simd_oprsz(desc);                                  \
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                            \
    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;                    \
    int sel_a = rot & 1, sel_b = sel_a ^ 1;                                   \
    bool sub_r = rot == 1 || rot == 2;                                        \
    bool sub_i = rot >= 2;                                                    \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                                  \
    for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {           \
        TYPE elt2_a = m[H(i + idx + sel_a)];                                  \
        TYPE elt2_b = m[H(i + idx + sel_b)];                                  \
        for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                          \
            TYPE elt1_a = n[H(i + j + sel_a)];                                \
            d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);            \
            d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i);    \
        }                                                                     \
    }                                                                         \
}

DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)

DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)

#undef DO_CMLA
#undef DO_CMLA_FUNC
#undef DO_CMLA_IDX_FUNC
#undef DO_SQRDMLAH_B
#undef DO_SQRDMLAH_H
#undef DO_SQRDMLAH_S
#undef DO_SQRDMLAH_D

/* Note N and M are 4 elements bundled into one unit. */
static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
                         int sel_a, int sel_b, int sub_i)
{
    for (int i = 0; i <= 1; i++) {
        int32_t elt1_r = (int8_t)(n >> (16 * i));
        int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
        int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
        int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));

        a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
    }
    return a;
}

static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
                         int sel_a, int sel_b, int sub_i)
{
    for (int i = 0; i <= 1; i++) {
        int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
        int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
        int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
        int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));

        a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
    }
    return a;
}

void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
                              void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = simd_data(desc);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ?
-1 : 1); 1414 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1415 1416 for (int e = 0; e < opr_sz / 4; e++) { 1417 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1418 } 1419 } 1420 1421 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm, 1422 void *va, uint32_t desc) 1423 { 1424 int opr_sz = simd_oprsz(desc); 1425 int rot = simd_data(desc); 1426 int sel_a = rot & 1; 1427 int sel_b = sel_a ^ 1; 1428 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1429 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1430 1431 for (int e = 0; e < opr_sz / 8; e++) { 1432 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1433 } 1434 } 1435 1436 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm, 1437 void *va, uint32_t desc) 1438 { 1439 int opr_sz = simd_oprsz(desc); 1440 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1441 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2)); 1442 int sel_a = rot & 1; 1443 int sel_b = sel_a ^ 1; 1444 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1445 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1446 1447 for (int seg = 0; seg < opr_sz / 4; seg += 4) { 1448 uint32_t seg_m = m[seg + idx]; 1449 for (int e = 0; e < 4; e++) { 1450 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e], 1451 sel_a, sel_b, sub_i); 1452 } 1453 } 1454 } 1455 1456 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm, 1457 void *va, uint32_t desc) 1458 { 1459 int seg, opr_sz = simd_oprsz(desc); 1460 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1461 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 1462 int sel_a = rot & 1; 1463 int sel_b = sel_a ^ 1; 1464 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1465 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1466 1467 for (seg = 0; seg < opr_sz / 8; seg += 2) { 1468 uint64_t seg_m = m[seg + idx]; 1469 for (int e = 0; e < 2; e++) { 1470 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e], 1471 sel_a, sel_b, sub_i); 1472 } 1473 } 1474 } 1475 1476 #define DO_ZZXZ(NAME, TYPE, H, OP) \ 1477 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1478 { \ 1479 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \ 1480 intptr_t i, j, idx = simd_data(desc); \ 1481 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \ 1482 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1483 TYPE mm = m[i]; \ 1484 for (j = 0; j < segment; j++) { \ 1485 d[i + j] = OP(n[i + j], mm, a[i + j]); \ 1486 } \ 1487 } \ 1488 } 1489 1490 #define DO_SQRDMLAH_H(N, M, A) \ 1491 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); }) 1492 #define DO_SQRDMLAH_S(N, M, A) \ 1493 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); }) 1494 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true) 1495 1496 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1497 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1498 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D) 1499 1500 #define DO_SQRDMLSH_H(N, M, A) \ 1501 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); }) 1502 #define DO_SQRDMLSH_S(N, M, A) \ 1503 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); }) 1504 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true) 1505 1506 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H) 1507 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S) 1508 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D) 1509 1510 #undef DO_ZZXZ 1511 1512 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1513 void HELPER(NAME)(void *vd, void *vn, 
void *vm, void *va, uint32_t desc) \ 1514 { \ 1515 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1516 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1517 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1518 for (i = 0; i < oprsz; i += 16) { \ 1519 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1520 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1521 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1522 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \ 1523 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \ 1524 } \ 1525 } \ 1526 } 1527 1528 #define DO_MLA(N, M, A) (A + N * M) 1529 1530 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA) 1531 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA) 1532 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA) 1533 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA) 1534 1535 #define DO_MLS(N, M, A) (A - N * M) 1536 1537 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS) 1538 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS) 1539 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS) 1540 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS) 1541 1542 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M)) 1543 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M)) 1544 1545 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S) 1546 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D) 1547 1548 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M)) 1549 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M)) 1550 1551 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S) 1552 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D) 1553 1554 #undef DO_MLA 1555 #undef DO_MLS 1556 #undef DO_ZZXW 1557 1558 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1559 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1560 { \ 1561 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1562 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1563 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1564 for (i = 0; i < oprsz; i += 16) { \ 1565 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1566 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1567 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1568 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \ 1569 } \ 1570 } \ 1571 } 1572 1573 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1574 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1575 1576 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1577 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1578 1579 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1580 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1581 1582 #undef DO_ZZX 1583 1584 #define DO_BITPERM(NAME, TYPE, OP) \ 1585 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1586 { \ 1587 intptr_t i, opr_sz = simd_oprsz(desc); \ 1588 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1589 TYPE nn = *(TYPE *)(vn + i); \ 1590 TYPE mm = *(TYPE *)(vm + i); \ 1591 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \ 1592 } \ 1593 } 1594 1595 static uint64_t bitextract(uint64_t data, uint64_t mask, int n) 1596 { 1597 uint64_t res = 0; 1598 int db, rb = 0; 1599 1600 for (db = 0; db < n; ++db) { 1601 if ((mask >> db) & 1) { 1602 res |= ((data >> db) & 1) 
<< rb; 1603 ++rb; 1604 } 1605 } 1606 return res; 1607 } 1608 1609 DO_BITPERM(sve2_bext_b, uint8_t, bitextract) 1610 DO_BITPERM(sve2_bext_h, uint16_t, bitextract) 1611 DO_BITPERM(sve2_bext_s, uint32_t, bitextract) 1612 DO_BITPERM(sve2_bext_d, uint64_t, bitextract) 1613 1614 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n) 1615 { 1616 uint64_t res = 0; 1617 int rb, db = 0; 1618 1619 for (rb = 0; rb < n; ++rb) { 1620 if ((mask >> rb) & 1) { 1621 res |= ((data >> db) & 1) << rb; 1622 ++db; 1623 } 1624 } 1625 return res; 1626 } 1627 1628 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit) 1629 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit) 1630 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit) 1631 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit) 1632 1633 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n) 1634 { 1635 uint64_t resm = 0, resu = 0; 1636 int db, rbm = 0, rbu = 0; 1637 1638 for (db = 0; db < n; ++db) { 1639 uint64_t val = (data >> db) & 1; 1640 if ((mask >> db) & 1) { 1641 resm |= val << rbm++; 1642 } else { 1643 resu |= val << rbu++; 1644 } 1645 } 1646 1647 return resm | (resu << rbm); 1648 } 1649 1650 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup) 1651 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup) 1652 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup) 1653 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup) 1654 1655 #undef DO_BITPERM 1656 1657 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \ 1658 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1659 { \ 1660 intptr_t i, opr_sz = simd_oprsz(desc); \ 1661 int sub_r = simd_data(desc); \ 1662 if (sub_r) { \ 1663 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1664 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1665 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1666 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1667 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1668 acc_r = ADD_OP(acc_r, el2_i); \ 1669 acc_i = SUB_OP(acc_i, el2_r); \ 1670 *(TYPE *)(vd + H(i)) = acc_r; \ 1671 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1672 } \ 1673 } else { \ 1674 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1675 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1676 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1677 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1678 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1679 acc_r = SUB_OP(acc_r, el2_i); \ 1680 acc_i = ADD_OP(acc_i, el2_r); \ 1681 *(TYPE *)(vd + H(i)) = acc_r; \ 1682 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1683 } \ 1684 } \ 1685 } 1686 1687 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB) 1688 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB) 1689 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB) 1690 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB) 1691 1692 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B) 1693 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H) 1694 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S) 1695 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d) 1696 1697 #undef DO_CADD 1698 1699 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \ 1700 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1701 { \ 1702 intptr_t i, opr_sz = simd_oprsz(desc); \ 1703 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \ 1704 int shift = simd_data(desc) >> 1; \ 1705 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1706 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \ 1707 *(TYPEW *)(vd + HW(i)) = nn << shift; \ 1708 } \ 1709 } 1710 1711 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1) 1712 DO_ZZI_SHLL(sve2_sshll_s, int32_t, 
int16_t, H1_4, H1_2) 1713 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4) 1714 1715 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1) 1716 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2) 1717 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4) 1718 1719 #undef DO_ZZI_SHLL 1720 1721 /* Two-operand reduction expander, controlled by a predicate. 1722 * The difference between TYPERED and TYPERET has to do with 1723 * sign-extension. E.g. for SMAX, TYPERED must be signed, 1724 * but TYPERET must be unsigned so that e.g. a 32-bit value 1725 * is not sign-extended to the ABI uint64_t return type. 1726 */ 1727 /* ??? If we were to vectorize this by hand the reduction ordering 1728 * would change. For integer operands, this is perfectly fine. 1729 */ 1730 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \ 1731 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1732 { \ 1733 intptr_t i, opr_sz = simd_oprsz(desc); \ 1734 TYPERED ret = INIT; \ 1735 for (i = 0; i < opr_sz; ) { \ 1736 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 1737 do { \ 1738 if (pg & 1) { \ 1739 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \ 1740 ret = OP(ret, nn); \ 1741 } \ 1742 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \ 1743 } while (i & 15); \ 1744 } \ 1745 return (TYPERET)ret; \ 1746 } 1747 1748 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \ 1749 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1750 { \ 1751 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 1752 TYPEE *n = vn; \ 1753 uint8_t *pg = vg; \ 1754 TYPER ret = INIT; \ 1755 for (i = 0; i < opr_sz; i += 1) { \ 1756 if (pg[H1(i)] & 1) { \ 1757 TYPEE nn = n[i]; \ 1758 ret = OP(ret, nn); \ 1759 } \ 1760 } \ 1761 return ret; \ 1762 } 1763 1764 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR) 1765 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR) 1766 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR) 1767 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR) 1768 1769 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR) 1770 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR) 1771 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR) 1772 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR) 1773 1774 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND) 1775 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND) 1776 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND) 1777 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND) 1778 1779 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1780 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1781 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1782 1783 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1784 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1785 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1786 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD) 1787 1788 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX) 1789 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX) 1790 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX) 1791 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX) 1792 1793 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX) 1794 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX) 1795 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX) 
1796 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX) 1797 1798 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN) 1799 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN) 1800 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN) 1801 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN) 1802 1803 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN) 1804 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN) 1805 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN) 1806 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) 1807 1808 #undef DO_VPZ 1809 #undef DO_VPZ_D 1810 1811 /* Two vector operand, one scalar operand, unpredicated. */ 1812 #define DO_ZZI(NAME, TYPE, OP) \ 1813 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ 1814 { \ 1815 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1816 TYPE s = s64, *d = vd, *n = vn; \ 1817 for (i = 0; i < opr_sz; ++i) { \ 1818 d[i] = OP(n[i], s); \ 1819 } \ 1820 } 1821 1822 #define DO_SUBR(X, Y) (Y - X) 1823 1824 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) 1825 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) 1826 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) 1827 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) 1828 1829 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) 1830 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) 1831 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) 1832 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) 1833 1834 DO_ZZI(sve_smini_b, int8_t, DO_MIN) 1835 DO_ZZI(sve_smini_h, int16_t, DO_MIN) 1836 DO_ZZI(sve_smini_s, int32_t, DO_MIN) 1837 DO_ZZI(sve_smini_d, int64_t, DO_MIN) 1838 1839 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) 1840 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) 1841 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) 1842 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) 1843 1844 DO_ZZI(sve_umini_b, uint8_t, DO_MIN) 1845 DO_ZZI(sve_umini_h, uint16_t, DO_MIN) 1846 DO_ZZI(sve_umini_s, uint32_t, DO_MIN) 1847 DO_ZZI(sve_umini_d, uint64_t, DO_MIN) 1848 1849 #undef DO_ZZI 1850 1851 #undef DO_AND 1852 #undef DO_ORR 1853 #undef DO_EOR 1854 #undef DO_BIC 1855 #undef DO_ADD 1856 #undef DO_SUB 1857 #undef DO_MAX 1858 #undef DO_MIN 1859 #undef DO_ABD 1860 #undef DO_MUL 1861 #undef DO_DIV 1862 #undef DO_ASR 1863 #undef DO_LSR 1864 #undef DO_LSL 1865 #undef DO_SUBR 1866 1867 /* Similar to the ARM LastActiveElement pseudocode function, except the 1868 result is multiplied by the element size. This includes the not found 1869 indication; e.g. not found for esz=3 is -8. */ 1870 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) 1871 { 1872 uint64_t mask = pred_esz_masks[esz]; 1873 intptr_t i = words; 1874 1875 do { 1876 uint64_t this_g = g[--i] & mask; 1877 if (this_g) { 1878 return i * 64 + (63 - clz64(this_g)); 1879 } 1880 } while (i > 0); 1881 return (intptr_t)-1 << esz; 1882 } 1883 1884 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc) 1885 { 1886 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1887 uint32_t flags = PREDTEST_INIT; 1888 uint64_t *d = vd, *g = vg; 1889 intptr_t i = 0; 1890 1891 do { 1892 uint64_t this_d = d[i]; 1893 uint64_t this_g = g[i]; 1894 1895 if (this_g) { 1896 if (!(flags & 4)) { 1897 /* Set in D the first bit of G. 
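That is, force the lowest-numbered active element of the predicate true. Only the word containing the first active bit is modified, and only by setting that single bit; all other words of D are left as they were.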
*/ 1898 this_d |= this_g & -this_g; 1899 d[i] = this_d; 1900 } 1901 flags = iter_predtest_fwd(this_d, this_g, flags); 1902 } 1903 } while (++i < words); 1904 1905 return flags; 1906 } 1907 1908 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc) 1909 { 1910 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1911 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 1912 uint32_t flags = PREDTEST_INIT; 1913 uint64_t *d = vd, *g = vg, esz_mask; 1914 intptr_t i, next; 1915 1916 next = last_active_element(vd, words, esz) + (1 << esz); 1917 esz_mask = pred_esz_masks[esz]; 1918 1919 /* Similar to the pseudocode for pnext, but scaled by ESZ 1920 so that we find the correct bit. */ 1921 if (next < words * 64) { 1922 uint64_t mask = -1; 1923 1924 if (next & 63) { 1925 mask = ~((1ull << (next & 63)) - 1); 1926 next &= -64; 1927 } 1928 do { 1929 uint64_t this_g = g[next / 64] & esz_mask & mask; 1930 if (this_g != 0) { 1931 next = (next & -64) + ctz64(this_g); 1932 break; 1933 } 1934 next += 64; 1935 mask = -1; 1936 } while (next < words * 64); 1937 } 1938 1939 i = 0; 1940 do { 1941 uint64_t this_d = 0; 1942 if (i == next / 64) { 1943 this_d = 1ull << (next & 63); 1944 } 1945 d[i] = this_d; 1946 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags); 1947 } while (++i < words); 1948 1949 return flags; 1950 } 1951 1952 /* 1953 * Copy Zn into Zd, and store zero into inactive elements. 1954 * If inv, store zeros into the active elements. 1955 */ 1956 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc) 1957 { 1958 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1959 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1960 uint64_t *d = vd, *n = vn; 1961 uint8_t *pg = vg; 1962 1963 for (i = 0; i < opr_sz; i += 1) { 1964 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv); 1965 } 1966 } 1967 1968 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc) 1969 { 1970 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1971 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1972 uint64_t *d = vd, *n = vn; 1973 uint8_t *pg = vg; 1974 1975 for (i = 0; i < opr_sz; i += 1) { 1976 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv); 1977 } 1978 } 1979 1980 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc) 1981 { 1982 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1983 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1984 uint64_t *d = vd, *n = vn; 1985 uint8_t *pg = vg; 1986 1987 for (i = 0; i < opr_sz; i += 1) { 1988 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv); 1989 } 1990 } 1991 1992 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc) 1993 { 1994 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1995 uint64_t *d = vd, *n = vn; 1996 uint8_t *pg = vg; 1997 uint8_t inv = simd_data(desc); 1998 1999 for (i = 0; i < opr_sz; i += 1) { 2000 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1); 2001 } 2002 } 2003 2004 /* Three-operand expander, immediate operand, controlled by a predicate. 2005 */ 2006 #define DO_ZPZI(NAME, TYPE, H, OP) \ 2007 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2008 { \ 2009 intptr_t i, opr_sz = simd_oprsz(desc); \ 2010 TYPE imm = simd_data(desc); \ 2011 for (i = 0; i < opr_sz; ) { \ 2012 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2013 do { \ 2014 if (pg & 1) { \ 2015 TYPE nn = *(TYPE *)(vn + H(i)); \ 2016 *(TYPE *)(vd + H(i)) = OP(nn, imm); \ 2017 } \ 2018 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2019 } while (i & 15); \ 2020 } \ 2021 } 2022 2023 /* Similarly, specialized for 64-bit operands. 
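Each 64-bit element is guarded by one predicate byte, of which only bit 0 is significant, so no 16-bit predicate chunking is needed.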
*/ 2024 #define DO_ZPZI_D(NAME, TYPE, OP) \ 2025 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2026 { \ 2027 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2028 TYPE *d = vd, *n = vn; \ 2029 TYPE imm = simd_data(desc); \ 2030 uint8_t *pg = vg; \ 2031 for (i = 0; i < opr_sz; i += 1) { \ 2032 if (pg[H1(i)] & 1) { \ 2033 TYPE nn = n[i]; \ 2034 d[i] = OP(nn, imm); \ 2035 } \ 2036 } \ 2037 } 2038 2039 #define DO_SHR(N, M) (N >> M) 2040 #define DO_SHL(N, M) (N << M) 2041 2042 /* Arithmetic shift right for division. This rounds negative numbers 2043 toward zero as per signed division. Therefore before shifting, 2044 when N is negative, add 2**M-1. */ 2045 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M) 2046 2047 static inline uint64_t do_urshr(uint64_t x, unsigned sh) 2048 { 2049 if (likely(sh < 64)) { 2050 return (x >> sh) + ((x >> (sh - 1)) & 1); 2051 } else if (sh == 64) { 2052 return x >> 63; 2053 } else { 2054 return 0; 2055 } 2056 } 2057 2058 static inline int64_t do_srshr(int64_t x, unsigned sh) 2059 { 2060 if (likely(sh < 64)) { 2061 return (x >> sh) + ((x >> (sh - 1)) & 1); 2062 } else { 2063 /* Rounding the sign bit always produces 0. */ 2064 return 0; 2065 } 2066 } 2067 2068 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) 2069 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) 2070 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) 2071 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR) 2072 2073 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR) 2074 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR) 2075 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR) 2076 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR) 2077 2078 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL) 2079 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL) 2080 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL) 2081 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL) 2082 2083 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD) 2084 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD) 2085 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD) 2086 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD) 2087 2088 /* SVE2 bitwise shift by immediate */ 2089 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b) 2090 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h) 2091 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s) 2092 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d) 2093 2094 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b) 2095 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h) 2096 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s) 2097 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d) 2098 2099 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr) 2100 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr) 2101 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr) 2102 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr) 2103 2104 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr) 2105 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr) 2106 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr) 2107 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr) 2108 2109 #define do_suqrshl_b(n, m) \ 2110 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 2111 #define do_suqrshl_h(n, m) \ 2112 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 2113 #define do_suqrshl_s(n, m) \ 2114 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); }) 2115 #define do_suqrshl_d(n, m) \ 2116 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); }) 2117 2118 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b) 2119 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h) 2120 
DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s) 2121 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d) 2122 2123 #undef DO_ASRD 2124 #undef DO_ZPZI 2125 #undef DO_ZPZI_D 2126 2127 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \ 2128 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2129 { \ 2130 intptr_t i, opr_sz = simd_oprsz(desc); \ 2131 int shift = simd_data(desc); \ 2132 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2133 TYPEW nn = *(TYPEW *)(vn + i); \ 2134 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \ 2135 } \ 2136 } 2137 2138 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 2139 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2140 { \ 2141 intptr_t i, opr_sz = simd_oprsz(desc); \ 2142 int shift = simd_data(desc); \ 2143 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2144 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2145 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \ 2146 } \ 2147 } 2148 2149 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR) 2150 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR) 2151 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR) 2152 2153 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR) 2154 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR) 2155 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR) 2156 2157 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr) 2158 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr) 2159 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr) 2160 2161 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) 2162 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) 2163 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) 2164 2165 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX) 2166 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX) 2167 #define DO_SQSHRUN_D(x, sh) \ 2168 do_sat_bhs((int64_t)(x) >> (sh < 64 ? 
sh : 63), 0, UINT32_MAX) 2169 2170 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) 2171 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) 2172 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D) 2173 2174 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) 2175 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) 2176 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) 2177 2178 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX) 2179 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX) 2180 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX) 2181 2182 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) 2183 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) 2184 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D) 2185 2186 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) 2187 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) 2188 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) 2189 2190 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX) 2191 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX) 2192 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX) 2193 2194 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) 2195 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) 2196 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D) 2197 2198 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) 2199 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) 2200 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) 2201 2202 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX) 2203 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX) 2204 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX) 2205 2206 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) 2207 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) 2208 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D) 2209 2210 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H) 2211 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S) 2212 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D) 2213 2214 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX) 2215 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX) 2216 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX) 2217 2218 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H) 2219 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S) 2220 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D) 2221 2222 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H) 2223 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S) 2224 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D) 2225 2226 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX) 2227 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX) 2228 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX) 2229 2230 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H) 2231 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S) 2232 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D) 2233 2234 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H) 2235 DO_SHRNT(sve2_uqrshrnt_s, 
uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S) 2236 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D) 2237 2238 #undef DO_SHRNB 2239 #undef DO_SHRNT 2240 2241 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \ 2242 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2243 { \ 2244 intptr_t i, opr_sz = simd_oprsz(desc); \ 2245 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2246 TYPEW nn = *(TYPEW *)(vn + i); \ 2247 TYPEW mm = *(TYPEW *)(vm + i); \ 2248 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \ 2249 } \ 2250 } 2251 2252 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \ 2253 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2254 { \ 2255 intptr_t i, opr_sz = simd_oprsz(desc); \ 2256 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2257 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2258 TYPEW mm = *(TYPEW *)(vm + HW(i)); \ 2259 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \ 2260 } \ 2261 } 2262 2263 #define DO_ADDHN(N, M, SH) ((N + M) >> SH) 2264 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH) 2265 #define DO_SUBHN(N, M, SH) ((N - M) >> SH) 2266 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH) 2267 2268 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN) 2269 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN) 2270 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN) 2271 2272 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN) 2273 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN) 2274 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN) 2275 2276 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN) 2277 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN) 2278 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN) 2279 2280 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN) 2281 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN) 2282 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN) 2283 2284 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN) 2285 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN) 2286 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN) 2287 2288 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN) 2289 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN) 2290 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN) 2291 2292 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN) 2293 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN) 2294 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN) 2295 2296 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN) 2297 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN) 2298 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN) 2299 2300 #undef DO_RSUBHN 2301 #undef DO_SUBHN 2302 #undef DO_RADDHN 2303 #undef DO_ADDHN 2304 2305 #undef DO_BINOPNB 2306 2307 /* Fully general four-operand expander, controlled by a predicate. 
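* The predicate is consumed sixteen bits at a time, one bit per vector byte; inactive elements leave the corresponding bytes of VD unchanged.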
2308 */ 2309 #define DO_ZPZZZ(NAME, TYPE, H, OP) \ 2310 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2311 void *vg, uint32_t desc) \ 2312 { \ 2313 intptr_t i, opr_sz = simd_oprsz(desc); \ 2314 for (i = 0; i < opr_sz; ) { \ 2315 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2316 do { \ 2317 if (pg & 1) { \ 2318 TYPE nn = *(TYPE *)(vn + H(i)); \ 2319 TYPE mm = *(TYPE *)(vm + H(i)); \ 2320 TYPE aa = *(TYPE *)(va + H(i)); \ 2321 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \ 2322 } \ 2323 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2324 } while (i & 15); \ 2325 } \ 2326 } 2327 2328 /* Similarly, specialized for 64-bit operands. */ 2329 #define DO_ZPZZZ_D(NAME, TYPE, OP) \ 2330 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2331 void *vg, uint32_t desc) \ 2332 { \ 2333 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2334 TYPE *d = vd, *a = va, *n = vn, *m = vm; \ 2335 uint8_t *pg = vg; \ 2336 for (i = 0; i < opr_sz; i += 1) { \ 2337 if (pg[H1(i)] & 1) { \ 2338 TYPE aa = a[i], nn = n[i], mm = m[i]; \ 2339 d[i] = OP(aa, nn, mm); \ 2340 } \ 2341 } \ 2342 } 2343 2344 #define DO_MLA(A, N, M) (A + N * M) 2345 #define DO_MLS(A, N, M) (A - N * M) 2346 2347 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) 2348 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS) 2349 2350 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA) 2351 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS) 2352 2353 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA) 2354 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS) 2355 2356 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA) 2357 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS) 2358 2359 #undef DO_MLA 2360 #undef DO_MLS 2361 #undef DO_ZPZZZ 2362 #undef DO_ZPZZZ_D 2363 2364 void HELPER(sve_index_b)(void *vd, uint32_t start, 2365 uint32_t incr, uint32_t desc) 2366 { 2367 intptr_t i, opr_sz = simd_oprsz(desc); 2368 uint8_t *d = vd; 2369 for (i = 0; i < opr_sz; i += 1) { 2370 d[H1(i)] = start + i * incr; 2371 } 2372 } 2373 2374 void HELPER(sve_index_h)(void *vd, uint32_t start, 2375 uint32_t incr, uint32_t desc) 2376 { 2377 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2378 uint16_t *d = vd; 2379 for (i = 0; i < opr_sz; i += 1) { 2380 d[H2(i)] = start + i * incr; 2381 } 2382 } 2383 2384 void HELPER(sve_index_s)(void *vd, uint32_t start, 2385 uint32_t incr, uint32_t desc) 2386 { 2387 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2388 uint32_t *d = vd; 2389 for (i = 0; i < opr_sz; i += 1) { 2390 d[H4(i)] = start + i * incr; 2391 } 2392 } 2393 2394 void HELPER(sve_index_d)(void *vd, uint64_t start, 2395 uint64_t incr, uint32_t desc) 2396 { 2397 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2398 uint64_t *d = vd; 2399 for (i = 0; i < opr_sz; i += 1) { 2400 d[i] = start + i * incr; 2401 } 2402 } 2403 2404 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc) 2405 { 2406 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2407 uint32_t sh = simd_data(desc); 2408 uint32_t *d = vd, *n = vn, *m = vm; 2409 for (i = 0; i < opr_sz; i += 1) { 2410 d[i] = n[i] + (m[i] << sh); 2411 } 2412 } 2413 2414 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc) 2415 { 2416 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2417 uint64_t sh = simd_data(desc); 2418 uint64_t *d = vd, *n = vn, *m = vm; 2419 for (i = 0; i < opr_sz; i += 1) { 2420 d[i] = n[i] + (m[i] << sh); 2421 } 2422 } 2423 2424 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc) 2425 { 2426 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2427 uint64_t sh = simd_data(desc); 2428 uint64_t *d = vd, *n = vn, *m = vm; 2429 for (i = 0; i < opr_sz; i += 1) { 2430 d[i] = 
n[i] + ((uint64_t)(int32_t)m[i] << sh); 2431 } 2432 } 2433 2434 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc) 2435 { 2436 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2437 uint64_t sh = simd_data(desc); 2438 uint64_t *d = vd, *n = vn, *m = vm; 2439 for (i = 0; i < opr_sz; i += 1) { 2440 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh); 2441 } 2442 } 2443 2444 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc) 2445 { 2446 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2447 static const uint16_t coeff[] = { 2448 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8, 2449 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189, 2450 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295, 2451 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4, 2452 }; 2453 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2454 uint16_t *d = vd, *n = vn; 2455 2456 for (i = 0; i < opr_sz; i++) { 2457 uint16_t nn = n[i]; 2458 intptr_t idx = extract32(nn, 0, 5); 2459 uint16_t exp = extract32(nn, 5, 5); 2460 d[i] = coeff[idx] | (exp << 10); 2461 } 2462 } 2463 2464 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc) 2465 { 2466 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2467 static const uint32_t coeff[] = { 2468 0x000000, 0x0164d2, 0x02cd87, 0x043a29, 2469 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5, 2470 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc, 2471 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d, 2472 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda, 2473 0x1ef532, 0x20b051, 0x227043, 0x243516, 2474 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a, 2475 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4, 2476 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b, 2477 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd, 2478 0x45672a, 0x478d75, 0x49b9be, 0x4bec15, 2479 0x4e248c, 0x506334, 0x52a81e, 0x54f35b, 2480 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5, 2481 0x60ccdf, 0x633f89, 0x65b907, 0x68396a, 2482 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177, 2483 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c, 2484 }; 2485 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2486 uint32_t *d = vd, *n = vn; 2487 2488 for (i = 0; i < opr_sz; i++) { 2489 uint32_t nn = n[i]; 2490 intptr_t idx = extract32(nn, 0, 6); 2491 uint32_t exp = extract32(nn, 6, 8); 2492 d[i] = coeff[idx] | (exp << 23); 2493 } 2494 } 2495 2496 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc) 2497 { 2498 /* These constants are cut-and-paste directly from the ARM pseudocode. 
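Each entry is the 52-bit fraction field of 2^(i/64) in double precision; the exponent bits extracted from the input are placed directly above it.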
*/ 2499 static const uint64_t coeff[] = { 2500 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull, 2501 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull, 2502 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull, 2503 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull, 2504 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull, 2505 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull, 2506 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull, 2507 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull, 2508 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull, 2509 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull, 2510 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull, 2511 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull, 2512 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull, 2513 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull, 2514 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull, 2515 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull, 2516 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull, 2517 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull, 2518 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull, 2519 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull, 2520 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull, 2521 0xFA7C1819E90D8ull, 2522 }; 2523 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2524 uint64_t *d = vd, *n = vn; 2525 2526 for (i = 0; i < opr_sz; i++) { 2527 uint64_t nn = n[i]; 2528 intptr_t idx = extract32(nn, 0, 6); 2529 uint64_t exp = extract32(nn, 6, 11); 2530 d[i] = coeff[idx] | (exp << 52); 2531 } 2532 } 2533 2534 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc) 2535 { 2536 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2537 uint16_t *d = vd, *n = vn, *m = vm; 2538 for (i = 0; i < opr_sz; i += 1) { 2539 uint16_t nn = n[i]; 2540 uint16_t mm = m[i]; 2541 if (mm & 1) { 2542 nn = float16_one; 2543 } 2544 d[i] = nn ^ (mm & 2) << 14; 2545 } 2546 } 2547 2548 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc) 2549 { 2550 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2551 uint32_t *d = vd, *n = vn, *m = vm; 2552 for (i = 0; i < opr_sz; i += 1) { 2553 uint32_t nn = n[i]; 2554 uint32_t mm = m[i]; 2555 if (mm & 1) { 2556 nn = float32_one; 2557 } 2558 d[i] = nn ^ (mm & 2) << 30; 2559 } 2560 } 2561 2562 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc) 2563 { 2564 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2565 uint64_t *d = vd, *n = vn, *m = vm; 2566 for (i = 0; i < opr_sz; i += 1) { 2567 uint64_t nn = n[i]; 2568 uint64_t mm = m[i]; 2569 if (mm & 1) { 2570 nn = float64_one; 2571 } 2572 d[i] = nn ^ (mm & 2) << 62; 2573 } 2574 } 2575 2576 /* 2577 * Signed saturating addition with scalar operand. 
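* The same B is applied to every element of A and the result written to D; a negative B makes this a saturating decrement.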
2578 */ 2579 2580 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2581 { 2582 intptr_t i, oprsz = simd_oprsz(desc); 2583 2584 for (i = 0; i < oprsz; i += sizeof(int8_t)) { 2585 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i)); 2586 } 2587 } 2588 2589 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2590 { 2591 intptr_t i, oprsz = simd_oprsz(desc); 2592 2593 for (i = 0; i < oprsz; i += sizeof(int16_t)) { 2594 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i)); 2595 } 2596 } 2597 2598 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2599 { 2600 intptr_t i, oprsz = simd_oprsz(desc); 2601 2602 for (i = 0; i < oprsz; i += sizeof(int32_t)) { 2603 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i)); 2604 } 2605 } 2606 2607 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc) 2608 { 2609 intptr_t i, oprsz = simd_oprsz(desc); 2610 2611 for (i = 0; i < oprsz; i += sizeof(int64_t)) { 2612 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i)); 2613 } 2614 } 2615 2616 /* 2617 * Unsigned saturating addition with scalar operand. 2618 */ 2619 2620 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2621 { 2622 intptr_t i, oprsz = simd_oprsz(desc); 2623 2624 for (i = 0; i < oprsz; i += sizeof(uint8_t)) { 2625 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i)); 2626 } 2627 } 2628 2629 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2630 { 2631 intptr_t i, oprsz = simd_oprsz(desc); 2632 2633 for (i = 0; i < oprsz; i += sizeof(uint16_t)) { 2634 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i)); 2635 } 2636 } 2637 2638 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2639 { 2640 intptr_t i, oprsz = simd_oprsz(desc); 2641 2642 for (i = 0; i < oprsz; i += sizeof(uint32_t)) { 2643 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i)); 2644 } 2645 } 2646 2647 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2648 { 2649 intptr_t i, oprsz = simd_oprsz(desc); 2650 2651 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2652 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i)); 2653 } 2654 } 2655 2656 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2657 { 2658 intptr_t i, oprsz = simd_oprsz(desc); 2659 2660 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2661 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b); 2662 } 2663 } 2664 2665 /* Two operand predicated copy immediate with merge. All valid immediates 2666 * can fit within 17 signed bits in the simd_data field. 
2667 */ 2668 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg, 2669 uint64_t mm, uint32_t desc) 2670 { 2671 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2672 uint64_t *d = vd, *n = vn; 2673 uint8_t *pg = vg; 2674 2675 mm = dup_const(MO_8, mm); 2676 for (i = 0; i < opr_sz; i += 1) { 2677 uint64_t nn = n[i]; 2678 uint64_t pp = expand_pred_b(pg[H1(i)]); 2679 d[i] = (mm & pp) | (nn & ~pp); 2680 } 2681 } 2682 2683 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg, 2684 uint64_t mm, uint32_t desc) 2685 { 2686 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2687 uint64_t *d = vd, *n = vn; 2688 uint8_t *pg = vg; 2689 2690 mm = dup_const(MO_16, mm); 2691 for (i = 0; i < opr_sz; i += 1) { 2692 uint64_t nn = n[i]; 2693 uint64_t pp = expand_pred_h(pg[H1(i)]); 2694 d[i] = (mm & pp) | (nn & ~pp); 2695 } 2696 } 2697 2698 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg, 2699 uint64_t mm, uint32_t desc) 2700 { 2701 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2702 uint64_t *d = vd, *n = vn; 2703 uint8_t *pg = vg; 2704 2705 mm = dup_const(MO_32, mm); 2706 for (i = 0; i < opr_sz; i += 1) { 2707 uint64_t nn = n[i]; 2708 uint64_t pp = expand_pred_s(pg[H1(i)]); 2709 d[i] = (mm & pp) | (nn & ~pp); 2710 } 2711 } 2712 2713 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg, 2714 uint64_t mm, uint32_t desc) 2715 { 2716 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2717 uint64_t *d = vd, *n = vn; 2718 uint8_t *pg = vg; 2719 2720 for (i = 0; i < opr_sz; i += 1) { 2721 uint64_t nn = n[i]; 2722 d[i] = (pg[H1(i)] & 1 ? mm : nn); 2723 } 2724 } 2725 2726 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc) 2727 { 2728 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2729 uint64_t *d = vd; 2730 uint8_t *pg = vg; 2731 2732 val = dup_const(MO_8, val); 2733 for (i = 0; i < opr_sz; i += 1) { 2734 d[i] = val & expand_pred_b(pg[H1(i)]); 2735 } 2736 } 2737 2738 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc) 2739 { 2740 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2741 uint64_t *d = vd; 2742 uint8_t *pg = vg; 2743 2744 val = dup_const(MO_16, val); 2745 for (i = 0; i < opr_sz; i += 1) { 2746 d[i] = val & expand_pred_h(pg[H1(i)]); 2747 } 2748 } 2749 2750 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc) 2751 { 2752 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2753 uint64_t *d = vd; 2754 uint8_t *pg = vg; 2755 2756 val = dup_const(MO_32, val); 2757 for (i = 0; i < opr_sz; i += 1) { 2758 d[i] = val & expand_pred_s(pg[H1(i)]); 2759 } 2760 } 2761 2762 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc) 2763 { 2764 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2765 uint64_t *d = vd; 2766 uint8_t *pg = vg; 2767 2768 for (i = 0; i < opr_sz; i += 1) { 2769 d[i] = (pg[H1(i)] & 1 ? val : 0); 2770 } 2771 } 2772 2773 /* Big-endian hosts need to frob the byte indices. If the copy 2774 * happens to be 8-byte aligned, then no frobbing necessary. 
2775 */ 2776 static void swap_memmove(void *vd, void *vs, size_t n) 2777 { 2778 uintptr_t d = (uintptr_t)vd; 2779 uintptr_t s = (uintptr_t)vs; 2780 uintptr_t o = (d | s | n) & 7; 2781 size_t i; 2782 2783 #if !HOST_BIG_ENDIAN 2784 o = 0; 2785 #endif 2786 switch (o) { 2787 case 0: 2788 memmove(vd, vs, n); 2789 break; 2790 2791 case 4: 2792 if (d < s || d >= s + n) { 2793 for (i = 0; i < n; i += 4) { 2794 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2795 } 2796 } else { 2797 for (i = n; i > 0; ) { 2798 i -= 4; 2799 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2800 } 2801 } 2802 break; 2803 2804 case 2: 2805 case 6: 2806 if (d < s || d >= s + n) { 2807 for (i = 0; i < n; i += 2) { 2808 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2809 } 2810 } else { 2811 for (i = n; i > 0; ) { 2812 i -= 2; 2813 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2814 } 2815 } 2816 break; 2817 2818 default: 2819 if (d < s || d >= s + n) { 2820 for (i = 0; i < n; i++) { 2821 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2822 } 2823 } else { 2824 for (i = n; i > 0; ) { 2825 i -= 1; 2826 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2827 } 2828 } 2829 break; 2830 } 2831 } 2832 2833 /* Similarly for memset of 0. */ 2834 static void swap_memzero(void *vd, size_t n) 2835 { 2836 uintptr_t d = (uintptr_t)vd; 2837 uintptr_t o = (d | n) & 7; 2838 size_t i; 2839 2840 /* Usually, the first bit of a predicate is set, so N is 0. */ 2841 if (likely(n == 0)) { 2842 return; 2843 } 2844 2845 #if !HOST_BIG_ENDIAN 2846 o = 0; 2847 #endif 2848 switch (o) { 2849 case 0: 2850 memset(vd, 0, n); 2851 break; 2852 2853 case 4: 2854 for (i = 0; i < n; i += 4) { 2855 *(uint32_t *)H1_4(d + i) = 0; 2856 } 2857 break; 2858 2859 case 2: 2860 case 6: 2861 for (i = 0; i < n; i += 2) { 2862 *(uint16_t *)H1_2(d + i) = 0; 2863 } 2864 break; 2865 2866 default: 2867 for (i = 0; i < n; i++) { 2868 *(uint8_t *)H1(d + i) = 0; 2869 } 2870 break; 2871 } 2872 } 2873 2874 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) 2875 { 2876 intptr_t opr_sz = simd_oprsz(desc); 2877 size_t n_ofs = simd_data(desc); 2878 size_t n_siz = opr_sz - n_ofs; 2879 2880 if (vd != vm) { 2881 swap_memmove(vd, vn + n_ofs, n_siz); 2882 swap_memmove(vd + n_siz, vm, n_ofs); 2883 } else if (vd != vn) { 2884 swap_memmove(vd + n_siz, vd, n_ofs); 2885 swap_memmove(vd, vn + n_ofs, n_siz); 2886 } else { 2887 /* vd == vn == vm. Need temp space. 
*/ 2888 ARMVectorReg tmp; 2889 swap_memmove(&tmp, vm, n_ofs); 2890 swap_memmove(vd, vd + n_ofs, n_siz); 2891 memcpy(vd + n_siz, &tmp, n_ofs); 2892 } 2893 } 2894 2895 #define DO_INSR(NAME, TYPE, H) \ 2896 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ 2897 { \ 2898 intptr_t opr_sz = simd_oprsz(desc); \ 2899 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ 2900 *(TYPE *)(vd + H(0)) = val; \ 2901 } 2902 2903 DO_INSR(sve_insr_b, uint8_t, H1) 2904 DO_INSR(sve_insr_h, uint16_t, H1_2) 2905 DO_INSR(sve_insr_s, uint32_t, H1_4) 2906 DO_INSR(sve_insr_d, uint64_t, H1_8) 2907 2908 #undef DO_INSR 2909 2910 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) 2911 { 2912 intptr_t i, j, opr_sz = simd_oprsz(desc); 2913 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2914 uint64_t f = *(uint64_t *)(vn + i); 2915 uint64_t b = *(uint64_t *)(vn + j); 2916 *(uint64_t *)(vd + i) = bswap64(b); 2917 *(uint64_t *)(vd + j) = bswap64(f); 2918 } 2919 } 2920 2921 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) 2922 { 2923 intptr_t i, j, opr_sz = simd_oprsz(desc); 2924 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2925 uint64_t f = *(uint64_t *)(vn + i); 2926 uint64_t b = *(uint64_t *)(vn + j); 2927 *(uint64_t *)(vd + i) = hswap64(b); 2928 *(uint64_t *)(vd + j) = hswap64(f); 2929 } 2930 } 2931 2932 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) 2933 { 2934 intptr_t i, j, opr_sz = simd_oprsz(desc); 2935 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2936 uint64_t f = *(uint64_t *)(vn + i); 2937 uint64_t b = *(uint64_t *)(vn + j); 2938 *(uint64_t *)(vd + i) = rol64(b, 32); 2939 *(uint64_t *)(vd + j) = rol64(f, 32); 2940 } 2941 } 2942 2943 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) 2944 { 2945 intptr_t i, j, opr_sz = simd_oprsz(desc); 2946 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2947 uint64_t f = *(uint64_t *)(vn + i); 2948 uint64_t b = *(uint64_t *)(vn + j); 2949 *(uint64_t *)(vd + i) = b; 2950 *(uint64_t *)(vd + j) = f; 2951 } 2952 } 2953 2954 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); 2955 2956 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, 2957 bool is_tbx, tb_impl_fn *fn) 2958 { 2959 ARMVectorReg scratch; 2960 uintptr_t oprsz = simd_oprsz(desc); 2961 2962 if (unlikely(vd == vn)) { 2963 vn = memcpy(&scratch, vn, oprsz); 2964 } 2965 2966 fn(vd, vn, NULL, vm, oprsz, is_tbx); 2967 } 2968 2969 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm, 2970 uint32_t desc, bool is_tbx, tb_impl_fn *fn) 2971 { 2972 ARMVectorReg scratch; 2973 uintptr_t oprsz = simd_oprsz(desc); 2974 2975 if (unlikely(vd == vn0)) { 2976 vn0 = memcpy(&scratch, vn0, oprsz); 2977 if (vd == vn1) { 2978 vn1 = vn0; 2979 } 2980 } else if (unlikely(vd == vn1)) { 2981 vn1 = memcpy(&scratch, vn1, oprsz); 2982 } 2983 2984 fn(vd, vn0, vn1, vm, oprsz, is_tbx); 2985 } 2986 2987 #define DO_TB(SUFF, TYPE, H) \ 2988 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \ 2989 void *vm, uintptr_t oprsz, bool is_tbx) \ 2990 { \ 2991 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \ 2992 uintptr_t i, nelem = oprsz / sizeof(TYPE); \ 2993 for (i = 0; i < nelem; ++i) { \ 2994 TYPE index = indexes[H1(i)], val = 0; \ 2995 if (index < nelem) { \ 2996 val = tbl0[H(index)]; \ 2997 } else { \ 2998 index -= nelem; \ 2999 if (tbl1 && index < nelem) { \ 3000 val = tbl1[H(index)]; \ 3001 } else if (is_tbx) { \ 3002 continue; \ 3003 } \ 3004 } \ 3005 
d[H(i)] = val; \ 3006 } \ 3007 } \ 3008 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3009 { \ 3010 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \ 3011 } \ 3012 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \ 3013 void *vm, uint32_t desc) \ 3014 { \ 3015 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \ 3016 } \ 3017 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3018 { \ 3019 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \ 3020 } 3021 3022 DO_TB(b, uint8_t, H1) 3023 DO_TB(h, uint16_t, H2) 3024 DO_TB(s, uint32_t, H4) 3025 DO_TB(d, uint64_t, H8) 3026 3027 #undef DO_TB 3028 3029 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ 3030 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 3031 { \ 3032 intptr_t i, opr_sz = simd_oprsz(desc); \ 3033 TYPED *d = vd; \ 3034 TYPES *n = vn; \ 3035 ARMVectorReg tmp; \ 3036 if (unlikely(vn - vd < opr_sz)) { \ 3037 n = memcpy(&tmp, n, opr_sz / 2); \ 3038 } \ 3039 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ 3040 d[HD(i)] = n[HS(i)]; \ 3041 } \ 3042 } 3043 3044 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) 3045 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) 3046 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4) 3047 3048 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) 3049 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) 3050 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4) 3051 3052 #undef DO_UNPK 3053 3054 /* Mask of bits included in the even numbered predicates of width esz. 3055 * We also use this for expand_bits/compress_bits, and so extend the 3056 * same pattern out to 16-bit units. 3057 */ 3058 static const uint64_t even_bit_esz_masks[5] = { 3059 0x5555555555555555ull, 3060 0x3333333333333333ull, 3061 0x0f0f0f0f0f0f0f0full, 3062 0x00ff00ff00ff00ffull, 3063 0x0000ffff0000ffffull, 3064 }; 3065 3066 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits. 3067 * For N==0, this corresponds to the operation that in qemu/bitops.h 3068 * we call half_shuffle64; this algorithm is from Hacker's Delight, 3069 * section 7-2 Shuffling Bits. 3070 */ 3071 static uint64_t expand_bits(uint64_t x, int n) 3072 { 3073 int i; 3074 3075 x &= 0xffffffffu; 3076 for (i = 4; i >= n; i--) { 3077 int sh = 1 << i; 3078 x = ((x << sh) | x) & even_bit_esz_masks[i]; 3079 } 3080 return x; 3081 } 3082 3083 /* Compress units of 2**(N+1) bits to units of 2**N bits. 3084 * For N==0, this corresponds to the operation that in qemu/bitops.h 3085 * we call half_unshuffle64; this algorithm is from Hacker's Delight, 3086 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. 
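* E.g. for N==0, bit 2*i of the input is gathered into bit i of the result and the odd-numbered bits are discarded.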
3087 */ 3088 static uint64_t compress_bits(uint64_t x, int n) 3089 { 3090 int i; 3091 3092 for (i = n; i <= 4; i++) { 3093 int sh = 1 << i; 3094 x &= even_bit_esz_masks[i]; 3095 x = (x >> sh) | x; 3096 } 3097 return x & 0xffffffffu; 3098 } 3099 3100 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3101 { 3102 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3103 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3104 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3105 int esize = 1 << esz; 3106 uint64_t *d = vd; 3107 intptr_t i; 3108 3109 if (oprsz <= 8) { 3110 uint64_t nn = *(uint64_t *)vn; 3111 uint64_t mm = *(uint64_t *)vm; 3112 int half = 4 * oprsz; 3113 3114 nn = extract64(nn, high * half, half); 3115 mm = extract64(mm, high * half, half); 3116 nn = expand_bits(nn, esz); 3117 mm = expand_bits(mm, esz); 3118 d[0] = nn | (mm << esize); 3119 } else { 3120 ARMPredicateReg tmp; 3121 3122 /* We produce output faster than we consume input. 3123 Therefore we must be mindful of possible overlap. */ 3124 if (vd == vn) { 3125 vn = memcpy(&tmp, vn, oprsz); 3126 if (vd == vm) { 3127 vm = vn; 3128 } 3129 } else if (vd == vm) { 3130 vm = memcpy(&tmp, vm, oprsz); 3131 } 3132 if (high) { 3133 high = oprsz >> 1; 3134 } 3135 3136 if ((oprsz & 7) == 0) { 3137 uint32_t *n = vn, *m = vm; 3138 high >>= 2; 3139 3140 for (i = 0; i < oprsz / 8; i++) { 3141 uint64_t nn = n[H4(high + i)]; 3142 uint64_t mm = m[H4(high + i)]; 3143 3144 nn = expand_bits(nn, esz); 3145 mm = expand_bits(mm, esz); 3146 d[i] = nn | (mm << esize); 3147 } 3148 } else { 3149 uint8_t *n = vn, *m = vm; 3150 uint16_t *d16 = vd; 3151 3152 for (i = 0; i < oprsz / 2; i++) { 3153 uint16_t nn = n[H1(high + i)]; 3154 uint16_t mm = m[H1(high + i)]; 3155 3156 nn = expand_bits(nn, esz); 3157 mm = expand_bits(mm, esz); 3158 d16[H2(i)] = nn | (mm << esize); 3159 } 3160 } 3161 } 3162 } 3163 3164 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3165 { 3166 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3167 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3168 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz; 3169 uint64_t *d = vd, *n = vn, *m = vm; 3170 uint64_t l, h; 3171 intptr_t i; 3172 3173 if (oprsz <= 8) { 3174 l = compress_bits(n[0] >> odd, esz); 3175 h = compress_bits(m[0] >> odd, esz); 3176 d[0] = l | (h << (4 * oprsz)); 3177 } else { 3178 ARMPredicateReg tmp_m; 3179 intptr_t oprsz_16 = oprsz / 16; 3180 3181 if ((vm - vd) < (uintptr_t)oprsz) { 3182 m = memcpy(&tmp_m, vm, oprsz); 3183 } 3184 3185 for (i = 0; i < oprsz_16; i++) { 3186 l = n[2 * i + 0]; 3187 h = n[2 * i + 1]; 3188 l = compress_bits(l >> odd, esz); 3189 h = compress_bits(h >> odd, esz); 3190 d[i] = l | (h << 32); 3191 } 3192 3193 /* 3194 * For VL which is not a multiple of 512, the results from M do not 3195 * align nicely with the uint64_t for D. Put the aligned results 3196 * from M into TMP_M and then copy it into place afterward. 
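* The code below repeats the same compression for M, writing into TMP_M, and swap_memmove then places those bytes at the correct, not necessarily 8-byte aligned, offset within D.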
3197 */ 3198 if (oprsz & 15) { 3199 int final_shift = (oprsz & 15) * 2; 3200 3201 l = n[2 * i + 0]; 3202 h = n[2 * i + 1]; 3203 l = compress_bits(l >> odd, esz); 3204 h = compress_bits(h >> odd, esz); 3205 d[i] = l | (h << final_shift); 3206 3207 for (i = 0; i < oprsz_16; i++) { 3208 l = m[2 * i + 0]; 3209 h = m[2 * i + 1]; 3210 l = compress_bits(l >> odd, esz); 3211 h = compress_bits(h >> odd, esz); 3212 tmp_m.p[i] = l | (h << 32); 3213 } 3214 l = m[2 * i + 0]; 3215 h = m[2 * i + 1]; 3216 l = compress_bits(l >> odd, esz); 3217 h = compress_bits(h >> odd, esz); 3218 tmp_m.p[i] = l | (h << final_shift); 3219 3220 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); 3221 } else { 3222 for (i = 0; i < oprsz_16; i++) { 3223 l = m[2 * i + 0]; 3224 h = m[2 * i + 1]; 3225 l = compress_bits(l >> odd, esz); 3226 h = compress_bits(h >> odd, esz); 3227 d[oprsz_16 + i] = l | (h << 32); 3228 } 3229 } 3230 } 3231 } 3232 3233 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3234 { 3235 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3236 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3237 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA); 3238 uint64_t *d = vd, *n = vn, *m = vm; 3239 uint64_t mask; 3240 int shr, shl; 3241 intptr_t i; 3242 3243 shl = 1 << esz; 3244 shr = 0; 3245 mask = even_bit_esz_masks[esz]; 3246 if (odd) { 3247 mask <<= shl; 3248 shr = shl; 3249 shl = 0; 3250 } 3251 3252 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { 3253 uint64_t nn = (n[i] & mask) >> shr; 3254 uint64_t mm = (m[i] & mask) << shl; 3255 d[i] = nn + mm; 3256 } 3257 } 3258 3259 /* Reverse units of 2**N bits. */ 3260 static uint64_t reverse_bits_64(uint64_t x, int n) 3261 { 3262 int i, sh; 3263 3264 x = bswap64(x); 3265 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3266 uint64_t mask = even_bit_esz_masks[i]; 3267 x = ((x & mask) << sh) | ((x >> sh) & mask); 3268 } 3269 return x; 3270 } 3271 3272 static uint8_t reverse_bits_8(uint8_t x, int n) 3273 { 3274 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; 3275 int i, sh; 3276 3277 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3278 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); 3279 } 3280 return x; 3281 } 3282 3283 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) 3284 { 3285 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3286 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3287 intptr_t i, oprsz_2 = oprsz / 2; 3288 3289 if (oprsz <= 8) { 3290 uint64_t l = *(uint64_t *)vn; 3291 l = reverse_bits_64(l << (64 - 8 * oprsz), esz); 3292 *(uint64_t *)vd = l; 3293 } else if ((oprsz & 15) == 0) { 3294 for (i = 0; i < oprsz_2; i += 8) { 3295 intptr_t ih = oprsz - 8 - i; 3296 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); 3297 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); 3298 *(uint64_t *)(vd + i) = h; 3299 *(uint64_t *)(vd + ih) = l; 3300 } 3301 } else { 3302 for (i = 0; i < oprsz_2; i += 1) { 3303 intptr_t il = H1(i); 3304 intptr_t ih = H1(oprsz - 1 - i); 3305 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); 3306 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); 3307 *(uint8_t *)(vd + il) = h; 3308 *(uint8_t *)(vd + ih) = l; 3309 } 3310 } 3311 } 3312 3313 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) 3314 { 3315 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3316 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3317 uint64_t *d = vd; 3318 intptr_t i; 3319 3320 if (oprsz <= 8) { 3321 uint64_t nn = *(uint64_t *)vn; 3322 int half = 4 * oprsz; 3323 3324 nn = 
extract64(nn, high * half, half); 3325 nn = expand_bits(nn, 0); 3326 d[0] = nn; 3327 } else { 3328 ARMPredicateReg tmp_n; 3329 3330 /* We produce output faster than we consume input. 3331 Therefore we must be mindful of possible overlap. */ 3332 if ((vn - vd) < (uintptr_t)oprsz) { 3333 vn = memcpy(&tmp_n, vn, oprsz); 3334 } 3335 if (high) { 3336 high = oprsz >> 1; 3337 } 3338 3339 if ((oprsz & 7) == 0) { 3340 uint32_t *n = vn; 3341 high >>= 2; 3342 3343 for (i = 0; i < oprsz / 8; i++) { 3344 uint64_t nn = n[H4(high + i)]; 3345 d[i] = expand_bits(nn, 0); 3346 } 3347 } else { 3348 uint16_t *d16 = vd; 3349 uint8_t *n = vn; 3350 3351 for (i = 0; i < oprsz / 2; i++) { 3352 uint16_t nn = n[H1(high + i)]; 3353 d16[H2(i)] = expand_bits(nn, 0); 3354 } 3355 } 3356 } 3357 } 3358 3359 #define DO_ZIP(NAME, TYPE, H) \ 3360 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3361 { \ 3362 intptr_t oprsz = simd_oprsz(desc); \ 3363 intptr_t odd_ofs = simd_data(desc); \ 3364 intptr_t i, oprsz_2 = oprsz / 2; \ 3365 ARMVectorReg tmp_n, tmp_m; \ 3366 /* We produce output faster than we consume input. \ 3367 Therefore we must be mindful of possible overlap. */ \ 3368 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ 3369 vn = memcpy(&tmp_n, vn, oprsz); \ 3370 } \ 3371 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3372 vm = memcpy(&tmp_m, vm, oprsz); \ 3373 } \ 3374 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ 3375 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \ 3376 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \ 3377 *(TYPE *)(vm + odd_ofs + H(i)); \ 3378 } \ 3379 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3380 memset(vd + oprsz - 16, 0, 16); \ 3381 } \ 3382 } 3383 3384 DO_ZIP(sve_zip_b, uint8_t, H1) 3385 DO_ZIP(sve_zip_h, uint16_t, H1_2) 3386 DO_ZIP(sve_zip_s, uint32_t, H1_4) 3387 DO_ZIP(sve_zip_d, uint64_t, H1_8) 3388 DO_ZIP(sve2_zip_q, Int128, ) 3389 3390 #define DO_UZP(NAME, TYPE, H) \ 3391 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3392 { \ 3393 intptr_t oprsz = simd_oprsz(desc); \ 3394 intptr_t odd_ofs = simd_data(desc); \ 3395 intptr_t i, p; \ 3396 ARMVectorReg tmp_m; \ 3397 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3398 vm = memcpy(&tmp_m, vm, oprsz); \ 3399 } \ 3400 i = 0, p = odd_ofs; \ 3401 do { \ 3402 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \ 3403 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3404 } while (p < oprsz); \ 3405 p -= oprsz; \ 3406 do { \ 3407 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \ 3408 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3409 } while (p < oprsz); \ 3410 tcg_debug_assert(i == oprsz); \ 3411 } 3412 3413 DO_UZP(sve_uzp_b, uint8_t, H1) 3414 DO_UZP(sve_uzp_h, uint16_t, H1_2) 3415 DO_UZP(sve_uzp_s, uint32_t, H1_4) 3416 DO_UZP(sve_uzp_d, uint64_t, H1_8) 3417 DO_UZP(sve2_uzp_q, Int128, ) 3418 3419 #define DO_TRN(NAME, TYPE, H) \ 3420 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3421 { \ 3422 intptr_t oprsz = simd_oprsz(desc); \ 3423 intptr_t odd_ofs = simd_data(desc); \ 3424 intptr_t i; \ 3425 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ 3426 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ 3427 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ 3428 *(TYPE *)(vd + H(i + 0)) = ae; \ 3429 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ 3430 } \ 3431 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3432 memset(vd + oprsz - 16, 0, 16); \ 3433 } \ 3434 } 3435 3436 DO_TRN(sve_trn_b, uint8_t, H1) 3437 DO_TRN(sve_trn_h, uint16_t, H1_2) 3438 DO_TRN(sve_trn_s, uint32_t, H1_4) 3439 DO_TRN(sve_trn_d, 
uint64_t, H1_8) 3440 DO_TRN(sve2_trn_q, Int128, ) 3441 3442 #undef DO_ZIP 3443 #undef DO_UZP 3444 #undef DO_TRN 3445 3446 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) 3447 { 3448 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; 3449 uint32_t *d = vd, *n = vn; 3450 uint8_t *pg = vg; 3451 3452 for (i = j = 0; i < opr_sz; i++) { 3453 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { 3454 d[H4(j)] = n[H4(i)]; 3455 j++; 3456 } 3457 } 3458 for (; j < opr_sz; j++) { 3459 d[H4(j)] = 0; 3460 } 3461 } 3462 3463 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) 3464 { 3465 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; 3466 uint64_t *d = vd, *n = vn; 3467 uint8_t *pg = vg; 3468 3469 for (i = j = 0; i < opr_sz; i++) { 3470 if (pg[H1(i)] & 1) { 3471 d[j] = n[i]; 3472 j++; 3473 } 3474 } 3475 for (; j < opr_sz; j++) { 3476 d[j] = 0; 3477 } 3478 } 3479 3480 /* Similar to the ARM LastActiveElement pseudocode function, except the 3481 * result is multiplied by the element size. This includes the not found 3482 * indication; e.g. not found for esz=3 is -8. 3483 */ 3484 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) 3485 { 3486 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 3487 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3488 3489 return last_active_element(vg, words, esz); 3490 } 3491 3492 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) 3493 { 3494 intptr_t opr_sz = simd_oprsz(desc) / 8; 3495 int esz = simd_data(desc); 3496 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; 3497 intptr_t i, first_i, last_i; 3498 ARMVectorReg tmp; 3499 3500 first_i = last_i = 0; 3501 first_g = last_g = 0; 3502 3503 /* Find the extent of the active elements within VG. 
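     *
     * Worked example (illustrative values only): with esz == 2 (one
     * predicate bit per 4 vector bytes) and only predicate bits 4 and 12
     * active, the scan below leaves first_g/last_g pointing at guard
     * word 0, and then
     *
     *     first_i = 0 * 8 + ctz64(first_g)      = 4
     *     last_i  = 0 * 8 + 63 - clz64(last_g)  = 12
     *     len     = 12 - 4 + (1 << 2)           = 12
     *
     * so bytes 4..15 of VN (the extent from the first through the last
     * active element, inactive elements included) are moved to the
     * bottom of VD and the remainder is filled from VM.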
*/ 3504 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { 3505 pg = *(uint64_t *)(vg + i) & mask; 3506 if (pg) { 3507 if (last_g == 0) { 3508 last_g = pg; 3509 last_i = i; 3510 } 3511 first_g = pg; 3512 first_i = i; 3513 } 3514 } 3515 3516 len = 0; 3517 if (first_g != 0) { 3518 first_i = first_i * 8 + ctz64(first_g); 3519 last_i = last_i * 8 + 63 - clz64(last_g); 3520 len = last_i - first_i + (1 << esz); 3521 if (vd == vm) { 3522 vm = memcpy(&tmp, vm, opr_sz * 8); 3523 } 3524 swap_memmove(vd, vn + first_i, len); 3525 } 3526 swap_memmove(vd + len, vm, opr_sz * 8 - len); 3527 } 3528 3529 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, 3530 void *vg, uint32_t desc) 3531 { 3532 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3533 uint64_t *d = vd, *n = vn, *m = vm; 3534 uint8_t *pg = vg; 3535 3536 for (i = 0; i < opr_sz; i += 1) { 3537 uint64_t nn = n[i], mm = m[i]; 3538 uint64_t pp = expand_pred_b(pg[H1(i)]); 3539 d[i] = (nn & pp) | (mm & ~pp); 3540 } 3541 } 3542 3543 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, 3544 void *vg, uint32_t desc) 3545 { 3546 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3547 uint64_t *d = vd, *n = vn, *m = vm; 3548 uint8_t *pg = vg; 3549 3550 for (i = 0; i < opr_sz; i += 1) { 3551 uint64_t nn = n[i], mm = m[i]; 3552 uint64_t pp = expand_pred_h(pg[H1(i)]); 3553 d[i] = (nn & pp) | (mm & ~pp); 3554 } 3555 } 3556 3557 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, 3558 void *vg, uint32_t desc) 3559 { 3560 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3561 uint64_t *d = vd, *n = vn, *m = vm; 3562 uint8_t *pg = vg; 3563 3564 for (i = 0; i < opr_sz; i += 1) { 3565 uint64_t nn = n[i], mm = m[i]; 3566 uint64_t pp = expand_pred_s(pg[H1(i)]); 3567 d[i] = (nn & pp) | (mm & ~pp); 3568 } 3569 } 3570 3571 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, 3572 void *vg, uint32_t desc) 3573 { 3574 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3575 uint64_t *d = vd, *n = vn, *m = vm; 3576 uint8_t *pg = vg; 3577 3578 for (i = 0; i < opr_sz; i += 1) { 3579 uint64_t nn = n[i], mm = m[i]; 3580 d[i] = (pg[H1(i)] & 1 ? nn : mm); 3581 } 3582 } 3583 3584 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm, 3585 void *vg, uint32_t desc) 3586 { 3587 intptr_t i, opr_sz = simd_oprsz(desc) / 16; 3588 Int128 *d = vd, *n = vn, *m = vm; 3589 uint16_t *pg = vg; 3590 3591 for (i = 0; i < opr_sz; i += 1) { 3592 d[i] = (pg[H2(i)] & 1 ? n : m)[i]; 3593 } 3594 } 3595 3596 /* Two operand comparison controlled by a predicate. 3597 * ??? It is very tempting to want to be able to expand this inline 3598 * with x86 instructions, e.g. 3599 * 3600 * vcmpeqw zm, zn, %ymm0 3601 * vpmovmskb %ymm0, %eax 3602 * and $0x5555, %eax 3603 * and pg, %eax 3604 * 3605 * or even aarch64, e.g. 3606 * 3607 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 3608 * cmeq v0.8h, zn, zm 3609 * and v0.8h, v0.8h, mask 3610 * addv h0, v0.8h 3611 * and v0.8b, pg 3612 * 3613 * However, coming up with an abstraction that allows vector inputs and 3614 * a scalar output, and also handles the byte-ordering of sub-uint64_t 3615 * scalar outputs, is tricky. 
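 *
 * For reference (a description of the C expansion below, not a proposed
 * host sequence): each 64-byte chunk of the inputs yields one 64-bit
 * predicate word.  The inner loop walks backward, shifting OUT left by
 * sizeof(TYPE) and ORing each 0/1 result into bit 0, so e.g. for halfword
 * elements the 32 results land in bits 0, 2, 4, ... 62 -- exactly the
 * bits kept by the 0x5555...ull MASK used for the _h instantiations.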
3616 */ 3617 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ 3618 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3619 { \ 3620 intptr_t opr_sz = simd_oprsz(desc); \ 3621 uint32_t flags = PREDTEST_INIT; \ 3622 intptr_t i = opr_sz; \ 3623 do { \ 3624 uint64_t out = 0, pg; \ 3625 do { \ 3626 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3627 TYPE nn = *(TYPE *)(vn + H(i)); \ 3628 TYPE mm = *(TYPE *)(vm + H(i)); \ 3629 out |= nn OP mm; \ 3630 } while (i & 63); \ 3631 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3632 out &= pg; \ 3633 *(uint64_t *)(vd + (i >> 3)) = out; \ 3634 flags = iter_predtest_bwd(out, pg, flags); \ 3635 } while (i > 0); \ 3636 return flags; \ 3637 } 3638 3639 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ 3640 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3641 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ 3642 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3643 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ 3644 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3645 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ 3646 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3647 3648 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) 3649 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) 3650 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) 3651 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) 3652 3653 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) 3654 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) 3655 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) 3656 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) 3657 3658 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) 3659 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) 3660 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) 3661 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) 3662 3663 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) 3664 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) 3665 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) 3666 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) 3667 3668 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) 3669 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) 3670 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) 3671 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) 3672 3673 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) 3674 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) 3675 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) 3676 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) 3677 3678 #undef DO_CMP_PPZZ_B 3679 #undef DO_CMP_PPZZ_H 3680 #undef DO_CMP_PPZZ_S 3681 #undef DO_CMP_PPZZ_D 3682 #undef DO_CMP_PPZZ 3683 3684 /* Similar, but the second source is "wide". 
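 *
 * Here "wide" means the second operand is a vector of 64-bit elements:
 * each 64-bit element of VM is compared against every narrow element of
 * VN that occupies the same 8-byte slot, hence the extra inner loop
 * bounded by (i & 7) below.  For example (illustrative only),
 *
 *     cmpgt p0.b, p1/z, z2.b, z3.d
 *
 * compares bytes 0..7 of z2 against z3.d[0], bytes 8..15 against
 * z3.d[1], and so on.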
*/ 3685 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ 3686 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3687 { \ 3688 intptr_t opr_sz = simd_oprsz(desc); \ 3689 uint32_t flags = PREDTEST_INIT; \ 3690 intptr_t i = opr_sz; \ 3691 do { \ 3692 uint64_t out = 0, pg; \ 3693 do { \ 3694 TYPEW mm = *(TYPEW *)(vm + i - 8); \ 3695 do { \ 3696 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3697 TYPE nn = *(TYPE *)(vn + H(i)); \ 3698 out |= nn OP mm; \ 3699 } while (i & 7); \ 3700 } while (i & 63); \ 3701 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3702 out &= pg; \ 3703 *(uint64_t *)(vd + (i >> 3)) = out; \ 3704 flags = iter_predtest_bwd(out, pg, flags); \ 3705 } while (i > 0); \ 3706 return flags; \ 3707 } 3708 3709 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ 3710 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) 3711 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ 3712 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) 3713 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ 3714 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) 3715 3716 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==) 3717 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==) 3718 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==) 3719 3720 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=) 3721 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=) 3722 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=) 3723 3724 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) 3725 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) 3726 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) 3727 3728 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) 3729 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) 3730 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) 3731 3732 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) 3733 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) 3734 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) 3735 3736 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) 3737 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) 3738 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) 3739 3740 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) 3741 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) 3742 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) 3743 3744 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) 3745 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) 3746 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) 3747 3748 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) 3749 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) 3750 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) 3751 3752 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) 3753 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) 3754 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) 3755 3756 #undef DO_CMP_PPZW_B 3757 #undef DO_CMP_PPZW_H 3758 #undef DO_CMP_PPZW_S 3759 #undef DO_CMP_PPZW 3760 3761 /* Similar, but the second source is immediate. 
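 *
 * The immediate is carried in the simd_data field of DESC and simply
 * converted to TYPE below; this is assumed to be sufficient for both the
 * signed compare forms (immediates in -16..15) and the unsigned forms
 * (immediates in 0..127).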
*/ 3762 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ 3763 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 3764 { \ 3765 intptr_t opr_sz = simd_oprsz(desc); \ 3766 uint32_t flags = PREDTEST_INIT; \ 3767 TYPE mm = simd_data(desc); \ 3768 intptr_t i = opr_sz; \ 3769 do { \ 3770 uint64_t out = 0, pg; \ 3771 do { \ 3772 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3773 TYPE nn = *(TYPE *)(vn + H(i)); \ 3774 out |= nn OP mm; \ 3775 } while (i & 63); \ 3776 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3777 out &= pg; \ 3778 *(uint64_t *)(vd + (i >> 3)) = out; \ 3779 flags = iter_predtest_bwd(out, pg, flags); \ 3780 } while (i > 0); \ 3781 return flags; \ 3782 } 3783 3784 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \ 3785 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3786 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \ 3787 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3788 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \ 3789 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3790 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \ 3791 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3792 3793 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) 3794 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) 3795 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) 3796 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) 3797 3798 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) 3799 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) 3800 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) 3801 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) 3802 3803 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) 3804 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) 3805 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) 3806 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) 3807 3808 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) 3809 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) 3810 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) 3811 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) 3812 3813 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) 3814 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) 3815 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) 3816 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) 3817 3818 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) 3819 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) 3820 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) 3821 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) 3822 3823 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) 3824 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) 3825 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) 3826 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) 3827 3828 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) 3829 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) 3830 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) 3831 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) 3832 3833 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) 3834 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) 3835 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) 3836 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) 3837 3838 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) 3839 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) 3840 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) 3841 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) 3842 3843 #undef DO_CMP_PPZI_B 3844 #undef DO_CMP_PPZI_H 3845 #undef DO_CMP_PPZI_S 3846 #undef DO_CMP_PPZI_D 3847 #undef DO_CMP_PPZI 3848 3849 /* Similar to the ARM LastActive pseudocode function. 
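 *
 * The scan below works downward over 64-bit guard words; within the
 * first non-zero word, pow2floor() isolates the most significant guard
 * bit, i.e. the last active element.  Worked example (illustrative
 * values): with G = 0x0014 and D = 0x0010, pow2floor(G) == 0x0010 and
 * (0x0010 & D) != 0, so the last active element of D is true.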
 */
static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
{
    intptr_t i;

    for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
        uint64_t pg = *(uint64_t *)(vg + i);
        if (pg) {
            return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
        }
    }
    return 0;
}

/* Compute a mask into RETB that is true for all G, up to and including
 * (if after) or excluding (if !after) the first G & N.
 * Return true if BRK found.
 */
static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
                        bool brk, bool after)
{
    uint64_t b;

    if (brk) {
        b = 0;
    } else if ((g & n) == 0) {
        /* For all G, no N are set; break not found. */
        b = g;
    } else {
        /* Break somewhere in N.  Locate it. */
        b = g & n;            /* guard true, pred true */
        b = b & -b;           /* first such */
        if (after) {
            b = b | (b - 1);  /* break after same */
        } else {
            b = b - 1;        /* break before same */
        }
        brk = true;
    }

    *retb = b;
    return brk;
}

/* Compute a zeroing BRK. */
static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    bool brk = false;
    intptr_t i;

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
        uint64_t this_b, this_g = g[i];

        brk = compute_brk(&this_b, n[i], this_g, brk, after);
        d[i] = this_b & this_g;
    }
}

/* Likewise, but also compute flags. */
static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
                               intptr_t oprsz, bool after)
{
    uint32_t flags = PREDTEST_INIT;
    bool brk = false;
    intptr_t i;

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
        uint64_t this_b, this_d, this_g = g[i];

        brk = compute_brk(&this_b, n[i], this_g, brk, after);
        d[i] = this_d = this_b & this_g;
        flags = iter_predtest_fwd(this_d, this_g, flags);
    }
    return flags;
}

/* Compute a merging BRK. */
static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    bool brk = false;
    intptr_t i;

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
        uint64_t this_b, this_g = g[i];

        brk = compute_brk(&this_b, n[i], this_g, brk, after);
        d[i] = (this_b & this_g) | (d[i] & ~this_g);
    }
}

/* Likewise, but also compute flags. */
static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
                               intptr_t oprsz, bool after)
{
    uint32_t flags = PREDTEST_INIT;
    bool brk = false;
    intptr_t i;

    /* Round up, as the final predicate word may be partial. */
    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
        uint64_t this_b, this_d = d[i], this_g = g[i];

        brk = compute_brk(&this_b, n[i], this_g, brk, after);
        d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
        flags = iter_predtest_fwd(this_d, this_g, flags);
    }
    return flags;
}

static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
{
    /* It is quicker to zero the whole predicate than loop on OPRSZ.
     * The compiler should turn this into 4 64-bit integer stores.
3963 */ 3964 memset(d, 0, sizeof(ARMPredicateReg)); 3965 return PREDTEST_INIT; 3966 } 3967 3968 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, 3969 uint32_t pred_desc) 3970 { 3971 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3972 if (last_active_pred(vn, vg, oprsz)) { 3973 compute_brk_z(vd, vm, vg, oprsz, true); 3974 } else { 3975 do_zero(vd, oprsz); 3976 } 3977 } 3978 3979 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, 3980 uint32_t pred_desc) 3981 { 3982 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3983 if (last_active_pred(vn, vg, oprsz)) { 3984 return compute_brks_z(vd, vm, vg, oprsz, true); 3985 } else { 3986 return do_zero(vd, oprsz); 3987 } 3988 } 3989 3990 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, 3991 uint32_t pred_desc) 3992 { 3993 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3994 if (last_active_pred(vn, vg, oprsz)) { 3995 compute_brk_z(vd, vm, vg, oprsz, false); 3996 } else { 3997 do_zero(vd, oprsz); 3998 } 3999 } 4000 4001 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, 4002 uint32_t pred_desc) 4003 { 4004 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4005 if (last_active_pred(vn, vg, oprsz)) { 4006 return compute_brks_z(vd, vm, vg, oprsz, false); 4007 } else { 4008 return do_zero(vd, oprsz); 4009 } 4010 } 4011 4012 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4013 { 4014 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4015 compute_brk_z(vd, vn, vg, oprsz, true); 4016 } 4017 4018 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4019 { 4020 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4021 return compute_brks_z(vd, vn, vg, oprsz, true); 4022 } 4023 4024 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4025 { 4026 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4027 compute_brk_z(vd, vn, vg, oprsz, false); 4028 } 4029 4030 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4031 { 4032 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4033 return compute_brks_z(vd, vn, vg, oprsz, false); 4034 } 4035 4036 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4037 { 4038 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4039 compute_brk_m(vd, vn, vg, oprsz, true); 4040 } 4041 4042 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4043 { 4044 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4045 return compute_brks_m(vd, vn, vg, oprsz, true); 4046 } 4047 4048 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4049 { 4050 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4051 compute_brk_m(vd, vn, vg, oprsz, false); 4052 } 4053 4054 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4055 { 4056 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4057 return compute_brks_m(vd, vn, vg, oprsz, false); 4058 } 4059 4060 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4061 { 4062 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4063 if (!last_active_pred(vn, vg, oprsz)) { 4064 do_zero(vd, oprsz); 4065 } 4066 } 4067 4068 /* As if PredTest(Ones(PL), D, esz). 
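 *
 * ESZ_MASK stands in for Ones(PL) at the given element size: one bit set
 * per element, e.g. 0x5555...ull for halfword elements.  When the
 * predicate size in bytes is not a multiple of 8, the trailing partial
 * word is handled by further masking ESZ_MASK down to the low
 * 8 * (oprsz & 7) bits, as the function below does.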
*/ 4069 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz, 4070 uint64_t esz_mask) 4071 { 4072 uint32_t flags = PREDTEST_INIT; 4073 intptr_t i; 4074 4075 for (i = 0; i < oprsz / 8; i++) { 4076 flags = iter_predtest_fwd(d->p[i], esz_mask, flags); 4077 } 4078 if (oprsz & 7) { 4079 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); 4080 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags); 4081 } 4082 return flags; 4083 } 4084 4085 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4086 { 4087 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4088 if (last_active_pred(vn, vg, oprsz)) { 4089 return predtest_ones(vd, oprsz, -1); 4090 } else { 4091 return do_zero(vd, oprsz); 4092 } 4093 } 4094 4095 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) 4096 { 4097 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 4098 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4099 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; 4100 intptr_t i; 4101 4102 for (i = 0; i < words; ++i) { 4103 uint64_t t = n[i] & g[i] & mask; 4104 sum += ctpop64(t); 4105 } 4106 return sum; 4107 } 4108 4109 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) 4110 { 4111 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4112 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4113 uint64_t esz_mask = pred_esz_masks[esz]; 4114 ARMPredicateReg *d = vd; 4115 uint32_t flags; 4116 intptr_t i; 4117 4118 /* Begin with a zero predicate register. */ 4119 flags = do_zero(d, oprsz); 4120 if (count == 0) { 4121 return flags; 4122 } 4123 4124 /* Set all of the requested bits. */ 4125 for (i = 0; i < count / 64; ++i) { 4126 d->p[i] = esz_mask; 4127 } 4128 if (count & 63) { 4129 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; 4130 } 4131 4132 return predtest_ones(d, oprsz, esz_mask); 4133 } 4134 4135 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) 4136 { 4137 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4138 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4139 uint64_t esz_mask = pred_esz_masks[esz]; 4140 ARMPredicateReg *d = vd; 4141 intptr_t i, invcount, oprbits; 4142 uint64_t bits; 4143 4144 if (count == 0) { 4145 return do_zero(d, oprsz); 4146 } 4147 4148 oprbits = oprsz * 8; 4149 tcg_debug_assert(count <= oprbits); 4150 4151 bits = esz_mask; 4152 if (oprbits & 63) { 4153 bits &= MAKE_64BIT_MASK(0, oprbits & 63); 4154 } 4155 4156 invcount = oprbits - count; 4157 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) { 4158 d->p[i] = bits; 4159 bits = esz_mask; 4160 } 4161 4162 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64); 4163 4164 while (--i >= 0) { 4165 d->p[i] = 0; 4166 } 4167 4168 return predtest_ones(d, oprsz, esz_mask); 4169 } 4170 4171 /* Recursive reduction on a function; 4172 * C.f. the ARM ARM function ReducePredicated. 4173 * 4174 * While it would be possible to write this without the DATA temporary, 4175 * it is much simpler to process the predicate register this way. 4176 * The recursion is bounded to depth 7 (128 fp16 elements), so there's 4177 * little to gain with a more complex non-recursive form. 
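 *
 * The reduction below is a balanced binary tree: inactive and trailing
 * elements are pre-filled with IDENT (assuming MAXSZ, as passed in the
 * descriptor, is a power of two), so e.g. for eight elements the helper
 * computes
 *
 *     ((d0 + d1) + (d2 + d3)) + ((d4 + d5) + (d6 + d7))
 *
 * with "+" standing for FUNC.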
4178 */ 4179 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \ 4180 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \ 4181 { \ 4182 if (n == 1) { \ 4183 return *data; \ 4184 } else { \ 4185 uintptr_t half = n / 2; \ 4186 TYPE lo = NAME##_reduce(data, status, half); \ 4187 TYPE hi = NAME##_reduce(data + half, status, half); \ 4188 return TYPE##_##FUNC(lo, hi, status); \ 4189 } \ 4190 } \ 4191 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \ 4192 { \ 4193 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ 4194 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ 4195 for (i = 0; i < oprsz; ) { \ 4196 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 4197 do { \ 4198 TYPE nn = *(TYPE *)(vn + H(i)); \ 4199 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \ 4200 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 4201 } while (i & 15); \ 4202 } \ 4203 for (; i < maxsz; i += sizeof(TYPE)) { \ 4204 *(TYPE *)((void *)data + i) = IDENT; \ 4205 } \ 4206 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \ 4207 } 4208 4209 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero) 4210 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero) 4211 DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero) 4212 4213 /* Identity is floatN_default_nan, without the function call. */ 4214 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00) 4215 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000) 4216 DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL) 4217 4218 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00) 4219 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000) 4220 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL) 4221 4222 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity) 4223 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity) 4224 DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity) 4225 4226 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity)) 4227 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity)) 4228 DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity)) 4229 4230 #undef DO_REDUCE 4231 4232 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg, 4233 void *status, uint32_t desc) 4234 { 4235 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4236 float16 result = nn; 4237 4238 do { 4239 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4240 do { 4241 if (pg & 1) { 4242 float16 mm = *(float16 *)(vm + H1_2(i)); 4243 result = float16_add(result, mm, status); 4244 } 4245 i += sizeof(float16), pg >>= sizeof(float16); 4246 } while (i & 15); 4247 } while (i < opr_sz); 4248 4249 return result; 4250 } 4251 4252 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg, 4253 void *status, uint32_t desc) 4254 { 4255 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4256 float32 result = nn; 4257 4258 do { 4259 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4260 do { 4261 if (pg & 1) { 4262 float32 mm = *(float32 *)(vm + H1_2(i)); 4263 result = float32_add(result, mm, status); 4264 } 4265 i += sizeof(float32), pg >>= sizeof(float32); 4266 } while (i & 15); 4267 } while (i < opr_sz); 4268 4269 return result; 4270 } 4271 4272 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg, 4273 void *status, uint32_t desc) 4274 { 4275 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8; 4276 uint64_t *m = vm; 4277 uint8_t *pg = vg; 4278 4279 for (i = 0; i < opr_sz; i++) { 4280 if (pg[H1(i)] & 1) { 4281 nn = float64_add(nn, 
m[i], status); 4282 } 4283 } 4284 4285 return nn; 4286 } 4287 4288 /* Fully general three-operand expander, controlled by a predicate, 4289 * With the extra float_status parameter. 4290 */ 4291 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \ 4292 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4293 void *status, uint32_t desc) \ 4294 { \ 4295 intptr_t i = simd_oprsz(desc); \ 4296 uint64_t *g = vg; \ 4297 do { \ 4298 uint64_t pg = g[(i - 1) >> 6]; \ 4299 do { \ 4300 i -= sizeof(TYPE); \ 4301 if (likely((pg >> (i & 63)) & 1)) { \ 4302 TYPE nn = *(TYPE *)(vn + H(i)); \ 4303 TYPE mm = *(TYPE *)(vm + H(i)); \ 4304 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4305 } \ 4306 } while (i & 63); \ 4307 } while (i != 0); \ 4308 } 4309 4310 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) 4311 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) 4312 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) 4313 4314 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) 4315 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) 4316 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) 4317 4318 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) 4319 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) 4320 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) 4321 4322 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) 4323 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) 4324 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) 4325 4326 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) 4327 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) 4328 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) 4329 4330 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) 4331 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) 4332 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) 4333 4334 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) 4335 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) 4336 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) 4337 4338 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) 4339 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) 4340 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) 4341 4342 static inline float16 abd_h(float16 a, float16 b, float_status *s) 4343 { 4344 return float16_abs(float16_sub(a, b, s)); 4345 } 4346 4347 static inline float32 abd_s(float32 a, float32 b, float_status *s) 4348 { 4349 return float32_abs(float32_sub(a, b, s)); 4350 } 4351 4352 static inline float64 abd_d(float64 a, float64 b, float_status *s) 4353 { 4354 return float64_abs(float64_sub(a, b, s)); 4355 } 4356 4357 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) 4358 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) 4359 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) 4360 4361 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) 4362 { 4363 int b_int = MIN(MAX(b, INT_MIN), INT_MAX); 4364 return float64_scalbn(a, b_int, s); 4365 } 4366 4367 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) 4368 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) 4369 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d) 4370 4371 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) 4372 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) 4373 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd) 4374 4375 #undef DO_ZPZZ_FP 4376 4377 /* Three-operand expander, with one scalar operand, controlled by 4378 * a predicate, with the extra float_status parameter. 
4379 */ 4380 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \ 4381 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ 4382 void *status, uint32_t desc) \ 4383 { \ 4384 intptr_t i = simd_oprsz(desc); \ 4385 uint64_t *g = vg; \ 4386 TYPE mm = scalar; \ 4387 do { \ 4388 uint64_t pg = g[(i - 1) >> 6]; \ 4389 do { \ 4390 i -= sizeof(TYPE); \ 4391 if (likely((pg >> (i & 63)) & 1)) { \ 4392 TYPE nn = *(TYPE *)(vn + H(i)); \ 4393 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4394 } \ 4395 } while (i & 63); \ 4396 } while (i != 0); \ 4397 } 4398 4399 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) 4400 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) 4401 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add) 4402 4403 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) 4404 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) 4405 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub) 4406 4407 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) 4408 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) 4409 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul) 4410 4411 static inline float16 subr_h(float16 a, float16 b, float_status *s) 4412 { 4413 return float16_sub(b, a, s); 4414 } 4415 4416 static inline float32 subr_s(float32 a, float32 b, float_status *s) 4417 { 4418 return float32_sub(b, a, s); 4419 } 4420 4421 static inline float64 subr_d(float64 a, float64 b, float_status *s) 4422 { 4423 return float64_sub(b, a, s); 4424 } 4425 4426 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) 4427 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) 4428 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d) 4429 4430 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) 4431 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) 4432 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum) 4433 4434 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) 4435 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) 4436 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum) 4437 4438 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) 4439 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) 4440 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max) 4441 4442 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) 4443 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) 4444 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) 4445 4446 /* Fully general two-operand expander, controlled by a predicate, 4447 * With the extra float_status parameter. 4448 */ 4449 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \ 4450 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ 4451 { \ 4452 intptr_t i = simd_oprsz(desc); \ 4453 uint64_t *g = vg; \ 4454 do { \ 4455 uint64_t pg = g[(i - 1) >> 6]; \ 4456 do { \ 4457 i -= sizeof(TYPE); \ 4458 if (likely((pg >> (i & 63)) & 1)) { \ 4459 TYPE nn = *(TYPE *)(vn + H(i)); \ 4460 *(TYPE *)(vd + H(i)) = OP(nn, status); \ 4461 } \ 4462 } while (i & 63); \ 4463 } while (i != 0); \ 4464 } 4465 4466 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore 4467 * FZ16. When converting from fp16, this affects flushing input denormals; 4468 * when converting to fp16, this affects flushing output denormals. 
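 *
 * For example (illustrative values): the smallest fp16 denormal, 0x0001
 * == 2^-24, converts to float32 0x33800000 whether or not FPCR.FZ16 is
 * set.  The helpers below arrange this by saving, clearing and restoring
 * the relevant softfloat flush flag around each conversion.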
4469 */ 4470 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst) 4471 { 4472 bool save = get_flush_inputs_to_zero(fpst); 4473 float32 ret; 4474 4475 set_flush_inputs_to_zero(false, fpst); 4476 ret = float16_to_float32(f, true, fpst); 4477 set_flush_inputs_to_zero(save, fpst); 4478 return ret; 4479 } 4480 4481 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) 4482 { 4483 bool save = get_flush_inputs_to_zero(fpst); 4484 float64 ret; 4485 4486 set_flush_inputs_to_zero(false, fpst); 4487 ret = float16_to_float64(f, true, fpst); 4488 set_flush_inputs_to_zero(save, fpst); 4489 return ret; 4490 } 4491 4492 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst) 4493 { 4494 bool save = get_flush_to_zero(fpst); 4495 float16 ret; 4496 4497 set_flush_to_zero(false, fpst); 4498 ret = float32_to_float16(f, true, fpst); 4499 set_flush_to_zero(save, fpst); 4500 return ret; 4501 } 4502 4503 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst) 4504 { 4505 bool save = get_flush_to_zero(fpst); 4506 float16 ret; 4507 4508 set_flush_to_zero(false, fpst); 4509 ret = float64_to_float16(f, true, fpst); 4510 set_flush_to_zero(save, fpst); 4511 return ret; 4512 } 4513 4514 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s) 4515 { 4516 if (float16_is_any_nan(f)) { 4517 float_raise(float_flag_invalid, s); 4518 return 0; 4519 } 4520 return float16_to_int16_round_to_zero(f, s); 4521 } 4522 4523 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s) 4524 { 4525 if (float16_is_any_nan(f)) { 4526 float_raise(float_flag_invalid, s); 4527 return 0; 4528 } 4529 return float16_to_int64_round_to_zero(f, s); 4530 } 4531 4532 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s) 4533 { 4534 if (float32_is_any_nan(f)) { 4535 float_raise(float_flag_invalid, s); 4536 return 0; 4537 } 4538 return float32_to_int64_round_to_zero(f, s); 4539 } 4540 4541 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s) 4542 { 4543 if (float64_is_any_nan(f)) { 4544 float_raise(float_flag_invalid, s); 4545 return 0; 4546 } 4547 return float64_to_int64_round_to_zero(f, s); 4548 } 4549 4550 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s) 4551 { 4552 if (float16_is_any_nan(f)) { 4553 float_raise(float_flag_invalid, s); 4554 return 0; 4555 } 4556 return float16_to_uint16_round_to_zero(f, s); 4557 } 4558 4559 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s) 4560 { 4561 if (float16_is_any_nan(f)) { 4562 float_raise(float_flag_invalid, s); 4563 return 0; 4564 } 4565 return float16_to_uint64_round_to_zero(f, s); 4566 } 4567 4568 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s) 4569 { 4570 if (float32_is_any_nan(f)) { 4571 float_raise(float_flag_invalid, s); 4572 return 0; 4573 } 4574 return float32_to_uint64_round_to_zero(f, s); 4575 } 4576 4577 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) 4578 { 4579 if (float64_is_any_nan(f)) { 4580 float_raise(float_flag_invalid, s); 4581 return 0; 4582 } 4583 return float64_to_uint64_round_to_zero(f, s); 4584 } 4585 4586 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) 4587 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) 4588 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16) 4589 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16) 4590 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64) 4591 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, 
float64_to_float32) 4592 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64) 4593 4594 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) 4595 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) 4596 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) 4597 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz) 4598 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz) 4599 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd) 4600 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz) 4601 4602 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) 4603 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) 4604 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) 4605 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz) 4606 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz) 4607 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd) 4608 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz) 4609 4610 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) 4611 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) 4612 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd) 4613 4614 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) 4615 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) 4616 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int) 4617 4618 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) 4619 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) 4620 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64) 4621 4622 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) 4623 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) 4624 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt) 4625 4626 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) 4627 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) 4628 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) 4629 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64) 4630 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16) 4631 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32) 4632 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64) 4633 4634 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) 4635 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) 4636 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) 4637 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64) 4638 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16) 4639 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32) 4640 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64) 4641 4642 static int16_t do_float16_logb_as_int(float16 a, float_status *s) 4643 { 4644 /* Extract frac to the top of the uint32_t. 
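     *
     * Shifting by 16 + 6 puts the most significant fraction bit at
     * bit 31, so that in the denormal case below clz32(frac) counts the
     * leading zero fraction bits directly.  Worked example (illustrative
     * value): for a == 0x0001, frac == 1 << 22, clz32(frac) == 9, and
     * the result is -15 - 9 == -24, which is logb(2^-24).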
*/ 4645 uint32_t frac = (uint32_t)a << (16 + 6); 4646 int16_t exp = extract32(a, 10, 5); 4647 4648 if (unlikely(exp == 0)) { 4649 if (frac != 0) { 4650 if (!get_flush_inputs_to_zero(s)) { 4651 /* denormal: bias - fractional_zeros */ 4652 return -15 - clz32(frac); 4653 } 4654 /* flush to zero */ 4655 float_raise(float_flag_input_denormal, s); 4656 } 4657 } else if (unlikely(exp == 0x1f)) { 4658 if (frac == 0) { 4659 return INT16_MAX; /* infinity */ 4660 } 4661 } else { 4662 /* normal: exp - bias */ 4663 return exp - 15; 4664 } 4665 /* nan or zero */ 4666 float_raise(float_flag_invalid, s); 4667 return INT16_MIN; 4668 } 4669 4670 static int32_t do_float32_logb_as_int(float32 a, float_status *s) 4671 { 4672 /* Extract frac to the top of the uint32_t. */ 4673 uint32_t frac = a << 9; 4674 int32_t exp = extract32(a, 23, 8); 4675 4676 if (unlikely(exp == 0)) { 4677 if (frac != 0) { 4678 if (!get_flush_inputs_to_zero(s)) { 4679 /* denormal: bias - fractional_zeros */ 4680 return -127 - clz32(frac); 4681 } 4682 /* flush to zero */ 4683 float_raise(float_flag_input_denormal, s); 4684 } 4685 } else if (unlikely(exp == 0xff)) { 4686 if (frac == 0) { 4687 return INT32_MAX; /* infinity */ 4688 } 4689 } else { 4690 /* normal: exp - bias */ 4691 return exp - 127; 4692 } 4693 /* nan or zero */ 4694 float_raise(float_flag_invalid, s); 4695 return INT32_MIN; 4696 } 4697 4698 static int64_t do_float64_logb_as_int(float64 a, float_status *s) 4699 { 4700 /* Extract frac to the top of the uint64_t. */ 4701 uint64_t frac = a << 12; 4702 int64_t exp = extract64(a, 52, 11); 4703 4704 if (unlikely(exp == 0)) { 4705 if (frac != 0) { 4706 if (!get_flush_inputs_to_zero(s)) { 4707 /* denormal: bias - fractional_zeros */ 4708 return -1023 - clz64(frac); 4709 } 4710 /* flush to zero */ 4711 float_raise(float_flag_input_denormal, s); 4712 } 4713 } else if (unlikely(exp == 0x7ff)) { 4714 if (frac == 0) { 4715 return INT64_MAX; /* infinity */ 4716 } 4717 } else { 4718 /* normal: exp - bias */ 4719 return exp - 1023; 4720 } 4721 /* nan or zero */ 4722 float_raise(float_flag_invalid, s); 4723 return INT64_MIN; 4724 } 4725 4726 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int) 4727 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int) 4728 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) 4729 4730 #undef DO_ZPZ_FP 4731 4732 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, 4733 float_status *status, uint32_t desc, 4734 uint16_t neg1, uint16_t neg3) 4735 { 4736 intptr_t i = simd_oprsz(desc); 4737 uint64_t *g = vg; 4738 4739 do { 4740 uint64_t pg = g[(i - 1) >> 6]; 4741 do { 4742 i -= 2; 4743 if (likely((pg >> (i & 63)) & 1)) { 4744 float16 e1, e2, e3, r; 4745 4746 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; 4747 e2 = *(uint16_t *)(vm + H1_2(i)); 4748 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; 4749 r = float16_muladd(e1, e2, e3, 0, status); 4750 *(uint16_t *)(vd + H1_2(i)) = r; 4751 } 4752 } while (i & 63); 4753 } while (i != 0); 4754 } 4755 4756 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4757 void *vg, void *status, uint32_t desc) 4758 { 4759 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0); 4760 } 4761 4762 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4763 void *vg, void *status, uint32_t desc) 4764 { 4765 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0); 4766 } 4767 4768 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4769 void *vg, void *status, uint32_t desc) 4770 { 4771 
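    /*
     * FNMLA negates both the product (by flipping the sign of Zn) and
     * the accumulator, computing -(Za + Zn * Zm) per active element.
     */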
do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000); 4772 } 4773 4774 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4775 void *vg, void *status, uint32_t desc) 4776 { 4777 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000); 4778 } 4779 4780 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, 4781 float_status *status, uint32_t desc, 4782 uint32_t neg1, uint32_t neg3) 4783 { 4784 intptr_t i = simd_oprsz(desc); 4785 uint64_t *g = vg; 4786 4787 do { 4788 uint64_t pg = g[(i - 1) >> 6]; 4789 do { 4790 i -= 4; 4791 if (likely((pg >> (i & 63)) & 1)) { 4792 float32 e1, e2, e3, r; 4793 4794 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1; 4795 e2 = *(uint32_t *)(vm + H1_4(i)); 4796 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; 4797 r = float32_muladd(e1, e2, e3, 0, status); 4798 *(uint32_t *)(vd + H1_4(i)) = r; 4799 } 4800 } while (i & 63); 4801 } while (i != 0); 4802 } 4803 4804 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4805 void *vg, void *status, uint32_t desc) 4806 { 4807 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0); 4808 } 4809 4810 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4811 void *vg, void *status, uint32_t desc) 4812 { 4813 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0); 4814 } 4815 4816 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4817 void *vg, void *status, uint32_t desc) 4818 { 4819 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000); 4820 } 4821 4822 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4823 void *vg, void *status, uint32_t desc) 4824 { 4825 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000); 4826 } 4827 4828 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, 4829 float_status *status, uint32_t desc, 4830 uint64_t neg1, uint64_t neg3) 4831 { 4832 intptr_t i = simd_oprsz(desc); 4833 uint64_t *g = vg; 4834 4835 do { 4836 uint64_t pg = g[(i - 1) >> 6]; 4837 do { 4838 i -= 8; 4839 if (likely((pg >> (i & 63)) & 1)) { 4840 float64 e1, e2, e3, r; 4841 4842 e1 = *(uint64_t *)(vn + i) ^ neg1; 4843 e2 = *(uint64_t *)(vm + i); 4844 e3 = *(uint64_t *)(va + i) ^ neg3; 4845 r = float64_muladd(e1, e2, e3, 0, status); 4846 *(uint64_t *)(vd + i) = r; 4847 } 4848 } while (i & 63); 4849 } while (i != 0); 4850 } 4851 4852 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4853 void *vg, void *status, uint32_t desc) 4854 { 4855 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0); 4856 } 4857 4858 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4859 void *vg, void *status, uint32_t desc) 4860 { 4861 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0); 4862 } 4863 4864 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4865 void *vg, void *status, uint32_t desc) 4866 { 4867 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN); 4868 } 4869 4870 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4871 void *vg, void *status, uint32_t desc) 4872 { 4873 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN); 4874 } 4875 4876 /* Two operand floating-point comparison controlled by a predicate. 4877 * Unlike the integer version, we are not allowed to optimistically 4878 * compare operands, since the comparison may have side effects wrt 4879 * the FPSR. 
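 *
 * In particular, inactive elements must not be compared at all (a NaN in
 * an inactive lane must not raise Invalid Operation), and the ordered
 * comparisons (FCMGE, FCMGT, FACGE, FACGT) use the signalling compare
 * while FCMEQ, FCMNE and FCMUO use the quiet one, as reflected in the
 * DO_FCM* macros below.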
4880 */ 4881 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \ 4882 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4883 void *status, uint32_t desc) \ 4884 { \ 4885 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 4886 uint64_t *d = vd, *g = vg; \ 4887 do { \ 4888 uint64_t out = 0, pg = g[j]; \ 4889 do { \ 4890 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 4891 if (likely((pg >> (i & 63)) & 1)) { \ 4892 TYPE nn = *(TYPE *)(vn + H(i)); \ 4893 TYPE mm = *(TYPE *)(vm + H(i)); \ 4894 out |= OP(TYPE, nn, mm, status); \ 4895 } \ 4896 } while (i & 63); \ 4897 d[j--] = out; \ 4898 } while (i > 0); \ 4899 } 4900 4901 #define DO_FPCMP_PPZZ_H(NAME, OP) \ 4902 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP) 4903 #define DO_FPCMP_PPZZ_S(NAME, OP) \ 4904 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) 4905 #define DO_FPCMP_PPZZ_D(NAME, OP) \ 4906 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP) 4907 4908 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \ 4909 DO_FPCMP_PPZZ_H(NAME, OP) \ 4910 DO_FPCMP_PPZZ_S(NAME, OP) \ 4911 DO_FPCMP_PPZZ_D(NAME, OP) 4912 4913 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0 4914 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0 4915 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0 4916 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0 4917 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0 4918 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0 4919 #define DO_FCMUO(TYPE, X, Y, ST) \ 4920 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered 4921 #define DO_FACGE(TYPE, X, Y, ST) \ 4922 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0 4923 #define DO_FACGT(TYPE, X, Y, ST) \ 4924 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0 4925 4926 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) 4927 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT) 4928 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ) 4929 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE) 4930 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO) 4931 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE) 4932 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT) 4933 4934 #undef DO_FPCMP_PPZZ_ALL 4935 #undef DO_FPCMP_PPZZ_D 4936 #undef DO_FPCMP_PPZZ_S 4937 #undef DO_FPCMP_PPZZ_H 4938 #undef DO_FPCMP_PPZZ 4939 4940 /* One operand floating-point comparison against zero, controlled 4941 * by a predicate. 
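 *
 * The zero operand is passed to the comparison macros as the integer
 * constant 0, relying on +0.0 having an all-zeroes encoding in float16,
 * float32 and float64 alike.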
4942 */ 4943 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \ 4944 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 4945 void *status, uint32_t desc) \ 4946 { \ 4947 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 4948 uint64_t *d = vd, *g = vg; \ 4949 do { \ 4950 uint64_t out = 0, pg = g[j]; \ 4951 do { \ 4952 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 4953 if ((pg >> (i & 63)) & 1) { \ 4954 TYPE nn = *(TYPE *)(vn + H(i)); \ 4955 out |= OP(TYPE, nn, 0, status); \ 4956 } \ 4957 } while (i & 63); \ 4958 d[j--] = out; \ 4959 } while (i > 0); \ 4960 } 4961 4962 #define DO_FPCMP_PPZ0_H(NAME, OP) \ 4963 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP) 4964 #define DO_FPCMP_PPZ0_S(NAME, OP) \ 4965 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) 4966 #define DO_FPCMP_PPZ0_D(NAME, OP) \ 4967 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP) 4968 4969 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \ 4970 DO_FPCMP_PPZ0_H(NAME, OP) \ 4971 DO_FPCMP_PPZ0_S(NAME, OP) \ 4972 DO_FPCMP_PPZ0_D(NAME, OP) 4973 4974 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE) 4975 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT) 4976 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE) 4977 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT) 4978 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ) 4979 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE) 4980 4981 /* FP Trig Multiply-Add. */ 4982 4983 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) 4984 { 4985 static const float16 coeff[16] = { 4986 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 4987 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 4988 }; 4989 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); 4990 intptr_t x = simd_data(desc); 4991 float16 *d = vd, *n = vn, *m = vm; 4992 for (i = 0; i < opr_sz; i++) { 4993 float16 mm = m[i]; 4994 intptr_t xx = x; 4995 if (float16_is_neg(mm)) { 4996 mm = float16_abs(mm); 4997 xx += 8; 4998 } 4999 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs); 5000 } 5001 } 5002 5003 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) 5004 { 5005 static const float32 coeff[16] = { 5006 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9, 5007 0x36369d6d, 0x00000000, 0x00000000, 0x00000000, 5008 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705, 5009 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, 5010 }; 5011 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); 5012 intptr_t x = simd_data(desc); 5013 float32 *d = vd, *n = vn, *m = vm; 5014 for (i = 0; i < opr_sz; i++) { 5015 float32 mm = m[i]; 5016 intptr_t xx = x; 5017 if (float32_is_neg(mm)) { 5018 mm = float32_abs(mm); 5019 xx += 8; 5020 } 5021 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs); 5022 } 5023 } 5024 5025 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) 5026 { 5027 static const float64 coeff[16] = { 5028 0x3ff0000000000000ull, 0xbfc5555555555543ull, 5029 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull, 5030 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull, 5031 0x3de5d8408868552full, 0x0000000000000000ull, 5032 0x3ff0000000000000ull, 0xbfe0000000000000ull, 5033 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull, 5034 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull, 5035 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, 5036 }; 5037 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); 5038 intptr_t x = simd_data(desc); 5039 float64 *d = vd, *n = vn, *m = vm; 5040 for (i = 0; i < opr_sz; i++) { 5041 float64 mm = m[i]; 5042 intptr_t xx = x; 5043 if (float64_is_neg(mm)) { 5044 mm = float64_abs(mm); 5045 xx += 8; 5046 } 5047 d[i] = float64_muladd(n[i], mm, 
coeff[xx], 0, vs); 5048 } 5049 } 5050 5051 /* 5052 * FP Complex Add 5053 */ 5054 5055 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, 5056 void *vs, uint32_t desc) 5057 { 5058 intptr_t j, i = simd_oprsz(desc); 5059 uint64_t *g = vg; 5060 float16 neg_imag = float16_set_sign(0, simd_data(desc)); 5061 float16 neg_real = float16_chs(neg_imag); 5062 5063 do { 5064 uint64_t pg = g[(i - 1) >> 6]; 5065 do { 5066 float16 e0, e1, e2, e3; 5067 5068 /* I holds the real index; J holds the imag index. */ 5069 j = i - sizeof(float16); 5070 i -= 2 * sizeof(float16); 5071 5072 e0 = *(float16 *)(vn + H1_2(i)); 5073 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real; 5074 e2 = *(float16 *)(vn + H1_2(j)); 5075 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag; 5076 5077 if (likely((pg >> (i & 63)) & 1)) { 5078 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs); 5079 } 5080 if (likely((pg >> (j & 63)) & 1)) { 5081 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs); 5082 } 5083 } while (i & 63); 5084 } while (i != 0); 5085 } 5086 5087 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, 5088 void *vs, uint32_t desc) 5089 { 5090 intptr_t j, i = simd_oprsz(desc); 5091 uint64_t *g = vg; 5092 float32 neg_imag = float32_set_sign(0, simd_data(desc)); 5093 float32 neg_real = float32_chs(neg_imag); 5094 5095 do { 5096 uint64_t pg = g[(i - 1) >> 6]; 5097 do { 5098 float32 e0, e1, e2, e3; 5099 5100 /* I holds the real index; J holds the imag index. */ 5101 j = i - sizeof(float32); 5102 i -= 2 * sizeof(float32); 5103 5104 e0 = *(float32 *)(vn + H1_2(i)); 5105 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real; 5106 e2 = *(float32 *)(vn + H1_2(j)); 5107 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag; 5108 5109 if (likely((pg >> (i & 63)) & 1)) { 5110 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs); 5111 } 5112 if (likely((pg >> (j & 63)) & 1)) { 5113 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs); 5114 } 5115 } while (i & 63); 5116 } while (i != 0); 5117 } 5118 5119 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, 5120 void *vs, uint32_t desc) 5121 { 5122 intptr_t j, i = simd_oprsz(desc); 5123 uint64_t *g = vg; 5124 float64 neg_imag = float64_set_sign(0, simd_data(desc)); 5125 float64 neg_real = float64_chs(neg_imag); 5126 5127 do { 5128 uint64_t pg = g[(i - 1) >> 6]; 5129 do { 5130 float64 e0, e1, e2, e3; 5131 5132 /* I holds the real index; J holds the imag index. */ 5133 j = i - sizeof(float64); 5134 i -= 2 * sizeof(float64); 5135 5136 e0 = *(float64 *)(vn + H1_2(i)); 5137 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real; 5138 e2 = *(float64 *)(vn + H1_2(j)); 5139 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag; 5140 5141 if (likely((pg >> (i & 63)) & 1)) { 5142 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs); 5143 } 5144 if (likely((pg >> (j & 63)) & 1)) { 5145 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs); 5146 } 5147 } while (i & 63); 5148 } while (i != 0); 5149 } 5150 5151 /* 5152 * FP Complex Multiply 5153 */ 5154 5155 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5156 void *vg, void *status, uint32_t desc) 5157 { 5158 intptr_t j, i = simd_oprsz(desc); 5159 unsigned rot = simd_data(desc); 5160 bool flip = rot & 1; 5161 float16 neg_imag, neg_real; 5162 uint64_t *g = vg; 5163 5164 neg_imag = float16_set_sign(0, (rot & 2) != 0); 5165 neg_real = float16_set_sign(0, rot == 1 || rot == 2); 5166 5167 do { 5168 uint64_t pg = g[(i - 1) >> 6]; 5169 do { 5170 float16 e1, e2, e3, e4, nr, ni, mr, mi, d; 5171 5172 /* I holds the real index; J holds the imag index. 
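             *
             * For reference, the rot/flip/neg settings above resolve to
             * (with nr/ni/mr/mi as loaded below):
             *
             *     rot 0 (  0 deg): d_re += nr * mr;  d_im += nr * mi
             *     rot 1 ( 90 deg): d_re -= ni * mi;  d_im += ni * mr
             *     rot 2 (180 deg): d_re -= nr * mr;  d_im -= nr * mi
             *     rot 3 (270 deg): d_re += ni * mi;  d_im -= ni * mr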
*/ 5173 j = i - sizeof(float16); 5174 i -= 2 * sizeof(float16); 5175 5176 nr = *(float16 *)(vn + H1_2(i)); 5177 ni = *(float16 *)(vn + H1_2(j)); 5178 mr = *(float16 *)(vm + H1_2(i)); 5179 mi = *(float16 *)(vm + H1_2(j)); 5180 5181 e2 = (flip ? ni : nr); 5182 e1 = (flip ? mi : mr) ^ neg_real; 5183 e4 = e2; 5184 e3 = (flip ? mr : mi) ^ neg_imag; 5185 5186 if (likely((pg >> (i & 63)) & 1)) { 5187 d = *(float16 *)(va + H1_2(i)); 5188 d = float16_muladd(e2, e1, d, 0, status); 5189 *(float16 *)(vd + H1_2(i)) = d; 5190 } 5191 if (likely((pg >> (j & 63)) & 1)) { 5192 d = *(float16 *)(va + H1_2(j)); 5193 d = float16_muladd(e4, e3, d, 0, status); 5194 *(float16 *)(vd + H1_2(j)) = d; 5195 } 5196 } while (i & 63); 5197 } while (i != 0); 5198 } 5199 5200 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5201 void *vg, void *status, uint32_t desc) 5202 { 5203 intptr_t j, i = simd_oprsz(desc); 5204 unsigned rot = simd_data(desc); 5205 bool flip = rot & 1; 5206 float32 neg_imag, neg_real; 5207 uint64_t *g = vg; 5208 5209 neg_imag = float32_set_sign(0, (rot & 2) != 0); 5210 neg_real = float32_set_sign(0, rot == 1 || rot == 2); 5211 5212 do { 5213 uint64_t pg = g[(i - 1) >> 6]; 5214 do { 5215 float32 e1, e2, e3, e4, nr, ni, mr, mi, d; 5216 5217 /* I holds the real index; J holds the imag index. */ 5218 j = i - sizeof(float32); 5219 i -= 2 * sizeof(float32); 5220 5221 nr = *(float32 *)(vn + H1_2(i)); 5222 ni = *(float32 *)(vn + H1_2(j)); 5223 mr = *(float32 *)(vm + H1_2(i)); 5224 mi = *(float32 *)(vm + H1_2(j)); 5225 5226 e2 = (flip ? ni : nr); 5227 e1 = (flip ? mi : mr) ^ neg_real; 5228 e4 = e2; 5229 e3 = (flip ? mr : mi) ^ neg_imag; 5230 5231 if (likely((pg >> (i & 63)) & 1)) { 5232 d = *(float32 *)(va + H1_2(i)); 5233 d = float32_muladd(e2, e1, d, 0, status); 5234 *(float32 *)(vd + H1_2(i)) = d; 5235 } 5236 if (likely((pg >> (j & 63)) & 1)) { 5237 d = *(float32 *)(va + H1_2(j)); 5238 d = float32_muladd(e4, e3, d, 0, status); 5239 *(float32 *)(vd + H1_2(j)) = d; 5240 } 5241 } while (i & 63); 5242 } while (i != 0); 5243 } 5244 5245 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5246 void *vg, void *status, uint32_t desc) 5247 { 5248 intptr_t j, i = simd_oprsz(desc); 5249 unsigned rot = simd_data(desc); 5250 bool flip = rot & 1; 5251 float64 neg_imag, neg_real; 5252 uint64_t *g = vg; 5253 5254 neg_imag = float64_set_sign(0, (rot & 2) != 0); 5255 neg_real = float64_set_sign(0, rot == 1 || rot == 2); 5256 5257 do { 5258 uint64_t pg = g[(i - 1) >> 6]; 5259 do { 5260 float64 e1, e2, e3, e4, nr, ni, mr, mi, d; 5261 5262 /* I holds the real index; J holds the imag index. */ 5263 j = i - sizeof(float64); 5264 i -= 2 * sizeof(float64); 5265 5266 nr = *(float64 *)(vn + H1_2(i)); 5267 ni = *(float64 *)(vn + H1_2(j)); 5268 mr = *(float64 *)(vm + H1_2(i)); 5269 mi = *(float64 *)(vm + H1_2(j)); 5270 5271 e2 = (flip ? ni : nr); 5272 e1 = (flip ? mi : mr) ^ neg_real; 5273 e4 = e2; 5274 e3 = (flip ? mr : mi) ^ neg_imag; 5275 5276 if (likely((pg >> (i & 63)) & 1)) { 5277 d = *(float64 *)(va + H1_2(i)); 5278 d = float64_muladd(e2, e1, d, 0, status); 5279 *(float64 *)(vd + H1_2(i)) = d; 5280 } 5281 if (likely((pg >> (j & 63)) & 1)) { 5282 d = *(float64 *)(va + H1_2(j)); 5283 d = float64_muladd(e4, e3, d, 0, status); 5284 *(float64 *)(vd + H1_2(j)) = d; 5285 } 5286 } while (i & 63); 5287 } while (i != 0); 5288 } 5289 5290 /* 5291 * Load contiguous data, protected by a governing predicate. 
5292 */ 5293 5294 /* 5295 * Skip through a sequence of inactive elements in the guarding predicate @vg, 5296 * beginning at @reg_off bounded by @reg_max. Return the offset of the active 5297 * element >= @reg_off, or @reg_max if there were no active elements at all. 5298 */ 5299 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off, 5300 intptr_t reg_max, int esz) 5301 { 5302 uint64_t pg_mask = pred_esz_masks[esz]; 5303 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63); 5304 5305 /* In normal usage, the first element is active. */ 5306 if (likely(pg & 1)) { 5307 return reg_off; 5308 } 5309 5310 if (pg == 0) { 5311 reg_off &= -64; 5312 do { 5313 reg_off += 64; 5314 if (unlikely(reg_off >= reg_max)) { 5315 /* The entire predicate was false. */ 5316 return reg_max; 5317 } 5318 pg = vg[reg_off >> 6] & pg_mask; 5319 } while (pg == 0); 5320 } 5321 reg_off += ctz64(pg); 5322 5323 /* We should never see an out of range predicate bit set. */ 5324 tcg_debug_assert(reg_off < reg_max); 5325 return reg_off; 5326 } 5327 5328 /* 5329 * Resolve the guest virtual address to info->host and info->flags. 5330 * If @nofault, return false if the page is invalid, otherwise 5331 * exit via page fault exception. 5332 */ 5333 5334 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env, 5335 target_ulong addr, int mem_off, MMUAccessType access_type, 5336 int mmu_idx, uintptr_t retaddr) 5337 { 5338 int flags; 5339 5340 addr += mem_off; 5341 5342 /* 5343 * User-only currently always issues with TBI. See the comment 5344 * above useronly_clean_ptr. Usually we clean this top byte away 5345 * during translation, but we can't do that for e.g. vector + imm 5346 * addressing modes. 5347 * 5348 * We currently always enable TBI for user-only, and do not provide 5349 * a way to turn it off. So clean the pointer unconditionally here, 5350 * rather than look it up here, or pass it down from above. 5351 */ 5352 addr = useronly_clean_ptr(addr); 5353 5354 #ifdef CONFIG_USER_ONLY 5355 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault, 5356 &info->host, retaddr); 5357 #else 5358 CPUTLBEntryFull *full; 5359 flags = probe_access_full(env, addr, access_type, mmu_idx, nofault, 5360 &info->host, &full, retaddr); 5361 #endif 5362 info->flags = flags; 5363 5364 if (flags & TLB_INVALID_MASK) { 5365 g_assert(nofault); 5366 return false; 5367 } 5368 5369 #ifdef CONFIG_USER_ONLY 5370 memset(&info->attrs, 0, sizeof(info->attrs)); 5371 /* Require both ANON and MTE; see allocation_tag_mem(). */ 5372 info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE); 5373 #else 5374 info->attrs = full->attrs; 5375 info->tagged = full->pte_attrs == 0xf0; 5376 #endif 5377 5378 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */ 5379 info->host -= mem_off; 5380 return true; 5381 } 5382 5383 /* 5384 * Find first active element on each page, and a loose bound for the 5385 * final element on each page. Identify any single element that spans 5386 * the page boundary. Return true if there are any active elements. 5387 */ 5388 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg, 5389 intptr_t reg_max, int esz, int msize) 5390 { 5391 const int esize = 1 << esz; 5392 const uint64_t pg_mask = pred_esz_masks[esz]; 5393 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; 5394 intptr_t mem_off_last, mem_off_split; 5395 intptr_t page_split, elt_split; 5396 intptr_t i; 5397 5398 /* Set all of the element indices to -1, and the TLB data to 0. 
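     *
     * (Writing -1 bytes with memset works because every field before
     * 'page' is a signed element/memory offset, and the all-ones pattern
     * reads back as -1, the "not present" marker tested below.)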
*/ 5399 memset(info, -1, offsetof(SVEContLdSt, page)); 5400 memset(info->page, 0, sizeof(info->page)); 5401 5402 /* Gross scan over the entire predicate to find bounds. */ 5403 i = 0; 5404 do { 5405 uint64_t pg = vg[i] & pg_mask; 5406 if (pg) { 5407 reg_off_last = i * 64 + 63 - clz64(pg); 5408 if (reg_off_first < 0) { 5409 reg_off_first = i * 64 + ctz64(pg); 5410 } 5411 } 5412 } while (++i * 64 < reg_max); 5413 5414 if (unlikely(reg_off_first < 0)) { 5415 /* No active elements, no pages touched. */ 5416 return false; 5417 } 5418 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max); 5419 5420 info->reg_off_first[0] = reg_off_first; 5421 info->mem_off_first[0] = (reg_off_first >> esz) * msize; 5422 mem_off_last = (reg_off_last >> esz) * msize; 5423 5424 page_split = -(addr | TARGET_PAGE_MASK); 5425 if (likely(mem_off_last + msize <= page_split)) { 5426 /* The entire operation fits within a single page. */ 5427 info->reg_off_last[0] = reg_off_last; 5428 return true; 5429 } 5430 5431 info->page_split = page_split; 5432 elt_split = page_split / msize; 5433 reg_off_split = elt_split << esz; 5434 mem_off_split = elt_split * msize; 5435 5436 /* 5437 * This is the last full element on the first page, but it is not 5438 * necessarily active. If there is no full element, i.e. the first 5439 * active element is the one that's split, this value remains -1. 5440 * It is useful as iteration bounds. 5441 */ 5442 if (elt_split != 0) { 5443 info->reg_off_last[0] = reg_off_split - esize; 5444 } 5445 5446 /* Determine if an unaligned element spans the pages. */ 5447 if (page_split % msize != 0) { 5448 /* It is helpful to know if the split element is active. */ 5449 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) { 5450 info->reg_off_split = reg_off_split; 5451 info->mem_off_split = mem_off_split; 5452 5453 if (reg_off_split == reg_off_last) { 5454 /* The page crossing element is last. */ 5455 return true; 5456 } 5457 } 5458 reg_off_split += esize; 5459 mem_off_split += msize; 5460 } 5461 5462 /* 5463 * We do want the first active element on the second page, because 5464 * this may affect the address reported in an exception. 5465 */ 5466 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz); 5467 tcg_debug_assert(reg_off_split <= reg_off_last); 5468 info->reg_off_first[1] = reg_off_split; 5469 info->mem_off_first[1] = (reg_off_split >> esz) * msize; 5470 info->reg_off_last[1] = reg_off_last; 5471 return true; 5472 } 5473 5474 /* 5475 * Resolve the guest virtual addresses to info->page[]. 5476 * Control the generation of page faults with @fault. Return false if 5477 * there is no work to do, which can only happen with @fault == FAULT_NO. 5478 */ 5479 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault, 5480 CPUARMState *env, target_ulong addr, 5481 MMUAccessType access_type, uintptr_t retaddr) 5482 { 5483 int mmu_idx = cpu_mmu_index(env, false); 5484 int mem_off = info->mem_off_first[0]; 5485 bool nofault = fault == FAULT_NO; 5486 bool have_work = true; 5487 5488 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off, 5489 access_type, mmu_idx, retaddr)) { 5490 /* No work to be done. */ 5491 return false; 5492 } 5493 5494 if (likely(info->page_split < 0)) { 5495 /* The entire operation was on the one page. */ 5496 return true; 5497 } 5498 5499 /* 5500 * If the second page is invalid, then we want the fault address to be 5501 * the first byte on that page which is accessed. 
5502 */ 5503 if (info->mem_off_split >= 0) { 5504 /* 5505 * There is an element split across the pages. The fault address 5506 * should be the first byte of the second page. 5507 */ 5508 mem_off = info->page_split; 5509 /* 5510 * If the split element is also the first active element 5511 * of the vector, then: For first-fault we should continue 5512 * to generate faults for the second page. For no-fault, 5513 * we have work only if the second page is valid. 5514 */ 5515 if (info->mem_off_first[0] < info->mem_off_split) { 5516 nofault = FAULT_FIRST; 5517 have_work = false; 5518 } 5519 } else { 5520 /* 5521 * There is no element split across the pages. The fault address 5522 * should be the first active element on the second page. 5523 */ 5524 mem_off = info->mem_off_first[1]; 5525 /* 5526 * There must have been one active element on the first page, 5527 * so we're out of first-fault territory. 5528 */ 5529 nofault = fault != FAULT_ALL; 5530 } 5531 5532 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off, 5533 access_type, mmu_idx, retaddr); 5534 return have_work; 5535 } 5536 5537 #ifndef CONFIG_USER_ONLY 5538 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, 5539 uint64_t *vg, target_ulong addr, 5540 int esize, int msize, int wp_access, 5541 uintptr_t retaddr) 5542 { 5543 intptr_t mem_off, reg_off, reg_last; 5544 int flags0 = info->page[0].flags; 5545 int flags1 = info->page[1].flags; 5546 5547 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { 5548 return; 5549 } 5550 5551 /* Indicate that watchpoints are handled. */ 5552 info->page[0].flags = flags0 & ~TLB_WATCHPOINT; 5553 info->page[1].flags = flags1 & ~TLB_WATCHPOINT; 5554 5555 if (flags0 & TLB_WATCHPOINT) { 5556 mem_off = info->mem_off_first[0]; 5557 reg_off = info->reg_off_first[0]; 5558 reg_last = info->reg_off_last[0]; 5559 5560 while (reg_off <= reg_last) { 5561 uint64_t pg = vg[reg_off >> 6]; 5562 do { 5563 if ((pg >> (reg_off & 63)) & 1) { 5564 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 5565 msize, info->page[0].attrs, 5566 wp_access, retaddr); 5567 } 5568 reg_off += esize; 5569 mem_off += msize; 5570 } while (reg_off <= reg_last && (reg_off & 63)); 5571 } 5572 } 5573 5574 mem_off = info->mem_off_split; 5575 if (mem_off >= 0) { 5576 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize, 5577 info->page[0].attrs, wp_access, retaddr); 5578 } 5579 5580 mem_off = info->mem_off_first[1]; 5581 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) { 5582 reg_off = info->reg_off_first[1]; 5583 reg_last = info->reg_off_last[1]; 5584 5585 do { 5586 uint64_t pg = vg[reg_off >> 6]; 5587 do { 5588 if ((pg >> (reg_off & 63)) & 1) { 5589 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 5590 msize, info->page[1].attrs, 5591 wp_access, retaddr); 5592 } 5593 reg_off += esize; 5594 mem_off += msize; 5595 } while (reg_off & 63); 5596 } while (reg_off <= reg_last); 5597 } 5598 } 5599 #endif 5600 5601 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env, 5602 uint64_t *vg, target_ulong addr, int esize, 5603 int msize, uint32_t mtedesc, uintptr_t ra) 5604 { 5605 intptr_t mem_off, reg_off, reg_last; 5606 5607 /* Process the page only if MemAttr == Tagged. 
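     *
     * info->page[n].tagged was filled in by sve_probe_page(): under
     * system emulation it reflects the Tagged memory attribute of the
     * page, and for user-only it requires both PAGE_ANON and PAGE_MTE.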
*/ 5608 if (info->page[0].tagged) { 5609 mem_off = info->mem_off_first[0]; 5610 reg_off = info->reg_off_first[0]; 5611 reg_last = info->reg_off_split; 5612 if (reg_last < 0) { 5613 reg_last = info->reg_off_last[0]; 5614 } 5615 5616 do { 5617 uint64_t pg = vg[reg_off >> 6]; 5618 do { 5619 if ((pg >> (reg_off & 63)) & 1) { 5620 mte_check(env, mtedesc, addr, ra); 5621 } 5622 reg_off += esize; 5623 mem_off += msize; 5624 } while (reg_off <= reg_last && (reg_off & 63)); 5625 } while (reg_off <= reg_last); 5626 } 5627 5628 mem_off = info->mem_off_first[1]; 5629 if (mem_off >= 0 && info->page[1].tagged) { 5630 reg_off = info->reg_off_first[1]; 5631 reg_last = info->reg_off_last[1]; 5632 5633 do { 5634 uint64_t pg = vg[reg_off >> 6]; 5635 do { 5636 if ((pg >> (reg_off & 63)) & 1) { 5637 mte_check(env, mtedesc, addr, ra); 5638 } 5639 reg_off += esize; 5640 mem_off += msize; 5641 } while (reg_off & 63); 5642 } while (reg_off <= reg_last); 5643 } 5644 } 5645 5646 /* 5647 * Common helper for all contiguous 1,2,3,4-register predicated loads. 5648 */ 5649 static inline QEMU_ALWAYS_INLINE 5650 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr, 5651 uint32_t desc, const uintptr_t retaddr, 5652 const int esz, const int msz, const int N, uint32_t mtedesc, 5653 sve_ldst1_host_fn *host_fn, 5654 sve_ldst1_tlb_fn *tlb_fn) 5655 { 5656 const unsigned rd = simd_data(desc); 5657 const intptr_t reg_max = simd_oprsz(desc); 5658 intptr_t reg_off, reg_last, mem_off; 5659 SVEContLdSt info; 5660 void *host; 5661 int flags, i; 5662 5663 /* Find the active elements. */ 5664 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 5665 /* The entire predicate was false; no load occurs. */ 5666 for (i = 0; i < N; ++i) { 5667 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 5668 } 5669 return; 5670 } 5671 5672 /* Probe the page(s). Exit with exception for any invalid page. */ 5673 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr); 5674 5675 /* Handle watchpoints for all active elements. */ 5676 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 5677 BP_MEM_READ, retaddr); 5678 5679 /* 5680 * Handle mte checks for all active elements. 5681 * Since TBI must be set for MTE, !mtedesc => !mte_active. 5682 */ 5683 if (mtedesc) { 5684 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 5685 mtedesc, retaddr); 5686 } 5687 5688 flags = info.page[0].flags | info.page[1].flags; 5689 if (unlikely(flags != 0)) { 5690 #ifdef CONFIG_USER_ONLY 5691 g_assert_not_reached(); 5692 #else 5693 /* 5694 * At least one page includes MMIO. 5695 * Any bus operation can fail with cpu_transaction_failed, 5696 * which for ARM will raise SyncExternal. Perform the load 5697 * into scratch memory to preserve register state until the end.
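         *
         * A partially completed load must not be visible architecturally,
         * so the per-element tlb_fn loop below targets the local scratch[]
         * array; only after every element has been loaded without fault
         * are the results copied into the real zregs.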
5698 */ 5699 ARMVectorReg scratch[4] = { }; 5700 5701 mem_off = info.mem_off_first[0]; 5702 reg_off = info.reg_off_first[0]; 5703 reg_last = info.reg_off_last[1]; 5704 if (reg_last < 0) { 5705 reg_last = info.reg_off_split; 5706 if (reg_last < 0) { 5707 reg_last = info.reg_off_last[0]; 5708 } 5709 } 5710 5711 do { 5712 uint64_t pg = vg[reg_off >> 6]; 5713 do { 5714 if ((pg >> (reg_off & 63)) & 1) { 5715 for (i = 0; i < N; ++i) { 5716 tlb_fn(env, &scratch[i], reg_off, 5717 addr + mem_off + (i << msz), retaddr); 5718 } 5719 } 5720 reg_off += 1 << esz; 5721 mem_off += N << msz; 5722 } while (reg_off & 63); 5723 } while (reg_off <= reg_last); 5724 5725 for (i = 0; i < N; ++i) { 5726 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max); 5727 } 5728 return; 5729 #endif 5730 } 5731 5732 /* The entire operation is in RAM, on valid pages. */ 5733 5734 for (i = 0; i < N; ++i) { 5735 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 5736 } 5737 5738 mem_off = info.mem_off_first[0]; 5739 reg_off = info.reg_off_first[0]; 5740 reg_last = info.reg_off_last[0]; 5741 host = info.page[0].host; 5742 5743 while (reg_off <= reg_last) { 5744 uint64_t pg = vg[reg_off >> 6]; 5745 do { 5746 if ((pg >> (reg_off & 63)) & 1) { 5747 for (i = 0; i < N; ++i) { 5748 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5749 host + mem_off + (i << msz)); 5750 } 5751 } 5752 reg_off += 1 << esz; 5753 mem_off += N << msz; 5754 } while (reg_off <= reg_last && (reg_off & 63)); 5755 } 5756 5757 /* 5758 * Use the slow path to manage the cross-page misalignment. 5759 * But we know this is RAM and cannot trap. 5760 */ 5761 mem_off = info.mem_off_split; 5762 if (unlikely(mem_off >= 0)) { 5763 reg_off = info.reg_off_split; 5764 for (i = 0; i < N; ++i) { 5765 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 5766 addr + mem_off + (i << msz), retaddr); 5767 } 5768 } 5769 5770 mem_off = info.mem_off_first[1]; 5771 if (unlikely(mem_off >= 0)) { 5772 reg_off = info.reg_off_first[1]; 5773 reg_last = info.reg_off_last[1]; 5774 host = info.page[1].host; 5775 5776 do { 5777 uint64_t pg = vg[reg_off >> 6]; 5778 do { 5779 if ((pg >> (reg_off & 63)) & 1) { 5780 for (i = 0; i < N; ++i) { 5781 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5782 host + mem_off + (i << msz)); 5783 } 5784 } 5785 reg_off += 1 << esz; 5786 mem_off += N << msz; 5787 } while (reg_off & 63); 5788 } while (reg_off <= reg_last); 5789 } 5790 } 5791 5792 static inline QEMU_ALWAYS_INLINE 5793 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 5794 uint32_t desc, const uintptr_t ra, 5795 const int esz, const int msz, const int N, 5796 sve_ldst1_host_fn *host_fn, 5797 sve_ldst1_tlb_fn *tlb_fn) 5798 { 5799 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 5800 int bit55 = extract64(addr, 55, 1); 5801 5802 /* Remove mtedesc from the normal sve descriptor. */ 5803 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 5804 5805 /* Perform gross MTE suppression early. 
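     *
     * Layout of the incoming 32-bit descriptor, as split above:
     *
     *   bits [0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT)   ordinary SVE desc
     *   bits [SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT, 32)  mtedesc
     *
     * If TBI is disabled for this address, or the tag is ignored via TCMA,
     * clearing mtedesc here turns off all MTE checking in sve_ldN_r(),
     * which treats mtedesc == 0 as "MTE not active".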
*/ 5806 if (!tbi_check(desc, bit55) || 5807 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { 5808 mtedesc = 0; 5809 } 5810 5811 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 5812 } 5813 5814 #define DO_LD1_1(NAME, ESZ) \ 5815 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ 5816 target_ulong addr, uint32_t desc) \ 5817 { \ 5818 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ 5819 sve_##NAME##_host, sve_##NAME##_tlb); \ 5820 } \ 5821 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ 5822 target_ulong addr, uint32_t desc) \ 5823 { \ 5824 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ 5825 sve_##NAME##_host, sve_##NAME##_tlb); \ 5826 } 5827 5828 #define DO_LD1_2(NAME, ESZ, MSZ) \ 5829 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ 5830 target_ulong addr, uint32_t desc) \ 5831 { \ 5832 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 5833 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 5834 } \ 5835 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ 5836 target_ulong addr, uint32_t desc) \ 5837 { \ 5838 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 5839 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 5840 } \ 5841 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 5842 target_ulong addr, uint32_t desc) \ 5843 { \ 5844 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 5845 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 5846 } \ 5847 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 5848 target_ulong addr, uint32_t desc) \ 5849 { \ 5850 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 5851 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 5852 } 5853 5854 DO_LD1_1(ld1bb, MO_8) 5855 DO_LD1_1(ld1bhu, MO_16) 5856 DO_LD1_1(ld1bhs, MO_16) 5857 DO_LD1_1(ld1bsu, MO_32) 5858 DO_LD1_1(ld1bss, MO_32) 5859 DO_LD1_1(ld1bdu, MO_64) 5860 DO_LD1_1(ld1bds, MO_64) 5861 5862 DO_LD1_2(ld1hh, MO_16, MO_16) 5863 DO_LD1_2(ld1hsu, MO_32, MO_16) 5864 DO_LD1_2(ld1hss, MO_32, MO_16) 5865 DO_LD1_2(ld1hdu, MO_64, MO_16) 5866 DO_LD1_2(ld1hds, MO_64, MO_16) 5867 5868 DO_LD1_2(ld1ss, MO_32, MO_32) 5869 DO_LD1_2(ld1sdu, MO_64, MO_32) 5870 DO_LD1_2(ld1sds, MO_64, MO_32) 5871 5872 DO_LD1_2(ld1dd, MO_64, MO_64) 5873 5874 #undef DO_LD1_1 5875 #undef DO_LD1_2 5876 5877 #define DO_LDN_1(N) \ 5878 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ 5879 target_ulong addr, uint32_t desc) \ 5880 { \ 5881 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ 5882 sve_ld1bb_host, sve_ld1bb_tlb); \ 5883 } \ 5884 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ 5885 target_ulong addr, uint32_t desc) \ 5886 { \ 5887 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ 5888 sve_ld1bb_host, sve_ld1bb_tlb); \ 5889 } 5890 5891 #define DO_LDN_2(N, SUFF, ESZ) \ 5892 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ 5893 target_ulong addr, uint32_t desc) \ 5894 { \ 5895 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 5896 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 5897 } \ 5898 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ 5899 target_ulong addr, uint32_t desc) \ 5900 { \ 5901 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 5902 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 5903 } \ 5904 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ 5905 target_ulong addr, uint32_t desc) \ 5906 { \ 5907 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 5908 
sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 5909 } \ 5910 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ 5911 target_ulong addr, uint32_t desc) \ 5912 { \ 5913 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 5914 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 5915 } 5916 5917 DO_LDN_1(2) 5918 DO_LDN_1(3) 5919 DO_LDN_1(4) 5920 5921 DO_LDN_2(2, hh, MO_16) 5922 DO_LDN_2(3, hh, MO_16) 5923 DO_LDN_2(4, hh, MO_16) 5924 5925 DO_LDN_2(2, ss, MO_32) 5926 DO_LDN_2(3, ss, MO_32) 5927 DO_LDN_2(4, ss, MO_32) 5928 5929 DO_LDN_2(2, dd, MO_64) 5930 DO_LDN_2(3, dd, MO_64) 5931 DO_LDN_2(4, dd, MO_64) 5932 5933 #undef DO_LDN_1 5934 #undef DO_LDN_2 5935 5936 /* 5937 * Load contiguous data, first-fault and no-fault. 5938 * 5939 * For user-only, one could argue that we should hold the mmap_lock during 5940 * the operation so that there is no race between page_check_range and the 5941 * load operation. However, unmapping pages out from under a running thread 5942 * is extraordinarily unlikely. This theoretical race condition also affects 5943 * linux-user/ in its get_user/put_user macros. 5944 * 5945 * TODO: Construct some helpers, written in assembly, that interact with 5946 * host_signal_handler to produce memory ops which can properly report errors 5947 * without racing. 5948 */ 5949 5950 /* Fault on byte I. All bits in FFR from I are cleared. The vector 5951 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE 5952 * option, which leaves subsequent data unchanged. 5953 */ 5954 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz) 5955 { 5956 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; 5957 5958 if (i & 63) { 5959 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63); 5960 i = ROUND_UP(i, 64); 5961 } 5962 for (; i < oprsz; i += 64) { 5963 ffr[i / 64] = 0; 5964 } 5965 } 5966 5967 /* 5968 * Common helper for all contiguous no-fault and first-fault loads. 5969 */ 5970 static inline QEMU_ALWAYS_INLINE 5971 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, 5972 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc, 5973 const int esz, const int msz, const SVEContFault fault, 5974 sve_ldst1_host_fn *host_fn, 5975 sve_ldst1_tlb_fn *tlb_fn) 5976 { 5977 const unsigned rd = simd_data(desc); 5978 void *vd = &env->vfp.zregs[rd]; 5979 const intptr_t reg_max = simd_oprsz(desc); 5980 intptr_t reg_off, mem_off, reg_last; 5981 SVEContLdSt info; 5982 int flags; 5983 void *host; 5984 5985 /* Find the active elements. */ 5986 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) { 5987 /* The entire predicate was false; no load occurs. */ 5988 memset(vd, 0, reg_max); 5989 return; 5990 } 5991 reg_off = info.reg_off_first[0]; 5992 5993 /* Probe the page(s). */ 5994 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) { 5995 /* Fault on first element. */ 5996 tcg_debug_assert(fault == FAULT_NO); 5997 memset(vd, 0, reg_max); 5998 goto do_fault; 5999 } 6000 6001 mem_off = info.mem_off_first[0]; 6002 flags = info.page[0].flags; 6003 6004 /* 6005 * Disable MTE checking if the Tagged bit is not set. Since TBI must 6006 * be set within MTEDESC for MTE, !mtedesc => !mte_active. 6007 */ 6008 if (!info.page[0].tagged) { 6009 mtedesc = 0; 6010 } 6011 6012 if (fault == FAULT_FIRST) { 6013 /* Trapping mte check for the first-fault element. 
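         *
         * Unlike the remaining elements, which use the non-trapping
         * mte_probe() and report failure through record_fault(), a tag
         * mismatch on the first element of a first-fault load raises the
         * MTE exception architecturally, just as an ordinary load would.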
*/ 6014 if (mtedesc) { 6015 mte_check(env, mtedesc, addr + mem_off, retaddr); 6016 } 6017 6018 /* 6019 * Special handling of the first active element, 6020 * if it crosses a page boundary or is MMIO. 6021 */ 6022 bool is_split = mem_off == info.mem_off_split; 6023 if (unlikely(flags != 0) || unlikely(is_split)) { 6024 /* 6025 * Use the slow path for cross-page handling. 6026 * Might trap for MMIO or watchpoints. 6027 */ 6028 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6029 6030 /* After any fault, zero the other elements. */ 6031 swap_memzero(vd, reg_off); 6032 reg_off += 1 << esz; 6033 mem_off += 1 << msz; 6034 swap_memzero(vd + reg_off, reg_max - reg_off); 6035 6036 if (is_split) { 6037 goto second_page; 6038 } 6039 } else { 6040 memset(vd, 0, reg_max); 6041 } 6042 } else { 6043 memset(vd, 0, reg_max); 6044 if (unlikely(mem_off == info.mem_off_split)) { 6045 /* The first active element crosses a page boundary. */ 6046 flags |= info.page[1].flags; 6047 if (unlikely(flags & TLB_MMIO)) { 6048 /* Some page is MMIO, see below. */ 6049 goto do_fault; 6050 } 6051 if (unlikely(flags & TLB_WATCHPOINT) && 6052 (cpu_watchpoint_address_matches 6053 (env_cpu(env), addr + mem_off, 1 << msz) 6054 & BP_MEM_READ)) { 6055 /* Watchpoint hit, see below. */ 6056 goto do_fault; 6057 } 6058 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6059 goto do_fault; 6060 } 6061 /* 6062 * Use the slow path for cross-page handling. 6063 * This is RAM, without a watchpoint, and will not trap. 6064 */ 6065 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6066 goto second_page; 6067 } 6068 } 6069 6070 /* 6071 * From this point on, all memory operations are MemSingleNF. 6072 * 6073 * Per the MemSingleNF pseudocode, a no-fault load from Device memory 6074 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead. 6075 * 6076 * Unfortunately we do not have access to the memory attributes from the 6077 * PTE to tell Device memory from Normal memory. So we make a mostly 6078 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO. 6079 * This gives the right answer for the common cases of "Normal memory, 6080 * backed by host RAM" and "Device memory, backed by MMIO". 6081 * The architecture allows us to suppress an NF load and return 6082 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner 6083 * case of "Normal memory, backed by MMIO" is permitted. The case we 6084 * get wrong is "Device memory, backed by host RAM", for which we 6085 * should return (UNKNOWN, FAULT) but do not. 6086 * 6087 * Similarly, CPU_BP breakpoints would raise exceptions, and so 6088 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and 6089 * architectural breakpoints the same. 6090 */ 6091 if (unlikely(flags & TLB_MMIO)) { 6092 goto do_fault; 6093 } 6094 6095 reg_last = info.reg_off_last[0]; 6096 host = info.page[0].host; 6097 6098 do { 6099 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3)); 6100 do { 6101 if ((pg >> (reg_off & 63)) & 1) { 6102 if (unlikely(flags & TLB_WATCHPOINT) && 6103 (cpu_watchpoint_address_matches 6104 (env_cpu(env), addr + mem_off, 1 << msz) 6105 & BP_MEM_READ)) { 6106 goto do_fault; 6107 } 6108 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6109 goto do_fault; 6110 } 6111 host_fn(vd, reg_off, host + mem_off); 6112 } 6113 reg_off += 1 << esz; 6114 mem_off += 1 << msz; 6115 } while (reg_off <= reg_last && (reg_off & 63)); 6116 } while (reg_off <= reg_last); 6117 6118 /* 6119 * MemSingleNF is allowed to fail for any reason.
We have special 6120 * code above to handle the first element crossing a page boundary. 6121 * As an implementation choice, decline to handle a cross-page element 6122 * in any other position. 6123 */ 6124 reg_off = info.reg_off_split; 6125 if (reg_off >= 0) { 6126 goto do_fault; 6127 } 6128 6129 second_page: 6130 reg_off = info.reg_off_first[1]; 6131 if (likely(reg_off < 0)) { 6132 /* No active elements on the second page. All done. */ 6133 return; 6134 } 6135 6136 /* 6137 * MemSingleNF is allowed to fail for any reason. As an implementation 6138 * choice, decline to handle elements on the second page. This should 6139 * be low frequency as the guest walks through memory -- the next 6140 * iteration of the guest's loop should be aligned on the page boundary, 6141 * and then all following iterations will stay aligned. 6142 */ 6143 6144 do_fault: 6145 record_fault(env, reg_off, reg_max); 6146 } 6147 6148 static inline QEMU_ALWAYS_INLINE 6149 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, 6150 uint32_t desc, const uintptr_t retaddr, 6151 const int esz, const int msz, const SVEContFault fault, 6152 sve_ldst1_host_fn *host_fn, 6153 sve_ldst1_tlb_fn *tlb_fn) 6154 { 6155 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6156 int bit55 = extract64(addr, 55, 1); 6157 6158 /* Remove mtedesc from the normal sve descriptor. */ 6159 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6160 6161 /* Perform gross MTE suppression early. */ 6162 if (!tbi_check(desc, bit55) || 6163 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { 6164 mtedesc = 0; 6165 } 6166 6167 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc, 6168 esz, msz, fault, host_fn, tlb_fn); 6169 } 6170 6171 #define DO_LDFF1_LDNF1_1(PART, ESZ) \ 6172 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ 6173 target_ulong addr, uint32_t desc) \ 6174 { \ 6175 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ 6176 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6177 } \ 6178 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ 6179 target_ulong addr, uint32_t desc) \ 6180 { \ 6181 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ 6182 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6183 } \ 6184 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6185 target_ulong addr, uint32_t desc) \ 6186 { \ 6187 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ 6188 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6189 } \ 6190 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6191 target_ulong addr, uint32_t desc) \ 6192 { \ 6193 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ 6194 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6195 } 6196 6197 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ 6198 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ 6199 target_ulong addr, uint32_t desc) \ 6200 { \ 6201 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6202 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6203 } \ 6204 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ 6205 target_ulong addr, uint32_t desc) \ 6206 { \ 6207 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6208 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6209 } \ 6210 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ 6211 target_ulong addr, uint32_t desc) \ 6212 { \ 6213 sve_ldnfff1_r(env, vg, addr, desc, 
GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6214 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6215 } \ 6216 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ 6217 target_ulong addr, uint32_t desc) \ 6218 { \ 6219 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6220 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6221 } \ 6222 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6223 target_ulong addr, uint32_t desc) \ 6224 { \ 6225 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6226 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6227 } \ 6228 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6229 target_ulong addr, uint32_t desc) \ 6230 { \ 6231 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6232 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6233 } \ 6234 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6235 target_ulong addr, uint32_t desc) \ 6236 { \ 6237 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6238 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6239 } \ 6240 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6241 target_ulong addr, uint32_t desc) \ 6242 { \ 6243 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6244 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6245 } 6246 6247 DO_LDFF1_LDNF1_1(bb, MO_8) 6248 DO_LDFF1_LDNF1_1(bhu, MO_16) 6249 DO_LDFF1_LDNF1_1(bhs, MO_16) 6250 DO_LDFF1_LDNF1_1(bsu, MO_32) 6251 DO_LDFF1_LDNF1_1(bss, MO_32) 6252 DO_LDFF1_LDNF1_1(bdu, MO_64) 6253 DO_LDFF1_LDNF1_1(bds, MO_64) 6254 6255 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16) 6256 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16) 6257 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16) 6258 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16) 6259 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16) 6260 6261 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32) 6262 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32) 6263 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32) 6264 6265 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64) 6266 6267 #undef DO_LDFF1_LDNF1_1 6268 #undef DO_LDFF1_LDNF1_2 6269 6270 /* 6271 * Common helper for all contiguous 1,2,3,4-register predicated stores. 6272 */ 6273 6274 static inline QEMU_ALWAYS_INLINE 6275 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, 6276 uint32_t desc, const uintptr_t retaddr, 6277 const int esz, const int msz, const int N, uint32_t mtedesc, 6278 sve_ldst1_host_fn *host_fn, 6279 sve_ldst1_tlb_fn *tlb_fn) 6280 { 6281 const unsigned rd = simd_data(desc); 6282 const intptr_t reg_max = simd_oprsz(desc); 6283 intptr_t reg_off, reg_last, mem_off; 6284 SVEContLdSt info; 6285 void *host; 6286 int i, flags; 6287 6288 /* Find the active elements. */ 6289 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 6290 /* The entire predicate was false; no store occurs. */ 6291 return; 6292 } 6293 6294 /* Probe the page(s). Exit with exception for any invalid page. */ 6295 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr); 6296 6297 /* Handle watchpoints for all active elements. */ 6298 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 6299 BP_MEM_WRITE, retaddr); 6300 6301 /* 6302 * Handle mte checks for all active elements. 6303 * Since TBI must be set for MTE, !mtedesc => !mte_active. 
6304 */ 6305 if (mtedesc) { 6306 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 6307 mtedesc, retaddr); 6308 } 6309 6310 flags = info.page[0].flags | info.page[1].flags; 6311 if (unlikely(flags != 0)) { 6312 #ifdef CONFIG_USER_ONLY 6313 g_assert_not_reached(); 6314 #else 6315 /* 6316 * At least one page includes MMIO. 6317 * Any bus operation can fail with cpu_transaction_failed, 6318 * which for ARM will raise SyncExternal. We cannot avoid 6319 * this fault and will leave with the store incomplete. 6320 */ 6321 mem_off = info.mem_off_first[0]; 6322 reg_off = info.reg_off_first[0]; 6323 reg_last = info.reg_off_last[1]; 6324 if (reg_last < 0) { 6325 reg_last = info.reg_off_split; 6326 if (reg_last < 0) { 6327 reg_last = info.reg_off_last[0]; 6328 } 6329 } 6330 6331 do { 6332 uint64_t pg = vg[reg_off >> 6]; 6333 do { 6334 if ((pg >> (reg_off & 63)) & 1) { 6335 for (i = 0; i < N; ++i) { 6336 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6337 addr + mem_off + (i << msz), retaddr); 6338 } 6339 } 6340 reg_off += 1 << esz; 6341 mem_off += N << msz; 6342 } while (reg_off & 63); 6343 } while (reg_off <= reg_last); 6344 return; 6345 #endif 6346 } 6347 6348 mem_off = info.mem_off_first[0]; 6349 reg_off = info.reg_off_first[0]; 6350 reg_last = info.reg_off_last[0]; 6351 host = info.page[0].host; 6352 6353 while (reg_off <= reg_last) { 6354 uint64_t pg = vg[reg_off >> 6]; 6355 do { 6356 if ((pg >> (reg_off & 63)) & 1) { 6357 for (i = 0; i < N; ++i) { 6358 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6359 host + mem_off + (i << msz)); 6360 } 6361 } 6362 reg_off += 1 << esz; 6363 mem_off += N << msz; 6364 } while (reg_off <= reg_last && (reg_off & 63)); 6365 } 6366 6367 /* 6368 * Use the slow path to manage the cross-page misalignment. 6369 * But we know this is RAM and cannot trap. 6370 */ 6371 mem_off = info.mem_off_split; 6372 if (unlikely(mem_off >= 0)) { 6373 reg_off = info.reg_off_split; 6374 for (i = 0; i < N; ++i) { 6375 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6376 addr + mem_off + (i << msz), retaddr); 6377 } 6378 } 6379 6380 mem_off = info.mem_off_first[1]; 6381 if (unlikely(mem_off >= 0)) { 6382 reg_off = info.reg_off_first[1]; 6383 reg_last = info.reg_off_last[1]; 6384 host = info.page[1].host; 6385 6386 do { 6387 uint64_t pg = vg[reg_off >> 6]; 6388 do { 6389 if ((pg >> (reg_off & 63)) & 1) { 6390 for (i = 0; i < N; ++i) { 6391 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6392 host + mem_off + (i << msz)); 6393 } 6394 } 6395 reg_off += 1 << esz; 6396 mem_off += N << msz; 6397 } while (reg_off & 63); 6398 } while (reg_off <= reg_last); 6399 } 6400 } 6401 6402 static inline QEMU_ALWAYS_INLINE 6403 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6404 uint32_t desc, const uintptr_t ra, 6405 const int esz, const int msz, const int N, 6406 sve_ldst1_host_fn *host_fn, 6407 sve_ldst1_tlb_fn *tlb_fn) 6408 { 6409 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6410 int bit55 = extract64(addr, 55, 1); 6411 6412 /* Remove mtedesc from the normal sve descriptor. */ 6413 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6414 6415 /* Perform gross MTE suppression early. 
*/ 6416 if (!tbi_check(desc, bit55) || 6417 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { 6418 mtedesc = 0; 6419 } 6420 6421 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6422 } 6423 6424 #define DO_STN_1(N, NAME, ESZ) \ 6425 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ 6426 target_ulong addr, uint32_t desc) \ 6427 { \ 6428 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ 6429 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6430 } \ 6431 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ 6432 target_ulong addr, uint32_t desc) \ 6433 { \ 6434 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ 6435 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6436 } 6437 6438 #define DO_STN_2(N, NAME, ESZ, MSZ) \ 6439 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ 6440 target_ulong addr, uint32_t desc) \ 6441 { \ 6442 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6443 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6444 } \ 6445 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ 6446 target_ulong addr, uint32_t desc) \ 6447 { \ 6448 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6449 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6450 } \ 6451 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6452 target_ulong addr, uint32_t desc) \ 6453 { \ 6454 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6455 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6456 } \ 6457 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6458 target_ulong addr, uint32_t desc) \ 6459 { \ 6460 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6461 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6462 } 6463 6464 DO_STN_1(1, bb, MO_8) 6465 DO_STN_1(1, bh, MO_16) 6466 DO_STN_1(1, bs, MO_32) 6467 DO_STN_1(1, bd, MO_64) 6468 DO_STN_1(2, bb, MO_8) 6469 DO_STN_1(3, bb, MO_8) 6470 DO_STN_1(4, bb, MO_8) 6471 6472 DO_STN_2(1, hh, MO_16, MO_16) 6473 DO_STN_2(1, hs, MO_32, MO_16) 6474 DO_STN_2(1, hd, MO_64, MO_16) 6475 DO_STN_2(2, hh, MO_16, MO_16) 6476 DO_STN_2(3, hh, MO_16, MO_16) 6477 DO_STN_2(4, hh, MO_16, MO_16) 6478 6479 DO_STN_2(1, ss, MO_32, MO_32) 6480 DO_STN_2(1, sd, MO_64, MO_32) 6481 DO_STN_2(2, ss, MO_32, MO_32) 6482 DO_STN_2(3, ss, MO_32, MO_32) 6483 DO_STN_2(4, ss, MO_32, MO_32) 6484 6485 DO_STN_2(1, dd, MO_64, MO_64) 6486 DO_STN_2(2, dd, MO_64, MO_64) 6487 DO_STN_2(3, dd, MO_64, MO_64) 6488 DO_STN_2(4, dd, MO_64, MO_64) 6489 6490 #undef DO_STN_1 6491 #undef DO_STN_2 6492 6493 /* 6494 * Loads with a vector index. 6495 */ 6496 6497 /* 6498 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed. 
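 *
 * The off_* readers below supply the per-element offsets for the gather
 * and scatter helpers: 32-bit offsets, zero- or sign-extended (zsu/zss),
 * for both S and D element sizes, and untruncated 64-bit offsets (zd).
 * The callers then form each effective address as
 *
 *   addr = base + (off_fn(vm, reg_off) << scale);
 *
 * e.g. (illustrative values only) with base = 0x1000, scale = 2 and an
 * offset element of 3, the element is accessed at 0x1000 + (3 << 2).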
6499 */ 6500 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs); 6501 6502 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs) 6503 { 6504 return *(uint32_t *)(reg + H1_4(reg_ofs)); 6505 } 6506 6507 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs) 6508 { 6509 return *(int32_t *)(reg + H1_4(reg_ofs)); 6510 } 6511 6512 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs) 6513 { 6514 return (uint32_t)*(uint64_t *)(reg + reg_ofs); 6515 } 6516 6517 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs) 6518 { 6519 return (int32_t)*(uint64_t *)(reg + reg_ofs); 6520 } 6521 6522 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs) 6523 { 6524 return *(uint64_t *)(reg + reg_ofs); 6525 } 6526 6527 static inline QEMU_ALWAYS_INLINE 6528 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6529 target_ulong base, uint32_t desc, uintptr_t retaddr, 6530 uint32_t mtedesc, int esize, int msize, 6531 zreg_off_fn *off_fn, 6532 sve_ldst1_host_fn *host_fn, 6533 sve_ldst1_tlb_fn *tlb_fn) 6534 { 6535 const int mmu_idx = cpu_mmu_index(env, false); 6536 const intptr_t reg_max = simd_oprsz(desc); 6537 const int scale = simd_data(desc); 6538 ARMVectorReg scratch; 6539 intptr_t reg_off; 6540 SVEHostPage info, info2; 6541 6542 memset(&scratch, 0, reg_max); 6543 reg_off = 0; 6544 do { 6545 uint64_t pg = vg[reg_off >> 6]; 6546 do { 6547 if (likely(pg & 1)) { 6548 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6549 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 6550 6551 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD, 6552 mmu_idx, retaddr); 6553 6554 if (likely(in_page >= msize)) { 6555 if (unlikely(info.flags & TLB_WATCHPOINT)) { 6556 cpu_check_watchpoint(env_cpu(env), addr, msize, 6557 info.attrs, BP_MEM_READ, retaddr); 6558 } 6559 if (mtedesc && info.tagged) { 6560 mte_check(env, mtedesc, addr, retaddr); 6561 } 6562 if (unlikely(info.flags & TLB_MMIO)) { 6563 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6564 } else { 6565 host_fn(&scratch, reg_off, info.host); 6566 } 6567 } else { 6568 /* Element crosses the page boundary. */ 6569 sve_probe_page(&info2, false, env, addr + in_page, 0, 6570 MMU_DATA_LOAD, mmu_idx, retaddr); 6571 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) { 6572 cpu_check_watchpoint(env_cpu(env), addr, 6573 msize, info.attrs, 6574 BP_MEM_READ, retaddr); 6575 } 6576 if (mtedesc && info.tagged) { 6577 mte_check(env, mtedesc, addr, retaddr); 6578 } 6579 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6580 } 6581 } 6582 reg_off += esize; 6583 pg >>= esize; 6584 } while (reg_off & 63); 6585 } while (reg_off < reg_max); 6586 6587 /* Wait until all exceptions have been raised to write back. */ 6588 memcpy(vd, &scratch, reg_max); 6589 } 6590 6591 static inline QEMU_ALWAYS_INLINE 6592 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6593 target_ulong base, uint32_t desc, uintptr_t retaddr, 6594 int esize, int msize, zreg_off_fn *off_fn, 6595 sve_ldst1_host_fn *host_fn, 6596 sve_ldst1_tlb_fn *tlb_fn) 6597 { 6598 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6599 /* Remove mtedesc from the normal sve descriptor. */ 6600 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6601 6602 /* 6603 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 6604 * offset base entirely over the address space hole to change the 6605 * pointer tag, or change the bit55 selector. So we could here 6606 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
6607 */ 6608 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 6609 esize, msize, off_fn, host_fn, tlb_fn); 6610 } 6611 6612 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ 6613 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6614 void *vm, target_ulong base, uint32_t desc) \ 6615 { \ 6616 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 6617 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6618 } \ 6619 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6620 void *vm, target_ulong base, uint32_t desc) \ 6621 { \ 6622 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 6623 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6624 } 6625 6626 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ 6627 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6628 void *vm, target_ulong base, uint32_t desc) \ 6629 { \ 6630 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 6631 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6632 } \ 6633 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6634 void *vm, target_ulong base, uint32_t desc) \ 6635 { \ 6636 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 6637 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6638 } 6639 6640 DO_LD1_ZPZ_S(bsu, zsu, MO_8) 6641 DO_LD1_ZPZ_S(bsu, zss, MO_8) 6642 DO_LD1_ZPZ_D(bdu, zsu, MO_8) 6643 DO_LD1_ZPZ_D(bdu, zss, MO_8) 6644 DO_LD1_ZPZ_D(bdu, zd, MO_8) 6645 6646 DO_LD1_ZPZ_S(bss, zsu, MO_8) 6647 DO_LD1_ZPZ_S(bss, zss, MO_8) 6648 DO_LD1_ZPZ_D(bds, zsu, MO_8) 6649 DO_LD1_ZPZ_D(bds, zss, MO_8) 6650 DO_LD1_ZPZ_D(bds, zd, MO_8) 6651 6652 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16) 6653 DO_LD1_ZPZ_S(hsu_le, zss, MO_16) 6654 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16) 6655 DO_LD1_ZPZ_D(hdu_le, zss, MO_16) 6656 DO_LD1_ZPZ_D(hdu_le, zd, MO_16) 6657 6658 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16) 6659 DO_LD1_ZPZ_S(hsu_be, zss, MO_16) 6660 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16) 6661 DO_LD1_ZPZ_D(hdu_be, zss, MO_16) 6662 DO_LD1_ZPZ_D(hdu_be, zd, MO_16) 6663 6664 DO_LD1_ZPZ_S(hss_le, zsu, MO_16) 6665 DO_LD1_ZPZ_S(hss_le, zss, MO_16) 6666 DO_LD1_ZPZ_D(hds_le, zsu, MO_16) 6667 DO_LD1_ZPZ_D(hds_le, zss, MO_16) 6668 DO_LD1_ZPZ_D(hds_le, zd, MO_16) 6669 6670 DO_LD1_ZPZ_S(hss_be, zsu, MO_16) 6671 DO_LD1_ZPZ_S(hss_be, zss, MO_16) 6672 DO_LD1_ZPZ_D(hds_be, zsu, MO_16) 6673 DO_LD1_ZPZ_D(hds_be, zss, MO_16) 6674 DO_LD1_ZPZ_D(hds_be, zd, MO_16) 6675 6676 DO_LD1_ZPZ_S(ss_le, zsu, MO_32) 6677 DO_LD1_ZPZ_S(ss_le, zss, MO_32) 6678 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32) 6679 DO_LD1_ZPZ_D(sdu_le, zss, MO_32) 6680 DO_LD1_ZPZ_D(sdu_le, zd, MO_32) 6681 6682 DO_LD1_ZPZ_S(ss_be, zsu, MO_32) 6683 DO_LD1_ZPZ_S(ss_be, zss, MO_32) 6684 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32) 6685 DO_LD1_ZPZ_D(sdu_be, zss, MO_32) 6686 DO_LD1_ZPZ_D(sdu_be, zd, MO_32) 6687 6688 DO_LD1_ZPZ_D(sds_le, zsu, MO_32) 6689 DO_LD1_ZPZ_D(sds_le, zss, MO_32) 6690 DO_LD1_ZPZ_D(sds_le, zd, MO_32) 6691 6692 DO_LD1_ZPZ_D(sds_be, zsu, MO_32) 6693 DO_LD1_ZPZ_D(sds_be, zss, MO_32) 6694 DO_LD1_ZPZ_D(sds_be, zd, MO_32) 6695 6696 DO_LD1_ZPZ_D(dd_le, zsu, MO_64) 6697 DO_LD1_ZPZ_D(dd_le, zss, MO_64) 6698 DO_LD1_ZPZ_D(dd_le, zd, MO_64) 6699 6700 DO_LD1_ZPZ_D(dd_be, zsu, MO_64) 6701 DO_LD1_ZPZ_D(dd_be, zss, MO_64) 6702 DO_LD1_ZPZ_D(dd_be, zd, MO_64) 6703 6704 #undef DO_LD1_ZPZ_S 6705 #undef DO_LD1_ZPZ_D 6706 6707 /* First fault loads with a vector index. */ 6708 6709 /* 6710 * Common helpers for all gather first-faulting loads. 
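 *
 * A sketch of the flow implemented by sve_ldff1_z() below:
 *
 *   reg_off = first active element (find_next_active);
 *   load it via tlb_fn() -- this one element may fault architecturally;
 *   for each subsequent active element:
 *       probe its page with nofault set;
 *       if the page is invalid or MMIO, a read watchpoint matches,
 *       the MTE probe fails, or the element crosses a page boundary:
 *           record_fault() -- truncate FFR from this element and stop;
 *       else:
 *           host_fn() -- load directly from host memory.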
6711 */ 6712 6713 static inline QEMU_ALWAYS_INLINE 6714 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6715 target_ulong base, uint32_t desc, uintptr_t retaddr, 6716 uint32_t mtedesc, const int esz, const int msz, 6717 zreg_off_fn *off_fn, 6718 sve_ldst1_host_fn *host_fn, 6719 sve_ldst1_tlb_fn *tlb_fn) 6720 { 6721 const int mmu_idx = cpu_mmu_index(env, false); 6722 const intptr_t reg_max = simd_oprsz(desc); 6723 const int scale = simd_data(desc); 6724 const int esize = 1 << esz; 6725 const int msize = 1 << msz; 6726 intptr_t reg_off; 6727 SVEHostPage info; 6728 target_ulong addr, in_page; 6729 6730 /* Skip to the first true predicate. */ 6731 reg_off = find_next_active(vg, 0, reg_max, esz); 6732 if (unlikely(reg_off >= reg_max)) { 6733 /* The entire predicate was false; no load occurs. */ 6734 memset(vd, 0, reg_max); 6735 return; 6736 } 6737 6738 /* 6739 * Probe the first element, allowing faults. 6740 */ 6741 addr = base + (off_fn(vm, reg_off) << scale); 6742 if (mtedesc) { 6743 mte_check(env, mtedesc, addr, retaddr); 6744 } 6745 tlb_fn(env, vd, reg_off, addr, retaddr); 6746 6747 /* After any fault, zero the other elements. */ 6748 swap_memzero(vd, reg_off); 6749 reg_off += esize; 6750 swap_memzero(vd + reg_off, reg_max - reg_off); 6751 6752 /* 6753 * Probe the remaining elements, not allowing faults. 6754 */ 6755 while (reg_off < reg_max) { 6756 uint64_t pg = vg[reg_off >> 6]; 6757 do { 6758 if (likely((pg >> (reg_off & 63)) & 1)) { 6759 addr = base + (off_fn(vm, reg_off) << scale); 6760 in_page = -(addr | TARGET_PAGE_MASK); 6761 6762 if (unlikely(in_page < msize)) { 6763 /* Stop if the element crosses a page boundary. */ 6764 goto fault; 6765 } 6766 6767 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD, 6768 mmu_idx, retaddr); 6769 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) { 6770 goto fault; 6771 } 6772 if (unlikely(info.flags & TLB_WATCHPOINT) && 6773 (cpu_watchpoint_address_matches 6774 (env_cpu(env), addr, msize) & BP_MEM_READ)) { 6775 goto fault; 6776 } 6777 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) { 6778 goto fault; 6779 } 6780 6781 host_fn(vd, reg_off, info.host); 6782 } 6783 reg_off += esize; 6784 } while (reg_off & 63); 6785 } 6786 return; 6787 6788 fault: 6789 record_fault(env, reg_off, reg_max); 6790 } 6791 6792 static inline QEMU_ALWAYS_INLINE 6793 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6794 target_ulong base, uint32_t desc, uintptr_t retaddr, 6795 const int esz, const int msz, 6796 zreg_off_fn *off_fn, 6797 sve_ldst1_host_fn *host_fn, 6798 sve_ldst1_tlb_fn *tlb_fn) 6799 { 6800 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6801 /* Remove mtedesc from the normal sve descriptor. */ 6802 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6803 6804 /* 6805 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 6806 * offset base entirely over the address space hole to change the 6807 * pointer tag, or change the bit55 selector. So we could here 6808 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
6809 */ 6810 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 6811 esz, msz, off_fn, host_fn, tlb_fn); 6812 } 6813 6814 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ 6815 void HELPER(sve_ldff##MEM##_##OFS) \ 6816 (CPUARMState *env, void *vd, void *vg, \ 6817 void *vm, target_ulong base, uint32_t desc) \ 6818 { \ 6819 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \ 6820 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6821 } \ 6822 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 6823 (CPUARMState *env, void *vd, void *vg, \ 6824 void *vm, target_ulong base, uint32_t desc) \ 6825 { \ 6826 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ 6827 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6828 } 6829 6830 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ 6831 void HELPER(sve_ldff##MEM##_##OFS) \ 6832 (CPUARMState *env, void *vd, void *vg, \ 6833 void *vm, target_ulong base, uint32_t desc) \ 6834 { \ 6835 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \ 6836 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6837 } \ 6838 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 6839 (CPUARMState *env, void *vd, void *vg, \ 6840 void *vm, target_ulong base, uint32_t desc) \ 6841 { \ 6842 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ 6843 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6844 } 6845 6846 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) 6847 DO_LDFF1_ZPZ_S(bsu, zss, MO_8) 6848 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8) 6849 DO_LDFF1_ZPZ_D(bdu, zss, MO_8) 6850 DO_LDFF1_ZPZ_D(bdu, zd, MO_8) 6851 6852 DO_LDFF1_ZPZ_S(bss, zsu, MO_8) 6853 DO_LDFF1_ZPZ_S(bss, zss, MO_8) 6854 DO_LDFF1_ZPZ_D(bds, zsu, MO_8) 6855 DO_LDFF1_ZPZ_D(bds, zss, MO_8) 6856 DO_LDFF1_ZPZ_D(bds, zd, MO_8) 6857 6858 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16) 6859 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16) 6860 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16) 6861 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16) 6862 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16) 6863 6864 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16) 6865 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16) 6866 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16) 6867 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16) 6868 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16) 6869 6870 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16) 6871 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16) 6872 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16) 6873 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16) 6874 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16) 6875 6876 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16) 6877 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16) 6878 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16) 6879 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16) 6880 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16) 6881 6882 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32) 6883 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32) 6884 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32) 6885 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32) 6886 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32) 6887 6888 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32) 6889 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32) 6890 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32) 6891 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32) 6892 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32) 6893 6894 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32) 6895 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32) 6896 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32) 6897 6898 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32) 6899 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32) 6900 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32) 6901 6902 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64) 6903 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64) 6904 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64) 6905 6906 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64) 6907 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64) 6908 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64) 6909 6910 /* Stores with a vector index. 
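 *
 * sve_st1_z() below works in two passes, so that every exception we can
 * recognize (everything except a SyncExternal from MMIO) is raised
 * before any data is written:
 *
 *   pass 1: for each active element, probe the page(s) it touches,
 *           raising watchpoint and MTE faults; record the host address,
 *           or NULL for inactive, MMIO, or page-crossing elements;
 *   pass 2: store through the recorded host address when present,
 *           otherwise fall back to the per-element tlb_fn slow path.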
*/ 6911 6912 static inline QEMU_ALWAYS_INLINE 6913 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6914 target_ulong base, uint32_t desc, uintptr_t retaddr, 6915 uint32_t mtedesc, int esize, int msize, 6916 zreg_off_fn *off_fn, 6917 sve_ldst1_host_fn *host_fn, 6918 sve_ldst1_tlb_fn *tlb_fn) 6919 { 6920 const int mmu_idx = cpu_mmu_index(env, false); 6921 const intptr_t reg_max = simd_oprsz(desc); 6922 const int scale = simd_data(desc); 6923 void *host[ARM_MAX_VQ * 4]; 6924 intptr_t reg_off, i; 6925 SVEHostPage info, info2; 6926 6927 /* 6928 * Probe all of the elements for host addresses and flags. 6929 */ 6930 i = reg_off = 0; 6931 do { 6932 uint64_t pg = vg[reg_off >> 6]; 6933 do { 6934 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6935 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 6936 6937 host[i] = NULL; 6938 if (likely((pg >> (reg_off & 63)) & 1)) { 6939 if (likely(in_page >= msize)) { 6940 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE, 6941 mmu_idx, retaddr); 6942 if (!(info.flags & TLB_MMIO)) { 6943 host[i] = info.host; 6944 } 6945 } else { 6946 /* 6947 * Element crosses the page boundary. 6948 * Probe both pages, but do not record the host address, 6949 * so that we use the slow path. 6950 */ 6951 sve_probe_page(&info, false, env, addr, 0, 6952 MMU_DATA_STORE, mmu_idx, retaddr); 6953 sve_probe_page(&info2, false, env, addr + in_page, 0, 6954 MMU_DATA_STORE, mmu_idx, retaddr); 6955 info.flags |= info2.flags; 6956 } 6957 6958 if (unlikely(info.flags & TLB_WATCHPOINT)) { 6959 cpu_check_watchpoint(env_cpu(env), addr, msize, 6960 info.attrs, BP_MEM_WRITE, retaddr); 6961 } 6962 6963 if (mtedesc && info.tagged) { 6964 mte_check(env, mtedesc, addr, retaddr); 6965 } 6966 } 6967 i += 1; 6968 reg_off += esize; 6969 } while (reg_off & 63); 6970 } while (reg_off < reg_max); 6971 6972 /* 6973 * Now that we have recognized all exceptions except SyncExternal 6974 * (from TLB_MMIO), which we cannot avoid, perform all of the stores. 6975 * 6976 * Note for the common case of an element in RAM, not crossing a page 6977 * boundary, we have stored the host address in host[]. This doubles 6978 * as a first-level check against the predicate, since only enabled 6979 * elements have non-null host addresses. 6980 */ 6981 i = reg_off = 0; 6982 do { 6983 void *h = host[i]; 6984 if (likely(h != NULL)) { 6985 host_fn(vd, reg_off, h); 6986 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) { 6987 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6988 tlb_fn(env, vd, reg_off, addr, retaddr); 6989 } 6990 i += 1; 6991 reg_off += esize; 6992 } while (reg_off < reg_max); 6993 } 6994 6995 static inline QEMU_ALWAYS_INLINE 6996 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6997 target_ulong base, uint32_t desc, uintptr_t retaddr, 6998 int esize, int msize, zreg_off_fn *off_fn, 6999 sve_ldst1_host_fn *host_fn, 7000 sve_ldst1_tlb_fn *tlb_fn) 7001 { 7002 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7003 /* Remove mtedesc from the normal sve descriptor. */ 7004 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7005 7006 /* 7007 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7008 * offset base entirely over the address space hole to change the 7009 * pointer tag, or change the bit55 selector. So we could here 7010 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
7011 */ 7012 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 7013 esize, msize, off_fn, host_fn, tlb_fn); 7014 } 7015 7016 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \ 7017 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7018 void *vm, target_ulong base, uint32_t desc) \ 7019 { \ 7020 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 7021 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7022 } \ 7023 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7024 void *vm, target_ulong base, uint32_t desc) \ 7025 { \ 7026 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 7027 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7028 } 7029 7030 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \ 7031 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7032 void *vm, target_ulong base, uint32_t desc) \ 7033 { \ 7034 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 7035 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7036 } \ 7037 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7038 void *vm, target_ulong base, uint32_t desc) \ 7039 { \ 7040 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 7041 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7042 } 7043 7044 DO_ST1_ZPZ_S(bs, zsu, MO_8) 7045 DO_ST1_ZPZ_S(hs_le, zsu, MO_16) 7046 DO_ST1_ZPZ_S(hs_be, zsu, MO_16) 7047 DO_ST1_ZPZ_S(ss_le, zsu, MO_32) 7048 DO_ST1_ZPZ_S(ss_be, zsu, MO_32) 7049 7050 DO_ST1_ZPZ_S(bs, zss, MO_8) 7051 DO_ST1_ZPZ_S(hs_le, zss, MO_16) 7052 DO_ST1_ZPZ_S(hs_be, zss, MO_16) 7053 DO_ST1_ZPZ_S(ss_le, zss, MO_32) 7054 DO_ST1_ZPZ_S(ss_be, zss, MO_32) 7055 7056 DO_ST1_ZPZ_D(bd, zsu, MO_8) 7057 DO_ST1_ZPZ_D(hd_le, zsu, MO_16) 7058 DO_ST1_ZPZ_D(hd_be, zsu, MO_16) 7059 DO_ST1_ZPZ_D(sd_le, zsu, MO_32) 7060 DO_ST1_ZPZ_D(sd_be, zsu, MO_32) 7061 DO_ST1_ZPZ_D(dd_le, zsu, MO_64) 7062 DO_ST1_ZPZ_D(dd_be, zsu, MO_64) 7063 7064 DO_ST1_ZPZ_D(bd, zss, MO_8) 7065 DO_ST1_ZPZ_D(hd_le, zss, MO_16) 7066 DO_ST1_ZPZ_D(hd_be, zss, MO_16) 7067 DO_ST1_ZPZ_D(sd_le, zss, MO_32) 7068 DO_ST1_ZPZ_D(sd_be, zss, MO_32) 7069 DO_ST1_ZPZ_D(dd_le, zss, MO_64) 7070 DO_ST1_ZPZ_D(dd_be, zss, MO_64) 7071 7072 DO_ST1_ZPZ_D(bd, zd, MO_8) 7073 DO_ST1_ZPZ_D(hd_le, zd, MO_16) 7074 DO_ST1_ZPZ_D(hd_be, zd, MO_16) 7075 DO_ST1_ZPZ_D(sd_le, zd, MO_32) 7076 DO_ST1_ZPZ_D(sd_be, zd, MO_32) 7077 DO_ST1_ZPZ_D(dd_le, zd, MO_64) 7078 DO_ST1_ZPZ_D(dd_be, zd, MO_64) 7079 7080 #undef DO_ST1_ZPZ_S 7081 #undef DO_ST1_ZPZ_D 7082 7083 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7084 { 7085 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7086 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7087 7088 for (i = 0; i < opr_sz; ++i) { 7089 d[i] = n[i] ^ m[i] ^ k[i]; 7090 } 7091 } 7092 7093 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7094 { 7095 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7096 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7097 7098 for (i = 0; i < opr_sz; ++i) { 7099 d[i] = n[i] ^ (m[i] & ~k[i]); 7100 } 7101 } 7102 7103 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7104 { 7105 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7106 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7107 7108 for (i = 0; i < opr_sz; ++i) { 7109 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]); 7110 } 7111 } 7112 7113 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7114 { 7115 intptr_t i, opr_sz = simd_oprsz(desc) / 

/*
 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
 * See hasless(v,1) from
 *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    int bits = 8 << esz;
    uint64_t ones = dup_const(esz, 1);
    uint64_t signs = ones << (bits - 1);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(esz, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;
    cmp0 = (cmp0 - ones) & ~cmp0;
    cmp1 = (cmp1 - ones) & ~cmp1;
    return (cmp0 | cmp1) & signs;
}

static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
                                uint32_t desc, int esz, bool nmatch)
{
    uint16_t esz_mask = pred_esz_masks[esz];
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t flags = PREDTEST_INIT;
    intptr_t i, j, k;

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
        uint16_t out = 0;

        for (j = 0; j < 16; j += 8) {
            uint64_t n = *(uint64_t *)(vn + i + j);

            for (k = 0; k < 8; k += 1 << esz) {
                if (pg & (1 << (j + k))) {
                    bool o = do_match2(n >> (k * 8), m0, m1, esz);
                    out |= (o ^ nmatch) << (j + k);
                }
            }
        }
        *(uint16_t *)(vd + H1_2(i >> 3)) = out;
        flags = iter_predtest_fwd(out, pg, flags);
    }
    return flags;
}

#define DO_PPZZ_MATCH(NAME, ESZ, INV) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
}

DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)

DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)

#undef DO_PPZZ_MATCH
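
/*
 * A minimal standalone sketch of the "zero in word" trick that
 * do_match2 applies, specialised to byte elements.  contains_byte_ref()
 * is a hypothetical illustration only: it reports whether any byte of
 * 'haystack' equals 'needle'.  XORing with the replicated needle turns
 * "byte equals needle" into "byte is zero", and (x - 0x01..) & ~x then
 * sets the sign bit of exactly the zero bytes.  do_match2 runs the same
 * test against two 64-bit words (m0 and m1) at once and generalises it
 * to 16-bit elements via esz.
 */
static inline bool contains_byte_ref(uint64_t haystack, uint8_t needle)
{
    uint64_t ones = dup_const(MO_8, 0x01);
    uint64_t signs = dup_const(MO_8, 0x80);
    uint64_t x = haystack ^ dup_const(MO_8, needle);

    return ((x - ones) & ~x & signs) != 0;
}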

void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz; i += 4) {
        uint64_t count = 0;
        uint8_t pred;

        pred = pg[H1(i >> 3)] >> (i & 7);
        if (pred & 1) {
            uint32_t nn = n[H4(i >> 2)];

            for (j = 0; j <= i; j += 4) {
                pred = pg[H1(j >> 3)] >> (j & 7);
                if ((pred & 1) && nn == m[H4(j >> 2)]) {
                    ++count;
                }
            }
        }
        d[H4(i >> 2)] = count;
    }
}

void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t count = 0;
        if (pg[H1(i)] & 1) {
            uint64_t nn = n[i];
            for (j = 0; j <= i; ++j) {
                if ((pg[H1(j)] & 1) && nn == m[j]) {
                    ++count;
                }
            }
        }
        d[i] = count;
    }
}

/*
 * Returns the number of bytes in m0 and m1 that match n.
 * Unlike do_match2 we don't just need true/false, we need an exact count.
 * This requires two extra logical operations.
 */
static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
{
    const uint64_t mask = dup_const(MO_8, 0x7f);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(MO_8, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;

    /*
     * 1: clear msb of each byte to avoid carry to next byte (& mask)
     * 2: carry in to msb if byte != 0 (+ mask)
     * 3: set msb if cmp has msb set (| cmp)
     * 4: set ~msb to ignore them (| mask)
     * We now have 0xff for byte != 0 or 0x7f for byte == 0.
     * 5: invert, resulting in 0x80 if and only if byte == 0.
     */
    cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
    cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);

    /*
     * Combine the two compares in a way that the bits do
     * not overlap, and so preserves the count of set bits.
     * If the host has an efficient instruction for ctpop,
     * then ctpop(x) + ctpop(y) has the same number of
     * operations as ctpop(x | (y >> 1)).  If the host does
     * not have an efficient ctpop, then we only want to
     * use it once.
     */
    return ctpop64(cmp0 | (cmp1 >> 1));
}

void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t n0 = *(uint64_t *)(vn + i);
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t n1 = *(uint64_t *)(vn + i + 8);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint64_t out0 = 0;
        uint64_t out1 = 0;

        for (j = 0; j < 64; j += 8) {
            uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
            uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
            out0 |= cnt0 << j;
            out1 |= cnt1 << j;
        }

        *(uint64_t *)(vd + i) = out0;
        *(uint64_t *)(vd + i + 8) = out1;
    }
}
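
/*
 * A standalone sketch of the counting variant used by do_histseg_cnt
 * above.  count_byte_ref() is a hypothetical illustration only: it
 * counts how many bytes of 'haystack' equal 'needle' by reducing each
 * byte to either 0x80 (match) or 0x00 (no match) and popcounting the
 * result.  do_histseg_cnt does the same for two 64-bit words at once,
 * shifting one marker word right by 1 so the two sets of marker bits
 * cannot collide before the single ctpop64().
 */
static inline unsigned count_byte_ref(uint64_t haystack, uint8_t needle)
{
    const uint64_t mask = dup_const(MO_8, 0x7f);
    uint64_t x = haystack ^ dup_const(MO_8, needle);

    /* Leave 0x80 in each byte that was zero after the XOR, else 0x00. */
    x = ~(((x & mask) + mask) | x | mask);

    return ctpop64(x);
}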

void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 8 - shr;
    uint64_t mask = dup_const(MO_8, 0xff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 16 - shr;
    uint64_t mask = dup_const(MO_16, 0xffff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    int shr = simd_data(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror32(n[i] ^ m[i], shr);
    }
}
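
/*
 * A sketch of the lane-wise rotate used by sve2_xar_b and sve2_xar_h
 * above.  ror8x8_ref() is a hypothetical illustration only: it rotates
 * each byte lane of a 64-bit word right by 'shr' without letting bits
 * cross lane boundaries.  The right shift drags in stray bits from the
 * next higher lane, which 'mask' (0xff >> shr, replicated) strips off;
 * the left shift supplies the wrapped-around bits, which ~mask keeps.
 */
static inline uint64_t ror8x8_ref(uint64_t t, int shr)
{
    int shl = 8 - shr;
    uint64_t mask = dup_const(MO_8, 0xff >> shr);

    return ((t >> shr) & mask) | ((t << shl) & ~mask);
}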

void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float32 *n = vn + s * sizeof(float32) * 4;
        float32 *m = vm + s * sizeof(float32) * 4;
        float32 *a = va + s * sizeof(float32) * 4;
        float32 *d = vd + s * sizeof(float32) * 4;
        float32 n00 = n[H4(0)], n01 = n[H4(1)];
        float32 n10 = n[H4(2)], n11 = n[H4(3)];
        float32 m00 = m[H4(0)], m01 = m[H4(1)];
        float32 m10 = m[H4(2)], m11 = m[H4(3)];
        float32 p0, p1;

        /* i = 0, j = 0 */
        p0 = float32_mul(n00, m00, status);
        p1 = float32_mul(n01, m01, status);
        d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float32_mul(n00, m10, status);
        p1 = float32_mul(n01, m11, status);
        d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float32_mul(n10, m00, status);
        p1 = float32_mul(n11, m01, status);
        d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float32_mul(n10, m10, status);
        p1 = float32_mul(n11, m11, status);
        d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
    }
}

void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float64 *n = vn + s * sizeof(float64) * 4;
        float64 *m = vm + s * sizeof(float64) * 4;
        float64 *a = va + s * sizeof(float64) * 4;
        float64 *d = vd + s * sizeof(float64) * 4;
        float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
        float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
        float64 p0, p1;

        /* i = 0, j = 0 */
        p0 = float64_mul(n00, m00, status);
        p1 = float64_mul(n01, m01, status);
        d[0] = float64_add(a[0], float64_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float64_mul(n00, m10, status);
        p1 = float64_mul(n01, m11, status);
        d[1] = float64_add(a[1], float64_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float64_mul(n10, m00, status);
        p1 = float64_mul(n11, m01, status);
        d[2] = float64_add(a[2], float64_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float64_mul(n10, m10, status);
        p1 = float64_mul(n11, m11, status);
        d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
    }
}

#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{ \
    intptr_t i = simd_oprsz(desc); \
    uint64_t *g = vg; \
    do { \
        uint64_t pg = g[(i - 1) >> 6]; \
        do { \
            i -= sizeof(TYPEW); \
            if (likely((pg >> (i & 63)) & 1)) { \
                TYPEW nn = *(TYPEW *)(vn + HW(i)); \
                *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \
            } \
        } while (i & 63); \
    } while (i != 0); \
}

DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)

#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{ \
    intptr_t i = simd_oprsz(desc); \
    uint64_t *g = vg; \
    do { \
        uint64_t pg = g[(i - 1) >> 6]; \
        do { \
            i -= sizeof(TYPEW); \
            if (likely((pg >> (i & 63)) & 1)) { \
                TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \
                *(TYPEW *)(vd + HW(i)) = OP(nn, status); \
            } \
        } while (i & 63); \
    } while (i != 0); \
}

DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)

#undef DO_FCVTLT
#undef DO_FCVTNT