/*
 * ARM SVE Operations
 *
 * Copyright (c) 2018 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "internals.h"
#include "exec/exec-all.h"
#include "exec/page-protection.h"
#include "exec/helper-proto.h"
#include "exec/target_page.h"
#include "exec/tlb-flags.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "tcg/tcg.h"
#include "vec_internal.h"
#include "sve_ldst_internal.h"
#include "accel/tcg/cpu-ops.h"
#ifdef CONFIG_USER_ONLY
#include "user/page-protection.h"
#endif


/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C.  */
#define PREDTEST_INIT  1

/* This is an iterative function, called for each Pd and Pg word
 * moving forward.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G).  Replace previous.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}

/* This is an iterative function, called for each Pd and Pg word
 * moving backward.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from first (i.e last) !(D & G).
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from last (i.e first) D & G.  Replace previous.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}

/* The same for a single word predicate.  */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}

/* The same for a multi-word predicate.  */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}
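
/*
 * Editorial sketch (not part of the original helpers): a worked example of
 * the flag encoding above for one predicate word.  With two active byte
 * elements (G = 0x0101) of which only the first is true (D = 0x0001),
 * PredTest reports N set (first active element true), Z clear (some active
 * element true) and C set (last active element false).
 */
static inline void example_predtest_one_word(void)
{
    uint64_t d = 0x0001, g = 0x0101;
    uint32_t flags = iter_predtest_fwd(d, g, PREDTEST_INIT);

    bool n = flags >> 31;    /* 1: first active element is true */
    bool z = !(flags & 2);   /* 0: at least one active element is true */
    bool c = flags & 1;      /* 1: last active element is false */
    (void)n; (void)z; (void)c;
}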

/* Expand active predicate bits into a mask of single word (32-bit) elements.  */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x00000000ffffffffull,
        [0x10] = 0xffffffff00000000ull,
        [0x11] = 0xffffffffffffffffull,
    };
    return word[byte & 0x11];
}

#define LOGICAL_PPPP(NAME, FUNC)                                          \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    uintptr_t opr_sz = simd_oprsz(desc);                                  \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
    uintptr_t i;                                                          \
    for (i = 0; i < opr_sz / 8; ++i) {                                    \
        d[i] = FUNC(n[i], m[i], g[i]);                                    \
    }                                                                     \
}

#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP

/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            if (pg & 1) {                                                 \
                TYPE nn = *(TYPE *)(vn + H(i));                           \
                TYPE mm = *(TYPE *)(vm + H(i));                           \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                        \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                            \
    TYPE *d = vd, *n = vn, *m = vm;                                       \
    uint8_t *pg = vg;                                                     \
    for (i = 0; i < opr_sz; i += 1) {                                     \
        if (pg[H1(i)] & 1) {                                              \
            TYPE nn = n[i], mm = m[i];                                    \
            d[i] = OP(nn, mm);                                            \
        }                                                                 \
    }                                                                     \
}

#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)


/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1.  Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
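
/*
 * Editorial sketch: the architecturally defined results produced by the
 * special cases above, for 32-bit operands.
 */
static inline void example_div_special_cases(void)
{
    int32_t q0 = DO_SDIV((int32_t)7, (int32_t)0);     /* x / 0 == 0 */
    uint32_t q1 = DO_UDIV((uint32_t)7, (uint32_t)0);  /* likewise 0 */
    int32_t q2 = DO_SDIV((int32_t)7, (int32_t)-1);    /* -7, via the -N path */
    (void)q0; (void)q1; (void)q2;
}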

DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)

/* Because the computation type is at least twice as large as required,
   these work for both signed and unsigned source types.  */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return (n * m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return (n * m) >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    return (n * m) >> 32;
}

static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}
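
/*
 * Editorial sketch: because the products are formed in int32_t/int64_t,
 * the same helper returns the correct high half for signed and unsigned
 * byte inputs alike.
 */
static inline void example_mulh_signedness(void)
{
    uint8_t u = do_mulh_b(200, 3);          /* 600 >> 8 == 2 */
    uint8_t s = do_mulh_b((int8_t)-2, 3);   /* -6 >> 8 == -1 -> 0xff */
    (void)u; (void)s;
}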

DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)

/* Note that all bits of the shift are significant
   and not modulo the element size.  */
#define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)

DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)

DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
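
/*
 * Editorial sketch: unlike a bare C shift, an out-of-range element shift is
 * well defined here -- logical shifts produce 0 and arithmetic shifts are
 * clamped to the sign bit.
 */
static inline void example_non_modulo_shift(void)
{
    uint16_t lsr = DO_LSR((uint16_t)0x8000, 17);   /* shift >= 16 -> 0 */
    int16_t asr = DO_ASR((int16_t)-4, 100);        /* clamped to 15 -> -1 */
    (void)lsr; (void)asr;
}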

static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
{
    int8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
{
    int16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
{
    int32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)

static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
{
    uint8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    uint16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    uint32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)

#define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
#define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
#define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
#define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)

#define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
#define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
#define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
#define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)

/*
 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
 * We pass in a pointer to a dummy saturation field to trigger
 * the saturating arithmetic but discard the information about
 * whether it has occurred.
 */
#define do_sqshl_b(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
#define do_sqshl_h(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
#define do_sqshl_s(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
#define do_sqshl_d(n, m) \
    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })

DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1, do_sqshl_b)
DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
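
/*
 * Editorial sketch: the saturation still happens, only its status is
 * discarded.  100 << 2 does not fit in int8_t, so the result clamps.
 */
static inline void example_sqshl_saturates(void)
{
    int8_t r = do_sqshl_b(100, 2);    /* INT8_MAX, no QC side effect */
    (void)r;
}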

#define do_uqshl_b(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
#define do_uqshl_h(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
#define do_uqshl_s(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
#define do_uqshl_d(n, m) \
    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })

DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1, do_uqshl_b)
DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)

#define do_sqrshl_b(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
#define do_sqrshl_h(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
#define do_sqrshl_s(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
#define do_sqrshl_d(n, m) \
    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1, do_sqrshl_b)
DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)

#undef do_sqrshl_d

#define do_uqrshl_b(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
#define do_uqrshl_h(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
#define do_uqrshl_s(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
#define do_uqrshl_d(n, m) \
    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1, do_uqrshl_b)
DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)

#undef do_uqrshl_d

#define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
#define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))

DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)

DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)

#define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
#define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))

DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)

DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)

#define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
#define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))

DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)

DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)

static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
{
    return val >= max ? max : val <= min ? min : val;
}

#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
    int64_t r = n + m;
    if (((r ^ n) & ~(n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}

DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
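
/*
 * Editorial sketch: the sign test above catches the wrap of the 64-bit
 * addition and substitutes the saturated value.
 */
static inline void example_sqadd_d_saturates(void)
{
    int64_t r = do_sqadd_d(INT64_MAX, 1);    /* INT64_MAX */
    int64_t s = do_sqadd_d(INT64_MIN, -1);   /* INT64_MIN */
    (void)r; (void)s;
}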

#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)

static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
    uint64_t r = n + m;
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)

#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = n - m;
    if (((r ^ n) & (n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}

DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)

#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)

static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    return n > m ? n - m : 0;
}

DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)

#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow. */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative. */
            if (m > -n) {
                /* m > abs(n), so r is a very large positive. */
                return INT64_MAX;
            }
            /* Result is negative. */
        }
    } else {
        /* Both inputs are positive: check for overflow. */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}

DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)

#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)

static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        return n < -m ? 0 : r;
    }
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)

#undef DO_ZPZZ
#undef DO_ZPZZ_D
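
/*
 * Editorial sketch of the three branches in do_suqadd_d above: a small
 * negative signed input, a huge unsigned addend, and a positive overflow.
 */
static inline void example_suqadd_d(void)
{
    int64_t a = do_suqadd_d(-5, 3);            /* -2, no saturation */
    int64_t b = do_suqadd_d(-5, UINT64_MAX);   /* saturates to INT64_MAX */
    int64_t c = do_suqadd_d(INT64_MAX, 1);     /* saturates to INT64_MAX */
    (void)a; (void)b; (void)c;
}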

/*
 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements come from VN {I, I+1}.
 * If the slot I is odd, the elements come from VM {I-1, I}.
 * Load all of the input elements in each pair before overwriting output.
 */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            TYPE n0 = *(TYPE *)(vn + H(i));                               \
            TYPE m0 = *(TYPE *)(vm + H(i));                               \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));                \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));                \
            if (pg & 1) {                                                 \
                *(TYPE *)(vd + H(i)) = OP(n0, n1);                        \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
            if (pg & 1) {                                                 \
                *(TYPE *)(vd + H(i)) = OP(m0, m1);                        \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP)                                    \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                            \
    TYPE *d = vd, *n = vn, *m = vm;                                       \
    uint8_t *pg = vg;                                                     \
    for (i = 0; i < opr_sz; i += 2) {                                     \
        TYPE n0 = n[i], n1 = n[i + 1];                                    \
        TYPE m0 = m[i], m1 = m[i + 1];                                    \
        if (pg[H1(i)] & 1) {                                              \
            d[i] = OP(n0, n1);                                            \
        }                                                                 \
        if (pg[H1(i + 1)] & 1) {                                          \
            d[i + 1] = OP(m0, m1);                                        \
        }                                                                 \
    }                                                                     \
}

DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)

#undef DO_ZPZZ_PAIR
#undef DO_ZPZZ_PAIR_D
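
/*
 * Editorial sketch of the pairing for ADDP with 64-bit elements and an
 * all-true predicate: even result slots reduce a pair from Zn, odd slots
 * the corresponding pair from Zm, which is why all inputs are loaded
 * before the (possibly aliasing) destination is written.
 */
static inline void example_addp_pairing(void)
{
    uint64_t n[2] = { 1, 2 }, m[2] = { 10, 20 }, d[2];

    d[0] = DO_ADD(n[0], n[1]);    /* 3 */
    d[1] = DO_ADD(m[0], m[1]);    /* 30 */
    (void)d;
}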

#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,                 \
                  float_status *status, uint32_t desc)                    \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            TYPE n0 = *(TYPE *)(vn + H(i));                               \
            TYPE m0 = *(TYPE *)(vm + H(i));                               \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));                \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));                \
            if (pg & 1) {                                                 \
                *(TYPE *)(vd + H(i)) = OP(n0, n1, status);                \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
            if (pg & 1) {                                                 \
                *(TYPE *)(vd + H(i)) = OP(m0, m1, status);                \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
}

DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)

DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)

DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)

DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)

DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)

#undef DO_ZPZZ_PAIR_FP

/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                                 \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                       \
        TYPEW mm = *(TYPEW *)(vm + i);                                    \
        do {                                                              \
            if (pg & 1) {                                                 \
                TYPE nn = *(TYPE *)(vn + H(i));                           \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                        \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 7);                                                  \
    }                                                                     \
}

DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW
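
/*
 * Editorial sketch: for the "wide" forms, one 64-bit shift count from Zm
 * applies to every narrow element in the corresponding 64-bit chunk of Zn.
 */
static inline void example_wide_shift_count(void)
{
    uint8_t n[8] = { 0x80, 0x40, 0x20, 0x10, 8, 4, 2, 1 };
    uint64_t mm = 3;
    uint8_t d[8];

    for (int i = 0; i < 8; i++) {
        d[i] = DO_LSR(n[i], mm);    /* every byte shifted by the same count */
    }
    (void)d;
}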

/* Fully general two-operand expander, controlled by a predicate.
 */
#define DO_ZPZ(NAME, TYPE, H, OP)                                 \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)    \
{                                                                 \
    intptr_t i, opr_sz = simd_oprsz(desc);                        \
    for (i = 0; i < opr_sz; ) {                                   \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));           \
        do {                                                      \
            if (pg & 1) {                                         \
                TYPE nn = *(TYPE *)(vn + H(i));                   \
                *(TYPE *)(vd + H(i)) = OP(nn);                    \
            }                                                     \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);               \
        } while (i & 15);                                         \
    }                                                             \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZ_D(NAME, TYPE, OP)                                  \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)    \
{                                                                 \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                    \
    TYPE *d = vd, *n = vn;                                        \
    uint8_t *pg = vg;                                             \
    for (i = 0; i < opr_sz; i += 1) {                             \
        if (pg[H1(i)] & 1) {                                      \
            TYPE nn = n[i];                                       \
            d[i] = OP(nn);                                        \
        }                                                         \
    }                                                             \
}

#define DO_CLS_B(N)   (clrsb32(N) - 24)
#define DO_CLS_H(N)   (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

#define DO_CLZ_B(N)   (clz32(N) - 24)
#define DO_CLZ_H(N)   (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

#define DO_CNOT(N)    (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

#define DO_AH_FABS_H(N)   (float16_is_any_nan(N) ? (N) : DO_FABS(N))
#define DO_AH_FABS_S(N)   (float32_is_any_nan(N) ? (N) : DO_FABS(N))
#define DO_AH_FABS_D(N)   (float64_is_any_nan(N) ? (N) : DO_FABS(N))

DO_ZPZ(sve_ah_fabs_h, uint16_t, H1_2, DO_AH_FABS_H)
DO_ZPZ(sve_ah_fabs_s, uint32_t, H1_4, DO_AH_FABS_S)
DO_ZPZ_D(sve_ah_fabs_d, uint64_t, DO_AH_FABS_D)

#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

#define DO_AH_FNEG_H(N)   (float16_is_any_nan(N) ? (N) : DO_FNEG(N))
#define DO_AH_FNEG_S(N)   (float32_is_any_nan(N) ? (N) : DO_FNEG(N))
#define DO_AH_FNEG_D(N)   (float64_is_any_nan(N) ? (N) : DO_FNEG(N))

DO_ZPZ(sve_ah_fneg_h, uint16_t, H1_2, DO_AH_FNEG_H)
DO_ZPZ(sve_ah_fneg_s, uint32_t, H1_4, DO_AH_FNEG_S)
DO_ZPZ_D(sve_ah_fneg_d, uint64_t, DO_AH_FNEG_D)

#define DO_NOT(N)    (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

#define DO_SXTB(N)    ((int8_t)N)
#define DO_SXTH(N)    ((int16_t)N)
#define DO_SXTS(N)    ((int32_t)N)
#define DO_UXTB(N)    ((uint8_t)N)
#define DO_UXTH(N)    ((uint16_t)N)
#define DO_UXTS(N)    ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

#define DO_ABS(N)    (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N)    (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)

void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 2) {
        if (pg[H1(i)] & 1) {
            uint64_t n0 = n[i + 0];
            uint64_t n1 = n[i + 1];
            d[i + 0] = n1;
            d[i + 1] = n0;
        }
    }
}

DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)

#define DO_SQABS(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)

#define DO_SQNEG(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)

DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)

/* Three-operand expander, unpredicated, in which the third operand is "wide".
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                          \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)    \
{                                                                 \
    intptr_t i, opr_sz = simd_oprsz(desc);                        \
    for (i = 0; i < opr_sz; ) {                                   \
        TYPEW mm = *(TYPEW *)(vm + i);                            \
        do {                                                      \
            TYPE nn = *(TYPE *)(vn + H(i));                       \
            *(TYPE *)(vd + H(i)) = OP(nn, mm);                    \
            i += sizeof(TYPE);                                    \
        } while (i & 7);                                          \
    }                                                             \
}

DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D

/*
 * Three-operand expander, unpredicated, in which the two inputs are
 * selected from the top or bottom half of the wide column.
 */
#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP)                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);       \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN);   \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                         \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                         \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                         \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                              \
    }                                                                     \
}

DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

/* Note that the multiply cannot overflow, but the doubling can.  */
static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
{
    int16_t val = n * m;
    return DO_SQADD_H(val, val);
}

static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
{
    int32_t val = n * m;
    return DO_SQADD_S(val, val);
}

static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
{
    int64_t val = n * m;
    return do_sqadd_d(val, val);
}

DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

#undef DO_ZZZ_TB

#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP)                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);       \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                         \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                                \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                         \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                              \
    }                                                                     \
}

DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

#undef DO_ZZZ_WTB

#define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
{                                                                           \
    intptr_t i, opr_sz = simd_oprsz(desc);                                  \
    intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE);     \
    intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
    for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                        \
        TYPE nn = *(TYPE *)(vn + H(i + sel1));                              \
        TYPE mm = *(TYPE *)(vm + H(i + sel2));                              \
        *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                           \
    }                                                                       \
}

DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)

#undef DO_ZZZ_NTB

#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);                      \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                         \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                         \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));                         \
        TYPEW aa = *(TYPEW *)(va + HW(i));                                \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;                         \
    }                                                                     \
}

DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

#define DO_NMUL(N, M)  -(N * M)

DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)

DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)

#undef DO_ZZZW_ACC

#define DO_XTNB(NAME, TYPE, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
{                                                                 \
    intptr_t i, opr_sz = simd_oprsz(desc);                        \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                  \
        TYPE nn = *(TYPE *)(vn + i);                              \
        nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);       \
        *(TYPE *)(vd + i) = nn;                                   \
    }                                                             \
}

#define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                   \
{                                                                      \
    intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN));     \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                       \
        TYPE nn = *(TYPE *)(vn + i);                                   \
        *(TYPEN *)(vd + i + odd) = OP(nn);                             \
    }                                                                  \
}

#define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
#define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
#define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)

DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)

DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)

#define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
#define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
#define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)

DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)

DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)

DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)

DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)

#undef DO_XTNB
#undef DO_XTNT

void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
    uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t *a = va, *n = vn;
    uint64_t *d = vd, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint32_t e1 = a[2 * i + H4(0)];
        uint32_t e2 = n[2 * i + sel] ^ inv;
        uint64_t c = extract64(m[i], 32, 1);
        /* Compute and store the entire 33-bit result at once. */
        d[i] = c + e1 + e2;
    }
}
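
/*
 * Editorial sketch of one 64-bit lane of the add-with-carry long helper
 * above: the carry-in is bit 32 of the third source lane, and bit 32 of
 * the 33-bit sum becomes the carry-out for the next ADCLB/ADCLT.
 */
static inline void example_adcl_lane(void)
{
    uint32_t e1 = 0xffffffffu, e2 = 1;
    uint64_t carry_in = 1;
    uint64_t lane = carry_in + e1 + e2;            /* 0x100000001 */
    uint64_t carry_out = extract64(lane, 32, 1);   /* 1 */
    (void)carry_out;
}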

void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t *d = vd, *a = va, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        Int128 e1 = int128_make64(a[i]);
        Int128 e2 = int128_make64(n[i + sel] ^ inv);
        Int128 c = int128_make64(m[i + 1] & 1);
        Int128 r = int128_add(int128_add(e1, e2), c);
        d[i + 0] = int128_getlo(r);
        d[i + 1] = int128_gethi(r);
    }
}

#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP)           \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);       \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN);   \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                         \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                         \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                         \
        TYPEW aa = *(TYPEW *)(va + HW(i));                                \
        *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));             \
    }                                                                     \
}

DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQADD_H)
DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQADD_S)
DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqadd_d)

DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQSUB_H)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQSUB_S)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqsub_d)

#undef DO_SQDMLAL

#define DO_CMLA_FUNC(NAME, TYPE, H, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);                 \
    int rot = simd_data(desc);                                            \
    int sel_a = rot & 1, sel_b = sel_a ^ 1;                               \
    bool sub_r = rot == 1 || rot == 2;                                    \
    bool sub_i = rot >= 2;                                                \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                              \
    for (i = 0; i < opr_sz; i += 2) {                                     \
        TYPE elt1_a = n[H(i + sel_a)];                                    \
        TYPE elt2_a = m[H(i + sel_a)];                                    \
        TYPE elt2_b = m[H(i + sel_b)];                                    \
        d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r);                     \
        d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i);             \
    }                                                                     \
}

#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))

DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
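
/*
 * Editorial sketch of the rotation decode above for rot == 1 (90 degrees):
 * sel_a picks the imaginary input element and only the real accumulator
 * subtracts, giving  d_r = a_r - n_i * m_i  and  d_i = a_i + n_i * m_r.
 */
static inline void example_cmla_rot90(void)
{
    int a_r = 10, a_i = 20, n_i = 3, m_r = 4, m_i = 5;
    int d_r = DO_CMLA(n_i, m_i, a_r, true);     /* 10 - 15 == -5 */
    int d_i = DO_CMLA(n_i, m_r, a_i, false);    /* 20 + 12 == 32 */
    (void)d_r; (void)d_i;
}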

#define DO_SQRDMLAH_B(N, M, A, S) \
    do_sqrdmlah_b(N, M, A, S, true)
#define DO_SQRDMLAH_H(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A, S) \
    do_sqrdmlah_d(N, M, A, S, true)

DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)

#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP)                                 \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
{                                                                           \
    intptr_t i, j, oprsz = simd_oprsz(desc);                                \
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                          \
    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;                  \
    int sel_a = rot & 1, sel_b = sel_a ^ 1;                                 \
    bool sub_r = rot == 1 || rot == 2;                                      \
    bool sub_i = rot >= 2;                                                  \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
    for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {         \
        TYPE elt2_a = m[H(i + idx + sel_a)];                                \
        TYPE elt2_b = m[H(i + idx + sel_b)];                                \
        for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                        \
            TYPE elt1_a = n[H(i + j + sel_a)];                              \
            d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);          \
            d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i);  \
        }                                                                   \
    }                                                                       \
}

DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)

DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)

#undef DO_CMLA
#undef DO_CMLA_FUNC
#undef DO_CMLA_IDX_FUNC
#undef DO_SQRDMLAH_B
#undef DO_SQRDMLAH_H
#undef DO_SQRDMLAH_S
#undef DO_SQRDMLAH_D

/* Note N and M are 4 elements bundled into one unit. */
static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
                         int sel_a, int sel_b, int sub_i)
{
    for (int i = 0; i <= 1; i++) {
        int32_t elt1_r = (int8_t)(n >> (16 * i));
        int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
        int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
        int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));

        a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
    }
    return a;
}

static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
                         int sel_a, int sel_b, int sub_i)
{
    for (int i = 0; i <= 1; i++) {
        int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
        int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
        int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
        int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));

        a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
    }
    return a;
}
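
/*
 * Editorial sketch: with rot == 0 (sel_a == 0, sel_b == 1, sub_i == -1) the
 * helper accumulates the real parts of two int8 complex products.  For
 * (1 + 2i, 3 + 4i) dotted with (5 + 6i, 7 + 8i):
 * (1*5 - 2*6) + (3*7 - 4*8) == -18.
 */
static inline void example_cdot_rot0(void)
{
    int32_t r = do_cdot_s(0x04030201, 0x08070605, 0, 0, 1, -1);
    (void)r;    /* r == -18 */
}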

void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
                              void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = simd_data(desc);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint32_t *d = vd, *n = vn, *m = vm, *a = va;

    for (int e = 0; e < opr_sz / 4; e++) {
        d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
    }
}

void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
                              void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = simd_data(desc);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (int e = 0; e < opr_sz / 8; e++) {
        d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
    }
}

void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
    int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint32_t *d = vd, *n = vn, *m = vm, *a = va;

    for (int seg = 0; seg < opr_sz / 4; seg += 4) {
        uint32_t seg_m = m[seg + idx];
        for (int e = 0; e < 4; e++) {
            d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
                                   sel_a, sel_b, sub_i);
        }
    }
}

void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    int seg, opr_sz = simd_oprsz(desc);
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (seg = 0; seg < opr_sz / 8; seg += 2) {
        uint64_t seg_m = m[seg + idx];
        for (int e = 0; e < 2; e++) {
            d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
                                   sel_a, sel_b, sub_i);
        }
    }
}

#define DO_ZZXZ(NAME, TYPE, H, OP)                                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);       \
    intptr_t i, j, idx = simd_data(desc);                                 \
    TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx);             \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                 \
        TYPE mm = m[i];                                                   \
        for (j = 0; j < segment; j++) {                                   \
            d[i + j] = OP(n[i + j], mm, a[i + j]);                        \
        }                                                                 \
    }                                                                     \
}

#define DO_SQRDMLAH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)

DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)

#define DO_SQRDMLSH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)

DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)

#undef DO_ZZXZ

#define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
{                                                                           \
    intptr_t i, j, oprsz = simd_oprsz(desc);                                \
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
    for (i = 0; i < oprsz; i += 16) {                                       \
        TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                            \
        for (j = 0; j < 16; j += sizeof(TYPEW)) {                           \
            TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                    \
            TYPEW aa = *(TYPEW *)(va + HW(i + j));                          \
            *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa);                    \
        }                                                                   \
    }                                                                       \
}

#define DO_MLA(N, M, A)  (A + N * M)

DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)

#define DO_MLS(N, M, A)  (A - N * M)

DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)

#define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)

#define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZZXW

#define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
{                                                                           \
    intptr_t i, j, oprsz = simd_oprsz(desc);                                \
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
    for (i = 0; i < oprsz; i += 16) {                                       \
        TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                            \
        for (j = 0; j < 16; j += sizeof(TYPEW)) {                           \
            TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                    \
            *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm);                        \
        }                                                                   \
    }                                                                       \
}

DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

#undef DO_ZZX

#define DO_BITPERM(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)    \
{                                                                 \
    intptr_t i, opr_sz = simd_oprsz(desc);                        \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                  \
        TYPE nn = *(TYPE *)(vn + i);                              \
        TYPE mm = *(TYPE *)(vm + i);                              \
        *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);         \
    }                                                             \
}

static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
{
    uint64_t res = 0;
    int db, rb = 0;

    for (db = 0; db < n; ++db) {
        if ((mask >> db) & 1) {
            res |= ((data >> db) & 1) << rb;
            ++rb;
        }
    }
    return res;
}
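
/*
 * Editorial sketch of BEXT on one element: the data bits selected by the
 * mask are packed into the low result bits.  With mask 0b001110 selecting
 * bits 1..3 of 0b101100, the extracted value is 0b110.
 */
static inline void example_bext(void)
{
    uint64_t r = bitextract(0x2c, 0x0e, 8);    /* 0x06 */
    (void)r;
}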
<< rb; 1626 ++rb; 1627 } 1628 } 1629 return res; 1630 } 1631 1632 DO_BITPERM(sve2_bext_b, uint8_t, bitextract) 1633 DO_BITPERM(sve2_bext_h, uint16_t, bitextract) 1634 DO_BITPERM(sve2_bext_s, uint32_t, bitextract) 1635 DO_BITPERM(sve2_bext_d, uint64_t, bitextract) 1636 1637 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n) 1638 { 1639 uint64_t res = 0; 1640 int rb, db = 0; 1641 1642 for (rb = 0; rb < n; ++rb) { 1643 if ((mask >> rb) & 1) { 1644 res |= ((data >> db) & 1) << rb; 1645 ++db; 1646 } 1647 } 1648 return res; 1649 } 1650 1651 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit) 1652 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit) 1653 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit) 1654 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit) 1655 1656 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n) 1657 { 1658 uint64_t resm = 0, resu = 0; 1659 int db, rbm = 0, rbu = 0; 1660 1661 for (db = 0; db < n; ++db) { 1662 uint64_t val = (data >> db) & 1; 1663 if ((mask >> db) & 1) { 1664 resm |= val << rbm++; 1665 } else { 1666 resu |= val << rbu++; 1667 } 1668 } 1669 1670 return resm | (resu << rbm); 1671 } 1672 1673 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup) 1674 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup) 1675 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup) 1676 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup) 1677 1678 #undef DO_BITPERM 1679 1680 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \ 1681 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1682 { \ 1683 intptr_t i, opr_sz = simd_oprsz(desc); \ 1684 int sub_r = simd_data(desc); \ 1685 if (sub_r) { \ 1686 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1687 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1688 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1689 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1690 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1691 acc_r = ADD_OP(acc_r, el2_i); \ 1692 acc_i = SUB_OP(acc_i, el2_r); \ 1693 *(TYPE *)(vd + H(i)) = acc_r; \ 1694 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1695 } \ 1696 } else { \ 1697 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1698 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1699 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1700 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1701 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1702 acc_r = SUB_OP(acc_r, el2_i); \ 1703 acc_i = ADD_OP(acc_i, el2_r); \ 1704 *(TYPE *)(vd + H(i)) = acc_r; \ 1705 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1706 } \ 1707 } \ 1708 } 1709 1710 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB) 1711 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB) 1712 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB) 1713 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB) 1714 1715 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B) 1716 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H) 1717 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S) 1718 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d) 1719 1720 #undef DO_CADD 1721 1722 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \ 1723 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1724 { \ 1725 intptr_t i, opr_sz = simd_oprsz(desc); \ 1726 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \ 1727 int shift = simd_data(desc) >> 1; \ 1728 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1729 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \ 1730 *(TYPEW *)(vd + HW(i)) = nn << shift; \ 1731 } \ 1732 } 1733 1734 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1) 1735 DO_ZZI_SHLL(sve2_sshll_s, int32_t, 
int16_t, H1_4, H1_2) 1736 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4) 1737 1738 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1) 1739 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2) 1740 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4) 1741 1742 #undef DO_ZZI_SHLL 1743 1744 /* Two-operand reduction expander, controlled by a predicate. 1745 * The difference between TYPERED and TYPERET has to do with 1746 * sign-extension. E.g. for SMAX, TYPERED must be signed, 1747 * but TYPERET must be unsigned so that e.g. a 32-bit value 1748 * is not sign-extended to the ABI uint64_t return type. 1749 */ 1750 /* ??? If we were to vectorize this by hand the reduction ordering 1751 * would change. For integer operands, this is perfectly fine. 1752 */ 1753 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \ 1754 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1755 { \ 1756 intptr_t i, opr_sz = simd_oprsz(desc); \ 1757 TYPERED ret = INIT; \ 1758 for (i = 0; i < opr_sz; ) { \ 1759 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 1760 do { \ 1761 if (pg & 1) { \ 1762 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \ 1763 ret = OP(ret, nn); \ 1764 } \ 1765 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \ 1766 } while (i & 15); \ 1767 } \ 1768 return (TYPERET)ret; \ 1769 } 1770 1771 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \ 1772 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1773 { \ 1774 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 1775 TYPEE *n = vn; \ 1776 uint8_t *pg = vg; \ 1777 TYPER ret = INIT; \ 1778 for (i = 0; i < opr_sz; i += 1) { \ 1779 if (pg[H1(i)] & 1) { \ 1780 TYPEE nn = n[i]; \ 1781 ret = OP(ret, nn); \ 1782 } \ 1783 } \ 1784 return ret; \ 1785 } 1786 1787 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR) 1788 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR) 1789 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR) 1790 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR) 1791 1792 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR) 1793 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR) 1794 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR) 1795 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR) 1796 1797 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND) 1798 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND) 1799 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND) 1800 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND) 1801 1802 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1803 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1804 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1805 1806 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1807 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1808 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1809 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD) 1810 1811 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX) 1812 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX) 1813 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX) 1814 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX) 1815 1816 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX) 1817 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX) 1818 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX) 
1819 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX) 1820 1821 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN) 1822 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN) 1823 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN) 1824 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN) 1825 1826 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN) 1827 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN) 1828 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN) 1829 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) 1830 1831 #undef DO_VPZ 1832 #undef DO_VPZ_D 1833 1834 /* Two vector operand, one scalar operand, unpredicated. */ 1835 #define DO_ZZI(NAME, TYPE, OP) \ 1836 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ 1837 { \ 1838 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1839 TYPE s = s64, *d = vd, *n = vn; \ 1840 for (i = 0; i < opr_sz; ++i) { \ 1841 d[i] = OP(n[i], s); \ 1842 } \ 1843 } 1844 1845 #define DO_SUBR(X, Y) (Y - X) 1846 1847 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) 1848 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) 1849 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) 1850 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) 1851 1852 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) 1853 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) 1854 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) 1855 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) 1856 1857 DO_ZZI(sve_smini_b, int8_t, DO_MIN) 1858 DO_ZZI(sve_smini_h, int16_t, DO_MIN) 1859 DO_ZZI(sve_smini_s, int32_t, DO_MIN) 1860 DO_ZZI(sve_smini_d, int64_t, DO_MIN) 1861 1862 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) 1863 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) 1864 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) 1865 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) 1866 1867 DO_ZZI(sve_umini_b, uint8_t, DO_MIN) 1868 DO_ZZI(sve_umini_h, uint16_t, DO_MIN) 1869 DO_ZZI(sve_umini_s, uint32_t, DO_MIN) 1870 DO_ZZI(sve_umini_d, uint64_t, DO_MIN) 1871 1872 #undef DO_ZZI 1873 1874 #undef DO_AND 1875 #undef DO_ORR 1876 #undef DO_EOR 1877 #undef DO_BIC 1878 #undef DO_ADD 1879 #undef DO_SUB 1880 #undef DO_MAX 1881 #undef DO_MIN 1882 #undef DO_ABD 1883 #undef DO_MUL 1884 #undef DO_DIV 1885 #undef DO_ASR 1886 #undef DO_LSR 1887 #undef DO_LSL 1888 #undef DO_SUBR 1889 1890 /* Similar to the ARM LastActiveElement pseudocode function, except the 1891 result is multiplied by the element size. This includes the not found 1892 indication; e.g. not found for esz=3 is -8. */ 1893 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) 1894 { 1895 uint64_t mask = pred_esz_masks[esz]; 1896 intptr_t i = words; 1897 1898 do { 1899 uint64_t this_g = g[--i] & mask; 1900 if (this_g) { 1901 return i * 64 + (63 - clz64(this_g)); 1902 } 1903 } while (i > 0); 1904 return (intptr_t)-1 << esz; 1905 } 1906 1907 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc) 1908 { 1909 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1910 uint32_t flags = PREDTEST_INIT; 1911 uint64_t *d = vd, *g = vg; 1912 intptr_t i = 0; 1913 1914 do { 1915 uint64_t this_d = d[i]; 1916 uint64_t this_g = g[i]; 1917 1918 if (this_g) { 1919 if (!(flags & 4)) { 1920 /* Set in D the first bit of G. 
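 * (As a reminder, g & -g isolates the lowest set bit of g, e.g.
 * 0b0110 & -0b0110 == 0b0010, so PFIRST forces only the first active
 * element of D to 1 and leaves the other elements of D unchanged.)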
*/ 1921 this_d |= this_g & -this_g; 1922 d[i] = this_d; 1923 } 1924 flags = iter_predtest_fwd(this_d, this_g, flags); 1925 } 1926 } while (++i < words); 1927 1928 return flags; 1929 } 1930 1931 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc) 1932 { 1933 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1934 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 1935 uint32_t flags = PREDTEST_INIT; 1936 uint64_t *d = vd, *g = vg, esz_mask; 1937 intptr_t i, next; 1938 1939 next = last_active_element(vd, words, esz) + (1 << esz); 1940 esz_mask = pred_esz_masks[esz]; 1941 1942 /* Similar to the pseudocode for pnext, but scaled by ESZ 1943 so that we find the correct bit. */ 1944 if (next < words * 64) { 1945 uint64_t mask = -1; 1946 1947 if (next & 63) { 1948 mask = ~((1ull << (next & 63)) - 1); 1949 next &= -64; 1950 } 1951 do { 1952 uint64_t this_g = g[next / 64] & esz_mask & mask; 1953 if (this_g != 0) { 1954 next = (next & -64) + ctz64(this_g); 1955 break; 1956 } 1957 next += 64; 1958 mask = -1; 1959 } while (next < words * 64); 1960 } 1961 1962 i = 0; 1963 do { 1964 uint64_t this_d = 0; 1965 if (i == next / 64) { 1966 this_d = 1ull << (next & 63); 1967 } 1968 d[i] = this_d; 1969 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags); 1970 } while (++i < words); 1971 1972 return flags; 1973 } 1974 1975 /* 1976 * Copy Zn into Zd, and store zero into inactive elements. 1977 * If inv, store zeros into the active elements. 1978 */ 1979 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc) 1980 { 1981 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1982 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1983 uint64_t *d = vd, *n = vn; 1984 uint8_t *pg = vg; 1985 1986 for (i = 0; i < opr_sz; i += 1) { 1987 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv); 1988 } 1989 } 1990 1991 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc) 1992 { 1993 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1994 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1995 uint64_t *d = vd, *n = vn; 1996 uint8_t *pg = vg; 1997 1998 for (i = 0; i < opr_sz; i += 1) { 1999 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv); 2000 } 2001 } 2002 2003 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc) 2004 { 2005 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2006 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 2007 uint64_t *d = vd, *n = vn; 2008 uint8_t *pg = vg; 2009 2010 for (i = 0; i < opr_sz; i += 1) { 2011 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv); 2012 } 2013 } 2014 2015 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc) 2016 { 2017 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2018 uint64_t *d = vd, *n = vn; 2019 uint8_t *pg = vg; 2020 uint8_t inv = simd_data(desc); 2021 2022 for (i = 0; i < opr_sz; i += 1) { 2023 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1); 2024 } 2025 } 2026 2027 /* Three-operand expander, immediate operand, controlled by a predicate. 2028 */ 2029 #define DO_ZPZI(NAME, TYPE, H, OP) \ 2030 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2031 { \ 2032 intptr_t i, opr_sz = simd_oprsz(desc); \ 2033 TYPE imm = simd_data(desc); \ 2034 for (i = 0; i < opr_sz; ) { \ 2035 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2036 do { \ 2037 if (pg & 1) { \ 2038 TYPE nn = *(TYPE *)(vn + H(i)); \ 2039 *(TYPE *)(vd + H(i)) = OP(nn, imm); \ 2040 } \ 2041 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2042 } while (i & 15); \ 2043 } \ 2044 } 2045 2046 /* Similarly, specialized for 64-bit operands. 
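 * With 64-bit elements each predicate byte covers exactly one element and
 * only its bit 0 is significant, so the simple per-lane test
 * pg[H1(i)] & 1 below replaces the 16-byte predicate-word walk used above.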
*/ 2047 #define DO_ZPZI_D(NAME, TYPE, OP) \ 2048 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2049 { \ 2050 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2051 TYPE *d = vd, *n = vn; \ 2052 TYPE imm = simd_data(desc); \ 2053 uint8_t *pg = vg; \ 2054 for (i = 0; i < opr_sz; i += 1) { \ 2055 if (pg[H1(i)] & 1) { \ 2056 TYPE nn = n[i]; \ 2057 d[i] = OP(nn, imm); \ 2058 } \ 2059 } \ 2060 } 2061 2062 #define DO_SHR(N, M) (N >> M) 2063 #define DO_SHL(N, M) (N << M) 2064 2065 /* Arithmetic shift right for division. This rounds negative numbers 2066 toward zero as per signed division. Therefore before shifting, 2067 when N is negative, add 2**M-1. */ 2068 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M) 2069 2070 static inline uint64_t do_urshr(uint64_t x, unsigned sh) 2071 { 2072 if (likely(sh < 64)) { 2073 return (x >> sh) + ((x >> (sh - 1)) & 1); 2074 } else if (sh == 64) { 2075 return x >> 63; 2076 } else { 2077 return 0; 2078 } 2079 } 2080 2081 static inline int64_t do_srshr(int64_t x, unsigned sh) 2082 { 2083 if (likely(sh < 64)) { 2084 return (x >> sh) + ((x >> (sh - 1)) & 1); 2085 } else { 2086 /* Rounding the sign bit always produces 0. */ 2087 return 0; 2088 } 2089 } 2090 2091 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) 2092 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) 2093 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) 2094 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR) 2095 2096 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR) 2097 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR) 2098 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR) 2099 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR) 2100 2101 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL) 2102 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL) 2103 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL) 2104 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL) 2105 2106 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD) 2107 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD) 2108 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD) 2109 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD) 2110 2111 /* SVE2 bitwise shift by immediate */ 2112 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b) 2113 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h) 2114 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s) 2115 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d) 2116 2117 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b) 2118 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h) 2119 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s) 2120 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d) 2121 2122 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr) 2123 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr) 2124 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr) 2125 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr) 2126 2127 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr) 2128 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr) 2129 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr) 2130 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr) 2131 2132 #define do_suqrshl_b(n, m) \ 2133 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 2134 #define do_suqrshl_h(n, m) \ 2135 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 2136 #define do_suqrshl_s(n, m) \ 2137 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); }) 2138 #define do_suqrshl_d(n, m) \ 2139 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); }) 2140 2141 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b) 2142 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h) 2143 
DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s) 2144 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d) 2145 2146 #undef DO_ASRD 2147 #undef DO_ZPZI 2148 #undef DO_ZPZI_D 2149 2150 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \ 2151 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2152 { \ 2153 intptr_t i, opr_sz = simd_oprsz(desc); \ 2154 int shift = simd_data(desc); \ 2155 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2156 TYPEW nn = *(TYPEW *)(vn + i); \ 2157 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \ 2158 } \ 2159 } 2160 2161 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 2162 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2163 { \ 2164 intptr_t i, opr_sz = simd_oprsz(desc); \ 2165 int shift = simd_data(desc); \ 2166 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2167 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2168 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \ 2169 } \ 2170 } 2171 2172 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR) 2173 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR) 2174 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR) 2175 2176 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR) 2177 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR) 2178 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR) 2179 2180 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr) 2181 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr) 2182 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr) 2183 2184 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) 2185 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) 2186 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) 2187 2188 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX) 2189 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX) 2190 #define DO_SQSHRUN_D(x, sh) \ 2191 do_sat_bhs((int64_t)(x) >> (sh < 64 ? 
sh : 63), 0, UINT32_MAX) 2192 2193 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) 2194 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) 2195 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D) 2196 2197 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) 2198 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) 2199 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) 2200 2201 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX) 2202 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX) 2203 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX) 2204 2205 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) 2206 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) 2207 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D) 2208 2209 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) 2210 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) 2211 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) 2212 2213 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX) 2214 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX) 2215 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX) 2216 2217 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) 2218 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) 2219 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D) 2220 2221 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) 2222 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) 2223 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) 2224 2225 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX) 2226 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX) 2227 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX) 2228 2229 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) 2230 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) 2231 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D) 2232 2233 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H) 2234 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S) 2235 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D) 2236 2237 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX) 2238 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX) 2239 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX) 2240 2241 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H) 2242 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S) 2243 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D) 2244 2245 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H) 2246 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S) 2247 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D) 2248 2249 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX) 2250 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX) 2251 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX) 2252 2253 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H) 2254 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S) 2255 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D) 2256 2257 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H) 2258 DO_SHRNT(sve2_uqrshrnt_s, 
uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S) 2259 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D) 2260 2261 #undef DO_SHRNB 2262 #undef DO_SHRNT 2263 2264 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \ 2265 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2266 { \ 2267 intptr_t i, opr_sz = simd_oprsz(desc); \ 2268 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2269 TYPEW nn = *(TYPEW *)(vn + i); \ 2270 TYPEW mm = *(TYPEW *)(vm + i); \ 2271 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \ 2272 } \ 2273 } 2274 2275 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \ 2276 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2277 { \ 2278 intptr_t i, opr_sz = simd_oprsz(desc); \ 2279 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2280 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2281 TYPEW mm = *(TYPEW *)(vm + HW(i)); \ 2282 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \ 2283 } \ 2284 } 2285 2286 #define DO_ADDHN(N, M, SH) ((N + M) >> SH) 2287 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH) 2288 #define DO_SUBHN(N, M, SH) ((N - M) >> SH) 2289 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH) 2290 2291 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN) 2292 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN) 2293 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN) 2294 2295 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN) 2296 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN) 2297 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN) 2298 2299 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN) 2300 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN) 2301 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN) 2302 2303 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN) 2304 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN) 2305 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN) 2306 2307 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN) 2308 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN) 2309 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN) 2310 2311 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN) 2312 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN) 2313 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN) 2314 2315 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN) 2316 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN) 2317 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN) 2318 2319 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN) 2320 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN) 2321 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN) 2322 2323 #undef DO_RSUBHN 2324 #undef DO_SUBHN 2325 #undef DO_RADDHN 2326 #undef DO_ADDHN 2327 2328 #undef DO_BINOPNB 2329 2330 /* Fully general four-operand expander, controlled by a predicate. 
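 * As in DO_ZPZZ above, the governing predicate supplies one bit per vector
 * byte: the uint16_t load from vg covers a 16-byte chunk, and pg is shifted
 * right by sizeof(TYPE) after each element so that bit 0 always tests the
 * current lane (for 32-bit elements only bits 0, 4, 8 and 12 are consulted).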
2331 */ 2332 #define DO_ZPZZZ(NAME, TYPE, H, OP) \ 2333 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2334 void *vg, uint32_t desc) \ 2335 { \ 2336 intptr_t i, opr_sz = simd_oprsz(desc); \ 2337 for (i = 0; i < opr_sz; ) { \ 2338 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2339 do { \ 2340 if (pg & 1) { \ 2341 TYPE nn = *(TYPE *)(vn + H(i)); \ 2342 TYPE mm = *(TYPE *)(vm + H(i)); \ 2343 TYPE aa = *(TYPE *)(va + H(i)); \ 2344 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \ 2345 } \ 2346 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2347 } while (i & 15); \ 2348 } \ 2349 } 2350 2351 /* Similarly, specialized for 64-bit operands. */ 2352 #define DO_ZPZZZ_D(NAME, TYPE, OP) \ 2353 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2354 void *vg, uint32_t desc) \ 2355 { \ 2356 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2357 TYPE *d = vd, *a = va, *n = vn, *m = vm; \ 2358 uint8_t *pg = vg; \ 2359 for (i = 0; i < opr_sz; i += 1) { \ 2360 if (pg[H1(i)] & 1) { \ 2361 TYPE aa = a[i], nn = n[i], mm = m[i]; \ 2362 d[i] = OP(aa, nn, mm); \ 2363 } \ 2364 } \ 2365 } 2366 2367 #define DO_MLA(A, N, M) (A + N * M) 2368 #define DO_MLS(A, N, M) (A - N * M) 2369 2370 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) 2371 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS) 2372 2373 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA) 2374 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS) 2375 2376 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA) 2377 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS) 2378 2379 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA) 2380 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS) 2381 2382 #undef DO_MLA 2383 #undef DO_MLS 2384 #undef DO_ZPZZZ 2385 #undef DO_ZPZZZ_D 2386 2387 void HELPER(sve_index_b)(void *vd, uint32_t start, 2388 uint32_t incr, uint32_t desc) 2389 { 2390 intptr_t i, opr_sz = simd_oprsz(desc); 2391 uint8_t *d = vd; 2392 for (i = 0; i < opr_sz; i += 1) { 2393 d[H1(i)] = start + i * incr; 2394 } 2395 } 2396 2397 void HELPER(sve_index_h)(void *vd, uint32_t start, 2398 uint32_t incr, uint32_t desc) 2399 { 2400 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2401 uint16_t *d = vd; 2402 for (i = 0; i < opr_sz; i += 1) { 2403 d[H2(i)] = start + i * incr; 2404 } 2405 } 2406 2407 void HELPER(sve_index_s)(void *vd, uint32_t start, 2408 uint32_t incr, uint32_t desc) 2409 { 2410 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2411 uint32_t *d = vd; 2412 for (i = 0; i < opr_sz; i += 1) { 2413 d[H4(i)] = start + i * incr; 2414 } 2415 } 2416 2417 void HELPER(sve_index_d)(void *vd, uint64_t start, 2418 uint64_t incr, uint32_t desc) 2419 { 2420 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2421 uint64_t *d = vd; 2422 for (i = 0; i < opr_sz; i += 1) { 2423 d[i] = start + i * incr; 2424 } 2425 } 2426 2427 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc) 2428 { 2429 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2430 uint32_t sh = simd_data(desc); 2431 uint32_t *d = vd, *n = vn, *m = vm; 2432 for (i = 0; i < opr_sz; i += 1) { 2433 d[i] = n[i] + (m[i] << sh); 2434 } 2435 } 2436 2437 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc) 2438 { 2439 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2440 uint64_t sh = simd_data(desc); 2441 uint64_t *d = vd, *n = vn, *m = vm; 2442 for (i = 0; i < opr_sz; i += 1) { 2443 d[i] = n[i] + (m[i] << sh); 2444 } 2445 } 2446 2447 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc) 2448 { 2449 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2450 uint64_t sh = simd_data(desc); 2451 uint64_t *d = vd, *n = vn, *m = vm; 2452 for (i = 0; i < opr_sz; i += 1) { 2453 d[i] = 
n[i] + ((uint64_t)(int32_t)m[i] << sh); 2454 } 2455 } 2456 2457 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc) 2458 { 2459 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2460 uint64_t sh = simd_data(desc); 2461 uint64_t *d = vd, *n = vn, *m = vm; 2462 for (i = 0; i < opr_sz; i += 1) { 2463 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh); 2464 } 2465 } 2466 2467 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc) 2468 { 2469 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2470 static const uint16_t coeff[] = { 2471 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8, 2472 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189, 2473 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295, 2474 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4, 2475 }; 2476 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2477 uint16_t *d = vd, *n = vn; 2478 2479 for (i = 0; i < opr_sz; i++) { 2480 uint16_t nn = n[i]; 2481 intptr_t idx = extract32(nn, 0, 5); 2482 uint16_t exp = extract32(nn, 5, 5); 2483 d[i] = coeff[idx] | (exp << 10); 2484 } 2485 } 2486 2487 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc) 2488 { 2489 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2490 static const uint32_t coeff[] = { 2491 0x000000, 0x0164d2, 0x02cd87, 0x043a29, 2492 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5, 2493 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc, 2494 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d, 2495 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda, 2496 0x1ef532, 0x20b051, 0x227043, 0x243516, 2497 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a, 2498 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4, 2499 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b, 2500 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd, 2501 0x45672a, 0x478d75, 0x49b9be, 0x4bec15, 2502 0x4e248c, 0x506334, 0x52a81e, 0x54f35b, 2503 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5, 2504 0x60ccdf, 0x633f89, 0x65b907, 0x68396a, 2505 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177, 2506 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c, 2507 }; 2508 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2509 uint32_t *d = vd, *n = vn; 2510 2511 for (i = 0; i < opr_sz; i++) { 2512 uint32_t nn = n[i]; 2513 intptr_t idx = extract32(nn, 0, 6); 2514 uint32_t exp = extract32(nn, 6, 8); 2515 d[i] = coeff[idx] | (exp << 23); 2516 } 2517 } 2518 2519 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc) 2520 { 2521 /* These constants are cut-and-paste directly from the ARM pseudocode. 
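 * In effect, each of the 64 entries is the 52-bit fraction of 2^(j/64)
 * for j = 0..63; e.g. entry 1, 0x02C9A3E778061, is the fraction of
 * 2^(1/64), about 1.0109.  The loop below then glues the input's
 * exponent field on top via exp << 52.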
*/ 2522 static const uint64_t coeff[] = { 2523 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull, 2524 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull, 2525 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull, 2526 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull, 2527 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull, 2528 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull, 2529 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull, 2530 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull, 2531 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull, 2532 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull, 2533 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull, 2534 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull, 2535 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull, 2536 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull, 2537 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull, 2538 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull, 2539 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull, 2540 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull, 2541 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull, 2542 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull, 2543 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull, 2544 0xFA7C1819E90D8ull, 2545 }; 2546 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2547 uint64_t *d = vd, *n = vn; 2548 2549 for (i = 0; i < opr_sz; i++) { 2550 uint64_t nn = n[i]; 2551 intptr_t idx = extract32(nn, 0, 6); 2552 uint64_t exp = extract32(nn, 6, 11); 2553 d[i] = coeff[idx] | (exp << 52); 2554 } 2555 } 2556 2557 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc) 2558 { 2559 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2560 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2561 uint16_t *d = vd, *n = vn, *m = vm; 2562 for (i = 0; i < opr_sz; i += 1) { 2563 uint16_t nn = n[i]; 2564 uint16_t mm = m[i]; 2565 if (mm & 1) { 2566 nn = float16_one; 2567 } 2568 if (mm & 2) { 2569 nn = float16_maybe_ah_chs(nn, fpcr_ah); 2570 } 2571 d[i] = nn; 2572 } 2573 } 2574 2575 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc) 2576 { 2577 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2578 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2579 uint32_t *d = vd, *n = vn, *m = vm; 2580 for (i = 0; i < opr_sz; i += 1) { 2581 uint32_t nn = n[i]; 2582 uint32_t mm = m[i]; 2583 if (mm & 1) { 2584 nn = float32_one; 2585 } 2586 if (mm & 2) { 2587 nn = float32_maybe_ah_chs(nn, fpcr_ah); 2588 } 2589 d[i] = nn; 2590 } 2591 } 2592 2593 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc) 2594 { 2595 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2596 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2597 uint64_t *d = vd, *n = vn, *m = vm; 2598 for (i = 0; i < opr_sz; i += 1) { 2599 uint64_t nn = n[i]; 2600 uint64_t mm = m[i]; 2601 if (mm & 1) { 2602 nn = float64_one; 2603 } 2604 if (mm & 2) { 2605 nn = float64_maybe_ah_chs(nn, fpcr_ah); 2606 } 2607 d[i] = nn; 2608 } 2609 } 2610 2611 /* 2612 * Signed saturating addition with scalar operand. 
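 * Illustrative values: sve_sqaddi_b with b = 96 and an element of 80
 * saturates the 8-bit result to INT8_MAX (127) rather than wrapping to -80,
 * and a sufficiently negative b pins the result at INT8_MIN instead.  The
 * DO_SQADD_* and do_sqadd_d helpers defined earlier supply the saturation.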
2613 */ 2614 2615 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2616 { 2617 intptr_t i, oprsz = simd_oprsz(desc); 2618 2619 for (i = 0; i < oprsz; i += sizeof(int8_t)) { 2620 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i)); 2621 } 2622 } 2623 2624 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2625 { 2626 intptr_t i, oprsz = simd_oprsz(desc); 2627 2628 for (i = 0; i < oprsz; i += sizeof(int16_t)) { 2629 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i)); 2630 } 2631 } 2632 2633 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2634 { 2635 intptr_t i, oprsz = simd_oprsz(desc); 2636 2637 for (i = 0; i < oprsz; i += sizeof(int32_t)) { 2638 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i)); 2639 } 2640 } 2641 2642 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc) 2643 { 2644 intptr_t i, oprsz = simd_oprsz(desc); 2645 2646 for (i = 0; i < oprsz; i += sizeof(int64_t)) { 2647 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i)); 2648 } 2649 } 2650 2651 /* 2652 * Unsigned saturating addition with scalar operand. 2653 */ 2654 2655 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2656 { 2657 intptr_t i, oprsz = simd_oprsz(desc); 2658 2659 for (i = 0; i < oprsz; i += sizeof(uint8_t)) { 2660 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i)); 2661 } 2662 } 2663 2664 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2665 { 2666 intptr_t i, oprsz = simd_oprsz(desc); 2667 2668 for (i = 0; i < oprsz; i += sizeof(uint16_t)) { 2669 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i)); 2670 } 2671 } 2672 2673 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2674 { 2675 intptr_t i, oprsz = simd_oprsz(desc); 2676 2677 for (i = 0; i < oprsz; i += sizeof(uint32_t)) { 2678 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i)); 2679 } 2680 } 2681 2682 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2683 { 2684 intptr_t i, oprsz = simd_oprsz(desc); 2685 2686 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2687 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i)); 2688 } 2689 } 2690 2691 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2692 { 2693 intptr_t i, oprsz = simd_oprsz(desc); 2694 2695 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2696 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b); 2697 } 2698 } 2699 2700 /* Two operand predicated copy immediate with merge. All valid immediates 2701 * can fit within 17 signed bits in the simd_data field. 
2702 */ 2703 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg, 2704 uint64_t mm, uint32_t desc) 2705 { 2706 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2707 uint64_t *d = vd, *n = vn; 2708 uint8_t *pg = vg; 2709 2710 mm = dup_const(MO_8, mm); 2711 for (i = 0; i < opr_sz; i += 1) { 2712 uint64_t nn = n[i]; 2713 uint64_t pp = expand_pred_b(pg[H1(i)]); 2714 d[i] = (mm & pp) | (nn & ~pp); 2715 } 2716 } 2717 2718 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg, 2719 uint64_t mm, uint32_t desc) 2720 { 2721 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2722 uint64_t *d = vd, *n = vn; 2723 uint8_t *pg = vg; 2724 2725 mm = dup_const(MO_16, mm); 2726 for (i = 0; i < opr_sz; i += 1) { 2727 uint64_t nn = n[i]; 2728 uint64_t pp = expand_pred_h(pg[H1(i)]); 2729 d[i] = (mm & pp) | (nn & ~pp); 2730 } 2731 } 2732 2733 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg, 2734 uint64_t mm, uint32_t desc) 2735 { 2736 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2737 uint64_t *d = vd, *n = vn; 2738 uint8_t *pg = vg; 2739 2740 mm = dup_const(MO_32, mm); 2741 for (i = 0; i < opr_sz; i += 1) { 2742 uint64_t nn = n[i]; 2743 uint64_t pp = expand_pred_s(pg[H1(i)]); 2744 d[i] = (mm & pp) | (nn & ~pp); 2745 } 2746 } 2747 2748 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg, 2749 uint64_t mm, uint32_t desc) 2750 { 2751 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2752 uint64_t *d = vd, *n = vn; 2753 uint8_t *pg = vg; 2754 2755 for (i = 0; i < opr_sz; i += 1) { 2756 uint64_t nn = n[i]; 2757 d[i] = (pg[H1(i)] & 1 ? mm : nn); 2758 } 2759 } 2760 2761 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc) 2762 { 2763 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2764 uint64_t *d = vd; 2765 uint8_t *pg = vg; 2766 2767 val = dup_const(MO_8, val); 2768 for (i = 0; i < opr_sz; i += 1) { 2769 d[i] = val & expand_pred_b(pg[H1(i)]); 2770 } 2771 } 2772 2773 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc) 2774 { 2775 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2776 uint64_t *d = vd; 2777 uint8_t *pg = vg; 2778 2779 val = dup_const(MO_16, val); 2780 for (i = 0; i < opr_sz; i += 1) { 2781 d[i] = val & expand_pred_h(pg[H1(i)]); 2782 } 2783 } 2784 2785 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc) 2786 { 2787 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2788 uint64_t *d = vd; 2789 uint8_t *pg = vg; 2790 2791 val = dup_const(MO_32, val); 2792 for (i = 0; i < opr_sz; i += 1) { 2793 d[i] = val & expand_pred_s(pg[H1(i)]); 2794 } 2795 } 2796 2797 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc) 2798 { 2799 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2800 uint64_t *d = vd; 2801 uint8_t *pg = vg; 2802 2803 for (i = 0; i < opr_sz; i += 1) { 2804 d[i] = (pg[H1(i)] & 1 ? val : 0); 2805 } 2806 } 2807 2808 /* Big-endian hosts need to frob the byte indices. If the copy 2809 * happens to be 8-byte aligned, then no frobbing necessary. 
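 * (The register file is stored as host-endian uint64_t units; on a
 * big-endian host the H1, H1_2 and H1_4 macros xor the byte offset with
 * 7, 6 and 4 respectively so that sub-8-byte copies land on the right
 * bytes, while fully 8-byte-aligned copies can use memmove directly.)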
2810 */ 2811 static void swap_memmove(void *vd, void *vs, size_t n) 2812 { 2813 uintptr_t d = (uintptr_t)vd; 2814 uintptr_t s = (uintptr_t)vs; 2815 uintptr_t o = (d | s | n) & 7; 2816 size_t i; 2817 2818 #if !HOST_BIG_ENDIAN 2819 o = 0; 2820 #endif 2821 switch (o) { 2822 case 0: 2823 memmove(vd, vs, n); 2824 break; 2825 2826 case 4: 2827 if (d < s || d >= s + n) { 2828 for (i = 0; i < n; i += 4) { 2829 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2830 } 2831 } else { 2832 for (i = n; i > 0; ) { 2833 i -= 4; 2834 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2835 } 2836 } 2837 break; 2838 2839 case 2: 2840 case 6: 2841 if (d < s || d >= s + n) { 2842 for (i = 0; i < n; i += 2) { 2843 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2844 } 2845 } else { 2846 for (i = n; i > 0; ) { 2847 i -= 2; 2848 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2849 } 2850 } 2851 break; 2852 2853 default: 2854 if (d < s || d >= s + n) { 2855 for (i = 0; i < n; i++) { 2856 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2857 } 2858 } else { 2859 for (i = n; i > 0; ) { 2860 i -= 1; 2861 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2862 } 2863 } 2864 break; 2865 } 2866 } 2867 2868 /* Similarly for memset of 0. */ 2869 static void swap_memzero(void *vd, size_t n) 2870 { 2871 uintptr_t d = (uintptr_t)vd; 2872 uintptr_t o = (d | n) & 7; 2873 size_t i; 2874 2875 /* Usually, the first bit of a predicate is set, so N is 0. */ 2876 if (likely(n == 0)) { 2877 return; 2878 } 2879 2880 #if !HOST_BIG_ENDIAN 2881 o = 0; 2882 #endif 2883 switch (o) { 2884 case 0: 2885 memset(vd, 0, n); 2886 break; 2887 2888 case 4: 2889 for (i = 0; i < n; i += 4) { 2890 *(uint32_t *)H1_4(d + i) = 0; 2891 } 2892 break; 2893 2894 case 2: 2895 case 6: 2896 for (i = 0; i < n; i += 2) { 2897 *(uint16_t *)H1_2(d + i) = 0; 2898 } 2899 break; 2900 2901 default: 2902 for (i = 0; i < n; i++) { 2903 *(uint8_t *)H1(d + i) = 0; 2904 } 2905 break; 2906 } 2907 } 2908 2909 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) 2910 { 2911 intptr_t opr_sz = simd_oprsz(desc); 2912 size_t n_ofs = simd_data(desc); 2913 size_t n_siz = opr_sz - n_ofs; 2914 2915 if (vd != vm) { 2916 swap_memmove(vd, vn + n_ofs, n_siz); 2917 swap_memmove(vd + n_siz, vm, n_ofs); 2918 } else if (vd != vn) { 2919 swap_memmove(vd + n_siz, vd, n_ofs); 2920 swap_memmove(vd, vn + n_ofs, n_siz); 2921 } else { 2922 /* vd == vn == vm. Need temp space. 
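 * With full overlap the two rotated halves would clobber each other, so
 * stage the low n_ofs bytes in a stack temporary first, slide the
 * remainder down in place, then drop the temporary into the top.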
*/ 2923 ARMVectorReg tmp; 2924 swap_memmove(&tmp, vm, n_ofs); 2925 swap_memmove(vd, vd + n_ofs, n_siz); 2926 memcpy(vd + n_siz, &tmp, n_ofs); 2927 } 2928 } 2929 2930 #define DO_INSR(NAME, TYPE, H) \ 2931 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ 2932 { \ 2933 intptr_t opr_sz = simd_oprsz(desc); \ 2934 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ 2935 *(TYPE *)(vd + H(0)) = val; \ 2936 } 2937 2938 DO_INSR(sve_insr_b, uint8_t, H1) 2939 DO_INSR(sve_insr_h, uint16_t, H1_2) 2940 DO_INSR(sve_insr_s, uint32_t, H1_4) 2941 DO_INSR(sve_insr_d, uint64_t, H1_8) 2942 2943 #undef DO_INSR 2944 2945 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) 2946 { 2947 intptr_t i, j, opr_sz = simd_oprsz(desc); 2948 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2949 uint64_t f = *(uint64_t *)(vn + i); 2950 uint64_t b = *(uint64_t *)(vn + j); 2951 *(uint64_t *)(vd + i) = bswap64(b); 2952 *(uint64_t *)(vd + j) = bswap64(f); 2953 } 2954 } 2955 2956 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) 2957 { 2958 intptr_t i, j, opr_sz = simd_oprsz(desc); 2959 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2960 uint64_t f = *(uint64_t *)(vn + i); 2961 uint64_t b = *(uint64_t *)(vn + j); 2962 *(uint64_t *)(vd + i) = hswap64(b); 2963 *(uint64_t *)(vd + j) = hswap64(f); 2964 } 2965 } 2966 2967 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) 2968 { 2969 intptr_t i, j, opr_sz = simd_oprsz(desc); 2970 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2971 uint64_t f = *(uint64_t *)(vn + i); 2972 uint64_t b = *(uint64_t *)(vn + j); 2973 *(uint64_t *)(vd + i) = rol64(b, 32); 2974 *(uint64_t *)(vd + j) = rol64(f, 32); 2975 } 2976 } 2977 2978 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) 2979 { 2980 intptr_t i, j, opr_sz = simd_oprsz(desc); 2981 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2982 uint64_t f = *(uint64_t *)(vn + i); 2983 uint64_t b = *(uint64_t *)(vn + j); 2984 *(uint64_t *)(vd + i) = b; 2985 *(uint64_t *)(vd + j) = f; 2986 } 2987 } 2988 2989 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); 2990 2991 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, 2992 bool is_tbx, tb_impl_fn *fn) 2993 { 2994 ARMVectorReg scratch; 2995 uintptr_t oprsz = simd_oprsz(desc); 2996 2997 if (unlikely(vd == vn)) { 2998 vn = memcpy(&scratch, vn, oprsz); 2999 } 3000 3001 fn(vd, vn, NULL, vm, oprsz, is_tbx); 3002 } 3003 3004 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm, 3005 uint32_t desc, bool is_tbx, tb_impl_fn *fn) 3006 { 3007 ARMVectorReg scratch; 3008 uintptr_t oprsz = simd_oprsz(desc); 3009 3010 if (unlikely(vd == vn0)) { 3011 vn0 = memcpy(&scratch, vn0, oprsz); 3012 if (vd == vn1) { 3013 vn1 = vn0; 3014 } 3015 } else if (unlikely(vd == vn1)) { 3016 vn1 = memcpy(&scratch, vn1, oprsz); 3017 } 3018 3019 fn(vd, vn0, vn1, vm, oprsz, is_tbx); 3020 } 3021 3022 #define DO_TB(SUFF, TYPE, H) \ 3023 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \ 3024 void *vm, uintptr_t oprsz, bool is_tbx) \ 3025 { \ 3026 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \ 3027 uintptr_t i, nelem = oprsz / sizeof(TYPE); \ 3028 for (i = 0; i < nelem; ++i) { \ 3029 TYPE index = indexes[H1(i)], val = 0; \ 3030 if (index < nelem) { \ 3031 val = tbl0[H(index)]; \ 3032 } else { \ 3033 index -= nelem; \ 3034 if (tbl1 && index < nelem) { \ 3035 val = tbl1[H(index)]; \ 3036 } else if (is_tbx) { \ 3037 continue; \ 3038 } \ 3039 } \ 3040 
d[H(i)] = val; \ 3041 } \ 3042 } \ 3043 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3044 { \ 3045 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \ 3046 } \ 3047 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \ 3048 void *vm, uint32_t desc) \ 3049 { \ 3050 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \ 3051 } \ 3052 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3053 { \ 3054 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \ 3055 } 3056 3057 DO_TB(b, uint8_t, H1) 3058 DO_TB(h, uint16_t, H2) 3059 DO_TB(s, uint32_t, H4) 3060 DO_TB(d, uint64_t, H8) 3061 3062 #undef DO_TB 3063 3064 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ 3065 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 3066 { \ 3067 intptr_t i, opr_sz = simd_oprsz(desc); \ 3068 TYPED *d = vd; \ 3069 TYPES *n = vn; \ 3070 ARMVectorReg tmp; \ 3071 if (unlikely(vn - vd < opr_sz)) { \ 3072 n = memcpy(&tmp, n, opr_sz / 2); \ 3073 } \ 3074 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ 3075 d[HD(i)] = n[HS(i)]; \ 3076 } \ 3077 } 3078 3079 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) 3080 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) 3081 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4) 3082 3083 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) 3084 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) 3085 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4) 3086 3087 #undef DO_UNPK 3088 3089 /* Mask of bits included in the even numbered predicates of width esz. 3090 * We also use this for expand_bits/compress_bits, and so extend the 3091 * same pattern out to 16-bit units. 3092 */ 3093 static const uint64_t even_bit_esz_masks[5] = { 3094 0x5555555555555555ull, 3095 0x3333333333333333ull, 3096 0x0f0f0f0f0f0f0f0full, 3097 0x00ff00ff00ff00ffull, 3098 0x0000ffff0000ffffull, 3099 }; 3100 3101 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits. 3102 * For N==0, this corresponds to the operation that in qemu/bitops.h 3103 * we call half_shuffle64; this algorithm is from Hacker's Delight, 3104 * section 7-2 Shuffling Bits. 3105 */ 3106 static uint64_t expand_bits(uint64_t x, int n) 3107 { 3108 int i; 3109 3110 x &= 0xffffffffu; 3111 for (i = 4; i >= n; i--) { 3112 int sh = 1 << i; 3113 x = ((x << sh) | x) & even_bit_esz_masks[i]; 3114 } 3115 return x; 3116 } 3117 3118 /* Compress units of 2**(N+1) bits to units of 2**N bits. 3119 * For N==0, this corresponds to the operation that in qemu/bitops.h 3120 * we call half_unshuffle64; this algorithm is from Hacker's Delight, 3121 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. 
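 * As an illustration, for N == 0 expand_bits moves bit k of the input to
 * bit 2k of the result, e.g. expand_bits(0b1011, 0) == 0b01000101, and
 * compress_bits below is its inverse: compress_bits(0b01000101, 0) == 0b1011.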
3122 */ 3123 static uint64_t compress_bits(uint64_t x, int n) 3124 { 3125 int i; 3126 3127 for (i = n; i <= 4; i++) { 3128 int sh = 1 << i; 3129 x &= even_bit_esz_masks[i]; 3130 x = (x >> sh) | x; 3131 } 3132 return x & 0xffffffffu; 3133 } 3134 3135 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3136 { 3137 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3138 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3139 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3140 int esize = 1 << esz; 3141 uint64_t *d = vd; 3142 intptr_t i; 3143 3144 if (oprsz <= 8) { 3145 uint64_t nn = *(uint64_t *)vn; 3146 uint64_t mm = *(uint64_t *)vm; 3147 int half = 4 * oprsz; 3148 3149 nn = extract64(nn, high * half, half); 3150 mm = extract64(mm, high * half, half); 3151 nn = expand_bits(nn, esz); 3152 mm = expand_bits(mm, esz); 3153 d[0] = nn | (mm << esize); 3154 } else { 3155 ARMPredicateReg tmp; 3156 3157 /* We produce output faster than we consume input. 3158 Therefore we must be mindful of possible overlap. */ 3159 if (vd == vn) { 3160 vn = memcpy(&tmp, vn, oprsz); 3161 if (vd == vm) { 3162 vm = vn; 3163 } 3164 } else if (vd == vm) { 3165 vm = memcpy(&tmp, vm, oprsz); 3166 } 3167 if (high) { 3168 high = oprsz >> 1; 3169 } 3170 3171 if ((oprsz & 7) == 0) { 3172 uint32_t *n = vn, *m = vm; 3173 high >>= 2; 3174 3175 for (i = 0; i < oprsz / 8; i++) { 3176 uint64_t nn = n[H4(high + i)]; 3177 uint64_t mm = m[H4(high + i)]; 3178 3179 nn = expand_bits(nn, esz); 3180 mm = expand_bits(mm, esz); 3181 d[i] = nn | (mm << esize); 3182 } 3183 } else { 3184 uint8_t *n = vn, *m = vm; 3185 uint16_t *d16 = vd; 3186 3187 for (i = 0; i < oprsz / 2; i++) { 3188 uint16_t nn = n[H1(high + i)]; 3189 uint16_t mm = m[H1(high + i)]; 3190 3191 nn = expand_bits(nn, esz); 3192 mm = expand_bits(mm, esz); 3193 d16[H2(i)] = nn | (mm << esize); 3194 } 3195 } 3196 } 3197 } 3198 3199 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3200 { 3201 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3202 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3203 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz; 3204 uint64_t *d = vd, *n = vn, *m = vm; 3205 uint64_t l, h; 3206 intptr_t i; 3207 3208 if (oprsz <= 8) { 3209 l = compress_bits(n[0] >> odd, esz); 3210 h = compress_bits(m[0] >> odd, esz); 3211 d[0] = l | (h << (4 * oprsz)); 3212 } else { 3213 ARMPredicateReg tmp_m; 3214 intptr_t oprsz_16 = oprsz / 16; 3215 3216 if ((vm - vd) < (uintptr_t)oprsz) { 3217 m = memcpy(&tmp_m, vm, oprsz); 3218 } 3219 3220 for (i = 0; i < oprsz_16; i++) { 3221 l = n[2 * i + 0]; 3222 h = n[2 * i + 1]; 3223 l = compress_bits(l >> odd, esz); 3224 h = compress_bits(h >> odd, esz); 3225 d[i] = l | (h << 32); 3226 } 3227 3228 /* 3229 * For VL which is not a multiple of 512, the results from M do not 3230 * align nicely with the uint64_t for D. Put the aligned results 3231 * from M into TMP_M and then copy it into place afterward. 
3232 */ 3233 if (oprsz & 15) { 3234 int final_shift = (oprsz & 15) * 2; 3235 3236 l = n[2 * i + 0]; 3237 h = n[2 * i + 1]; 3238 l = compress_bits(l >> odd, esz); 3239 h = compress_bits(h >> odd, esz); 3240 d[i] = l | (h << final_shift); 3241 3242 for (i = 0; i < oprsz_16; i++) { 3243 l = m[2 * i + 0]; 3244 h = m[2 * i + 1]; 3245 l = compress_bits(l >> odd, esz); 3246 h = compress_bits(h >> odd, esz); 3247 tmp_m.p[i] = l | (h << 32); 3248 } 3249 l = m[2 * i + 0]; 3250 h = m[2 * i + 1]; 3251 l = compress_bits(l >> odd, esz); 3252 h = compress_bits(h >> odd, esz); 3253 tmp_m.p[i] = l | (h << final_shift); 3254 3255 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); 3256 } else { 3257 for (i = 0; i < oprsz_16; i++) { 3258 l = m[2 * i + 0]; 3259 h = m[2 * i + 1]; 3260 l = compress_bits(l >> odd, esz); 3261 h = compress_bits(h >> odd, esz); 3262 d[oprsz_16 + i] = l | (h << 32); 3263 } 3264 } 3265 } 3266 } 3267 3268 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3269 { 3270 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3271 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3272 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA); 3273 uint64_t *d = vd, *n = vn, *m = vm; 3274 uint64_t mask; 3275 int shr, shl; 3276 intptr_t i; 3277 3278 shl = 1 << esz; 3279 shr = 0; 3280 mask = even_bit_esz_masks[esz]; 3281 if (odd) { 3282 mask <<= shl; 3283 shr = shl; 3284 shl = 0; 3285 } 3286 3287 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { 3288 uint64_t nn = (n[i] & mask) >> shr; 3289 uint64_t mm = (m[i] & mask) << shl; 3290 d[i] = nn + mm; 3291 } 3292 } 3293 3294 /* Reverse units of 2**N bits. */ 3295 static uint64_t reverse_bits_64(uint64_t x, int n) 3296 { 3297 int i, sh; 3298 3299 x = bswap64(x); 3300 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3301 uint64_t mask = even_bit_esz_masks[i]; 3302 x = ((x & mask) << sh) | ((x >> sh) & mask); 3303 } 3304 return x; 3305 } 3306 3307 static uint8_t reverse_bits_8(uint8_t x, int n) 3308 { 3309 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; 3310 int i, sh; 3311 3312 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3313 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); 3314 } 3315 return x; 3316 } 3317 3318 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) 3319 { 3320 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3321 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3322 intptr_t i, oprsz_2 = oprsz / 2; 3323 3324 if (oprsz <= 8) { 3325 uint64_t l = *(uint64_t *)vn; 3326 l = reverse_bits_64(l << (64 - 8 * oprsz), esz); 3327 *(uint64_t *)vd = l; 3328 } else if ((oprsz & 15) == 0) { 3329 for (i = 0; i < oprsz_2; i += 8) { 3330 intptr_t ih = oprsz - 8 - i; 3331 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); 3332 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); 3333 *(uint64_t *)(vd + i) = h; 3334 *(uint64_t *)(vd + ih) = l; 3335 } 3336 } else { 3337 for (i = 0; i < oprsz_2; i += 1) { 3338 intptr_t il = H1(i); 3339 intptr_t ih = H1(oprsz - 1 - i); 3340 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); 3341 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); 3342 *(uint8_t *)(vd + il) = h; 3343 *(uint8_t *)(vd + ih) = l; 3344 } 3345 } 3346 } 3347 3348 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) 3349 { 3350 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3351 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3352 uint64_t *d = vd; 3353 intptr_t i; 3354 3355 if (oprsz <= 8) { 3356 uint64_t nn = *(uint64_t *)vn; 3357 int half = 4 * oprsz; 3358 3359 nn = 
extract64(nn, high * half, half); 3360 nn = expand_bits(nn, 0); 3361 d[0] = nn; 3362 } else { 3363 ARMPredicateReg tmp_n; 3364 3365 /* We produce output faster than we consume input. 3366 Therefore we must be mindful of possible overlap. */ 3367 if ((vn - vd) < (uintptr_t)oprsz) { 3368 vn = memcpy(&tmp_n, vn, oprsz); 3369 } 3370 if (high) { 3371 high = oprsz >> 1; 3372 } 3373 3374 if ((oprsz & 7) == 0) { 3375 uint32_t *n = vn; 3376 high >>= 2; 3377 3378 for (i = 0; i < oprsz / 8; i++) { 3379 uint64_t nn = n[H4(high + i)]; 3380 d[i] = expand_bits(nn, 0); 3381 } 3382 } else { 3383 uint16_t *d16 = vd; 3384 uint8_t *n = vn; 3385 3386 for (i = 0; i < oprsz / 2; i++) { 3387 uint16_t nn = n[H1(high + i)]; 3388 d16[H2(i)] = expand_bits(nn, 0); 3389 } 3390 } 3391 } 3392 } 3393 3394 #define DO_ZIP(NAME, TYPE, H) \ 3395 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3396 { \ 3397 intptr_t oprsz = simd_oprsz(desc); \ 3398 intptr_t odd_ofs = simd_data(desc); \ 3399 intptr_t i, oprsz_2 = oprsz / 2; \ 3400 ARMVectorReg tmp_n, tmp_m; \ 3401 /* We produce output faster than we consume input. \ 3402 Therefore we must be mindful of possible overlap. */ \ 3403 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ 3404 vn = memcpy(&tmp_n, vn, oprsz); \ 3405 } \ 3406 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3407 vm = memcpy(&tmp_m, vm, oprsz); \ 3408 } \ 3409 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ 3410 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \ 3411 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \ 3412 *(TYPE *)(vm + odd_ofs + H(i)); \ 3413 } \ 3414 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3415 memset(vd + oprsz - 16, 0, 16); \ 3416 } \ 3417 } 3418 3419 DO_ZIP(sve_zip_b, uint8_t, H1) 3420 DO_ZIP(sve_zip_h, uint16_t, H1_2) 3421 DO_ZIP(sve_zip_s, uint32_t, H1_4) 3422 DO_ZIP(sve_zip_d, uint64_t, H1_8) 3423 DO_ZIP(sve2_zip_q, Int128, ) 3424 3425 #define DO_UZP(NAME, TYPE, H) \ 3426 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3427 { \ 3428 intptr_t oprsz = simd_oprsz(desc); \ 3429 intptr_t odd_ofs = simd_data(desc); \ 3430 intptr_t i, p; \ 3431 ARMVectorReg tmp_m; \ 3432 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3433 vm = memcpy(&tmp_m, vm, oprsz); \ 3434 } \ 3435 i = 0, p = odd_ofs; \ 3436 do { \ 3437 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \ 3438 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3439 } while (p < oprsz); \ 3440 p -= oprsz; \ 3441 do { \ 3442 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \ 3443 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3444 } while (p < oprsz); \ 3445 tcg_debug_assert(i == oprsz); \ 3446 } 3447 3448 DO_UZP(sve_uzp_b, uint8_t, H1) 3449 DO_UZP(sve_uzp_h, uint16_t, H1_2) 3450 DO_UZP(sve_uzp_s, uint32_t, H1_4) 3451 DO_UZP(sve_uzp_d, uint64_t, H1_8) 3452 DO_UZP(sve2_uzp_q, Int128, ) 3453 3454 #define DO_TRN(NAME, TYPE, H) \ 3455 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3456 { \ 3457 intptr_t oprsz = simd_oprsz(desc); \ 3458 intptr_t odd_ofs = simd_data(desc); \ 3459 intptr_t i; \ 3460 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ 3461 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ 3462 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ 3463 *(TYPE *)(vd + H(i + 0)) = ae; \ 3464 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ 3465 } \ 3466 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3467 memset(vd + oprsz - 16, 0, 16); \ 3468 } \ 3469 } 3470 3471 DO_TRN(sve_trn_b, uint8_t, H1) 3472 DO_TRN(sve_trn_h, uint16_t, H1_2) 3473 DO_TRN(sve_trn_s, uint32_t, H1_4) 3474 DO_TRN(sve_trn_d, 
uint64_t, H1_8) 3475 DO_TRN(sve2_trn_q, Int128, ) 3476 3477 #undef DO_ZIP 3478 #undef DO_UZP 3479 #undef DO_TRN 3480 3481 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) 3482 { 3483 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; 3484 uint32_t *d = vd, *n = vn; 3485 uint8_t *pg = vg; 3486 3487 for (i = j = 0; i < opr_sz; i++) { 3488 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { 3489 d[H4(j)] = n[H4(i)]; 3490 j++; 3491 } 3492 } 3493 for (; j < opr_sz; j++) { 3494 d[H4(j)] = 0; 3495 } 3496 } 3497 3498 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) 3499 { 3500 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; 3501 uint64_t *d = vd, *n = vn; 3502 uint8_t *pg = vg; 3503 3504 for (i = j = 0; i < opr_sz; i++) { 3505 if (pg[H1(i)] & 1) { 3506 d[j] = n[i]; 3507 j++; 3508 } 3509 } 3510 for (; j < opr_sz; j++) { 3511 d[j] = 0; 3512 } 3513 } 3514 3515 /* Similar to the ARM LastActiveElement pseudocode function, except the 3516 * result is multiplied by the element size. This includes the not found 3517 * indication; e.g. not found for esz=3 is -8. 3518 */ 3519 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) 3520 { 3521 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 3522 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3523 3524 return last_active_element(vg, words, esz); 3525 } 3526 3527 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) 3528 { 3529 intptr_t opr_sz = simd_oprsz(desc) / 8; 3530 int esz = simd_data(desc); 3531 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; 3532 intptr_t i, first_i, last_i; 3533 ARMVectorReg tmp; 3534 3535 first_i = last_i = 0; 3536 first_g = last_g = 0; 3537 3538 /* Find the extent of the active elements within VG. 
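 * One bit of VG corresponds to one byte of the vector, so the bit index of
 * the first/last active element is also its byte offset within VN.  For
 * example, with esz == 2 and only elements 2 and 5 active, bits 8 and 20
 * are set, giving first_i = 8, last_i = 20 and len = 20 - 8 + 4 = 16
 * vector bytes copied from VN below, with the remainder taken from VM.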
*/ 3539 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { 3540 pg = *(uint64_t *)(vg + i) & mask; 3541 if (pg) { 3542 if (last_g == 0) { 3543 last_g = pg; 3544 last_i = i; 3545 } 3546 first_g = pg; 3547 first_i = i; 3548 } 3549 } 3550 3551 len = 0; 3552 if (first_g != 0) { 3553 first_i = first_i * 8 + ctz64(first_g); 3554 last_i = last_i * 8 + 63 - clz64(last_g); 3555 len = last_i - first_i + (1 << esz); 3556 if (vd == vm) { 3557 vm = memcpy(&tmp, vm, opr_sz * 8); 3558 } 3559 swap_memmove(vd, vn + first_i, len); 3560 } 3561 swap_memmove(vd + len, vm, opr_sz * 8 - len); 3562 } 3563 3564 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, 3565 void *vg, uint32_t desc) 3566 { 3567 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3568 uint64_t *d = vd, *n = vn, *m = vm; 3569 uint8_t *pg = vg; 3570 3571 for (i = 0; i < opr_sz; i += 1) { 3572 uint64_t nn = n[i], mm = m[i]; 3573 uint64_t pp = expand_pred_b(pg[H1(i)]); 3574 d[i] = (nn & pp) | (mm & ~pp); 3575 } 3576 } 3577 3578 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, 3579 void *vg, uint32_t desc) 3580 { 3581 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3582 uint64_t *d = vd, *n = vn, *m = vm; 3583 uint8_t *pg = vg; 3584 3585 for (i = 0; i < opr_sz; i += 1) { 3586 uint64_t nn = n[i], mm = m[i]; 3587 uint64_t pp = expand_pred_h(pg[H1(i)]); 3588 d[i] = (nn & pp) | (mm & ~pp); 3589 } 3590 } 3591 3592 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, 3593 void *vg, uint32_t desc) 3594 { 3595 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3596 uint64_t *d = vd, *n = vn, *m = vm; 3597 uint8_t *pg = vg; 3598 3599 for (i = 0; i < opr_sz; i += 1) { 3600 uint64_t nn = n[i], mm = m[i]; 3601 uint64_t pp = expand_pred_s(pg[H1(i)]); 3602 d[i] = (nn & pp) | (mm & ~pp); 3603 } 3604 } 3605 3606 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, 3607 void *vg, uint32_t desc) 3608 { 3609 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3610 uint64_t *d = vd, *n = vn, *m = vm; 3611 uint8_t *pg = vg; 3612 3613 for (i = 0; i < opr_sz; i += 1) { 3614 uint64_t nn = n[i], mm = m[i]; 3615 d[i] = (pg[H1(i)] & 1 ? nn : mm); 3616 } 3617 } 3618 3619 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm, 3620 void *vg, uint32_t desc) 3621 { 3622 intptr_t i, opr_sz = simd_oprsz(desc) / 16; 3623 Int128 *d = vd, *n = vn, *m = vm; 3624 uint16_t *pg = vg; 3625 3626 for (i = 0; i < opr_sz; i += 1) { 3627 d[i] = (pg[H2(i)] & 1 ? n : m)[i]; 3628 } 3629 } 3630 3631 /* Two operand comparison controlled by a predicate. 3632 * ??? It is very tempting to want to be able to expand this inline 3633 * with x86 instructions, e.g. 3634 * 3635 * vcmpeqw zm, zn, %ymm0 3636 * vpmovmskb %ymm0, %eax 3637 * and $0x5555, %eax 3638 * and pg, %eax 3639 * 3640 * or even aarch64, e.g. 3641 * 3642 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 3643 * cmeq v0.8h, zn, zm 3644 * and v0.8h, v0.8h, mask 3645 * addv h0, v0.8h 3646 * and v0.8b, pg 3647 * 3648 * However, coming up with an abstraction that allows vector inputs and 3649 * a scalar output, and also handles the byte-ordering of sub-uint64_t 3650 * scalar outputs, is tricky. 
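 * As written, the expander walks backward in 64-byte chunks: each element's
 * boolean result is shifted up so that it lands at the bit index matching
 * the element's byte offset within the chunk (for .S, bits 0, 4, ... 60,
 * i.e. the 0x1111... MASK), then OUT is ANDed with the governing predicate
 * and fed to iter_predtest_bwd to accumulate the NZCV flags.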
3651 */ 3652 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ 3653 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3654 { \ 3655 intptr_t opr_sz = simd_oprsz(desc); \ 3656 uint32_t flags = PREDTEST_INIT; \ 3657 intptr_t i = opr_sz; \ 3658 do { \ 3659 uint64_t out = 0, pg; \ 3660 do { \ 3661 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3662 TYPE nn = *(TYPE *)(vn + H(i)); \ 3663 TYPE mm = *(TYPE *)(vm + H(i)); \ 3664 out |= nn OP mm; \ 3665 } while (i & 63); \ 3666 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3667 out &= pg; \ 3668 *(uint64_t *)(vd + (i >> 3)) = out; \ 3669 flags = iter_predtest_bwd(out, pg, flags); \ 3670 } while (i > 0); \ 3671 return flags; \ 3672 } 3673 3674 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ 3675 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3676 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ 3677 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3678 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ 3679 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3680 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ 3681 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3682 3683 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) 3684 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) 3685 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) 3686 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) 3687 3688 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) 3689 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) 3690 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) 3691 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) 3692 3693 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) 3694 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) 3695 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) 3696 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) 3697 3698 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) 3699 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) 3700 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) 3701 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) 3702 3703 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) 3704 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) 3705 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) 3706 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) 3707 3708 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) 3709 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) 3710 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) 3711 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) 3712 3713 #undef DO_CMP_PPZZ_B 3714 #undef DO_CMP_PPZZ_H 3715 #undef DO_CMP_PPZZ_S 3716 #undef DO_CMP_PPZZ_D 3717 #undef DO_CMP_PPZZ 3718 3719 /* Similar, but the second source is "wide". 
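 * Each 64-bit element of the second source is compared against all of the
 * narrower first-source elements that share its 64-bit lane (eight bytes,
 * four halfwords or two words), which is why only B/H/S forms exist.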
*/ 3720 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ 3721 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3722 { \ 3723 intptr_t opr_sz = simd_oprsz(desc); \ 3724 uint32_t flags = PREDTEST_INIT; \ 3725 intptr_t i = opr_sz; \ 3726 do { \ 3727 uint64_t out = 0, pg; \ 3728 do { \ 3729 TYPEW mm = *(TYPEW *)(vm + i - 8); \ 3730 do { \ 3731 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3732 TYPE nn = *(TYPE *)(vn + H(i)); \ 3733 out |= nn OP mm; \ 3734 } while (i & 7); \ 3735 } while (i & 63); \ 3736 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3737 out &= pg; \ 3738 *(uint64_t *)(vd + (i >> 3)) = out; \ 3739 flags = iter_predtest_bwd(out, pg, flags); \ 3740 } while (i > 0); \ 3741 return flags; \ 3742 } 3743 3744 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ 3745 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) 3746 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ 3747 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) 3748 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ 3749 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) 3750 3751 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==) 3752 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==) 3753 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==) 3754 3755 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=) 3756 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=) 3757 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=) 3758 3759 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) 3760 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) 3761 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) 3762 3763 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) 3764 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) 3765 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) 3766 3767 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) 3768 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) 3769 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) 3770 3771 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) 3772 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) 3773 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) 3774 3775 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) 3776 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) 3777 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) 3778 3779 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) 3780 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) 3781 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) 3782 3783 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) 3784 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) 3785 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) 3786 3787 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) 3788 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) 3789 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) 3790 3791 #undef DO_CMP_PPZW_B 3792 #undef DO_CMP_PPZW_H 3793 #undef DO_CMP_PPZW_S 3794 #undef DO_CMP_PPZW 3795 3796 /* Similar, but the second source is immediate. 
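 * The immediate is carried in the simd_data field of DESC and converted to
 * TYPE below, so the signed forms should see a sign-extended value (e.g.
 * #-1 becomes all-ones at the element width) and the unsigned forms the
 * plain unsigned immediate.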
*/ 3797 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ 3798 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 3799 { \ 3800 intptr_t opr_sz = simd_oprsz(desc); \ 3801 uint32_t flags = PREDTEST_INIT; \ 3802 TYPE mm = simd_data(desc); \ 3803 intptr_t i = opr_sz; \ 3804 do { \ 3805 uint64_t out = 0, pg; \ 3806 do { \ 3807 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3808 TYPE nn = *(TYPE *)(vn + H(i)); \ 3809 out |= nn OP mm; \ 3810 } while (i & 63); \ 3811 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3812 out &= pg; \ 3813 *(uint64_t *)(vd + (i >> 3)) = out; \ 3814 flags = iter_predtest_bwd(out, pg, flags); \ 3815 } while (i > 0); \ 3816 return flags; \ 3817 } 3818 3819 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \ 3820 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3821 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \ 3822 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3823 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \ 3824 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3825 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \ 3826 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3827 3828 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) 3829 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) 3830 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) 3831 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) 3832 3833 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) 3834 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) 3835 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) 3836 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) 3837 3838 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) 3839 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) 3840 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) 3841 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) 3842 3843 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) 3844 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) 3845 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) 3846 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) 3847 3848 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) 3849 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) 3850 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) 3851 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) 3852 3853 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) 3854 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) 3855 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) 3856 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) 3857 3858 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) 3859 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) 3860 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) 3861 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) 3862 3863 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) 3864 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) 3865 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) 3866 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) 3867 3868 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) 3869 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) 3870 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) 3871 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) 3872 3873 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) 3874 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) 3875 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) 3876 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) 3877 3878 #undef DO_CMP_PPZI_B 3879 #undef DO_CMP_PPZI_H 3880 #undef DO_CMP_PPZI_S 3881 #undef DO_CMP_PPZI_D 3882 #undef DO_CMP_PPZI 3883 3884 /* Similar to the ARM LastActive pseudocode function. 
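 * Scan the guard words from the top down: the first non-zero word contains
 * the last active element, pow2floor() isolates its highest set bit, and
 * the result is whether that bit is also set in D.  E.g. for g = 0x11 and
 * d = 0x10, pow2floor(g) = 0x10 and the result is true.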
*/ 3885 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz) 3886 { 3887 intptr_t i; 3888 3889 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) { 3890 uint64_t pg = *(uint64_t *)(vg + i); 3891 if (pg) { 3892 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0; 3893 } 3894 } 3895 return 0; 3896 } 3897 3898 /* Compute a mask into RETB that is true for all G, up to and including 3899 * (if after) or excluding (if !after) the first G & N. 3900 * Return true if BRK found. 3901 */ 3902 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g, 3903 bool brk, bool after) 3904 { 3905 uint64_t b; 3906 3907 if (brk) { 3908 b = 0; 3909 } else if ((g & n) == 0) { 3910 /* For all G, no N are set; break not found. */ 3911 b = g; 3912 } else { 3913 /* Break somewhere in N. Locate it. */ 3914 b = g & n; /* guard true, pred true */ 3915 b = b & -b; /* first such */ 3916 if (after) { 3917 b = b | (b - 1); /* break after same */ 3918 } else { 3919 b = b - 1; /* break before same */ 3920 } 3921 brk = true; 3922 } 3923 3924 *retb = b; 3925 return brk; 3926 } 3927 3928 /* Compute a zeroing BRK. */ 3929 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g, 3930 intptr_t oprsz, bool after) 3931 { 3932 bool brk = false; 3933 intptr_t i; 3934 3935 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3936 uint64_t this_b, this_g = g[i]; 3937 3938 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3939 d[i] = this_b & this_g; 3940 } 3941 } 3942 3943 /* Likewise, but also compute flags. */ 3944 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g, 3945 intptr_t oprsz, bool after) 3946 { 3947 uint32_t flags = PREDTEST_INIT; 3948 bool brk = false; 3949 intptr_t i; 3950 3951 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3952 uint64_t this_b, this_d, this_g = g[i]; 3953 3954 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3955 d[i] = this_d = this_b & this_g; 3956 flags = iter_predtest_fwd(this_d, this_g, flags); 3957 } 3958 return flags; 3959 } 3960 3961 /* Compute a merging BRK. */ 3962 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g, 3963 intptr_t oprsz, bool after) 3964 { 3965 bool brk = false; 3966 intptr_t i; 3967 3968 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3969 uint64_t this_b, this_g = g[i]; 3970 3971 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3972 d[i] = (this_b & this_g) | (d[i] & ~this_g); 3973 } 3974 } 3975 3976 /* Likewise, but also compute flags. */ 3977 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g, 3978 intptr_t oprsz, bool after) 3979 { 3980 uint32_t flags = PREDTEST_INIT; 3981 bool brk = false; 3982 intptr_t i; 3983 3984 for (i = 0; i < oprsz / 8; ++i) { 3985 uint64_t this_b, this_d = d[i], this_g = g[i]; 3986 3987 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3988 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g); 3989 flags = iter_predtest_fwd(this_d, this_g, flags); 3990 } 3991 return flags; 3992 } 3993 3994 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz) 3995 { 3996 /* It is quicker to zero the whole predicate than loop on OPRSZ. 3997 * The compiler should turn this into 4 64-bit integer stores. 
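 * (The predicate register is one bit per byte of the maximum-size vector;
 * with a 256-byte maximum vector that is 32 bytes, hence the 4 stores.)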
3998 */ 3999 memset(d, 0, sizeof(ARMPredicateReg)); 4000 return PREDTEST_INIT; 4001 } 4002 4003 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, 4004 uint32_t pred_desc) 4005 { 4006 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4007 if (last_active_pred(vn, vg, oprsz)) { 4008 compute_brk_z(vd, vm, vg, oprsz, true); 4009 } else { 4010 do_zero(vd, oprsz); 4011 } 4012 } 4013 4014 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, 4015 uint32_t pred_desc) 4016 { 4017 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4018 if (last_active_pred(vn, vg, oprsz)) { 4019 return compute_brks_z(vd, vm, vg, oprsz, true); 4020 } else { 4021 return do_zero(vd, oprsz); 4022 } 4023 } 4024 4025 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, 4026 uint32_t pred_desc) 4027 { 4028 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4029 if (last_active_pred(vn, vg, oprsz)) { 4030 compute_brk_z(vd, vm, vg, oprsz, false); 4031 } else { 4032 do_zero(vd, oprsz); 4033 } 4034 } 4035 4036 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, 4037 uint32_t pred_desc) 4038 { 4039 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4040 if (last_active_pred(vn, vg, oprsz)) { 4041 return compute_brks_z(vd, vm, vg, oprsz, false); 4042 } else { 4043 return do_zero(vd, oprsz); 4044 } 4045 } 4046 4047 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4048 { 4049 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4050 compute_brk_z(vd, vn, vg, oprsz, true); 4051 } 4052 4053 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4054 { 4055 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4056 return compute_brks_z(vd, vn, vg, oprsz, true); 4057 } 4058 4059 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4060 { 4061 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4062 compute_brk_z(vd, vn, vg, oprsz, false); 4063 } 4064 4065 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4066 { 4067 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4068 return compute_brks_z(vd, vn, vg, oprsz, false); 4069 } 4070 4071 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4072 { 4073 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4074 compute_brk_m(vd, vn, vg, oprsz, true); 4075 } 4076 4077 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4078 { 4079 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4080 return compute_brks_m(vd, vn, vg, oprsz, true); 4081 } 4082 4083 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4084 { 4085 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4086 compute_brk_m(vd, vn, vg, oprsz, false); 4087 } 4088 4089 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4090 { 4091 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4092 return compute_brks_m(vd, vn, vg, oprsz, false); 4093 } 4094 4095 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4096 { 4097 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4098 if (!last_active_pred(vn, vg, oprsz)) { 4099 do_zero(vd, oprsz); 4100 } 4101 } 4102 4103 /* As if PredTest(Ones(PL), D, esz). 
*/ 4104 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz, 4105 uint64_t esz_mask) 4106 { 4107 uint32_t flags = PREDTEST_INIT; 4108 intptr_t i; 4109 4110 for (i = 0; i < oprsz / 8; i++) { 4111 flags = iter_predtest_fwd(d->p[i], esz_mask, flags); 4112 } 4113 if (oprsz & 7) { 4114 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); 4115 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags); 4116 } 4117 return flags; 4118 } 4119 4120 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4121 { 4122 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4123 if (last_active_pred(vn, vg, oprsz)) { 4124 return predtest_ones(vd, oprsz, -1); 4125 } else { 4126 return do_zero(vd, oprsz); 4127 } 4128 } 4129 4130 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) 4131 { 4132 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 4133 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4134 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; 4135 intptr_t i; 4136 4137 for (i = 0; i < words; ++i) { 4138 uint64_t t = n[i] & g[i] & mask; 4139 sum += ctpop64(t); 4140 } 4141 return sum; 4142 } 4143 4144 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) 4145 { 4146 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4147 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4148 uint64_t esz_mask = pred_esz_masks[esz]; 4149 ARMPredicateReg *d = vd; 4150 uint32_t flags; 4151 intptr_t i; 4152 4153 /* Begin with a zero predicate register. */ 4154 flags = do_zero(d, oprsz); 4155 if (count == 0) { 4156 return flags; 4157 } 4158 4159 /* Set all of the requested bits. */ 4160 for (i = 0; i < count / 64; ++i) { 4161 d->p[i] = esz_mask; 4162 } 4163 if (count & 63) { 4164 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; 4165 } 4166 4167 return predtest_ones(d, oprsz, esz_mask); 4168 } 4169 4170 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) 4171 { 4172 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4173 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4174 uint64_t esz_mask = pred_esz_masks[esz]; 4175 ARMPredicateReg *d = vd; 4176 intptr_t i, invcount, oprbits; 4177 uint64_t bits; 4178 4179 if (count == 0) { 4180 return do_zero(d, oprsz); 4181 } 4182 4183 oprbits = oprsz * 8; 4184 tcg_debug_assert(count <= oprbits); 4185 4186 bits = esz_mask; 4187 if (oprbits & 63) { 4188 bits &= MAKE_64BIT_MASK(0, oprbits & 63); 4189 } 4190 4191 invcount = oprbits - count; 4192 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) { 4193 d->p[i] = bits; 4194 bits = esz_mask; 4195 } 4196 4197 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64); 4198 4199 while (--i >= 0) { 4200 d->p[i] = 0; 4201 } 4202 4203 return predtest_ones(d, oprsz, esz_mask); 4204 } 4205 4206 /* Recursive reduction on a function; 4207 * C.f. the ARM ARM function ReducePredicated. 4208 * 4209 * While it would be possible to write this without the DATA temporary, 4210 * it is much simpler to process the predicate register this way. 4211 * The recursion is bounded to depth 7 (128 fp16 elements), so there's 4212 * little to gain with a more complex non-recursive form. 
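 * DATA is filled with the active elements; inactive lanes and the tail up
 * to MAXSZ receive IDENT, so they cannot affect the result.  MAXSZ is
 * expected to be the operation size rounded up to a power of two, so each
 * halving divides evenly: e.g. a 48-byte .S operation pads to 64 bytes and
 * reduces as a balanced 16-element tree.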
4213 */ 4214 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \ 4215 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \ 4216 { \ 4217 if (n == 1) { \ 4218 return *data; \ 4219 } else { \ 4220 uintptr_t half = n / 2; \ 4221 TYPE lo = NAME##_reduce(data, status, half); \ 4222 TYPE hi = NAME##_reduce(data + half, status, half); \ 4223 return FUNC(lo, hi, status); \ 4224 } \ 4225 } \ 4226 uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \ 4227 { \ 4228 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ 4229 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ 4230 for (i = 0; i < oprsz; ) { \ 4231 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 4232 do { \ 4233 TYPE nn = *(TYPE *)(vn + H(i)); \ 4234 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \ 4235 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 4236 } while (i & 15); \ 4237 } \ 4238 for (; i < maxsz; i += sizeof(TYPE)) { \ 4239 *(TYPE *)((void *)data + i) = IDENT; \ 4240 } \ 4241 return NAME##_reduce(data, s, maxsz / sizeof(TYPE)); \ 4242 } 4243 4244 DO_REDUCE(sve_faddv_h, float16, H1_2, float16_add, float16_zero) 4245 DO_REDUCE(sve_faddv_s, float32, H1_4, float32_add, float32_zero) 4246 DO_REDUCE(sve_faddv_d, float64, H1_8, float64_add, float64_zero) 4247 4248 /* Identity is floatN_default_nan, without the function call. */ 4249 DO_REDUCE(sve_fminnmv_h, float16, H1_2, float16_minnum, 0x7E00) 4250 DO_REDUCE(sve_fminnmv_s, float32, H1_4, float32_minnum, 0x7FC00000) 4251 DO_REDUCE(sve_fminnmv_d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL) 4252 4253 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, float16_maxnum, 0x7E00) 4254 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, float32_maxnum, 0x7FC00000) 4255 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL) 4256 4257 DO_REDUCE(sve_fminv_h, float16, H1_2, float16_min, float16_infinity) 4258 DO_REDUCE(sve_fminv_s, float32, H1_4, float32_min, float32_infinity) 4259 DO_REDUCE(sve_fminv_d, float64, H1_8, float64_min, float64_infinity) 4260 4261 DO_REDUCE(sve_fmaxv_h, float16, H1_2, float16_max, float16_chs(float16_infinity)) 4262 DO_REDUCE(sve_fmaxv_s, float32, H1_4, float32_max, float32_chs(float32_infinity)) 4263 DO_REDUCE(sve_fmaxv_d, float64, H1_8, float64_max, float64_chs(float64_infinity)) 4264 4265 DO_REDUCE(sve_ah_fminv_h, float16, H1_2, helper_vfp_ah_minh, float16_infinity) 4266 DO_REDUCE(sve_ah_fminv_s, float32, H1_4, helper_vfp_ah_mins, float32_infinity) 4267 DO_REDUCE(sve_ah_fminv_d, float64, H1_8, helper_vfp_ah_mind, float64_infinity) 4268 4269 DO_REDUCE(sve_ah_fmaxv_h, float16, H1_2, helper_vfp_ah_maxh, 4270 float16_chs(float16_infinity)) 4271 DO_REDUCE(sve_ah_fmaxv_s, float32, H1_4, helper_vfp_ah_maxs, 4272 float32_chs(float32_infinity)) 4273 DO_REDUCE(sve_ah_fmaxv_d, float64, H1_8, helper_vfp_ah_maxd, 4274 float64_chs(float64_infinity)) 4275 4276 #undef DO_REDUCE 4277 4278 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg, 4279 float_status *status, uint32_t desc) 4280 { 4281 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4282 float16 result = nn; 4283 4284 do { 4285 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4286 do { 4287 if (pg & 1) { 4288 float16 mm = *(float16 *)(vm + H1_2(i)); 4289 result = float16_add(result, mm, status); 4290 } 4291 i += sizeof(float16), pg >>= sizeof(float16); 4292 } while (i & 15); 4293 } while (i < opr_sz); 4294 4295 return result; 4296 } 4297 4298 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg, 4299 float_status *status, uint32_t desc) 4300 { 4301 
intptr_t i = 0, opr_sz = simd_oprsz(desc); 4302 float32 result = nn; 4303 4304 do { 4305 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4306 do { 4307 if (pg & 1) { 4308 float32 mm = *(float32 *)(vm + H1_2(i)); 4309 result = float32_add(result, mm, status); 4310 } 4311 i += sizeof(float32), pg >>= sizeof(float32); 4312 } while (i & 15); 4313 } while (i < opr_sz); 4314 4315 return result; 4316 } 4317 4318 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg, 4319 float_status *status, uint32_t desc) 4320 { 4321 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8; 4322 uint64_t *m = vm; 4323 uint8_t *pg = vg; 4324 4325 for (i = 0; i < opr_sz; i++) { 4326 if (pg[H1(i)] & 1) { 4327 nn = float64_add(nn, m[i], status); 4328 } 4329 } 4330 4331 return nn; 4332 } 4333 4334 /* Fully general three-operand expander, controlled by a predicate, 4335 * With the extra float_status parameter. 4336 */ 4337 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \ 4338 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4339 float_status *status, uint32_t desc) \ 4340 { \ 4341 intptr_t i = simd_oprsz(desc); \ 4342 uint64_t *g = vg; \ 4343 do { \ 4344 uint64_t pg = g[(i - 1) >> 6]; \ 4345 do { \ 4346 i -= sizeof(TYPE); \ 4347 if (likely((pg >> (i & 63)) & 1)) { \ 4348 TYPE nn = *(TYPE *)(vn + H(i)); \ 4349 TYPE mm = *(TYPE *)(vm + H(i)); \ 4350 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4351 } \ 4352 } while (i & 63); \ 4353 } while (i != 0); \ 4354 } 4355 4356 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) 4357 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) 4358 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) 4359 4360 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) 4361 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) 4362 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) 4363 4364 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) 4365 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) 4366 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) 4367 4368 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) 4369 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) 4370 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) 4371 4372 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) 4373 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) 4374 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) 4375 4376 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) 4377 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) 4378 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) 4379 4380 DO_ZPZZ_FP(sve_ah_fmin_h, uint16_t, H1_2, helper_vfp_ah_minh) 4381 DO_ZPZZ_FP(sve_ah_fmin_s, uint32_t, H1_4, helper_vfp_ah_mins) 4382 DO_ZPZZ_FP(sve_ah_fmin_d, uint64_t, H1_8, helper_vfp_ah_mind) 4383 4384 DO_ZPZZ_FP(sve_ah_fmax_h, uint16_t, H1_2, helper_vfp_ah_maxh) 4385 DO_ZPZZ_FP(sve_ah_fmax_s, uint32_t, H1_4, helper_vfp_ah_maxs) 4386 DO_ZPZZ_FP(sve_ah_fmax_d, uint64_t, H1_8, helper_vfp_ah_maxd) 4387 4388 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) 4389 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) 4390 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) 4391 4392 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) 4393 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) 4394 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) 4395 4396 static inline float16 abd_h(float16 a, float16 b, float_status *s) 4397 { 4398 return float16_abs(float16_sub(a, b, s)); 4399 } 4400 4401 static inline float32 abd_s(float32 a, float32 b, float_status *s) 4402 { 4403 return 
float32_abs(float32_sub(a, b, s)); 4404 } 4405 4406 static inline float64 abd_d(float64 a, float64 b, float_status *s) 4407 { 4408 return float64_abs(float64_sub(a, b, s)); 4409 } 4410 4411 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */ 4412 static float16 ah_abd_h(float16 op1, float16 op2, float_status *stat) 4413 { 4414 float16 r = float16_sub(op1, op2, stat); 4415 return float16_is_any_nan(r) ? r : float16_abs(r); 4416 } 4417 4418 static float32 ah_abd_s(float32 op1, float32 op2, float_status *stat) 4419 { 4420 float32 r = float32_sub(op1, op2, stat); 4421 return float32_is_any_nan(r) ? r : float32_abs(r); 4422 } 4423 4424 static float64 ah_abd_d(float64 op1, float64 op2, float_status *stat) 4425 { 4426 float64 r = float64_sub(op1, op2, stat); 4427 return float64_is_any_nan(r) ? r : float64_abs(r); 4428 } 4429 4430 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) 4431 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) 4432 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) 4433 DO_ZPZZ_FP(sve_ah_fabd_h, uint16_t, H1_2, ah_abd_h) 4434 DO_ZPZZ_FP(sve_ah_fabd_s, uint32_t, H1_4, ah_abd_s) 4435 DO_ZPZZ_FP(sve_ah_fabd_d, uint64_t, H1_8, ah_abd_d) 4436 4437 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) 4438 { 4439 int b_int = MIN(MAX(b, INT_MIN), INT_MAX); 4440 return float64_scalbn(a, b_int, s); 4441 } 4442 4443 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) 4444 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) 4445 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d) 4446 4447 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) 4448 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) 4449 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd) 4450 4451 #undef DO_ZPZZ_FP 4452 4453 /* Three-operand expander, with one scalar operand, controlled by 4454 * a predicate, with the extra float_status parameter. 
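 * The scalar operand arrives as a uint64_t and is narrowed to TYPE below,
 * so e.g. the fp16 forms use only the low 16 bits of SCALAR.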
4455 */ 4456 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \ 4457 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ 4458 float_status *status, uint32_t desc) \ 4459 { \ 4460 intptr_t i = simd_oprsz(desc); \ 4461 uint64_t *g = vg; \ 4462 TYPE mm = scalar; \ 4463 do { \ 4464 uint64_t pg = g[(i - 1) >> 6]; \ 4465 do { \ 4466 i -= sizeof(TYPE); \ 4467 if (likely((pg >> (i & 63)) & 1)) { \ 4468 TYPE nn = *(TYPE *)(vn + H(i)); \ 4469 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4470 } \ 4471 } while (i & 63); \ 4472 } while (i != 0); \ 4473 } 4474 4475 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) 4476 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) 4477 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add) 4478 4479 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) 4480 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) 4481 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub) 4482 4483 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) 4484 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) 4485 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul) 4486 4487 static inline float16 subr_h(float16 a, float16 b, float_status *s) 4488 { 4489 return float16_sub(b, a, s); 4490 } 4491 4492 static inline float32 subr_s(float32 a, float32 b, float_status *s) 4493 { 4494 return float32_sub(b, a, s); 4495 } 4496 4497 static inline float64 subr_d(float64 a, float64 b, float_status *s) 4498 { 4499 return float64_sub(b, a, s); 4500 } 4501 4502 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) 4503 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) 4504 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d) 4505 4506 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) 4507 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) 4508 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum) 4509 4510 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) 4511 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) 4512 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum) 4513 4514 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) 4515 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) 4516 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max) 4517 4518 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) 4519 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) 4520 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) 4521 4522 DO_ZPZS_FP(sve_ah_fmaxs_h, float16, H1_2, helper_vfp_ah_maxh) 4523 DO_ZPZS_FP(sve_ah_fmaxs_s, float32, H1_4, helper_vfp_ah_maxs) 4524 DO_ZPZS_FP(sve_ah_fmaxs_d, float64, H1_8, helper_vfp_ah_maxd) 4525 4526 DO_ZPZS_FP(sve_ah_fmins_h, float16, H1_2, helper_vfp_ah_minh) 4527 DO_ZPZS_FP(sve_ah_fmins_s, float32, H1_4, helper_vfp_ah_mins) 4528 DO_ZPZS_FP(sve_ah_fmins_d, float64, H1_8, helper_vfp_ah_mind) 4529 4530 /* Fully general two-operand expander, controlled by a predicate, 4531 * With the extra float_status parameter. 4532 */ 4533 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \ 4534 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 4535 float_status *status, uint32_t desc) \ 4536 { \ 4537 intptr_t i = simd_oprsz(desc); \ 4538 uint64_t *g = vg; \ 4539 do { \ 4540 uint64_t pg = g[(i - 1) >> 6]; \ 4541 do { \ 4542 i -= sizeof(TYPE); \ 4543 if (likely((pg >> (i & 63)) & 1)) { \ 4544 TYPE nn = *(TYPE *)(vn + H(i)); \ 4545 *(TYPE *)(vd + H(i)) = OP(nn, status); \ 4546 } \ 4547 } while (i & 63); \ 4548 } while (i != 0); \ 4549 } 4550 4551 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore 4552 * FZ16. 
When converting from fp16, this affects flushing input denormals; 4553 * when converting to fp16, this affects flushing output denormals. 4554 */ 4555 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst) 4556 { 4557 bool save = get_flush_inputs_to_zero(fpst); 4558 float32 ret; 4559 4560 set_flush_inputs_to_zero(false, fpst); 4561 ret = float16_to_float32(f, true, fpst); 4562 set_flush_inputs_to_zero(save, fpst); 4563 return ret; 4564 } 4565 4566 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) 4567 { 4568 bool save = get_flush_inputs_to_zero(fpst); 4569 float64 ret; 4570 4571 set_flush_inputs_to_zero(false, fpst); 4572 ret = float16_to_float64(f, true, fpst); 4573 set_flush_inputs_to_zero(save, fpst); 4574 return ret; 4575 } 4576 4577 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst) 4578 { 4579 bool save = get_flush_to_zero(fpst); 4580 float16 ret; 4581 4582 set_flush_to_zero(false, fpst); 4583 ret = float32_to_float16(f, true, fpst); 4584 set_flush_to_zero(save, fpst); 4585 return ret; 4586 } 4587 4588 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst) 4589 { 4590 bool save = get_flush_to_zero(fpst); 4591 float16 ret; 4592 4593 set_flush_to_zero(false, fpst); 4594 ret = float64_to_float16(f, true, fpst); 4595 set_flush_to_zero(save, fpst); 4596 return ret; 4597 } 4598 4599 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s) 4600 { 4601 if (float16_is_any_nan(f)) { 4602 float_raise(float_flag_invalid, s); 4603 return 0; 4604 } 4605 return float16_to_int16_round_to_zero(f, s); 4606 } 4607 4608 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s) 4609 { 4610 if (float16_is_any_nan(f)) { 4611 float_raise(float_flag_invalid, s); 4612 return 0; 4613 } 4614 return float16_to_int64_round_to_zero(f, s); 4615 } 4616 4617 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s) 4618 { 4619 if (float32_is_any_nan(f)) { 4620 float_raise(float_flag_invalid, s); 4621 return 0; 4622 } 4623 return float32_to_int64_round_to_zero(f, s); 4624 } 4625 4626 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s) 4627 { 4628 if (float64_is_any_nan(f)) { 4629 float_raise(float_flag_invalid, s); 4630 return 0; 4631 } 4632 return float64_to_int64_round_to_zero(f, s); 4633 } 4634 4635 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s) 4636 { 4637 if (float16_is_any_nan(f)) { 4638 float_raise(float_flag_invalid, s); 4639 return 0; 4640 } 4641 return float16_to_uint16_round_to_zero(f, s); 4642 } 4643 4644 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s) 4645 { 4646 if (float16_is_any_nan(f)) { 4647 float_raise(float_flag_invalid, s); 4648 return 0; 4649 } 4650 return float16_to_uint64_round_to_zero(f, s); 4651 } 4652 4653 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s) 4654 { 4655 if (float32_is_any_nan(f)) { 4656 float_raise(float_flag_invalid, s); 4657 return 0; 4658 } 4659 return float32_to_uint64_round_to_zero(f, s); 4660 } 4661 4662 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) 4663 { 4664 if (float64_is_any_nan(f)) { 4665 float_raise(float_flag_invalid, s); 4666 return 0; 4667 } 4668 return float64_to_uint64_round_to_zero(f, s); 4669 } 4670 4671 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) 4672 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) 4673 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16) 4674 DO_ZPZ_FP(sve_fcvt_dh, 
uint64_t, H1_8, sve_f64_to_f16) 4675 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64) 4676 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32) 4677 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64) 4678 4679 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) 4680 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) 4681 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) 4682 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz) 4683 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz) 4684 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd) 4685 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz) 4686 4687 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) 4688 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) 4689 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) 4690 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz) 4691 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz) 4692 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd) 4693 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz) 4694 4695 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) 4696 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) 4697 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd) 4698 4699 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) 4700 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) 4701 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int) 4702 4703 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) 4704 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) 4705 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64) 4706 4707 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) 4708 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) 4709 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt) 4710 4711 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) 4712 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) 4713 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) 4714 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64) 4715 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16) 4716 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32) 4717 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64) 4718 4719 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) 4720 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) 4721 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) 4722 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64) 4723 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16) 4724 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32) 4725 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64) 4726 4727 static int16_t do_float16_logb_as_int(float16 a, float_status *s) 4728 { 4729 /* Extract frac to the top of the uint32_t. 
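 * (fp16 keeps its 10 fraction bits in [9:0]; shifting left by 16 + 6 puts
 * bit 9 at bit 31, so clz32(frac) counts the leading fractional zeros of a
 * denormal.  For example the smallest denormal, 0x0001, gives clz32 = 9
 * and a result of -15 - 9 = -24, i.e. logb(2**-24).)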
*/ 4730 uint32_t frac = (uint32_t)a << (16 + 6); 4731 int16_t exp = extract32(a, 10, 5); 4732 4733 if (unlikely(exp == 0)) { 4734 if (frac != 0) { 4735 if (!get_flush_inputs_to_zero(s)) { 4736 /* denormal: bias - fractional_zeros */ 4737 return -15 - clz32(frac); 4738 } 4739 /* flush to zero */ 4740 float_raise(float_flag_input_denormal_flushed, s); 4741 } 4742 } else if (unlikely(exp == 0x1f)) { 4743 if (frac == 0) { 4744 return INT16_MAX; /* infinity */ 4745 } 4746 } else { 4747 /* normal: exp - bias */ 4748 return exp - 15; 4749 } 4750 /* nan or zero */ 4751 float_raise(float_flag_invalid, s); 4752 return INT16_MIN; 4753 } 4754 4755 static int32_t do_float32_logb_as_int(float32 a, float_status *s) 4756 { 4757 /* Extract frac to the top of the uint32_t. */ 4758 uint32_t frac = a << 9; 4759 int32_t exp = extract32(a, 23, 8); 4760 4761 if (unlikely(exp == 0)) { 4762 if (frac != 0) { 4763 if (!get_flush_inputs_to_zero(s)) { 4764 /* denormal: bias - fractional_zeros */ 4765 return -127 - clz32(frac); 4766 } 4767 /* flush to zero */ 4768 float_raise(float_flag_input_denormal_flushed, s); 4769 } 4770 } else if (unlikely(exp == 0xff)) { 4771 if (frac == 0) { 4772 return INT32_MAX; /* infinity */ 4773 } 4774 } else { 4775 /* normal: exp - bias */ 4776 return exp - 127; 4777 } 4778 /* nan or zero */ 4779 float_raise(float_flag_invalid, s); 4780 return INT32_MIN; 4781 } 4782 4783 static int64_t do_float64_logb_as_int(float64 a, float_status *s) 4784 { 4785 /* Extract frac to the top of the uint64_t. */ 4786 uint64_t frac = a << 12; 4787 int64_t exp = extract64(a, 52, 11); 4788 4789 if (unlikely(exp == 0)) { 4790 if (frac != 0) { 4791 if (!get_flush_inputs_to_zero(s)) { 4792 /* denormal: bias - fractional_zeros */ 4793 return -1023 - clz64(frac); 4794 } 4795 /* flush to zero */ 4796 float_raise(float_flag_input_denormal_flushed, s); 4797 } 4798 } else if (unlikely(exp == 0x7ff)) { 4799 if (frac == 0) { 4800 return INT64_MAX; /* infinity */ 4801 } 4802 } else { 4803 /* normal: exp - bias */ 4804 return exp - 1023; 4805 } 4806 /* nan or zero */ 4807 float_raise(float_flag_invalid, s); 4808 return INT64_MIN; 4809 } 4810 4811 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int) 4812 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int) 4813 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) 4814 4815 #undef DO_ZPZ_FP 4816 4817 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, 4818 float_status *status, uint32_t desc, 4819 uint16_t neg1, uint16_t neg3, int flags) 4820 { 4821 intptr_t i = simd_oprsz(desc); 4822 uint64_t *g = vg; 4823 4824 do { 4825 uint64_t pg = g[(i - 1) >> 6]; 4826 do { 4827 i -= 2; 4828 if (likely((pg >> (i & 63)) & 1)) { 4829 float16 e1, e2, e3, r; 4830 4831 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; 4832 e2 = *(uint16_t *)(vm + H1_2(i)); 4833 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; 4834 r = float16_muladd(e1, e2, e3, flags, status); 4835 *(uint16_t *)(vd + H1_2(i)) = r; 4836 } 4837 } while (i & 63); 4838 } while (i != 0); 4839 } 4840 4841 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4842 void *vg, float_status *status, uint32_t desc) 4843 { 4844 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 4845 } 4846 4847 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4848 void *vg, float_status *status, uint32_t desc) 4849 { 4850 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0); 4851 } 4852 4853 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4854 void 
*vg, float_status *status, uint32_t desc) 4855 { 4856 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0); 4857 } 4858 4859 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4860 void *vg, float_status *status, uint32_t desc) 4861 { 4862 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0); 4863 } 4864 4865 void HELPER(sve_ah_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4866 void *vg, float_status *status, uint32_t desc) 4867 { 4868 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 4869 float_muladd_negate_product); 4870 } 4871 4872 void HELPER(sve_ah_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4873 void *vg, float_status *status, uint32_t desc) 4874 { 4875 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 4876 float_muladd_negate_product | float_muladd_negate_c); 4877 } 4878 4879 void HELPER(sve_ah_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4880 void *vg, float_status *status, uint32_t desc) 4881 { 4882 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 4883 float_muladd_negate_c); 4884 } 4885 4886 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, 4887 float_status *status, uint32_t desc, 4888 uint32_t neg1, uint32_t neg3, int flags) 4889 { 4890 intptr_t i = simd_oprsz(desc); 4891 uint64_t *g = vg; 4892 4893 do { 4894 uint64_t pg = g[(i - 1) >> 6]; 4895 do { 4896 i -= 4; 4897 if (likely((pg >> (i & 63)) & 1)) { 4898 float32 e1, e2, e3, r; 4899 4900 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1; 4901 e2 = *(uint32_t *)(vm + H1_4(i)); 4902 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; 4903 r = float32_muladd(e1, e2, e3, flags, status); 4904 *(uint32_t *)(vd + H1_4(i)) = r; 4905 } 4906 } while (i & 63); 4907 } while (i != 0); 4908 } 4909 4910 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4911 void *vg, float_status *status, uint32_t desc) 4912 { 4913 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 4914 } 4915 4916 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4917 void *vg, float_status *status, uint32_t desc) 4918 { 4919 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0, 0); 4920 } 4921 4922 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4923 void *vg, float_status *status, uint32_t desc) 4924 { 4925 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000, 0); 4926 } 4927 4928 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4929 void *vg, float_status *status, uint32_t desc) 4930 { 4931 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000, 0); 4932 } 4933 4934 void HELPER(sve_ah_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4935 void *vg, float_status *status, uint32_t desc) 4936 { 4937 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 4938 float_muladd_negate_product); 4939 } 4940 4941 void HELPER(sve_ah_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4942 void *vg, float_status *status, uint32_t desc) 4943 { 4944 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 4945 float_muladd_negate_product | float_muladd_negate_c); 4946 } 4947 4948 void HELPER(sve_ah_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4949 void *vg, float_status *status, uint32_t desc) 4950 { 4951 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 4952 float_muladd_negate_c); 4953 } 4954 4955 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, 4956 float_status *status, uint32_t desc, 4957 uint64_t 
neg1, uint64_t neg3, int flags) 4958 { 4959 intptr_t i = simd_oprsz(desc); 4960 uint64_t *g = vg; 4961 4962 do { 4963 uint64_t pg = g[(i - 1) >> 6]; 4964 do { 4965 i -= 8; 4966 if (likely((pg >> (i & 63)) & 1)) { 4967 float64 e1, e2, e3, r; 4968 4969 e1 = *(uint64_t *)(vn + i) ^ neg1; 4970 e2 = *(uint64_t *)(vm + i); 4971 e3 = *(uint64_t *)(va + i) ^ neg3; 4972 r = float64_muladd(e1, e2, e3, flags, status); 4973 *(uint64_t *)(vd + i) = r; 4974 } 4975 } while (i & 63); 4976 } while (i != 0); 4977 } 4978 4979 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4980 void *vg, float_status *status, uint32_t desc) 4981 { 4982 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 4983 } 4984 4985 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4986 void *vg, float_status *status, uint32_t desc) 4987 { 4988 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0, 0); 4989 } 4990 4991 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4992 void *vg, float_status *status, uint32_t desc) 4993 { 4994 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN, 0); 4995 } 4996 4997 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4998 void *vg, float_status *status, uint32_t desc) 4999 { 5000 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN, 0); 5001 } 5002 5003 void HELPER(sve_ah_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5004 void *vg, float_status *status, uint32_t desc) 5005 { 5006 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5007 float_muladd_negate_product); 5008 } 5009 5010 void HELPER(sve_ah_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5011 void *vg, float_status *status, uint32_t desc) 5012 { 5013 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5014 float_muladd_negate_product | float_muladd_negate_c); 5015 } 5016 5017 void HELPER(sve_ah_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5018 void *vg, float_status *status, uint32_t desc) 5019 { 5020 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5021 float_muladd_negate_c); 5022 } 5023 5024 /* Two operand floating-point comparison controlled by a predicate. 5025 * Unlike the integer version, we are not allowed to optimistically 5026 * compare operands, since the comparison may have side effects wrt 5027 * the FPSR. 
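 * Hence the ordered comparisons (GE, GT, LE, LT and the absolute-value
 * forms) use the signalling compare, which raises Invalid for any NaN
 * operand, while EQ, NE and UO use the quiet compare.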
5028 */ 5029 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \ 5030 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 5031 float_status *status, uint32_t desc) \ 5032 { \ 5033 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 5034 uint64_t *d = vd, *g = vg; \ 5035 do { \ 5036 uint64_t out = 0, pg = g[j]; \ 5037 do { \ 5038 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 5039 if (likely((pg >> (i & 63)) & 1)) { \ 5040 TYPE nn = *(TYPE *)(vn + H(i)); \ 5041 TYPE mm = *(TYPE *)(vm + H(i)); \ 5042 out |= OP(TYPE, nn, mm, status); \ 5043 } \ 5044 } while (i & 63); \ 5045 d[j--] = out; \ 5046 } while (i > 0); \ 5047 } 5048 5049 #define DO_FPCMP_PPZZ_H(NAME, OP) \ 5050 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP) 5051 #define DO_FPCMP_PPZZ_S(NAME, OP) \ 5052 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) 5053 #define DO_FPCMP_PPZZ_D(NAME, OP) \ 5054 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP) 5055 5056 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \ 5057 DO_FPCMP_PPZZ_H(NAME, OP) \ 5058 DO_FPCMP_PPZZ_S(NAME, OP) \ 5059 DO_FPCMP_PPZZ_D(NAME, OP) 5060 5061 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0 5062 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0 5063 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0 5064 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0 5065 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0 5066 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0 5067 #define DO_FCMUO(TYPE, X, Y, ST) \ 5068 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered 5069 #define DO_FACGE(TYPE, X, Y, ST) \ 5070 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0 5071 #define DO_FACGT(TYPE, X, Y, ST) \ 5072 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0 5073 5074 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) 5075 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT) 5076 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ) 5077 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE) 5078 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO) 5079 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE) 5080 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT) 5081 5082 #undef DO_FPCMP_PPZZ_ALL 5083 #undef DO_FPCMP_PPZZ_D 5084 #undef DO_FPCMP_PPZZ_S 5085 #undef DO_FPCMP_PPZZ_H 5086 #undef DO_FPCMP_PPZZ 5087 5088 /* One operand floating-point comparison against zero, controlled 5089 * by a predicate. 
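 * The zero operand is the literal 0, which is the +0.0 encoding for all
 * three formats, so each element is compared against positive zero using
 * the same DO_FCM* macros as above.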
5090 */ 5091 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \ 5092 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 5093 float_status *status, uint32_t desc) \ 5094 { \ 5095 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 5096 uint64_t *d = vd, *g = vg; \ 5097 do { \ 5098 uint64_t out = 0, pg = g[j]; \ 5099 do { \ 5100 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 5101 if ((pg >> (i & 63)) & 1) { \ 5102 TYPE nn = *(TYPE *)(vn + H(i)); \ 5103 out |= OP(TYPE, nn, 0, status); \ 5104 } \ 5105 } while (i & 63); \ 5106 d[j--] = out; \ 5107 } while (i > 0); \ 5108 } 5109 5110 #define DO_FPCMP_PPZ0_H(NAME, OP) \ 5111 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP) 5112 #define DO_FPCMP_PPZ0_S(NAME, OP) \ 5113 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) 5114 #define DO_FPCMP_PPZ0_D(NAME, OP) \ 5115 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP) 5116 5117 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \ 5118 DO_FPCMP_PPZ0_H(NAME, OP) \ 5119 DO_FPCMP_PPZ0_S(NAME, OP) \ 5120 DO_FPCMP_PPZ0_D(NAME, OP) 5121 5122 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE) 5123 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT) 5124 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE) 5125 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT) 5126 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ) 5127 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE) 5128 5129 /* FP Trig Multiply-Add. */ 5130 5131 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, 5132 float_status *s, uint32_t desc) 5133 { 5134 static const float16 coeff[16] = { 5135 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 5136 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 5137 }; 5138 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); 5139 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5140 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5141 float16 *d = vd, *n = vn, *m = vm; 5142 5143 for (i = 0; i < opr_sz; i++) { 5144 float16 mm = m[i]; 5145 intptr_t xx = x; 5146 int flags = 0; 5147 5148 if (float16_is_neg(mm)) { 5149 if (fpcr_ah) { 5150 flags = float_muladd_negate_product; 5151 } else { 5152 mm = float16_abs(mm); 5153 } 5154 xx += 8; 5155 } 5156 d[i] = float16_muladd(n[i], mm, coeff[xx], flags, s); 5157 } 5158 } 5159 5160 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, 5161 float_status *s, uint32_t desc) 5162 { 5163 static const float32 coeff[16] = { 5164 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9, 5165 0x36369d6d, 0x00000000, 0x00000000, 0x00000000, 5166 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705, 5167 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, 5168 }; 5169 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); 5170 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5171 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5172 float32 *d = vd, *n = vn, *m = vm; 5173 5174 for (i = 0; i < opr_sz; i++) { 5175 float32 mm = m[i]; 5176 intptr_t xx = x; 5177 int flags = 0; 5178 5179 if (float32_is_neg(mm)) { 5180 if (fpcr_ah) { 5181 flags = float_muladd_negate_product; 5182 } else { 5183 mm = float32_abs(mm); 5184 } 5185 xx += 8; 5186 } 5187 d[i] = float32_muladd(n[i], mm, coeff[xx], flags, s); 5188 } 5189 } 5190 5191 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, 5192 float_status *s, uint32_t desc) 5193 { 5194 static const float64 coeff[16] = { 5195 0x3ff0000000000000ull, 0xbfc5555555555543ull, 5196 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull, 5197 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull, 5198 0x3de5d8408868552full, 0x0000000000000000ull, 5199 0x3ff0000000000000ull, 0xbfe0000000000000ull, 5200 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull, 5201 
0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull, 5202 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, 5203 }; 5204 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); 5205 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5206 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5207 float64 *d = vd, *n = vn, *m = vm; 5208 5209 for (i = 0; i < opr_sz; i++) { 5210 float64 mm = m[i]; 5211 intptr_t xx = x; 5212 int flags = 0; 5213 5214 if (float64_is_neg(mm)) { 5215 if (fpcr_ah) { 5216 flags = float_muladd_negate_product; 5217 } else { 5218 mm = float64_abs(mm); 5219 } 5220 xx += 8; 5221 } 5222 d[i] = float64_muladd(n[i], mm, coeff[xx], flags, s); 5223 } 5224 } 5225 5226 /* 5227 * FP Complex Add 5228 */ 5229 5230 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, 5231 float_status *s, uint32_t desc) 5232 { 5233 intptr_t j, i = simd_oprsz(desc); 5234 uint64_t *g = vg; 5235 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5236 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5237 5238 do { 5239 uint64_t pg = g[(i - 1) >> 6]; 5240 do { 5241 float16 e0, e1, e2, e3; 5242 5243 /* I holds the real index; J holds the imag index. */ 5244 j = i - sizeof(float16); 5245 i -= 2 * sizeof(float16); 5246 5247 e0 = *(float16 *)(vn + H1_2(i)); 5248 e1 = *(float16 *)(vm + H1_2(j)); 5249 e2 = *(float16 *)(vn + H1_2(j)); 5250 e3 = *(float16 *)(vm + H1_2(i)); 5251 5252 if (rot) { 5253 e3 = float16_maybe_ah_chs(e3, fpcr_ah); 5254 } else { 5255 e1 = float16_maybe_ah_chs(e1, fpcr_ah); 5256 } 5257 5258 if (likely((pg >> (i & 63)) & 1)) { 5259 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s); 5260 } 5261 if (likely((pg >> (j & 63)) & 1)) { 5262 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s); 5263 } 5264 } while (i & 63); 5265 } while (i != 0); 5266 } 5267 5268 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, 5269 float_status *s, uint32_t desc) 5270 { 5271 intptr_t j, i = simd_oprsz(desc); 5272 uint64_t *g = vg; 5273 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5274 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5275 5276 do { 5277 uint64_t pg = g[(i - 1) >> 6]; 5278 do { 5279 float32 e0, e1, e2, e3; 5280 5281 /* I holds the real index; J holds the imag index. */ 5282 j = i - sizeof(float32); 5283 i -= 2 * sizeof(float32); 5284 5285 e0 = *(float32 *)(vn + H1_2(i)); 5286 e1 = *(float32 *)(vm + H1_2(j)); 5287 e2 = *(float32 *)(vn + H1_2(j)); 5288 e3 = *(float32 *)(vm + H1_2(i)); 5289 5290 if (rot) { 5291 e3 = float32_maybe_ah_chs(e3, fpcr_ah); 5292 } else { 5293 e1 = float32_maybe_ah_chs(e1, fpcr_ah); 5294 } 5295 5296 if (likely((pg >> (i & 63)) & 1)) { 5297 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, s); 5298 } 5299 if (likely((pg >> (j & 63)) & 1)) { 5300 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, s); 5301 } 5302 } while (i & 63); 5303 } while (i != 0); 5304 } 5305 5306 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, 5307 float_status *s, uint32_t desc) 5308 { 5309 intptr_t j, i = simd_oprsz(desc); 5310 uint64_t *g = vg; 5311 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5312 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5313 5314 do { 5315 uint64_t pg = g[(i - 1) >> 6]; 5316 do { 5317 float64 e0, e1, e2, e3; 5318 5319 /* I holds the real index; J holds the imag index. 
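             * As in the _h and _s variants above, rot selects which half
             * of the second operand is negated before the add: rot=0
             * negates the imaginary part (the 90 degree rotation) and
             * rot=1 negates the real part (the 270 degree rotation).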
*/ 5320 j = i - sizeof(float64); 5321 i -= 2 * sizeof(float64); 5322 5323 e0 = *(float64 *)(vn + H1_2(i)); 5324 e1 = *(float64 *)(vm + H1_2(j)); 5325 e2 = *(float64 *)(vn + H1_2(j)); 5326 e3 = *(float64 *)(vm + H1_2(i)); 5327 5328 if (rot) { 5329 e3 = float64_maybe_ah_chs(e3, fpcr_ah); 5330 } else { 5331 e1 = float64_maybe_ah_chs(e1, fpcr_ah); 5332 } 5333 5334 if (likely((pg >> (i & 63)) & 1)) { 5335 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, s); 5336 } 5337 if (likely((pg >> (j & 63)) & 1)) { 5338 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, s); 5339 } 5340 } while (i & 63); 5341 } while (i != 0); 5342 } 5343 5344 /* 5345 * FP Complex Multiply 5346 */ 5347 5348 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5349 void *vg, float_status *status, uint32_t desc) 5350 { 5351 intptr_t j, i = simd_oprsz(desc); 5352 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5353 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5354 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5355 uint32_t negf_real = flip ^ negf_imag; 5356 float16 negx_imag, negx_real; 5357 uint64_t *g = vg; 5358 5359 /* With AH=0, use negx; with AH=1 use negf. */ 5360 negx_real = (negf_real & ~fpcr_ah) << 15; 5361 negx_imag = (negf_imag & ~fpcr_ah) << 15; 5362 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5363 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5364 5365 do { 5366 uint64_t pg = g[(i - 1) >> 6]; 5367 do { 5368 float16 e1, e2, e3, e4, nr, ni, mr, mi, d; 5369 5370 /* I holds the real index; J holds the imag index. */ 5371 j = i - sizeof(float16); 5372 i -= 2 * sizeof(float16); 5373 5374 nr = *(float16 *)(vn + H1_2(i)); 5375 ni = *(float16 *)(vn + H1_2(j)); 5376 mr = *(float16 *)(vm + H1_2(i)); 5377 mi = *(float16 *)(vm + H1_2(j)); 5378 5379 e2 = (flip ? ni : nr); 5380 e1 = (flip ? mi : mr) ^ negx_real; 5381 e4 = e2; 5382 e3 = (flip ? mr : mi) ^ negx_imag; 5383 5384 if (likely((pg >> (i & 63)) & 1)) { 5385 d = *(float16 *)(va + H1_2(i)); 5386 d = float16_muladd(e2, e1, d, negf_real, status); 5387 *(float16 *)(vd + H1_2(i)) = d; 5388 } 5389 if (likely((pg >> (j & 63)) & 1)) { 5390 d = *(float16 *)(va + H1_2(j)); 5391 d = float16_muladd(e4, e3, d, negf_imag, status); 5392 *(float16 *)(vd + H1_2(j)) = d; 5393 } 5394 } while (i & 63); 5395 } while (i != 0); 5396 } 5397 5398 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5399 void *vg, float_status *status, uint32_t desc) 5400 { 5401 intptr_t j, i = simd_oprsz(desc); 5402 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5403 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5404 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5405 uint32_t negf_real = flip ^ negf_imag; 5406 float32 negx_imag, negx_real; 5407 uint64_t *g = vg; 5408 5409 /* With AH=0, use negx; with AH=1 use negf. */ 5410 negx_real = (negf_real & ~fpcr_ah) << 31; 5411 negx_imag = (negf_imag & ~fpcr_ah) << 31; 5412 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5413 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5414 5415 do { 5416 uint64_t pg = g[(i - 1) >> 6]; 5417 do { 5418 float32 e1, e2, e3, e4, nr, ni, mr, mi, d; 5419 5420 /* I holds the real index; J holds the imag index. */ 5421 j = i - sizeof(float32); 5422 i -= 2 * sizeof(float32); 5423 5424 nr = *(float32 *)(vn + H1_2(i)); 5425 ni = *(float32 *)(vn + H1_2(j)); 5426 mr = *(float32 *)(vm + H1_2(i)); 5427 mi = *(float32 *)(vm + H1_2(j)); 5428 5429 e2 = (flip ? 
ni : nr); 5430 e1 = (flip ? mi : mr) ^ negx_real; 5431 e4 = e2; 5432 e3 = (flip ? mr : mi) ^ negx_imag; 5433 5434 if (likely((pg >> (i & 63)) & 1)) { 5435 d = *(float32 *)(va + H1_2(i)); 5436 d = float32_muladd(e2, e1, d, negf_real, status); 5437 *(float32 *)(vd + H1_2(i)) = d; 5438 } 5439 if (likely((pg >> (j & 63)) & 1)) { 5440 d = *(float32 *)(va + H1_2(j)); 5441 d = float32_muladd(e4, e3, d, negf_imag, status); 5442 *(float32 *)(vd + H1_2(j)) = d; 5443 } 5444 } while (i & 63); 5445 } while (i != 0); 5446 } 5447 5448 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5449 void *vg, float_status *status, uint32_t desc) 5450 { 5451 intptr_t j, i = simd_oprsz(desc); 5452 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5453 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5454 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5455 uint32_t negf_real = flip ^ negf_imag; 5456 float64 negx_imag, negx_real; 5457 uint64_t *g = vg; 5458 5459 /* With AH=0, use negx; with AH=1 use negf. */ 5460 negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63; 5461 negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63; 5462 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5463 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5464 5465 do { 5466 uint64_t pg = g[(i - 1) >> 6]; 5467 do { 5468 float64 e1, e2, e3, e4, nr, ni, mr, mi, d; 5469 5470 /* I holds the real index; J holds the imag index. */ 5471 j = i - sizeof(float64); 5472 i -= 2 * sizeof(float64); 5473 5474 nr = *(float64 *)(vn + H1_2(i)); 5475 ni = *(float64 *)(vn + H1_2(j)); 5476 mr = *(float64 *)(vm + H1_2(i)); 5477 mi = *(float64 *)(vm + H1_2(j)); 5478 5479 e2 = (flip ? ni : nr); 5480 e1 = (flip ? mi : mr) ^ negx_real; 5481 e4 = e2; 5482 e3 = (flip ? mr : mi) ^ negx_imag; 5483 5484 if (likely((pg >> (i & 63)) & 1)) { 5485 d = *(float64 *)(va + H1_2(i)); 5486 d = float64_muladd(e2, e1, d, negf_real, status); 5487 *(float64 *)(vd + H1_2(i)) = d; 5488 } 5489 if (likely((pg >> (j & 63)) & 1)) { 5490 d = *(float64 *)(va + H1_2(j)); 5491 d = float64_muladd(e4, e3, d, negf_imag, status); 5492 *(float64 *)(vd + H1_2(j)) = d; 5493 } 5494 } while (i & 63); 5495 } while (i != 0); 5496 } 5497 5498 /* 5499 * Load contiguous data, protected by a governing predicate. 5500 */ 5501 5502 /* 5503 * Skip through a sequence of inactive elements in the guarding predicate @vg, 5504 * beginning at @reg_off bounded by @reg_max. Return the offset of the active 5505 * element >= @reg_off, or @reg_max if there were no active elements at all. 5506 */ 5507 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off, 5508 intptr_t reg_max, int esz) 5509 { 5510 uint64_t pg_mask = pred_esz_masks[esz]; 5511 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63); 5512 5513 /* In normal usage, the first element is active. */ 5514 if (likely(pg & 1)) { 5515 return reg_off; 5516 } 5517 5518 if (pg == 0) { 5519 reg_off &= -64; 5520 do { 5521 reg_off += 64; 5522 if (unlikely(reg_off >= reg_max)) { 5523 /* The entire predicate was false. */ 5524 return reg_max; 5525 } 5526 pg = vg[reg_off >> 6] & pg_mask; 5527 } while (pg == 0); 5528 } 5529 reg_off += ctz64(pg); 5530 5531 /* We should never see an out of range predicate bit set. */ 5532 tcg_debug_assert(reg_off < reg_max); 5533 return reg_off; 5534 } 5535 5536 /* 5537 * Resolve the guest virtual address to info->host and info->flags. 5538 * If @nofault, return false if the page is invalid, otherwise 5539 * exit via page fault exception. 
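 * On success, info->flags holds the TLB flags for the page (TLB_MMIO,
 * TLB_WATCHPOINT, ...) and info->host is adjusted by -mem_off so that
 * it can be indexed directly with the per-element memory offset.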
5540 */ 5541 5542 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env, 5543 target_ulong addr, int mem_off, MMUAccessType access_type, 5544 int mmu_idx, uintptr_t retaddr) 5545 { 5546 int flags; 5547 5548 addr += mem_off; 5549 5550 /* 5551 * User-only currently always issues with TBI. See the comment 5552 * above useronly_clean_ptr. Usually we clean this top byte away 5553 * during translation, but we can't do that for e.g. vector + imm 5554 * addressing modes. 5555 * 5556 * We currently always enable TBI for user-only, and do not provide 5557 * a way to turn it off. So clean the pointer unconditionally here, 5558 * rather than look it up here, or pass it down from above. 5559 */ 5560 addr = useronly_clean_ptr(addr); 5561 5562 #ifdef CONFIG_USER_ONLY 5563 flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault, 5564 &info->host, retaddr); 5565 #else 5566 CPUTLBEntryFull *full; 5567 flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault, 5568 &info->host, &full, retaddr); 5569 #endif 5570 info->flags = flags; 5571 5572 if (flags & TLB_INVALID_MASK) { 5573 g_assert(nofault); 5574 return false; 5575 } 5576 5577 #ifdef CONFIG_USER_ONLY 5578 memset(&info->attrs, 0, sizeof(info->attrs)); 5579 /* Require both ANON and MTE; see allocation_tag_mem(). */ 5580 info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE); 5581 #else 5582 info->attrs = full->attrs; 5583 info->tagged = full->extra.arm.pte_attrs == 0xf0; 5584 #endif 5585 5586 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */ 5587 info->host -= mem_off; 5588 return true; 5589 } 5590 5591 /* 5592 * Find first active element on each page, and a loose bound for the 5593 * final element on each page. Identify any single element that spans 5594 * the page boundary. Return true if there are any active elements. 5595 */ 5596 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg, 5597 intptr_t reg_max, int esz, int msize) 5598 { 5599 const int esize = 1 << esz; 5600 const uint64_t pg_mask = pred_esz_masks[esz]; 5601 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; 5602 intptr_t mem_off_last, mem_off_split; 5603 intptr_t page_split, elt_split; 5604 intptr_t i; 5605 5606 /* Set all of the element indices to -1, and the TLB data to 0. */ 5607 memset(info, -1, offsetof(SVEContLdSt, page)); 5608 memset(info->page, 0, sizeof(info->page)); 5609 5610 /* Gross scan over the entire predicate to find bounds. */ 5611 i = 0; 5612 do { 5613 uint64_t pg = vg[i] & pg_mask; 5614 if (pg) { 5615 reg_off_last = i * 64 + 63 - clz64(pg); 5616 if (reg_off_first < 0) { 5617 reg_off_first = i * 64 + ctz64(pg); 5618 } 5619 } 5620 } while (++i * 64 < reg_max); 5621 5622 if (unlikely(reg_off_first < 0)) { 5623 /* No active elements, no pages touched. */ 5624 return false; 5625 } 5626 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max); 5627 5628 info->reg_off_first[0] = reg_off_first; 5629 info->mem_off_first[0] = (reg_off_first >> esz) * msize; 5630 mem_off_last = (reg_off_last >> esz) * msize; 5631 5632 page_split = -(addr | TARGET_PAGE_MASK); 5633 if (likely(mem_off_last + msize <= page_split)) { 5634 /* The entire operation fits within a single page. 
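         * reg_off_first[1], reg_off_split and page_split keep the -1
         * stored by the memset above, which tells the callers that
         * there is no second page and no element crossing the boundary.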
*/ 5635 info->reg_off_last[0] = reg_off_last; 5636 return true; 5637 } 5638 5639 info->page_split = page_split; 5640 elt_split = page_split / msize; 5641 reg_off_split = elt_split << esz; 5642 mem_off_split = elt_split * msize; 5643 5644 /* 5645 * This is the last full element on the first page, but it is not 5646 * necessarily active. If there is no full element, i.e. the first 5647 * active element is the one that's split, this value remains -1. 5648 * It is useful as iteration bounds. 5649 */ 5650 if (elt_split != 0) { 5651 info->reg_off_last[0] = reg_off_split - esize; 5652 } 5653 5654 /* Determine if an unaligned element spans the pages. */ 5655 if (page_split % msize != 0) { 5656 /* It is helpful to know if the split element is active. */ 5657 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) { 5658 info->reg_off_split = reg_off_split; 5659 info->mem_off_split = mem_off_split; 5660 5661 if (reg_off_split == reg_off_last) { 5662 /* The page crossing element is last. */ 5663 return true; 5664 } 5665 } 5666 reg_off_split += esize; 5667 mem_off_split += msize; 5668 } 5669 5670 /* 5671 * We do want the first active element on the second page, because 5672 * this may affect the address reported in an exception. 5673 */ 5674 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz); 5675 tcg_debug_assert(reg_off_split <= reg_off_last); 5676 info->reg_off_first[1] = reg_off_split; 5677 info->mem_off_first[1] = (reg_off_split >> esz) * msize; 5678 info->reg_off_last[1] = reg_off_last; 5679 return true; 5680 } 5681 5682 /* 5683 * Resolve the guest virtual addresses to info->page[]. 5684 * Control the generation of page faults with @fault. Return false if 5685 * there is no work to do, which can only happen with @fault == FAULT_NO. 5686 */ 5687 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault, 5688 CPUARMState *env, target_ulong addr, 5689 MMUAccessType access_type, uintptr_t retaddr) 5690 { 5691 int mmu_idx = arm_env_mmu_index(env); 5692 int mem_off = info->mem_off_first[0]; 5693 bool nofault = fault == FAULT_NO; 5694 bool have_work = true; 5695 5696 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off, 5697 access_type, mmu_idx, retaddr)) { 5698 /* No work to be done. */ 5699 return false; 5700 } 5701 5702 if (likely(info->page_split < 0)) { 5703 /* The entire operation was on the one page. */ 5704 return true; 5705 } 5706 5707 /* 5708 * If the second page is invalid, then we want the fault address to be 5709 * the first byte on that page which is accessed. 5710 */ 5711 if (info->mem_off_split >= 0) { 5712 /* 5713 * There is an element split across the pages. The fault address 5714 * should be the first byte of the second page. 5715 */ 5716 mem_off = info->page_split; 5717 /* 5718 * If the split element is also the first active element 5719 * of the vector, then: For first-fault we should continue 5720 * to generate faults for the second page. For no-fault, 5721 * we have work only if the second page is valid. 5722 */ 5723 if (info->mem_off_first[0] < info->mem_off_split) { 5724 nofault = FAULT_FIRST; 5725 have_work = false; 5726 } 5727 } else { 5728 /* 5729 * There is no element split across the pages. The fault address 5730 * should be the first active element on the second page. 5731 */ 5732 mem_off = info->mem_off_first[1]; 5733 /* 5734 * There must have been one active element on the first page, 5735 * so we're out of first-fault territory. 
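         * Hence a fault on the second page must not be taken for
         * first-fault or no-fault; only FAULT_ALL still traps, which
         * is what the assignment below encodes.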
         */
        nofault = fault != FAULT_ALL;
    }

    have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
                                access_type, mmu_idx, retaddr);
    return have_work;
}

#ifndef CONFIG_USER_ONLY
void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
                               uint64_t *vg, target_ulong addr,
                               int esize, int msize, int wp_access,
                               uintptr_t retaddr)
{
    intptr_t mem_off, reg_off, reg_last;
    int flags0 = info->page[0].flags;
    int flags1 = info->page[1].flags;

    if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
        return;
    }

    /* Indicate that watchpoints are handled. */
    info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
    info->page[1].flags = flags1 & ~TLB_WATCHPOINT;

    if (flags0 & TLB_WATCHPOINT) {
        mem_off = info->mem_off_first[0];
        reg_off = info->reg_off_first[0];
        reg_last = info->reg_off_last[0];

        while (reg_off <= reg_last) {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[0].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off <= reg_last && (reg_off & 63));
        }
    }

    mem_off = info->mem_off_split;
    if (mem_off >= 0) {
        cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
                             info->page[0].attrs, wp_access, retaddr);
    }

    mem_off = info->mem_off_first[1];
    if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
        reg_off = info->reg_off_first[1];
        reg_last = info->reg_off_last[1];

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[1].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
#endif

void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
                             uint64_t *vg, target_ulong addr, int esize,
                             int msize, uint32_t mtedesc, uintptr_t ra)
{
    intptr_t mem_off, reg_off, reg_last;

    /* Process the page only if MemAttr == Tagged. */
    if (info->page[0].tagged) {
        mem_off = info->mem_off_first[0];
        reg_off = info->reg_off_first[0];
        reg_last = info->reg_off_split;
        if (reg_last < 0) {
            reg_last = info->reg_off_last[0];
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    mte_check(env, mtedesc, addr, ra);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off <= reg_last && (reg_off & 63));
        } while (reg_off <= reg_last);
    }

    mem_off = info->mem_off_first[1];
    if (mem_off >= 0 && info->page[1].tagged) {
        reg_off = info->reg_off_first[1];
        reg_last = info->reg_off_last[1];

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    mte_check(env, mtedesc, addr, ra);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}

/*
 * Common helper for all contiguous 1,2,3,4-register predicated loads.
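 *
 * The vector length and the destination register number are taken from
 * the simd descriptor; for the _mte entry points the MTE descriptor is
 * carried in the upper bits of desc and passed down as mtedesc.
 * host_fn and tlb_fn perform the per-element access on the fast
 * (host memory) and slow (MMIO / cross-page) paths respectively.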
5856 */ 5857 static inline QEMU_ALWAYS_INLINE 5858 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr, 5859 uint32_t desc, const uintptr_t retaddr, 5860 const int esz, const int msz, const int N, uint32_t mtedesc, 5861 sve_ldst1_host_fn *host_fn, 5862 sve_ldst1_tlb_fn *tlb_fn) 5863 { 5864 const unsigned rd = simd_data(desc); 5865 const intptr_t reg_max = simd_oprsz(desc); 5866 intptr_t reg_off, reg_last, mem_off; 5867 SVEContLdSt info; 5868 void *host; 5869 int flags, i; 5870 5871 /* Find the active elements. */ 5872 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 5873 /* The entire predicate was false; no load occurs. */ 5874 for (i = 0; i < N; ++i) { 5875 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 5876 } 5877 return; 5878 } 5879 5880 /* Probe the page(s). Exit with exception for any invalid page. */ 5881 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr); 5882 5883 /* Handle watchpoints for all active elements. */ 5884 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 5885 BP_MEM_READ, retaddr); 5886 5887 /* 5888 * Handle mte checks for all active elements. 5889 * Since TBI must be set for MTE, !mtedesc => !mte_active. 5890 */ 5891 if (mtedesc) { 5892 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 5893 mtedesc, retaddr); 5894 } 5895 5896 flags = info.page[0].flags | info.page[1].flags; 5897 if (unlikely(flags != 0)) { 5898 /* 5899 * At least one page includes MMIO. 5900 * Any bus operation can fail with cpu_transaction_failed, 5901 * which for ARM will raise SyncExternal. Perform the load 5902 * into scratch memory to preserve register state until the end. 5903 */ 5904 ARMVectorReg scratch[4] = { }; 5905 5906 mem_off = info.mem_off_first[0]; 5907 reg_off = info.reg_off_first[0]; 5908 reg_last = info.reg_off_last[1]; 5909 if (reg_last < 0) { 5910 reg_last = info.reg_off_split; 5911 if (reg_last < 0) { 5912 reg_last = info.reg_off_last[0]; 5913 } 5914 } 5915 5916 do { 5917 uint64_t pg = vg[reg_off >> 6]; 5918 do { 5919 if ((pg >> (reg_off & 63)) & 1) { 5920 for (i = 0; i < N; ++i) { 5921 tlb_fn(env, &scratch[i], reg_off, 5922 addr + mem_off + (i << msz), retaddr); 5923 } 5924 } 5925 reg_off += 1 << esz; 5926 mem_off += N << msz; 5927 } while (reg_off & 63); 5928 } while (reg_off <= reg_last); 5929 5930 for (i = 0; i < N; ++i) { 5931 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max); 5932 } 5933 return; 5934 } 5935 5936 /* The entire operation is in RAM, on valid pages. */ 5937 5938 for (i = 0; i < N; ++i) { 5939 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 5940 } 5941 5942 mem_off = info.mem_off_first[0]; 5943 reg_off = info.reg_off_first[0]; 5944 reg_last = info.reg_off_last[0]; 5945 host = info.page[0].host; 5946 5947 set_helper_retaddr(retaddr); 5948 5949 while (reg_off <= reg_last) { 5950 uint64_t pg = vg[reg_off >> 6]; 5951 do { 5952 if ((pg >> (reg_off & 63)) & 1) { 5953 for (i = 0; i < N; ++i) { 5954 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5955 host + mem_off + (i << msz)); 5956 } 5957 } 5958 reg_off += 1 << esz; 5959 mem_off += N << msz; 5960 } while (reg_off <= reg_last && (reg_off & 63)); 5961 } 5962 5963 clear_helper_retaddr(); 5964 5965 /* 5966 * Use the slow path to manage the cross-page misalignment. 5967 * But we know this is RAM and cannot trap. 
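     * (sve_cont_ldst_pages was called with FAULT_ALL, so any invalid
     * page has already faulted, and the MMIO case returned through the
     * scratch-buffer loop above.)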
5968 */ 5969 mem_off = info.mem_off_split; 5970 if (unlikely(mem_off >= 0)) { 5971 reg_off = info.reg_off_split; 5972 for (i = 0; i < N; ++i) { 5973 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 5974 addr + mem_off + (i << msz), retaddr); 5975 } 5976 } 5977 5978 mem_off = info.mem_off_first[1]; 5979 if (unlikely(mem_off >= 0)) { 5980 reg_off = info.reg_off_first[1]; 5981 reg_last = info.reg_off_last[1]; 5982 host = info.page[1].host; 5983 5984 set_helper_retaddr(retaddr); 5985 5986 do { 5987 uint64_t pg = vg[reg_off >> 6]; 5988 do { 5989 if ((pg >> (reg_off & 63)) & 1) { 5990 for (i = 0; i < N; ++i) { 5991 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5992 host + mem_off + (i << msz)); 5993 } 5994 } 5995 reg_off += 1 << esz; 5996 mem_off += N << msz; 5997 } while (reg_off & 63); 5998 } while (reg_off <= reg_last); 5999 6000 clear_helper_retaddr(); 6001 } 6002 } 6003 6004 static inline QEMU_ALWAYS_INLINE 6005 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6006 uint32_t desc, const uintptr_t ra, 6007 const int esz, const int msz, const int N, 6008 sve_ldst1_host_fn *host_fn, 6009 sve_ldst1_tlb_fn *tlb_fn) 6010 { 6011 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6012 int bit55 = extract64(addr, 55, 1); 6013 6014 /* Remove mtedesc from the normal sve descriptor. */ 6015 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6016 6017 /* Perform gross MTE suppression early. */ 6018 if (!tbi_check(mtedesc, bit55) || 6019 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6020 mtedesc = 0; 6021 } 6022 6023 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6024 } 6025 6026 #define DO_LD1_1(NAME, ESZ) \ 6027 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ 6028 target_ulong addr, uint32_t desc) \ 6029 { \ 6030 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ 6031 sve_##NAME##_host, sve_##NAME##_tlb); \ 6032 } \ 6033 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ 6034 target_ulong addr, uint32_t desc) \ 6035 { \ 6036 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ 6037 sve_##NAME##_host, sve_##NAME##_tlb); \ 6038 } 6039 6040 #define DO_LD1_2(NAME, ESZ, MSZ) \ 6041 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ 6042 target_ulong addr, uint32_t desc) \ 6043 { \ 6044 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 6045 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 6046 } \ 6047 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ 6048 target_ulong addr, uint32_t desc) \ 6049 { \ 6050 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 6051 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 6052 } \ 6053 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6054 target_ulong addr, uint32_t desc) \ 6055 { \ 6056 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 6057 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 6058 } \ 6059 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6060 target_ulong addr, uint32_t desc) \ 6061 { \ 6062 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 6063 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 6064 } 6065 6066 DO_LD1_1(ld1bb, MO_8) 6067 DO_LD1_1(ld1bhu, MO_16) 6068 DO_LD1_1(ld1bhs, MO_16) 6069 DO_LD1_1(ld1bsu, MO_32) 6070 DO_LD1_1(ld1bss, MO_32) 6071 DO_LD1_1(ld1bdu, MO_64) 6072 DO_LD1_1(ld1bds, MO_64) 6073 6074 DO_LD1_2(ld1hh, MO_16, MO_16) 6075 DO_LD1_2(ld1hsu, MO_32, MO_16) 6076 DO_LD1_2(ld1hss, MO_32, MO_16) 6077 DO_LD1_2(ld1hdu, MO_64, MO_16) 6078 
DO_LD1_2(ld1hds, MO_64, MO_16) 6079 6080 DO_LD1_2(ld1ss, MO_32, MO_32) 6081 DO_LD1_2(ld1sdu, MO_64, MO_32) 6082 DO_LD1_2(ld1sds, MO_64, MO_32) 6083 6084 DO_LD1_2(ld1dd, MO_64, MO_64) 6085 6086 #undef DO_LD1_1 6087 #undef DO_LD1_2 6088 6089 #define DO_LDN_1(N) \ 6090 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ 6091 target_ulong addr, uint32_t desc) \ 6092 { \ 6093 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ 6094 sve_ld1bb_host, sve_ld1bb_tlb); \ 6095 } \ 6096 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ 6097 target_ulong addr, uint32_t desc) \ 6098 { \ 6099 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ 6100 sve_ld1bb_host, sve_ld1bb_tlb); \ 6101 } 6102 6103 #define DO_LDN_2(N, SUFF, ESZ) \ 6104 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ 6105 target_ulong addr, uint32_t desc) \ 6106 { \ 6107 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 6108 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 6109 } \ 6110 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ 6111 target_ulong addr, uint32_t desc) \ 6112 { \ 6113 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 6114 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 6115 } \ 6116 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ 6117 target_ulong addr, uint32_t desc) \ 6118 { \ 6119 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 6120 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 6121 } \ 6122 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ 6123 target_ulong addr, uint32_t desc) \ 6124 { \ 6125 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 6126 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 6127 } 6128 6129 DO_LDN_1(2) 6130 DO_LDN_1(3) 6131 DO_LDN_1(4) 6132 6133 DO_LDN_2(2, hh, MO_16) 6134 DO_LDN_2(3, hh, MO_16) 6135 DO_LDN_2(4, hh, MO_16) 6136 6137 DO_LDN_2(2, ss, MO_32) 6138 DO_LDN_2(3, ss, MO_32) 6139 DO_LDN_2(4, ss, MO_32) 6140 6141 DO_LDN_2(2, dd, MO_64) 6142 DO_LDN_2(3, dd, MO_64) 6143 DO_LDN_2(4, dd, MO_64) 6144 6145 #undef DO_LDN_1 6146 #undef DO_LDN_2 6147 6148 /* 6149 * Load contiguous data, first-fault and no-fault. 6150 * 6151 * For user-only, we control the race between page_check_range and 6152 * another thread's munmap by using set/clear_helper_retaddr. Any 6153 * SEGV that occurs between those markers is assumed to be because 6154 * the guest page vanished. Keep that block as small as possible 6155 * so that unrelated QEMU bugs are not blamed on the guest. 6156 */ 6157 6158 /* Fault on byte I. All bits in FFR from I are cleared. The vector 6159 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE 6160 * option, which leaves subsequent data unchanged. 6161 */ 6162 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz) 6163 { 6164 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; 6165 6166 if (i & 63) { 6167 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63); 6168 i = ROUND_UP(i, 64); 6169 } 6170 for (; i < oprsz; i += 64) { 6171 ffr[i / 64] = 0; 6172 } 6173 } 6174 6175 /* 6176 * Common helper for all contiguous no-fault and first-fault loads. 
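 *
 * Unlike sve_ldN_r above, a fault on any element after the first active
 * one is never taken: record_fault() clears the corresponding FFR bits
 * and the untouched destination elements keep the zeros they were
 * initialised with.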
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
                   uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
                   const int esz, const int msz, const SVEContFault fault,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    void *vd = &env->vfp.zregs[rd];
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, mem_off, reg_last;
    SVEContLdSt info;
    int flags;
    void *host;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
        /* The entire predicate was false; no load occurs. */
        memset(vd, 0, reg_max);
        return;
    }
    reg_off = info.reg_off_first[0];

    /* Probe the page(s). */
    if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
        /* Fault on first element. */
        tcg_debug_assert(fault == FAULT_NO);
        memset(vd, 0, reg_max);
        goto do_fault;
    }

    mem_off = info.mem_off_first[0];
    flags = info.page[0].flags;

    /*
     * Disable MTE checking if the Tagged bit is not set.  Since TBI must
     * be set within MTEDESC for MTE, !mtedesc => !mte_active.
     */
    if (!info.page[0].tagged) {
        mtedesc = 0;
    }

    if (fault == FAULT_FIRST) {
        /* Trapping mte check for the first-fault element. */
        if (mtedesc) {
            mte_check(env, mtedesc, addr + mem_off, retaddr);
        }

        /*
         * Special handling of the first active element,
         * if it crosses a page boundary or is MMIO.
         */
        bool is_split = mem_off == info.mem_off_split;
        if (unlikely(flags != 0) || unlikely(is_split)) {
            /*
             * Use the slow path for cross-page handling.
             * Might trap for MMIO or watchpoints.
             */
            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);

            /* After any fault, zero the other elements. */
            swap_memzero(vd, reg_off);
            reg_off += 1 << esz;
            mem_off += 1 << msz;
            swap_memzero(vd + reg_off, reg_max - reg_off);

            if (is_split) {
                goto second_page;
            }
        } else {
            memset(vd, 0, reg_max);
        }
    } else {
        memset(vd, 0, reg_max);
        if (unlikely(mem_off == info.mem_off_split)) {
            /* The first active element crosses a page boundary. */
            flags |= info.page[1].flags;
            if (unlikely(flags & TLB_MMIO)) {
                /* Some page is MMIO, see below. */
                goto do_fault;
            }
            if (unlikely(flags & TLB_WATCHPOINT) &&
                (cpu_watchpoint_address_matches
                 (env_cpu(env), addr + mem_off, 1 << msz)
                 & BP_MEM_READ)) {
                /* Watchpoint hit, see below. */
                goto do_fault;
            }
            if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
                goto do_fault;
            }
            /*
             * Use the slow path for cross-page handling.
             * This is RAM, without a watchpoint, and will not trap.
             */
            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
            goto second_page;
        }
    }

    /*
     * From this point on, all memory operations are MemSingleNF.
     *
     * Per the MemSingleNF pseudocode, a no-fault load from Device memory
     * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
     *
     * Unfortunately we do not have access to the memory attributes from the
     * PTE to tell Device memory from Normal memory.  So we make a mostly
     * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
     * This gives the right answer for the common cases of "Normal memory,
     * backed by host RAM" and "Device memory, backed by MMIO".
     * The architecture allows us to suppress an NF load and return
     * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
     * case of "Normal memory, backed by MMIO" is permitted.  The case we
     * get wrong is "Device memory, backed by host RAM", for which we
     * should return (UNKNOWN, FAULT) but do not.
     *
     * Similarly, CPU_BP breakpoints would raise exceptions, and so
     * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
     * architectural breakpoints the same.
     */
    if (unlikely(flags & TLB_MMIO)) {
        goto do_fault;
    }

    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    set_helper_retaddr(retaddr);

    do {
        uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                if (unlikely(flags & TLB_WATCHPOINT) &&
                    (cpu_watchpoint_address_matches
                     (env_cpu(env), addr + mem_off, 1 << msz)
                     & BP_MEM_READ)) {
                    clear_helper_retaddr();
                    goto do_fault;
                }
                if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
                    clear_helper_retaddr();
                    goto do_fault;
                }
                host_fn(vd, reg_off, host + mem_off);
            }
            reg_off += 1 << esz;
            mem_off += 1 << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    } while (reg_off <= reg_last);

    clear_helper_retaddr();

    /*
     * MemSingleNF is allowed to fail for any reason.  We have special
     * code above to handle the first element crossing a page boundary.
     * As an implementation choice, decline to handle a cross-page element
     * in any other position.
     */
    reg_off = info.reg_off_split;
    if (reg_off >= 0) {
        goto do_fault;
    }

 second_page:
    reg_off = info.reg_off_first[1];
    if (likely(reg_off < 0)) {
        /* No active elements on the second page.  All done. */
        return;
    }

    /*
     * MemSingleNF is allowed to fail for any reason.  As an implementation
     * choice, decline to handle elements on the second page.  This should
     * be low frequency as the guest walks through memory -- the next
     * iteration of the guest's loop should be aligned on the page boundary,
     * and then all following iterations will stay aligned.
     */

 do_fault:
    record_fault(env, reg_off, reg_max);
}

static inline QEMU_ALWAYS_INLINE
void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
                       uint32_t desc, const uintptr_t retaddr,
                       const int esz, const int msz, const SVEContFault fault,
                       sve_ldst1_host_fn *host_fn,
                       sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early.
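     * If TBI is disabled for this address, or the TCMA check passes for
     * the tag in bits [59:56], no tag check can fire, so clear mtedesc
     * and let the common helper skip all MTE probes.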
*/ 6376 if (!tbi_check(mtedesc, bit55) || 6377 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6378 mtedesc = 0; 6379 } 6380 6381 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc, 6382 esz, msz, fault, host_fn, tlb_fn); 6383 } 6384 6385 #define DO_LDFF1_LDNF1_1(PART, ESZ) \ 6386 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ 6387 target_ulong addr, uint32_t desc) \ 6388 { \ 6389 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ 6390 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6391 } \ 6392 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ 6393 target_ulong addr, uint32_t desc) \ 6394 { \ 6395 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ 6396 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6397 } \ 6398 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6399 target_ulong addr, uint32_t desc) \ 6400 { \ 6401 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ 6402 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6403 } \ 6404 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6405 target_ulong addr, uint32_t desc) \ 6406 { \ 6407 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ 6408 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6409 } 6410 6411 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ 6412 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ 6413 target_ulong addr, uint32_t desc) \ 6414 { \ 6415 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6416 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6417 } \ 6418 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ 6419 target_ulong addr, uint32_t desc) \ 6420 { \ 6421 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6422 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6423 } \ 6424 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ 6425 target_ulong addr, uint32_t desc) \ 6426 { \ 6427 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6428 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6429 } \ 6430 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ 6431 target_ulong addr, uint32_t desc) \ 6432 { \ 6433 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6434 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6435 } \ 6436 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6437 target_ulong addr, uint32_t desc) \ 6438 { \ 6439 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6440 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6441 } \ 6442 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6443 target_ulong addr, uint32_t desc) \ 6444 { \ 6445 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6446 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6447 } \ 6448 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6449 target_ulong addr, uint32_t desc) \ 6450 { \ 6451 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6452 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6453 } \ 6454 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6455 target_ulong addr, uint32_t desc) \ 6456 { \ 6457 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6458 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6459 } 6460 6461 DO_LDFF1_LDNF1_1(bb, MO_8) 6462 DO_LDFF1_LDNF1_1(bhu, 
MO_16) 6463 DO_LDFF1_LDNF1_1(bhs, MO_16) 6464 DO_LDFF1_LDNF1_1(bsu, MO_32) 6465 DO_LDFF1_LDNF1_1(bss, MO_32) 6466 DO_LDFF1_LDNF1_1(bdu, MO_64) 6467 DO_LDFF1_LDNF1_1(bds, MO_64) 6468 6469 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16) 6470 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16) 6471 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16) 6472 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16) 6473 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16) 6474 6475 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32) 6476 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32) 6477 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32) 6478 6479 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64) 6480 6481 #undef DO_LDFF1_LDNF1_1 6482 #undef DO_LDFF1_LDNF1_2 6483 6484 /* 6485 * Common helper for all contiguous 1,2,3,4-register predicated stores. 6486 */ 6487 6488 static inline QEMU_ALWAYS_INLINE 6489 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, 6490 uint32_t desc, const uintptr_t retaddr, 6491 const int esz, const int msz, const int N, uint32_t mtedesc, 6492 sve_ldst1_host_fn *host_fn, 6493 sve_ldst1_tlb_fn *tlb_fn) 6494 { 6495 const unsigned rd = simd_data(desc); 6496 const intptr_t reg_max = simd_oprsz(desc); 6497 intptr_t reg_off, reg_last, mem_off; 6498 SVEContLdSt info; 6499 void *host; 6500 int i, flags; 6501 6502 /* Find the active elements. */ 6503 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 6504 /* The entire predicate was false; no store occurs. */ 6505 return; 6506 } 6507 6508 /* Probe the page(s). Exit with exception for any invalid page. */ 6509 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr); 6510 6511 /* Handle watchpoints for all active elements. */ 6512 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 6513 BP_MEM_WRITE, retaddr); 6514 6515 /* 6516 * Handle mte checks for all active elements. 6517 * Since TBI must be set for MTE, !mtedesc => !mte_active. 6518 */ 6519 if (mtedesc) { 6520 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 6521 mtedesc, retaddr); 6522 } 6523 6524 flags = info.page[0].flags | info.page[1].flags; 6525 if (unlikely(flags != 0)) { 6526 /* 6527 * At least one page includes MMIO. 6528 * Any bus operation can fail with cpu_transaction_failed, 6529 * which for ARM will raise SyncExternal. We cannot avoid 6530 * this fault and will leave with the store incomplete. 
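         * Unlike the load helper there is no scratch buffer, so stores
         * that completed before the faulting element remain visible in
         * memory.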
6531 */ 6532 mem_off = info.mem_off_first[0]; 6533 reg_off = info.reg_off_first[0]; 6534 reg_last = info.reg_off_last[1]; 6535 if (reg_last < 0) { 6536 reg_last = info.reg_off_split; 6537 if (reg_last < 0) { 6538 reg_last = info.reg_off_last[0]; 6539 } 6540 } 6541 6542 do { 6543 uint64_t pg = vg[reg_off >> 6]; 6544 do { 6545 if ((pg >> (reg_off & 63)) & 1) { 6546 for (i = 0; i < N; ++i) { 6547 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6548 addr + mem_off + (i << msz), retaddr); 6549 } 6550 } 6551 reg_off += 1 << esz; 6552 mem_off += N << msz; 6553 } while (reg_off & 63); 6554 } while (reg_off <= reg_last); 6555 return; 6556 } 6557 6558 mem_off = info.mem_off_first[0]; 6559 reg_off = info.reg_off_first[0]; 6560 reg_last = info.reg_off_last[0]; 6561 host = info.page[0].host; 6562 6563 set_helper_retaddr(retaddr); 6564 6565 while (reg_off <= reg_last) { 6566 uint64_t pg = vg[reg_off >> 6]; 6567 do { 6568 if ((pg >> (reg_off & 63)) & 1) { 6569 for (i = 0; i < N; ++i) { 6570 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6571 host + mem_off + (i << msz)); 6572 } 6573 } 6574 reg_off += 1 << esz; 6575 mem_off += N << msz; 6576 } while (reg_off <= reg_last && (reg_off & 63)); 6577 } 6578 6579 clear_helper_retaddr(); 6580 6581 /* 6582 * Use the slow path to manage the cross-page misalignment. 6583 * But we know this is RAM and cannot trap. 6584 */ 6585 mem_off = info.mem_off_split; 6586 if (unlikely(mem_off >= 0)) { 6587 reg_off = info.reg_off_split; 6588 for (i = 0; i < N; ++i) { 6589 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6590 addr + mem_off + (i << msz), retaddr); 6591 } 6592 } 6593 6594 mem_off = info.mem_off_first[1]; 6595 if (unlikely(mem_off >= 0)) { 6596 reg_off = info.reg_off_first[1]; 6597 reg_last = info.reg_off_last[1]; 6598 host = info.page[1].host; 6599 6600 set_helper_retaddr(retaddr); 6601 6602 do { 6603 uint64_t pg = vg[reg_off >> 6]; 6604 do { 6605 if ((pg >> (reg_off & 63)) & 1) { 6606 for (i = 0; i < N; ++i) { 6607 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6608 host + mem_off + (i << msz)); 6609 } 6610 } 6611 reg_off += 1 << esz; 6612 mem_off += N << msz; 6613 } while (reg_off & 63); 6614 } while (reg_off <= reg_last); 6615 6616 clear_helper_retaddr(); 6617 } 6618 } 6619 6620 static inline QEMU_ALWAYS_INLINE 6621 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6622 uint32_t desc, const uintptr_t ra, 6623 const int esz, const int msz, const int N, 6624 sve_ldst1_host_fn *host_fn, 6625 sve_ldst1_tlb_fn *tlb_fn) 6626 { 6627 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6628 int bit55 = extract64(addr, 55, 1); 6629 6630 /* Remove mtedesc from the normal sve descriptor. */ 6631 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6632 6633 /* Perform gross MTE suppression early. 
*/ 6634 if (!tbi_check(mtedesc, bit55) || 6635 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6636 mtedesc = 0; 6637 } 6638 6639 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6640 } 6641 6642 #define DO_STN_1(N, NAME, ESZ) \ 6643 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ 6644 target_ulong addr, uint32_t desc) \ 6645 { \ 6646 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ 6647 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6648 } \ 6649 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ 6650 target_ulong addr, uint32_t desc) \ 6651 { \ 6652 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ 6653 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6654 } 6655 6656 #define DO_STN_2(N, NAME, ESZ, MSZ) \ 6657 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ 6658 target_ulong addr, uint32_t desc) \ 6659 { \ 6660 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6661 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6662 } \ 6663 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ 6664 target_ulong addr, uint32_t desc) \ 6665 { \ 6666 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6667 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6668 } \ 6669 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6670 target_ulong addr, uint32_t desc) \ 6671 { \ 6672 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6673 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6674 } \ 6675 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6676 target_ulong addr, uint32_t desc) \ 6677 { \ 6678 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6679 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6680 } 6681 6682 DO_STN_1(1, bb, MO_8) 6683 DO_STN_1(1, bh, MO_16) 6684 DO_STN_1(1, bs, MO_32) 6685 DO_STN_1(1, bd, MO_64) 6686 DO_STN_1(2, bb, MO_8) 6687 DO_STN_1(3, bb, MO_8) 6688 DO_STN_1(4, bb, MO_8) 6689 6690 DO_STN_2(1, hh, MO_16, MO_16) 6691 DO_STN_2(1, hs, MO_32, MO_16) 6692 DO_STN_2(1, hd, MO_64, MO_16) 6693 DO_STN_2(2, hh, MO_16, MO_16) 6694 DO_STN_2(3, hh, MO_16, MO_16) 6695 DO_STN_2(4, hh, MO_16, MO_16) 6696 6697 DO_STN_2(1, ss, MO_32, MO_32) 6698 DO_STN_2(1, sd, MO_64, MO_32) 6699 DO_STN_2(2, ss, MO_32, MO_32) 6700 DO_STN_2(3, ss, MO_32, MO_32) 6701 DO_STN_2(4, ss, MO_32, MO_32) 6702 6703 DO_STN_2(1, dd, MO_64, MO_64) 6704 DO_STN_2(2, dd, MO_64, MO_64) 6705 DO_STN_2(3, dd, MO_64, MO_64) 6706 DO_STN_2(4, dd, MO_64, MO_64) 6707 6708 #undef DO_STN_1 6709 #undef DO_STN_2 6710 6711 /* 6712 * Loads with a vector index. 6713 */ 6714 6715 /* 6716 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed. 
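 *
 * The _s variants read a 32-bit vector element and the _d variants a
 * 64-bit one; zsu/zss treat the low 32 bits as an unsigned/signed
 * offset, while zd uses the full 64-bit value.  The caller scales the
 * result by the descriptor's shift before adding it to the base.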
6717 */ 6718 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs); 6719 6720 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs) 6721 { 6722 return *(uint32_t *)(reg + H1_4(reg_ofs)); 6723 } 6724 6725 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs) 6726 { 6727 return *(int32_t *)(reg + H1_4(reg_ofs)); 6728 } 6729 6730 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs) 6731 { 6732 return (uint32_t)*(uint64_t *)(reg + reg_ofs); 6733 } 6734 6735 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs) 6736 { 6737 return (int32_t)*(uint64_t *)(reg + reg_ofs); 6738 } 6739 6740 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs) 6741 { 6742 return *(uint64_t *)(reg + reg_ofs); 6743 } 6744 6745 static inline QEMU_ALWAYS_INLINE 6746 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6747 target_ulong base, uint32_t desc, uintptr_t retaddr, 6748 uint32_t mtedesc, int esize, int msize, 6749 zreg_off_fn *off_fn, 6750 sve_ldst1_host_fn *host_fn, 6751 sve_ldst1_tlb_fn *tlb_fn) 6752 { 6753 const int mmu_idx = arm_env_mmu_index(env); 6754 const intptr_t reg_max = simd_oprsz(desc); 6755 const int scale = simd_data(desc); 6756 ARMVectorReg scratch; 6757 intptr_t reg_off; 6758 SVEHostPage info, info2; 6759 6760 memset(&scratch, 0, reg_max); 6761 reg_off = 0; 6762 do { 6763 uint64_t pg = vg[reg_off >> 6]; 6764 do { 6765 if (likely(pg & 1)) { 6766 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6767 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 6768 6769 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD, 6770 mmu_idx, retaddr); 6771 6772 if (likely(in_page >= msize)) { 6773 if (unlikely(info.flags & TLB_WATCHPOINT)) { 6774 cpu_check_watchpoint(env_cpu(env), addr, msize, 6775 info.attrs, BP_MEM_READ, retaddr); 6776 } 6777 if (mtedesc && info.tagged) { 6778 mte_check(env, mtedesc, addr, retaddr); 6779 } 6780 if (unlikely(info.flags & TLB_MMIO)) { 6781 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6782 } else { 6783 set_helper_retaddr(retaddr); 6784 host_fn(&scratch, reg_off, info.host); 6785 clear_helper_retaddr(); 6786 } 6787 } else { 6788 /* Element crosses the page boundary. */ 6789 sve_probe_page(&info2, false, env, addr + in_page, 0, 6790 MMU_DATA_LOAD, mmu_idx, retaddr); 6791 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) { 6792 cpu_check_watchpoint(env_cpu(env), addr, 6793 msize, info.attrs, 6794 BP_MEM_READ, retaddr); 6795 } 6796 if (mtedesc && info.tagged) { 6797 mte_check(env, mtedesc, addr, retaddr); 6798 } 6799 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6800 } 6801 } 6802 reg_off += esize; 6803 pg >>= esize; 6804 } while (reg_off & 63); 6805 } while (reg_off < reg_max); 6806 6807 /* Wait until all exceptions have been raised to write back. */ 6808 memcpy(vd, &scratch, reg_max); 6809 } 6810 6811 static inline QEMU_ALWAYS_INLINE 6812 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6813 target_ulong base, uint32_t desc, uintptr_t retaddr, 6814 int esize, int msize, zreg_off_fn *off_fn, 6815 sve_ldst1_host_fn *host_fn, 6816 sve_ldst1_tlb_fn *tlb_fn) 6817 { 6818 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6819 /* Remove mtedesc from the normal sve descriptor. */ 6820 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6821 6822 /* 6823 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 6824 * offset base entirely over the address space hole to change the 6825 * pointer tag, or change the bit55 selector. 
So we could here 6826 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 6827 */ 6828 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 6829 esize, msize, off_fn, host_fn, tlb_fn); 6830 } 6831 6832 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ 6833 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6834 void *vm, target_ulong base, uint32_t desc) \ 6835 { \ 6836 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 6837 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6838 } \ 6839 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6840 void *vm, target_ulong base, uint32_t desc) \ 6841 { \ 6842 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 6843 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6844 } 6845 6846 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ 6847 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6848 void *vm, target_ulong base, uint32_t desc) \ 6849 { \ 6850 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 6851 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6852 } \ 6853 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6854 void *vm, target_ulong base, uint32_t desc) \ 6855 { \ 6856 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 6857 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6858 } 6859 6860 DO_LD1_ZPZ_S(bsu, zsu, MO_8) 6861 DO_LD1_ZPZ_S(bsu, zss, MO_8) 6862 DO_LD1_ZPZ_D(bdu, zsu, MO_8) 6863 DO_LD1_ZPZ_D(bdu, zss, MO_8) 6864 DO_LD1_ZPZ_D(bdu, zd, MO_8) 6865 6866 DO_LD1_ZPZ_S(bss, zsu, MO_8) 6867 DO_LD1_ZPZ_S(bss, zss, MO_8) 6868 DO_LD1_ZPZ_D(bds, zsu, MO_8) 6869 DO_LD1_ZPZ_D(bds, zss, MO_8) 6870 DO_LD1_ZPZ_D(bds, zd, MO_8) 6871 6872 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16) 6873 DO_LD1_ZPZ_S(hsu_le, zss, MO_16) 6874 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16) 6875 DO_LD1_ZPZ_D(hdu_le, zss, MO_16) 6876 DO_LD1_ZPZ_D(hdu_le, zd, MO_16) 6877 6878 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16) 6879 DO_LD1_ZPZ_S(hsu_be, zss, MO_16) 6880 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16) 6881 DO_LD1_ZPZ_D(hdu_be, zss, MO_16) 6882 DO_LD1_ZPZ_D(hdu_be, zd, MO_16) 6883 6884 DO_LD1_ZPZ_S(hss_le, zsu, MO_16) 6885 DO_LD1_ZPZ_S(hss_le, zss, MO_16) 6886 DO_LD1_ZPZ_D(hds_le, zsu, MO_16) 6887 DO_LD1_ZPZ_D(hds_le, zss, MO_16) 6888 DO_LD1_ZPZ_D(hds_le, zd, MO_16) 6889 6890 DO_LD1_ZPZ_S(hss_be, zsu, MO_16) 6891 DO_LD1_ZPZ_S(hss_be, zss, MO_16) 6892 DO_LD1_ZPZ_D(hds_be, zsu, MO_16) 6893 DO_LD1_ZPZ_D(hds_be, zss, MO_16) 6894 DO_LD1_ZPZ_D(hds_be, zd, MO_16) 6895 6896 DO_LD1_ZPZ_S(ss_le, zsu, MO_32) 6897 DO_LD1_ZPZ_S(ss_le, zss, MO_32) 6898 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32) 6899 DO_LD1_ZPZ_D(sdu_le, zss, MO_32) 6900 DO_LD1_ZPZ_D(sdu_le, zd, MO_32) 6901 6902 DO_LD1_ZPZ_S(ss_be, zsu, MO_32) 6903 DO_LD1_ZPZ_S(ss_be, zss, MO_32) 6904 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32) 6905 DO_LD1_ZPZ_D(sdu_be, zss, MO_32) 6906 DO_LD1_ZPZ_D(sdu_be, zd, MO_32) 6907 6908 DO_LD1_ZPZ_D(sds_le, zsu, MO_32) 6909 DO_LD1_ZPZ_D(sds_le, zss, MO_32) 6910 DO_LD1_ZPZ_D(sds_le, zd, MO_32) 6911 6912 DO_LD1_ZPZ_D(sds_be, zsu, MO_32) 6913 DO_LD1_ZPZ_D(sds_be, zss, MO_32) 6914 DO_LD1_ZPZ_D(sds_be, zd, MO_32) 6915 6916 DO_LD1_ZPZ_D(dd_le, zsu, MO_64) 6917 DO_LD1_ZPZ_D(dd_le, zss, MO_64) 6918 DO_LD1_ZPZ_D(dd_le, zd, MO_64) 6919 6920 DO_LD1_ZPZ_D(dd_be, zsu, MO_64) 6921 DO_LD1_ZPZ_D(dd_be, zss, MO_64) 6922 DO_LD1_ZPZ_D(dd_be, zd, MO_64) 6923 6924 #undef DO_LD1_ZPZ_S 6925 #undef DO_LD1_ZPZ_D 6926 6927 /* First fault loads with a vector index. 
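 *
 * Only the first active element may take a fault; every later element
 * is probed with nofault semantics and any problem is reported through
 * FFR via record_fault(), as for the contiguous first-fault forms.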
*/ 6928 6929 /* 6930 * Common helpers for all gather first-faulting loads. 6931 */ 6932 6933 static inline QEMU_ALWAYS_INLINE 6934 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6935 target_ulong base, uint32_t desc, uintptr_t retaddr, 6936 uint32_t mtedesc, const int esz, const int msz, 6937 zreg_off_fn *off_fn, 6938 sve_ldst1_host_fn *host_fn, 6939 sve_ldst1_tlb_fn *tlb_fn) 6940 { 6941 const int mmu_idx = arm_env_mmu_index(env); 6942 const intptr_t reg_max = simd_oprsz(desc); 6943 const int scale = simd_data(desc); 6944 const int esize = 1 << esz; 6945 const int msize = 1 << msz; 6946 intptr_t reg_off; 6947 SVEHostPage info; 6948 target_ulong addr, in_page; 6949 ARMVectorReg scratch; 6950 6951 /* Skip to the first true predicate. */ 6952 reg_off = find_next_active(vg, 0, reg_max, esz); 6953 if (unlikely(reg_off >= reg_max)) { 6954 /* The entire predicate was false; no load occurs. */ 6955 memset(vd, 0, reg_max); 6956 return; 6957 } 6958 6959 /* Protect against overlap between vd and vm. */ 6960 if (unlikely(vd == vm)) { 6961 vm = memcpy(&scratch, vm, reg_max); 6962 } 6963 6964 /* 6965 * Probe the first element, allowing faults. 6966 */ 6967 addr = base + (off_fn(vm, reg_off) << scale); 6968 if (mtedesc) { 6969 mte_check(env, mtedesc, addr, retaddr); 6970 } 6971 tlb_fn(env, vd, reg_off, addr, retaddr); 6972 6973 /* After any fault, zero the other elements. */ 6974 swap_memzero(vd, reg_off); 6975 reg_off += esize; 6976 swap_memzero(vd + reg_off, reg_max - reg_off); 6977 6978 /* 6979 * Probe the remaining elements, not allowing faults. 6980 */ 6981 while (reg_off < reg_max) { 6982 uint64_t pg = vg[reg_off >> 6]; 6983 do { 6984 if (likely((pg >> (reg_off & 63)) & 1)) { 6985 addr = base + (off_fn(vm, reg_off) << scale); 6986 in_page = -(addr | TARGET_PAGE_MASK); 6987 6988 if (unlikely(in_page < msize)) { 6989 /* Stop if the element crosses a page boundary. */ 6990 goto fault; 6991 } 6992 6993 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD, 6994 mmu_idx, retaddr); 6995 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) { 6996 goto fault; 6997 } 6998 if (unlikely(info.flags & TLB_WATCHPOINT) && 6999 (cpu_watchpoint_address_matches 7000 (env_cpu(env), addr, msize) & BP_MEM_READ)) { 7001 goto fault; 7002 } 7003 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) { 7004 goto fault; 7005 } 7006 7007 set_helper_retaddr(retaddr); 7008 host_fn(vd, reg_off, info.host); 7009 clear_helper_retaddr(); 7010 } 7011 reg_off += esize; 7012 } while (reg_off & 63); 7013 } 7014 return; 7015 7016 fault: 7017 record_fault(env, reg_off, reg_max); 7018 } 7019 7020 static inline QEMU_ALWAYS_INLINE 7021 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7022 target_ulong base, uint32_t desc, uintptr_t retaddr, 7023 const int esz, const int msz, 7024 zreg_off_fn *off_fn, 7025 sve_ldst1_host_fn *host_fn, 7026 sve_ldst1_tlb_fn *tlb_fn) 7027 { 7028 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7029 /* Remove mtedesc from the normal sve descriptor. */ 7030 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7031 7032 /* 7033 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7034 * offset base entirely over the address space hole to change the 7035 * pointer tag, or change the bit55 selector. So we could here 7036 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
static inline QEMU_ALWAYS_INLINE
void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                     target_ulong base, uint32_t desc, uintptr_t retaddr,
                     const int esz, const int msz,
                     zreg_off_fn *off_fn,
                     sve_ldst1_host_fn *host_fn,
                     sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
                esz, msz, off_fn, host_fn, tlb_fn);
}

#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_ldff##MEM##_##OFS) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
                off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
} \
void HELPER(sve_ldff##MEM##_##OFS##_mte) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
                    off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_ldff##MEM##_##OFS) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
                off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
} \
void HELPER(sve_ldff##MEM##_##OFS##_mte) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
                    off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zd, MO_8)

DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
DO_LDFF1_ZPZ_S(bss, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
DO_LDFF1_ZPZ_D(bds, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zd, MO_8)

DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)

DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)

DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)

DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)

DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
/* Stores with a vector index.  */

static inline QEMU_ALWAYS_INLINE
void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = arm_env_mmu_index(env);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    void *host[ARM_MAX_VQ * 4];
    intptr_t reg_off, i;
    SVEHostPage info, info2;

    /*
     * Probe all of the elements for host addresses and flags.
     */
    i = reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            target_ulong in_page = -(addr | TARGET_PAGE_MASK);

            host[i] = NULL;
            if (likely((pg >> (reg_off & 63)) & 1)) {
                if (likely(in_page >= msize)) {
                    sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
                                   mmu_idx, retaddr);
                    if (!(info.flags & TLB_MMIO)) {
                        host[i] = info.host;
                    }
                } else {
                    /*
                     * Element crosses the page boundary.
                     * Probe both pages, but do not record the host address,
                     * so that we use the slow path.
                     */
                    sve_probe_page(&info, false, env, addr, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    info.flags |= info2.flags;
                }

                if (unlikely(info.flags & TLB_WATCHPOINT)) {
                    cpu_check_watchpoint(env_cpu(env), addr, msize,
                                         info.attrs, BP_MEM_WRITE, retaddr);
                }

                if (mtedesc && info.tagged) {
                    mte_check(env, mtedesc, addr, retaddr);
                }
            }
            i += 1;
            reg_off += esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /*
     * Now that we have recognized all exceptions except SyncExternal
     * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
     *
     * Note for the common case of an element in RAM, not crossing a page
     * boundary, we have stored the host address in host[].  This doubles
     * as a first-level check against the predicate, since only enabled
     * elements have non-null host addresses.
     */
    i = reg_off = 0;
    do {
        void *h = host[i];
        if (likely(h != NULL)) {
            set_helper_retaddr(retaddr);
            host_fn(vd, reg_off, h);
            clear_helper_retaddr();
        } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            tlb_fn(env, vd, reg_off, addr, retaddr);
        }
        i += 1;
        reg_off += esize;
    } while (reg_off < reg_max);
}
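/*
 * Illustrative sketch only: the two-pass structure used by sve_st1_z above,
 * reduced to plain C.  The resolve_writable callback is a hypothetical
 * stand-in for sve_probe_page() plus the watchpoint and MTE checks; it is
 * not a QEMU interface, and the MMIO / page-crossing slow path taken via
 * tlb_fn() is omitted.  Pass 1 may raise any exception but writes nothing;
 * pass 2 writes only after every element has been validated, so a fault
 * cannot leave memory partially updated.
 */
static inline void sve_st1_scalar_model(const uint8_t *src, uint8_t **addrs,
                                        const bool *pred, int nelem,
                                        uint8_t *(*resolve_writable)(uint8_t *))
{
    uint8_t *host[16];          /* assumes nelem <= 16 for the sketch */
    int i;

    /* Pass 1: take all faults now; remember the resolved host addresses. */
    for (i = 0; i < nelem; i++) {
        host[i] = pred[i] ? resolve_writable(addrs[i]) : NULL;
    }

    /*
     * Pass 2: perform the stores.  A non-null entry doubles as the
     * predicate test, exactly as host[] does in sve_st1_z.
     */
    for (i = 0; i < nelem; i++) {
        if (host[i]) {
            *host[i] = src[i];
        }
    }
}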
static inline QEMU_ALWAYS_INLINE
void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}

#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
} \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}

#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
} \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}

DO_ST1_ZPZ_S(bs, zsu, MO_8)
DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
DO_ST1_ZPZ_S(ss_be, zsu, MO_32)

DO_ST1_ZPZ_S(bs, zss, MO_8)
DO_ST1_ZPZ_S(hs_le, zss, MO_16)
DO_ST1_ZPZ_S(hs_be, zss, MO_16)
DO_ST1_ZPZ_S(ss_le, zss, MO_32)
DO_ST1_ZPZ_S(ss_be, zss, MO_32)

DO_ST1_ZPZ_D(bd, zsu, MO_8)
DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
DO_ST1_ZPZ_D(dd_be, zsu, MO_64)

DO_ST1_ZPZ_D(bd, zss, MO_8)
DO_ST1_ZPZ_D(hd_le, zss, MO_16)
DO_ST1_ZPZ_D(hd_be, zss, MO_16)
DO_ST1_ZPZ_D(sd_le, zss, MO_32)
DO_ST1_ZPZ_D(sd_be, zss, MO_32)
DO_ST1_ZPZ_D(dd_le, zss, MO_64)
DO_ST1_ZPZ_D(dd_be, zss, MO_64)

DO_ST1_ZPZ_D(bd, zd, MO_8)
DO_ST1_ZPZ_D(hd_le, zd, MO_16)
DO_ST1_ZPZ_D(hd_be, zd, MO_16)
DO_ST1_ZPZ_D(sd_le, zd, MO_32)
DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D

void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = n[i] ^ m[i] ^ k[i];
    }
}

void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = n[i] ^ (m[i] & ~k[i]);
    }
}

void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
    }
}

void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
    }
}

void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
    }
}
/*
 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
 * See hasless(v,1) from
 *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    int bits = 8 << esz;
    uint64_t ones = dup_const(esz, 1);
    uint64_t signs = ones << (bits - 1);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(esz, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;
    cmp0 = (cmp0 - ones) & ~cmp0;
    cmp1 = (cmp1 - ones) & ~cmp1;
    return (cmp0 | cmp1) & signs;
}
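/*
 * Worked example of the zero-in-word trick used by do_match2, for byte
 * elements (illustration only; not called by the helpers).  With
 * ones == 0x0101...01 and signs == 0x8080...80, the value
 * (x - ones) & ~x & signs is non-zero exactly when some byte of x is zero:
 * the subtraction borrows out of a byte only when that byte (or a chain of
 * zero bytes below it) is zero, and "& ~x" discards bytes whose top bit was
 * already set.  do_match2 applies this to x = cmp ^ m, where a zero byte
 * means "this element of m equals n".
 */
static inline bool example_any_byte_is_zero(uint64_t x)
{
    const uint64_t ones  = 0x0101010101010101ull;
    const uint64_t signs = 0x8080808080808080ull;

    /* e.g. x = 0x1122330044556677 -> true, because byte 4 is zero. */
    return ((x - ones) & ~x & signs) != 0;
}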
static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
                                uint32_t desc, int esz, bool nmatch)
{
    uint16_t esz_mask = pred_esz_masks[esz];
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t flags = PREDTEST_INIT;
    intptr_t i, j, k;

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
        uint16_t out = 0;

        for (j = 0; j < 16; j += 8) {
            uint64_t n = *(uint64_t *)(vn + i + j);

            for (k = 0; k < 8; k += 1 << esz) {
                if (pg & (1 << (j + k))) {
                    bool o = do_match2(n >> (k * 8), m0, m1, esz);
                    out |= (o ^ nmatch) << (j + k);
                }
            }
        }
        *(uint16_t *)(vd + H1_2(i >> 3)) = out;
        flags = iter_predtest_fwd(out, pg, flags);
    }
    return flags;
}

#define DO_PPZZ_MATCH(NAME, ESZ, INV) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
}

DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)

DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)

#undef DO_PPZZ_MATCH

void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz; i += 4) {
        uint64_t count = 0;
        uint8_t pred;

        pred = pg[H1(i >> 3)] >> (i & 7);
        if (pred & 1) {
            uint32_t nn = n[H4(i >> 2)];

            for (j = 0; j <= i; j += 4) {
                pred = pg[H1(j >> 3)] >> (j & 7);
                if ((pred & 1) && nn == m[H4(j >> 2)]) {
                    ++count;
                }
            }
        }
        d[H4(i >> 2)] = count;
    }
}

void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t count = 0;
        if (pg[H1(i)] & 1) {
            uint64_t nn = n[i];
            for (j = 0; j <= i; ++j) {
                if ((pg[H1(j)] & 1) && nn == m[j]) {
                    ++count;
                }
            }
        }
        d[i] = count;
    }
}

/*
 * Returns the number of bytes in m0 and m1 that match n.
 * Unlike do_match2 we don't just need true/false, we need an exact count.
 * This requires two extra logical operations.
 */
static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
{
    const uint64_t mask = dup_const(MO_8, 0x7f);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(MO_8, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;

    /*
     * 1: clear msb of each byte to avoid carry to next byte (& mask)
     * 2: carry in to msb if byte != 0 (+ mask)
     * 3: set msb if cmp has msb set (| cmp)
     * 4: set ~msb to ignore them (| mask)
     * We now have 0xff for byte != 0 or 0x7f for byte == 0.
     * 5: invert, resulting in 0x80 if and only if byte == 0.
     */
    cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
    cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);

    /*
     * Combine the two compares in a way that the bits do
     * not overlap, and so preserves the count of set bits.
     * If the host has an efficient instruction for ctpop,
     * then ctpop(x) + ctpop(y) has the same number of
     * operations as ctpop(x | (y >> 1)).  If the host does
     * not have an efficient ctpop, then we only want to
     * use it once.
     */
    return ctpop64(cmp0 | (cmp1 >> 1));
}
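/*
 * Illustrative companion to do_histseg_cnt (not used by the helpers): the
 * same five steps applied to one comparison word, with one byte traced
 * through them.  For a byte of cmp equal to 0x00 (a match):
 *   steps 1-2: (0x00 & 0x7f) + 0x7f = 0x7f   -> msb stays clear
 *   steps 3-4: 0x7f | 0x00 | 0x7f   = 0x7f
 *   step 5:    ~0x7f                = 0x80   -> exactly one bit set
 * For any non-zero byte (e.g. 0x01) steps 1-4 give 0xff, which inverts to
 * 0x00.  ctpop64() of the result is therefore the exact number of matching
 * bytes, which the true/false trick in do_match2 cannot provide.
 */
static inline uint64_t example_count_zero_bytes(uint64_t cmp)
{
    const uint64_t mask = dup_const(MO_8, 0x7f);

    return ctpop64(~(((cmp & mask) + mask) | cmp | mask));
}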
void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t n0 = *(uint64_t *)(vn + i);
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t n1 = *(uint64_t *)(vn + i + 8);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint64_t out0 = 0;
        uint64_t out1 = 0;

        for (j = 0; j < 64; j += 8) {
            uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
            uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
            out0 |= cnt0 << j;
            out1 |= cnt1 << j;
        }

        *(uint64_t *)(vd + i) = out0;
        *(uint64_t *)(vd + i + 8) = out1;
    }
}

void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 8 - shr;
    uint64_t mask = dup_const(MO_8, 0xff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 16 - shr;
    uint64_t mask = dup_const(MO_16, 0xffff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    int shr = simd_data(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror32(n[i] ^ m[i], shr);
    }
}

void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
                     float_status *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float32 *n = vn + s * sizeof(float32) * 4;
        float32 *m = vm + s * sizeof(float32) * 4;
        float32 *a = va + s * sizeof(float32) * 4;
        float32 *d = vd + s * sizeof(float32) * 4;
        float32 n00 = n[H4(0)], n01 = n[H4(1)];
        float32 n10 = n[H4(2)], n11 = n[H4(3)];
        float32 m00 = m[H4(0)], m01 = m[H4(1)];
        float32 m10 = m[H4(2)], m11 = m[H4(3)];
        float32 p0, p1;

        /* i = 0, j = 0 */
        p0 = float32_mul(n00, m00, status);
        p1 = float32_mul(n01, m01, status);
        d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float32_mul(n00, m10, status);
        p1 = float32_mul(n01, m11, status);
        d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float32_mul(n10, m00, status);
        p1 = float32_mul(n11, m01, status);
        d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float32_mul(n10, m10, status);
        p1 = float32_mul(n11, m11, status);
        d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
    }
}

void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
                     float_status *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float64 *n = vn + s * sizeof(float64) * 4;
        float64 *m = vm + s * sizeof(float64) * 4;
        float64 *a = va + s * sizeof(float64) * 4;
        float64 *d = vd + s * sizeof(float64) * 4;
        float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
        float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
        float64 p0, p1;

        /* i = 0, j = 0 */
        p0 = float64_mul(n00, m00, status);
        p1 = float64_mul(n01, m01, status);
        d[0] = float64_add(a[0], float64_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float64_mul(n00, m10, status);
        p1 = float64_mul(n01, m11, status);
        d[1] = float64_add(a[1], float64_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float64_mul(n10, m00, status);
        p1 = float64_mul(n11, m01, status);
        d[2] = float64_add(a[2], float64_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float64_mul(n10, m10, status);
        p1 = float64_mul(n11, m11, status);
        d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
    }
}
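/*
 * Illustrative sketch only: as implemented above, each 2x2 FMMLA tile
 * computes D = A + N * M^T, with the rows of N and M stored contiguously.
 * This is the same index mapping written as loops over i (row) and
 * j (column), using plain double arithmetic instead of softfloat for
 * clarity; it is not a drop-in replacement for the helpers.
 */
static inline void fmmla_tile_model(double d[4], const double a[4],
                                    const double n[4], const double m[4])
{
    int i, j;

    for (i = 0; i < 2; i++) {
        for (j = 0; j < 2; j++) {
            /* Row i of N dotted with row j of M, i.e. column j of M^T. */
            double p = n[i * 2 + 0] * m[j * 2 + 0]
                     + n[i * 2 + 1] * m[j * 2 + 1];
            d[i * 2 + j] = a[i * 2 + j] + p;
        }
    }
}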
#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, \
                  float_status *status, uint32_t desc) \
{ \
    intptr_t i = simd_oprsz(desc); \
    uint64_t *g = vg; \
    do { \
        uint64_t pg = g[(i - 1) >> 6]; \
        do { \
            i -= sizeof(TYPEW); \
            if (likely((pg >> (i & 63)) & 1)) { \
                TYPEW nn = *(TYPEW *)(vn + HW(i)); \
                *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \
            } \
        } while (i & 63); \
    } while (i != 0); \
}

DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)

#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, \
                  float_status *status, uint32_t desc) \
{ \
    intptr_t i = simd_oprsz(desc); \
    uint64_t *g = vg; \
    do { \
        uint64_t pg = g[(i - 1) >> 6]; \
        do { \
            i -= sizeof(TYPEW); \
            if (likely((pg >> (i & 63)) & 1)) { \
                TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \
                *(TYPEW *)(vd + HW(i)) = OP(nn, status); \
            } \
        } while (i & 63); \
    } while (i != 0); \
}

DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)

#undef DO_FCVTLT
#undef DO_FCVTNT