/*
 * ARM SME Operations
 *
 * Copyright (c) 2022 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "internals.h"
#include "tcg/tcg-gvec-desc.h"
#include "exec/helper-proto.h"
#include "accel/tcg/cpu-ldst.h"
#include "qemu/int128.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"
#include "sve_ldst_internal.h"

void helper_set_svcr(CPUARMState *env, uint32_t val, uint32_t mask)
{
    aarch64_set_svcr(env, val, mask);
}

void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
{
    uint32_t i;

    /*
     * Special case clearing the entire ZA space.
     * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any
     * parts of the ZA storage outside of SVL.
     */
    if (imm == 0xff) {
        memset(env->zarray, 0, sizeof(env->zarray));
        return;
    }

    /*
     * Recall that ZAnH.D[m] is spread across ZA[n+8*m],
     * so each row is discontiguous within ZA[].
     */
    for (i = 0; i < svl; i++) {
        if (imm & (1 << (i % 8))) {
            memset(&env->zarray[i], 0, svl);
        }
    }
}


/*
 * When considering the ZA storage as an array of elements of
 * type T, the index within that array of the Nth element of
 * a vertical slice of a tile can be calculated like this,
 * regardless of the size of type T.  This is because the tiles
 * are interleaved, so if type T is size N bytes then row 1 of
 * the tile is N rows away from row 0.  The division by N to
 * convert a byte offset into an array index and the multiplication
 * by N to convert from vslice-index-within-the-tile to
 * the index within the ZA storage cancel out.
 */
#define tile_vslice_index(i)  ((i) * sizeof(ARMVectorReg))

/*
 * When doing byte arithmetic on the ZA storage, the element
 * byteoff bytes away in a tile vertical slice is always this
 * many bytes away in the ZA storage, regardless of the
 * size of the tile element, assuming that byteoff is a multiple
 * of the element size.  Again this is because of the interleaving
 * of the tiles.  For instance if we have 1 byte per element then
 * each row of the ZA storage has one byte of the vslice data,
 * and (counting from 0) byte 8 goes in row 8 of the storage
 * at offset (8 * row-size-in-bytes).
 * If we have 8 bytes per element then each row of the ZA storage
 * has 8 bytes of the data, but there are 8 interleaved tiles and
 * so byte 8 of the data goes into row 1 of the tile,
 * which is again row 8 of the storage, so the offset is still
 * (8 * row-size-in-bytes).  Similarly for other element sizes.
 */
#define tile_vslice_offset(byteoff)  ((byteoff) * sizeof(ARMVectorReg))
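
/*
 * As a worked illustration of the above: for 4-byte elements,
 * element 1 of a vertical slice lives in row 1 of the tile, and
 * because the rows of the four .S tiles interleave, that is row 4
 * of the ZA storage, i.e. byte offset 4 * sizeof(ARMVectorReg).
 * Viewed as an index into a uint32_t array this is
 * 1 * sizeof(ARMVectorReg), which is what tile_vslice_index(1)
 * computes.
 */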

/*
 * Move Zreg vector to ZArray column.
 */
#define DO_MOVA_C(NAME, TYPE, H)                                        \
void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc)          \
{                                                                       \
    int i, oprsz = simd_oprsz(desc);                                    \
    for (i = 0; i < oprsz; ) {                                          \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (pg & 1) {                                               \
                *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \
            }                                                           \
            i += sizeof(TYPE);                                          \
            pg >>= sizeof(TYPE);                                        \
        } while (i & 15);                                               \
    }                                                                   \
}
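
/*
 * Note: an SVE predicate allocates one bit per byte of vector, so
 * "pg >>= sizeof(TYPE)" above steps to the bit that governs the next
 * element, and each 16-bit predicate chunk covers exactly the 16
 * vector bytes consumed by one pass of the inner loop.
 */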

DO_MOVA_C(sme_mova_cz_b, uint8_t, H1)
DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2)
DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4)

void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
{
    int i, oprsz = simd_oprsz(desc) / 8;
    uint8_t *pg = vg;
    uint64_t *n = vn;
    uint64_t *a = za;

    for (i = 0; i < oprsz; i++) {
        if (pg[H1(i)] & 1) {
            a[tile_vslice_index(i)] = n[i];
        }
    }
}

void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
{
    int i, oprsz = simd_oprsz(desc) / 16;
    uint16_t *pg = vg;
    Int128 *n = vn;
    Int128 *a = za;

    /*
     * Int128 is used here simply to copy 16 bytes, and to simplify
     * the address arithmetic.
     */
    for (i = 0; i < oprsz; i++) {
        if (pg[H2(i)] & 1) {
            a[tile_vslice_index(i)] = n[i];
        }
    }
}

#undef DO_MOVA_C

/*
 * Move ZArray column to Zreg vector.
 */
#define DO_MOVA_Z(NAME, TYPE, H)                                        \
void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc)          \
{                                                                       \
    int i, oprsz = simd_oprsz(desc);                                    \
    for (i = 0; i < oprsz; ) {                                          \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (pg & 1) {                                               \
                *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \
            }                                                           \
            i += sizeof(TYPE);                                          \
            pg >>= sizeof(TYPE);                                        \
        } while (i & 15);                                               \
    }                                                                   \
}

DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1)
DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2)
DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4)

void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc)
{
    int i, oprsz = simd_oprsz(desc) / 8;
    uint8_t *pg = vg;
    uint64_t *d = vd;
    uint64_t *a = za;

    for (i = 0; i < oprsz; i++) {
        if (pg[H1(i)] & 1) {
            d[i] = a[tile_vslice_index(i)];
        }
    }
}

void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc)
{
    int i, oprsz = simd_oprsz(desc) / 16;
    uint16_t *pg = vg;
    Int128 *d = vd;
    Int128 *a = za;

    /*
     * Int128 is used here simply to copy 16 bytes, and to simplify
     * the address arithmetic.
     */
    for (i = 0; i < oprsz; i++) {
        if (pg[H2(i)] & 1) {
            d[i] = a[tile_vslice_index(i)];
        }
    }
}

#undef DO_MOVA_Z

/*
 * Clear elements in a tile slice comprising len bytes.
 */

typedef void ClearFn(void *ptr, size_t off, size_t len);

static void clear_horizontal(void *ptr, size_t off, size_t len)
{
    memset(ptr + off, 0, len);
}

static void clear_vertical_b(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; ++i) {
        *(uint8_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}

static void clear_vertical_h(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 2) {
        *(uint16_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}

static void clear_vertical_s(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 4) {
        *(uint32_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}

static void clear_vertical_d(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 8) {
        *(uint64_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}

static void clear_vertical_q(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 16) {
        memset(vptr + tile_vslice_offset(i + off), 0, 16);
    }
}

/*
 * Copy elements from an array into a tile slice comprising len bytes.
 */

typedef void CopyFn(void *dst, const void *src, size_t len);

static void copy_horizontal(void *dst, const void *src, size_t len)
{
    memcpy(dst, src, len);
}

static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
{
    const uint8_t *src = vsrc;
    uint8_t *dst = vdst;
    size_t i;

    for (i = 0; i < len; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}

static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
{
    const uint16_t *src = vsrc;
    uint16_t *dst = vdst;
    size_t i;

    for (i = 0; i < len / 2; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}

static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
{
    const uint32_t *src = vsrc;
    uint32_t *dst = vdst;
    size_t i;

    for (i = 0; i < len / 4; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}

static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
{
    const uint64_t *src = vsrc;
    uint64_t *dst = vdst;
    size_t i;

    for (i = 0; i < len / 8; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}

static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
{
    for (size_t i = 0; i < len; i += 16) {
        memcpy(vdst + tile_vslice_offset(i), vsrc + i, 16);
    }
}
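
/*
 * The horizontal and vertical clear/copy functions above are used by
 * the contiguous load helper further below: clr_fn zeroes the
 * inactive elements of the destination slice, and cpy_fn copies the
 * scratch buffer back into ZA after the MMIO slow path.
 */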

/*
 * Host and TLB primitives for vertical tile slice addressing.
 */

#define DO_LD(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    TYPE val = HOST(host);                                                  \
    *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                intptr_t off, target_ulong addr, uintptr_t ra)              \
{                                                                           \
    TYPE val = TLB(env, useronly_clean_ptr(addr), ra);                      \
    *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
}

#define DO_ST(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
    HOST(host, val);                                                        \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                intptr_t off, target_ulong addr, uintptr_t ra)              \
{                                                                           \
    TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
    TLB(env, useronly_clean_ptr(addr), val, ra);                            \
}

/*
 * The ARMVectorReg elements are stored in host-endian 64-bit units.
 * For 128-bit quantities, the sequence defined by the Elem[] pseudocode
 * corresponds to storing the two 64-bit pieces in little-endian order.
 */
#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB)                                 \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t val0 = HOST(host), val1 = HOST(host + 8);                      \
    uint64_t *ptr = za + off;                                               \
    ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra);                 \
    uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra);             \
    uint64_t *ptr = za + off;                                               \
    ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                                 target_ulong addr, uintptr_t ra)           \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}

#define DO_STQ(HNAME, VNAME, BE, HOST, TLB)                                 \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t *ptr = za + off;                                               \
    HOST(host, ptr[BE]);                                                    \
    HOST(host + 8, ptr[!BE]);                                               \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t *ptr = za + off;                                               \
    TLB(env, useronly_clean_ptr(addr), ptr[BE], ra);                        \
    TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra);                   \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                                 target_ulong addr, uintptr_t ra)           \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}
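
/*
 * Note the BE selection above: in the big-endian forms the 64-bit
 * word at the lower memory address is the most-significant half of
 * the 128-bit quantity, so it is placed in (or taken from) ptr[1],
 * preserving the little-endian Elem[] ordering described above.
 */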

DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra)
DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra)
DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra)
DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra)
DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra)
DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra)
DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra)

DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra)
DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra)

DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra)
DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra)
DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra)
DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra)
DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra)
DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra)
DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra)

DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra)
DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra)

#undef DO_LD
#undef DO_ST
#undef DO_LDQ
#undef DO_STQ

/*
 * Common helper for all contiguous predicated loads.
 */

static inline QEMU_ALWAYS_INLINE
void sme_ld1(CPUARMState *env, void *za, uint64_t *vg,
             const target_ulong addr, uint32_t desc, const uintptr_t ra,
             const int esz, uint32_t mtedesc, bool vertical,
             sve_ldst1_host_fn *host_fn,
             sve_ldst1_tlb_fn *tlb_fn,
             ClearFn *clr_fn,
             CopyFn *cpy_fn)
{
    const intptr_t reg_max = simd_oprsz(desc);
    const intptr_t esize = 1 << esz;
    intptr_t reg_off, reg_last;
    SVEContLdSt info;
    void *host;
    int flags;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
        /* The entire predicate was false; no load occurs. */
        clr_fn(za, 0, reg_max);
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
                              BP_MEM_READ, ra);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
                                mtedesc, ra);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  Perform the load
         * into scratch memory to preserve register state until the end.
         */
        ARMVectorReg scratch = { };

        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    tlb_fn(env, &scratch, reg_off, addr + reg_off, ra);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        cpy_fn(za, &scratch, reg_max);
        return;
#endif
    }

    /* The entire operation is in RAM, on valid pages. */

    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;
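
    /*
     * An SME load writes zeros to the inactive elements of the
     * destination slice: a horizontal slice is contiguous in ZA, so
     * it can be cleared up front and then overwritten, while a
     * vertical slice is strided and has its inactive elements
     * cleared piecemeal via clr_fn as the predicate is walked.
     */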
    if (!vertical) {
        memset(za, 0, reg_max);
    } else if (reg_off) {
        clr_fn(za, 0, reg_off);
    }

    set_helper_retaddr(ra);

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                host_fn(za, reg_off, host + reg_off);
            } else if (vertical) {
                clr_fn(za, reg_off, esize);
            }
            reg_off += esize;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    clear_helper_retaddr();

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    reg_off = info.reg_off_split;
    if (unlikely(reg_off >= 0)) {
        tlb_fn(env, za, reg_off, addr + reg_off, ra);
    }

    reg_off = info.reg_off_first[1];
    if (unlikely(reg_off >= 0)) {
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        set_helper_retaddr(ra);

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    host_fn(za, reg_off, host + reg_off);
                } else if (vertical) {
                    clr_fn(za, reg_off, esize);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        clear_helper_retaddr();
    }
}

static inline QEMU_ALWAYS_INLINE
void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg,
                 target_ulong addr, uint32_t desc, uintptr_t ra,
                 const int esz, bool vertical,
                 sve_ldst1_host_fn *host_fn,
                 sve_ldst1_tlb_fn *tlb_fn,
                 ClearFn *clr_fn,
                 CopyFn *cpy_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early. */
    if (!tbi_check(mtedesc, bit55) ||
        tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical,
            host_fn, tlb_fn, clr_fn, cpy_fn);
}
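
/*
 * The macro below expands to four helpers per element size and
 * endianness: _h and _v for horizontal and vertical slices, plus an
 * _mte form of each.  Horizontal slices are contiguous in ZA, so the
 * _h forms reuse the SVE host/tlb accessors directly; the _v forms
 * go through the strided sme_*_v accessors defined above.
 */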
#define DO_LD(L, END, ESZ)                                                  \
void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg,      \
                                 target_ulong addr, uint32_t desc)          \
{                                                                           \
    sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,                \
            sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,            \
            clear_horizontal, copy_horizontal);                             \
}                                                                           \
void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg,      \
                                 target_ulong addr, uint32_t desc)          \
{                                                                           \
    sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                 \
            sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,              \
            clear_vertical_##L, copy_vertical_##L);                         \
}                                                                           \
void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg,  \
                                     target_ulong addr, uint32_t desc)      \
{                                                                           \
    sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,               \
                sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,        \
                clear_horizontal, copy_horizontal);                         \
}                                                                           \
void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg,  \
                                     target_ulong addr, uint32_t desc)      \
{                                                                           \
    sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,                \
                sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,          \
                clear_vertical_##L, copy_vertical_##L);                     \
}

DO_LD(b, , MO_8)
DO_LD(h, _be, MO_16)
DO_LD(h, _le, MO_16)
DO_LD(s, _be, MO_32)
DO_LD(s, _le, MO_32)
DO_LD(d, _be, MO_64)
DO_LD(d, _le, MO_64)
DO_LD(q, _be, MO_128)
DO_LD(q, _le, MO_128)

#undef DO_LD

/*
 * Common helper for all contiguous predicated stores.
 */

static inline QEMU_ALWAYS_INLINE
void sme_st1(CPUARMState *env, void *za, uint64_t *vg,
             const target_ulong addr, uint32_t desc, const uintptr_t ra,
             const int esz, uint32_t mtedesc, bool vertical,
             sve_ldst1_host_fn *host_fn,
             sve_ldst1_tlb_fn *tlb_fn)
{
    const intptr_t reg_max = simd_oprsz(desc);
    const intptr_t esize = 1 << esz;
    intptr_t reg_off, reg_last;
    SVEContLdSt info;
    void *host;
    int flags;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
        /* The entire predicate was false; no store occurs. */
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
                              BP_MEM_WRITE, ra);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
                                mtedesc, ra);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  We cannot avoid
         * this fault and will leave with the store incomplete.
         */
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    tlb_fn(env, za, reg_off, addr + reg_off, ra);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
        return;
#endif
    }
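
    /*
     * The entire operation is in RAM, on valid pages.  Unlike the
     * load helper there is nothing to clear here: inactive elements
     * modify neither memory nor the ZA contents.
     */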
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    set_helper_retaddr(ra);

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                host_fn(za, reg_off, host + reg_off);
            }
            reg_off += esize;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    clear_helper_retaddr();

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    reg_off = info.reg_off_split;
    if (unlikely(reg_off >= 0)) {
        tlb_fn(env, za, reg_off, addr + reg_off, ra);
    }

    reg_off = info.reg_off_first[1];
    if (unlikely(reg_off >= 0)) {
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        set_helper_retaddr(ra);

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    host_fn(za, reg_off, host + reg_off);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        clear_helper_retaddr();
    }
}

static inline QEMU_ALWAYS_INLINE
void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr,
                 uint32_t desc, uintptr_t ra, int esz, bool vertical,
                 sve_ldst1_host_fn *host_fn,
                 sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early. */
    if (!tbi_check(mtedesc, bit55) ||
        tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc,
            vertical, host_fn, tlb_fn);
}

#define DO_ST(L, END, ESZ)                                                  \
void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg,      \
                                 target_ulong addr, uint32_t desc)          \
{                                                                           \
    sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,                \
            sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);           \
}                                                                           \
void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg,      \
                                 target_ulong addr, uint32_t desc)          \
{                                                                           \
    sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                 \
            sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);             \
}                                                                           \
void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg,  \
                                     target_ulong addr, uint32_t desc)      \
{                                                                           \
    sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,               \
                sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);       \
}                                                                           \
void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg,  \
                                     target_ulong addr, uint32_t desc)      \
{                                                                           \
    sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,                \
                sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);         \
}

DO_ST(b, , MO_8)
DO_ST(h, _be, MO_16)
DO_ST(h, _le, MO_16)
DO_ST(s, _be, MO_32)
DO_ST(s, _le, MO_32)
DO_ST(d, _be, MO_64)
DO_ST(d, _le, MO_64)
DO_ST(q, _be, MO_128)
DO_ST(q, _le, MO_128)

#undef DO_ST
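
/*
 * ADDHA/ADDVA: sum a Zreg vector into each horizontal (ADDHA) or
 * vertical (ADDVA) slice of a tile, under both row and column
 * predicates.  The predicates again carry one bit per vector byte,
 * so for 32-bit elements the governing bit advances by 4 per element
 * ("pa >>= 4" below, 16 elements per 64-bit predicate word), while
 * for 64-bit elements each predicate byte covers exactly one element
 * and only bit 0 of it is tested.
 */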
void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn,
                         void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
    uint64_t *pn = vpn, *pm = vpm;
    uint32_t *zda = vzda, *zn = vzn;

    for (row = 0; row < oprsz; ) {
        uint64_t pa = pn[row >> 4];
        do {
            if (pa & 1) {
                for (col = 0; col < oprsz; ) {
                    uint64_t pb = pm[col >> 4];
                    do {
                        if (pb & 1) {
                            zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)];
                        }
                        pb >>= 4;
                    } while (++col & 15);
                }
            }
            pa >>= 4;
        } while (++row & 15);
    }
}

void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn,
                         void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    uint8_t *pn = vpn, *pm = vpm;
    uint64_t *zda = vzda, *zn = vzn;

    for (row = 0; row < oprsz; ++row) {
        if (pn[H1(row)] & 1) {
            for (col = 0; col < oprsz; ++col) {
                if (pm[H1(col)] & 1) {
                    zda[tile_vslice_index(row) + col] += zn[col];
                }
            }
        }
    }
}

void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn,
                         void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
    uint64_t *pn = vpn, *pm = vpm;
    uint32_t *zda = vzda, *zn = vzn;

    for (row = 0; row < oprsz; ) {
        uint64_t pa = pn[row >> 4];
        do {
            if (pa & 1) {
                uint32_t zn_row = zn[H4(row)];
                for (col = 0; col < oprsz; ) {
                    uint64_t pb = pm[col >> 4];
                    do {
                        if (pb & 1) {
                            zda[tile_vslice_index(row) + H4(col)] += zn_row;
                        }
                        pb >>= 4;
                    } while (++col & 15);
                }
            }
            pa >>= 4;
        } while (++row & 15);
    }
}

void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn,
                         void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    uint8_t *pn = vpn, *pm = vpm;
    uint64_t *zda = vzda, *zn = vzn;

    for (row = 0; row < oprsz; ++row) {
        if (pn[H1(row)] & 1) {
            uint64_t zn_row = zn[row];
            for (col = 0; col < oprsz; ++col) {
                if (pm[H1(col)] & 1) {
                    zda[tile_vslice_index(row) + col] += zn_row;
                }
            }
        }
    }
}
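
/*
 * Floating-point outer products.  simd_data(desc) carries the
 * "subtract" (FMOPS) flag: shifted into the sign-bit position and
 * xor'd into each zn element it negates the products, so the same
 * helper implements both the accumulating and subtracting forms.
 */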
void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
                         void *vpm, float_status *fpst_in, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_maxsz(desc);
    uint32_t neg = simd_data(desc) << 31;
    uint16_t *pn = vpn, *pm = vpm;
    float_status fpst;

    /*
     * Make a copy of float_status because this operation does not
     * update the cumulative fp exception status.  It also produces
     * default nans.
     */
    fpst = *fpst_in;
    set_default_nan_mode(true, &fpst);

    for (row = 0; row < oprsz; ) {
        uint16_t pa = pn[H2(row >> 4)];
        do {
            if (pa & 1) {
                void *vza_row = vza + tile_vslice_offset(row);
                uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg;

                for (col = 0; col < oprsz; ) {
                    uint16_t pb = pm[H2(col >> 4)];
                    do {
                        if (pb & 1) {
                            uint32_t *a = vza_row + H1_4(col);
                            uint32_t *m = vzm + H1_4(col);
                            *a = float32_muladd(n, *m, *a, 0, &fpst);
                        }
                        col += 4;
                        pb >>= 4;
                    } while (col & 15);
                }
            }
            row += 4;
            pa >>= 4;
        } while (row & 15);
    }
}

void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn,
                         void *vpm, float_status *fpst_in, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    uint64_t neg = (uint64_t)simd_data(desc) << 63;
    uint64_t *za = vza, *zn = vzn, *zm = vzm;
    uint8_t *pn = vpn, *pm = vpm;
    float_status fpst = *fpst_in;

    set_default_nan_mode(true, &fpst);

    for (row = 0; row < oprsz; ++row) {
        if (pn[H1(row)] & 1) {
            uint64_t *za_row = &za[tile_vslice_index(row)];
            uint64_t n = zn[row] ^ neg;

            for (col = 0; col < oprsz; ++col) {
                if (pm[H1(col)] & 1) {
                    uint64_t *a = &za_row[col];
                    *a = float64_muladd(n, zm[col], *a, 0, &fpst);
                }
            }
        }
    }
}

/*
 * Alter PAIR as needed for controlling predicates being false,
 * and for NEG on an enabled row element.
 */
static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
{
    /*
     * The pseudocode uses a conditional negate after the conditional zero.
     * It is simpler here to unconditionally negate before conditional zero.
     */
    pair ^= neg;
    if (!(pg & 1)) {
        pair &= 0xffff0000u;
    }
    if (!(pg & 4)) {
        pair &= 0x0000ffffu;
    }
    return pair;
}

static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
                          float_status *s_f16, float_status *s_std,
                          float_status *s_odd)
{
    /*
     * We need three different float_status for different parts of this
     * operation:
     *  - the input conversion of the float16 values must use the
     *    f16-specific float_status, so that the FPCR.FZ16 control is
     *    applied
     *  - operations on float32 including the final accumulation must use
     *    the normal float_status, so that FPCR.FZ is applied
     *  - we have a pre-set-up copy of s_std which is set to round-to-odd,
     *    for the multiply (see below)
     */
    float64 e1r = float16_to_float64(e1 & 0xffff, true, s_f16);
    float64 e1c = float16_to_float64(e1 >> 16, true, s_f16);
    float64 e2r = float16_to_float64(e2 & 0xffff, true, s_f16);
    float64 e2c = float16_to_float64(e2 >> 16, true, s_f16);
    float64 t64;
    float32 t32;

    /*
     * The ARM pseudocode function FPDot performs both multiplies
     * and the add with a single rounding operation.  Emulate this
     * by performing the first multiply in round-to-odd, then doing
     * the second multiply as fused multiply-add, and rounding to
     * float32 all in one step.
     */
    t64 = float64_mul(e1r, e2r, s_odd);
    t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);

    /* This conversion is exact, because we've already rounded. */
    t32 = float64_to_float32(t64, s_std);

    /* The final accumulation step is not fused. */
    return float32_add(sum, t32, s_std);
}
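
/*
 * The fp16 outer-product helpers below operate on pairs: each
 * uint32_t holds two float16 elements.  Predicate bit 0 governs the
 * low half of the pair and bit 2 the high half (hence the tests
 * against 0b0101), and f16mop_adj_pair above zeroes whichever half
 * of a pair is inactive after applying the optional negation.
 */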
void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
                         void *vpm, CPUARMState *env, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_maxsz(desc);
    uint32_t neg = simd_data(desc) * 0x80008000u;
    uint16_t *pn = vpn, *pm = vpm;
    float_status fpst_odd, fpst_std, fpst_f16;

    /*
     * Make copies of the fp status fields we use, because this operation
     * does not update the cumulative fp exception status.  It also
     * produces default NaNs.  We also need a second copy of fp_status with
     * round-to-odd -- see above.
     */
    fpst_f16 = env->vfp.fp_status[FPST_A64_F16];
    fpst_std = env->vfp.fp_status[FPST_A64];
    set_default_nan_mode(true, &fpst_std);
    set_default_nan_mode(true, &fpst_f16);
    fpst_odd = fpst_std;
    set_float_rounding_mode(float_round_to_odd, &fpst_odd);

    for (row = 0; row < oprsz; ) {
        uint16_t prow = pn[H2(row >> 4)];
        do {
            void *vza_row = vza + tile_vslice_offset(row);
            uint32_t n = *(uint32_t *)(vzn + H1_4(row));

            n = f16mop_adj_pair(n, prow, neg);

            for (col = 0; col < oprsz; ) {
                uint16_t pcol = pm[H2(col >> 4)];
                do {
                    if (prow & pcol & 0b0101) {
                        uint32_t *a = vza_row + H1_4(col);
                        uint32_t m = *(uint32_t *)(vzm + H1_4(col));

                        m = f16mop_adj_pair(m, pcol, 0);
                        *a = f16_dotadd(*a, n, m,
                                        &fpst_f16, &fpst_std, &fpst_odd);
                    }
                    col += 4;
                    pcol >>= 4;
                } while (col & 15);
            }
            row += 4;
            prow >>= 4;
        } while (row & 15);
    }
}

void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm,
                        void *vpn, void *vpm, CPUARMState *env, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_maxsz(desc);
    uint32_t neg = simd_data(desc) * 0x80008000u;
    uint16_t *pn = vpn, *pm = vpm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (row = 0; row < oprsz; ) {
            uint16_t prow = pn[H2(row >> 4)];
            do {
                void *vza_row = vza + tile_vslice_offset(row);
                uint32_t n = *(uint32_t *)(vzn + H1_4(row));

                n = f16mop_adj_pair(n, prow, neg);

                for (col = 0; col < oprsz; ) {
                    uint16_t pcol = pm[H2(col >> 4)];
                    do {
                        if (prow & pcol & 0b0101) {
                            uint32_t *a = vza_row + H1_4(col);
                            uint32_t m = *(uint32_t *)(vzm + H1_4(col));

                            m = f16mop_adj_pair(m, pcol, 0);
                            *a = bfdotadd_ebf(*a, n, m, &fpst, &fpst_odd);
                        }
                        col += 4;
                        pcol >>= 4;
                    } while (col & 15);
                }
                row += 4;
                prow >>= 4;
            } while (row & 15);
        }
    } else {
        for (row = 0; row < oprsz; ) {
            uint16_t prow = pn[H2(row >> 4)];
            do {
                void *vza_row = vza + tile_vslice_offset(row);
                uint32_t n = *(uint32_t *)(vzn + H1_4(row));

                n = f16mop_adj_pair(n, prow, neg);

                for (col = 0; col < oprsz; ) {
                    uint16_t pcol = pm[H2(col >> 4)];
                    do {
                        if (prow & pcol & 0b0101) {
                            uint32_t *a = vza_row + H1_4(col);
                            uint32_t m = *(uint32_t *)(vzm + H1_4(col));

                            m = f16mop_adj_pair(m, pcol, 0);
                            *a = bfdotadd(*a, n, m, &fpst);
                        }
                        col += 4;
                        pcol >>= 4;
                    } while (col & 15);
                }
                row += 4;
                prow >>= 4;
            } while (row & 15);
        }
    }
}
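
/*
 * Integer outer products.  Each 32-bit accumulator element sums four
 * 8-bit by 8-bit products, and each 64-bit element sums four 16-bit
 * by 16-bit products; the predicate bits in p select which of the
 * four products contribute, applied to n as a lane mask via
 * expand_pred_b (or expand_pred_h).
 */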
typedef uint32_t IMOPFn32(uint32_t, uint32_t, uint32_t, uint8_t, bool);
static inline void do_imopa_s(uint32_t *za, uint32_t *zn, uint32_t *zm,
                              uint8_t *pn, uint8_t *pm,
                              uint32_t desc, IMOPFn32 *fn)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
    bool neg = simd_data(desc);

    for (row = 0; row < oprsz; ++row) {
        uint8_t pa = (pn[H1(row >> 1)] >> ((row & 1) * 4)) & 0xf;
        uint32_t *za_row = &za[tile_vslice_index(row)];
        uint32_t n = zn[H4(row)];

        for (col = 0; col < oprsz; ++col) {
            uint8_t pb = pm[H1(col >> 1)] >> ((col & 1) * 4);
            uint32_t *a = &za_row[H4(col)];

            *a = fn(n, zm[H4(col)], *a, pa & pb, neg);
        }
    }
}

typedef uint64_t IMOPFn64(uint64_t, uint64_t, uint64_t, uint8_t, bool);
static inline void do_imopa_d(uint64_t *za, uint64_t *zn, uint64_t *zm,
                              uint8_t *pn, uint8_t *pm,
                              uint32_t desc, IMOPFn64 *fn)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    bool neg = simd_data(desc);

    for (row = 0; row < oprsz; ++row) {
        uint8_t pa = pn[H1(row)];
        uint64_t *za_row = &za[tile_vslice_index(row)];
        uint64_t n = zn[row];

        for (col = 0; col < oprsz; ++col) {
            uint8_t pb = pm[H1(col)];
            uint64_t *a = &za_row[col];

            *a = fn(n, zm[col], *a, pa & pb, neg);
        }
    }
}

#define DEF_IMOP_32(NAME, NTYPE, MTYPE)                                     \
static uint32_t NAME(uint32_t n, uint32_t m, uint32_t a, uint8_t p, bool neg) \
{                                                                           \
    uint32_t sum = 0;                                                       \
    /* Apply P to N as a mask, making the inactive elements 0. */           \
    n &= expand_pred_b(p);                                                  \
    sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0);                               \
    sum += (NTYPE)(n >> 8) * (MTYPE)(m >> 8);                               \
    sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16);                             \
    sum += (NTYPE)(n >> 24) * (MTYPE)(m >> 24);                             \
    return neg ? a - sum : a + sum;                                         \
}

#define DEF_IMOP_64(NAME, NTYPE, MTYPE)                                     \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint64_t sum = 0;                                                       \
    /* Apply P to N as a mask, making the inactive elements 0. */           \
    n &= expand_pred_h(p);                                                  \
    sum += (int64_t)(NTYPE)(n >> 0) * (MTYPE)(m >> 0);                      \
    sum += (int64_t)(NTYPE)(n >> 16) * (MTYPE)(m >> 16);                    \
    sum += (int64_t)(NTYPE)(n >> 32) * (MTYPE)(m >> 32);                    \
    sum += (int64_t)(NTYPE)(n >> 48) * (MTYPE)(m >> 48);                    \
    return neg ? a - sum : a + sum;                                         \
}

DEF_IMOP_32(smopa_s, int8_t, int8_t)
DEF_IMOP_32(umopa_s, uint8_t, uint8_t)
DEF_IMOP_32(sumopa_s, int8_t, uint8_t)
DEF_IMOP_32(usmopa_s, uint8_t, int8_t)

DEF_IMOP_64(smopa_d, int16_t, int16_t)
DEF_IMOP_64(umopa_d, uint16_t, uint16_t)
DEF_IMOP_64(sumopa_d, int16_t, uint16_t)
DEF_IMOP_64(usmopa_d, uint16_t, int16_t)

#define DEF_IMOPH(NAME, S)                                                  \
    void HELPER(sme_##NAME##_##S)(void *vza, void *vzn, void *vzm,          \
                                  void *vpn, void *vpm, uint32_t desc)      \
    { do_imopa_##S(vza, vzn, vzm, vpn, vpm, desc, NAME##_##S); }

DEF_IMOPH(smopa, s)
DEF_IMOPH(umopa, s)
DEF_IMOPH(sumopa, s)
DEF_IMOPH(usmopa, s)

DEF_IMOPH(smopa, d)
DEF_IMOPH(umopa, d)
DEF_IMOPH(sumopa, d)
DEF_IMOPH(usmopa, d)