/*
 * AArch64 SME translation
 *
 * Copyright (c) 2022 Linaro, Ltd
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "tcg/tcg-op.h"
#include "tcg/tcg-op-gvec.h"
#include "tcg/tcg-gvec-desc.h"
#include "translate.h"
#include "exec/helper-gen.h"
#include "translate-a64.h"
#include "fpu/softfloat.h"


/*
 * Include the generated decoder.
 */

#include "decode-sme.c.inc"


/*
 * Resolve tile.size[index] to a host pointer, where tile and index
 * are always decoded together, dependent on the element size.
 */
static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs,
                                int tile_index, bool vertical)
{
    int tile = tile_index >> (4 - esz);
    int index = esz == MO_128 ? 0 : extract32(tile_index, 0, 4 - esz);
    int pos, len, offset;
    TCGv_i32 tmp;
    TCGv_ptr addr;

    /* Compute the final index, which is Rs+imm. */
    tmp = tcg_temp_new_i32();
    tcg_gen_trunc_tl_i32(tmp, cpu_reg(s, rs));
    tcg_gen_addi_i32(tmp, tmp, index);

    /* Prepare a power-of-two modulo via extraction of @len bits. */
    len = ctz32(streaming_vec_reg_size(s)) - esz;

    if (vertical) {
        /*
         * Compute the byte offset of the index within the tile:
         *     (index % (svl / size)) * size
         *   = (index % (svl >> esz)) << esz
         * Perform the power-of-two modulo via extraction of the low @len bits.
         * Perform the multiply by shifting left by @pos bits.
         * Perform these operations simultaneously via deposit into zero.
         */
        pos = esz;
        tcg_gen_deposit_z_i32(tmp, tmp, pos, len);

        /*
         * For big-endian, adjust the indexed column byte offset within
         * the uint64_t host words that make up env->zarray[].
         */
        if (HOST_BIG_ENDIAN && esz < MO_64) {
            tcg_gen_xori_i32(tmp, tmp, 8 - (1 << esz));
        }
    } else {
        /*
         * Compute the byte offset of the index within the tile:
         *     (index % (svl / size)) * (size * sizeof(row))
         *   = (index % (svl >> esz)) << (esz + log2(sizeof(row)))
         */
        pos = esz + ctz32(sizeof(ARMVectorReg));
        tcg_gen_deposit_z_i32(tmp, tmp, pos, len);

        /* Row slices are always aligned and need no endian adjustment. */
    }

    /* The tile byte offset within env->zarray is the row. */
    offset = tile * sizeof(ARMVectorReg);

    /* Include the byte offset of zarray to make this relative to env. */
    offset += offsetof(CPUARMState, zarray);
    tcg_gen_addi_i32(tmp, tmp, offset);

    /* Add the byte offset to env to produce the final pointer. */
    addr = tcg_temp_new_ptr();
    tcg_gen_ext_i32_ptr(addr, tmp);
    tcg_temp_free_i32(tmp);
    tcg_gen_add_ptr(addr, addr, cpu_env);

    return addr;
}

static bool trans_ZERO(DisasContext *s, arg_ZERO *a)
{
    if (!dc_isar_feature(aa64_sme, s)) {
        return false;
    }
    if (sme_za_enabled_check(s)) {
        gen_helper_sme_zero(cpu_env, tcg_constant_i32(a->imm),
                            tcg_constant_i32(streaming_vec_reg_size(s)));
    }
    return true;
}

static bool trans_MOVA(DisasContext *s, arg_MOVA *a)
{
    static gen_helper_gvec_4 * const h_fns[5] = {
        gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h,
        gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d,
        gen_helper_sve_sel_zpzz_q
    };
    static gen_helper_gvec_3 * const cz_fns[5] = {
        gen_helper_sme_mova_cz_b, gen_helper_sme_mova_cz_h,
        gen_helper_sme_mova_cz_s, gen_helper_sme_mova_cz_d,
        gen_helper_sme_mova_cz_q,
    };
    static gen_helper_gvec_3 * const zc_fns[5] = {
        gen_helper_sme_mova_zc_b, gen_helper_sme_mova_zc_h,
        gen_helper_sme_mova_zc_s, gen_helper_sme_mova_zc_d,
        gen_helper_sme_mova_zc_q,
    };

    TCGv_ptr t_za, t_zr, t_pg;
    TCGv_i32 t_desc;
    int svl;

    if (!dc_isar_feature(aa64_sme, s)) {
        return false;
    }
    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
    t_zr = vec_full_reg_ptr(s, a->zr);
    t_pg = pred_full_reg_ptr(s, a->pg);

    svl = streaming_vec_reg_size(s);
    t_desc = tcg_constant_i32(simd_desc(svl, svl, 0));

    if (a->v) {
        /* Vertical slice -- use sme mova helpers. */
        if (a->to_vec) {
            zc_fns[a->esz](t_zr, t_za, t_pg, t_desc);
        } else {
            cz_fns[a->esz](t_za, t_zr, t_pg, t_desc);
        }
    } else {
        /* Horizontal slice -- reuse sve sel helpers. */
        if (a->to_vec) {
            h_fns[a->esz](t_zr, t_za, t_zr, t_pg, t_desc);
        } else {
            h_fns[a->esz](t_za, t_zr, t_za, t_pg, t_desc);
        }
    }

    tcg_temp_free_ptr(t_za);
    tcg_temp_free_ptr(t_zr);
    tcg_temp_free_ptr(t_pg);

    return true;
}

static bool trans_LDST1(DisasContext *s, arg_LDST1 *a)
{
    typedef void GenLdSt1(TCGv_env, TCGv_ptr, TCGv_ptr, TCGv, TCGv_i32);

    /*
     * Indexed by [esz][be][v][mte][st], which is (except for load/store)
     * also the order in which the elements appear in the function names,
     * and so how we must concatenate the pieces.
     */

#define FN_LS(F)     { gen_helper_sme_ld1##F, gen_helper_sme_st1##F }
#define FN_MTE(F)    { FN_LS(F), FN_LS(F##_mte) }
#define FN_HV(F)     { FN_MTE(F##_h), FN_MTE(F##_v) }
#define FN_END(L, B) { FN_HV(L), FN_HV(B) }

    static GenLdSt1 * const fns[5][2][2][2][2] = {
        FN_END(b, b),
        FN_END(h_le, h_be),
        FN_END(s_le, s_be),
        FN_END(d_le, d_be),
        FN_END(q_le, q_be),
    };

#undef FN_LS
#undef FN_MTE
#undef FN_HV
#undef FN_END

    TCGv_ptr t_za, t_pg;
    TCGv_i64 addr;
    int svl, desc = 0;
    bool be = s->be_data == MO_BE;
    bool mte = s->mte_active[0];

    if (!dc_isar_feature(aa64_sme, s)) {
        return false;
    }
    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
    t_pg = pred_full_reg_ptr(s, a->pg);
    addr = tcg_temp_new_i64();

    tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->esz);
    tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));

    if (mte) {
        desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s));
        desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid);
        desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma);
        desc = FIELD_DP32(desc, MTEDESC, WRITE, a->st);
        desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << a->esz) - 1);
        desc <<= SVE_MTEDESC_SHIFT;
    } else {
        addr = clean_data_tbi(s, addr);
    }
    svl = streaming_vec_reg_size(s);
    desc = simd_desc(svl, svl, desc);

    fns[a->esz][be][a->v][mte][a->st](cpu_env, t_za, t_pg, addr,
                                      tcg_constant_i32(desc));

    tcg_temp_free_ptr(t_za);
    tcg_temp_free_ptr(t_pg);
    tcg_temp_free_i64(addr);
    return true;
}

typedef void GenLdStR(DisasContext *, TCGv_ptr, int, int, int, int);

static bool do_ldst_r(DisasContext *s, arg_ldstr *a, GenLdStR *fn)
{
    int svl = streaming_vec_reg_size(s);
    int imm = a->imm;
    TCGv_ptr base;

    if (!sme_za_enabled_check(s)) {
        return true;
    }

    /* ZA[n] equates to ZA0H.B[n]. */
    base = get_tile_rowcol(s, MO_8, a->rv, imm, false);

    fn(s, base, 0, svl, a->rn, imm * svl);

    tcg_temp_free_ptr(base);
    return true;
}

TRANS_FEAT(LDR, aa64_sme, do_ldst_r, a, gen_sve_ldr)
TRANS_FEAT(STR, aa64_sme, do_ldst_r, a, gen_sve_str)

static bool do_adda(DisasContext *s, arg_adda *a, MemOp esz,
                    gen_helper_gvec_4 *fn)
{
    int svl = streaming_vec_reg_size(s);
    uint32_t desc = simd_desc(svl, svl, 0);
    TCGv_ptr za, zn, pn, pm;

    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    /* Sum XZR+zad to find ZAd. */
    za = get_tile_rowcol(s, esz, 31, a->zad, false);
    zn = vec_full_reg_ptr(s, a->zn);
    pn = pred_full_reg_ptr(s, a->pn);
    pm = pred_full_reg_ptr(s, a->pm);

    fn(za, zn, pn, pm, tcg_constant_i32(desc));

    tcg_temp_free_ptr(za);
    tcg_temp_free_ptr(zn);
    tcg_temp_free_ptr(pn);
    tcg_temp_free_ptr(pm);
    return true;
}

TRANS_FEAT(ADDHA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addha_s)
TRANS_FEAT(ADDVA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addva_s)
TRANS_FEAT(ADDHA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addha_d)
TRANS_FEAT(ADDVA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addva_d)

static bool do_outprod(DisasContext *s, arg_op *a, MemOp esz,
                       gen_helper_gvec_5 *fn)
{
    int svl = streaming_vec_reg_size(s);
    uint32_t desc = simd_desc(svl, svl, a->sub);
    TCGv_ptr za, zn, zm, pn, pm;

    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    /* Sum XZR+zad to find ZAd. */
    za = get_tile_rowcol(s, esz, 31, a->zad, false);
    zn = vec_full_reg_ptr(s, a->zn);
    zm = vec_full_reg_ptr(s, a->zm);
    pn = pred_full_reg_ptr(s, a->pn);
    pm = pred_full_reg_ptr(s, a->pm);

    fn(za, zn, zm, pn, pm, tcg_constant_i32(desc));

    tcg_temp_free_ptr(za);
    tcg_temp_free_ptr(zn);
    tcg_temp_free_ptr(pn);
    tcg_temp_free_ptr(pm);
    return true;
}

static bool do_outprod_fpst(DisasContext *s, arg_op *a, MemOp esz,
                            gen_helper_gvec_5_ptr *fn)
{
    int svl = streaming_vec_reg_size(s);
    uint32_t desc = simd_desc(svl, svl, a->sub);
    TCGv_ptr za, zn, zm, pn, pm, fpst;

    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    /* Sum XZR+zad to find ZAd. */
    za = get_tile_rowcol(s, esz, 31, a->zad, false);
    zn = vec_full_reg_ptr(s, a->zn);
    zm = vec_full_reg_ptr(s, a->zm);
    pn = pred_full_reg_ptr(s, a->pn);
    pm = pred_full_reg_ptr(s, a->pm);
    fpst = fpstatus_ptr(FPST_FPCR);

    fn(za, zn, zm, pn, pm, fpst, tcg_constant_i32(desc));

    tcg_temp_free_ptr(za);
    tcg_temp_free_ptr(zn);
    tcg_temp_free_ptr(pn);
    tcg_temp_free_ptr(pm);
    tcg_temp_free_ptr(fpst);
    return true;
}

TRANS_FEAT(FMOPA_h, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_h)
TRANS_FEAT(FMOPA_s, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_s)
TRANS_FEAT(FMOPA_d, aa64_sme_f64f64, do_outprod_fpst, a, MO_64, gen_helper_sme_fmopa_d)

/* TODO: FEAT_EBF16 */
TRANS_FEAT(BFMOPA, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_bfmopa)

TRANS_FEAT(SMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_smopa_s)
TRANS_FEAT(UMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_umopa_s)
TRANS_FEAT(SUMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_sumopa_s)
TRANS_FEAT(USMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_usmopa_s)

TRANS_FEAT(SMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_smopa_d)
TRANS_FEAT(UMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_umopa_d)
TRANS_FEAT(SUMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_sumopa_d)
TRANS_FEAT(USMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_usmopa_d)