/*
 * AArch64 SME translation
 *
 * Copyright (c) 2022 Linaro, Ltd
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "translate.h"
#include "translate-a64.h"

/*
 * Include the generated decoder.
 */

#include "decode-sme.c.inc"


/*
 * Resolve tile.size[index] to a host pointer, where tile and index
 * are always decoded together, dependent on the element size.
 */
static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs,
                                int tile_index, bool vertical)
{
    int tile = tile_index >> (4 - esz);
    int index = esz == MO_128 ? 0 : extract32(tile_index, 0, 4 - esz);
    int pos, len, offset;
    TCGv_i32 tmp;
    TCGv_ptr addr;

    /* Compute the final index, which is Rs+imm. */
    tmp = tcg_temp_new_i32();
    tcg_gen_trunc_tl_i32(tmp, cpu_reg(s, rs));
    tcg_gen_addi_i32(tmp, tmp, index);

    /* Prepare a power-of-two modulo via extraction of @len bits. */
    len = ctz32(streaming_vec_reg_size(s)) - esz;

    if (!len) {
        /*
         * SVL is 128 and the element size is 128.  There is exactly
         * one 128x128 tile in the ZA storage, and so we calculate
         * (Rs + imm) MOD 1, which is always 0.  We need to special case
         * this because TCG doesn't allow deposit ops with len 0.
         */
        tcg_gen_movi_i32(tmp, 0);
    } else if (vertical) {
        /*
         * Compute the byte offset of the index within the tile:
         *     (index % (svl / size)) * size
         *   = (index % (svl >> esz)) << esz
         * Perform the power-of-two modulo via extraction of the low @len bits.
         * Perform the multiply by shifting left by @pos bits.
         * Perform these operations simultaneously via deposit into zero.
         */
        pos = esz;
        tcg_gen_deposit_z_i32(tmp, tmp, pos, len);

        /*
         * For big-endian, adjust the indexed column byte offset within
         * the uint64_t host words that make up env->zarray[].
         */
        if (HOST_BIG_ENDIAN && esz < MO_64) {
            tcg_gen_xori_i32(tmp, tmp, 8 - (1 << esz));
        }
    } else {
        /*
         * Compute the byte offset of the index within the tile:
         *     (index % (svl / size)) * (size * sizeof(row))
         *   = (index % (svl >> esz)) << (esz + log2(sizeof(row)))
         */
        pos = esz + ctz32(sizeof(ARMVectorReg));
        tcg_gen_deposit_z_i32(tmp, tmp, pos, len);

        /* Row slices are always aligned and need no endian adjustment. */
    }

    /* The tile byte offset within env->zarray is the row. */
    offset = tile * sizeof(ARMVectorReg);

    /* Include the byte offset of zarray to make this relative to env. */
    offset += offsetof(CPUARMState, zarray);
    tcg_gen_addi_i32(tmp, tmp, offset);

    /* Add the byte offset to env to produce the final pointer. */
    addr = tcg_temp_new_ptr();
    tcg_gen_ext_i32_ptr(addr, tmp);
    tcg_gen_add_ptr(addr, addr, tcg_env);

    return addr;
}
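
/*
 * Worked example for get_tile_rowcol above (a sketch; numbers assume
 * ARM_MAX_VQ == 16 so that sizeof(ARMVectorReg) == 256, and an SVL of
 * 512 bits so that streaming_vec_reg_size() == 64):
 * for esz == MO_32, len = ctz32(64) - 2 = 4, so the slice index is
 * reduced modulo 16.  A horizontal slice uses pos = 2 + ctz32(256) = 10,
 * giving a byte offset of (index % 16) * 1024 within the tile, i.e.
 * steps of four zarray rows; a vertical slice uses pos = 2, giving
 * (index % 16) * 4, the byte offset of the column within each row.
 */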

/*
 * Resolve tile.size[0] to a host pointer.
 * Used by e.g. outer product insns where we require the entire tile.
 */
static TCGv_ptr get_tile(DisasContext *s, int esz, int tile)
{
    TCGv_ptr addr = tcg_temp_new_ptr();
    int offset;

    offset = tile * sizeof(ARMVectorReg) + offsetof(CPUARMState, zarray);

    tcg_gen_addi_ptr(addr, tcg_env, offset);
    return addr;
}

static bool trans_ZERO(DisasContext *s, arg_ZERO *a)
{
    if (!dc_isar_feature(aa64_sme, s)) {
        return false;
    }
    if (sme_za_enabled_check(s)) {
        gen_helper_sme_zero(tcg_env, tcg_constant_i32(a->imm),
                            tcg_constant_i32(streaming_vec_reg_size(s)));
    }
    return true;
}

static bool trans_MOVA(DisasContext *s, arg_MOVA *a)
{
    static gen_helper_gvec_4 * const h_fns[5] = {
        gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h,
        gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d,
        gen_helper_sve_sel_zpzz_q
    };
    static gen_helper_gvec_3 * const cz_fns[5] = {
        gen_helper_sme_mova_cz_b, gen_helper_sme_mova_cz_h,
        gen_helper_sme_mova_cz_s, gen_helper_sme_mova_cz_d,
        gen_helper_sme_mova_cz_q,
    };
    static gen_helper_gvec_3 * const zc_fns[5] = {
        gen_helper_sme_mova_zc_b, gen_helper_sme_mova_zc_h,
        gen_helper_sme_mova_zc_s, gen_helper_sme_mova_zc_d,
        gen_helper_sme_mova_zc_q,
    };

    TCGv_ptr t_za, t_zr, t_pg;
    TCGv_i32 t_desc;
    int svl;

    if (!dc_isar_feature(aa64_sme, s)) {
        return false;
    }
    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
    t_zr = vec_full_reg_ptr(s, a->zr);
    t_pg = pred_full_reg_ptr(s, a->pg);

    svl = streaming_vec_reg_size(s);
    t_desc = tcg_constant_i32(simd_desc(svl, svl, 0));

    if (a->v) {
        /* Vertical slice -- use sme mova helpers. */
        if (a->to_vec) {
            zc_fns[a->esz](t_zr, t_za, t_pg, t_desc);
        } else {
            cz_fns[a->esz](t_za, t_zr, t_pg, t_desc);
        }
    } else {
        /* Horizontal slice -- reuse sve sel helpers. */
        if (a->to_vec) {
            h_fns[a->esz](t_zr, t_za, t_zr, t_pg, t_desc);
        } else {
            h_fns[a->esz](t_za, t_zr, t_za, t_pg, t_desc);
        }
    }
    return true;
}

static bool trans_LDST1(DisasContext *s, arg_LDST1 *a)
{
    typedef void GenLdSt1(TCGv_env, TCGv_ptr, TCGv_ptr, TCGv, TCGv_i32);

    /*
     * Indexed by [esz][be][v][mte][st], which is (except for load/store)
     * also the order in which the elements appear in the function names,
     * and so how we must concatenate the pieces.
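     *
     * For example, fns[MO_16][be=1][v=0][mte=1][st=0] concatenates to
     * gen_helper_sme_ld1h_be_h_mte: a load of big-endian halfwords into
     * a horizontal slice, with MTE checks.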
     */

#define FN_LS(F)     { gen_helper_sme_ld1##F, gen_helper_sme_st1##F }
#define FN_MTE(F)    { FN_LS(F), FN_LS(F##_mte) }
#define FN_HV(F)     { FN_MTE(F##_h), FN_MTE(F##_v) }
#define FN_END(L, B) { FN_HV(L), FN_HV(B) }

    static GenLdSt1 * const fns[5][2][2][2][2] = {
        FN_END(b, b),
        FN_END(h_le, h_be),
        FN_END(s_le, s_be),
        FN_END(d_le, d_be),
        FN_END(q_le, q_be),
    };

#undef FN_LS
#undef FN_MTE
#undef FN_HV
#undef FN_END

    TCGv_ptr t_za, t_pg;
    TCGv_i64 addr;
    uint32_t desc;
    bool be = s->be_data == MO_BE;
    bool mte = s->mte_active[0];

    if (!dc_isar_feature(aa64_sme, s)) {
        return false;
    }
    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
    t_pg = pred_full_reg_ptr(s, a->pg);
    addr = tcg_temp_new_i64();

    tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->esz);
    tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));

    if (!mte) {
        addr = clean_data_tbi(s, addr);
    }

    desc = make_svemte_desc(s, streaming_vec_reg_size(s), 1, a->esz, a->st, 0);

    fns[a->esz][be][a->v][mte][a->st](tcg_env, t_za, t_pg, addr,
                                      tcg_constant_i32(desc));
    return true;
}

typedef void GenLdStR(DisasContext *, TCGv_ptr, int, int, int, int);

static bool do_ldst_r(DisasContext *s, arg_ldstr *a, GenLdStR *fn)
{
    int svl = streaming_vec_reg_size(s);
    int imm = a->imm;
    TCGv_ptr base;

    if (!sme_za_enabled_check(s)) {
        return true;
    }

    /* ZA[n] equates to ZA0H.B[n]. */
    base = get_tile_rowcol(s, MO_8, a->rv, imm, false);

    fn(s, base, 0, svl, a->rn, imm * svl);
    return true;
}

TRANS_FEAT(LDR, aa64_sme, do_ldst_r, a, gen_sve_ldr)
TRANS_FEAT(STR, aa64_sme, do_ldst_r, a, gen_sve_str)

static bool do_adda(DisasContext *s, arg_adda *a, MemOp esz,
                    gen_helper_gvec_4 *fn)
{
    int svl = streaming_vec_reg_size(s);
    uint32_t desc = simd_desc(svl, svl, 0);
    TCGv_ptr za, zn, pn, pm;

    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    za = get_tile(s, esz, a->zad);
    zn = vec_full_reg_ptr(s, a->zn);
    pn = pred_full_reg_ptr(s, a->pn);
    pm = pred_full_reg_ptr(s, a->pm);

    fn(za, zn, pn, pm, tcg_constant_i32(desc));
    return true;
}

TRANS_FEAT(ADDHA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addha_s)
TRANS_FEAT(ADDVA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addva_s)
TRANS_FEAT(ADDHA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addha_d)
TRANS_FEAT(ADDVA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addva_d)
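
/*
 * The outer product expanders below differ only in the extra state passed
 * to the helper: do_outprod passes none (used for the integer and BF16
 * outer products), do_outprod_fpst passes an explicit float_status
 * pointer, and do_outprod_env passes the full CPU env.
 */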

static bool do_outprod(DisasContext *s, arg_op *a, MemOp esz,
                       gen_helper_gvec_5 *fn)
{
    int svl = streaming_vec_reg_size(s);
    uint32_t desc = simd_desc(svl, svl, a->sub);
    TCGv_ptr za, zn, zm, pn, pm;

    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    za = get_tile(s, esz, a->zad);
    zn = vec_full_reg_ptr(s, a->zn);
    zm = vec_full_reg_ptr(s, a->zm);
    pn = pred_full_reg_ptr(s, a->pn);
    pm = pred_full_reg_ptr(s, a->pm);

    fn(za, zn, zm, pn, pm, tcg_constant_i32(desc));
    return true;
}

static bool do_outprod_fpst(DisasContext *s, arg_op *a, MemOp esz,
                            ARMFPStatusFlavour e_fpst,
                            gen_helper_gvec_5_ptr *fn)
{
    int svl = streaming_vec_reg_size(s);
    uint32_t desc = simd_desc(svl, svl, a->sub);
    TCGv_ptr za, zn, zm, pn, pm, fpst;

    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    za = get_tile(s, esz, a->zad);
    zn = vec_full_reg_ptr(s, a->zn);
    zm = vec_full_reg_ptr(s, a->zm);
    pn = pred_full_reg_ptr(s, a->pn);
    pm = pred_full_reg_ptr(s, a->pm);
    fpst = fpstatus_ptr(e_fpst);

    fn(za, zn, zm, pn, pm, fpst, tcg_constant_i32(desc));
    return true;
}

static bool do_outprod_env(DisasContext *s, arg_op *a, MemOp esz,
                           gen_helper_gvec_5_ptr *fn)
{
    int svl = streaming_vec_reg_size(s);
    uint32_t desc = simd_desc(svl, svl, a->sub);
    TCGv_ptr za, zn, zm, pn, pm;

    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    za = get_tile(s, esz, a->zad);
    zn = vec_full_reg_ptr(s, a->zn);
    zm = vec_full_reg_ptr(s, a->zm);
    pn = pred_full_reg_ptr(s, a->pn);
    pm = pred_full_reg_ptr(s, a->pm);

    fn(za, zn, zm, pn, pm, tcg_env, tcg_constant_i32(desc));
    return true;
}

TRANS_FEAT(FMOPA_h, aa64_sme, do_outprod_env, a,
           MO_32, gen_helper_sme_fmopa_h)
TRANS_FEAT(FMOPA_s, aa64_sme, do_outprod_fpst, a,
           MO_32, FPST_FPCR, gen_helper_sme_fmopa_s)
TRANS_FEAT(FMOPA_d, aa64_sme_f64f64, do_outprod_fpst, a,
           MO_64, FPST_FPCR, gen_helper_sme_fmopa_d)

/* TODO: FEAT_EBF16 */
TRANS_FEAT(BFMOPA, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_bfmopa)

TRANS_FEAT(SMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_smopa_s)
TRANS_FEAT(UMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_umopa_s)
TRANS_FEAT(SUMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_sumopa_s)
TRANS_FEAT(USMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_usmopa_s)

TRANS_FEAT(SMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_smopa_d)
TRANS_FEAT(UMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_umopa_d)
TRANS_FEAT(SUMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_sumopa_d)
TRANS_FEAT(USMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_usmopa_d)