/*
 * AArch64 SME translation
 *
 * Copyright (c) 2022 Linaro, Ltd
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "translate.h"
#include "translate-a64.h"

/*
 * Include the generated decoder.
 */

#include "decode-sme.c.inc"


/*
 * Resolve tile.size[index] to a host pointer, where tile and index
 * are always decoded together, dependent on the element size.
 *
 * @esz selects the tile element size (MO_8 .. MO_128); @rs is the
 * general register providing the runtime slice index, to which the
 * immediate part of @tile_index is added.  @vertical selects a column
 * slice (strided through zarray) rather than a row slice (contiguous).
 *
 * Returns a freshly allocated TCGv_ptr that at runtime points into
 * env->zarray at the start of the selected slice.
 */
static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs,
                                int tile_index, bool vertical)
{
    /*
     * The decode packs tile number and immediate index into one field:
     * the high bits select the tile, the low (4 - esz) bits the index.
     * For MO_128 there is exactly one element per row, so index is 0.
     */
    int tile = tile_index >> (4 - esz);
    int index = esz == MO_128 ? 0 : extract32(tile_index, 0, 4 - esz);
    int pos, len, offset;
    TCGv_i32 tmp;
    TCGv_ptr addr;

    /* Compute the final index, which is Rs+imm. */
    tmp = tcg_temp_new_i32();
    tcg_gen_trunc_tl_i32(tmp, cpu_reg(s, rs));
    tcg_gen_addi_i32(tmp, tmp, index);

    /* Prepare a power-of-two modulo via extraction of @len bits. */
    len = ctz32(streaming_vec_reg_size(s)) - esz;

    if (vertical) {
        /*
         * Compute the byte offset of the index within the tile:
         *     (index % (svl / size)) * size
         *   = (index % (svl >> esz)) << esz
         * Perform the power-of-two modulo via extraction of the low @len bits.
         * Perform the multiply by shifting left by @pos bits.
         * Perform these operations simultaneously via deposit into zero.
         */
        pos = esz;
        tcg_gen_deposit_z_i32(tmp, tmp, pos, len);

        /*
         * For big-endian, adjust the indexed column byte offset within
         * the uint64_t host words that make up env->zarray[].
         */
        if (HOST_BIG_ENDIAN && esz < MO_64) {
            tcg_gen_xori_i32(tmp, tmp, 8 - (1 << esz));
        }
    } else {
        /*
         * Compute the byte offset of the index within the tile:
         *     (index % (svl / size)) * (size * sizeof(row))
         *   = (index % (svl >> esz)) << (esz + log2(sizeof(row)))
         */
        pos = esz + ctz32(sizeof(ARMVectorReg));
        tcg_gen_deposit_z_i32(tmp, tmp, pos, len);

        /* Row slices are always aligned and need no endian adjustment. */
    }

    /* The tile byte offset within env->zarray is the row. */
    offset = tile * sizeof(ARMVectorReg);

    /* Include the byte offset of zarray to make this relative to env. */
    offset += offsetof(CPUARMState, zarray);
    tcg_gen_addi_i32(tmp, tmp, offset);

    /* Add the byte offset to env to produce the final pointer. */
    addr = tcg_temp_new_ptr();
    tcg_gen_ext_i32_ptr(addr, tmp);
    tcg_gen_add_ptr(addr, addr, cpu_env);

    return addr;
}

/*
 * ZERO: zero a set of ZA tiles.  The 8-bit immediate mask is passed
 * through to the helper, which interprets it per the architecture.
 * Returns false (undecoded) if SME is absent; otherwise true, with the
 * operation suppressed unless ZA storage is enabled.
 */
static bool trans_ZERO(DisasContext *s, arg_ZERO *a)
{
    if (!dc_isar_feature(aa64_sme, s)) {
        return false;
    }
    if (sme_za_enabled_check(s)) {
        gen_helper_sme_zero(cpu_env, tcg_constant_i32(a->imm),
                            tcg_constant_i32(streaming_vec_reg_size(s)));
    }
    return true;
}

/*
 * MOVA: predicated move between a vector register and a ZA tile slice.
 * a->to_vec selects the direction (tile -> Zreg vs Zreg -> tile);
 * a->v selects a vertical (column) slice, which needs the dedicated
 * sme mova helpers, while horizontal (row) slices are contiguous and
 * can reuse the SVE SEL helpers.  Each helper table is indexed by
 * element size (b/h/s/d/q).
 */
static bool trans_MOVA(DisasContext *s, arg_MOVA *a)
{
    static gen_helper_gvec_4 * const h_fns[5] = {
        gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h,
        gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d,
        gen_helper_sve_sel_zpzz_q
    };
    static gen_helper_gvec_3 * const cz_fns[5] = {
        gen_helper_sme_mova_cz_b, gen_helper_sme_mova_cz_h,
        gen_helper_sme_mova_cz_s, gen_helper_sme_mova_cz_d,
        gen_helper_sme_mova_cz_q,
    };
    static gen_helper_gvec_3 * const zc_fns[5] = {
        gen_helper_sme_mova_zc_b, gen_helper_sme_mova_zc_h,
        gen_helper_sme_mova_zc_s, gen_helper_sme_mova_zc_d,
        gen_helper_sme_mova_zc_q,
    };

    TCGv_ptr t_za, t_zr, t_pg;
    TCGv_i32 t_desc;
    int svl;

    if (!dc_isar_feature(aa64_sme, s)) {
        return false;
    }
    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
    t_zr = vec_full_reg_ptr(s, a->zr);
    t_pg = pred_full_reg_ptr(s, a->pg);

    svl = streaming_vec_reg_size(s);
    t_desc = tcg_constant_i32(simd_desc(svl, svl, 0));

    if (a->v) {
        /* Vertical slice -- use sme mova helpers. */
        if (a->to_vec) {
            zc_fns[a->esz](t_zr, t_za, t_pg, t_desc);
        } else {
            cz_fns[a->esz](t_za, t_zr, t_pg, t_desc);
        }
    } else {
        /* Horizontal slice -- reuse sve sel helpers. */
        if (a->to_vec) {
            /* SEL with the tile slice as the "false" operand: inactive
             * elements of the destination Zreg keep their old value. */
            h_fns[a->esz](t_zr, t_za, t_zr, t_pg, t_desc);
        } else {
            h_fns[a->esz](t_za, t_zr, t_za, t_pg, t_desc);
        }
    }
    return true;
}

/*
 * LD1/ST1: predicated contiguous load/store of one ZA tile slice.
 * The address is Xn|SP + (Xm << esz).  The helper is selected from a
 * table indexed by element size, host-vs-guest endianness, slice
 * orientation, MTE activity, and load-vs-store; when MTE is active the
 * memory-tag descriptor is folded into the simd_desc, otherwise any
 * tag bits are stripped from the address here.
 */
static bool trans_LDST1(DisasContext *s, arg_LDST1 *a)
{
    typedef void GenLdSt1(TCGv_env, TCGv_ptr, TCGv_ptr, TCGv, TCGv_i32);

    /*
     * Indexed by [esz][be][v][mte][st], which is (except for load/store)
     * also the order in which the elements appear in the function names,
     * and so how we must concatenate the pieces.
     */

#define FN_LS(F) { gen_helper_sme_ld1##F, gen_helper_sme_st1##F }
#define FN_MTE(F) { FN_LS(F), FN_LS(F##_mte) }
#define FN_HV(F) { FN_MTE(F##_h), FN_MTE(F##_v) }
#define FN_END(L, B) { FN_HV(L), FN_HV(B) }

    static GenLdSt1 * const fns[5][2][2][2][2] = {
        FN_END(b, b),
        FN_END(h_le, h_be),
        FN_END(s_le, s_be),
        FN_END(d_le, d_be),
        FN_END(q_le, q_be),
    };

#undef FN_LS
#undef FN_MTE
#undef FN_HV
#undef FN_END

    TCGv_ptr t_za, t_pg;
    TCGv_i64 addr;
    int svl, desc = 0;
    bool be = s->be_data == MO_BE;
    bool mte = s->mte_active[0];

    if (!dc_isar_feature(aa64_sme, s)) {
        return false;
    }
    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
    t_pg = pred_full_reg_ptr(s, a->pg);
    addr = tcg_temp_new_i64();

    /* addr = Xn|SP + (Xm << esz) -- the offset register is scaled. */
    tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->esz);
    tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));

    if (mte) {
        desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s));
        desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid);
        desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma);
        desc = FIELD_DP32(desc, MTEDESC, WRITE, a->st);
        desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << a->esz) - 1);
        desc <<= SVE_MTEDESC_SHIFT;
    } else {
        /* Without MTE, clear any top-byte tag bits up front. */
        addr = clean_data_tbi(s, addr);
    }
    svl = streaming_vec_reg_size(s);
    desc = simd_desc(svl, svl, desc);

    fns[a->esz][be][a->v][mte][a->st](cpu_env, t_za, t_pg, addr,
                                      tcg_constant_i32(desc));
    return true;
}

typedef void GenLdStR(DisasContext *, TCGv_ptr, int, int, int, int);

/*
 * Common code for LDR/STR of a full ZA array row: @fn is the SVE
 * register load/store generator, pointed at row Rv+imm of ZA and at
 * byte offset imm * svl within the memory operand.
 */
static bool do_ldst_r(DisasContext *s, arg_ldstr *a, GenLdStR *fn)
{
    int svl = streaming_vec_reg_size(s);
    int imm = a->imm;
    TCGv_ptr base;

    if (!sme_za_enabled_check(s)) {
        return true;
    }

    /* ZA[n] equates to ZA0H.B[n]. */
    base = get_tile_rowcol(s, MO_8, a->rv, imm, false);

    fn(s, base, 0, svl, a->rn, imm * svl);
    return true;
}

TRANS_FEAT(LDR, aa64_sme, do_ldst_r, a, gen_sve_ldr)
TRANS_FEAT(STR, aa64_sme, do_ldst_r, a, gen_sve_str)

/*
 * Common code for ADDHA/ADDVA: predicated accumulate of vector Zn
 * into tile ZAd, with Pn/Pm as the row/column predicates.
 */
static bool do_adda(DisasContext *s, arg_adda *a, MemOp esz,
                    gen_helper_gvec_4 *fn)
{
    int svl = streaming_vec_reg_size(s);
    uint32_t desc = simd_desc(svl, svl, 0);
    TCGv_ptr za, zn, pn, pm;

    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    /* Sum XZR+zad to find ZAd. */
    za = get_tile_rowcol(s, esz, 31, a->zad, false);
    zn = vec_full_reg_ptr(s, a->zn);
    pn = pred_full_reg_ptr(s, a->pn);
    pm = pred_full_reg_ptr(s, a->pm);

    fn(za, zn, pn, pm, tcg_constant_i32(desc));
    return true;
}

TRANS_FEAT(ADDHA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addha_s)
TRANS_FEAT(ADDVA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addva_s)
TRANS_FEAT(ADDHA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addha_d)
TRANS_FEAT(ADDVA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addva_d)

/*
 * Common code for the integer/bfloat outer-product accumulate ops
 * (SMOPA/UMOPA/SUMOPA/USMOPA/BFMOPA): ZAd (+|-)= Zn (x) Zm under
 * predicates Pn/Pm.  a->sub (add vs subtract) rides in the simd_desc
 * data field for the helper to decode.
 */
static bool do_outprod(DisasContext *s, arg_op *a, MemOp esz,
                       gen_helper_gvec_5 *fn)
{
    int svl = streaming_vec_reg_size(s);
    uint32_t desc = simd_desc(svl, svl, a->sub);
    TCGv_ptr za, zn, zm, pn, pm;

    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    /* Sum XZR+zad to find ZAd. */
    za = get_tile_rowcol(s, esz, 31, a->zad, false);
    zn = vec_full_reg_ptr(s, a->zn);
    zm = vec_full_reg_ptr(s, a->zm);
    pn = pred_full_reg_ptr(s, a->pn);
    pm = pred_full_reg_ptr(s, a->pm);

    fn(za, zn, zm, pn, pm, tcg_constant_i32(desc));
    return true;
}

/*
 * As do_outprod, but for floating-point outer products, which also
 * need a pointer to the FP status for rounding/exception state.
 */
static bool do_outprod_fpst(DisasContext *s, arg_op *a, MemOp esz,
                            gen_helper_gvec_5_ptr *fn)
{
    int svl = streaming_vec_reg_size(s);
    uint32_t desc = simd_desc(svl, svl, a->sub);
    TCGv_ptr za, zn, zm, pn, pm, fpst;

    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    /* Sum XZR+zad to find ZAd. */
    za = get_tile_rowcol(s, esz, 31, a->zad, false);
    zn = vec_full_reg_ptr(s, a->zn);
    zm = vec_full_reg_ptr(s, a->zm);
    pn = pred_full_reg_ptr(s, a->pn);
    pm = pred_full_reg_ptr(s, a->pm);
    fpst = fpstatus_ptr(FPST_FPCR);

    fn(za, zn, zm, pn, pm, fpst, tcg_constant_i32(desc));
    return true;
}

/* FMOPA_h widens fp16 inputs into an MO_32 tile, hence MO_32 here. */
TRANS_FEAT(FMOPA_h, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_h)
TRANS_FEAT(FMOPA_s, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_s)
TRANS_FEAT(FMOPA_d, aa64_sme_f64f64, do_outprod_fpst, a, MO_64, gen_helper_sme_fmopa_d)

/* TODO: FEAT_EBF16 */
TRANS_FEAT(BFMOPA, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_bfmopa)

TRANS_FEAT(SMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_smopa_s)
TRANS_FEAT(UMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_umopa_s)
TRANS_FEAT(SUMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_sumopa_s)
TRANS_FEAT(USMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_usmopa_s)

TRANS_FEAT(SMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_smopa_d)
TRANS_FEAT(UMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_umopa_d)
TRANS_FEAT(SUMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_sumopa_d)
TRANS_FEAT(USMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64,
           gen_helper_sme_usmopa_d)