1 /* 2 * AArch64 SME translation 3 * 4 * Copyright (c) 2022 Linaro, Ltd 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "cpu.h" 22 #include "tcg/tcg-op.h" 23 #include "tcg/tcg-op-gvec.h" 24 #include "tcg/tcg-gvec-desc.h" 25 #include "translate.h" 26 #include "translate-a64.h" 27 #include "fpu/softfloat.h" 28 29 30 /* 31 * Include the generated decoder. 32 */ 33 34 #include "decode-sme.c.inc" 35 36 37 /* 38 * Resolve tile.size[index] to a host pointer, where tile and index 39 * are always decoded together, dependent on the element size. 40 */ 41 static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs, 42 int tile_index, bool vertical) 43 { 44 int tile = tile_index >> (4 - esz); 45 int index = esz == MO_128 ? 0 : extract32(tile_index, 0, 4 - esz); 46 int pos, len, offset; 47 TCGv_i32 tmp; 48 TCGv_ptr addr; 49 50 /* Compute the final index, which is Rs+imm. */ 51 tmp = tcg_temp_new_i32(); 52 tcg_gen_trunc_tl_i32(tmp, cpu_reg(s, rs)); 53 tcg_gen_addi_i32(tmp, tmp, index); 54 55 /* Prepare a power-of-two modulo via extraction of @len bits. */ 56 len = ctz32(streaming_vec_reg_size(s)) - esz; 57 58 if (vertical) { 59 /* 60 * Compute the byte offset of the index within the tile: 61 * (index % (svl / size)) * size 62 * = (index % (svl >> esz)) << esz 63 * Perform the power-of-two modulo via extraction of the low @len bits. 64 * Perform the multiply by shifting left by @pos bits. 65 * Perform these operations simultaneously via deposit into zero. 66 */ 67 pos = esz; 68 tcg_gen_deposit_z_i32(tmp, tmp, pos, len); 69 70 /* 71 * For big-endian, adjust the indexed column byte offset within 72 * the uint64_t host words that make up env->zarray[]. 73 */ 74 if (HOST_BIG_ENDIAN && esz < MO_64) { 75 tcg_gen_xori_i32(tmp, tmp, 8 - (1 << esz)); 76 } 77 } else { 78 /* 79 * Compute the byte offset of the index within the tile: 80 * (index % (svl / size)) * (size * sizeof(row)) 81 * = (index % (svl >> esz)) << (esz + log2(sizeof(row))) 82 */ 83 pos = esz + ctz32(sizeof(ARMVectorReg)); 84 tcg_gen_deposit_z_i32(tmp, tmp, pos, len); 85 86 /* Row slices are always aligned and need no endian adjustment. */ 87 } 88 89 /* The tile byte offset within env->zarray is the row. */ 90 offset = tile * sizeof(ARMVectorReg); 91 92 /* Include the byte offset of zarray to make this relative to env. */ 93 offset += offsetof(CPUARMState, zarray); 94 tcg_gen_addi_i32(tmp, tmp, offset); 95 96 /* Add the byte offset to env to produce the final pointer. */ 97 addr = tcg_temp_new_ptr(); 98 tcg_gen_ext_i32_ptr(addr, tmp); 99 tcg_gen_add_ptr(addr, addr, cpu_env); 100 101 return addr; 102 } 103 104 static bool trans_ZERO(DisasContext *s, arg_ZERO *a) 105 { 106 if (!dc_isar_feature(aa64_sme, s)) { 107 return false; 108 } 109 if (sme_za_enabled_check(s)) { 110 gen_helper_sme_zero(cpu_env, tcg_constant_i32(a->imm), 111 tcg_constant_i32(streaming_vec_reg_size(s))); 112 } 113 return true; 114 } 115 116 static bool trans_MOVA(DisasContext *s, arg_MOVA *a) 117 { 118 static gen_helper_gvec_4 * const h_fns[5] = { 119 gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h, 120 gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d, 121 gen_helper_sve_sel_zpzz_q 122 }; 123 static gen_helper_gvec_3 * const cz_fns[5] = { 124 gen_helper_sme_mova_cz_b, gen_helper_sme_mova_cz_h, 125 gen_helper_sme_mova_cz_s, gen_helper_sme_mova_cz_d, 126 gen_helper_sme_mova_cz_q, 127 }; 128 static gen_helper_gvec_3 * const zc_fns[5] = { 129 gen_helper_sme_mova_zc_b, gen_helper_sme_mova_zc_h, 130 gen_helper_sme_mova_zc_s, gen_helper_sme_mova_zc_d, 131 gen_helper_sme_mova_zc_q, 132 }; 133 134 TCGv_ptr t_za, t_zr, t_pg; 135 TCGv_i32 t_desc; 136 int svl; 137 138 if (!dc_isar_feature(aa64_sme, s)) { 139 return false; 140 } 141 if (!sme_smza_enabled_check(s)) { 142 return true; 143 } 144 145 t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v); 146 t_zr = vec_full_reg_ptr(s, a->zr); 147 t_pg = pred_full_reg_ptr(s, a->pg); 148 149 svl = streaming_vec_reg_size(s); 150 t_desc = tcg_constant_i32(simd_desc(svl, svl, 0)); 151 152 if (a->v) { 153 /* Vertical slice -- use sme mova helpers. */ 154 if (a->to_vec) { 155 zc_fns[a->esz](t_zr, t_za, t_pg, t_desc); 156 } else { 157 cz_fns[a->esz](t_za, t_zr, t_pg, t_desc); 158 } 159 } else { 160 /* Horizontal slice -- reuse sve sel helpers. */ 161 if (a->to_vec) { 162 h_fns[a->esz](t_zr, t_za, t_zr, t_pg, t_desc); 163 } else { 164 h_fns[a->esz](t_za, t_zr, t_za, t_pg, t_desc); 165 } 166 } 167 return true; 168 } 169 170 static bool trans_LDST1(DisasContext *s, arg_LDST1 *a) 171 { 172 typedef void GenLdSt1(TCGv_env, TCGv_ptr, TCGv_ptr, TCGv, TCGv_i32); 173 174 /* 175 * Indexed by [esz][be][v][mte][st], which is (except for load/store) 176 * also the order in which the elements appear in the function names, 177 * and so how we must concatenate the pieces. 178 */ 179 180 #define FN_LS(F) { gen_helper_sme_ld1##F, gen_helper_sme_st1##F } 181 #define FN_MTE(F) { FN_LS(F), FN_LS(F##_mte) } 182 #define FN_HV(F) { FN_MTE(F##_h), FN_MTE(F##_v) } 183 #define FN_END(L, B) { FN_HV(L), FN_HV(B) } 184 185 static GenLdSt1 * const fns[5][2][2][2][2] = { 186 FN_END(b, b), 187 FN_END(h_le, h_be), 188 FN_END(s_le, s_be), 189 FN_END(d_le, d_be), 190 FN_END(q_le, q_be), 191 }; 192 193 #undef FN_LS 194 #undef FN_MTE 195 #undef FN_HV 196 #undef FN_END 197 198 TCGv_ptr t_za, t_pg; 199 TCGv_i64 addr; 200 int svl, desc = 0; 201 bool be = s->be_data == MO_BE; 202 bool mte = s->mte_active[0]; 203 204 if (!dc_isar_feature(aa64_sme, s)) { 205 return false; 206 } 207 if (!sme_smza_enabled_check(s)) { 208 return true; 209 } 210 211 t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v); 212 t_pg = pred_full_reg_ptr(s, a->pg); 213 addr = tcg_temp_new_i64(); 214 215 tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->esz); 216 tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); 217 218 if (mte) { 219 desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); 220 desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); 221 desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); 222 desc = FIELD_DP32(desc, MTEDESC, WRITE, a->st); 223 desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << a->esz) - 1); 224 desc <<= SVE_MTEDESC_SHIFT; 225 } else { 226 addr = clean_data_tbi(s, addr); 227 } 228 svl = streaming_vec_reg_size(s); 229 desc = simd_desc(svl, svl, desc); 230 231 fns[a->esz][be][a->v][mte][a->st](cpu_env, t_za, t_pg, addr, 232 tcg_constant_i32(desc)); 233 return true; 234 } 235 236 typedef void GenLdStR(DisasContext *, TCGv_ptr, int, int, int, int); 237 238 static bool do_ldst_r(DisasContext *s, arg_ldstr *a, GenLdStR *fn) 239 { 240 int svl = streaming_vec_reg_size(s); 241 int imm = a->imm; 242 TCGv_ptr base; 243 244 if (!sme_za_enabled_check(s)) { 245 return true; 246 } 247 248 /* ZA[n] equates to ZA0H.B[n]. */ 249 base = get_tile_rowcol(s, MO_8, a->rv, imm, false); 250 251 fn(s, base, 0, svl, a->rn, imm * svl); 252 return true; 253 } 254 255 TRANS_FEAT(LDR, aa64_sme, do_ldst_r, a, gen_sve_ldr) 256 TRANS_FEAT(STR, aa64_sme, do_ldst_r, a, gen_sve_str) 257 258 static bool do_adda(DisasContext *s, arg_adda *a, MemOp esz, 259 gen_helper_gvec_4 *fn) 260 { 261 int svl = streaming_vec_reg_size(s); 262 uint32_t desc = simd_desc(svl, svl, 0); 263 TCGv_ptr za, zn, pn, pm; 264 265 if (!sme_smza_enabled_check(s)) { 266 return true; 267 } 268 269 /* Sum XZR+zad to find ZAd. */ 270 za = get_tile_rowcol(s, esz, 31, a->zad, false); 271 zn = vec_full_reg_ptr(s, a->zn); 272 pn = pred_full_reg_ptr(s, a->pn); 273 pm = pred_full_reg_ptr(s, a->pm); 274 275 fn(za, zn, pn, pm, tcg_constant_i32(desc)); 276 return true; 277 } 278 279 TRANS_FEAT(ADDHA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addha_s) 280 TRANS_FEAT(ADDVA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addva_s) 281 TRANS_FEAT(ADDHA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addha_d) 282 TRANS_FEAT(ADDVA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addva_d) 283 284 static bool do_outprod(DisasContext *s, arg_op *a, MemOp esz, 285 gen_helper_gvec_5 *fn) 286 { 287 int svl = streaming_vec_reg_size(s); 288 uint32_t desc = simd_desc(svl, svl, a->sub); 289 TCGv_ptr za, zn, zm, pn, pm; 290 291 if (!sme_smza_enabled_check(s)) { 292 return true; 293 } 294 295 /* Sum XZR+zad to find ZAd. */ 296 za = get_tile_rowcol(s, esz, 31, a->zad, false); 297 zn = vec_full_reg_ptr(s, a->zn); 298 zm = vec_full_reg_ptr(s, a->zm); 299 pn = pred_full_reg_ptr(s, a->pn); 300 pm = pred_full_reg_ptr(s, a->pm); 301 302 fn(za, zn, zm, pn, pm, tcg_constant_i32(desc)); 303 return true; 304 } 305 306 static bool do_outprod_fpst(DisasContext *s, arg_op *a, MemOp esz, 307 gen_helper_gvec_5_ptr *fn) 308 { 309 int svl = streaming_vec_reg_size(s); 310 uint32_t desc = simd_desc(svl, svl, a->sub); 311 TCGv_ptr za, zn, zm, pn, pm, fpst; 312 313 if (!sme_smza_enabled_check(s)) { 314 return true; 315 } 316 317 /* Sum XZR+zad to find ZAd. */ 318 za = get_tile_rowcol(s, esz, 31, a->zad, false); 319 zn = vec_full_reg_ptr(s, a->zn); 320 zm = vec_full_reg_ptr(s, a->zm); 321 pn = pred_full_reg_ptr(s, a->pn); 322 pm = pred_full_reg_ptr(s, a->pm); 323 fpst = fpstatus_ptr(FPST_FPCR); 324 325 fn(za, zn, zm, pn, pm, fpst, tcg_constant_i32(desc)); 326 return true; 327 } 328 329 TRANS_FEAT(FMOPA_h, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_h) 330 TRANS_FEAT(FMOPA_s, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_s) 331 TRANS_FEAT(FMOPA_d, aa64_sme_f64f64, do_outprod_fpst, a, MO_64, gen_helper_sme_fmopa_d) 332 333 /* TODO: FEAT_EBF16 */ 334 TRANS_FEAT(BFMOPA, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_bfmopa) 335 336 TRANS_FEAT(SMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_smopa_s) 337 TRANS_FEAT(UMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_umopa_s) 338 TRANS_FEAT(SUMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_sumopa_s) 339 TRANS_FEAT(USMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_usmopa_s) 340 341 TRANS_FEAT(SMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_smopa_d) 342 TRANS_FEAT(UMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_umopa_d) 343 TRANS_FEAT(SUMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_sumopa_d) 344 TRANS_FEAT(USMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_usmopa_d) 345