/*
 * AArch64 SME translation
 *
 * Copyright (c) 2022 Linaro, Ltd
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "tcg/tcg-op.h"
#include "tcg/tcg-op-gvec.h"
#include "tcg/tcg-gvec-desc.h"
#include "translate.h"
#include "exec/helper-gen.h"
#include "translate-a64.h"
#include "fpu/softfloat.h"


/*
 * Include the generated decoder.
 */

#include "decode-sme.c.inc"


/*
 * Resolve tile.size[index] to a host pointer, where tile and index
 * are always decoded together, dependent on the element size.
 *
 * @s:          disassembly context
 * @esz:        element size (MO_8 .. MO_128); selects how @tile_index
 *              splits into tile number vs. slice index bits
 * @rs:         general register providing the runtime slice index
 * @tile_index: immediate encoding of tile number and slice offset
 * @vertical:   true for a column (vertical) slice, false for a row
 *
 * Returns a new TCGv_ptr holding the host address of the selected
 * slice within env->zarray, i.e. cpu_env + computed byte offset.
 */
static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs,
                                int tile_index, bool vertical)
{
    /*
     * For esz there are (4 - esz) low bits of slice index immediate;
     * the remaining high bits select the tile.  MO_128 has a single
     * tile-relative index of 0.
     */
    int tile = tile_index >> (4 - esz);
    int index = esz == MO_128 ? 0 : extract32(tile_index, 0, 4 - esz);
    int pos, len, offset;
    TCGv_i32 tmp;
    TCGv_ptr addr;

    /* Compute the final index, which is Rs+imm. */
    tmp = tcg_temp_new_i32();
    tcg_gen_trunc_tl_i32(tmp, cpu_reg(s, rs));
    tcg_gen_addi_i32(tmp, tmp, index);

    /* Prepare a power-of-two modulo via extraction of @len bits. */
    len = ctz32(streaming_vec_reg_size(s)) - esz;

    if (vertical) {
        /*
         * Compute the byte offset of the index within the tile:
         *     (index % (svl / size)) * size
         *   = (index % (svl >> esz)) << esz
         * Perform the power-of-two modulo via extraction of the low @len bits.
         * Perform the multiply by shifting left by @pos bits.
         * Perform these operations simultaneously via deposit into zero.
         */
        pos = esz;
        tcg_gen_deposit_z_i32(tmp, tmp, pos, len);

        /*
         * For big-endian, adjust the indexed column byte offset within
         * the uint64_t host words that make up env->zarray[].
         */
        if (HOST_BIG_ENDIAN && esz < MO_64) {
            tcg_gen_xori_i32(tmp, tmp, 8 - (1 << esz));
        }
    } else {
        /*
         * Compute the byte offset of the index within the tile:
         *     (index % (svl / size)) * (size * sizeof(row))
         *   = (index % (svl >> esz)) << (esz + log2(sizeof(row)))
         */
        pos = esz + ctz32(sizeof(ARMVectorReg));
        tcg_gen_deposit_z_i32(tmp, tmp, pos, len);

        /* Row slices are always aligned and need no endian adjustment. */
    }

    /* The tile byte offset within env->zarray is the row. */
    offset = tile * sizeof(ARMVectorReg);

    /* Include the byte offset of zarray to make this relative to env. */
    offset += offsetof(CPUARMState, zarray);
    tcg_gen_addi_i32(tmp, tmp, offset);

    /* Add the byte offset to env to produce the final pointer. */
    addr = tcg_temp_new_ptr();
    tcg_gen_ext_i32_ptr(addr, tmp);
    tcg_gen_add_ptr(addr, addr, cpu_env);

    return addr;
}

/*
 * ZERO: zero a set of ZA tiles selected by the immediate mask.
 * The feature gate is inline here (rather than via TRANS_FEAT) because
 * the access check must still run, and succeed as a no-op trap check,
 * even when ZA is not enabled.
 */
static bool trans_ZERO(DisasContext *s, arg_ZERO *a)
{
    if (!dc_isar_feature(aa64_sme, s)) {
        return false;
    }
    /* Only emit the helper if ZA storage is accessible; else trap raised. */
    if (sme_za_enabled_check(s)) {
        gen_helper_sme_zero(cpu_env, tcg_constant_i32(a->imm),
                            tcg_constant_i32(streaming_vec_reg_size(s)));
    }
    return true;
}

/*
 * MOVA: move a ZA tile slice to (to_vec) or from a vector register,
 * predicated by Pg.  Horizontal slices are contiguous in zarray, so the
 * existing SVE SEL helpers can be reused; vertical slices are strided
 * and need the dedicated sme_mova_* helpers.
 */
static bool trans_MOVA(DisasContext *s, arg_MOVA *a)
{
    /* One helper per element size (b/h/s/d/q), indexed by a->esz. */
    static gen_helper_gvec_4 * const h_fns[5] = {
        gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h,
        gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d,
        gen_helper_sve_sel_zpzz_q
    };
    /* cz: column (vertical) slice <- vector; zc: vector <- column slice. */
    static gen_helper_gvec_3 * const cz_fns[5] = {
        gen_helper_sme_mova_cz_b, gen_helper_sme_mova_cz_h,
        gen_helper_sme_mova_cz_s, gen_helper_sme_mova_cz_d,
        gen_helper_sme_mova_cz_q,
    };
    static gen_helper_gvec_3 * const zc_fns[5] = {
        gen_helper_sme_mova_zc_b, gen_helper_sme_mova_zc_h,
        gen_helper_sme_mova_zc_s, gen_helper_sme_mova_zc_d,
        gen_helper_sme_mova_zc_q,
    };

    TCGv_ptr t_za, t_zr, t_pg;
    TCGv_i32 t_desc;
    int svl;

    if (!dc_isar_feature(aa64_sme, s)) {
        return false;
    }
    /* Requires streaming mode and ZA enabled; otherwise trap was raised. */
    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
    t_zr = vec_full_reg_ptr(s, a->zr);
    t_pg = pred_full_reg_ptr(s, a->pg);

    svl = streaming_vec_reg_size(s);
    t_desc = tcg_constant_i32(simd_desc(svl, svl, 0));

    if (a->v) {
        /* Vertical slice -- use sme mova helpers. */
        if (a->to_vec) {
            zc_fns[a->esz](t_zr, t_za, t_pg, t_desc);
        } else {
            cz_fns[a->esz](t_za, t_zr, t_pg, t_desc);
        }
    } else {
        /* Horizontal slice -- reuse sve sel helpers. */
        if (a->to_vec) {
            /* Zr = Pg ? Za : Zr -- inactive elements keep old Zr. */
            h_fns[a->esz](t_zr, t_za, t_zr, t_pg, t_desc);
        } else {
            /* Za = Pg ? Zr : Za -- inactive elements keep old Za. */
            h_fns[a->esz](t_za, t_zr, t_za, t_pg, t_desc);
        }
    }
    return true;
}

/*
 * LD1/ST1: contiguous load/store of one ZA tile slice, predicated,
 * at address Xn|SP + (Xm << esz), with optional MTE checking.
 */
static bool trans_LDST1(DisasContext *s, arg_LDST1 *a)
{
    typedef void GenLdSt1(TCGv_env, TCGv_ptr, TCGv_ptr, TCGv, TCGv_i32);

    /*
     * Indexed by [esz][be][v][mte][st], which is (except for load/store)
     * also the order in which the elements appear in the function names,
     * and so how we must concatenate the pieces.
     */

#define FN_LS(F) { gen_helper_sme_ld1##F, gen_helper_sme_st1##F }
#define FN_MTE(F) { FN_LS(F), FN_LS(F##_mte) }
#define FN_HV(F) { FN_MTE(F##_h), FN_MTE(F##_v) }
#define FN_END(L, B) { FN_HV(L), FN_HV(B) }

    static GenLdSt1 * const fns[5][2][2][2][2] = {
        /* Bytes have no endian variant, so both [be] slots use 'b'. */
        FN_END(b, b),
        FN_END(h_le, h_be),
        FN_END(s_le, s_be),
        FN_END(d_le, d_be),
        FN_END(q_le, q_be),
    };

#undef FN_LS
#undef FN_MTE
#undef FN_HV
#undef FN_END

    TCGv_ptr t_za, t_pg;
    TCGv_i64 addr;
    int svl, desc = 0;
    bool be = s->be_data == MO_BE;
    bool mte = s->mte_active[0];

    if (!dc_isar_feature(aa64_sme, s)) {
        return false;
    }
    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
    t_pg = pred_full_reg_ptr(s, a->pg);
    addr = tcg_temp_new_i64();

    /* addr = Xn|SP + (Xm << esz): Xm is scaled by the element size. */
    tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->esz);
    tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));

    if (mte) {
        /* Pack the MTE check parameters into the upper descriptor bits. */
        desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s));
        desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid);
        desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma);
        desc = FIELD_DP32(desc, MTEDESC, WRITE, a->st);
        desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << a->esz) - 1);
        desc <<= SVE_MTEDESC_SHIFT;
    } else {
        /* No MTE: strip the tag byte from the address now. */
        addr = clean_data_tbi(s, addr);
    }
    svl = streaming_vec_reg_size(s);
    desc = simd_desc(svl, svl, desc);

    fns[a->esz][be][a->v][mte][a->st](cpu_env, t_za, t_pg, addr,
                                      tcg_constant_i32(desc));
    return true;
}

typedef void GenLdStR(DisasContext *, TCGv_ptr, int, int, int, int);

/*
 * LDR/STR: load/store one horizontal row of the ZA array, addressed as
 * ZA[rv + imm], at memory offset imm * SVL from Xn.  The aa64_sme
 * feature gate is applied by the TRANS_FEAT wrappers below.
 */
static bool do_ldst_r(DisasContext *s, arg_ldstr *a, GenLdStR *fn)
{
    int svl = streaming_vec_reg_size(s);
    int imm = a->imm;
    TCGv_ptr base;

    if (!sme_za_enabled_check(s)) {
        return true;
    }

    /* ZA[n] equates to ZA0H.B[n]. */
    base = get_tile_rowcol(s, MO_8, a->rv, imm, false);

    fn(s, base, 0, svl, a->rn, imm * svl);
    return true;
}

TRANS_FEAT(LDR, aa64_sme, do_ldst_r, a, gen_sve_ldr)
TRANS_FEAT(STR, aa64_sme, do_ldst_r, a, gen_sve_str)

/*
 * ADDHA/ADDVA: predicated accumulate of vector Zn into tile ZAd rows
 * (horizontal) or columns (vertical); the direction is baked into @fn.
 */
static bool do_adda(DisasContext *s, arg_adda *a, MemOp esz,
                    gen_helper_gvec_4 *fn)
{
    int svl = streaming_vec_reg_size(s);
    uint32_t desc = simd_desc(svl, svl, 0);
    TCGv_ptr za, zn, pn, pm;

    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    /*
     * Sum XZR+zad to find ZAd: passing rs=31 (XZR, reads as zero) makes
     * get_tile_rowcol yield the base of tile zad, row 0.
     */
    za = get_tile_rowcol(s, esz, 31, a->zad, false);
    zn = vec_full_reg_ptr(s, a->zn);
    pn = pred_full_reg_ptr(s, a->pn);
    pm = pred_full_reg_ptr(s, a->pm);

    fn(za, zn, pn, pm, tcg_constant_i32(desc));
    return true;
}

TRANS_FEAT(ADDHA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addha_s)
TRANS_FEAT(ADDVA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addva_s)
TRANS_FEAT(ADDHA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addha_d)
TRANS_FEAT(ADDVA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addva_d)

/*
 * Integer/bfloat outer product (MOPA/MOPS family) into tile ZAd;
 * a->sub distinguishes accumulate vs subtract and is passed in the
 * descriptor's data field.
 */
static bool do_outprod(DisasContext *s, arg_op *a, MemOp esz,
                       gen_helper_gvec_5 *fn)
{
    int svl = streaming_vec_reg_size(s);
    uint32_t desc = simd_desc(svl, svl, a->sub);
    TCGv_ptr za, zn, zm, pn, pm;

    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    /* Sum XZR+zad to find ZAd. */
    za = get_tile_rowcol(s, esz, 31, a->zad, false);
    zn = vec_full_reg_ptr(s, a->zn);
    zm = vec_full_reg_ptr(s, a->zm);
    pn = pred_full_reg_ptr(s, a->pn);
    pm = pred_full_reg_ptr(s, a->pm);

    fn(za, zn, zm, pn, pm, tcg_constant_i32(desc));
    return true;
}

/*
 * As do_outprod, but for floating-point outer products, which also
 * need a pointer to the FP status for rounding/exception state.
 */
static bool do_outprod_fpst(DisasContext *s, arg_op *a, MemOp esz,
                            gen_helper_gvec_5_ptr *fn)
{
    int svl = streaming_vec_reg_size(s);
    uint32_t desc = simd_desc(svl, svl, a->sub);
    TCGv_ptr za, zn, zm, pn, pm, fpst;

    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    /* Sum XZR+zad to find ZAd. */
    za = get_tile_rowcol(s, esz, 31, a->zad, false);
    zn = vec_full_reg_ptr(s, a->zn);
    zm = vec_full_reg_ptr(s, a->zm);
    pn = pred_full_reg_ptr(s, a->pn);
    pm = pred_full_reg_ptr(s, a->pm);
    fpst = fpstatus_ptr(FPST_FPCR);

    fn(za, zn, zm, pn, pm, fpst, tcg_constant_i32(desc));
    return true;
}

/* FMOPA_h widens fp16 sources into a 32-bit tile, hence MO_32. */
TRANS_FEAT(FMOPA_h, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_h)
TRANS_FEAT(FMOPA_s, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_s)
TRANS_FEAT(FMOPA_d, aa64_sme_f64f64, do_outprod_fpst, a, MO_64, gen_helper_sme_fmopa_d)

/* TODO: FEAT_EBF16 */
TRANS_FEAT(BFMOPA, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_bfmopa)

TRANS_FEAT(SMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_smopa_s)
TRANS_FEAT(UMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_umopa_s)
TRANS_FEAT(SUMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_sumopa_s)
TRANS_FEAT(USMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_usmopa_s)

TRANS_FEAT(SMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_smopa_d)
TRANS_FEAT(UMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_umopa_d)
TRANS_FEAT(SUMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_sumopa_d)
TRANS_FEAT(USMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64,
           gen_helper_sme_usmopa_d)