1 /*
2 * AArch64 SME translation
3 *
4 * Copyright (c) 2022 Linaro, Ltd
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "qemu/osdep.h"
21 #include "translate.h"
22 #include "translate-a64.h"
23
24 /*
25 * Include the generated decoder.
26 */
27
28 #include "decode-sme.c.inc"
29
30
31 /*
32 * Resolve tile.size[index] to a host pointer, where tile and index
33 * are always decoded together, dependent on the element size.
34 */
get_tile_rowcol(DisasContext * s,int esz,int rs,int tile_index,bool vertical)35 static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs,
36 int tile_index, bool vertical)
37 {
38 int tile = tile_index >> (4 - esz);
39 int index = esz == MO_128 ? 0 : extract32(tile_index, 0, 4 - esz);
40 int pos, len, offset;
41 TCGv_i32 tmp;
42 TCGv_ptr addr;
43
44 /* Compute the final index, which is Rs+imm. */
45 tmp = tcg_temp_new_i32();
46 tcg_gen_trunc_tl_i32(tmp, cpu_reg(s, rs));
47 tcg_gen_addi_i32(tmp, tmp, index);
48
49 /* Prepare a power-of-two modulo via extraction of @len bits. */
50 len = ctz32(streaming_vec_reg_size(s)) - esz;
51
52 if (!len) {
53 /*
54 * SVL is 128 and the element size is 128. There is exactly
55 * one 128x128 tile in the ZA storage, and so we calculate
56 * (Rs + imm) MOD 1, which is always 0. We need to special case
57 * this because TCG doesn't allow deposit ops with len 0.
58 */
59 tcg_gen_movi_i32(tmp, 0);
60 } else if (vertical) {
61 /*
62 * Compute the byte offset of the index within the tile:
63 * (index % (svl / size)) * size
64 * = (index % (svl >> esz)) << esz
65 * Perform the power-of-two modulo via extraction of the low @len bits.
66 * Perform the multiply by shifting left by @pos bits.
67 * Perform these operations simultaneously via deposit into zero.
68 */
69 pos = esz;
70 tcg_gen_deposit_z_i32(tmp, tmp, pos, len);
71
72 /*
73 * For big-endian, adjust the indexed column byte offset within
74 * the uint64_t host words that make up env->zarray[].
75 */
76 if (HOST_BIG_ENDIAN && esz < MO_64) {
77 tcg_gen_xori_i32(tmp, tmp, 8 - (1 << esz));
78 }
79 } else {
80 /*
81 * Compute the byte offset of the index within the tile:
82 * (index % (svl / size)) * (size * sizeof(row))
83 * = (index % (svl >> esz)) << (esz + log2(sizeof(row)))
84 */
85 pos = esz + ctz32(sizeof(ARMVectorReg));
86 tcg_gen_deposit_z_i32(tmp, tmp, pos, len);
87
88 /* Row slices are always aligned and need no endian adjustment. */
89 }
90
91 /* The tile byte offset within env->zarray is the row. */
92 offset = tile * sizeof(ARMVectorReg);
93
94 /* Include the byte offset of zarray to make this relative to env. */
95 offset += offsetof(CPUARMState, zarray);
96 tcg_gen_addi_i32(tmp, tmp, offset);
97
98 /* Add the byte offset to env to produce the final pointer. */
99 addr = tcg_temp_new_ptr();
100 tcg_gen_ext_i32_ptr(addr, tmp);
101 tcg_gen_add_ptr(addr, addr, tcg_env);
102
103 return addr;
104 }
105
106 /*
107 * Resolve tile.size[0] to a host pointer.
108 * Used by e.g. outer product insns where we require the entire tile.
109 */
get_tile(DisasContext * s,int esz,int tile)110 static TCGv_ptr get_tile(DisasContext *s, int esz, int tile)
111 {
112 TCGv_ptr addr = tcg_temp_new_ptr();
113 int offset;
114
115 offset = tile * sizeof(ARMVectorReg) + offsetof(CPUARMState, zarray);
116
117 tcg_gen_addi_ptr(addr, tcg_env, offset);
118 return addr;
119 }
120
/*
 * ZERO: emit a call to the sme_zero helper, passing the insn immediate
 * and the streaming vector length as constants.
 *
 * Returns false when the aa64_sme feature is absent, true otherwise.
 * Nothing is emitted here when sme_za_enabled_check() fails.
 */
static bool trans_ZERO(DisasContext *s, arg_ZERO *a)
{
    TCGv_i32 t_imm, t_svl;

    if (!dc_isar_feature(aa64_sme, s)) {
        return false;
    }
    if (!sme_za_enabled_check(s)) {
        return true;
    }

    t_imm = tcg_constant_i32(a->imm);
    t_svl = tcg_constant_i32(streaming_vec_reg_size(s));
    gen_helper_sme_zero(tcg_env, t_imm, t_svl);

    return true;
}
132
/*
 * MOVA: move one slice between a ZA tile and a vector register.
 *
 * a->to_vec gives the direction (tile slice to vector when set) and
 * a->v selects a vertical slice.  Every helper table below is indexed
 * by a->esz.
 *
 * Returns false when the aa64_sme feature is absent, true otherwise.
 * Nothing is emitted when sme_smza_enabled_check() fails.
 */
static bool trans_MOVA(DisasContext *s, arg_MOVA *a)
{
    /* Horizontal slices reuse the sve sel helpers. */
    static gen_helper_gvec_4 * const row_fns[5] = {
        gen_helper_sve_sel_zpzz_b,
        gen_helper_sve_sel_zpzz_h,
        gen_helper_sve_sel_zpzz_s,
        gen_helper_sve_sel_zpzz_d,
        gen_helper_sve_sel_zpzz_q,
    };
    /* Vertical slice, vector register -> tile column. */
    static gen_helper_gvec_3 * const col_from_vec_fns[5] = {
        gen_helper_sme_mova_cz_b,
        gen_helper_sme_mova_cz_h,
        gen_helper_sme_mova_cz_s,
        gen_helper_sme_mova_cz_d,
        gen_helper_sme_mova_cz_q,
    };
    /* Vertical slice, tile column -> vector register. */
    static gen_helper_gvec_3 * const col_to_vec_fns[5] = {
        gen_helper_sme_mova_zc_b,
        gen_helper_sme_mova_zc_h,
        gen_helper_sme_mova_zc_s,
        gen_helper_sme_mova_zc_d,
        gen_helper_sme_mova_zc_q,
    };

    gen_helper_gvec_3 * const *col_fns;
    TCGv_ptr t_za, t_zr, t_pg, t_dst, t_src;
    TCGv_i32 t_desc;
    int svl;

    if (!dc_isar_feature(aa64_sme, s)) {
        return false;
    }
    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
    t_zr = vec_full_reg_ptr(s, a->zr);
    t_pg = pred_full_reg_ptr(s, a->pg);

    svl = streaming_vec_reg_size(s);
    t_desc = tcg_constant_i32(simd_desc(svl, svl, 0));

    /* The direction decides which pointer is written and which is read. */
    t_dst = a->to_vec ? t_zr : t_za;
    t_src = a->to_vec ? t_za : t_zr;

    if (a->v) {
        /* Vertical slice -- use the sme mova helper for this direction. */
        col_fns = a->to_vec ? col_to_vec_fns : col_from_vec_fns;
        col_fns[a->esz](t_dst, t_src, t_pg, t_desc);
    } else {
        /*
         * Horizontal slice -- the destination is also passed as the
         * third operand of the sel helper (presumably so that inactive
         * elements keep their previous value; confirm against the
         * helper's definition).
         */
        row_fns[a->esz](t_dst, t_src, t_dst, t_pg, t_desc);
    }
    return true;
}
186
/*
 * LD1/ST1 of a single ZA tile slice.
 *
 * The address passed to the helper is Xn|SP + (Xm << esz).  When MTE is
 * not active it is first passed through clean_data_tbi().
 *
 * Returns false when the aa64_sme feature is absent, true otherwise.
 * Nothing is emitted when sme_smza_enabled_check() fails.
 */
static bool trans_LDST1(DisasContext *s, arg_LDST1 *a)
{
    typedef void GenLdSt1(TCGv_env, TCGv_ptr, TCGv_ptr, TCGv, TCGv_i32);

    /*
     * The table is indexed by [esz][be][v][mte][st].  Apart from the
     * load/store choice, that is also the order in which the pieces
     * appear in the helper names, so the macros below simply paste the
     * suffixes together from the outermost dimension inwards.
     */

#define FN_LDST(F)      { gen_helper_sme_ld1##F, gen_helper_sme_st1##F }
#define FN_TAG(F)       { FN_LDST(F), FN_LDST(F##_mte) }
#define FN_SLICE(F)     { FN_TAG(F##_h), FN_TAG(F##_v) }
#define FN_ENDIAN(L, B) { FN_SLICE(L), FN_SLICE(B) }

    static GenLdSt1 * const fns[5][2][2][2][2] = {
        FN_ENDIAN(b, b),
        FN_ENDIAN(h_le, h_be),
        FN_ENDIAN(s_le, s_be),
        FN_ENDIAN(d_le, d_be),
        FN_ENDIAN(q_le, q_be),
    };

#undef FN_LDST
#undef FN_TAG
#undef FN_SLICE
#undef FN_ENDIAN

    bool be = s->be_data == MO_BE;
    bool mte = s->mte_active[0];
    GenLdSt1 *fn;
    TCGv_ptr t_za, t_pg;
    TCGv_i64 addr;
    uint32_t desc;

    if (!dc_isar_feature(aa64_sme, s)) {
        return false;
    }
    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
    t_pg = pred_full_reg_ptr(s, a->pg);

    /* addr = Xn|SP + (Xm << esz) */
    addr = tcg_temp_new_i64();
    tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->esz);
    tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));

    /* Only the non-MTE helpers receive a cleaned address. */
    if (!mte) {
        addr = clean_data_tbi(s, addr);
    }

    desc = make_svemte_desc(s, streaming_vec_reg_size(s), 1, a->esz, a->st, 0);

    fn = fns[a->esz][be][a->v][mte][a->st];
    fn(tcg_env, t_za, t_pg, addr, tcg_constant_i32(desc));

    return true;
}
245
246 typedef void GenLdStR(DisasContext *, TCGv_ptr, int, int, int, int);
247
do_ldst_r(DisasContext * s,arg_ldstr * a,GenLdStR * fn)248 static bool do_ldst_r(DisasContext *s, arg_ldstr *a, GenLdStR *fn)
249 {
250 int svl = streaming_vec_reg_size(s);
251 int imm = a->imm;
252 TCGv_ptr base;
253
254 if (!sme_za_enabled_check(s)) {
255 return true;
256 }
257
258 /* ZA[n] equates to ZA0H.B[n]. */
259 base = get_tile_rowcol(s, MO_8, a->rv, imm, false);
260
261 fn(s, base, 0, svl, a->rn, imm * svl);
262 return true;
263 }
264
TRANS_FEAT(LDR,aa64_sme,do_ldst_r,a,gen_sve_ldr)265 TRANS_FEAT(LDR, aa64_sme, do_ldst_r, a, gen_sve_ldr)
266 TRANS_FEAT(STR, aa64_sme, do_ldst_r, a, gen_sve_str)
267
268 static bool do_adda(DisasContext *s, arg_adda *a, MemOp esz,
269 gen_helper_gvec_4 *fn)
270 {
271 int svl = streaming_vec_reg_size(s);
272 uint32_t desc = simd_desc(svl, svl, 0);
273 TCGv_ptr za, zn, pn, pm;
274
275 if (!sme_smza_enabled_check(s)) {
276 return true;
277 }
278
279 za = get_tile(s, esz, a->zad);
280 zn = vec_full_reg_ptr(s, a->zn);
281 pn = pred_full_reg_ptr(s, a->pn);
282 pm = pred_full_reg_ptr(s, a->pm);
283
284 fn(za, zn, pn, pm, tcg_constant_i32(desc));
285 return true;
286 }
287
TRANS_FEAT(ADDHA_s,aa64_sme,do_adda,a,MO_32,gen_helper_sme_addha_s)288 TRANS_FEAT(ADDHA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addha_s)
289 TRANS_FEAT(ADDVA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addva_s)
290 TRANS_FEAT(ADDHA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addha_d)
291 TRANS_FEAT(ADDVA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addva_d)
292
293 static bool do_outprod(DisasContext *s, arg_op *a, MemOp esz,
294 gen_helper_gvec_5 *fn)
295 {
296 int svl = streaming_vec_reg_size(s);
297 uint32_t desc = simd_desc(svl, svl, a->sub);
298 TCGv_ptr za, zn, zm, pn, pm;
299
300 if (!sme_smza_enabled_check(s)) {
301 return true;
302 }
303
304 za = get_tile(s, esz, a->zad);
305 zn = vec_full_reg_ptr(s, a->zn);
306 zm = vec_full_reg_ptr(s, a->zm);
307 pn = pred_full_reg_ptr(s, a->pn);
308 pm = pred_full_reg_ptr(s, a->pm);
309
310 fn(za, zn, zm, pn, pm, tcg_constant_i32(desc));
311 return true;
312 }
313
/*
 * Common code for outer products whose helper takes a float status
 * pointer.
 *
 * Same operands as do_outprod(), with the pointer returned by
 * fpstatus_ptr(@e_fpst) inserted before the descriptor.  a->sub is
 * passed to the helper through the third simd_desc() argument.
 *
 * Always returns true; nothing is emitted when sme_smza_enabled_check()
 * fails.
 */
static bool do_outprod_fpst(DisasContext *s, arg_op *a, MemOp esz,
                            ARMFPStatusFlavour e_fpst,
                            gen_helper_gvec_5_ptr *fn)
{
    int svl = streaming_vec_reg_size(s);
    uint32_t desc = simd_desc(svl, svl, a->sub);
    TCGv_ptr t_za, t_zn, t_zm, t_pn, t_pm, t_fpst;

    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    t_za = get_tile(s, esz, a->zad);
    t_zn = vec_full_reg_ptr(s, a->zn);
    t_zm = vec_full_reg_ptr(s, a->zm);
    t_pn = pred_full_reg_ptr(s, a->pn);
    t_pm = pred_full_reg_ptr(s, a->pm);
    t_fpst = fpstatus_ptr(e_fpst);

    fn(t_za, t_zn, t_zm, t_pn, t_pm, t_fpst, tcg_constant_i32(desc));

    return true;
}
336
/*
 * Common code for outer products whose helper takes env as its extra
 * pointer argument.
 *
 * Same operands as do_outprod(), with tcg_env inserted before the
 * descriptor.  a->sub is passed to the helper through the third
 * simd_desc() argument.
 *
 * Always returns true; nothing is emitted when sme_smza_enabled_check()
 * fails.
 */
static bool do_outprod_env(DisasContext *s, arg_op *a, MemOp esz,
                           gen_helper_gvec_5_ptr *fn)
{
    int svl = streaming_vec_reg_size(s);
    uint32_t desc = simd_desc(svl, svl, a->sub);
    TCGv_ptr t_za, t_zn, t_zm, t_pn, t_pm;

    if (!sme_smza_enabled_check(s)) {
        return true;
    }

    t_za = get_tile(s, esz, a->zad);
    t_zn = vec_full_reg_ptr(s, a->zn);
    t_zm = vec_full_reg_ptr(s, a->zm);
    t_pn = pred_full_reg_ptr(s, a->pn);
    t_pm = pred_full_reg_ptr(s, a->pm);

    fn(t_za, t_zn, t_zm, t_pn, t_pm, tcg_env, tcg_constant_i32(desc));

    return true;
}
357
358 TRANS_FEAT(FMOPA_h, aa64_sme, do_outprod_env, a,
359 MO_32, gen_helper_sme_fmopa_h)
360 TRANS_FEAT(FMOPA_s, aa64_sme, do_outprod_fpst, a,
361 MO_32, FPST_FPCR, gen_helper_sme_fmopa_s)
362 TRANS_FEAT(FMOPA_d, aa64_sme_f64f64, do_outprod_fpst, a,
363 MO_64, FPST_FPCR, gen_helper_sme_fmopa_d)
364
365 /* TODO: FEAT_EBF16 */
366 TRANS_FEAT(BFMOPA, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_bfmopa)
367
368 TRANS_FEAT(SMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_smopa_s)
369 TRANS_FEAT(UMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_umopa_s)
370 TRANS_FEAT(SUMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_sumopa_s)
371 TRANS_FEAT(USMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_usmopa_s)
372
373 TRANS_FEAT(SMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_smopa_d)
374 TRANS_FEAT(UMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_umopa_d)
375 TRANS_FEAT(SUMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_sumopa_d)
376 TRANS_FEAT(USMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_usmopa_d)
377