xref: /openbmc/qemu/target/arm/tcg/translate-sme.c (revision d53106c997e5c8e61e37ae9ff9f0e1f243b03968)
1 /*
2  * AArch64 SME translation
3  *
4  * Copyright (c) 2022 Linaro, Ltd
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "tcg/tcg-op.h"
23 #include "tcg/tcg-op-gvec.h"
24 #include "tcg/tcg-gvec-desc.h"
25 #include "translate.h"
26 #include "translate-a64.h"
27 #include "fpu/softfloat.h"
28 
29 
30 /*
31  * Include the generated decoder.
32  */
33 
34 #include "decode-sme.c.inc"
35 
36 
37 /*
38  * Resolve tile.size[index] to a host pointer, where tile and index
39  * are always decoded together, dependent on the element size.
40  */
41 static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs,
42                                 int tile_index, bool vertical)
43 {
44     int tile = tile_index >> (4 - esz);
45     int index = esz == MO_128 ? 0 : extract32(tile_index, 0, 4 - esz);
46     int pos, len, offset;
47     TCGv_i32 tmp;
48     TCGv_ptr addr;
49 
50     /* Compute the final index, which is Rs+imm. */
51     tmp = tcg_temp_new_i32();
52     tcg_gen_trunc_tl_i32(tmp, cpu_reg(s, rs));
53     tcg_gen_addi_i32(tmp, tmp, index);
54 
55     /* Prepare a power-of-two modulo via extraction of @len bits. */
56     len = ctz32(streaming_vec_reg_size(s)) - esz;
57 
58     if (vertical) {
59         /*
60          * Compute the byte offset of the index within the tile:
61          *     (index % (svl / size)) * size
62          *   = (index % (svl >> esz)) << esz
63          * Perform the power-of-two modulo via extraction of the low @len bits.
64          * Perform the multiply by shifting left by @pos bits.
65          * Perform these operations simultaneously via deposit into zero.
66          */
67         pos = esz;
68         tcg_gen_deposit_z_i32(tmp, tmp, pos, len);
69 
70         /*
71          * For big-endian, adjust the indexed column byte offset within
72          * the uint64_t host words that make up env->zarray[].
73          */
74         if (HOST_BIG_ENDIAN && esz < MO_64) {
75             tcg_gen_xori_i32(tmp, tmp, 8 - (1 << esz));
76         }
77     } else {
78         /*
79          * Compute the byte offset of the index within the tile:
80          *     (index % (svl / size)) * (size * sizeof(row))
81          *   = (index % (svl >> esz)) << (esz + log2(sizeof(row)))
82          */
83         pos = esz + ctz32(sizeof(ARMVectorReg));
84         tcg_gen_deposit_z_i32(tmp, tmp, pos, len);
85 
86         /* Row slices are always aligned and need no endian adjustment. */
87     }
88 
89     /* The tile byte offset within env->zarray is the row. */
90     offset = tile * sizeof(ARMVectorReg);
91 
92     /* Include the byte offset of zarray to make this relative to env. */
93     offset += offsetof(CPUARMState, zarray);
94     tcg_gen_addi_i32(tmp, tmp, offset);
95 
96     /* Add the byte offset to env to produce the final pointer. */
97     addr = tcg_temp_new_ptr();
98     tcg_gen_ext_i32_ptr(addr, tmp);
99     tcg_gen_add_ptr(addr, addr, cpu_env);
100 
101     return addr;
102 }
103 
104 static bool trans_ZERO(DisasContext *s, arg_ZERO *a)
105 {
106     if (!dc_isar_feature(aa64_sme, s)) {
107         return false;
108     }
109     if (sme_za_enabled_check(s)) {
110         gen_helper_sme_zero(cpu_env, tcg_constant_i32(a->imm),
111                             tcg_constant_i32(streaming_vec_reg_size(s)));
112     }
113     return true;
114 }
115 
116 static bool trans_MOVA(DisasContext *s, arg_MOVA *a)
117 {
118     static gen_helper_gvec_4 * const h_fns[5] = {
119         gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h,
120         gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d,
121         gen_helper_sve_sel_zpzz_q
122     };
123     static gen_helper_gvec_3 * const cz_fns[5] = {
124         gen_helper_sme_mova_cz_b, gen_helper_sme_mova_cz_h,
125         gen_helper_sme_mova_cz_s, gen_helper_sme_mova_cz_d,
126         gen_helper_sme_mova_cz_q,
127     };
128     static gen_helper_gvec_3 * const zc_fns[5] = {
129         gen_helper_sme_mova_zc_b, gen_helper_sme_mova_zc_h,
130         gen_helper_sme_mova_zc_s, gen_helper_sme_mova_zc_d,
131         gen_helper_sme_mova_zc_q,
132     };
133 
134     TCGv_ptr t_za, t_zr, t_pg;
135     TCGv_i32 t_desc;
136     int svl;
137 
138     if (!dc_isar_feature(aa64_sme, s)) {
139         return false;
140     }
141     if (!sme_smza_enabled_check(s)) {
142         return true;
143     }
144 
145     t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
146     t_zr = vec_full_reg_ptr(s, a->zr);
147     t_pg = pred_full_reg_ptr(s, a->pg);
148 
149     svl = streaming_vec_reg_size(s);
150     t_desc = tcg_constant_i32(simd_desc(svl, svl, 0));
151 
152     if (a->v) {
153         /* Vertical slice -- use sme mova helpers. */
154         if (a->to_vec) {
155             zc_fns[a->esz](t_zr, t_za, t_pg, t_desc);
156         } else {
157             cz_fns[a->esz](t_za, t_zr, t_pg, t_desc);
158         }
159     } else {
160         /* Horizontal slice -- reuse sve sel helpers. */
161         if (a->to_vec) {
162             h_fns[a->esz](t_zr, t_za, t_zr, t_pg, t_desc);
163         } else {
164             h_fns[a->esz](t_za, t_zr, t_za, t_pg, t_desc);
165         }
166     }
167     return true;
168 }
169 
170 static bool trans_LDST1(DisasContext *s, arg_LDST1 *a)
171 {
172     typedef void GenLdSt1(TCGv_env, TCGv_ptr, TCGv_ptr, TCGv, TCGv_i32);
173 
174     /*
175      * Indexed by [esz][be][v][mte][st], which is (except for load/store)
176      * also the order in which the elements appear in the function names,
177      * and so how we must concatenate the pieces.
178      */
179 
180 #define FN_LS(F)     { gen_helper_sme_ld1##F, gen_helper_sme_st1##F }
181 #define FN_MTE(F)    { FN_LS(F), FN_LS(F##_mte) }
182 #define FN_HV(F)     { FN_MTE(F##_h), FN_MTE(F##_v) }
183 #define FN_END(L, B) { FN_HV(L), FN_HV(B) }
184 
185     static GenLdSt1 * const fns[5][2][2][2][2] = {
186         FN_END(b, b),
187         FN_END(h_le, h_be),
188         FN_END(s_le, s_be),
189         FN_END(d_le, d_be),
190         FN_END(q_le, q_be),
191     };
192 
193 #undef FN_LS
194 #undef FN_MTE
195 #undef FN_HV
196 #undef FN_END
197 
198     TCGv_ptr t_za, t_pg;
199     TCGv_i64 addr;
200     int svl, desc = 0;
201     bool be = s->be_data == MO_BE;
202     bool mte = s->mte_active[0];
203 
204     if (!dc_isar_feature(aa64_sme, s)) {
205         return false;
206     }
207     if (!sme_smza_enabled_check(s)) {
208         return true;
209     }
210 
211     t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
212     t_pg = pred_full_reg_ptr(s, a->pg);
213     addr = tcg_temp_new_i64();
214 
215     tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->esz);
216     tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));
217 
218     if (mte) {
219         desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s));
220         desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid);
221         desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma);
222         desc = FIELD_DP32(desc, MTEDESC, WRITE, a->st);
223         desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << a->esz) - 1);
224         desc <<= SVE_MTEDESC_SHIFT;
225     } else {
226         addr = clean_data_tbi(s, addr);
227     }
228     svl = streaming_vec_reg_size(s);
229     desc = simd_desc(svl, svl, desc);
230 
231     fns[a->esz][be][a->v][mte][a->st](cpu_env, t_za, t_pg, addr,
232                                       tcg_constant_i32(desc));
233     return true;
234 }
235 
236 typedef void GenLdStR(DisasContext *, TCGv_ptr, int, int, int, int);
237 
238 static bool do_ldst_r(DisasContext *s, arg_ldstr *a, GenLdStR *fn)
239 {
240     int svl = streaming_vec_reg_size(s);
241     int imm = a->imm;
242     TCGv_ptr base;
243 
244     if (!sme_za_enabled_check(s)) {
245         return true;
246     }
247 
248     /* ZA[n] equates to ZA0H.B[n]. */
249     base = get_tile_rowcol(s, MO_8, a->rv, imm, false);
250 
251     fn(s, base, 0, svl, a->rn, imm * svl);
252     return true;
253 }
254 
/*
 * LDR/STR (ZA array vector): both are gated on aa64_sme and routed
 * through do_ldst_r(), differing only in the generator supplied
 * (gen_sve_ldr vs gen_sve_str).
 * NOTE(review): TRANS_FEAT is defined outside this file; presumably it
 * expands to the trans_<NAME> entry point -- confirm against its definition.
 */
TRANS_FEAT(LDR, aa64_sme, do_ldst_r, a, gen_sve_ldr)
TRANS_FEAT(STR, aa64_sme, do_ldst_r, a, gen_sve_str)
257 
258 static bool do_adda(DisasContext *s, arg_adda *a, MemOp esz,
259                     gen_helper_gvec_4 *fn)
260 {
261     int svl = streaming_vec_reg_size(s);
262     uint32_t desc = simd_desc(svl, svl, 0);
263     TCGv_ptr za, zn, pn, pm;
264 
265     if (!sme_smza_enabled_check(s)) {
266         return true;
267     }
268 
269     /* Sum XZR+zad to find ZAd. */
270     za = get_tile_rowcol(s, esz, 31, a->zad, false);
271     zn = vec_full_reg_ptr(s, a->zn);
272     pn = pred_full_reg_ptr(s, a->pn);
273     pm = pred_full_reg_ptr(s, a->pm);
274 
275     fn(za, zn, pn, pm, tcg_constant_i32(desc));
276     return true;
277 }
278 
/*
 * ADDHA/ADDVA: the 32-bit forms are gated on aa64_sme, the 64-bit
 * forms on aa64_sme_i16i64.  All go through do_adda() with the tile
 * element size and the matching helper.
 */
TRANS_FEAT(ADDHA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addha_s)
TRANS_FEAT(ADDVA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addva_s)
TRANS_FEAT(ADDHA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addha_d)
TRANS_FEAT(ADDVA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addva_d)
283 
284 static bool do_outprod(DisasContext *s, arg_op *a, MemOp esz,
285                        gen_helper_gvec_5 *fn)
286 {
287     int svl = streaming_vec_reg_size(s);
288     uint32_t desc = simd_desc(svl, svl, a->sub);
289     TCGv_ptr za, zn, zm, pn, pm;
290 
291     if (!sme_smza_enabled_check(s)) {
292         return true;
293     }
294 
295     /* Sum XZR+zad to find ZAd. */
296     za = get_tile_rowcol(s, esz, 31, a->zad, false);
297     zn = vec_full_reg_ptr(s, a->zn);
298     zm = vec_full_reg_ptr(s, a->zm);
299     pn = pred_full_reg_ptr(s, a->pn);
300     pm = pred_full_reg_ptr(s, a->pm);
301 
302     fn(za, zn, zm, pn, pm, tcg_constant_i32(desc));
303     return true;
304 }
305 
306 static bool do_outprod_fpst(DisasContext *s, arg_op *a, MemOp esz,
307                             gen_helper_gvec_5_ptr *fn)
308 {
309     int svl = streaming_vec_reg_size(s);
310     uint32_t desc = simd_desc(svl, svl, a->sub);
311     TCGv_ptr za, zn, zm, pn, pm, fpst;
312 
313     if (!sme_smza_enabled_check(s)) {
314         return true;
315     }
316 
317     /* Sum XZR+zad to find ZAd. */
318     za = get_tile_rowcol(s, esz, 31, a->zad, false);
319     zn = vec_full_reg_ptr(s, a->zn);
320     zm = vec_full_reg_ptr(s, a->zm);
321     pn = pred_full_reg_ptr(s, a->pn);
322     pm = pred_full_reg_ptr(s, a->pm);
323     fpst = fpstatus_ptr(FPST_FPCR);
324 
325     fn(za, zn, zm, pn, pm, fpst, tcg_constant_i32(desc));
326     return true;
327 }
328 
/*
 * Floating-point outer products go through do_outprod_fpst().
 * Note that FMOPA_h is instantiated with MO_32, the same as FMOPA_s;
 * only FMOPA_d uses MO_64 and is gated on aa64_sme_f64f64.
 */
TRANS_FEAT(FMOPA_h, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_h)
TRANS_FEAT(FMOPA_s, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_s)
TRANS_FEAT(FMOPA_d, aa64_sme_f64f64, do_outprod_fpst, a, MO_64, gen_helper_sme_fmopa_d)

/* BFMOPA uses do_outprod(), i.e. no float_status pointer is passed. */
/* TODO: FEAT_EBF16 */
TRANS_FEAT(BFMOPA, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_bfmopa)

/* Integer outer products with MO_32, gated on aa64_sme. */
TRANS_FEAT(SMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_smopa_s)
TRANS_FEAT(UMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_umopa_s)
TRANS_FEAT(SUMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_sumopa_s)
TRANS_FEAT(USMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_usmopa_s)

/* Integer outer products with MO_64, gated on aa64_sme_i16i64. */
TRANS_FEAT(SMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_smopa_d)
TRANS_FEAT(UMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_umopa_d)
TRANS_FEAT(SUMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_sumopa_d)
TRANS_FEAT(USMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_usmopa_d)
345