1 /*
2 * ARM generic vector expansion
3 *
4 * Copyright (c) 2003 Fabrice Bellard
5 * Copyright (c) 2005-2007 CodeSourcery
6 * Copyright (c) 2007 OpenedHand, Ltd.
7 *
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 */
21
22 #include "qemu/osdep.h"
23 #include "translate.h"
24
25
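/*
 * Expand a three-operand gvec operation whose out-of-line helper also
 * updates the saturation flag: a pointer to the QC field of CPUARMState
 * is passed as the extra argument so the helper can accumulate sticky
 * saturation into vfp.qc.
 */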
26 static void gen_gvec_fn3_qc(uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs,
27 uint32_t opr_sz, uint32_t max_sz,
28 gen_helper_gvec_3_ptr *fn)
29 {
30 TCGv_ptr qc_ptr = tcg_temp_new_ptr();
31
32 tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
33 tcg_gen_addi_ptr(qc_ptr, tcg_env, offsetof(CPUARMState, vfp.qc));
34 tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, qc_ptr,
35 opr_sz, max_sz, 0, fn);
36 }
37
38 void gen_gvec_sqdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
39 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
40 {
41 static gen_helper_gvec_3_ptr * const fns[2] = {
42 gen_helper_neon_sqdmulh_h, gen_helper_neon_sqdmulh_s
43 };
44 tcg_debug_assert(vece >= 1 && vece <= 2);
45 gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
46 }
47
48 void gen_gvec_sqrdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
49 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
50 {
51 static gen_helper_gvec_3_ptr * const fns[2] = {
52 gen_helper_neon_sqrdmulh_h, gen_helper_neon_sqrdmulh_s
53 };
54 tcg_debug_assert(vece >= 1 && vece <= 2);
55 gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
56 }
57
58 void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
59 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
60 {
61 static gen_helper_gvec_3_ptr * const fns[2] = {
62 gen_helper_gvec_qrdmlah_s16, gen_helper_gvec_qrdmlah_s32
63 };
64 tcg_debug_assert(vece >= 1 && vece <= 2);
65 gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
66 }
67
68 void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
69 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
70 {
71 static gen_helper_gvec_3_ptr * const fns[2] = {
72 gen_helper_gvec_qrdmlsh_s16, gen_helper_gvec_qrdmlsh_s32
73 };
74 tcg_debug_assert(vece >= 1 && vece <= 2);
75 gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
76 }
77
78 #define GEN_CMP0(NAME, COND) \
79 void NAME(unsigned vece, uint32_t d, uint32_t m, \
80 uint32_t opr_sz, uint32_t max_sz) \
81 { tcg_gen_gvec_cmpi(COND, vece, d, m, 0, opr_sz, max_sz); }
82
83 GEN_CMP0(gen_gvec_ceq0, TCG_COND_EQ)
84 GEN_CMP0(gen_gvec_cle0, TCG_COND_LE)
85 GEN_CMP0(gen_gvec_cge0, TCG_COND_GE)
86 GEN_CMP0(gen_gvec_clt0, TCG_COND_LT)
87 GEN_CMP0(gen_gvec_cgt0, TCG_COND_GT)
88
89 #undef GEN_CMP0
90
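/*
 * SSRA/USRA: shift each element of the source right (arithmetic or
 * logical) and accumulate the result into the destination, which is
 * why the expanders below set .load_dest.
 */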
91 static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
92 {
93 tcg_gen_vec_sar8i_i64(a, a, shift);
94 tcg_gen_vec_add8_i64(d, d, a);
95 }
96
97 static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
98 {
99 tcg_gen_vec_sar16i_i64(a, a, shift);
100 tcg_gen_vec_add16_i64(d, d, a);
101 }
102
103 static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
104 {
105 tcg_gen_sari_i32(a, a, shift);
106 tcg_gen_add_i32(d, d, a);
107 }
108
109 static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
110 {
111 tcg_gen_sari_i64(a, a, shift);
112 tcg_gen_add_i64(d, d, a);
113 }
114
115 static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
116 {
117 tcg_gen_sari_vec(vece, a, a, sh);
118 tcg_gen_add_vec(vece, d, d, a);
119 }
120
121 void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
122 int64_t shift, uint32_t opr_sz, uint32_t max_sz)
123 {
124 static const TCGOpcode vecop_list[] = {
125 INDEX_op_sari_vec, INDEX_op_add_vec, 0
126 };
127 static const GVecGen2i ops[4] = {
128 { .fni8 = gen_ssra8_i64,
129 .fniv = gen_ssra_vec,
130 .fno = gen_helper_gvec_ssra_b,
131 .load_dest = true,
132 .opt_opc = vecop_list,
133 .vece = MO_8 },
134 { .fni8 = gen_ssra16_i64,
135 .fniv = gen_ssra_vec,
136 .fno = gen_helper_gvec_ssra_h,
137 .load_dest = true,
138 .opt_opc = vecop_list,
139 .vece = MO_16 },
140 { .fni4 = gen_ssra32_i32,
141 .fniv = gen_ssra_vec,
142 .fno = gen_helper_gvec_ssra_s,
143 .load_dest = true,
144 .opt_opc = vecop_list,
145 .vece = MO_32 },
146 { .fni8 = gen_ssra64_i64,
147 .fniv = gen_ssra_vec,
148 .fno = gen_helper_gvec_ssra_d,
149 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
150 .opt_opc = vecop_list,
151 .load_dest = true,
152 .vece = MO_64 },
153 };
154
155 /* tszimm encoding produces immediates in the range [1..esize]. */
156 tcg_debug_assert(shift > 0);
157 tcg_debug_assert(shift <= (8 << vece));
158
159 /*
160 * Shifts larger than the element size are architecturally valid.
161 * Signed results in all sign bits.
162 */
163 shift = MIN(shift, (8 << vece) - 1);
164 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
165 }
166
167 static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
168 {
169 tcg_gen_vec_shr8i_i64(a, a, shift);
170 tcg_gen_vec_add8_i64(d, d, a);
171 }
172
173 static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
174 {
175 tcg_gen_vec_shr16i_i64(a, a, shift);
176 tcg_gen_vec_add16_i64(d, d, a);
177 }
178
179 static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
180 {
181 tcg_gen_shri_i32(a, a, shift);
182 tcg_gen_add_i32(d, d, a);
183 }
184
185 static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
186 {
187 tcg_gen_shri_i64(a, a, shift);
188 tcg_gen_add_i64(d, d, a);
189 }
190
191 static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
192 {
193 tcg_gen_shri_vec(vece, a, a, sh);
194 tcg_gen_add_vec(vece, d, d, a);
195 }
196
197 void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
198 int64_t shift, uint32_t opr_sz, uint32_t max_sz)
199 {
200 static const TCGOpcode vecop_list[] = {
201 INDEX_op_shri_vec, INDEX_op_add_vec, 0
202 };
203 static const GVecGen2i ops[4] = {
204 { .fni8 = gen_usra8_i64,
205 .fniv = gen_usra_vec,
206 .fno = gen_helper_gvec_usra_b,
207 .load_dest = true,
208 .opt_opc = vecop_list,
209 .vece = MO_8, },
210 { .fni8 = gen_usra16_i64,
211 .fniv = gen_usra_vec,
212 .fno = gen_helper_gvec_usra_h,
213 .load_dest = true,
214 .opt_opc = vecop_list,
215 .vece = MO_16, },
216 { .fni4 = gen_usra32_i32,
217 .fniv = gen_usra_vec,
218 .fno = gen_helper_gvec_usra_s,
219 .load_dest = true,
220 .opt_opc = vecop_list,
221 .vece = MO_32, },
222 { .fni8 = gen_usra64_i64,
223 .fniv = gen_usra_vec,
224 .fno = gen_helper_gvec_usra_d,
225 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
226 .load_dest = true,
227 .opt_opc = vecop_list,
228 .vece = MO_64, },
229 };
230
231 /* tszimm encoding produces immediates in the range [1..esize]. */
232 tcg_debug_assert(shift > 0);
233 tcg_debug_assert(shift <= (8 << vece));
234
235 /*
236 * Shifts larger than the element size are architecturally valid.
237 * Unsigned results in all zeros as input to accumulate: nop.
238 */
239 if (shift < (8 << vece)) {
240 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
241 } else {
242 /* Nop, but we do need to clear the tail. */
243 tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
244 }
245 }
246
247 /*
248 * Shift one less than the requested amount, and the low bit is
249 * the rounding bit. For the 8 and 16-bit operations, because we
250 * mask the low bit, we can perform a normal integer shift instead
251 * of a vector shift.
252 */
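/*
 * Worked example for an 8-bit element with sh == 3:
 *   t = (a >> 2) & 1     -- bit (sh - 1) of each byte is the rounding bit
 *   d = (a >> 3) + t     -- arithmetic shift, then round up if it was set
 */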
253 static void gen_srshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
254 {
255 TCGv_i64 t = tcg_temp_new_i64();
256
257 tcg_gen_shri_i64(t, a, sh - 1);
258 tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
259 tcg_gen_vec_sar8i_i64(d, a, sh);
260 tcg_gen_vec_add8_i64(d, d, t);
261 }
262
263 static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
264 {
265 TCGv_i64 t = tcg_temp_new_i64();
266
267 tcg_gen_shri_i64(t, a, sh - 1);
268 tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
269 tcg_gen_vec_sar16i_i64(d, a, sh);
270 tcg_gen_vec_add16_i64(d, d, t);
271 }
272
273 void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
274 {
275 TCGv_i32 t;
276
277 /* Handle shift by the input size for the benefit of trans_SRSHR_ri */
278 if (sh == 32) {
279 tcg_gen_movi_i32(d, 0);
280 return;
281 }
282 t = tcg_temp_new_i32();
283 tcg_gen_extract_i32(t, a, sh - 1, 1);
284 tcg_gen_sari_i32(d, a, sh);
285 tcg_gen_add_i32(d, d, t);
286 }
287
288 void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
289 {
290 TCGv_i64 t = tcg_temp_new_i64();
291
292 tcg_gen_extract_i64(t, a, sh - 1, 1);
293 tcg_gen_sari_i64(d, a, sh);
294 tcg_gen_add_i64(d, d, t);
295 }
296
297 static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
298 {
299 TCGv_vec t = tcg_temp_new_vec_matching(d);
300 TCGv_vec ones = tcg_temp_new_vec_matching(d);
301
302 tcg_gen_shri_vec(vece, t, a, sh - 1);
303 tcg_gen_dupi_vec(vece, ones, 1);
304 tcg_gen_and_vec(vece, t, t, ones);
305 tcg_gen_sari_vec(vece, d, a, sh);
306 tcg_gen_add_vec(vece, d, d, t);
307 }
308
309 void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
310 int64_t shift, uint32_t opr_sz, uint32_t max_sz)
311 {
312 static const TCGOpcode vecop_list[] = {
313 INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
314 };
315 static const GVecGen2i ops[4] = {
316 { .fni8 = gen_srshr8_i64,
317 .fniv = gen_srshr_vec,
318 .fno = gen_helper_gvec_srshr_b,
319 .opt_opc = vecop_list,
320 .vece = MO_8 },
321 { .fni8 = gen_srshr16_i64,
322 .fniv = gen_srshr_vec,
323 .fno = gen_helper_gvec_srshr_h,
324 .opt_opc = vecop_list,
325 .vece = MO_16 },
326 { .fni4 = gen_srshr32_i32,
327 .fniv = gen_srshr_vec,
328 .fno = gen_helper_gvec_srshr_s,
329 .opt_opc = vecop_list,
330 .vece = MO_32 },
331 { .fni8 = gen_srshr64_i64,
332 .fniv = gen_srshr_vec,
333 .fno = gen_helper_gvec_srshr_d,
334 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
335 .opt_opc = vecop_list,
336 .vece = MO_64 },
337 };
338
339 /* tszimm encoding produces immediates in the range [1..esize] */
340 tcg_debug_assert(shift > 0);
341 tcg_debug_assert(shift <= (8 << vece));
342
343 if (shift == (8 << vece)) {
344 /*
345 * Shifts larger than the element size are architecturally valid.
346 * Signed results in all sign bits. With rounding, this produces
347 * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
348 * I.e. always zero.
349 */
350 tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
351 } else {
352 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
353 }
354 }
355
356 static void gen_srsra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
357 {
358 TCGv_i64 t = tcg_temp_new_i64();
359
360 gen_srshr8_i64(t, a, sh);
361 tcg_gen_vec_add8_i64(d, d, t);
362 }
363
364 static void gen_srsra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
365 {
366 TCGv_i64 t = tcg_temp_new_i64();
367
368 gen_srshr16_i64(t, a, sh);
369 tcg_gen_vec_add16_i64(d, d, t);
370 }
371
372 static void gen_srsra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
373 {
374 TCGv_i32 t = tcg_temp_new_i32();
375
376 gen_srshr32_i32(t, a, sh);
377 tcg_gen_add_i32(d, d, t);
378 }
379
380 static void gen_srsra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
381 {
382 TCGv_i64 t = tcg_temp_new_i64();
383
384 gen_srshr64_i64(t, a, sh);
385 tcg_gen_add_i64(d, d, t);
386 }
387
388 static void gen_srsra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
389 {
390 TCGv_vec t = tcg_temp_new_vec_matching(d);
391
392 gen_srshr_vec(vece, t, a, sh);
393 tcg_gen_add_vec(vece, d, d, t);
394 }
395
396 void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
397 int64_t shift, uint32_t opr_sz, uint32_t max_sz)
398 {
399 static const TCGOpcode vecop_list[] = {
400 INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
401 };
402 static const GVecGen2i ops[4] = {
403 { .fni8 = gen_srsra8_i64,
404 .fniv = gen_srsra_vec,
405 .fno = gen_helper_gvec_srsra_b,
406 .opt_opc = vecop_list,
407 .load_dest = true,
408 .vece = MO_8 },
409 { .fni8 = gen_srsra16_i64,
410 .fniv = gen_srsra_vec,
411 .fno = gen_helper_gvec_srsra_h,
412 .opt_opc = vecop_list,
413 .load_dest = true,
414 .vece = MO_16 },
415 { .fni4 = gen_srsra32_i32,
416 .fniv = gen_srsra_vec,
417 .fno = gen_helper_gvec_srsra_s,
418 .opt_opc = vecop_list,
419 .load_dest = true,
420 .vece = MO_32 },
421 { .fni8 = gen_srsra64_i64,
422 .fniv = gen_srsra_vec,
423 .fno = gen_helper_gvec_srsra_d,
424 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
425 .opt_opc = vecop_list,
426 .load_dest = true,
427 .vece = MO_64 },
428 };
429
430 /* tszimm encoding produces immediates in the range [1..esize] */
431 tcg_debug_assert(shift > 0);
432 tcg_debug_assert(shift <= (8 << vece));
433
434 /*
435 * Shifts larger than the element size are architecturally valid.
436 * Signed results in all sign bits. With rounding, this produces
437 * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
438 * I.e. always zero. With accumulation, this leaves D unchanged.
439 */
440 if (shift == (8 << vece)) {
441 /* Nop, but we do need to clear the tail. */
442 tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
443 } else {
444 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
445 }
446 }
447
448 static void gen_urshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
449 {
450 TCGv_i64 t = tcg_temp_new_i64();
451
452 tcg_gen_shri_i64(t, a, sh - 1);
453 tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
454 tcg_gen_vec_shr8i_i64(d, a, sh);
455 tcg_gen_vec_add8_i64(d, d, t);
456 }
457
458 static void gen_urshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
459 {
460 TCGv_i64 t = tcg_temp_new_i64();
461
462 tcg_gen_shri_i64(t, a, sh - 1);
463 tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
464 tcg_gen_vec_shr16i_i64(d, a, sh);
465 tcg_gen_vec_add16_i64(d, d, t);
466 }
467
468 void gen_urshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
469 {
470 TCGv_i32 t;
471
472 /* Handle shift by the input size for the benefit of trans_URSHR_ri */
473 if (sh == 32) {
474 tcg_gen_extract_i32(d, a, sh - 1, 1);
475 return;
476 }
477 t = tcg_temp_new_i32();
478 tcg_gen_extract_i32(t, a, sh - 1, 1);
479 tcg_gen_shri_i32(d, a, sh);
480 tcg_gen_add_i32(d, d, t);
481 }
482
483 void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
484 {
485 TCGv_i64 t = tcg_temp_new_i64();
486
487 tcg_gen_extract_i64(t, a, sh - 1, 1);
488 tcg_gen_shri_i64(d, a, sh);
489 tcg_gen_add_i64(d, d, t);
490 }
491
492 static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift)
493 {
494 TCGv_vec t = tcg_temp_new_vec_matching(d);
495 TCGv_vec ones = tcg_temp_new_vec_matching(d);
496
497 tcg_gen_shri_vec(vece, t, a, shift - 1);
498 tcg_gen_dupi_vec(vece, ones, 1);
499 tcg_gen_and_vec(vece, t, t, ones);
500 tcg_gen_shri_vec(vece, d, a, shift);
501 tcg_gen_add_vec(vece, d, d, t);
502 }
503
504 void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
505 int64_t shift, uint32_t opr_sz, uint32_t max_sz)
506 {
507 static const TCGOpcode vecop_list[] = {
508 INDEX_op_shri_vec, INDEX_op_add_vec, 0
509 };
510 static const GVecGen2i ops[4] = {
511 { .fni8 = gen_urshr8_i64,
512 .fniv = gen_urshr_vec,
513 .fno = gen_helper_gvec_urshr_b,
514 .opt_opc = vecop_list,
515 .vece = MO_8 },
516 { .fni8 = gen_urshr16_i64,
517 .fniv = gen_urshr_vec,
518 .fno = gen_helper_gvec_urshr_h,
519 .opt_opc = vecop_list,
520 .vece = MO_16 },
521 { .fni4 = gen_urshr32_i32,
522 .fniv = gen_urshr_vec,
523 .fno = gen_helper_gvec_urshr_s,
524 .opt_opc = vecop_list,
525 .vece = MO_32 },
526 { .fni8 = gen_urshr64_i64,
527 .fniv = gen_urshr_vec,
528 .fno = gen_helper_gvec_urshr_d,
529 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
530 .opt_opc = vecop_list,
531 .vece = MO_64 },
532 };
533
534 /* tszimm encoding produces immediates in the range [1..esize] */
535 tcg_debug_assert(shift > 0);
536 tcg_debug_assert(shift <= (8 << vece));
537
538 if (shift == (8 << vece)) {
539 /*
540 * Shifts larger than the element size are architecturally valid.
541 * Unsigned results in zero. With rounding, this produces a
542 * copy of the most significant bit.
543 */
544 tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift - 1, opr_sz, max_sz);
545 } else {
546 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
547 }
548 }
549
550 static void gen_ursra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
551 {
552 TCGv_i64 t = tcg_temp_new_i64();
553
554 if (sh == 8) {
555 tcg_gen_vec_shr8i_i64(t, a, 7);
556 } else {
557 gen_urshr8_i64(t, a, sh);
558 }
559 tcg_gen_vec_add8_i64(d, d, t);
560 }
561
562 static void gen_ursra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
563 {
564 TCGv_i64 t = tcg_temp_new_i64();
565
566 if (sh == 16) {
567 tcg_gen_vec_shr16i_i64(t, a, 15);
568 } else {
569 gen_urshr16_i64(t, a, sh);
570 }
571 tcg_gen_vec_add16_i64(d, d, t);
572 }
573
574 static void gen_ursra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
575 {
576 TCGv_i32 t = tcg_temp_new_i32();
577
578 if (sh == 32) {
579 tcg_gen_shri_i32(t, a, 31);
580 } else {
581 gen_urshr32_i32(t, a, sh);
582 }
583 tcg_gen_add_i32(d, d, t);
584 }
585
586 static void gen_ursra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
587 {
588 TCGv_i64 t = tcg_temp_new_i64();
589
590 if (sh == 64) {
591 tcg_gen_shri_i64(t, a, 63);
592 } else {
593 gen_urshr64_i64(t, a, sh);
594 }
595 tcg_gen_add_i64(d, d, t);
596 }
597
598 static void gen_ursra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
599 {
600 TCGv_vec t = tcg_temp_new_vec_matching(d);
601
602 if (sh == (8 << vece)) {
603 tcg_gen_shri_vec(vece, t, a, sh - 1);
604 } else {
605 gen_urshr_vec(vece, t, a, sh);
606 }
607 tcg_gen_add_vec(vece, d, d, t);
608 }
609
610 void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
611 int64_t shift, uint32_t opr_sz, uint32_t max_sz)
612 {
613 static const TCGOpcode vecop_list[] = {
614 INDEX_op_shri_vec, INDEX_op_add_vec, 0
615 };
616 static const GVecGen2i ops[4] = {
617 { .fni8 = gen_ursra8_i64,
618 .fniv = gen_ursra_vec,
619 .fno = gen_helper_gvec_ursra_b,
620 .opt_opc = vecop_list,
621 .load_dest = true,
622 .vece = MO_8 },
623 { .fni8 = gen_ursra16_i64,
624 .fniv = gen_ursra_vec,
625 .fno = gen_helper_gvec_ursra_h,
626 .opt_opc = vecop_list,
627 .load_dest = true,
628 .vece = MO_16 },
629 { .fni4 = gen_ursra32_i32,
630 .fniv = gen_ursra_vec,
631 .fno = gen_helper_gvec_ursra_s,
632 .opt_opc = vecop_list,
633 .load_dest = true,
634 .vece = MO_32 },
635 { .fni8 = gen_ursra64_i64,
636 .fniv = gen_ursra_vec,
637 .fno = gen_helper_gvec_ursra_d,
638 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
639 .opt_opc = vecop_list,
640 .load_dest = true,
641 .vece = MO_64 },
642 };
643
644 /* tszimm encoding produces immediates in the range [1..esize] */
645 tcg_debug_assert(shift > 0);
646 tcg_debug_assert(shift <= (8 << vece));
647
648 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
649 }
650
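/*
 * SRI (shift right and insert): only the bits shifted in from the
 * source replace bits of the destination; the top 'shift' bits of each
 * destination element are preserved, hence the mask / deposit forms below.
 */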
651 static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
652 {
653 uint64_t mask = dup_const(MO_8, 0xff >> shift);
654 TCGv_i64 t = tcg_temp_new_i64();
655
656 tcg_gen_shri_i64(t, a, shift);
657 tcg_gen_andi_i64(t, t, mask);
658 tcg_gen_andi_i64(d, d, ~mask);
659 tcg_gen_or_i64(d, d, t);
660 }
661
662 static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
663 {
664 uint64_t mask = dup_const(MO_16, 0xffff >> shift);
665 TCGv_i64 t = tcg_temp_new_i64();
666
667 tcg_gen_shri_i64(t, a, shift);
668 tcg_gen_andi_i64(t, t, mask);
669 tcg_gen_andi_i64(d, d, ~mask);
670 tcg_gen_or_i64(d, d, t);
671 }
672
673 static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
674 {
675 tcg_gen_shri_i32(a, a, shift);
676 tcg_gen_deposit_i32(d, d, a, 0, 32 - shift);
677 }
678
679 static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
680 {
681 tcg_gen_shri_i64(a, a, shift);
682 tcg_gen_deposit_i64(d, d, a, 0, 64 - shift);
683 }
684
685 static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
686 {
687 TCGv_vec t = tcg_temp_new_vec_matching(d);
688 TCGv_vec m = tcg_temp_new_vec_matching(d);
689
690 tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK((8 << vece) - sh, sh));
691 tcg_gen_shri_vec(vece, t, a, sh);
692 tcg_gen_and_vec(vece, d, d, m);
693 tcg_gen_or_vec(vece, d, d, t);
694 }
695
696 void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
697 int64_t shift, uint32_t opr_sz, uint32_t max_sz)
698 {
699 static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
700 const GVecGen2i ops[4] = {
701 { .fni8 = gen_shr8_ins_i64,
702 .fniv = gen_shr_ins_vec,
703 .fno = gen_helper_gvec_sri_b,
704 .load_dest = true,
705 .opt_opc = vecop_list,
706 .vece = MO_8 },
707 { .fni8 = gen_shr16_ins_i64,
708 .fniv = gen_shr_ins_vec,
709 .fno = gen_helper_gvec_sri_h,
710 .load_dest = true,
711 .opt_opc = vecop_list,
712 .vece = MO_16 },
713 { .fni4 = gen_shr32_ins_i32,
714 .fniv = gen_shr_ins_vec,
715 .fno = gen_helper_gvec_sri_s,
716 .load_dest = true,
717 .opt_opc = vecop_list,
718 .vece = MO_32 },
719 { .fni8 = gen_shr64_ins_i64,
720 .fniv = gen_shr_ins_vec,
721 .fno = gen_helper_gvec_sri_d,
722 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
723 .load_dest = true,
724 .opt_opc = vecop_list,
725 .vece = MO_64 },
726 };
727
728 /* tszimm encoding produces immediates in the range [1..esize]. */
729 tcg_debug_assert(shift > 0);
730 tcg_debug_assert(shift <= (8 << vece));
731
732 /* Shift of esize leaves destination unchanged. */
733 if (shift < (8 << vece)) {
734 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
735 } else {
736 /* Nop, but we do need to clear the tail. */
737 tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
738 }
739 }
740
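/*
 * SLI (shift left and insert): the counterpart of SRI; the low 'shift'
 * bits of each destination element are preserved and the rest is
 * replaced by the left-shifted source.
 */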
741 static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
742 {
743 uint64_t mask = dup_const(MO_8, 0xff << shift);
744 TCGv_i64 t = tcg_temp_new_i64();
745
746 tcg_gen_shli_i64(t, a, shift);
747 tcg_gen_andi_i64(t, t, mask);
748 tcg_gen_andi_i64(d, d, ~mask);
749 tcg_gen_or_i64(d, d, t);
750 }
751
752 static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
753 {
754 uint64_t mask = dup_const(MO_16, 0xffff << shift);
755 TCGv_i64 t = tcg_temp_new_i64();
756
757 tcg_gen_shli_i64(t, a, shift);
758 tcg_gen_andi_i64(t, t, mask);
759 tcg_gen_andi_i64(d, d, ~mask);
760 tcg_gen_or_i64(d, d, t);
761 }
762
763 static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
764 {
765 tcg_gen_deposit_i32(d, d, a, shift, 32 - shift);
766 }
767
768 static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
769 {
770 tcg_gen_deposit_i64(d, d, a, shift, 64 - shift);
771 }
772
773 static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
774 {
775 TCGv_vec t = tcg_temp_new_vec_matching(d);
776 TCGv_vec m = tcg_temp_new_vec_matching(d);
777
778 tcg_gen_shli_vec(vece, t, a, sh);
779 tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK(0, sh));
780 tcg_gen_and_vec(vece, d, d, m);
781 tcg_gen_or_vec(vece, d, d, t);
782 }
783
784 void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
785 int64_t shift, uint32_t opr_sz, uint32_t max_sz)
786 {
787 static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
788 const GVecGen2i ops[4] = {
789 { .fni8 = gen_shl8_ins_i64,
790 .fniv = gen_shl_ins_vec,
791 .fno = gen_helper_gvec_sli_b,
792 .load_dest = true,
793 .opt_opc = vecop_list,
794 .vece = MO_8 },
795 { .fni8 = gen_shl16_ins_i64,
796 .fniv = gen_shl_ins_vec,
797 .fno = gen_helper_gvec_sli_h,
798 .load_dest = true,
799 .opt_opc = vecop_list,
800 .vece = MO_16 },
801 { .fni4 = gen_shl32_ins_i32,
802 .fniv = gen_shl_ins_vec,
803 .fno = gen_helper_gvec_sli_s,
804 .load_dest = true,
805 .opt_opc = vecop_list,
806 .vece = MO_32 },
807 { .fni8 = gen_shl64_ins_i64,
808 .fniv = gen_shl_ins_vec,
809 .fno = gen_helper_gvec_sli_d,
810 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
811 .load_dest = true,
812 .opt_opc = vecop_list,
813 .vece = MO_64 },
814 };
815
816 /* tszimm encoding produces immediates in the range [0..esize-1]. */
817 tcg_debug_assert(shift >= 0);
818 tcg_debug_assert(shift < (8 << vece));
819
820 if (shift == 0) {
821 tcg_gen_gvec_mov(vece, rd_ofs, rm_ofs, opr_sz, max_sz);
822 } else {
823 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
824 }
825 }
826
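/*
 * Integer multiply-accumulate / multiply-subtract. The 8- and 16-bit
 * scalar forms operate on lanes packed into an i32 and therefore go
 * through the Neon helpers; the 32- and 64-bit forms use plain TCG ops.
 */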
827 static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
828 {
829 gen_helper_neon_mul_u8(a, a, b);
830 gen_helper_neon_add_u8(d, d, a);
831 }
832
833 static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
834 {
835 gen_helper_neon_mul_u8(a, a, b);
836 gen_helper_neon_sub_u8(d, d, a);
837 }
838
839 static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
840 {
841 gen_helper_neon_mul_u16(a, a, b);
842 gen_helper_neon_add_u16(d, d, a);
843 }
844
845 static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
846 {
847 gen_helper_neon_mul_u16(a, a, b);
848 gen_helper_neon_sub_u16(d, d, a);
849 }
850
851 static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
852 {
853 tcg_gen_mul_i32(a, a, b);
854 tcg_gen_add_i32(d, d, a);
855 }
856
857 static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
858 {
859 tcg_gen_mul_i32(a, a, b);
860 tcg_gen_sub_i32(d, d, a);
861 }
862
863 static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
864 {
865 tcg_gen_mul_i64(a, a, b);
866 tcg_gen_add_i64(d, d, a);
867 }
868
869 static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
870 {
871 tcg_gen_mul_i64(a, a, b);
872 tcg_gen_sub_i64(d, d, a);
873 }
874
875 static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
876 {
877 tcg_gen_mul_vec(vece, a, a, b);
878 tcg_gen_add_vec(vece, d, d, a);
879 }
880
881 static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
882 {
883 tcg_gen_mul_vec(vece, a, a, b);
884 tcg_gen_sub_vec(vece, d, d, a);
885 }
886
887 /* Note that while NEON does not support VMLA and VMLS as 64-bit ops,
888 * these tables are shared with AArch64 which does support them.
889 */
890 void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
891 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
892 {
893 static const TCGOpcode vecop_list[] = {
894 INDEX_op_mul_vec, INDEX_op_add_vec, 0
895 };
896 static const GVecGen3 ops[4] = {
897 { .fni4 = gen_mla8_i32,
898 .fniv = gen_mla_vec,
899 .load_dest = true,
900 .opt_opc = vecop_list,
901 .vece = MO_8 },
902 { .fni4 = gen_mla16_i32,
903 .fniv = gen_mla_vec,
904 .load_dest = true,
905 .opt_opc = vecop_list,
906 .vece = MO_16 },
907 { .fni4 = gen_mla32_i32,
908 .fniv = gen_mla_vec,
909 .load_dest = true,
910 .opt_opc = vecop_list,
911 .vece = MO_32 },
912 { .fni8 = gen_mla64_i64,
913 .fniv = gen_mla_vec,
914 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
915 .load_dest = true,
916 .opt_opc = vecop_list,
917 .vece = MO_64 },
918 };
919 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
920 }
921
922 void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
923 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
924 {
925 static const TCGOpcode vecop_list[] = {
926 INDEX_op_mul_vec, INDEX_op_sub_vec, 0
927 };
928 static const GVecGen3 ops[4] = {
929 { .fni4 = gen_mls8_i32,
930 .fniv = gen_mls_vec,
931 .load_dest = true,
932 .opt_opc = vecop_list,
933 .vece = MO_8 },
934 { .fni4 = gen_mls16_i32,
935 .fniv = gen_mls_vec,
936 .load_dest = true,
937 .opt_opc = vecop_list,
938 .vece = MO_16 },
939 { .fni4 = gen_mls32_i32,
940 .fniv = gen_mls_vec,
941 .load_dest = true,
942 .opt_opc = vecop_list,
943 .vece = MO_32 },
944 { .fni8 = gen_mls64_i64,
945 .fniv = gen_mls_vec,
946 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
947 .load_dest = true,
948 .opt_opc = vecop_list,
949 .vece = MO_64 },
950 };
951 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
952 }
953
954 /* CMTST: test is "if ((X & Y) != 0)". */
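/* Each element is set to all ones when the test passes and to zero otherwise. */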
955 static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
956 {
957 tcg_gen_negsetcond_i32(TCG_COND_TSTNE, d, a, b);
958 }
959
960 void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
961 {
962 tcg_gen_negsetcond_i64(TCG_COND_TSTNE, d, a, b);
963 }
964
965 static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
966 {
967 tcg_gen_cmp_vec(TCG_COND_TSTNE, vece, d, a, b);
968 }
969
970 void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
971 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
972 {
973 static const TCGOpcode vecop_list[] = { INDEX_op_cmp_vec, 0 };
974 static const GVecGen3 ops[4] = {
975 { .fni4 = gen_helper_neon_tst_u8,
976 .fniv = gen_cmtst_vec,
977 .opt_opc = vecop_list,
978 .vece = MO_8 },
979 { .fni4 = gen_helper_neon_tst_u16,
980 .fniv = gen_cmtst_vec,
981 .opt_opc = vecop_list,
982 .vece = MO_16 },
983 { .fni4 = gen_cmtst_i32,
984 .fniv = gen_cmtst_vec,
985 .opt_opc = vecop_list,
986 .vece = MO_32 },
987 { .fni8 = gen_cmtst_i64,
988 .fniv = gen_cmtst_vec,
989 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
990 .opt_opc = vecop_list,
991 .vece = MO_64 },
992 };
993 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
994 }
995
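/*
 * USHL/SSHL take the shift count from the low byte of each element of
 * the shift register, interpreted as a signed value: positive counts
 * shift left, negative counts shift right, which is why both directions
 * are computed and then selected below.
 */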
996 void gen_ushl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
997 {
998 TCGv_i32 lval = tcg_temp_new_i32();
999 TCGv_i32 rval = tcg_temp_new_i32();
1000 TCGv_i32 lsh = tcg_temp_new_i32();
1001 TCGv_i32 rsh = tcg_temp_new_i32();
1002 TCGv_i32 zero = tcg_constant_i32(0);
1003 TCGv_i32 max = tcg_constant_i32(32);
1004
1005 /*
1006 * Rely on the TCG guarantee that out of range shifts produce
1007 * unspecified results, not undefined behaviour (i.e. no trap).
1008 * Discard out-of-range results after the fact.
1009 */
1010 tcg_gen_ext8s_i32(lsh, shift);
1011 tcg_gen_neg_i32(rsh, lsh);
1012 tcg_gen_shl_i32(lval, src, lsh);
1013 tcg_gen_shr_i32(rval, src, rsh);
1014 tcg_gen_movcond_i32(TCG_COND_LTU, dst, lsh, max, lval, zero);
1015 tcg_gen_movcond_i32(TCG_COND_LTU, dst, rsh, max, rval, dst);
1016 }
1017
1018 void gen_ushl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
1019 {
1020 TCGv_i64 lval = tcg_temp_new_i64();
1021 TCGv_i64 rval = tcg_temp_new_i64();
1022 TCGv_i64 lsh = tcg_temp_new_i64();
1023 TCGv_i64 rsh = tcg_temp_new_i64();
1024 TCGv_i64 zero = tcg_constant_i64(0);
1025 TCGv_i64 max = tcg_constant_i64(64);
1026
1027 /*
1028 * Rely on the TCG guarantee that out of range shifts produce
1029 * unspecified results, not undefined behaviour (i.e. no trap).
1030 * Discard out-of-range results after the fact.
1031 */
1032 tcg_gen_ext8s_i64(lsh, shift);
1033 tcg_gen_neg_i64(rsh, lsh);
1034 tcg_gen_shl_i64(lval, src, lsh);
1035 tcg_gen_shr_i64(rval, src, rsh);
1036 tcg_gen_movcond_i64(TCG_COND_LTU, dst, lsh, max, lval, zero);
1037 tcg_gen_movcond_i64(TCG_COND_LTU, dst, rsh, max, rval, dst);
1038 }
1039
1040 static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
1041 TCGv_vec src, TCGv_vec shift)
1042 {
1043 TCGv_vec lval = tcg_temp_new_vec_matching(dst);
1044 TCGv_vec rval = tcg_temp_new_vec_matching(dst);
1045 TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
1046 TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
1047 TCGv_vec msk, max;
1048
1049 tcg_gen_neg_vec(vece, rsh, shift);
1050 if (vece == MO_8) {
1051 tcg_gen_mov_vec(lsh, shift);
1052 } else {
1053 msk = tcg_temp_new_vec_matching(dst);
1054 tcg_gen_dupi_vec(vece, msk, 0xff);
1055 tcg_gen_and_vec(vece, lsh, shift, msk);
1056 tcg_gen_and_vec(vece, rsh, rsh, msk);
1057 }
1058
1059 /*
1060 * Rely on the TCG guarantee that out of range shifts produce
1061 * unspecified results, not undefined behaviour (i.e. no trap).
1062 * Discard out-of-range results after the fact.
1063 */
1064 tcg_gen_shlv_vec(vece, lval, src, lsh);
1065 tcg_gen_shrv_vec(vece, rval, src, rsh);
1066
1067 max = tcg_temp_new_vec_matching(dst);
1068 tcg_gen_dupi_vec(vece, max, 8 << vece);
1069
1070 /*
1071      * The choice of LT (signed) and GEU (unsigned) is biased toward
1072 * the instructions of the x86_64 host. For MO_8, the whole byte
1073 * is significant so we must use an unsigned compare; otherwise we
1074 * have already masked to a byte and so a signed compare works.
1075 * Other tcg hosts have a full set of comparisons and do not care.
1076 */
1077 if (vece == MO_8) {
1078 tcg_gen_cmp_vec(TCG_COND_GEU, vece, lsh, lsh, max);
1079 tcg_gen_cmp_vec(TCG_COND_GEU, vece, rsh, rsh, max);
1080 tcg_gen_andc_vec(vece, lval, lval, lsh);
1081 tcg_gen_andc_vec(vece, rval, rval, rsh);
1082 } else {
1083 tcg_gen_cmp_vec(TCG_COND_LT, vece, lsh, lsh, max);
1084 tcg_gen_cmp_vec(TCG_COND_LT, vece, rsh, rsh, max);
1085 tcg_gen_and_vec(vece, lval, lval, lsh);
1086 tcg_gen_and_vec(vece, rval, rval, rsh);
1087 }
1088 tcg_gen_or_vec(vece, dst, lval, rval);
1089 }
1090
1091 void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1092 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1093 {
1094 static const TCGOpcode vecop_list[] = {
1095 INDEX_op_neg_vec, INDEX_op_shlv_vec,
1096 INDEX_op_shrv_vec, INDEX_op_cmp_vec, 0
1097 };
1098 static const GVecGen3 ops[4] = {
1099 { .fniv = gen_ushl_vec,
1100 .fno = gen_helper_gvec_ushl_b,
1101 .opt_opc = vecop_list,
1102 .vece = MO_8 },
1103 { .fniv = gen_ushl_vec,
1104 .fno = gen_helper_gvec_ushl_h,
1105 .opt_opc = vecop_list,
1106 .vece = MO_16 },
1107 { .fni4 = gen_ushl_i32,
1108 .fniv = gen_ushl_vec,
1109 .opt_opc = vecop_list,
1110 .vece = MO_32 },
1111 { .fni8 = gen_ushl_i64,
1112 .fniv = gen_ushl_vec,
1113 .opt_opc = vecop_list,
1114 .vece = MO_64 },
1115 };
1116 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1117 }
1118
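/*
 * Signed variant: an out-of-range right shift must still produce the
 * sign, so the right-shift count is clamped to the element size minus
 * one before the arithmetic shift, while an out-of-range left shift
 * yields zero as before.
 */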
1119 void gen_sshl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
1120 {
1121 TCGv_i32 lval = tcg_temp_new_i32();
1122 TCGv_i32 rval = tcg_temp_new_i32();
1123 TCGv_i32 lsh = tcg_temp_new_i32();
1124 TCGv_i32 rsh = tcg_temp_new_i32();
1125 TCGv_i32 zero = tcg_constant_i32(0);
1126 TCGv_i32 max = tcg_constant_i32(31);
1127
1128 /*
1129 * Rely on the TCG guarantee that out of range shifts produce
1130 * unspecified results, not undefined behaviour (i.e. no trap).
1131 * Discard out-of-range results after the fact.
1132 */
1133 tcg_gen_ext8s_i32(lsh, shift);
1134 tcg_gen_neg_i32(rsh, lsh);
1135 tcg_gen_shl_i32(lval, src, lsh);
1136 tcg_gen_umin_i32(rsh, rsh, max);
1137 tcg_gen_sar_i32(rval, src, rsh);
1138 tcg_gen_movcond_i32(TCG_COND_LEU, lval, lsh, max, lval, zero);
1139 tcg_gen_movcond_i32(TCG_COND_LT, dst, lsh, zero, rval, lval);
1140 }
1141
1142 void gen_sshl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
1143 {
1144 TCGv_i64 lval = tcg_temp_new_i64();
1145 TCGv_i64 rval = tcg_temp_new_i64();
1146 TCGv_i64 lsh = tcg_temp_new_i64();
1147 TCGv_i64 rsh = tcg_temp_new_i64();
1148 TCGv_i64 zero = tcg_constant_i64(0);
1149 TCGv_i64 max = tcg_constant_i64(63);
1150
1151 /*
1152 * Rely on the TCG guarantee that out of range shifts produce
1153 * unspecified results, not undefined behaviour (i.e. no trap).
1154 * Discard out-of-range results after the fact.
1155 */
1156 tcg_gen_ext8s_i64(lsh, shift);
1157 tcg_gen_neg_i64(rsh, lsh);
1158 tcg_gen_shl_i64(lval, src, lsh);
1159 tcg_gen_umin_i64(rsh, rsh, max);
1160 tcg_gen_sar_i64(rval, src, rsh);
1161 tcg_gen_movcond_i64(TCG_COND_LEU, lval, lsh, max, lval, zero);
1162 tcg_gen_movcond_i64(TCG_COND_LT, dst, lsh, zero, rval, lval);
1163 }
1164
1165 static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
1166 TCGv_vec src, TCGv_vec shift)
1167 {
1168 TCGv_vec lval = tcg_temp_new_vec_matching(dst);
1169 TCGv_vec rval = tcg_temp_new_vec_matching(dst);
1170 TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
1171 TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
1172 TCGv_vec tmp = tcg_temp_new_vec_matching(dst);
1173
1174 /*
1175 * Rely on the TCG guarantee that out of range shifts produce
1176 * unspecified results, not undefined behaviour (i.e. no trap).
1177 * Discard out-of-range results after the fact.
1178 */
1179 tcg_gen_neg_vec(vece, rsh, shift);
1180 if (vece == MO_8) {
1181 tcg_gen_mov_vec(lsh, shift);
1182 } else {
1183 tcg_gen_dupi_vec(vece, tmp, 0xff);
1184 tcg_gen_and_vec(vece, lsh, shift, tmp);
1185 tcg_gen_and_vec(vece, rsh, rsh, tmp);
1186 }
1187
1188 /* Bound rsh so out of bound right shift gets -1. */
1189 tcg_gen_dupi_vec(vece, tmp, (8 << vece) - 1);
1190 tcg_gen_umin_vec(vece, rsh, rsh, tmp);
1191 tcg_gen_cmp_vec(TCG_COND_GT, vece, tmp, lsh, tmp);
1192
1193 tcg_gen_shlv_vec(vece, lval, src, lsh);
1194 tcg_gen_sarv_vec(vece, rval, src, rsh);
1195
1196 /* Select in-bound left shift. */
1197 tcg_gen_andc_vec(vece, lval, lval, tmp);
1198
1199 /* Select between left and right shift. */
1200 if (vece == MO_8) {
1201 tcg_gen_dupi_vec(vece, tmp, 0);
1202 tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, rval, lval);
1203 } else {
1204 tcg_gen_dupi_vec(vece, tmp, 0x80);
1205 tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, lval, rval);
1206 }
1207 }
1208
1209 void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1210 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1211 {
1212 static const TCGOpcode vecop_list[] = {
1213 INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec,
1214 INDEX_op_sarv_vec, INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0
1215 };
1216 static const GVecGen3 ops[4] = {
1217 { .fniv = gen_sshl_vec,
1218 .fno = gen_helper_gvec_sshl_b,
1219 .opt_opc = vecop_list,
1220 .vece = MO_8 },
1221 { .fniv = gen_sshl_vec,
1222 .fno = gen_helper_gvec_sshl_h,
1223 .opt_opc = vecop_list,
1224 .vece = MO_16 },
1225 { .fni4 = gen_sshl_i32,
1226 .fniv = gen_sshl_vec,
1227 .opt_opc = vecop_list,
1228 .vece = MO_32 },
1229 { .fni8 = gen_sshl_i64,
1230 .fniv = gen_sshl_vec,
1231 .opt_opc = vecop_list,
1232 .vece = MO_64 },
1233 };
1234 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1235 }
1236
1237 void gen_gvec_srshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1238 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1239 {
1240 static gen_helper_gvec_3 * const fns[] = {
1241 gen_helper_gvec_srshl_b, gen_helper_gvec_srshl_h,
1242 gen_helper_gvec_srshl_s, gen_helper_gvec_srshl_d,
1243 };
1244 tcg_debug_assert(vece <= MO_64);
1245 tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1246 }
1247
1248 void gen_gvec_urshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1249 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1250 {
1251 static gen_helper_gvec_3 * const fns[] = {
1252 gen_helper_gvec_urshl_b, gen_helper_gvec_urshl_h,
1253 gen_helper_gvec_urshl_s, gen_helper_gvec_urshl_d,
1254 };
1255 tcg_debug_assert(vece <= MO_64);
1256 tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1257 }
1258
1259 void gen_neon_sqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1260 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1261 {
1262 static gen_helper_gvec_3_ptr * const fns[] = {
1263 gen_helper_neon_sqshl_b, gen_helper_neon_sqshl_h,
1264 gen_helper_neon_sqshl_s, gen_helper_neon_sqshl_d,
1265 };
1266 tcg_debug_assert(vece <= MO_64);
1267 tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1268 opr_sz, max_sz, 0, fns[vece]);
1269 }
1270
1271 void gen_neon_uqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1272 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1273 {
1274 static gen_helper_gvec_3_ptr * const fns[] = {
1275 gen_helper_neon_uqshl_b, gen_helper_neon_uqshl_h,
1276 gen_helper_neon_uqshl_s, gen_helper_neon_uqshl_d,
1277 };
1278 tcg_debug_assert(vece <= MO_64);
1279 tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1280 opr_sz, max_sz, 0, fns[vece]);
1281 }
1282
1283 void gen_neon_sqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1284 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1285 {
1286 static gen_helper_gvec_3_ptr * const fns[] = {
1287 gen_helper_neon_sqrshl_b, gen_helper_neon_sqrshl_h,
1288 gen_helper_neon_sqrshl_s, gen_helper_neon_sqrshl_d,
1289 };
1290 tcg_debug_assert(vece <= MO_64);
1291 tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1292 opr_sz, max_sz, 0, fns[vece]);
1293 }
1294
1295 void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1296 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1297 {
1298 static gen_helper_gvec_3_ptr * const fns[] = {
1299 gen_helper_neon_uqrshl_b, gen_helper_neon_uqrshl_h,
1300 gen_helper_neon_uqrshl_s, gen_helper_neon_uqrshl_d,
1301 };
1302 tcg_debug_assert(vece <= MO_64);
1303 tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1304 opr_sz, max_sz, 0, fns[vece]);
1305 }
1306
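/*
 * Saturating add/sub with sticky QC. The pattern throughout is to
 * compute both the wrapped and the saturated result and OR their XOR
 * into QC, so QC becomes non-zero iff saturation occurred.
 */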
1307 void gen_uqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1308 {
1309 uint64_t max = MAKE_64BIT_MASK(0, 8 << esz);
1310 TCGv_i64 tmp = tcg_temp_new_i64();
1311
1312 tcg_gen_add_i64(tmp, a, b);
1313 tcg_gen_umin_i64(res, tmp, tcg_constant_i64(max));
1314 tcg_gen_xor_i64(tmp, tmp, res);
1315 tcg_gen_or_i64(qc, qc, tmp);
1316 }
1317
1318 void gen_uqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1319 {
1320 TCGv_i64 t = tcg_temp_new_i64();
1321
1322 tcg_gen_add_i64(t, a, b);
1323 tcg_gen_movcond_i64(TCG_COND_LTU, res, t, a,
1324 tcg_constant_i64(UINT64_MAX), t);
1325 tcg_gen_xor_i64(t, t, res);
1326 tcg_gen_or_i64(qc, qc, t);
1327 }
1328
1329 static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1330 TCGv_vec a, TCGv_vec b)
1331 {
1332 TCGv_vec x = tcg_temp_new_vec_matching(t);
1333 tcg_gen_add_vec(vece, x, a, b);
1334 tcg_gen_usadd_vec(vece, t, a, b);
1335 tcg_gen_xor_vec(vece, x, x, t);
1336 tcg_gen_or_vec(vece, qc, qc, x);
1337 }
1338
1339 void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1340 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1341 {
1342 static const TCGOpcode vecop_list[] = {
1343 INDEX_op_usadd_vec, INDEX_op_add_vec, 0
1344 };
1345 static const GVecGen4 ops[4] = {
1346 { .fniv = gen_uqadd_vec,
1347 .fno = gen_helper_gvec_uqadd_b,
1348 .write_aofs = true,
1349 .opt_opc = vecop_list,
1350 .vece = MO_8 },
1351 { .fniv = gen_uqadd_vec,
1352 .fno = gen_helper_gvec_uqadd_h,
1353 .write_aofs = true,
1354 .opt_opc = vecop_list,
1355 .vece = MO_16 },
1356 { .fniv = gen_uqadd_vec,
1357 .fno = gen_helper_gvec_uqadd_s,
1358 .write_aofs = true,
1359 .opt_opc = vecop_list,
1360 .vece = MO_32 },
1361 { .fniv = gen_uqadd_vec,
1362 .fni8 = gen_uqadd_d,
1363 .fno = gen_helper_gvec_uqadd_d,
1364 .write_aofs = true,
1365 .opt_opc = vecop_list,
1366 .vece = MO_64 },
1367 };
1368
1369 tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1370 tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1371 rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1372 }
1373
1374 void gen_sqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1375 {
1376 int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
1377 int64_t min = -1ll - max;
1378 TCGv_i64 tmp = tcg_temp_new_i64();
1379
1380 tcg_gen_add_i64(tmp, a, b);
1381 tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
1382 tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
1383 tcg_gen_xor_i64(tmp, tmp, res);
1384 tcg_gen_or_i64(qc, qc, tmp);
1385 }
1386
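/*
 * 64-bit signed saturating add: overflow occurred iff the operands have
 * the same sign and the sum's sign differs, i.e. (t0 ^ a) & ~(a ^ b) is
 * negative; in that case the result saturates to INT64_MAX or INT64_MIN
 * according to the sign of 'a'.
 */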
1387 void gen_sqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1388 {
1389 TCGv_i64 t0 = tcg_temp_new_i64();
1390 TCGv_i64 t1 = tcg_temp_new_i64();
1391 TCGv_i64 t2 = tcg_temp_new_i64();
1392
1393 tcg_gen_add_i64(t0, a, b);
1394
1395 /* Compute signed overflow indication into T1 */
1396 tcg_gen_xor_i64(t1, a, b);
1397 tcg_gen_xor_i64(t2, t0, a);
1398 tcg_gen_andc_i64(t1, t2, t1);
1399
1400 /* Compute saturated value into T2 */
1401 tcg_gen_sari_i64(t2, a, 63);
1402 tcg_gen_xori_i64(t2, t2, INT64_MAX);
1403
1404 tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
1405 tcg_gen_xor_i64(t0, t0, res);
1406 tcg_gen_or_i64(qc, qc, t0);
1407 }
1408
1409 static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1410 TCGv_vec a, TCGv_vec b)
1411 {
1412 TCGv_vec x = tcg_temp_new_vec_matching(t);
1413 tcg_gen_add_vec(vece, x, a, b);
1414 tcg_gen_ssadd_vec(vece, t, a, b);
1415 tcg_gen_xor_vec(vece, x, x, t);
1416 tcg_gen_or_vec(vece, qc, qc, x);
1417 }
1418
1419 void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1420 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1421 {
1422 static const TCGOpcode vecop_list[] = {
1423 INDEX_op_ssadd_vec, INDEX_op_add_vec, 0
1424 };
1425 static const GVecGen4 ops[4] = {
1426 { .fniv = gen_sqadd_vec,
1427 .fno = gen_helper_gvec_sqadd_b,
1428 .opt_opc = vecop_list,
1429 .write_aofs = true,
1430 .vece = MO_8 },
1431 { .fniv = gen_sqadd_vec,
1432 .fno = gen_helper_gvec_sqadd_h,
1433 .opt_opc = vecop_list,
1434 .write_aofs = true,
1435 .vece = MO_16 },
1436 { .fniv = gen_sqadd_vec,
1437 .fno = gen_helper_gvec_sqadd_s,
1438 .opt_opc = vecop_list,
1439 .write_aofs = true,
1440 .vece = MO_32 },
1441 { .fniv = gen_sqadd_vec,
1442 .fni8 = gen_sqadd_d,
1443 .fno = gen_helper_gvec_sqadd_d,
1444 .opt_opc = vecop_list,
1445 .write_aofs = true,
1446 .vece = MO_64 },
1447 };
1448
1449 tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1450 tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1451 rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1452 }
1453
1454 void gen_uqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1455 {
1456 TCGv_i64 tmp = tcg_temp_new_i64();
1457
1458 tcg_gen_sub_i64(tmp, a, b);
1459 tcg_gen_smax_i64(res, tmp, tcg_constant_i64(0));
1460 tcg_gen_xor_i64(tmp, tmp, res);
1461 tcg_gen_or_i64(qc, qc, tmp);
1462 }
1463
1464 void gen_uqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1465 {
1466 TCGv_i64 t = tcg_temp_new_i64();
1467
1468 tcg_gen_sub_i64(t, a, b);
1469 tcg_gen_movcond_i64(TCG_COND_LTU, res, a, b, tcg_constant_i64(0), t);
1470 tcg_gen_xor_i64(t, t, res);
1471 tcg_gen_or_i64(qc, qc, t);
1472 }
1473
1474 static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1475 TCGv_vec a, TCGv_vec b)
1476 {
1477 TCGv_vec x = tcg_temp_new_vec_matching(t);
1478 tcg_gen_sub_vec(vece, x, a, b);
1479 tcg_gen_ussub_vec(vece, t, a, b);
1480 tcg_gen_xor_vec(vece, x, x, t);
1481 tcg_gen_or_vec(vece, qc, qc, x);
1482 }
1483
1484 void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1485 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1486 {
1487 static const TCGOpcode vecop_list[] = {
1488 INDEX_op_ussub_vec, INDEX_op_sub_vec, 0
1489 };
1490 static const GVecGen4 ops[4] = {
1491 { .fniv = gen_uqsub_vec,
1492 .fno = gen_helper_gvec_uqsub_b,
1493 .opt_opc = vecop_list,
1494 .write_aofs = true,
1495 .vece = MO_8 },
1496 { .fniv = gen_uqsub_vec,
1497 .fno = gen_helper_gvec_uqsub_h,
1498 .opt_opc = vecop_list,
1499 .write_aofs = true,
1500 .vece = MO_16 },
1501 { .fniv = gen_uqsub_vec,
1502 .fno = gen_helper_gvec_uqsub_s,
1503 .opt_opc = vecop_list,
1504 .write_aofs = true,
1505 .vece = MO_32 },
1506 { .fniv = gen_uqsub_vec,
1507 .fni8 = gen_uqsub_d,
1508 .fno = gen_helper_gvec_uqsub_d,
1509 .opt_opc = vecop_list,
1510 .write_aofs = true,
1511 .vece = MO_64 },
1512 };
1513
1514 tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1515 tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1516 rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1517 }
1518
1519 void gen_sqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1520 {
1521 int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
1522 int64_t min = -1ll - max;
1523 TCGv_i64 tmp = tcg_temp_new_i64();
1524
1525 tcg_gen_sub_i64(tmp, a, b);
1526 tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
1527 tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
1528 tcg_gen_xor_i64(tmp, tmp, res);
1529 tcg_gen_or_i64(qc, qc, tmp);
1530 }
1531
1532 void gen_sqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1533 {
1534 TCGv_i64 t0 = tcg_temp_new_i64();
1535 TCGv_i64 t1 = tcg_temp_new_i64();
1536 TCGv_i64 t2 = tcg_temp_new_i64();
1537
1538 tcg_gen_sub_i64(t0, a, b);
1539
1540 /* Compute signed overflow indication into T1 */
1541 tcg_gen_xor_i64(t1, a, b);
1542 tcg_gen_xor_i64(t2, t0, a);
1543 tcg_gen_and_i64(t1, t1, t2);
1544
1545 /* Compute saturated value into T2 */
1546 tcg_gen_sari_i64(t2, a, 63);
1547 tcg_gen_xori_i64(t2, t2, INT64_MAX);
1548
1549 tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
1550 tcg_gen_xor_i64(t0, t0, res);
1551 tcg_gen_or_i64(qc, qc, t0);
1552 }
1553
1554 static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1555 TCGv_vec a, TCGv_vec b)
1556 {
1557 TCGv_vec x = tcg_temp_new_vec_matching(t);
1558 tcg_gen_sub_vec(vece, x, a, b);
1559 tcg_gen_sssub_vec(vece, t, a, b);
1560 tcg_gen_xor_vec(vece, x, x, t);
1561 tcg_gen_or_vec(vece, qc, qc, x);
1562 }
1563
1564 void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1565 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1566 {
1567 static const TCGOpcode vecop_list[] = {
1568 INDEX_op_sssub_vec, INDEX_op_sub_vec, 0
1569 };
1570 static const GVecGen4 ops[4] = {
1571 { .fniv = gen_sqsub_vec,
1572 .fno = gen_helper_gvec_sqsub_b,
1573 .opt_opc = vecop_list,
1574 .write_aofs = true,
1575 .vece = MO_8 },
1576 { .fniv = gen_sqsub_vec,
1577 .fno = gen_helper_gvec_sqsub_h,
1578 .opt_opc = vecop_list,
1579 .write_aofs = true,
1580 .vece = MO_16 },
1581 { .fniv = gen_sqsub_vec,
1582 .fno = gen_helper_gvec_sqsub_s,
1583 .opt_opc = vecop_list,
1584 .write_aofs = true,
1585 .vece = MO_32 },
1586 { .fniv = gen_sqsub_vec,
1587 .fni8 = gen_sqsub_d,
1588 .fno = gen_helper_gvec_sqsub_d,
1589 .opt_opc = vecop_list,
1590 .write_aofs = true,
1591 .vece = MO_64 },
1592 };
1593
1594 tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1595 tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1596 rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1597 }
1598
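/*
 * Absolute difference: the scalar forms compute both a - b and b - a
 * and select the non-negative one; the vector form uses
 * max(a, b) - min(a, b) instead.
 */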
1599 static void gen_sabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1600 {
1601 TCGv_i32 t = tcg_temp_new_i32();
1602
1603 tcg_gen_sub_i32(t, a, b);
1604 tcg_gen_sub_i32(d, b, a);
1605 tcg_gen_movcond_i32(TCG_COND_LT, d, a, b, d, t);
1606 }
1607
1608 static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1609 {
1610 TCGv_i64 t = tcg_temp_new_i64();
1611
1612 tcg_gen_sub_i64(t, a, b);
1613 tcg_gen_sub_i64(d, b, a);
1614 tcg_gen_movcond_i64(TCG_COND_LT, d, a, b, d, t);
1615 }
1616
gen_sabd_vec(unsigned vece,TCGv_vec d,TCGv_vec a,TCGv_vec b)1617 static void gen_sabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1618 {
1619 TCGv_vec t = tcg_temp_new_vec_matching(d);
1620
1621 tcg_gen_smin_vec(vece, t, a, b);
1622 tcg_gen_smax_vec(vece, d, a, b);
1623 tcg_gen_sub_vec(vece, d, d, t);
1624 }
1625
gen_gvec_sabd(unsigned vece,uint32_t rd_ofs,uint32_t rn_ofs,uint32_t rm_ofs,uint32_t opr_sz,uint32_t max_sz)1626 void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1627 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1628 {
1629 static const TCGOpcode vecop_list[] = {
1630 INDEX_op_sub_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0
1631 };
1632 static const GVecGen3 ops[4] = {
1633 { .fniv = gen_sabd_vec,
1634 .fno = gen_helper_gvec_sabd_b,
1635 .opt_opc = vecop_list,
1636 .vece = MO_8 },
1637 { .fniv = gen_sabd_vec,
1638 .fno = gen_helper_gvec_sabd_h,
1639 .opt_opc = vecop_list,
1640 .vece = MO_16 },
1641 { .fni4 = gen_sabd_i32,
1642 .fniv = gen_sabd_vec,
1643 .fno = gen_helper_gvec_sabd_s,
1644 .opt_opc = vecop_list,
1645 .vece = MO_32 },
1646 { .fni8 = gen_sabd_i64,
1647 .fniv = gen_sabd_vec,
1648 .fno = gen_helper_gvec_sabd_d,
1649 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1650 .opt_opc = vecop_list,
1651 .vece = MO_64 },
1652 };
1653 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1654 }
1655
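/*
 * Unsigned absolute difference: the same selection trick as SABD,
 * but with an unsigned comparison and umax/umin for the vector form.
 */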
static void gen_uabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_sub_i32(t, a, b);
    tcg_gen_sub_i32(d, b, a);
    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, d, t);
}

static void gen_uabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_sub_i64(d, b, a);
    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, d, t);
}

static void gen_uabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_umin_vec(vece, t, a, b);
    tcg_gen_umax_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_umin_vec, INDEX_op_umax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uabd_i32,
          .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_uabd_i64,
          .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

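/*
 * Signed absolute difference and accumulate: d += |a - b|.
 * The GVecGen3 entries set .load_dest so that the previous destination
 * value is available for the accumulation.
 */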
static void gen_saba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();
    gen_sabd_i32(t, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_saba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();
    gen_sabd_i64(t, a, b);
    tcg_gen_add_i64(d, d, t);
}

static void gen_saba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    gen_sabd_vec(vece, t, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_add_vec,
        INDEX_op_smin_vec, INDEX_op_smax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_saba_i32,
          .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_saba_i64,
          .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

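/*
 * Unsigned absolute difference and accumulate: d += |a - b|,
 * using the UABD expansions above.
 */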
static void gen_uaba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();
    gen_uabd_i32(t, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();
    gen_uabd_i64(t, a, b);
    tcg_gen_add_i64(d, d, t);
}

static void gen_uaba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    gen_uabd_vec(vece, t, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_add_vec,
        INDEX_op_umin_vec, INDEX_op_umax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_uaba_i32,
          .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_uaba_i64,
          .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

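/*
 * Pairwise add/max/min have no inline TCG vector expansion;
 * they are always emitted as out-of-line helper calls.
 */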
void gen_gvec_addp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_addp_b,
        gen_helper_gvec_addp_h,
        gen_helper_gvec_addp_s,
        gen_helper_gvec_addp_d,
    };
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_smaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_smaxp_b,
        gen_helper_gvec_smaxp_h,
        gen_helper_gvec_smaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_sminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_sminp_b,
        gen_helper_gvec_sminp_h,
        gen_helper_gvec_sminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_umaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_umaxp_b,
        gen_helper_gvec_umaxp_h,
        gen_helper_gvec_umaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_uminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_uminp_b,
        gen_helper_gvec_uminp_h,
        gen_helper_gvec_uminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

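/*
 * Signed halving add: (a + b) >> 1 per element, without widening.
 * The average is computed as (a >> 1) + (b >> 1) plus a carry-in that
 * is set only when the low bits of both operands are 1 (a & b & 1).
 */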
static void gen_shadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_shadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_shadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_and_i32(t, a, b);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_shadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_and_vec(vece, t, a, b);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_shadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_shadd8_i64,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shadd16_i64,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shadd_i32,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

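/*
 * Unsigned halving add: identical to the signed form except that the
 * per-element shifts are logical rather than arithmetic.
 */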
static void gen_uhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_uhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_uhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_and_i32(t, a, b);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_uhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_and_vec(vece, t, a, b);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_uhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_uhadd8_i64,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_uhadd16_i64,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uhadd_i32,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

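/*
 * Signed halving subtract: (a - b) >> 1 per element. The borrow out of
 * the discarded low bit occurs only when b's low bit is set and a's is
 * clear, hence the (~a & b) & 1 correction.
 */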
static void gen_shsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sub8_i64(d, a, b);
    tcg_gen_vec_sub8_i64(d, d, t);
}

static void gen_shsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sub16_i64(d, a, b);
    tcg_gen_vec_sub16_i64(d, d, t);
}

static void gen_shsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andc_i32(t, b, a);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_sub_i32(d, d, t);
}

static void gen_shsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_andc_vec(vece, t, b, a);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_sub_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_shsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 g[4] = {
        { .fni8 = gen_shsub8_i64,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shsub16_i64,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shsub_i32,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

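/*
 * Unsigned halving subtract: as above, but with logical shifts.
 */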
static void gen_uhsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sub8_i64(d, a, b);
    tcg_gen_vec_sub8_i64(d, d, t);
}

static void gen_uhsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sub16_i64(d, a, b);
    tcg_gen_vec_sub16_i64(d, d, t);
}

static void gen_uhsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andc_i32(t, b, a);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_sub_i32(d, d, t);
}

static void gen_uhsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_andc_vec(vece, t, b, a);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_sub_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_uhsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 g[4] = {
        { .fni8 = gen_uhsub8_i64,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_uhsub16_i64,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uhsub_i32,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

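/*
 * Signed rounding halving add: (a + b + 1) >> 1 per element.
 * The rounding increment folds into the carry-in, which is needed
 * whenever either operand has its low bit set ((a | b) & 1).
 */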
static void gen_srhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_srhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_or_i32(t, a, b);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_srhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_or_vec(vece, t, a, b);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_srhadd8_i64,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_srhadd16_i64,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_srhadd_i32,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

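/*
 * Unsigned rounding halving add: as above, with logical shifts.
 */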
static void gen_urhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_urhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_urhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_or_i32(t, a, b);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_urhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_or_vec(vece, t, a, b);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_urhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_urhadd8_i64,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_urhadd16_i64,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_urhadd_i32,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}
