/*
 *  ARM generic vector expansion
 *
 *  Copyright (c) 2003 Fabrice Bellard
 *  Copyright (c) 2005-2007 CodeSourcery
 *  Copyright (c) 2007 OpenedHand, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "translate.h"

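/*
 * Expand a three-operand out-of-line helper that also needs the
 * cumulative saturation flag: pass a pointer to vfp.qc so the helper
 * can record saturation (QC) alongside the vector result.
 */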
static void gen_gvec_fn3_qc(uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs,
                            uint32_t opr_sz, uint32_t max_sz,
                            gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr qc_ptr = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(qc_ptr, tcg_env, offsetof(CPUARMState, vfp.qc));
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, qc_ptr,
                       opr_sz, max_sz, 0, fn);
}

void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_gvec_qrdmlah_s16, gen_helper_gvec_qrdmlah_s32
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_gvec_qrdmlsh_s16, gen_helper_gvec_qrdmlsh_s32
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

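/*
 * Compare-against-zero expanders: each NAME expands to a gvec compare
 * of every element against immediate zero under COND.
 */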
#define GEN_CMP0(NAME, COND)                              \
    void NAME(unsigned vece, uint32_t d, uint32_t m,      \
              uint32_t opr_sz, uint32_t max_sz)           \
    { tcg_gen_gvec_cmpi(COND, vece, d, m, 0, opr_sz, max_sz); }

GEN_CMP0(gen_gvec_ceq0, TCG_COND_EQ)
GEN_CMP0(gen_gvec_cle0, TCG_COND_LE)
GEN_CMP0(gen_gvec_cge0, TCG_COND_GE)
GEN_CMP0(gen_gvec_clt0, TCG_COND_LT)
GEN_CMP0(gen_gvec_cgt0, TCG_COND_GT)

#undef GEN_CMP0

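/*
 * SSRA: signed shift right and accumulate.  Each element is shifted
 * right arithmetically by the immediate and added to the destination.
 */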
static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_sar8i_i64(a, a, shift);
    tcg_gen_vec_add8_i64(d, d, a);
}

static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_sar16i_i64(a, a, shift);
    tcg_gen_vec_add16_i64(d, d, a);
}

static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_sari_i32(a, a, shift);
    tcg_gen_add_i32(d, d, a);
}

static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_sari_i64(a, a, shift);
    tcg_gen_add_i64(d, d, a);
}

static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    tcg_gen_sari_vec(vece, a, a, sh);
    tcg_gen_add_vec(vece, d, d, a);
}

void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_ssra8_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_ssra16_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_ssra32_i32,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_ssra64_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * A signed shift of that size fills the element with sign bits,
     * so clamp the shift to esize - 1; the result is the same.
     */
    shift = MIN(shift, (8 << vece) - 1);
    tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
}

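/*
 * USRA: unsigned shift right and accumulate.  Each element is shifted
 * right logically by the immediate and added to the destination.
 */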
static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_shr8i_i64(a, a, shift);
    tcg_gen_vec_add8_i64(d, d, a);
}

static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_shr16i_i64(a, a, shift);
    tcg_gen_vec_add16_i64(d, d, a);
}

static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_shri_i32(a, a, shift);
    tcg_gen_add_i32(d, d, a);
}

static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_shri_i64(a, a, shift);
    tcg_gen_add_i64(d, d, a);
}

static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    tcg_gen_shri_vec(vece, a, a, sh);
    tcg_gen_add_vec(vece, d, d, a);
}

void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_usra8_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8, },
        { .fni8 = gen_usra16_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16, },
        { .fni4 = gen_usra32_i32,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32, },
        { .fni8 = gen_usra64_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64, },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * An unsigned shift of that size yields all zeros, so the
     * accumulation adds nothing: treat it as a nop.
     */
    if (shift < (8 << vece)) {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    } else {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    }
}

/*
 * Shift one less than the requested amount, and the low bit is
 * the rounding bit.  For the 8 and 16-bit operations, because we
 * mask the low bit, we can perform a normal integer shift instead
 * of a vector shift.
 */
static void gen_srshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sar8i_i64(d, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sar16i_i64(d, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t;

    /* Handle shift by the input size for the benefit of trans_SRSHR_ri */
    if (sh == 32) {
        tcg_gen_movi_i32(d, 0);
        return;
    }
    t = tcg_temp_new_i32();
    tcg_gen_extract_i32(t, a, sh - 1, 1);
    tcg_gen_sari_i32(d, a, sh);
    tcg_gen_add_i32(d, d, t);
}

void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_extract_i64(t, a, sh - 1, 1);
    tcg_gen_sari_i64(d, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec ones = tcg_temp_new_vec_matching(d);

    tcg_gen_shri_vec(vece, t, a, sh - 1);
    tcg_gen_dupi_vec(vece, ones, 1);
    tcg_gen_and_vec(vece, t, t, ones);
    tcg_gen_sari_vec(vece, d, a, sh);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_srshr8_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_srshr16_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_srshr32_i32,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_srshr64_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    if (shift == (8 << vece)) {
        /*
         * Shifts larger than the element size are architecturally valid.
         * A signed shift of that size yields all sign bits, i.e. -1 or 0;
         * with rounding this produces (-1 + 1) >> 1 == 0 or
         * (0 + 1) >> 1 == 0.  The result is always zero.
         */
        tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

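/*
 * SRSRA: signed rounding shift right and accumulate.  Compute the
 * rounded shift into a temporary and add it to the destination.
 */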
static void gen_srsra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr8_i64(t, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srsra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr16_i64(t, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_srsra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t = tcg_temp_new_i32();

    gen_srshr32_i32(t, a, sh);
    tcg_gen_add_i32(d, d, t);
}

static void gen_srsra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr64_i64(t, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_srsra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    gen_srshr_vec(vece, t, a, sh);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_srsra8_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fni8 = gen_srsra16_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_srsra32_i32,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_srsra64_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

414      * Shifts larger than the element size are architecturally valid.
415      * Signed results in all sign bits.  With rounding, this produces
416      *   (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
417      * I.e. always zero.  With accumulation, this leaves D unchanged.
418      */
419     if (shift == (8 << vece)) {
420         /* Nop, but we do need to clear the tail. */
421         tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
422     } else {
423         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
424     }
425 }
426 
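/*
 * URSHR: unsigned rounding shift right, using the same
 * shift-by-one-less rounding-bit trick as the signed case above.
 */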
static void gen_urshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_shr8i_i64(d, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_urshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_shr16i_i64(d, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

void gen_urshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t;

    /* Handle shift by the input size for the benefit of trans_URSHR_ri */
    if (sh == 32) {
        tcg_gen_extract_i32(d, a, sh - 1, 1);
        return;
    }
    t = tcg_temp_new_i32();
    tcg_gen_extract_i32(t, a, sh - 1, 1);
    tcg_gen_shri_i32(d, a, sh);
    tcg_gen_add_i32(d, d, t);
}

void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_extract_i64(t, a, sh - 1, 1);
    tcg_gen_shri_i64(d, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec ones = tcg_temp_new_vec_matching(d);

    tcg_gen_shri_vec(vece, t, a, shift - 1);
    tcg_gen_dupi_vec(vece, ones, 1);
    tcg_gen_and_vec(vece, t, t, ones);
    tcg_gen_shri_vec(vece, d, a, shift);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_urshr8_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_urshr16_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_urshr32_i32,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_urshr64_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

518         /*
519          * Shifts larger than the element size are architecturally valid.
520          * Unsigned results in zero.  With rounding, this produces a
521          * copy of the most significant bit.
522          */
523         tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift - 1, opr_sz, max_sz);
524     } else {
525         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
526     }
527 }
528 
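/*
 * URSRA: unsigned rounding shift right and accumulate.  A shift by the
 * full element size leaves only the rounding bit (the element's msb),
 * so handle that case with a plain shift before accumulating.
 */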
static void gen_ursra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 8) {
        tcg_gen_vec_shr8i_i64(t, a, 7);
    } else {
        gen_urshr8_i64(t, a, sh);
    }
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_ursra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 16) {
        tcg_gen_vec_shr16i_i64(t, a, 15);
    } else {
        gen_urshr16_i64(t, a, sh);
    }
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_ursra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t = tcg_temp_new_i32();

    if (sh == 32) {
        tcg_gen_shri_i32(t, a, 31);
    } else {
        gen_urshr32_i32(t, a, sh);
    }
    tcg_gen_add_i32(d, d, t);
}

static void gen_ursra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 64) {
        tcg_gen_shri_i64(t, a, 63);
    } else {
        gen_urshr64_i64(t, a, sh);
    }
    tcg_gen_add_i64(d, d, t);
}

static void gen_ursra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    if (sh == (8 << vece)) {
        tcg_gen_shri_vec(vece, t, a, sh - 1);
    } else {
        gen_urshr_vec(vece, t, a, sh);
    }
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_ursra8_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fni8 = gen_ursra16_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_ursra32_i32,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_ursra64_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
}

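/*
 * SRI: shift right and insert.  The shifted value replaces only the
 * low (esize - shift) bits of each destination element; the top
 * shift bits of the destination are preserved.
 */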
static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_8, 0xff >> shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_16, 0xffff >> shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_shri_i32(a, a, shift);
    tcg_gen_deposit_i32(d, d, a, 0, 32 - shift);
}

static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_shri_i64(a, a, shift);
    tcg_gen_deposit_i64(d, d, a, 0, 64 - shift);
}

static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec m = tcg_temp_new_vec_matching(d);

    tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK((8 << vece) - sh, sh));
    tcg_gen_shri_vec(vece, t, a, sh);
    tcg_gen_and_vec(vece, d, d, m);
    tcg_gen_or_vec(vece, d, d, t);
}

void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                  int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
    const GVecGen2i ops[4] = {
        { .fni8 = gen_shr8_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shr16_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shr32_ins_i32,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_shr64_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /* Shift of esize leaves destination unchanged. */
    if (shift < (8 << vece)) {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    } else {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    }
}

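/*
 * SLI: shift left and insert.  The shifted value replaces the high
 * (esize - shift) bits of each destination element; the low shift
 * bits of the destination are preserved.
 */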
static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_8, 0xff << shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shli_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_16, 0xffff << shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shli_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_deposit_i32(d, d, a, shift, 32 - shift);
}

static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_deposit_i64(d, d, a, shift, 64 - shift);
}

static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec m = tcg_temp_new_vec_matching(d);

    tcg_gen_shli_vec(vece, t, a, sh);
    tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK(0, sh));
    tcg_gen_and_vec(vece, d, d, m);
    tcg_gen_or_vec(vece, d, d, t);
}

void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                  int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
    const GVecGen2i ops[4] = {
        { .fni8 = gen_shl8_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shl16_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shl32_ins_i32,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_shl64_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [0..esize-1]. */
    tcg_debug_assert(shift >= 0);
    tcg_debug_assert(shift < (8 << vece));

    if (shift == 0) {
        tcg_gen_gvec_mov(vece, rd_ofs, rm_ofs, opr_sz, max_sz);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

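/*
 * MLA/MLS: multiply and accumulate into / subtract from the
 * destination.  These expanders clobber the 'a' temporary with the
 * product before combining it with 'd'.
 */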
static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u8(a, a, b);
    gen_helper_neon_add_u8(d, d, a);
}

static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u8(a, a, b);
    gen_helper_neon_sub_u8(d, d, a);
}

static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u16(a, a, b);
    gen_helper_neon_add_u16(d, d, a);
}

static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u16(a, a, b);
    gen_helper_neon_sub_u16(d, d, a);
}

static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_mul_i32(a, a, b);
    tcg_gen_add_i32(d, d, a);
}

static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_mul_i32(a, a, b);
    tcg_gen_sub_i32(d, d, a);
}

static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_mul_i64(a, a, b);
    tcg_gen_add_i64(d, d, a);
}

static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_mul_i64(a, a, b);
    tcg_gen_sub_i64(d, d, a);
}

static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mul_vec(vece, a, a, b);
    tcg_gen_add_vec(vece, d, d, a);
}

static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mul_vec(vece, a, a, b);
    tcg_gen_sub_vec(vece, d, d, a);
}

/* Note that while NEON does not support VMLA and VMLS as 64-bit ops,
 * these tables are shared with AArch64 which does support them.
 */
void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                  uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_mul_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_mla8_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_mla16_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_mla32_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_mla64_i64,
          .fniv = gen_mla_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                  uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_mul_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_mls8_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_mls16_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_mls32_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_mls64_i64,
          .fniv = gen_mls_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

/* CMTST: the test is "(X & Y) != 0", giving all-ones per element when true. */
static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_and_i32(d, a, b);
    tcg_gen_negsetcond_i32(TCG_COND_NE, d, d, tcg_constant_i32(0));
}

void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_and_i64(d, a, b);
    tcg_gen_negsetcond_i64(TCG_COND_NE, d, d, tcg_constant_i64(0));
}

static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_and_vec(vece, d, a, b);
    tcg_gen_dupi_vec(vece, a, 0);
    tcg_gen_cmp_vec(TCG_COND_NE, vece, d, d, a);
}

void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_cmp_vec, 0 };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_helper_neon_tst_u8,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_helper_neon_tst_u16,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_cmtst_i32,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_cmtst_i64,
          .fniv = gen_cmtst_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

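/*
 * USHL: variable shift where the low byte of each shift element is a
 * signed count; positive counts shift left, negative counts shift
 * right logically, and counts of magnitude >= esize yield zero.
 */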
void gen_ushl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
{
    TCGv_i32 lval = tcg_temp_new_i32();
    TCGv_i32 rval = tcg_temp_new_i32();
    TCGv_i32 lsh = tcg_temp_new_i32();
    TCGv_i32 rsh = tcg_temp_new_i32();
    TCGv_i32 zero = tcg_constant_i32(0);
    TCGv_i32 max = tcg_constant_i32(32);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i32(lsh, shift);
    tcg_gen_neg_i32(rsh, lsh);
    tcg_gen_shl_i32(lval, src, lsh);
    tcg_gen_shr_i32(rval, src, rsh);
    tcg_gen_movcond_i32(TCG_COND_LTU, dst, lsh, max, lval, zero);
    tcg_gen_movcond_i32(TCG_COND_LTU, dst, rsh, max, rval, dst);
}

void gen_ushl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
{
    TCGv_i64 lval = tcg_temp_new_i64();
    TCGv_i64 rval = tcg_temp_new_i64();
    TCGv_i64 lsh = tcg_temp_new_i64();
    TCGv_i64 rsh = tcg_temp_new_i64();
    TCGv_i64 zero = tcg_constant_i64(0);
    TCGv_i64 max = tcg_constant_i64(64);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i64(lsh, shift);
    tcg_gen_neg_i64(rsh, lsh);
    tcg_gen_shl_i64(lval, src, lsh);
    tcg_gen_shr_i64(rval, src, rsh);
    tcg_gen_movcond_i64(TCG_COND_LTU, dst, lsh, max, lval, zero);
    tcg_gen_movcond_i64(TCG_COND_LTU, dst, rsh, max, rval, dst);
}

static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
                         TCGv_vec src, TCGv_vec shift)
{
    TCGv_vec lval = tcg_temp_new_vec_matching(dst);
    TCGv_vec rval = tcg_temp_new_vec_matching(dst);
    TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec msk, max;

    tcg_gen_neg_vec(vece, rsh, shift);
    if (vece == MO_8) {
        tcg_gen_mov_vec(lsh, shift);
    } else {
        msk = tcg_temp_new_vec_matching(dst);
        tcg_gen_dupi_vec(vece, msk, 0xff);
        tcg_gen_and_vec(vece, lsh, shift, msk);
        tcg_gen_and_vec(vece, rsh, rsh, msk);
    }

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_shlv_vec(vece, lval, src, lsh);
    tcg_gen_shrv_vec(vece, rval, src, rsh);

    max = tcg_temp_new_vec_matching(dst);
    tcg_gen_dupi_vec(vece, max, 8 << vece);

    /*
     * The choice of LT (signed) and GEU (unsigned) is biased toward
     * the instructions of the x86_64 host.  For MO_8, the whole byte
     * is significant so we must use an unsigned compare; otherwise we
     * have already masked to a byte and so a signed compare works.
     * Other tcg hosts have a full set of comparisons and do not care.
     */
    if (vece == MO_8) {
        tcg_gen_cmp_vec(TCG_COND_GEU, vece, lsh, lsh, max);
        tcg_gen_cmp_vec(TCG_COND_GEU, vece, rsh, rsh, max);
        tcg_gen_andc_vec(vece, lval, lval, lsh);
        tcg_gen_andc_vec(vece, rval, rval, rsh);
    } else {
        tcg_gen_cmp_vec(TCG_COND_LT, vece, lsh, lsh, max);
        tcg_gen_cmp_vec(TCG_COND_LT, vece, rsh, rsh, max);
        tcg_gen_and_vec(vece, lval, lval, lsh);
        tcg_gen_and_vec(vece, rval, rval, rsh);
    }
    tcg_gen_or_vec(vece, dst, lval, rval);
}

void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_neg_vec, INDEX_op_shlv_vec,
        INDEX_op_shrv_vec, INDEX_op_cmp_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_ushl_vec,
          .fno = gen_helper_gvec_ushl_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_ushl_vec,
          .fno = gen_helper_gvec_ushl_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_ushl_i32,
          .fniv = gen_ushl_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_ushl_i64,
          .fniv = gen_ushl_vec,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

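/*
 * SSHL: as USHL, but negative counts perform an arithmetic shift
 * right; a right shift of esize or more is clamped to esize - 1 so
 * it yields all copies of the sign bit, and a left shift of esize
 * or more yields zero.
 */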
void gen_sshl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
{
    TCGv_i32 lval = tcg_temp_new_i32();
    TCGv_i32 rval = tcg_temp_new_i32();
    TCGv_i32 lsh = tcg_temp_new_i32();
    TCGv_i32 rsh = tcg_temp_new_i32();
    TCGv_i32 zero = tcg_constant_i32(0);
    TCGv_i32 max = tcg_constant_i32(31);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i32(lsh, shift);
    tcg_gen_neg_i32(rsh, lsh);
    tcg_gen_shl_i32(lval, src, lsh);
    tcg_gen_umin_i32(rsh, rsh, max);
    tcg_gen_sar_i32(rval, src, rsh);
    tcg_gen_movcond_i32(TCG_COND_LEU, lval, lsh, max, lval, zero);
    tcg_gen_movcond_i32(TCG_COND_LT, dst, lsh, zero, rval, lval);
}

void gen_sshl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
{
    TCGv_i64 lval = tcg_temp_new_i64();
    TCGv_i64 rval = tcg_temp_new_i64();
    TCGv_i64 lsh = tcg_temp_new_i64();
    TCGv_i64 rsh = tcg_temp_new_i64();
    TCGv_i64 zero = tcg_constant_i64(0);
    TCGv_i64 max = tcg_constant_i64(63);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i64(lsh, shift);
    tcg_gen_neg_i64(rsh, lsh);
    tcg_gen_shl_i64(lval, src, lsh);
    tcg_gen_umin_i64(rsh, rsh, max);
    tcg_gen_sar_i64(rval, src, rsh);
    tcg_gen_movcond_i64(TCG_COND_LEU, lval, lsh, max, lval, zero);
    tcg_gen_movcond_i64(TCG_COND_LT, dst, lsh, zero, rval, lval);
}

static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
                         TCGv_vec src, TCGv_vec shift)
{
    TCGv_vec lval = tcg_temp_new_vec_matching(dst);
    TCGv_vec rval = tcg_temp_new_vec_matching(dst);
    TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec tmp = tcg_temp_new_vec_matching(dst);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_neg_vec(vece, rsh, shift);
    if (vece == MO_8) {
        tcg_gen_mov_vec(lsh, shift);
    } else {
        tcg_gen_dupi_vec(vece, tmp, 0xff);
        tcg_gen_and_vec(vece, lsh, shift, tmp);
        tcg_gen_and_vec(vece, rsh, rsh, tmp);
    }

    /* Bound rsh so out of bound right shift gets -1.  */
    tcg_gen_dupi_vec(vece, tmp, (8 << vece) - 1);
    tcg_gen_umin_vec(vece, rsh, rsh, tmp);
    tcg_gen_cmp_vec(TCG_COND_GT, vece, tmp, lsh, tmp);

    tcg_gen_shlv_vec(vece, lval, src, lsh);
    tcg_gen_sarv_vec(vece, rval, src, rsh);

    /* Select in-bound left shift.  */
    tcg_gen_andc_vec(vece, lval, lval, tmp);

    /* Select between left and right shift.  */
    if (vece == MO_8) {
        tcg_gen_dupi_vec(vece, tmp, 0);
        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, rval, lval);
    } else {
        tcg_gen_dupi_vec(vece, tmp, 0x80);
        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, lval, rval);
    }
}

void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec,
        INDEX_op_sarv_vec, INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_sshl_vec,
          .fno = gen_helper_gvec_sshl_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_sshl_vec,
          .fno = gen_helper_gvec_sshl_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_sshl_i32,
          .fniv = gen_sshl_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_sshl_i64,
          .fniv = gen_sshl_vec,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

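/*
 * Saturating add/sub with QC: compute both the wrapping and the
 * saturating result, compare them, and OR the mismatch mask into
 * the QC (cumulative saturation) flag.
 */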
static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec sat,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_add_vec(vece, x, a, b);
    tcg_gen_usadd_vec(vece, t, a, b);
    tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t);
    tcg_gen_or_vec(vece, sat, sat, x);
}

void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_usadd_vec, INDEX_op_cmp_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_b,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_h,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_s,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_d,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec sat,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_add_vec(vece, x, a, b);
    tcg_gen_ssadd_vec(vece, t, a, b);
    tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t);
    tcg_gen_or_vec(vece, sat, sat, x);
}

void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_ssadd_vec, INDEX_op_cmp_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec sat,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_sub_vec(vece, x, a, b);
    tcg_gen_ussub_vec(vece, t, a, b);
    tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t);
    tcg_gen_or_vec(vece, sat, sat, x);
}

void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_ussub_vec, INDEX_op_cmp_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec sat,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_sub_vec(vece, x, a, b);
    tcg_gen_sssub_vec(vece, t, a, b);
    tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t);
    tcg_gen_or_vec(vece, sat, sat, x);
}

void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sssub_vec, INDEX_op_cmp_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

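/*
 * SABD/UABD: absolute difference, computed as max(a, b) - min(a, b)
 * for vectors and via a movcond on the two subtraction orders for
 * the scalar expanders.
 */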
static void gen_sabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_sub_i32(t, a, b);
    tcg_gen_sub_i32(d, b, a);
    tcg_gen_movcond_i32(TCG_COND_LT, d, a, b, d, t);
}

static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_sub_i64(d, b, a);
    tcg_gen_movcond_i64(TCG_COND_LT, d, a, b, d, t);
}

static void gen_sabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_smin_vec(vece, t, a, b);
    tcg_gen_smax_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_sabd_i32,
          .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_sabd_i64,
          .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_uabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_sub_i32(t, a, b);
    tcg_gen_sub_i32(d, b, a);
    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, d, t);
}

static void gen_uabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_sub_i64(d, b, a);
    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, d, t);
}

static void gen_uabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_umin_vec(vece, t, a, b);
    tcg_gen_umax_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_umin_vec, INDEX_op_umax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uabd_i32,
          .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_uabd_i64,
          .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

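/* SABA/UABA: absolute difference and accumulate into the destination. */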
static void gen_saba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();
    gen_sabd_i32(t, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_saba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();
    gen_sabd_i64(t, a, b);
    tcg_gen_add_i64(d, d, t);
}

static void gen_saba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    gen_sabd_vec(vece, t, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_add_vec,
        INDEX_op_smin_vec, INDEX_op_smax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_saba_i32,
          .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_saba_i64,
          .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_uaba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();
    gen_uabd_i32(t, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();
    gen_uabd_i64(t, a, b);
    tcg_gen_add_i64(d, d, t);
}

static void gen_uaba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    gen_uabd_vec(vece, t, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_add_vec,
        INDEX_op_umin_vec, INDEX_op_umax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_uaba_i32,
          .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_uaba_i64,
          .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

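/*
 * Pairwise operations (ADDP, SMAXP, SMINP, UMAXP, UMINP) have no
 * inline TCG vector expansion; they always use out-of-line helpers.
 * The max/min forms have no 64-bit element variant.
 */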
void gen_gvec_addp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_addp_b,
        gen_helper_gvec_addp_h,
        gen_helper_gvec_addp_s,
        gen_helper_gvec_addp_d,
    };
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_smaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_smaxp_b,
        gen_helper_gvec_smaxp_h,
        gen_helper_gvec_smaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_sminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_sminp_b,
        gen_helper_gvec_sminp_h,
        gen_helper_gvec_sminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_umaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_umaxp_b,
        gen_helper_gvec_umaxp_h,
        gen_helper_gvec_umaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_uminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_uminp_b,
        gen_helper_gvec_uminp_h,
        gen_helper_gvec_uminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}