xref: /openbmc/qemu/target/arm/tcg/gengvec.c (revision 8989b95e71dea9292bab77477949cc1a385c9543)
1 /*
2  *  ARM generic vector expansion
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *  Copyright (c) 2005-2007 CodeSourcery
6  *  Copyright (c) 2007 OpenedHand, Ltd.
7  *
8  * This library is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * This library is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20  */
21 
22 #include "qemu/osdep.h"
23 #include "translate.h"
24 
25 
26 static void gen_gvec_fn3_qc(uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs,
27                             uint32_t opr_sz, uint32_t max_sz,
28                             gen_helper_gvec_3_ptr *fn)
29 {
30     TCGv_ptr qc_ptr = tcg_temp_new_ptr();
31 
32     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
33     tcg_gen_addi_ptr(qc_ptr, tcg_env, offsetof(CPUARMState, vfp.qc));
34     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, qc_ptr,
35                        opr_sz, max_sz, 0, fn);
36 }
37 
38 void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
39                           uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
40 {
41     static gen_helper_gvec_3_ptr * const fns[2] = {
42         gen_helper_gvec_qrdmlah_s16, gen_helper_gvec_qrdmlah_s32
43     };
44     tcg_debug_assert(vece >= 1 && vece <= 2);
45     gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
46 }
47 
48 void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
49                           uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
50 {
51     static gen_helper_gvec_3_ptr * const fns[2] = {
52         gen_helper_gvec_qrdmlsh_s16, gen_helper_gvec_qrdmlsh_s32
53     };
54     tcg_debug_assert(vece >= 1 && vece <= 2);
55     gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
56 }
57 
58 #define GEN_CMP0(NAME, COND)                              \
59     void NAME(unsigned vece, uint32_t d, uint32_t m,      \
60               uint32_t opr_sz, uint32_t max_sz)           \
61     { tcg_gen_gvec_cmpi(COND, vece, d, m, 0, opr_sz, max_sz); }
62 
63 GEN_CMP0(gen_gvec_ceq0, TCG_COND_EQ)
64 GEN_CMP0(gen_gvec_cle0, TCG_COND_LE)
65 GEN_CMP0(gen_gvec_cge0, TCG_COND_GE)
66 GEN_CMP0(gen_gvec_clt0, TCG_COND_LT)
67 GEN_CMP0(gen_gvec_cgt0, TCG_COND_GT)
68 
69 #undef GEN_CMP0
70 
71 static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
72 {
73     tcg_gen_vec_sar8i_i64(a, a, shift);
74     tcg_gen_vec_add8_i64(d, d, a);
75 }
76 
77 static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
78 {
79     tcg_gen_vec_sar16i_i64(a, a, shift);
80     tcg_gen_vec_add16_i64(d, d, a);
81 }
82 
83 static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
84 {
85     tcg_gen_sari_i32(a, a, shift);
86     tcg_gen_add_i32(d, d, a);
87 }
88 
89 static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
90 {
91     tcg_gen_sari_i64(a, a, shift);
92     tcg_gen_add_i64(d, d, a);
93 }
94 
95 static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
96 {
97     tcg_gen_sari_vec(vece, a, a, sh);
98     tcg_gen_add_vec(vece, d, d, a);
99 }
100 
101 void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
102                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
103 {
104     static const TCGOpcode vecop_list[] = {
105         INDEX_op_sari_vec, INDEX_op_add_vec, 0
106     };
107     static const GVecGen2i ops[4] = {
108         { .fni8 = gen_ssra8_i64,
109           .fniv = gen_ssra_vec,
110           .fno = gen_helper_gvec_ssra_b,
111           .load_dest = true,
112           .opt_opc = vecop_list,
113           .vece = MO_8 },
114         { .fni8 = gen_ssra16_i64,
115           .fniv = gen_ssra_vec,
116           .fno = gen_helper_gvec_ssra_h,
117           .load_dest = true,
118           .opt_opc = vecop_list,
119           .vece = MO_16 },
120         { .fni4 = gen_ssra32_i32,
121           .fniv = gen_ssra_vec,
122           .fno = gen_helper_gvec_ssra_s,
123           .load_dest = true,
124           .opt_opc = vecop_list,
125           .vece = MO_32 },
126         { .fni8 = gen_ssra64_i64,
127           .fniv = gen_ssra_vec,
128           .fno = gen_helper_gvec_ssra_d,
129           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
130           .opt_opc = vecop_list,
131           .load_dest = true,
132           .vece = MO_64 },
133     };
134 
135     /* tszimm encoding produces immediates in the range [1..esize]. */
136     tcg_debug_assert(shift > 0);
137     tcg_debug_assert(shift <= (8 << vece));
138 
139     /*
140      * Shifts larger than the element size are architecturally valid.
141      * Signed results in all sign bits.
142      */
143     shift = MIN(shift, (8 << vece) - 1);
144     tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
145 }
146 
147 static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
148 {
149     tcg_gen_vec_shr8i_i64(a, a, shift);
150     tcg_gen_vec_add8_i64(d, d, a);
151 }
152 
153 static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
154 {
155     tcg_gen_vec_shr16i_i64(a, a, shift);
156     tcg_gen_vec_add16_i64(d, d, a);
157 }
158 
159 static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
160 {
161     tcg_gen_shri_i32(a, a, shift);
162     tcg_gen_add_i32(d, d, a);
163 }
164 
165 static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
166 {
167     tcg_gen_shri_i64(a, a, shift);
168     tcg_gen_add_i64(d, d, a);
169 }
170 
171 static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
172 {
173     tcg_gen_shri_vec(vece, a, a, sh);
174     tcg_gen_add_vec(vece, d, d, a);
175 }
176 
177 void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
178                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
179 {
180     static const TCGOpcode vecop_list[] = {
181         INDEX_op_shri_vec, INDEX_op_add_vec, 0
182     };
183     static const GVecGen2i ops[4] = {
184         { .fni8 = gen_usra8_i64,
185           .fniv = gen_usra_vec,
186           .fno = gen_helper_gvec_usra_b,
187           .load_dest = true,
188           .opt_opc = vecop_list,
189           .vece = MO_8, },
190         { .fni8 = gen_usra16_i64,
191           .fniv = gen_usra_vec,
192           .fno = gen_helper_gvec_usra_h,
193           .load_dest = true,
194           .opt_opc = vecop_list,
195           .vece = MO_16, },
196         { .fni4 = gen_usra32_i32,
197           .fniv = gen_usra_vec,
198           .fno = gen_helper_gvec_usra_s,
199           .load_dest = true,
200           .opt_opc = vecop_list,
201           .vece = MO_32, },
202         { .fni8 = gen_usra64_i64,
203           .fniv = gen_usra_vec,
204           .fno = gen_helper_gvec_usra_d,
205           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
206           .load_dest = true,
207           .opt_opc = vecop_list,
208           .vece = MO_64, },
209     };
210 
211     /* tszimm encoding produces immediates in the range [1..esize]. */
212     tcg_debug_assert(shift > 0);
213     tcg_debug_assert(shift <= (8 << vece));
214 
215     /*
216      * Shifts larger than the element size are architecturally valid.
217      * Unsigned results in all zeros as input to accumulate: nop.
218      */
219     if (shift < (8 << vece)) {
220         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
221     } else {
222         /* Nop, but we do need to clear the tail. */
223         tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
224     }
225 }
226 
227 /*
228  * Shift one less than the requested amount, and the low bit is
229  * the rounding bit.  For the 8 and 16-bit operations, because we
230  * mask the low bit, we can perform a normal integer shift instead
231  * of a vector shift.
232  */
233 static void gen_srshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
234 {
235     TCGv_i64 t = tcg_temp_new_i64();
236 
237     tcg_gen_shri_i64(t, a, sh - 1);
238     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
239     tcg_gen_vec_sar8i_i64(d, a, sh);
240     tcg_gen_vec_add8_i64(d, d, t);
241 }
242 
243 static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
244 {
245     TCGv_i64 t = tcg_temp_new_i64();
246 
247     tcg_gen_shri_i64(t, a, sh - 1);
248     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
249     tcg_gen_vec_sar16i_i64(d, a, sh);
250     tcg_gen_vec_add16_i64(d, d, t);
251 }
252 
253 void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
254 {
255     TCGv_i32 t;
256 
257     /* Handle shift by the input size for the benefit of trans_SRSHR_ri */
258     if (sh == 32) {
259         tcg_gen_movi_i32(d, 0);
260         return;
261     }
262     t = tcg_temp_new_i32();
263     tcg_gen_extract_i32(t, a, sh - 1, 1);
264     tcg_gen_sari_i32(d, a, sh);
265     tcg_gen_add_i32(d, d, t);
266 }
267 
268  void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
269 {
270     TCGv_i64 t = tcg_temp_new_i64();
271 
272     tcg_gen_extract_i64(t, a, sh - 1, 1);
273     tcg_gen_sari_i64(d, a, sh);
274     tcg_gen_add_i64(d, d, t);
275 }
276 
277 static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
278 {
279     TCGv_vec t = tcg_temp_new_vec_matching(d);
280     TCGv_vec ones = tcg_temp_new_vec_matching(d);
281 
282     tcg_gen_shri_vec(vece, t, a, sh - 1);
283     tcg_gen_dupi_vec(vece, ones, 1);
284     tcg_gen_and_vec(vece, t, t, ones);
285     tcg_gen_sari_vec(vece, d, a, sh);
286     tcg_gen_add_vec(vece, d, d, t);
287 }
288 
289 void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
290                     int64_t shift, uint32_t opr_sz, uint32_t max_sz)
291 {
292     static const TCGOpcode vecop_list[] = {
293         INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
294     };
295     static const GVecGen2i ops[4] = {
296         { .fni8 = gen_srshr8_i64,
297           .fniv = gen_srshr_vec,
298           .fno = gen_helper_gvec_srshr_b,
299           .opt_opc = vecop_list,
300           .vece = MO_8 },
301         { .fni8 = gen_srshr16_i64,
302           .fniv = gen_srshr_vec,
303           .fno = gen_helper_gvec_srshr_h,
304           .opt_opc = vecop_list,
305           .vece = MO_16 },
306         { .fni4 = gen_srshr32_i32,
307           .fniv = gen_srshr_vec,
308           .fno = gen_helper_gvec_srshr_s,
309           .opt_opc = vecop_list,
310           .vece = MO_32 },
311         { .fni8 = gen_srshr64_i64,
312           .fniv = gen_srshr_vec,
313           .fno = gen_helper_gvec_srshr_d,
314           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
315           .opt_opc = vecop_list,
316           .vece = MO_64 },
317     };
318 
319     /* tszimm encoding produces immediates in the range [1..esize] */
320     tcg_debug_assert(shift > 0);
321     tcg_debug_assert(shift <= (8 << vece));
322 
323     if (shift == (8 << vece)) {
324         /*
325          * Shifts larger than the element size are architecturally valid.
326          * Signed results in all sign bits.  With rounding, this produces
327          *   (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
328          * I.e. always zero.
329          */
330         tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
331     } else {
332         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
333     }
334 }
335 
336 static void gen_srsra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
337 {
338     TCGv_i64 t = tcg_temp_new_i64();
339 
340     gen_srshr8_i64(t, a, sh);
341     tcg_gen_vec_add8_i64(d, d, t);
342 }
343 
344 static void gen_srsra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
345 {
346     TCGv_i64 t = tcg_temp_new_i64();
347 
348     gen_srshr16_i64(t, a, sh);
349     tcg_gen_vec_add16_i64(d, d, t);
350 }
351 
352 static void gen_srsra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
353 {
354     TCGv_i32 t = tcg_temp_new_i32();
355 
356     gen_srshr32_i32(t, a, sh);
357     tcg_gen_add_i32(d, d, t);
358 }
359 
360 static void gen_srsra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
361 {
362     TCGv_i64 t = tcg_temp_new_i64();
363 
364     gen_srshr64_i64(t, a, sh);
365     tcg_gen_add_i64(d, d, t);
366 }
367 
368 static void gen_srsra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
369 {
370     TCGv_vec t = tcg_temp_new_vec_matching(d);
371 
372     gen_srshr_vec(vece, t, a, sh);
373     tcg_gen_add_vec(vece, d, d, t);
374 }
375 
376 void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
377                     int64_t shift, uint32_t opr_sz, uint32_t max_sz)
378 {
379     static const TCGOpcode vecop_list[] = {
380         INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
381     };
382     static const GVecGen2i ops[4] = {
383         { .fni8 = gen_srsra8_i64,
384           .fniv = gen_srsra_vec,
385           .fno = gen_helper_gvec_srsra_b,
386           .opt_opc = vecop_list,
387           .load_dest = true,
388           .vece = MO_8 },
389         { .fni8 = gen_srsra16_i64,
390           .fniv = gen_srsra_vec,
391           .fno = gen_helper_gvec_srsra_h,
392           .opt_opc = vecop_list,
393           .load_dest = true,
394           .vece = MO_16 },
395         { .fni4 = gen_srsra32_i32,
396           .fniv = gen_srsra_vec,
397           .fno = gen_helper_gvec_srsra_s,
398           .opt_opc = vecop_list,
399           .load_dest = true,
400           .vece = MO_32 },
401         { .fni8 = gen_srsra64_i64,
402           .fniv = gen_srsra_vec,
403           .fno = gen_helper_gvec_srsra_d,
404           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
405           .opt_opc = vecop_list,
406           .load_dest = true,
407           .vece = MO_64 },
408     };
409 
410     /* tszimm encoding produces immediates in the range [1..esize] */
411     tcg_debug_assert(shift > 0);
412     tcg_debug_assert(shift <= (8 << vece));
413 
414     /*
415      * Shifts larger than the element size are architecturally valid.
416      * Signed results in all sign bits.  With rounding, this produces
417      *   (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
418      * I.e. always zero.  With accumulation, this leaves D unchanged.
419      */
420     if (shift == (8 << vece)) {
421         /* Nop, but we do need to clear the tail. */
422         tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
423     } else {
424         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
425     }
426 }
427 
428 static void gen_urshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
429 {
430     TCGv_i64 t = tcg_temp_new_i64();
431 
432     tcg_gen_shri_i64(t, a, sh - 1);
433     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
434     tcg_gen_vec_shr8i_i64(d, a, sh);
435     tcg_gen_vec_add8_i64(d, d, t);
436 }
437 
438 static void gen_urshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
439 {
440     TCGv_i64 t = tcg_temp_new_i64();
441 
442     tcg_gen_shri_i64(t, a, sh - 1);
443     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
444     tcg_gen_vec_shr16i_i64(d, a, sh);
445     tcg_gen_vec_add16_i64(d, d, t);
446 }
447 
448 void gen_urshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
449 {
450     TCGv_i32 t;
451 
452     /* Handle shift by the input size for the benefit of trans_URSHR_ri */
453     if (sh == 32) {
454         tcg_gen_extract_i32(d, a, sh - 1, 1);
455         return;
456     }
457     t = tcg_temp_new_i32();
458     tcg_gen_extract_i32(t, a, sh - 1, 1);
459     tcg_gen_shri_i32(d, a, sh);
460     tcg_gen_add_i32(d, d, t);
461 }
462 
463 void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
464 {
465     TCGv_i64 t = tcg_temp_new_i64();
466 
467     tcg_gen_extract_i64(t, a, sh - 1, 1);
468     tcg_gen_shri_i64(d, a, sh);
469     tcg_gen_add_i64(d, d, t);
470 }
471 
472 static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift)
473 {
474     TCGv_vec t = tcg_temp_new_vec_matching(d);
475     TCGv_vec ones = tcg_temp_new_vec_matching(d);
476 
477     tcg_gen_shri_vec(vece, t, a, shift - 1);
478     tcg_gen_dupi_vec(vece, ones, 1);
479     tcg_gen_and_vec(vece, t, t, ones);
480     tcg_gen_shri_vec(vece, d, a, shift);
481     tcg_gen_add_vec(vece, d, d, t);
482 }
483 
484 void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
485                     int64_t shift, uint32_t opr_sz, uint32_t max_sz)
486 {
487     static const TCGOpcode vecop_list[] = {
488         INDEX_op_shri_vec, INDEX_op_add_vec, 0
489     };
490     static const GVecGen2i ops[4] = {
491         { .fni8 = gen_urshr8_i64,
492           .fniv = gen_urshr_vec,
493           .fno = gen_helper_gvec_urshr_b,
494           .opt_opc = vecop_list,
495           .vece = MO_8 },
496         { .fni8 = gen_urshr16_i64,
497           .fniv = gen_urshr_vec,
498           .fno = gen_helper_gvec_urshr_h,
499           .opt_opc = vecop_list,
500           .vece = MO_16 },
501         { .fni4 = gen_urshr32_i32,
502           .fniv = gen_urshr_vec,
503           .fno = gen_helper_gvec_urshr_s,
504           .opt_opc = vecop_list,
505           .vece = MO_32 },
506         { .fni8 = gen_urshr64_i64,
507           .fniv = gen_urshr_vec,
508           .fno = gen_helper_gvec_urshr_d,
509           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
510           .opt_opc = vecop_list,
511           .vece = MO_64 },
512     };
513 
514     /* tszimm encoding produces immediates in the range [1..esize] */
515     tcg_debug_assert(shift > 0);
516     tcg_debug_assert(shift <= (8 << vece));
517 
518     if (shift == (8 << vece)) {
519         /*
520          * Shifts larger than the element size are architecturally valid.
521          * Unsigned results in zero.  With rounding, this produces a
522          * copy of the most significant bit.
523          */
524         tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift - 1, opr_sz, max_sz);
525     } else {
526         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
527     }
528 }
529 
530 static void gen_ursra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
531 {
532     TCGv_i64 t = tcg_temp_new_i64();
533 
534     if (sh == 8) {
535         tcg_gen_vec_shr8i_i64(t, a, 7);
536     } else {
537         gen_urshr8_i64(t, a, sh);
538     }
539     tcg_gen_vec_add8_i64(d, d, t);
540 }
541 
542 static void gen_ursra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
543 {
544     TCGv_i64 t = tcg_temp_new_i64();
545 
546     if (sh == 16) {
547         tcg_gen_vec_shr16i_i64(t, a, 15);
548     } else {
549         gen_urshr16_i64(t, a, sh);
550     }
551     tcg_gen_vec_add16_i64(d, d, t);
552 }
553 
554 static void gen_ursra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
555 {
556     TCGv_i32 t = tcg_temp_new_i32();
557 
558     if (sh == 32) {
559         tcg_gen_shri_i32(t, a, 31);
560     } else {
561         gen_urshr32_i32(t, a, sh);
562     }
563     tcg_gen_add_i32(d, d, t);
564 }
565 
566 static void gen_ursra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
567 {
568     TCGv_i64 t = tcg_temp_new_i64();
569 
570     if (sh == 64) {
571         tcg_gen_shri_i64(t, a, 63);
572     } else {
573         gen_urshr64_i64(t, a, sh);
574     }
575     tcg_gen_add_i64(d, d, t);
576 }
577 
578 static void gen_ursra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
579 {
580     TCGv_vec t = tcg_temp_new_vec_matching(d);
581 
582     if (sh == (8 << vece)) {
583         tcg_gen_shri_vec(vece, t, a, sh - 1);
584     } else {
585         gen_urshr_vec(vece, t, a, sh);
586     }
587     tcg_gen_add_vec(vece, d, d, t);
588 }
589 
590 void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
591                     int64_t shift, uint32_t opr_sz, uint32_t max_sz)
592 {
593     static const TCGOpcode vecop_list[] = {
594         INDEX_op_shri_vec, INDEX_op_add_vec, 0
595     };
596     static const GVecGen2i ops[4] = {
597         { .fni8 = gen_ursra8_i64,
598           .fniv = gen_ursra_vec,
599           .fno = gen_helper_gvec_ursra_b,
600           .opt_opc = vecop_list,
601           .load_dest = true,
602           .vece = MO_8 },
603         { .fni8 = gen_ursra16_i64,
604           .fniv = gen_ursra_vec,
605           .fno = gen_helper_gvec_ursra_h,
606           .opt_opc = vecop_list,
607           .load_dest = true,
608           .vece = MO_16 },
609         { .fni4 = gen_ursra32_i32,
610           .fniv = gen_ursra_vec,
611           .fno = gen_helper_gvec_ursra_s,
612           .opt_opc = vecop_list,
613           .load_dest = true,
614           .vece = MO_32 },
615         { .fni8 = gen_ursra64_i64,
616           .fniv = gen_ursra_vec,
617           .fno = gen_helper_gvec_ursra_d,
618           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
619           .opt_opc = vecop_list,
620           .load_dest = true,
621           .vece = MO_64 },
622     };
623 
624     /* tszimm encoding produces immediates in the range [1..esize] */
625     tcg_debug_assert(shift > 0);
626     tcg_debug_assert(shift <= (8 << vece));
627 
628     tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
629 }
630 
631 static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
632 {
633     uint64_t mask = dup_const(MO_8, 0xff >> shift);
634     TCGv_i64 t = tcg_temp_new_i64();
635 
636     tcg_gen_shri_i64(t, a, shift);
637     tcg_gen_andi_i64(t, t, mask);
638     tcg_gen_andi_i64(d, d, ~mask);
639     tcg_gen_or_i64(d, d, t);
640 }
641 
642 static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
643 {
644     uint64_t mask = dup_const(MO_16, 0xffff >> shift);
645     TCGv_i64 t = tcg_temp_new_i64();
646 
647     tcg_gen_shri_i64(t, a, shift);
648     tcg_gen_andi_i64(t, t, mask);
649     tcg_gen_andi_i64(d, d, ~mask);
650     tcg_gen_or_i64(d, d, t);
651 }
652 
653 static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
654 {
655     tcg_gen_shri_i32(a, a, shift);
656     tcg_gen_deposit_i32(d, d, a, 0, 32 - shift);
657 }
658 
659 static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
660 {
661     tcg_gen_shri_i64(a, a, shift);
662     tcg_gen_deposit_i64(d, d, a, 0, 64 - shift);
663 }
664 
665 static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
666 {
667     TCGv_vec t = tcg_temp_new_vec_matching(d);
668     TCGv_vec m = tcg_temp_new_vec_matching(d);
669 
670     tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK((8 << vece) - sh, sh));
671     tcg_gen_shri_vec(vece, t, a, sh);
672     tcg_gen_and_vec(vece, d, d, m);
673     tcg_gen_or_vec(vece, d, d, t);
674 }
675 
676 void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
677                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
678 {
679     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
680     const GVecGen2i ops[4] = {
681         { .fni8 = gen_shr8_ins_i64,
682           .fniv = gen_shr_ins_vec,
683           .fno = gen_helper_gvec_sri_b,
684           .load_dest = true,
685           .opt_opc = vecop_list,
686           .vece = MO_8 },
687         { .fni8 = gen_shr16_ins_i64,
688           .fniv = gen_shr_ins_vec,
689           .fno = gen_helper_gvec_sri_h,
690           .load_dest = true,
691           .opt_opc = vecop_list,
692           .vece = MO_16 },
693         { .fni4 = gen_shr32_ins_i32,
694           .fniv = gen_shr_ins_vec,
695           .fno = gen_helper_gvec_sri_s,
696           .load_dest = true,
697           .opt_opc = vecop_list,
698           .vece = MO_32 },
699         { .fni8 = gen_shr64_ins_i64,
700           .fniv = gen_shr_ins_vec,
701           .fno = gen_helper_gvec_sri_d,
702           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
703           .load_dest = true,
704           .opt_opc = vecop_list,
705           .vece = MO_64 },
706     };
707 
708     /* tszimm encoding produces immediates in the range [1..esize]. */
709     tcg_debug_assert(shift > 0);
710     tcg_debug_assert(shift <= (8 << vece));
711 
712     /* Shift of esize leaves destination unchanged. */
713     if (shift < (8 << vece)) {
714         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
715     } else {
716         /* Nop, but we do need to clear the tail. */
717         tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
718     }
719 }
720 
721 static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
722 {
723     uint64_t mask = dup_const(MO_8, 0xff << shift);
724     TCGv_i64 t = tcg_temp_new_i64();
725 
726     tcg_gen_shli_i64(t, a, shift);
727     tcg_gen_andi_i64(t, t, mask);
728     tcg_gen_andi_i64(d, d, ~mask);
729     tcg_gen_or_i64(d, d, t);
730 }
731 
732 static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
733 {
734     uint64_t mask = dup_const(MO_16, 0xffff << shift);
735     TCGv_i64 t = tcg_temp_new_i64();
736 
737     tcg_gen_shli_i64(t, a, shift);
738     tcg_gen_andi_i64(t, t, mask);
739     tcg_gen_andi_i64(d, d, ~mask);
740     tcg_gen_or_i64(d, d, t);
741 }
742 
743 static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
744 {
745     tcg_gen_deposit_i32(d, d, a, shift, 32 - shift);
746 }
747 
748 static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
749 {
750     tcg_gen_deposit_i64(d, d, a, shift, 64 - shift);
751 }
752 
753 static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
754 {
755     TCGv_vec t = tcg_temp_new_vec_matching(d);
756     TCGv_vec m = tcg_temp_new_vec_matching(d);
757 
758     tcg_gen_shli_vec(vece, t, a, sh);
759     tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK(0, sh));
760     tcg_gen_and_vec(vece, d, d, m);
761     tcg_gen_or_vec(vece, d, d, t);
762 }
763 
764 void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
765                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
766 {
767     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
768     const GVecGen2i ops[4] = {
769         { .fni8 = gen_shl8_ins_i64,
770           .fniv = gen_shl_ins_vec,
771           .fno = gen_helper_gvec_sli_b,
772           .load_dest = true,
773           .opt_opc = vecop_list,
774           .vece = MO_8 },
775         { .fni8 = gen_shl16_ins_i64,
776           .fniv = gen_shl_ins_vec,
777           .fno = gen_helper_gvec_sli_h,
778           .load_dest = true,
779           .opt_opc = vecop_list,
780           .vece = MO_16 },
781         { .fni4 = gen_shl32_ins_i32,
782           .fniv = gen_shl_ins_vec,
783           .fno = gen_helper_gvec_sli_s,
784           .load_dest = true,
785           .opt_opc = vecop_list,
786           .vece = MO_32 },
787         { .fni8 = gen_shl64_ins_i64,
788           .fniv = gen_shl_ins_vec,
789           .fno = gen_helper_gvec_sli_d,
790           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
791           .load_dest = true,
792           .opt_opc = vecop_list,
793           .vece = MO_64 },
794     };
795 
796     /* tszimm encoding produces immediates in the range [0..esize-1]. */
797     tcg_debug_assert(shift >= 0);
798     tcg_debug_assert(shift < (8 << vece));
799 
800     if (shift == 0) {
801         tcg_gen_gvec_mov(vece, rd_ofs, rm_ofs, opr_sz, max_sz);
802     } else {
803         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
804     }
805 }
806 
807 static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
808 {
809     gen_helper_neon_mul_u8(a, a, b);
810     gen_helper_neon_add_u8(d, d, a);
811 }
812 
813 static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
814 {
815     gen_helper_neon_mul_u8(a, a, b);
816     gen_helper_neon_sub_u8(d, d, a);
817 }
818 
819 static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
820 {
821     gen_helper_neon_mul_u16(a, a, b);
822     gen_helper_neon_add_u16(d, d, a);
823 }
824 
825 static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
826 {
827     gen_helper_neon_mul_u16(a, a, b);
828     gen_helper_neon_sub_u16(d, d, a);
829 }
830 
831 static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
832 {
833     tcg_gen_mul_i32(a, a, b);
834     tcg_gen_add_i32(d, d, a);
835 }
836 
837 static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
838 {
839     tcg_gen_mul_i32(a, a, b);
840     tcg_gen_sub_i32(d, d, a);
841 }
842 
843 static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
844 {
845     tcg_gen_mul_i64(a, a, b);
846     tcg_gen_add_i64(d, d, a);
847 }
848 
849 static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
850 {
851     tcg_gen_mul_i64(a, a, b);
852     tcg_gen_sub_i64(d, d, a);
853 }
854 
855 static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
856 {
857     tcg_gen_mul_vec(vece, a, a, b);
858     tcg_gen_add_vec(vece, d, d, a);
859 }
860 
861 static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
862 {
863     tcg_gen_mul_vec(vece, a, a, b);
864     tcg_gen_sub_vec(vece, d, d, a);
865 }
866 
867 /* Note that while NEON does not support VMLA and VMLS as 64-bit ops,
868  * these tables are shared with AArch64 which does support them.
869  */
870 void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
871                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
872 {
873     static const TCGOpcode vecop_list[] = {
874         INDEX_op_mul_vec, INDEX_op_add_vec, 0
875     };
876     static const GVecGen3 ops[4] = {
877         { .fni4 = gen_mla8_i32,
878           .fniv = gen_mla_vec,
879           .load_dest = true,
880           .opt_opc = vecop_list,
881           .vece = MO_8 },
882         { .fni4 = gen_mla16_i32,
883           .fniv = gen_mla_vec,
884           .load_dest = true,
885           .opt_opc = vecop_list,
886           .vece = MO_16 },
887         { .fni4 = gen_mla32_i32,
888           .fniv = gen_mla_vec,
889           .load_dest = true,
890           .opt_opc = vecop_list,
891           .vece = MO_32 },
892         { .fni8 = gen_mla64_i64,
893           .fniv = gen_mla_vec,
894           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
895           .load_dest = true,
896           .opt_opc = vecop_list,
897           .vece = MO_64 },
898     };
899     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
900 }
901 
902 void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
903                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
904 {
905     static const TCGOpcode vecop_list[] = {
906         INDEX_op_mul_vec, INDEX_op_sub_vec, 0
907     };
908     static const GVecGen3 ops[4] = {
909         { .fni4 = gen_mls8_i32,
910           .fniv = gen_mls_vec,
911           .load_dest = true,
912           .opt_opc = vecop_list,
913           .vece = MO_8 },
914         { .fni4 = gen_mls16_i32,
915           .fniv = gen_mls_vec,
916           .load_dest = true,
917           .opt_opc = vecop_list,
918           .vece = MO_16 },
919         { .fni4 = gen_mls32_i32,
920           .fniv = gen_mls_vec,
921           .load_dest = true,
922           .opt_opc = vecop_list,
923           .vece = MO_32 },
924         { .fni8 = gen_mls64_i64,
925           .fniv = gen_mls_vec,
926           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
927           .load_dest = true,
928           .opt_opc = vecop_list,
929           .vece = MO_64 },
930     };
931     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
932 }
933 
934 /* CMTST : test is "if (X & Y != 0)". */
935 static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
936 {
937     tcg_gen_negsetcond_i32(TCG_COND_TSTNE, d, a, b);
938 }
939 
940 void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
941 {
942     tcg_gen_negsetcond_i64(TCG_COND_TSTNE, d, a, b);
943 }
944 
945 static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
946 {
947     tcg_gen_cmp_vec(TCG_COND_TSTNE, vece, d, a, b);
948 }
949 
950 void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
951                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
952 {
953     static const TCGOpcode vecop_list[] = { INDEX_op_cmp_vec, 0 };
954     static const GVecGen3 ops[4] = {
955         { .fni4 = gen_helper_neon_tst_u8,
956           .fniv = gen_cmtst_vec,
957           .opt_opc = vecop_list,
958           .vece = MO_8 },
959         { .fni4 = gen_helper_neon_tst_u16,
960           .fniv = gen_cmtst_vec,
961           .opt_opc = vecop_list,
962           .vece = MO_16 },
963         { .fni4 = gen_cmtst_i32,
964           .fniv = gen_cmtst_vec,
965           .opt_opc = vecop_list,
966           .vece = MO_32 },
967         { .fni8 = gen_cmtst_i64,
968           .fniv = gen_cmtst_vec,
969           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
970           .opt_opc = vecop_list,
971           .vece = MO_64 },
972     };
973     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
974 }
975 
976 void gen_ushl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
977 {
978     TCGv_i32 lval = tcg_temp_new_i32();
979     TCGv_i32 rval = tcg_temp_new_i32();
980     TCGv_i32 lsh = tcg_temp_new_i32();
981     TCGv_i32 rsh = tcg_temp_new_i32();
982     TCGv_i32 zero = tcg_constant_i32(0);
983     TCGv_i32 max = tcg_constant_i32(32);
984 
985     /*
986      * Rely on the TCG guarantee that out of range shifts produce
987      * unspecified results, not undefined behaviour (i.e. no trap).
988      * Discard out-of-range results after the fact.
989      */
990     tcg_gen_ext8s_i32(lsh, shift);
991     tcg_gen_neg_i32(rsh, lsh);
992     tcg_gen_shl_i32(lval, src, lsh);
993     tcg_gen_shr_i32(rval, src, rsh);
994     tcg_gen_movcond_i32(TCG_COND_LTU, dst, lsh, max, lval, zero);
995     tcg_gen_movcond_i32(TCG_COND_LTU, dst, rsh, max, rval, dst);
996 }
997 
998 void gen_ushl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
999 {
1000     TCGv_i64 lval = tcg_temp_new_i64();
1001     TCGv_i64 rval = tcg_temp_new_i64();
1002     TCGv_i64 lsh = tcg_temp_new_i64();
1003     TCGv_i64 rsh = tcg_temp_new_i64();
1004     TCGv_i64 zero = tcg_constant_i64(0);
1005     TCGv_i64 max = tcg_constant_i64(64);
1006 
1007     /*
1008      * Rely on the TCG guarantee that out of range shifts produce
1009      * unspecified results, not undefined behaviour (i.e. no trap).
1010      * Discard out-of-range results after the fact.
1011      */
1012     tcg_gen_ext8s_i64(lsh, shift);
1013     tcg_gen_neg_i64(rsh, lsh);
1014     tcg_gen_shl_i64(lval, src, lsh);
1015     tcg_gen_shr_i64(rval, src, rsh);
1016     tcg_gen_movcond_i64(TCG_COND_LTU, dst, lsh, max, lval, zero);
1017     tcg_gen_movcond_i64(TCG_COND_LTU, dst, rsh, max, rval, dst);
1018 }
1019 
1020 static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
1021                          TCGv_vec src, TCGv_vec shift)
1022 {
1023     TCGv_vec lval = tcg_temp_new_vec_matching(dst);
1024     TCGv_vec rval = tcg_temp_new_vec_matching(dst);
1025     TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
1026     TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
1027     TCGv_vec msk, max;
1028 
1029     tcg_gen_neg_vec(vece, rsh, shift);
1030     if (vece == MO_8) {
1031         tcg_gen_mov_vec(lsh, shift);
1032     } else {
1033         msk = tcg_temp_new_vec_matching(dst);
1034         tcg_gen_dupi_vec(vece, msk, 0xff);
1035         tcg_gen_and_vec(vece, lsh, shift, msk);
1036         tcg_gen_and_vec(vece, rsh, rsh, msk);
1037     }
1038 
1039     /*
1040      * Rely on the TCG guarantee that out of range shifts produce
1041      * unspecified results, not undefined behaviour (i.e. no trap).
1042      * Discard out-of-range results after the fact.
1043      */
1044     tcg_gen_shlv_vec(vece, lval, src, lsh);
1045     tcg_gen_shrv_vec(vece, rval, src, rsh);
1046 
1047     max = tcg_temp_new_vec_matching(dst);
1048     tcg_gen_dupi_vec(vece, max, 8 << vece);
1049 
1050     /*
1051      * The choice of LT (signed) and GEU (unsigned) are biased toward
1052      * the instructions of the x86_64 host.  For MO_8, the whole byte
1053      * is significant so we must use an unsigned compare; otherwise we
1054      * have already masked to a byte and so a signed compare works.
1055      * Other tcg hosts have a full set of comparisons and do not care.
1056      */
1057     if (vece == MO_8) {
1058         tcg_gen_cmp_vec(TCG_COND_GEU, vece, lsh, lsh, max);
1059         tcg_gen_cmp_vec(TCG_COND_GEU, vece, rsh, rsh, max);
1060         tcg_gen_andc_vec(vece, lval, lval, lsh);
1061         tcg_gen_andc_vec(vece, rval, rval, rsh);
1062     } else {
1063         tcg_gen_cmp_vec(TCG_COND_LT, vece, lsh, lsh, max);
1064         tcg_gen_cmp_vec(TCG_COND_LT, vece, rsh, rsh, max);
1065         tcg_gen_and_vec(vece, lval, lval, lsh);
1066         tcg_gen_and_vec(vece, rval, rval, rsh);
1067     }
1068     tcg_gen_or_vec(vece, dst, lval, rval);
1069 }
1070 
1071 void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1072                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1073 {
1074     static const TCGOpcode vecop_list[] = {
1075         INDEX_op_neg_vec, INDEX_op_shlv_vec,
1076         INDEX_op_shrv_vec, INDEX_op_cmp_vec, 0
1077     };
1078     static const GVecGen3 ops[4] = {
1079         { .fniv = gen_ushl_vec,
1080           .fno = gen_helper_gvec_ushl_b,
1081           .opt_opc = vecop_list,
1082           .vece = MO_8 },
1083         { .fniv = gen_ushl_vec,
1084           .fno = gen_helper_gvec_ushl_h,
1085           .opt_opc = vecop_list,
1086           .vece = MO_16 },
1087         { .fni4 = gen_ushl_i32,
1088           .fniv = gen_ushl_vec,
1089           .opt_opc = vecop_list,
1090           .vece = MO_32 },
1091         { .fni8 = gen_ushl_i64,
1092           .fniv = gen_ushl_vec,
1093           .opt_opc = vecop_list,
1094           .vece = MO_64 },
1095     };
1096     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1097 }
1098 
1099 void gen_sshl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
1100 {
1101     TCGv_i32 lval = tcg_temp_new_i32();
1102     TCGv_i32 rval = tcg_temp_new_i32();
1103     TCGv_i32 lsh = tcg_temp_new_i32();
1104     TCGv_i32 rsh = tcg_temp_new_i32();
1105     TCGv_i32 zero = tcg_constant_i32(0);
1106     TCGv_i32 max = tcg_constant_i32(31);
1107 
1108     /*
1109      * Rely on the TCG guarantee that out of range shifts produce
1110      * unspecified results, not undefined behaviour (i.e. no trap).
1111      * Discard out-of-range results after the fact.
1112      */
1113     tcg_gen_ext8s_i32(lsh, shift);
1114     tcg_gen_neg_i32(rsh, lsh);
1115     tcg_gen_shl_i32(lval, src, lsh);
1116     tcg_gen_umin_i32(rsh, rsh, max);
1117     tcg_gen_sar_i32(rval, src, rsh);
1118     tcg_gen_movcond_i32(TCG_COND_LEU, lval, lsh, max, lval, zero);
1119     tcg_gen_movcond_i32(TCG_COND_LT, dst, lsh, zero, rval, lval);
1120 }
1121 
1122 void gen_sshl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
1123 {
1124     TCGv_i64 lval = tcg_temp_new_i64();
1125     TCGv_i64 rval = tcg_temp_new_i64();
1126     TCGv_i64 lsh = tcg_temp_new_i64();
1127     TCGv_i64 rsh = tcg_temp_new_i64();
1128     TCGv_i64 zero = tcg_constant_i64(0);
1129     TCGv_i64 max = tcg_constant_i64(63);
1130 
1131     /*
1132      * Rely on the TCG guarantee that out of range shifts produce
1133      * unspecified results, not undefined behaviour (i.e. no trap).
1134      * Discard out-of-range results after the fact.
1135      */
1136     tcg_gen_ext8s_i64(lsh, shift);
1137     tcg_gen_neg_i64(rsh, lsh);
1138     tcg_gen_shl_i64(lval, src, lsh);
1139     tcg_gen_umin_i64(rsh, rsh, max);
1140     tcg_gen_sar_i64(rval, src, rsh);
1141     tcg_gen_movcond_i64(TCG_COND_LEU, lval, lsh, max, lval, zero);
1142     tcg_gen_movcond_i64(TCG_COND_LT, dst, lsh, zero, rval, lval);
1143 }
1144 
1145 static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
1146                          TCGv_vec src, TCGv_vec shift)
1147 {
1148     TCGv_vec lval = tcg_temp_new_vec_matching(dst);
1149     TCGv_vec rval = tcg_temp_new_vec_matching(dst);
1150     TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
1151     TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
1152     TCGv_vec tmp = tcg_temp_new_vec_matching(dst);
1153 
1154     /*
1155      * Rely on the TCG guarantee that out of range shifts produce
1156      * unspecified results, not undefined behaviour (i.e. no trap).
1157      * Discard out-of-range results after the fact.
1158      */
1159     tcg_gen_neg_vec(vece, rsh, shift);
1160     if (vece == MO_8) {
1161         tcg_gen_mov_vec(lsh, shift);
1162     } else {
1163         tcg_gen_dupi_vec(vece, tmp, 0xff);
1164         tcg_gen_and_vec(vece, lsh, shift, tmp);
1165         tcg_gen_and_vec(vece, rsh, rsh, tmp);
1166     }
1167 
1168     /* Bound rsh so out of bound right shift gets -1.  */
1169     tcg_gen_dupi_vec(vece, tmp, (8 << vece) - 1);
1170     tcg_gen_umin_vec(vece, rsh, rsh, tmp);
1171     tcg_gen_cmp_vec(TCG_COND_GT, vece, tmp, lsh, tmp);
1172 
1173     tcg_gen_shlv_vec(vece, lval, src, lsh);
1174     tcg_gen_sarv_vec(vece, rval, src, rsh);
1175 
1176     /* Select in-bound left shift.  */
1177     tcg_gen_andc_vec(vece, lval, lval, tmp);
1178 
1179     /* Select between left and right shift.  */
1180     if (vece == MO_8) {
1181         tcg_gen_dupi_vec(vece, tmp, 0);
1182         tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, rval, lval);
1183     } else {
1184         tcg_gen_dupi_vec(vece, tmp, 0x80);
1185         tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, lval, rval);
1186     }
1187 }
1188 
1189 void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1190                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1191 {
1192     static const TCGOpcode vecop_list[] = {
1193         INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec,
1194         INDEX_op_sarv_vec, INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0
1195     };
1196     static const GVecGen3 ops[4] = {
1197         { .fniv = gen_sshl_vec,
1198           .fno = gen_helper_gvec_sshl_b,
1199           .opt_opc = vecop_list,
1200           .vece = MO_8 },
1201         { .fniv = gen_sshl_vec,
1202           .fno = gen_helper_gvec_sshl_h,
1203           .opt_opc = vecop_list,
1204           .vece = MO_16 },
1205         { .fni4 = gen_sshl_i32,
1206           .fniv = gen_sshl_vec,
1207           .opt_opc = vecop_list,
1208           .vece = MO_32 },
1209         { .fni8 = gen_sshl_i64,
1210           .fniv = gen_sshl_vec,
1211           .opt_opc = vecop_list,
1212           .vece = MO_64 },
1213     };
1214     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1215 }
1216 
1217 void gen_gvec_srshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1218                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1219 {
1220     static gen_helper_gvec_3 * const fns[] = {
1221         gen_helper_gvec_srshl_b, gen_helper_gvec_srshl_h,
1222         gen_helper_gvec_srshl_s, gen_helper_gvec_srshl_d,
1223     };
1224     tcg_debug_assert(vece <= MO_64);
1225     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1226 }
1227 
1228 void gen_gvec_urshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1229                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1230 {
1231     static gen_helper_gvec_3 * const fns[] = {
1232         gen_helper_gvec_urshl_b, gen_helper_gvec_urshl_h,
1233         gen_helper_gvec_urshl_s, gen_helper_gvec_urshl_d,
1234     };
1235     tcg_debug_assert(vece <= MO_64);
1236     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1237 }
1238 
1239 void gen_neon_sqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1240                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1241 {
1242     static gen_helper_gvec_3_ptr * const fns[] = {
1243         gen_helper_neon_sqshl_b, gen_helper_neon_sqshl_h,
1244         gen_helper_neon_sqshl_s, gen_helper_neon_sqshl_d,
1245     };
1246     tcg_debug_assert(vece <= MO_64);
1247     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1248                        opr_sz, max_sz, 0, fns[vece]);
1249 }
1250 
1251 void gen_neon_uqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1252                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1253 {
1254     static gen_helper_gvec_3_ptr * const fns[] = {
1255         gen_helper_neon_uqshl_b, gen_helper_neon_uqshl_h,
1256         gen_helper_neon_uqshl_s, gen_helper_neon_uqshl_d,
1257     };
1258     tcg_debug_assert(vece <= MO_64);
1259     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1260                        opr_sz, max_sz, 0, fns[vece]);
1261 }
1262 
1263 void gen_neon_sqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1264                      uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1265 {
1266     static gen_helper_gvec_3_ptr * const fns[] = {
1267         gen_helper_neon_sqrshl_b, gen_helper_neon_sqrshl_h,
1268         gen_helper_neon_sqrshl_s, gen_helper_neon_sqrshl_d,
1269     };
1270     tcg_debug_assert(vece <= MO_64);
1271     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1272                        opr_sz, max_sz, 0, fns[vece]);
1273 }
1274 
1275 void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1276                      uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1277 {
1278     static gen_helper_gvec_3_ptr * const fns[] = {
1279         gen_helper_neon_uqrshl_b, gen_helper_neon_uqrshl_h,
1280         gen_helper_neon_uqrshl_s, gen_helper_neon_uqrshl_d,
1281     };
1282     tcg_debug_assert(vece <= MO_64);
1283     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1284                        opr_sz, max_sz, 0, fns[vece]);
1285 }
1286 
1287 void gen_uqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1288 {
1289     uint64_t max = MAKE_64BIT_MASK(0, 8 << esz);
1290     TCGv_i64 tmp = tcg_temp_new_i64();
1291 
1292     tcg_gen_add_i64(tmp, a, b);
1293     tcg_gen_umin_i64(res, tmp, tcg_constant_i64(max));
1294     tcg_gen_xor_i64(tmp, tmp, res);
1295     tcg_gen_or_i64(qc, qc, tmp);
1296 }
1297 
1298 void gen_uqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1299 {
1300     TCGv_i64 t = tcg_temp_new_i64();
1301 
1302     tcg_gen_add_i64(t, a, b);
1303     tcg_gen_movcond_i64(TCG_COND_LTU, res, t, a,
1304                         tcg_constant_i64(UINT64_MAX), t);
1305     tcg_gen_xor_i64(t, t, res);
1306     tcg_gen_or_i64(qc, qc, t);
1307 }
1308 
1309 static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1310                           TCGv_vec a, TCGv_vec b)
1311 {
1312     TCGv_vec x = tcg_temp_new_vec_matching(t);
1313     tcg_gen_add_vec(vece, x, a, b);
1314     tcg_gen_usadd_vec(vece, t, a, b);
1315     tcg_gen_xor_vec(vece, x, x, t);
1316     tcg_gen_or_vec(vece, qc, qc, x);
1317 }
1318 
1319 void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1320                        uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1321 {
1322     static const TCGOpcode vecop_list[] = {
1323         INDEX_op_usadd_vec, INDEX_op_add_vec, 0
1324     };
1325     static const GVecGen4 ops[4] = {
1326         { .fniv = gen_uqadd_vec,
1327           .fno = gen_helper_gvec_uqadd_b,
1328           .write_aofs = true,
1329           .opt_opc = vecop_list,
1330           .vece = MO_8 },
1331         { .fniv = gen_uqadd_vec,
1332           .fno = gen_helper_gvec_uqadd_h,
1333           .write_aofs = true,
1334           .opt_opc = vecop_list,
1335           .vece = MO_16 },
1336         { .fniv = gen_uqadd_vec,
1337           .fno = gen_helper_gvec_uqadd_s,
1338           .write_aofs = true,
1339           .opt_opc = vecop_list,
1340           .vece = MO_32 },
1341         { .fniv = gen_uqadd_vec,
1342           .fni8 = gen_uqadd_d,
1343           .fno = gen_helper_gvec_uqadd_d,
1344           .write_aofs = true,
1345           .opt_opc = vecop_list,
1346           .vece = MO_64 },
1347     };
1348 
1349     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1350     tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1351                    rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1352 }
1353 
1354 void gen_sqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1355 {
1356     int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
1357     int64_t min = -1ll - max;
1358     TCGv_i64 tmp = tcg_temp_new_i64();
1359 
1360     tcg_gen_add_i64(tmp, a, b);
1361     tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
1362     tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
1363     tcg_gen_xor_i64(tmp, tmp, res);
1364     tcg_gen_or_i64(qc, qc, tmp);
1365 }
1366 
1367 void gen_sqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1368 {
1369     TCGv_i64 t0 = tcg_temp_new_i64();
1370     TCGv_i64 t1 = tcg_temp_new_i64();
1371     TCGv_i64 t2 = tcg_temp_new_i64();
1372 
1373     tcg_gen_add_i64(t0, a, b);
1374 
1375     /* Compute signed overflow indication into T1 */
1376     tcg_gen_xor_i64(t1, a, b);
1377     tcg_gen_xor_i64(t2, t0, a);
1378     tcg_gen_andc_i64(t1, t2, t1);
1379 
1380     /* Compute saturated value into T2 */
1381     tcg_gen_sari_i64(t2, a, 63);
1382     tcg_gen_xori_i64(t2, t2, INT64_MAX);
1383 
1384     tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
1385     tcg_gen_xor_i64(t0, t0, res);
1386     tcg_gen_or_i64(qc, qc, t0);
1387 }
1388 
1389 static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1390                           TCGv_vec a, TCGv_vec b)
1391 {
1392     TCGv_vec x = tcg_temp_new_vec_matching(t);
1393     tcg_gen_add_vec(vece, x, a, b);
1394     tcg_gen_ssadd_vec(vece, t, a, b);
1395     tcg_gen_xor_vec(vece, x, x, t);
1396     tcg_gen_or_vec(vece, qc, qc, x);
1397 }
1398 
1399 void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1400                        uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1401 {
1402     static const TCGOpcode vecop_list[] = {
1403         INDEX_op_ssadd_vec, INDEX_op_add_vec, 0
1404     };
1405     static const GVecGen4 ops[4] = {
1406         { .fniv = gen_sqadd_vec,
1407           .fno = gen_helper_gvec_sqadd_b,
1408           .opt_opc = vecop_list,
1409           .write_aofs = true,
1410           .vece = MO_8 },
1411         { .fniv = gen_sqadd_vec,
1412           .fno = gen_helper_gvec_sqadd_h,
1413           .opt_opc = vecop_list,
1414           .write_aofs = true,
1415           .vece = MO_16 },
1416         { .fniv = gen_sqadd_vec,
1417           .fno = gen_helper_gvec_sqadd_s,
1418           .opt_opc = vecop_list,
1419           .write_aofs = true,
1420           .vece = MO_32 },
1421         { .fniv = gen_sqadd_vec,
1422           .fni8 = gen_sqadd_d,
1423           .fno = gen_helper_gvec_sqadd_d,
1424           .opt_opc = vecop_list,
1425           .write_aofs = true,
1426           .vece = MO_64 },
1427     };
1428 
1429     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1430     tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1431                    rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1432 }
1433 
1434 void gen_uqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1435 {
1436     TCGv_i64 tmp = tcg_temp_new_i64();
1437 
1438     tcg_gen_sub_i64(tmp, a, b);
1439     tcg_gen_smax_i64(res, tmp, tcg_constant_i64(0));
1440     tcg_gen_xor_i64(tmp, tmp, res);
1441     tcg_gen_or_i64(qc, qc, tmp);
1442 }
1443 
1444 void gen_uqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1445 {
1446     TCGv_i64 t = tcg_temp_new_i64();
1447 
1448     tcg_gen_sub_i64(t, a, b);
1449     tcg_gen_movcond_i64(TCG_COND_LTU, res, a, b, tcg_constant_i64(0), t);
1450     tcg_gen_xor_i64(t, t, res);
1451     tcg_gen_or_i64(qc, qc, t);
1452 }
1453 
1454 static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1455                           TCGv_vec a, TCGv_vec b)
1456 {
1457     TCGv_vec x = tcg_temp_new_vec_matching(t);
1458     tcg_gen_sub_vec(vece, x, a, b);
1459     tcg_gen_ussub_vec(vece, t, a, b);
1460     tcg_gen_xor_vec(vece, x, x, t);
1461     tcg_gen_or_vec(vece, qc, qc, x);
1462 }
1463 
1464 void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1465                        uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1466 {
1467     static const TCGOpcode vecop_list[] = {
1468         INDEX_op_ussub_vec, INDEX_op_sub_vec, 0
1469     };
1470     static const GVecGen4 ops[4] = {
1471         { .fniv = gen_uqsub_vec,
1472           .fno = gen_helper_gvec_uqsub_b,
1473           .opt_opc = vecop_list,
1474           .write_aofs = true,
1475           .vece = MO_8 },
1476         { .fniv = gen_uqsub_vec,
1477           .fno = gen_helper_gvec_uqsub_h,
1478           .opt_opc = vecop_list,
1479           .write_aofs = true,
1480           .vece = MO_16 },
1481         { .fniv = gen_uqsub_vec,
1482           .fno = gen_helper_gvec_uqsub_s,
1483           .opt_opc = vecop_list,
1484           .write_aofs = true,
1485           .vece = MO_32 },
1486         { .fniv = gen_uqsub_vec,
1487           .fni8 = gen_uqsub_d,
1488           .fno = gen_helper_gvec_uqsub_d,
1489           .opt_opc = vecop_list,
1490           .write_aofs = true,
1491           .vece = MO_64 },
1492     };
1493 
1494     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1495     tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1496                    rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1497 }
1498 
1499 void gen_sqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1500 {
1501     int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
1502     int64_t min = -1ll - max;
1503     TCGv_i64 tmp = tcg_temp_new_i64();
1504 
1505     tcg_gen_sub_i64(tmp, a, b);
1506     tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
1507     tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
1508     tcg_gen_xor_i64(tmp, tmp, res);
1509     tcg_gen_or_i64(qc, qc, tmp);
1510 }
1511 
1512 void gen_sqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1513 {
1514     TCGv_i64 t0 = tcg_temp_new_i64();
1515     TCGv_i64 t1 = tcg_temp_new_i64();
1516     TCGv_i64 t2 = tcg_temp_new_i64();
1517 
1518     tcg_gen_sub_i64(t0, a, b);
1519 
1520     /* Compute signed overflow indication into T1 */
1521     tcg_gen_xor_i64(t1, a, b);
1522     tcg_gen_xor_i64(t2, t0, a);
1523     tcg_gen_and_i64(t1, t1, t2);
1524 
1525     /* Compute saturated value into T2 */
1526     tcg_gen_sari_i64(t2, a, 63);
1527     tcg_gen_xori_i64(t2, t2, INT64_MAX);
1528 
1529     tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
1530     tcg_gen_xor_i64(t0, t0, res);
1531     tcg_gen_or_i64(qc, qc, t0);
1532 }
1533 
1534 static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1535                           TCGv_vec a, TCGv_vec b)
1536 {
1537     TCGv_vec x = tcg_temp_new_vec_matching(t);
1538     tcg_gen_sub_vec(vece, x, a, b);
1539     tcg_gen_sssub_vec(vece, t, a, b);
1540     tcg_gen_xor_vec(vece, x, x, t);
1541     tcg_gen_or_vec(vece, qc, qc, x);
1542 }
1543 
1544 void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1545                        uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1546 {
1547     static const TCGOpcode vecop_list[] = {
1548         INDEX_op_sssub_vec, INDEX_op_sub_vec, 0
1549     };
1550     static const GVecGen4 ops[4] = {
1551         { .fniv = gen_sqsub_vec,
1552           .fno = gen_helper_gvec_sqsub_b,
1553           .opt_opc = vecop_list,
1554           .write_aofs = true,
1555           .vece = MO_8 },
1556         { .fniv = gen_sqsub_vec,
1557           .fno = gen_helper_gvec_sqsub_h,
1558           .opt_opc = vecop_list,
1559           .write_aofs = true,
1560           .vece = MO_16 },
1561         { .fniv = gen_sqsub_vec,
1562           .fno = gen_helper_gvec_sqsub_s,
1563           .opt_opc = vecop_list,
1564           .write_aofs = true,
1565           .vece = MO_32 },
1566         { .fniv = gen_sqsub_vec,
1567           .fni8 = gen_sqsub_d,
1568           .fno = gen_helper_gvec_sqsub_d,
1569           .opt_opc = vecop_list,
1570           .write_aofs = true,
1571           .vece = MO_64 },
1572     };
1573 
1574     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1575     tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1576                    rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1577 }
1578 
1579 static void gen_sabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1580 {
1581     TCGv_i32 t = tcg_temp_new_i32();
1582 
1583     tcg_gen_sub_i32(t, a, b);
1584     tcg_gen_sub_i32(d, b, a);
1585     tcg_gen_movcond_i32(TCG_COND_LT, d, a, b, d, t);
1586 }
1587 
1588 static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1589 {
1590     TCGv_i64 t = tcg_temp_new_i64();
1591 
1592     tcg_gen_sub_i64(t, a, b);
1593     tcg_gen_sub_i64(d, b, a);
1594     tcg_gen_movcond_i64(TCG_COND_LT, d, a, b, d, t);
1595 }
1596 
1597 static void gen_sabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1598 {
1599     TCGv_vec t = tcg_temp_new_vec_matching(d);
1600 
1601     tcg_gen_smin_vec(vece, t, a, b);
1602     tcg_gen_smax_vec(vece, d, a, b);
1603     tcg_gen_sub_vec(vece, d, d, t);
1604 }
1605 
1606 void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1607                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1608 {
1609     static const TCGOpcode vecop_list[] = {
1610         INDEX_op_sub_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0
1611     };
1612     static const GVecGen3 ops[4] = {
1613         { .fniv = gen_sabd_vec,
1614           .fno = gen_helper_gvec_sabd_b,
1615           .opt_opc = vecop_list,
1616           .vece = MO_8 },
1617         { .fniv = gen_sabd_vec,
1618           .fno = gen_helper_gvec_sabd_h,
1619           .opt_opc = vecop_list,
1620           .vece = MO_16 },
1621         { .fni4 = gen_sabd_i32,
1622           .fniv = gen_sabd_vec,
1623           .fno = gen_helper_gvec_sabd_s,
1624           .opt_opc = vecop_list,
1625           .vece = MO_32 },
1626         { .fni8 = gen_sabd_i64,
1627           .fniv = gen_sabd_vec,
1628           .fno = gen_helper_gvec_sabd_d,
1629           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1630           .opt_opc = vecop_list,
1631           .vece = MO_64 },
1632     };
1633     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1634 }
1635 
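/*
 * UABD: unsigned absolute difference, d = |a - b| per element,
 * using an unsigned compare for the scalar forms and
 * umax(a, b) - umin(a, b) for the vector form.
 */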
1636 static void gen_uabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1637 {
1638     TCGv_i32 t = tcg_temp_new_i32();
1639 
1640     tcg_gen_sub_i32(t, a, b);
1641     tcg_gen_sub_i32(d, b, a);
1642     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, d, t);
1643 }
1644 
1645 static void gen_uabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1646 {
1647     TCGv_i64 t = tcg_temp_new_i64();
1648 
1649     tcg_gen_sub_i64(t, a, b);
1650     tcg_gen_sub_i64(d, b, a);
1651     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, d, t);
1652 }
1653 
1654 static void gen_uabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1655 {
1656     TCGv_vec t = tcg_temp_new_vec_matching(d);
1657 
1658     tcg_gen_umin_vec(vece, t, a, b);
1659     tcg_gen_umax_vec(vece, d, a, b);
1660     tcg_gen_sub_vec(vece, d, d, t);
1661 }
1662 
1663 void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1664                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1665 {
1666     static const TCGOpcode vecop_list[] = {
1667         INDEX_op_sub_vec, INDEX_op_umin_vec, INDEX_op_umax_vec, 0
1668     };
1669     static const GVecGen3 ops[4] = {
1670         { .fniv = gen_uabd_vec,
1671           .fno = gen_helper_gvec_uabd_b,
1672           .opt_opc = vecop_list,
1673           .vece = MO_8 },
1674         { .fniv = gen_uabd_vec,
1675           .fno = gen_helper_gvec_uabd_h,
1676           .opt_opc = vecop_list,
1677           .vece = MO_16 },
1678         { .fni4 = gen_uabd_i32,
1679           .fniv = gen_uabd_vec,
1680           .fno = gen_helper_gvec_uabd_s,
1681           .opt_opc = vecop_list,
1682           .vece = MO_32 },
1683         { .fni8 = gen_uabd_i64,
1684           .fniv = gen_uabd_vec,
1685           .fno = gen_helper_gvec_uabd_d,
1686           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1687           .opt_opc = vecop_list,
1688           .vece = MO_64 },
1689     };
1690     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1691 }
1692 
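/*
 * SABA: signed absolute difference and accumulate, d += |a - b|,
 * built on the SABD helpers above.
 */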
1693 static void gen_saba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1694 {
1695     TCGv_i32 t = tcg_temp_new_i32();
1696     gen_sabd_i32(t, a, b);
1697     tcg_gen_add_i32(d, d, t);
1698 }
1699 
1700 static void gen_saba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1701 {
1702     TCGv_i64 t = tcg_temp_new_i64();
1703     gen_sabd_i64(t, a, b);
1704     tcg_gen_add_i64(d, d, t);
1705 }
1706 
1707 static void gen_saba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1708 {
1709     TCGv_vec t = tcg_temp_new_vec_matching(d);
1710     gen_sabd_vec(vece, t, a, b);
1711     tcg_gen_add_vec(vece, d, d, t);
1712 }
1713 
1714 void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1715                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1716 {
1717     static const TCGOpcode vecop_list[] = {
1718         INDEX_op_sub_vec, INDEX_op_add_vec,
1719         INDEX_op_smin_vec, INDEX_op_smax_vec, 0
1720     };
1721     static const GVecGen3 ops[4] = {
1722         { .fniv = gen_saba_vec,
1723           .fno = gen_helper_gvec_saba_b,
1724           .opt_opc = vecop_list,
1725           .load_dest = true,
1726           .vece = MO_8 },
1727         { .fniv = gen_saba_vec,
1728           .fno = gen_helper_gvec_saba_h,
1729           .opt_opc = vecop_list,
1730           .load_dest = true,
1731           .vece = MO_16 },
1732         { .fni4 = gen_saba_i32,
1733           .fniv = gen_saba_vec,
1734           .fno = gen_helper_gvec_saba_s,
1735           .opt_opc = vecop_list,
1736           .load_dest = true,
1737           .vece = MO_32 },
1738         { .fni8 = gen_saba_i64,
1739           .fniv = gen_saba_vec,
1740           .fno = gen_helper_gvec_saba_d,
1741           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1742           .opt_opc = vecop_list,
1743           .load_dest = true,
1744           .vece = MO_64 },
1745     };
1746     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1747 }
1748 
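/*
 * UABA: unsigned absolute difference and accumulate, d += |a - b|,
 * built on the UABD helpers above.
 */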
1749 static void gen_uaba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1750 {
1751     TCGv_i32 t = tcg_temp_new_i32();
1752     gen_uabd_i32(t, a, b);
1753     tcg_gen_add_i32(d, d, t);
1754 }
1755 
1756 static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1757 {
1758     TCGv_i64 t = tcg_temp_new_i64();
1759     gen_uabd_i64(t, a, b);
1760     tcg_gen_add_i64(d, d, t);
1761 }
1762 
1763 static void gen_uaba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1764 {
1765     TCGv_vec t = tcg_temp_new_vec_matching(d);
1766     gen_uabd_vec(vece, t, a, b);
1767     tcg_gen_add_vec(vece, d, d, t);
1768 }
1769 
1770 void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1771                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1772 {
1773     static const TCGOpcode vecop_list[] = {
1774         INDEX_op_sub_vec, INDEX_op_add_vec,
1775         INDEX_op_umin_vec, INDEX_op_umax_vec, 0
1776     };
1777     static const GVecGen3 ops[4] = {
1778         { .fniv = gen_uaba_vec,
1779           .fno = gen_helper_gvec_uaba_b,
1780           .opt_opc = vecop_list,
1781           .load_dest = true,
1782           .vece = MO_8 },
1783         { .fniv = gen_uaba_vec,
1784           .fno = gen_helper_gvec_uaba_h,
1785           .opt_opc = vecop_list,
1786           .load_dest = true,
1787           .vece = MO_16 },
1788         { .fni4 = gen_uaba_i32,
1789           .fniv = gen_uaba_vec,
1790           .fno = gen_helper_gvec_uaba_s,
1791           .opt_opc = vecop_list,
1792           .load_dest = true,
1793           .vece = MO_32 },
1794         { .fni8 = gen_uaba_i64,
1795           .fniv = gen_uaba_vec,
1796           .fno = gen_helper_gvec_uaba_d,
1797           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1798           .opt_opc = vecop_list,
1799           .load_dest = true,
1800           .vece = MO_64 },
1801     };
1802     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1803 }
1804 
1805 void gen_gvec_addp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
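/*
 * Pairwise operations (ADDP, SMAXP, SMINP, UMAXP, UMINP) are expanded
 * purely out of line via the per-element-size helper functions.
 */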
1806                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1807 {
1808     static gen_helper_gvec_3 * const fns[4] = {
1809         gen_helper_gvec_addp_b,
1810         gen_helper_gvec_addp_h,
1811         gen_helper_gvec_addp_s,
1812         gen_helper_gvec_addp_d,
1813     };
1814     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1815 }
1816 
1817 void gen_gvec_smaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1818                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1819 {
1820     static gen_helper_gvec_3 * const fns[4] = {
1821         gen_helper_gvec_smaxp_b,
1822         gen_helper_gvec_smaxp_h,
1823         gen_helper_gvec_smaxp_s,
1824     };
1825     tcg_debug_assert(vece <= MO_32);
1826     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1827 }
1828 
1829 void gen_gvec_sminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1830                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1831 {
1832     static gen_helper_gvec_3 * const fns[4] = {
1833         gen_helper_gvec_sminp_b,
1834         gen_helper_gvec_sminp_h,
1835         gen_helper_gvec_sminp_s,
1836     };
1837     tcg_debug_assert(vece <= MO_32);
1838     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1839 }
1840 
1841 void gen_gvec_umaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1842                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1843 {
1844     static gen_helper_gvec_3 * const fns[4] = {
1845         gen_helper_gvec_umaxp_b,
1846         gen_helper_gvec_umaxp_h,
1847         gen_helper_gvec_umaxp_s,
1848     };
1849     tcg_debug_assert(vece <= MO_32);
1850     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1851 }
1852 
1853 void gen_gvec_uminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1854                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1855 {
1856     static gen_helper_gvec_3 * const fns[4] = {
1857         gen_helper_gvec_uminp_b,
1858         gen_helper_gvec_uminp_h,
1859         gen_helper_gvec_uminp_s,
1860     };
1861     tcg_debug_assert(vece <= MO_32);
1862     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1863 }
1864 
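/*
 * SHADD: signed halving add, d = (a + b) >> 1 per element, computed
 * without intermediate overflow as (a >> 1) + (b >> 1) + (a & b & 1).
 */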
1865 static void gen_shadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1866 {
1867     TCGv_i64 t = tcg_temp_new_i64();
1868 
1869     tcg_gen_and_i64(t, a, b);
1870     tcg_gen_vec_sar8i_i64(a, a, 1);
1871     tcg_gen_vec_sar8i_i64(b, b, 1);
1872     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
1873     tcg_gen_vec_add8_i64(d, a, b);
1874     tcg_gen_vec_add8_i64(d, d, t);
1875 }
1876 
1877 static void gen_shadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1878 {
1879     TCGv_i64 t = tcg_temp_new_i64();
1880 
1881     tcg_gen_and_i64(t, a, b);
1882     tcg_gen_vec_sar16i_i64(a, a, 1);
1883     tcg_gen_vec_sar16i_i64(b, b, 1);
1884     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
1885     tcg_gen_vec_add16_i64(d, a, b);
1886     tcg_gen_vec_add16_i64(d, d, t);
1887 }
1888 
1889 static void gen_shadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1890 {
1891     TCGv_i32 t = tcg_temp_new_i32();
1892 
1893     tcg_gen_and_i32(t, a, b);
1894     tcg_gen_sari_i32(a, a, 1);
1895     tcg_gen_sari_i32(b, b, 1);
1896     tcg_gen_andi_i32(t, t, 1);
1897     tcg_gen_add_i32(d, a, b);
1898     tcg_gen_add_i32(d, d, t);
1899 }
1900 
1901 static void gen_shadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1902 {
1903     TCGv_vec t = tcg_temp_new_vec_matching(d);
1904 
1905     tcg_gen_and_vec(vece, t, a, b);
1906     tcg_gen_sari_vec(vece, a, a, 1);
1907     tcg_gen_sari_vec(vece, b, b, 1);
1908     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
1909     tcg_gen_add_vec(vece, d, a, b);
1910     tcg_gen_add_vec(vece, d, d, t);
1911 }
1912 
1913 void gen_gvec_shadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1914                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1915 {
1916     static const TCGOpcode vecop_list[] = {
1917         INDEX_op_sari_vec, INDEX_op_add_vec, 0
1918     };
1919     static const GVecGen3 g[] = {
1920         { .fni8 = gen_shadd8_i64,
1921           .fniv = gen_shadd_vec,
1922           .opt_opc = vecop_list,
1923           .vece = MO_8 },
1924         { .fni8 = gen_shadd16_i64,
1925           .fniv = gen_shadd_vec,
1926           .opt_opc = vecop_list,
1927           .vece = MO_16 },
1928         { .fni4 = gen_shadd_i32,
1929           .fniv = gen_shadd_vec,
1930           .opt_opc = vecop_list,
1931           .vece = MO_32 },
1932     };
1933     tcg_debug_assert(vece <= MO_32);
1934     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
1935 }
1936 
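/*
 * UHADD: unsigned halving add, d = (a + b) >> 1 per element, using
 * logical shifts: (a >> 1) + (b >> 1) + (a & b & 1).
 */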
1937 static void gen_uhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1938 {
1939     TCGv_i64 t = tcg_temp_new_i64();
1940 
1941     tcg_gen_and_i64(t, a, b);
1942     tcg_gen_vec_shr8i_i64(a, a, 1);
1943     tcg_gen_vec_shr8i_i64(b, b, 1);
1944     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
1945     tcg_gen_vec_add8_i64(d, a, b);
1946     tcg_gen_vec_add8_i64(d, d, t);
1947 }
1948 
1949 static void gen_uhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1950 {
1951     TCGv_i64 t = tcg_temp_new_i64();
1952 
1953     tcg_gen_and_i64(t, a, b);
1954     tcg_gen_vec_shr16i_i64(a, a, 1);
1955     tcg_gen_vec_shr16i_i64(b, b, 1);
1956     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
1957     tcg_gen_vec_add16_i64(d, a, b);
1958     tcg_gen_vec_add16_i64(d, d, t);
1959 }
1960 
1961 static void gen_uhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1962 {
1963     TCGv_i32 t = tcg_temp_new_i32();
1964 
1965     tcg_gen_and_i32(t, a, b);
1966     tcg_gen_shri_i32(a, a, 1);
1967     tcg_gen_shri_i32(b, b, 1);
1968     tcg_gen_andi_i32(t, t, 1);
1969     tcg_gen_add_i32(d, a, b);
1970     tcg_gen_add_i32(d, d, t);
1971 }
1972 
1973 static void gen_uhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1974 {
1975     TCGv_vec t = tcg_temp_new_vec_matching(d);
1976 
1977     tcg_gen_and_vec(vece, t, a, b);
1978     tcg_gen_shri_vec(vece, a, a, 1);
1979     tcg_gen_shri_vec(vece, b, b, 1);
1980     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
1981     tcg_gen_add_vec(vece, d, a, b);
1982     tcg_gen_add_vec(vece, d, d, t);
1983 }
1984 
1985 void gen_gvec_uhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1986                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1987 {
1988     static const TCGOpcode vecop_list[] = {
1989         INDEX_op_shri_vec, INDEX_op_add_vec, 0
1990     };
1991     static const GVecGen3 g[] = {
1992         { .fni8 = gen_uhadd8_i64,
1993           .fniv = gen_uhadd_vec,
1994           .opt_opc = vecop_list,
1995           .vece = MO_8 },
1996         { .fni8 = gen_uhadd16_i64,
1997           .fniv = gen_uhadd_vec,
1998           .opt_opc = vecop_list,
1999           .vece = MO_16 },
2000         { .fni4 = gen_uhadd_i32,
2001           .fniv = gen_uhadd_vec,
2002           .opt_opc = vecop_list,
2003           .vece = MO_32 },
2004     };
2005     tcg_debug_assert(vece <= MO_32);
2006     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2007 }
2008 
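/*
 * SHSUB: signed halving subtract, d = (a - b) >> 1 per element,
 * computed as (a >> 1) - (b >> 1) - (~a & b & 1).
 */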
2009 static void gen_shsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2010 {
2011     TCGv_i64 t = tcg_temp_new_i64();
2012 
2013     tcg_gen_andc_i64(t, b, a);
2014     tcg_gen_vec_sar8i_i64(a, a, 1);
2015     tcg_gen_vec_sar8i_i64(b, b, 1);
2016     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
2017     tcg_gen_vec_sub8_i64(d, a, b);
2018     tcg_gen_vec_sub8_i64(d, d, t);
2019 }
2020 
2021 static void gen_shsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2022 {
2023     TCGv_i64 t = tcg_temp_new_i64();
2024 
2025     tcg_gen_andc_i64(t, b, a);
2026     tcg_gen_vec_sar16i_i64(a, a, 1);
2027     tcg_gen_vec_sar16i_i64(b, b, 1);
2028     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
2029     tcg_gen_vec_sub16_i64(d, a, b);
2030     tcg_gen_vec_sub16_i64(d, d, t);
2031 }
2032 
2033 static void gen_shsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2034 {
2035     TCGv_i32 t = tcg_temp_new_i32();
2036 
2037     tcg_gen_andc_i32(t, b, a);
2038     tcg_gen_sari_i32(a, a, 1);
2039     tcg_gen_sari_i32(b, b, 1);
2040     tcg_gen_andi_i32(t, t, 1);
2041     tcg_gen_sub_i32(d, a, b);
2042     tcg_gen_sub_i32(d, d, t);
2043 }
2044 
2045 static void gen_shsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
2046 {
2047     TCGv_vec t = tcg_temp_new_vec_matching(d);
2048 
2049     tcg_gen_andc_vec(vece, t, b, a);
2050     tcg_gen_sari_vec(vece, a, a, 1);
2051     tcg_gen_sari_vec(vece, b, b, 1);
2052     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
2053     tcg_gen_sub_vec(vece, d, a, b);
2054     tcg_gen_sub_vec(vece, d, d, t);
2055 }
2056 
2057 void gen_gvec_shsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
2058                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
2059 {
2060     static const TCGOpcode vecop_list[] = {
2061         INDEX_op_sari_vec, INDEX_op_sub_vec, 0
2062     };
2063     static const GVecGen3 g[4] = {
2064         { .fni8 = gen_shsub8_i64,
2065           .fniv = gen_shsub_vec,
2066           .opt_opc = vecop_list,
2067           .vece = MO_8 },
2068         { .fni8 = gen_shsub16_i64,
2069           .fniv = gen_shsub_vec,
2070           .opt_opc = vecop_list,
2071           .vece = MO_16 },
2072         { .fni4 = gen_shsub_i32,
2073           .fniv = gen_shsub_vec,
2074           .opt_opc = vecop_list,
2075           .vece = MO_32 },
2076     };
2077     assert(vece <= MO_32);
2078     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2079 }
2080 
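/*
 * UHSUB: unsigned halving subtract, d = (a - b) >> 1 per element,
 * using the same (a >> 1) - (b >> 1) - (~a & b & 1) decomposition
 * with logical shifts.
 */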
2081 static void gen_uhsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2082 {
2083     TCGv_i64 t = tcg_temp_new_i64();
2084 
2085     tcg_gen_andc_i64(t, b, a);
2086     tcg_gen_vec_shr8i_i64(a, a, 1);
2087     tcg_gen_vec_shr8i_i64(b, b, 1);
2088     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
2089     tcg_gen_vec_sub8_i64(d, a, b);
2090     tcg_gen_vec_sub8_i64(d, d, t);
2091 }
2092 
2093 static void gen_uhsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2094 {
2095     TCGv_i64 t = tcg_temp_new_i64();
2096 
2097     tcg_gen_andc_i64(t, b, a);
2098     tcg_gen_vec_shr16i_i64(a, a, 1);
2099     tcg_gen_vec_shr16i_i64(b, b, 1);
2100     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
2101     tcg_gen_vec_sub16_i64(d, a, b);
2102     tcg_gen_vec_sub16_i64(d, d, t);
2103 }
2104 
2105 static void gen_uhsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2106 {
2107     TCGv_i32 t = tcg_temp_new_i32();
2108 
2109     tcg_gen_andc_i32(t, b, a);
2110     tcg_gen_shri_i32(a, a, 1);
2111     tcg_gen_shri_i32(b, b, 1);
2112     tcg_gen_andi_i32(t, t, 1);
2113     tcg_gen_sub_i32(d, a, b);
2114     tcg_gen_sub_i32(d, d, t);
2115 }
2116 
2117 static void gen_uhsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
2118 {
2119     TCGv_vec t = tcg_temp_new_vec_matching(d);
2120 
2121     tcg_gen_andc_vec(vece, t, b, a);
2122     tcg_gen_shri_vec(vece, a, a, 1);
2123     tcg_gen_shri_vec(vece, b, b, 1);
2124     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
2125     tcg_gen_sub_vec(vece, d, a, b);
2126     tcg_gen_sub_vec(vece, d, d, t);
2127 }
2128 
2129 void gen_gvec_uhsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
2130                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
2131 {
2132     static const TCGOpcode vecop_list[] = {
2133         INDEX_op_shri_vec, INDEX_op_sub_vec, 0
2134     };
2135     static const GVecGen3 g[4] = {
2136         { .fni8 = gen_uhsub8_i64,
2137           .fniv = gen_uhsub_vec,
2138           .opt_opc = vecop_list,
2139           .vece = MO_8 },
2140         { .fni8 = gen_uhsub16_i64,
2141           .fniv = gen_uhsub_vec,
2142           .opt_opc = vecop_list,
2143           .vece = MO_16 },
2144         { .fni4 = gen_uhsub_i32,
2145           .fniv = gen_uhsub_vec,
2146           .opt_opc = vecop_list,
2147           .vece = MO_32 },
2148     };
2149     assert(vece <= MO_32);
2150     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2151 }
2152 
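/*
 * SRHADD: signed rounding halving add, d = (a + b + 1) >> 1 per
 * element, computed as (a >> 1) + (b >> 1) + ((a | b) & 1).
 */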
2153 static void gen_srhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2154 {
2155     TCGv_i64 t = tcg_temp_new_i64();
2156 
2157     tcg_gen_or_i64(t, a, b);
2158     tcg_gen_vec_sar8i_i64(a, a, 1);
2159     tcg_gen_vec_sar8i_i64(b, b, 1);
2160     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
2161     tcg_gen_vec_add8_i64(d, a, b);
2162     tcg_gen_vec_add8_i64(d, d, t);
2163 }
2164 
2165 static void gen_srhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2166 {
2167     TCGv_i64 t = tcg_temp_new_i64();
2168 
2169     tcg_gen_or_i64(t, a, b);
2170     tcg_gen_vec_sar16i_i64(a, a, 1);
2171     tcg_gen_vec_sar16i_i64(b, b, 1);
2172     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
2173     tcg_gen_vec_add16_i64(d, a, b);
2174     tcg_gen_vec_add16_i64(d, d, t);
2175 }
2176 
2177 static void gen_srhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2178 {
2179     TCGv_i32 t = tcg_temp_new_i32();
2180 
2181     tcg_gen_or_i32(t, a, b);
2182     tcg_gen_sari_i32(a, a, 1);
2183     tcg_gen_sari_i32(b, b, 1);
2184     tcg_gen_andi_i32(t, t, 1);
2185     tcg_gen_add_i32(d, a, b);
2186     tcg_gen_add_i32(d, d, t);
2187 }
2188 
2189 static void gen_srhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
2190 {
2191     TCGv_vec t = tcg_temp_new_vec_matching(d);
2192 
2193     tcg_gen_or_vec(vece, t, a, b);
2194     tcg_gen_sari_vec(vece, a, a, 1);
2195     tcg_gen_sari_vec(vece, b, b, 1);
2196     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
2197     tcg_gen_add_vec(vece, d, a, b);
2198     tcg_gen_add_vec(vece, d, d, t);
2199 }
2200 
2201 void gen_gvec_srhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
2202                      uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
2203 {
2204     static const TCGOpcode vecop_list[] = {
2205         INDEX_op_sari_vec, INDEX_op_add_vec, 0
2206     };
2207     static const GVecGen3 g[] = {
2208         { .fni8 = gen_srhadd8_i64,
2209           .fniv = gen_srhadd_vec,
2210           .opt_opc = vecop_list,
2211           .vece = MO_8 },
2212         { .fni8 = gen_srhadd16_i64,
2213           .fniv = gen_srhadd_vec,
2214           .opt_opc = vecop_list,
2215           .vece = MO_16 },
2216         { .fni4 = gen_srhadd_i32,
2217           .fniv = gen_srhadd_vec,
2218           .opt_opc = vecop_list,
2219           .vece = MO_32 },
2220     };
2221     assert(vece <= MO_32);
2222     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2223 }
2224 
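/*
 * URHADD: unsigned rounding halving add, d = (a + b + 1) >> 1 per
 * element, using logical shifts: (a >> 1) + (b >> 1) + ((a | b) & 1).
 */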
2225 static void gen_urhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2226 {
2227     TCGv_i64 t = tcg_temp_new_i64();
2228 
2229     tcg_gen_or_i64(t, a, b);
2230     tcg_gen_vec_shr8i_i64(a, a, 1);
2231     tcg_gen_vec_shr8i_i64(b, b, 1);
2232     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
2233     tcg_gen_vec_add8_i64(d, a, b);
2234     tcg_gen_vec_add8_i64(d, d, t);
2235 }
2236 
2237 static void gen_urhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2238 {
2239     TCGv_i64 t = tcg_temp_new_i64();
2240 
2241     tcg_gen_or_i64(t, a, b);
2242     tcg_gen_vec_shr16i_i64(a, a, 1);
2243     tcg_gen_vec_shr16i_i64(b, b, 1);
2244     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
2245     tcg_gen_vec_add16_i64(d, a, b);
2246     tcg_gen_vec_add16_i64(d, d, t);
2247 }
2248 
2249 static void gen_urhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2250 {
2251     TCGv_i32 t = tcg_temp_new_i32();
2252 
2253     tcg_gen_or_i32(t, a, b);
2254     tcg_gen_shri_i32(a, a, 1);
2255     tcg_gen_shri_i32(b, b, 1);
2256     tcg_gen_andi_i32(t, t, 1);
2257     tcg_gen_add_i32(d, a, b);
2258     tcg_gen_add_i32(d, d, t);
2259 }
2260 
2261 static void gen_urhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
2262 {
2263     TCGv_vec t = tcg_temp_new_vec_matching(d);
2264 
2265     tcg_gen_or_vec(vece, t, a, b);
2266     tcg_gen_shri_vec(vece, a, a, 1);
2267     tcg_gen_shri_vec(vece, b, b, 1);
2268     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
2269     tcg_gen_add_vec(vece, d, a, b);
2270     tcg_gen_add_vec(vece, d, d, t);
2271 }
2272 
2273 void gen_gvec_urhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
2274                      uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
2275 {
2276     static const TCGOpcode vecop_list[] = {
2277         INDEX_op_shri_vec, INDEX_op_add_vec, 0
2278     };
2279     static const GVecGen3 g[] = {
2280         { .fni8 = gen_urhadd8_i64,
2281           .fniv = gen_urhadd_vec,
2282           .opt_opc = vecop_list,
2283           .vece = MO_8 },
2284         { .fni8 = gen_urhadd16_i64,
2285           .fniv = gen_urhadd_vec,
2286           .opt_opc = vecop_list,
2287           .vece = MO_16 },
2288         { .fni4 = gen_urhadd_i32,
2289           .fniv = gen_urhadd_vec,
2290           .opt_opc = vecop_list,
2291           .vece = MO_32 },
2292     };
2293     assert(vece <= MO_32);
2294     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2295 }
2296