xref: /openbmc/qemu/target/arm/tcg/gengvec.c (revision 72baef13b9dce71f20ae840d9951e559e14abf6d)
1 /*
2  *  ARM generic vector expansion
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *  Copyright (c) 2005-2007 CodeSourcery
6  *  Copyright (c) 2007 OpenedHand, Ltd.
7  *
8  * This library is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * This library is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20  */
21 
22 #include "qemu/osdep.h"
23 #include "translate.h"
24 
25 
26 static void gen_gvec_fn3_qc(uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs,
27                             uint32_t opr_sz, uint32_t max_sz,
28                             gen_helper_gvec_3_ptr *fn)
29 {
30     TCGv_ptr qc_ptr = tcg_temp_new_ptr();
31 
32     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
33     tcg_gen_addi_ptr(qc_ptr, tcg_env, offsetof(CPUARMState, vfp.qc));
34     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, qc_ptr,
35                        opr_sz, max_sz, 0, fn);
36 }
37 
38 void gen_gvec_sqdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
39                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
40 {
41     static gen_helper_gvec_3_ptr * const fns[2] = {
42         gen_helper_neon_sqdmulh_h, gen_helper_neon_sqdmulh_s
43     };
44     tcg_debug_assert(vece >= 1 && vece <= 2);
45     gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
46 }
47 
48 void gen_gvec_sqrdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
49                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
50 {
51     static gen_helper_gvec_3_ptr * const fns[2] = {
52         gen_helper_neon_sqrdmulh_h, gen_helper_neon_sqrdmulh_s
53     };
54     tcg_debug_assert(vece >= 1 && vece <= 2);
55     gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
56 }
57 
58 void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
59                           uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
60 {
61     static gen_helper_gvec_3_ptr * const fns[2] = {
62         gen_helper_gvec_qrdmlah_s16, gen_helper_gvec_qrdmlah_s32
63     };
64     tcg_debug_assert(vece >= 1 && vece <= 2);
65     gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
66 }
67 
68 void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
69                           uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
70 {
71     static gen_helper_gvec_3_ptr * const fns[2] = {
72         gen_helper_gvec_qrdmlsh_s16, gen_helper_gvec_qrdmlsh_s32
73     };
74     tcg_debug_assert(vece >= 1 && vece <= 2);
75     gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
76 }
77 
78 #define GEN_CMP0(NAME, COND)                              \
79     void NAME(unsigned vece, uint32_t d, uint32_t m,      \
80               uint32_t opr_sz, uint32_t max_sz)           \
81     { tcg_gen_gvec_cmpi(COND, vece, d, m, 0, opr_sz, max_sz); }
82 
83 GEN_CMP0(gen_gvec_ceq0, TCG_COND_EQ)
84 GEN_CMP0(gen_gvec_cle0, TCG_COND_LE)
85 GEN_CMP0(gen_gvec_cge0, TCG_COND_GE)
86 GEN_CMP0(gen_gvec_clt0, TCG_COND_LT)
87 GEN_CMP0(gen_gvec_cgt0, TCG_COND_GT)
88 
89 #undef GEN_CMP0
90 
91 static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
92 {
93     tcg_gen_vec_sar8i_i64(a, a, shift);
94     tcg_gen_vec_add8_i64(d, d, a);
95 }
96 
97 static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
98 {
99     tcg_gen_vec_sar16i_i64(a, a, shift);
100     tcg_gen_vec_add16_i64(d, d, a);
101 }
102 
103 static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
104 {
105     tcg_gen_sari_i32(a, a, shift);
106     tcg_gen_add_i32(d, d, a);
107 }
108 
109 static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
110 {
111     tcg_gen_sari_i64(a, a, shift);
112     tcg_gen_add_i64(d, d, a);
113 }
114 
115 static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
116 {
117     tcg_gen_sari_vec(vece, a, a, sh);
118     tcg_gen_add_vec(vece, d, d, a);
119 }
120 
121 void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
122                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
123 {
124     static const TCGOpcode vecop_list[] = {
125         INDEX_op_sari_vec, INDEX_op_add_vec, 0
126     };
127     static const GVecGen2i ops[4] = {
128         { .fni8 = gen_ssra8_i64,
129           .fniv = gen_ssra_vec,
130           .fno = gen_helper_gvec_ssra_b,
131           .load_dest = true,
132           .opt_opc = vecop_list,
133           .vece = MO_8 },
134         { .fni8 = gen_ssra16_i64,
135           .fniv = gen_ssra_vec,
136           .fno = gen_helper_gvec_ssra_h,
137           .load_dest = true,
138           .opt_opc = vecop_list,
139           .vece = MO_16 },
140         { .fni4 = gen_ssra32_i32,
141           .fniv = gen_ssra_vec,
142           .fno = gen_helper_gvec_ssra_s,
143           .load_dest = true,
144           .opt_opc = vecop_list,
145           .vece = MO_32 },
146         { .fni8 = gen_ssra64_i64,
147           .fniv = gen_ssra_vec,
148           .fno = gen_helper_gvec_ssra_d,
149           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
150           .opt_opc = vecop_list,
151           .load_dest = true,
152           .vece = MO_64 },
153     };
154 
155     /* tszimm encoding produces immediates in the range [1..esize]. */
156     tcg_debug_assert(shift > 0);
157     tcg_debug_assert(shift <= (8 << vece));
158 
159     /*
160      * Shifts larger than the element size are architecturally valid.
161      * Signed results in all sign bits.
162      */
163     shift = MIN(shift, (8 << vece) - 1);
164     tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
165 }
166 
167 static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
168 {
169     tcg_gen_vec_shr8i_i64(a, a, shift);
170     tcg_gen_vec_add8_i64(d, d, a);
171 }
172 
173 static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
174 {
175     tcg_gen_vec_shr16i_i64(a, a, shift);
176     tcg_gen_vec_add16_i64(d, d, a);
177 }
178 
179 static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
180 {
181     tcg_gen_shri_i32(a, a, shift);
182     tcg_gen_add_i32(d, d, a);
183 }
184 
185 static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
186 {
187     tcg_gen_shri_i64(a, a, shift);
188     tcg_gen_add_i64(d, d, a);
189 }
190 
191 static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
192 {
193     tcg_gen_shri_vec(vece, a, a, sh);
194     tcg_gen_add_vec(vece, d, d, a);
195 }
196 
197 void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
198                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
199 {
200     static const TCGOpcode vecop_list[] = {
201         INDEX_op_shri_vec, INDEX_op_add_vec, 0
202     };
203     static const GVecGen2i ops[4] = {
204         { .fni8 = gen_usra8_i64,
205           .fniv = gen_usra_vec,
206           .fno = gen_helper_gvec_usra_b,
207           .load_dest = true,
208           .opt_opc = vecop_list,
209           .vece = MO_8, },
210         { .fni8 = gen_usra16_i64,
211           .fniv = gen_usra_vec,
212           .fno = gen_helper_gvec_usra_h,
213           .load_dest = true,
214           .opt_opc = vecop_list,
215           .vece = MO_16, },
216         { .fni4 = gen_usra32_i32,
217           .fniv = gen_usra_vec,
218           .fno = gen_helper_gvec_usra_s,
219           .load_dest = true,
220           .opt_opc = vecop_list,
221           .vece = MO_32, },
222         { .fni8 = gen_usra64_i64,
223           .fniv = gen_usra_vec,
224           .fno = gen_helper_gvec_usra_d,
225           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
226           .load_dest = true,
227           .opt_opc = vecop_list,
228           .vece = MO_64, },
229     };
230 
231     /* tszimm encoding produces immediates in the range [1..esize]. */
232     tcg_debug_assert(shift > 0);
233     tcg_debug_assert(shift <= (8 << vece));
234 
235     /*
236      * Shifts larger than the element size are architecturally valid.
237      * Unsigned results in all zeros as input to accumulate: nop.
238      */
239     if (shift < (8 << vece)) {
240         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
241     } else {
242         /* Nop, but we do need to clear the tail. */
243         tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
244     }
245 }
246 
247 /*
248  * Shift one less than the requested amount, and the low bit is
249  * the rounding bit.  For the 8 and 16-bit operations, because we
250  * mask the low bit, we can perform a normal integer shift instead
251  * of a vector shift.
252  */
253 static void gen_srshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
254 {
255     TCGv_i64 t = tcg_temp_new_i64();
256 
257     tcg_gen_shri_i64(t, a, sh - 1);
258     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
259     tcg_gen_vec_sar8i_i64(d, a, sh);
260     tcg_gen_vec_add8_i64(d, d, t);
261 }
262 
263 static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
264 {
265     TCGv_i64 t = tcg_temp_new_i64();
266 
267     tcg_gen_shri_i64(t, a, sh - 1);
268     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
269     tcg_gen_vec_sar16i_i64(d, a, sh);
270     tcg_gen_vec_add16_i64(d, d, t);
271 }
272 
273 void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
274 {
275     TCGv_i32 t;
276 
277     /* Handle shift by the input size for the benefit of trans_SRSHR_ri */
278     if (sh == 32) {
279         tcg_gen_movi_i32(d, 0);
280         return;
281     }
282     t = tcg_temp_new_i32();
283     tcg_gen_extract_i32(t, a, sh - 1, 1);
284     tcg_gen_sari_i32(d, a, sh);
285     tcg_gen_add_i32(d, d, t);
286 }
287 
288  void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
289 {
290     TCGv_i64 t = tcg_temp_new_i64();
291 
292     tcg_gen_extract_i64(t, a, sh - 1, 1);
293     tcg_gen_sari_i64(d, a, sh);
294     tcg_gen_add_i64(d, d, t);
295 }
296 
297 static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
298 {
299     TCGv_vec t = tcg_temp_new_vec_matching(d);
300     TCGv_vec ones = tcg_temp_new_vec_matching(d);
301 
302     tcg_gen_shri_vec(vece, t, a, sh - 1);
303     tcg_gen_dupi_vec(vece, ones, 1);
304     tcg_gen_and_vec(vece, t, t, ones);
305     tcg_gen_sari_vec(vece, d, a, sh);
306     tcg_gen_add_vec(vece, d, d, t);
307 }
308 
309 void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
310                     int64_t shift, uint32_t opr_sz, uint32_t max_sz)
311 {
312     static const TCGOpcode vecop_list[] = {
313         INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
314     };
315     static const GVecGen2i ops[4] = {
316         { .fni8 = gen_srshr8_i64,
317           .fniv = gen_srshr_vec,
318           .fno = gen_helper_gvec_srshr_b,
319           .opt_opc = vecop_list,
320           .vece = MO_8 },
321         { .fni8 = gen_srshr16_i64,
322           .fniv = gen_srshr_vec,
323           .fno = gen_helper_gvec_srshr_h,
324           .opt_opc = vecop_list,
325           .vece = MO_16 },
326         { .fni4 = gen_srshr32_i32,
327           .fniv = gen_srshr_vec,
328           .fno = gen_helper_gvec_srshr_s,
329           .opt_opc = vecop_list,
330           .vece = MO_32 },
331         { .fni8 = gen_srshr64_i64,
332           .fniv = gen_srshr_vec,
333           .fno = gen_helper_gvec_srshr_d,
334           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
335           .opt_opc = vecop_list,
336           .vece = MO_64 },
337     };
338 
339     /* tszimm encoding produces immediates in the range [1..esize] */
340     tcg_debug_assert(shift > 0);
341     tcg_debug_assert(shift <= (8 << vece));
342 
343     if (shift == (8 << vece)) {
344         /*
345          * Shifts larger than the element size are architecturally valid.
346          * Signed results in all sign bits.  With rounding, this produces
347          *   (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
348          * I.e. always zero.
349          */
350         tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
351     } else {
352         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
353     }
354 }
355 
356 static void gen_srsra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
357 {
358     TCGv_i64 t = tcg_temp_new_i64();
359 
360     gen_srshr8_i64(t, a, sh);
361     tcg_gen_vec_add8_i64(d, d, t);
362 }
363 
364 static void gen_srsra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
365 {
366     TCGv_i64 t = tcg_temp_new_i64();
367 
368     gen_srshr16_i64(t, a, sh);
369     tcg_gen_vec_add16_i64(d, d, t);
370 }
371 
372 static void gen_srsra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
373 {
374     TCGv_i32 t = tcg_temp_new_i32();
375 
376     gen_srshr32_i32(t, a, sh);
377     tcg_gen_add_i32(d, d, t);
378 }
379 
380 static void gen_srsra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
381 {
382     TCGv_i64 t = tcg_temp_new_i64();
383 
384     gen_srshr64_i64(t, a, sh);
385     tcg_gen_add_i64(d, d, t);
386 }
387 
388 static void gen_srsra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
389 {
390     TCGv_vec t = tcg_temp_new_vec_matching(d);
391 
392     gen_srshr_vec(vece, t, a, sh);
393     tcg_gen_add_vec(vece, d, d, t);
394 }
395 
396 void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
397                     int64_t shift, uint32_t opr_sz, uint32_t max_sz)
398 {
399     static const TCGOpcode vecop_list[] = {
400         INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
401     };
402     static const GVecGen2i ops[4] = {
403         { .fni8 = gen_srsra8_i64,
404           .fniv = gen_srsra_vec,
405           .fno = gen_helper_gvec_srsra_b,
406           .opt_opc = vecop_list,
407           .load_dest = true,
408           .vece = MO_8 },
409         { .fni8 = gen_srsra16_i64,
410           .fniv = gen_srsra_vec,
411           .fno = gen_helper_gvec_srsra_h,
412           .opt_opc = vecop_list,
413           .load_dest = true,
414           .vece = MO_16 },
415         { .fni4 = gen_srsra32_i32,
416           .fniv = gen_srsra_vec,
417           .fno = gen_helper_gvec_srsra_s,
418           .opt_opc = vecop_list,
419           .load_dest = true,
420           .vece = MO_32 },
421         { .fni8 = gen_srsra64_i64,
422           .fniv = gen_srsra_vec,
423           .fno = gen_helper_gvec_srsra_d,
424           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
425           .opt_opc = vecop_list,
426           .load_dest = true,
427           .vece = MO_64 },
428     };
429 
430     /* tszimm encoding produces immediates in the range [1..esize] */
431     tcg_debug_assert(shift > 0);
432     tcg_debug_assert(shift <= (8 << vece));
433 
434     /*
435      * Shifts larger than the element size are architecturally valid.
436      * Signed results in all sign bits.  With rounding, this produces
437      *   (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
438      * I.e. always zero.  With accumulation, this leaves D unchanged.
439      */
440     if (shift == (8 << vece)) {
441         /* Nop, but we do need to clear the tail. */
442         tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
443     } else {
444         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
445     }
446 }
447 
448 static void gen_urshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
449 {
450     TCGv_i64 t = tcg_temp_new_i64();
451 
452     tcg_gen_shri_i64(t, a, sh - 1);
453     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
454     tcg_gen_vec_shr8i_i64(d, a, sh);
455     tcg_gen_vec_add8_i64(d, d, t);
456 }
457 
458 static void gen_urshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
459 {
460     TCGv_i64 t = tcg_temp_new_i64();
461 
462     tcg_gen_shri_i64(t, a, sh - 1);
463     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
464     tcg_gen_vec_shr16i_i64(d, a, sh);
465     tcg_gen_vec_add16_i64(d, d, t);
466 }
467 
468 void gen_urshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
469 {
470     TCGv_i32 t;
471 
472     /* Handle shift by the input size for the benefit of trans_URSHR_ri */
473     if (sh == 32) {
474         tcg_gen_extract_i32(d, a, sh - 1, 1);
475         return;
476     }
477     t = tcg_temp_new_i32();
478     tcg_gen_extract_i32(t, a, sh - 1, 1);
479     tcg_gen_shri_i32(d, a, sh);
480     tcg_gen_add_i32(d, d, t);
481 }
482 
483 void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
484 {
485     TCGv_i64 t = tcg_temp_new_i64();
486 
487     tcg_gen_extract_i64(t, a, sh - 1, 1);
488     tcg_gen_shri_i64(d, a, sh);
489     tcg_gen_add_i64(d, d, t);
490 }
491 
492 static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift)
493 {
494     TCGv_vec t = tcg_temp_new_vec_matching(d);
495     TCGv_vec ones = tcg_temp_new_vec_matching(d);
496 
497     tcg_gen_shri_vec(vece, t, a, shift - 1);
498     tcg_gen_dupi_vec(vece, ones, 1);
499     tcg_gen_and_vec(vece, t, t, ones);
500     tcg_gen_shri_vec(vece, d, a, shift);
501     tcg_gen_add_vec(vece, d, d, t);
502 }
503 
504 void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
505                     int64_t shift, uint32_t opr_sz, uint32_t max_sz)
506 {
507     static const TCGOpcode vecop_list[] = {
508         INDEX_op_shri_vec, INDEX_op_add_vec, 0
509     };
510     static const GVecGen2i ops[4] = {
511         { .fni8 = gen_urshr8_i64,
512           .fniv = gen_urshr_vec,
513           .fno = gen_helper_gvec_urshr_b,
514           .opt_opc = vecop_list,
515           .vece = MO_8 },
516         { .fni8 = gen_urshr16_i64,
517           .fniv = gen_urshr_vec,
518           .fno = gen_helper_gvec_urshr_h,
519           .opt_opc = vecop_list,
520           .vece = MO_16 },
521         { .fni4 = gen_urshr32_i32,
522           .fniv = gen_urshr_vec,
523           .fno = gen_helper_gvec_urshr_s,
524           .opt_opc = vecop_list,
525           .vece = MO_32 },
526         { .fni8 = gen_urshr64_i64,
527           .fniv = gen_urshr_vec,
528           .fno = gen_helper_gvec_urshr_d,
529           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
530           .opt_opc = vecop_list,
531           .vece = MO_64 },
532     };
533 
534     /* tszimm encoding produces immediates in the range [1..esize] */
535     tcg_debug_assert(shift > 0);
536     tcg_debug_assert(shift <= (8 << vece));
537 
538     if (shift == (8 << vece)) {
539         /*
540          * Shifts larger than the element size are architecturally valid.
541          * Unsigned results in zero.  With rounding, this produces a
542          * copy of the most significant bit.
543          */
544         tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift - 1, opr_sz, max_sz);
545     } else {
546         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
547     }
548 }
549 
550 static void gen_ursra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
551 {
552     TCGv_i64 t = tcg_temp_new_i64();
553 
554     if (sh == 8) {
555         tcg_gen_vec_shr8i_i64(t, a, 7);
556     } else {
557         gen_urshr8_i64(t, a, sh);
558     }
559     tcg_gen_vec_add8_i64(d, d, t);
560 }
561 
562 static void gen_ursra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
563 {
564     TCGv_i64 t = tcg_temp_new_i64();
565 
566     if (sh == 16) {
567         tcg_gen_vec_shr16i_i64(t, a, 15);
568     } else {
569         gen_urshr16_i64(t, a, sh);
570     }
571     tcg_gen_vec_add16_i64(d, d, t);
572 }
573 
574 static void gen_ursra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
575 {
576     TCGv_i32 t = tcg_temp_new_i32();
577 
578     if (sh == 32) {
579         tcg_gen_shri_i32(t, a, 31);
580     } else {
581         gen_urshr32_i32(t, a, sh);
582     }
583     tcg_gen_add_i32(d, d, t);
584 }
585 
586 static void gen_ursra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
587 {
588     TCGv_i64 t = tcg_temp_new_i64();
589 
590     if (sh == 64) {
591         tcg_gen_shri_i64(t, a, 63);
592     } else {
593         gen_urshr64_i64(t, a, sh);
594     }
595     tcg_gen_add_i64(d, d, t);
596 }
597 
598 static void gen_ursra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
599 {
600     TCGv_vec t = tcg_temp_new_vec_matching(d);
601 
602     if (sh == (8 << vece)) {
603         tcg_gen_shri_vec(vece, t, a, sh - 1);
604     } else {
605         gen_urshr_vec(vece, t, a, sh);
606     }
607     tcg_gen_add_vec(vece, d, d, t);
608 }
609 
610 void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
611                     int64_t shift, uint32_t opr_sz, uint32_t max_sz)
612 {
613     static const TCGOpcode vecop_list[] = {
614         INDEX_op_shri_vec, INDEX_op_add_vec, 0
615     };
616     static const GVecGen2i ops[4] = {
617         { .fni8 = gen_ursra8_i64,
618           .fniv = gen_ursra_vec,
619           .fno = gen_helper_gvec_ursra_b,
620           .opt_opc = vecop_list,
621           .load_dest = true,
622           .vece = MO_8 },
623         { .fni8 = gen_ursra16_i64,
624           .fniv = gen_ursra_vec,
625           .fno = gen_helper_gvec_ursra_h,
626           .opt_opc = vecop_list,
627           .load_dest = true,
628           .vece = MO_16 },
629         { .fni4 = gen_ursra32_i32,
630           .fniv = gen_ursra_vec,
631           .fno = gen_helper_gvec_ursra_s,
632           .opt_opc = vecop_list,
633           .load_dest = true,
634           .vece = MO_32 },
635         { .fni8 = gen_ursra64_i64,
636           .fniv = gen_ursra_vec,
637           .fno = gen_helper_gvec_ursra_d,
638           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
639           .opt_opc = vecop_list,
640           .load_dest = true,
641           .vece = MO_64 },
642     };
643 
644     /* tszimm encoding produces immediates in the range [1..esize] */
645     tcg_debug_assert(shift > 0);
646     tcg_debug_assert(shift <= (8 << vece));
647 
648     tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
649 }
650 
651 static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
652 {
653     uint64_t mask = dup_const(MO_8, 0xff >> shift);
654     TCGv_i64 t = tcg_temp_new_i64();
655 
656     tcg_gen_shri_i64(t, a, shift);
657     tcg_gen_andi_i64(t, t, mask);
658     tcg_gen_andi_i64(d, d, ~mask);
659     tcg_gen_or_i64(d, d, t);
660 }
661 
662 static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
663 {
664     uint64_t mask = dup_const(MO_16, 0xffff >> shift);
665     TCGv_i64 t = tcg_temp_new_i64();
666 
667     tcg_gen_shri_i64(t, a, shift);
668     tcg_gen_andi_i64(t, t, mask);
669     tcg_gen_andi_i64(d, d, ~mask);
670     tcg_gen_or_i64(d, d, t);
671 }
672 
673 static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
674 {
675     tcg_gen_shri_i32(a, a, shift);
676     tcg_gen_deposit_i32(d, d, a, 0, 32 - shift);
677 }
678 
679 static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
680 {
681     tcg_gen_shri_i64(a, a, shift);
682     tcg_gen_deposit_i64(d, d, a, 0, 64 - shift);
683 }
684 
685 static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
686 {
687     TCGv_vec t = tcg_temp_new_vec_matching(d);
688     TCGv_vec m = tcg_temp_new_vec_matching(d);
689 
690     tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK((8 << vece) - sh, sh));
691     tcg_gen_shri_vec(vece, t, a, sh);
692     tcg_gen_and_vec(vece, d, d, m);
693     tcg_gen_or_vec(vece, d, d, t);
694 }
695 
696 void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
697                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
698 {
699     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
700     const GVecGen2i ops[4] = {
701         { .fni8 = gen_shr8_ins_i64,
702           .fniv = gen_shr_ins_vec,
703           .fno = gen_helper_gvec_sri_b,
704           .load_dest = true,
705           .opt_opc = vecop_list,
706           .vece = MO_8 },
707         { .fni8 = gen_shr16_ins_i64,
708           .fniv = gen_shr_ins_vec,
709           .fno = gen_helper_gvec_sri_h,
710           .load_dest = true,
711           .opt_opc = vecop_list,
712           .vece = MO_16 },
713         { .fni4 = gen_shr32_ins_i32,
714           .fniv = gen_shr_ins_vec,
715           .fno = gen_helper_gvec_sri_s,
716           .load_dest = true,
717           .opt_opc = vecop_list,
718           .vece = MO_32 },
719         { .fni8 = gen_shr64_ins_i64,
720           .fniv = gen_shr_ins_vec,
721           .fno = gen_helper_gvec_sri_d,
722           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
723           .load_dest = true,
724           .opt_opc = vecop_list,
725           .vece = MO_64 },
726     };
727 
728     /* tszimm encoding produces immediates in the range [1..esize]. */
729     tcg_debug_assert(shift > 0);
730     tcg_debug_assert(shift <= (8 << vece));
731 
732     /* Shift of esize leaves destination unchanged. */
733     if (shift < (8 << vece)) {
734         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
735     } else {
736         /* Nop, but we do need to clear the tail. */
737         tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
738     }
739 }
740 
741 static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
742 {
743     uint64_t mask = dup_const(MO_8, 0xff << shift);
744     TCGv_i64 t = tcg_temp_new_i64();
745 
746     tcg_gen_shli_i64(t, a, shift);
747     tcg_gen_andi_i64(t, t, mask);
748     tcg_gen_andi_i64(d, d, ~mask);
749     tcg_gen_or_i64(d, d, t);
750 }
751 
752 static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
753 {
754     uint64_t mask = dup_const(MO_16, 0xffff << shift);
755     TCGv_i64 t = tcg_temp_new_i64();
756 
757     tcg_gen_shli_i64(t, a, shift);
758     tcg_gen_andi_i64(t, t, mask);
759     tcg_gen_andi_i64(d, d, ~mask);
760     tcg_gen_or_i64(d, d, t);
761 }
762 
763 static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
764 {
765     tcg_gen_deposit_i32(d, d, a, shift, 32 - shift);
766 }
767 
768 static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
769 {
770     tcg_gen_deposit_i64(d, d, a, shift, 64 - shift);
771 }
772 
773 static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
774 {
775     TCGv_vec t = tcg_temp_new_vec_matching(d);
776     TCGv_vec m = tcg_temp_new_vec_matching(d);
777 
778     tcg_gen_shli_vec(vece, t, a, sh);
779     tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK(0, sh));
780     tcg_gen_and_vec(vece, d, d, m);
781     tcg_gen_or_vec(vece, d, d, t);
782 }
783 
784 void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
785                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
786 {
787     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
788     const GVecGen2i ops[4] = {
789         { .fni8 = gen_shl8_ins_i64,
790           .fniv = gen_shl_ins_vec,
791           .fno = gen_helper_gvec_sli_b,
792           .load_dest = true,
793           .opt_opc = vecop_list,
794           .vece = MO_8 },
795         { .fni8 = gen_shl16_ins_i64,
796           .fniv = gen_shl_ins_vec,
797           .fno = gen_helper_gvec_sli_h,
798           .load_dest = true,
799           .opt_opc = vecop_list,
800           .vece = MO_16 },
801         { .fni4 = gen_shl32_ins_i32,
802           .fniv = gen_shl_ins_vec,
803           .fno = gen_helper_gvec_sli_s,
804           .load_dest = true,
805           .opt_opc = vecop_list,
806           .vece = MO_32 },
807         { .fni8 = gen_shl64_ins_i64,
808           .fniv = gen_shl_ins_vec,
809           .fno = gen_helper_gvec_sli_d,
810           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
811           .load_dest = true,
812           .opt_opc = vecop_list,
813           .vece = MO_64 },
814     };
815 
816     /* tszimm encoding produces immediates in the range [0..esize-1]. */
817     tcg_debug_assert(shift >= 0);
818     tcg_debug_assert(shift < (8 << vece));
819 
820     if (shift == 0) {
821         tcg_gen_gvec_mov(vece, rd_ofs, rm_ofs, opr_sz, max_sz);
822     } else {
823         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
824     }
825 }
826 
827 static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
828 {
829     gen_helper_neon_mul_u8(a, a, b);
830     gen_helper_neon_add_u8(d, d, a);
831 }
832 
833 static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
834 {
835     gen_helper_neon_mul_u8(a, a, b);
836     gen_helper_neon_sub_u8(d, d, a);
837 }
838 
839 static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
840 {
841     gen_helper_neon_mul_u16(a, a, b);
842     gen_helper_neon_add_u16(d, d, a);
843 }
844 
845 static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
846 {
847     gen_helper_neon_mul_u16(a, a, b);
848     gen_helper_neon_sub_u16(d, d, a);
849 }
850 
851 static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
852 {
853     tcg_gen_mul_i32(a, a, b);
854     tcg_gen_add_i32(d, d, a);
855 }
856 
857 static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
858 {
859     tcg_gen_mul_i32(a, a, b);
860     tcg_gen_sub_i32(d, d, a);
861 }
862 
863 static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
864 {
865     tcg_gen_mul_i64(a, a, b);
866     tcg_gen_add_i64(d, d, a);
867 }
868 
869 static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
870 {
871     tcg_gen_mul_i64(a, a, b);
872     tcg_gen_sub_i64(d, d, a);
873 }
874 
875 static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
876 {
877     tcg_gen_mul_vec(vece, a, a, b);
878     tcg_gen_add_vec(vece, d, d, a);
879 }
880 
881 static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
882 {
883     tcg_gen_mul_vec(vece, a, a, b);
884     tcg_gen_sub_vec(vece, d, d, a);
885 }
886 
887 /* Note that while NEON does not support VMLA and VMLS as 64-bit ops,
888  * these tables are shared with AArch64 which does support them.
889  */
890 void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
891                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
892 {
893     static const TCGOpcode vecop_list[] = {
894         INDEX_op_mul_vec, INDEX_op_add_vec, 0
895     };
896     static const GVecGen3 ops[4] = {
897         { .fni4 = gen_mla8_i32,
898           .fniv = gen_mla_vec,
899           .load_dest = true,
900           .opt_opc = vecop_list,
901           .vece = MO_8 },
902         { .fni4 = gen_mla16_i32,
903           .fniv = gen_mla_vec,
904           .load_dest = true,
905           .opt_opc = vecop_list,
906           .vece = MO_16 },
907         { .fni4 = gen_mla32_i32,
908           .fniv = gen_mla_vec,
909           .load_dest = true,
910           .opt_opc = vecop_list,
911           .vece = MO_32 },
912         { .fni8 = gen_mla64_i64,
913           .fniv = gen_mla_vec,
914           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
915           .load_dest = true,
916           .opt_opc = vecop_list,
917           .vece = MO_64 },
918     };
919     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
920 }
921 
922 void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
923                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
924 {
925     static const TCGOpcode vecop_list[] = {
926         INDEX_op_mul_vec, INDEX_op_sub_vec, 0
927     };
928     static const GVecGen3 ops[4] = {
929         { .fni4 = gen_mls8_i32,
930           .fniv = gen_mls_vec,
931           .load_dest = true,
932           .opt_opc = vecop_list,
933           .vece = MO_8 },
934         { .fni4 = gen_mls16_i32,
935           .fniv = gen_mls_vec,
936           .load_dest = true,
937           .opt_opc = vecop_list,
938           .vece = MO_16 },
939         { .fni4 = gen_mls32_i32,
940           .fniv = gen_mls_vec,
941           .load_dest = true,
942           .opt_opc = vecop_list,
943           .vece = MO_32 },
944         { .fni8 = gen_mls64_i64,
945           .fniv = gen_mls_vec,
946           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
947           .load_dest = true,
948           .opt_opc = vecop_list,
949           .vece = MO_64 },
950     };
951     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
952 }
953 
/*
 * CMTST: the test is "if ((X & Y) != 0)".
 *
 * 32-bit form: emits a negsetcond with the TSTNE (test-bits-nonzero)
 * condition on @a and @b, writing the result to @d.
 */
static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_negsetcond_i32(TCG_COND_TSTNE, d, a, b);
}
959 
/*
 * CMTST, 64-bit form: emits a negsetcond with the TSTNE
 * (test-bits-nonzero, i.e. "(a & b) != 0") condition on @a and @b,
 * writing the result to @d.
 */
void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_negsetcond_i64(TCG_COND_TSTNE, d, a, b);
}
964 
/*
 * CMTST, host-vector form: a single vector compare of @a against @b
 * with the TSTNE (test-bits-nonzero) condition at element size @vece,
 * with the per-element result written to @d.
 */
static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_cmp_vec(TCG_COND_TSTNE, vece, d, a, b);
}
969 
970 void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
971                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
972 {
973     static const TCGOpcode vecop_list[] = { INDEX_op_cmp_vec, 0 };
974     static const GVecGen3 ops[4] = {
975         { .fni4 = gen_helper_neon_tst_u8,
976           .fniv = gen_cmtst_vec,
977           .opt_opc = vecop_list,
978           .vece = MO_8 },
979         { .fni4 = gen_helper_neon_tst_u16,
980           .fniv = gen_cmtst_vec,
981           .opt_opc = vecop_list,
982           .vece = MO_16 },
983         { .fni4 = gen_cmtst_i32,
984           .fniv = gen_cmtst_vec,
985           .opt_opc = vecop_list,
986           .vece = MO_32 },
987         { .fni8 = gen_cmtst_i64,
988           .fniv = gen_cmtst_vec,
989           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
990           .opt_opc = vecop_list,
991           .vece = MO_64 },
992     };
993     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
994 }
995 
996 void gen_ushl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
997 {
998     TCGv_i32 lval = tcg_temp_new_i32();
999     TCGv_i32 rval = tcg_temp_new_i32();
1000     TCGv_i32 lsh = tcg_temp_new_i32();
1001     TCGv_i32 rsh = tcg_temp_new_i32();
1002     TCGv_i32 zero = tcg_constant_i32(0);
1003     TCGv_i32 max = tcg_constant_i32(32);
1004 
1005     /*
1006      * Rely on the TCG guarantee that out of range shifts produce
1007      * unspecified results, not undefined behaviour (i.e. no trap).
1008      * Discard out-of-range results after the fact.
1009      */
1010     tcg_gen_ext8s_i32(lsh, shift);
1011     tcg_gen_neg_i32(rsh, lsh);
1012     tcg_gen_shl_i32(lval, src, lsh);
1013     tcg_gen_shr_i32(rval, src, rsh);
1014     tcg_gen_movcond_i32(TCG_COND_LTU, dst, lsh, max, lval, zero);
1015     tcg_gen_movcond_i32(TCG_COND_LTU, dst, rsh, max, rval, dst);
1016 }
1017 
1018 void gen_ushl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
1019 {
1020     TCGv_i64 lval = tcg_temp_new_i64();
1021     TCGv_i64 rval = tcg_temp_new_i64();
1022     TCGv_i64 lsh = tcg_temp_new_i64();
1023     TCGv_i64 rsh = tcg_temp_new_i64();
1024     TCGv_i64 zero = tcg_constant_i64(0);
1025     TCGv_i64 max = tcg_constant_i64(64);
1026 
1027     /*
1028      * Rely on the TCG guarantee that out of range shifts produce
1029      * unspecified results, not undefined behaviour (i.e. no trap).
1030      * Discard out-of-range results after the fact.
1031      */
1032     tcg_gen_ext8s_i64(lsh, shift);
1033     tcg_gen_neg_i64(rsh, lsh);
1034     tcg_gen_shl_i64(lval, src, lsh);
1035     tcg_gen_shr_i64(rval, src, rsh);
1036     tcg_gen_movcond_i64(TCG_COND_LTU, dst, lsh, max, lval, zero);
1037     tcg_gen_movcond_i64(TCG_COND_LTU, dst, rsh, max, rval, dst);
1038 }
1039 
1040 static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
1041                          TCGv_vec src, TCGv_vec shift)
1042 {
1043     TCGv_vec lval = tcg_temp_new_vec_matching(dst);
1044     TCGv_vec rval = tcg_temp_new_vec_matching(dst);
1045     TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
1046     TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
1047     TCGv_vec msk, max;
1048 
1049     tcg_gen_neg_vec(vece, rsh, shift);
1050     if (vece == MO_8) {
1051         tcg_gen_mov_vec(lsh, shift);
1052     } else {
1053         msk = tcg_temp_new_vec_matching(dst);
1054         tcg_gen_dupi_vec(vece, msk, 0xff);
1055         tcg_gen_and_vec(vece, lsh, shift, msk);
1056         tcg_gen_and_vec(vece, rsh, rsh, msk);
1057     }
1058 
1059     /*
1060      * Rely on the TCG guarantee that out of range shifts produce
1061      * unspecified results, not undefined behaviour (i.e. no trap).
1062      * Discard out-of-range results after the fact.
1063      */
1064     tcg_gen_shlv_vec(vece, lval, src, lsh);
1065     tcg_gen_shrv_vec(vece, rval, src, rsh);
1066 
1067     max = tcg_temp_new_vec_matching(dst);
1068     tcg_gen_dupi_vec(vece, max, 8 << vece);
1069 
1070     /*
1071      * The choice of LT (signed) and GEU (unsigned) are biased toward
1072      * the instructions of the x86_64 host.  For MO_8, the whole byte
1073      * is significant so we must use an unsigned compare; otherwise we
1074      * have already masked to a byte and so a signed compare works.
1075      * Other tcg hosts have a full set of comparisons and do not care.
1076      */
1077     if (vece == MO_8) {
1078         tcg_gen_cmp_vec(TCG_COND_GEU, vece, lsh, lsh, max);
1079         tcg_gen_cmp_vec(TCG_COND_GEU, vece, rsh, rsh, max);
1080         tcg_gen_andc_vec(vece, lval, lval, lsh);
1081         tcg_gen_andc_vec(vece, rval, rval, rsh);
1082     } else {
1083         tcg_gen_cmp_vec(TCG_COND_LT, vece, lsh, lsh, max);
1084         tcg_gen_cmp_vec(TCG_COND_LT, vece, rsh, rsh, max);
1085         tcg_gen_and_vec(vece, lval, lval, lsh);
1086         tcg_gen_and_vec(vece, rval, rval, rsh);
1087     }
1088     tcg_gen_or_vec(vece, dst, lval, rval);
1089 }
1090 
1091 void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1092                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1093 {
1094     static const TCGOpcode vecop_list[] = {
1095         INDEX_op_neg_vec, INDEX_op_shlv_vec,
1096         INDEX_op_shrv_vec, INDEX_op_cmp_vec, 0
1097     };
1098     static const GVecGen3 ops[4] = {
1099         { .fniv = gen_ushl_vec,
1100           .fno = gen_helper_gvec_ushl_b,
1101           .opt_opc = vecop_list,
1102           .vece = MO_8 },
1103         { .fniv = gen_ushl_vec,
1104           .fno = gen_helper_gvec_ushl_h,
1105           .opt_opc = vecop_list,
1106           .vece = MO_16 },
1107         { .fni4 = gen_ushl_i32,
1108           .fniv = gen_ushl_vec,
1109           .opt_opc = vecop_list,
1110           .vece = MO_32 },
1111         { .fni8 = gen_ushl_i64,
1112           .fniv = gen_ushl_vec,
1113           .opt_opc = vecop_list,
1114           .vece = MO_64 },
1115     };
1116     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1117 }
1118 
1119 void gen_sshl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
1120 {
1121     TCGv_i32 lval = tcg_temp_new_i32();
1122     TCGv_i32 rval = tcg_temp_new_i32();
1123     TCGv_i32 lsh = tcg_temp_new_i32();
1124     TCGv_i32 rsh = tcg_temp_new_i32();
1125     TCGv_i32 zero = tcg_constant_i32(0);
1126     TCGv_i32 max = tcg_constant_i32(31);
1127 
1128     /*
1129      * Rely on the TCG guarantee that out of range shifts produce
1130      * unspecified results, not undefined behaviour (i.e. no trap).
1131      * Discard out-of-range results after the fact.
1132      */
1133     tcg_gen_ext8s_i32(lsh, shift);
1134     tcg_gen_neg_i32(rsh, lsh);
1135     tcg_gen_shl_i32(lval, src, lsh);
1136     tcg_gen_umin_i32(rsh, rsh, max);
1137     tcg_gen_sar_i32(rval, src, rsh);
1138     tcg_gen_movcond_i32(TCG_COND_LEU, lval, lsh, max, lval, zero);
1139     tcg_gen_movcond_i32(TCG_COND_LT, dst, lsh, zero, rval, lval);
1140 }
1141 
1142 void gen_sshl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
1143 {
1144     TCGv_i64 lval = tcg_temp_new_i64();
1145     TCGv_i64 rval = tcg_temp_new_i64();
1146     TCGv_i64 lsh = tcg_temp_new_i64();
1147     TCGv_i64 rsh = tcg_temp_new_i64();
1148     TCGv_i64 zero = tcg_constant_i64(0);
1149     TCGv_i64 max = tcg_constant_i64(63);
1150 
1151     /*
1152      * Rely on the TCG guarantee that out of range shifts produce
1153      * unspecified results, not undefined behaviour (i.e. no trap).
1154      * Discard out-of-range results after the fact.
1155      */
1156     tcg_gen_ext8s_i64(lsh, shift);
1157     tcg_gen_neg_i64(rsh, lsh);
1158     tcg_gen_shl_i64(lval, src, lsh);
1159     tcg_gen_umin_i64(rsh, rsh, max);
1160     tcg_gen_sar_i64(rval, src, rsh);
1161     tcg_gen_movcond_i64(TCG_COND_LEU, lval, lsh, max, lval, zero);
1162     tcg_gen_movcond_i64(TCG_COND_LT, dst, lsh, zero, rval, lval);
1163 }
1164 
/*
 * Host-vector expansion of the signed shift by a signed, per-element
 * variable count.
 *
 * Both a left-shifted and an arithmetically right-shifted candidate
 * are computed for every element, then one of them is selected based
 * on the count.  Note that @tmp is reused for three different values
 * in turn: the low-byte mask, the maximum in-range count, and finally
 * the comparison threshold for the selection.
 */
static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
                         TCGv_vec src, TCGv_vec shift)
{
    TCGv_vec lval = tcg_temp_new_vec_matching(dst);
    TCGv_vec rval = tcg_temp_new_vec_matching(dst);
    TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec tmp = tcg_temp_new_vec_matching(dst);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    /* rsh = -shift; for non-byte elements keep only the low byte. */
    tcg_gen_neg_vec(vece, rsh, shift);
    if (vece == MO_8) {
        tcg_gen_mov_vec(lsh, shift);
    } else {
        tcg_gen_dupi_vec(vece, tmp, 0xff);
        tcg_gen_and_vec(vece, lsh, shift, tmp);
        tcg_gen_and_vec(vece, rsh, rsh, tmp);
    }

    /*
     * Bound rsh to (element width - 1) so that an out-of-bound right
     * shift becomes a full sign fill.  tmp then becomes the mask of
     * elements whose left-shift count exceeds that bound.
     */
    tcg_gen_dupi_vec(vece, tmp, (8 << vece) - 1);
    tcg_gen_umin_vec(vece, rsh, rsh, tmp);
    tcg_gen_cmp_vec(TCG_COND_GT, vece, tmp, lsh, tmp);

    tcg_gen_shlv_vec(vece, lval, src, lsh);
    tcg_gen_sarv_vec(vece, rval, src, rsh);

    /* Select in-bound left shift.  */
    tcg_gen_andc_vec(vece, lval, lval, tmp);

    /*
     * Select between left and right shift.
     * For MO_8, lsh is the raw signed byte, so "lsh < 0" picks the
     * right shift.  For wider elements lsh was masked to 0..0xff, so
     * "lsh < 0x80" means a non-negative count and picks the left
     * shift; hence the last two operands are swapped.
     */
    if (vece == MO_8) {
        tcg_gen_dupi_vec(vece, tmp, 0);
        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, rval, lval);
    } else {
        tcg_gen_dupi_vec(vece, tmp, 0x80);
        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, lval, rval);
    }
}
1208 
1209 void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1210                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1211 {
1212     static const TCGOpcode vecop_list[] = {
1213         INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec,
1214         INDEX_op_sarv_vec, INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0
1215     };
1216     static const GVecGen3 ops[4] = {
1217         { .fniv = gen_sshl_vec,
1218           .fno = gen_helper_gvec_sshl_b,
1219           .opt_opc = vecop_list,
1220           .vece = MO_8 },
1221         { .fniv = gen_sshl_vec,
1222           .fno = gen_helper_gvec_sshl_h,
1223           .opt_opc = vecop_list,
1224           .vece = MO_16 },
1225         { .fni4 = gen_sshl_i32,
1226           .fniv = gen_sshl_vec,
1227           .opt_opc = vecop_list,
1228           .vece = MO_32 },
1229         { .fni8 = gen_sshl_i64,
1230           .fniv = gen_sshl_vec,
1231           .opt_opc = vecop_list,
1232           .vece = MO_64 },
1233     };
1234     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1235 }
1236 
1237 void gen_gvec_srshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1238                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1239 {
1240     static gen_helper_gvec_3 * const fns[] = {
1241         gen_helper_gvec_srshl_b, gen_helper_gvec_srshl_h,
1242         gen_helper_gvec_srshl_s, gen_helper_gvec_srshl_d,
1243     };
1244     tcg_debug_assert(vece <= MO_64);
1245     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1246 }
1247 
1248 void gen_gvec_urshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1249                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1250 {
1251     static gen_helper_gvec_3 * const fns[] = {
1252         gen_helper_gvec_urshl_b, gen_helper_gvec_urshl_h,
1253         gen_helper_gvec_urshl_s, gen_helper_gvec_urshl_d,
1254     };
1255     tcg_debug_assert(vece <= MO_64);
1256     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1257 }
1258 
1259 void gen_neon_sqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1260                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1261 {
1262     static gen_helper_gvec_3_ptr * const fns[] = {
1263         gen_helper_neon_sqshl_b, gen_helper_neon_sqshl_h,
1264         gen_helper_neon_sqshl_s, gen_helper_neon_sqshl_d,
1265     };
1266     tcg_debug_assert(vece <= MO_64);
1267     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1268                        opr_sz, max_sz, 0, fns[vece]);
1269 }
1270 
1271 void gen_neon_uqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1272                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1273 {
1274     static gen_helper_gvec_3_ptr * const fns[] = {
1275         gen_helper_neon_uqshl_b, gen_helper_neon_uqshl_h,
1276         gen_helper_neon_uqshl_s, gen_helper_neon_uqshl_d,
1277     };
1278     tcg_debug_assert(vece <= MO_64);
1279     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1280                        opr_sz, max_sz, 0, fns[vece]);
1281 }
1282 
1283 void gen_neon_sqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1284                      uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1285 {
1286     static gen_helper_gvec_3_ptr * const fns[] = {
1287         gen_helper_neon_sqrshl_b, gen_helper_neon_sqrshl_h,
1288         gen_helper_neon_sqrshl_s, gen_helper_neon_sqrshl_d,
1289     };
1290     tcg_debug_assert(vece <= MO_64);
1291     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1292                        opr_sz, max_sz, 0, fns[vece]);
1293 }
1294 
1295 void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1296                      uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1297 {
1298     static gen_helper_gvec_3_ptr * const fns[] = {
1299         gen_helper_neon_uqrshl_b, gen_helper_neon_uqrshl_h,
1300         gen_helper_neon_uqrshl_s, gen_helper_neon_uqrshl_d,
1301     };
1302     tcg_debug_assert(vece <= MO_64);
1303     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1304                        opr_sz, max_sz, 0, fns[vece]);
1305 }
1306 
1307 void gen_uqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1308 {
1309     uint64_t max = MAKE_64BIT_MASK(0, 8 << esz);
1310     TCGv_i64 tmp = tcg_temp_new_i64();
1311 
1312     tcg_gen_add_i64(tmp, a, b);
1313     tcg_gen_umin_i64(res, tmp, tcg_constant_i64(max));
1314     tcg_gen_xor_i64(tmp, tmp, res);
1315     tcg_gen_or_i64(qc, qc, tmp);
1316 }
1317 
1318 void gen_uqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1319 {
1320     TCGv_i64 t = tcg_temp_new_i64();
1321 
1322     tcg_gen_add_i64(t, a, b);
1323     tcg_gen_movcond_i64(TCG_COND_LTU, res, t, a,
1324                         tcg_constant_i64(UINT64_MAX), t);
1325     tcg_gen_xor_i64(t, t, res);
1326     tcg_gen_or_i64(qc, qc, t);
1327 }
1328 
1329 static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1330                           TCGv_vec a, TCGv_vec b)
1331 {
1332     TCGv_vec x = tcg_temp_new_vec_matching(t);
1333     tcg_gen_add_vec(vece, x, a, b);
1334     tcg_gen_usadd_vec(vece, t, a, b);
1335     tcg_gen_xor_vec(vece, x, x, t);
1336     tcg_gen_or_vec(vece, qc, qc, x);
1337 }
1338 
1339 void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1340                        uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1341 {
1342     static const TCGOpcode vecop_list[] = {
1343         INDEX_op_usadd_vec, INDEX_op_add_vec, 0
1344     };
1345     static const GVecGen4 ops[4] = {
1346         { .fniv = gen_uqadd_vec,
1347           .fno = gen_helper_gvec_uqadd_b,
1348           .write_aofs = true,
1349           .opt_opc = vecop_list,
1350           .vece = MO_8 },
1351         { .fniv = gen_uqadd_vec,
1352           .fno = gen_helper_gvec_uqadd_h,
1353           .write_aofs = true,
1354           .opt_opc = vecop_list,
1355           .vece = MO_16 },
1356         { .fniv = gen_uqadd_vec,
1357           .fno = gen_helper_gvec_uqadd_s,
1358           .write_aofs = true,
1359           .opt_opc = vecop_list,
1360           .vece = MO_32 },
1361         { .fniv = gen_uqadd_vec,
1362           .fni8 = gen_uqadd_d,
1363           .fno = gen_helper_gvec_uqadd_d,
1364           .write_aofs = true,
1365           .opt_opc = vecop_list,
1366           .vece = MO_64 },
1367     };
1368 
1369     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1370     tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1371                    rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1372 }
1373 
1374 void gen_sqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1375 {
1376     int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
1377     int64_t min = -1ll - max;
1378     TCGv_i64 tmp = tcg_temp_new_i64();
1379 
1380     tcg_gen_add_i64(tmp, a, b);
1381     tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
1382     tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
1383     tcg_gen_xor_i64(tmp, tmp, res);
1384     tcg_gen_or_i64(qc, qc, tmp);
1385 }
1386 
1387 void gen_sqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1388 {
1389     TCGv_i64 t0 = tcg_temp_new_i64();
1390     TCGv_i64 t1 = tcg_temp_new_i64();
1391     TCGv_i64 t2 = tcg_temp_new_i64();
1392 
1393     tcg_gen_add_i64(t0, a, b);
1394 
1395     /* Compute signed overflow indication into T1 */
1396     tcg_gen_xor_i64(t1, a, b);
1397     tcg_gen_xor_i64(t2, t0, a);
1398     tcg_gen_andc_i64(t1, t2, t1);
1399 
1400     /* Compute saturated value into T2 */
1401     tcg_gen_sari_i64(t2, a, 63);
1402     tcg_gen_xori_i64(t2, t2, INT64_MAX);
1403 
1404     tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
1405     tcg_gen_xor_i64(t0, t0, res);
1406     tcg_gen_or_i64(qc, qc, t0);
1407 }
1408 
1409 static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1410                           TCGv_vec a, TCGv_vec b)
1411 {
1412     TCGv_vec x = tcg_temp_new_vec_matching(t);
1413     tcg_gen_add_vec(vece, x, a, b);
1414     tcg_gen_ssadd_vec(vece, t, a, b);
1415     tcg_gen_xor_vec(vece, x, x, t);
1416     tcg_gen_or_vec(vece, qc, qc, x);
1417 }
1418 
1419 void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1420                        uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1421 {
1422     static const TCGOpcode vecop_list[] = {
1423         INDEX_op_ssadd_vec, INDEX_op_add_vec, 0
1424     };
1425     static const GVecGen4 ops[4] = {
1426         { .fniv = gen_sqadd_vec,
1427           .fno = gen_helper_gvec_sqadd_b,
1428           .opt_opc = vecop_list,
1429           .write_aofs = true,
1430           .vece = MO_8 },
1431         { .fniv = gen_sqadd_vec,
1432           .fno = gen_helper_gvec_sqadd_h,
1433           .opt_opc = vecop_list,
1434           .write_aofs = true,
1435           .vece = MO_16 },
1436         { .fniv = gen_sqadd_vec,
1437           .fno = gen_helper_gvec_sqadd_s,
1438           .opt_opc = vecop_list,
1439           .write_aofs = true,
1440           .vece = MO_32 },
1441         { .fniv = gen_sqadd_vec,
1442           .fni8 = gen_sqadd_d,
1443           .fno = gen_helper_gvec_sqadd_d,
1444           .opt_opc = vecop_list,
1445           .write_aofs = true,
1446           .vece = MO_64 },
1447     };
1448 
1449     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1450     tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1451                    rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1452 }
1453 
1454 void gen_uqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1455 {
1456     TCGv_i64 tmp = tcg_temp_new_i64();
1457 
1458     tcg_gen_sub_i64(tmp, a, b);
1459     tcg_gen_smax_i64(res, tmp, tcg_constant_i64(0));
1460     tcg_gen_xor_i64(tmp, tmp, res);
1461     tcg_gen_or_i64(qc, qc, tmp);
1462 }
1463 
1464 void gen_uqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1465 {
1466     TCGv_i64 t = tcg_temp_new_i64();
1467 
1468     tcg_gen_sub_i64(t, a, b);
1469     tcg_gen_movcond_i64(TCG_COND_LTU, res, a, b, tcg_constant_i64(0), t);
1470     tcg_gen_xor_i64(t, t, res);
1471     tcg_gen_or_i64(qc, qc, t);
1472 }
1473 
1474 static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1475                           TCGv_vec a, TCGv_vec b)
1476 {
1477     TCGv_vec x = tcg_temp_new_vec_matching(t);
1478     tcg_gen_sub_vec(vece, x, a, b);
1479     tcg_gen_ussub_vec(vece, t, a, b);
1480     tcg_gen_xor_vec(vece, x, x, t);
1481     tcg_gen_or_vec(vece, qc, qc, x);
1482 }
1483 
1484 void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1485                        uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1486 {
1487     static const TCGOpcode vecop_list[] = {
1488         INDEX_op_ussub_vec, INDEX_op_sub_vec, 0
1489     };
1490     static const GVecGen4 ops[4] = {
1491         { .fniv = gen_uqsub_vec,
1492           .fno = gen_helper_gvec_uqsub_b,
1493           .opt_opc = vecop_list,
1494           .write_aofs = true,
1495           .vece = MO_8 },
1496         { .fniv = gen_uqsub_vec,
1497           .fno = gen_helper_gvec_uqsub_h,
1498           .opt_opc = vecop_list,
1499           .write_aofs = true,
1500           .vece = MO_16 },
1501         { .fniv = gen_uqsub_vec,
1502           .fno = gen_helper_gvec_uqsub_s,
1503           .opt_opc = vecop_list,
1504           .write_aofs = true,
1505           .vece = MO_32 },
1506         { .fniv = gen_uqsub_vec,
1507           .fni8 = gen_uqsub_d,
1508           .fno = gen_helper_gvec_uqsub_d,
1509           .opt_opc = vecop_list,
1510           .write_aofs = true,
1511           .vece = MO_64 },
1512     };
1513 
1514     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1515     tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1516                    rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1517 }
1518 
1519 void gen_sqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1520 {
1521     int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
1522     int64_t min = -1ll - max;
1523     TCGv_i64 tmp = tcg_temp_new_i64();
1524 
1525     tcg_gen_sub_i64(tmp, a, b);
1526     tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
1527     tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
1528     tcg_gen_xor_i64(tmp, tmp, res);
1529     tcg_gen_or_i64(qc, qc, tmp);
1530 }
1531 
1532 void gen_sqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1533 {
1534     TCGv_i64 t0 = tcg_temp_new_i64();
1535     TCGv_i64 t1 = tcg_temp_new_i64();
1536     TCGv_i64 t2 = tcg_temp_new_i64();
1537 
1538     tcg_gen_sub_i64(t0, a, b);
1539 
1540     /* Compute signed overflow indication into T1 */
1541     tcg_gen_xor_i64(t1, a, b);
1542     tcg_gen_xor_i64(t2, t0, a);
1543     tcg_gen_and_i64(t1, t1, t2);
1544 
1545     /* Compute saturated value into T2 */
1546     tcg_gen_sari_i64(t2, a, 63);
1547     tcg_gen_xori_i64(t2, t2, INT64_MAX);
1548 
1549     tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
1550     tcg_gen_xor_i64(t0, t0, res);
1551     tcg_gen_or_i64(qc, qc, t0);
1552 }
1553 
1554 static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1555                           TCGv_vec a, TCGv_vec b)
1556 {
1557     TCGv_vec x = tcg_temp_new_vec_matching(t);
1558     tcg_gen_sub_vec(vece, x, a, b);
1559     tcg_gen_sssub_vec(vece, t, a, b);
1560     tcg_gen_xor_vec(vece, x, x, t);
1561     tcg_gen_or_vec(vece, qc, qc, x);
1562 }
1563 
1564 void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1565                        uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1566 {
1567     static const TCGOpcode vecop_list[] = {
1568         INDEX_op_sssub_vec, INDEX_op_sub_vec, 0
1569     };
1570     static const GVecGen4 ops[4] = {
1571         { .fniv = gen_sqsub_vec,
1572           .fno = gen_helper_gvec_sqsub_b,
1573           .opt_opc = vecop_list,
1574           .write_aofs = true,
1575           .vece = MO_8 },
1576         { .fniv = gen_sqsub_vec,
1577           .fno = gen_helper_gvec_sqsub_h,
1578           .opt_opc = vecop_list,
1579           .write_aofs = true,
1580           .vece = MO_16 },
1581         { .fniv = gen_sqsub_vec,
1582           .fno = gen_helper_gvec_sqsub_s,
1583           .opt_opc = vecop_list,
1584           .write_aofs = true,
1585           .vece = MO_32 },
1586         { .fniv = gen_sqsub_vec,
1587           .fni8 = gen_sqsub_d,
1588           .fno = gen_helper_gvec_sqsub_d,
1589           .opt_opc = vecop_list,
1590           .write_aofs = true,
1591           .vece = MO_64 },
1592     };
1593 
1594     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1595     tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1596                    rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1597 }
1598 
1599 static void gen_sabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1600 {
1601     TCGv_i32 t = tcg_temp_new_i32();
1602 
1603     tcg_gen_sub_i32(t, a, b);
1604     tcg_gen_sub_i32(d, b, a);
1605     tcg_gen_movcond_i32(TCG_COND_LT, d, a, b, d, t);
1606 }
1607 
1608 static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1609 {
1610     TCGv_i64 t = tcg_temp_new_i64();
1611 
1612     tcg_gen_sub_i64(t, a, b);
1613     tcg_gen_sub_i64(d, b, a);
1614     tcg_gen_movcond_i64(TCG_COND_LT, d, a, b, d, t);
1615 }
1616 
1617 static void gen_sabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1618 {
1619     TCGv_vec t = tcg_temp_new_vec_matching(d);
1620 
1621     tcg_gen_smin_vec(vece, t, a, b);
1622     tcg_gen_smax_vec(vece, d, a, b);
1623     tcg_gen_sub_vec(vece, d, d, t);
1624 }
1625 
1626 void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1627                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1628 {
1629     static const TCGOpcode vecop_list[] = {
1630         INDEX_op_sub_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0
1631     };
1632     static const GVecGen3 ops[4] = {
1633         { .fniv = gen_sabd_vec,
1634           .fno = gen_helper_gvec_sabd_b,
1635           .opt_opc = vecop_list,
1636           .vece = MO_8 },
1637         { .fniv = gen_sabd_vec,
1638           .fno = gen_helper_gvec_sabd_h,
1639           .opt_opc = vecop_list,
1640           .vece = MO_16 },
1641         { .fni4 = gen_sabd_i32,
1642           .fniv = gen_sabd_vec,
1643           .fno = gen_helper_gvec_sabd_s,
1644           .opt_opc = vecop_list,
1645           .vece = MO_32 },
1646         { .fni8 = gen_sabd_i64,
1647           .fniv = gen_sabd_vec,
1648           .fno = gen_helper_gvec_sabd_d,
1649           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1650           .opt_opc = vecop_list,
1651           .vece = MO_64 },
1652     };
1653     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1654 }
1655 
1656 static void gen_uabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1657 {
1658     TCGv_i32 t = tcg_temp_new_i32();
1659 
1660     tcg_gen_sub_i32(t, a, b);
1661     tcg_gen_sub_i32(d, b, a);
1662     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, d, t);
1663 }
1664 
1665 static void gen_uabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1666 {
1667     TCGv_i64 t = tcg_temp_new_i64();
1668 
1669     tcg_gen_sub_i64(t, a, b);
1670     tcg_gen_sub_i64(d, b, a);
1671     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, d, t);
1672 }
1673 
1674 static void gen_uabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1675 {
1676     TCGv_vec t = tcg_temp_new_vec_matching(d);
1677 
1678     tcg_gen_umin_vec(vece, t, a, b);
1679     tcg_gen_umax_vec(vece, d, a, b);
1680     tcg_gen_sub_vec(vece, d, d, t);
1681 }
1682 
1683 void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1684                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1685 {
1686     static const TCGOpcode vecop_list[] = {
1687         INDEX_op_sub_vec, INDEX_op_umin_vec, INDEX_op_umax_vec, 0
1688     };
1689     static const GVecGen3 ops[4] = {
1690         { .fniv = gen_uabd_vec,
1691           .fno = gen_helper_gvec_uabd_b,
1692           .opt_opc = vecop_list,
1693           .vece = MO_8 },
1694         { .fniv = gen_uabd_vec,
1695           .fno = gen_helper_gvec_uabd_h,
1696           .opt_opc = vecop_list,
1697           .vece = MO_16 },
1698         { .fni4 = gen_uabd_i32,
1699           .fniv = gen_uabd_vec,
1700           .fno = gen_helper_gvec_uabd_s,
1701           .opt_opc = vecop_list,
1702           .vece = MO_32 },
1703         { .fni8 = gen_uabd_i64,
1704           .fniv = gen_uabd_vec,
1705           .fno = gen_helper_gvec_uabd_d,
1706           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1707           .opt_opc = vecop_list,
1708           .vece = MO_64 },
1709     };
1710     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1711 }
1712 
1713 static void gen_saba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1714 {
1715     TCGv_i32 t = tcg_temp_new_i32();
1716     gen_sabd_i32(t, a, b);
1717     tcg_gen_add_i32(d, d, t);
1718 }
1719 
1720 static void gen_saba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1721 {
1722     TCGv_i64 t = tcg_temp_new_i64();
1723     gen_sabd_i64(t, a, b);
1724     tcg_gen_add_i64(d, d, t);
1725 }
1726 
1727 static void gen_saba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1728 {
1729     TCGv_vec t = tcg_temp_new_vec_matching(d);
1730     gen_sabd_vec(vece, t, a, b);
1731     tcg_gen_add_vec(vece, d, d, t);
1732 }
1733 
1734 void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1735                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1736 {
1737     static const TCGOpcode vecop_list[] = {
1738         INDEX_op_sub_vec, INDEX_op_add_vec,
1739         INDEX_op_smin_vec, INDEX_op_smax_vec, 0
1740     };
1741     static const GVecGen3 ops[4] = {
1742         { .fniv = gen_saba_vec,
1743           .fno = gen_helper_gvec_saba_b,
1744           .opt_opc = vecop_list,
1745           .load_dest = true,
1746           .vece = MO_8 },
1747         { .fniv = gen_saba_vec,
1748           .fno = gen_helper_gvec_saba_h,
1749           .opt_opc = vecop_list,
1750           .load_dest = true,
1751           .vece = MO_16 },
1752         { .fni4 = gen_saba_i32,
1753           .fniv = gen_saba_vec,
1754           .fno = gen_helper_gvec_saba_s,
1755           .opt_opc = vecop_list,
1756           .load_dest = true,
1757           .vece = MO_32 },
1758         { .fni8 = gen_saba_i64,
1759           .fniv = gen_saba_vec,
1760           .fno = gen_helper_gvec_saba_d,
1761           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1762           .opt_opc = vecop_list,
1763           .load_dest = true,
1764           .vece = MO_64 },
1765     };
1766     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1767 }
1768 
1769 static void gen_uaba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1770 {
1771     TCGv_i32 t = tcg_temp_new_i32();
1772     gen_uabd_i32(t, a, b);
1773     tcg_gen_add_i32(d, d, t);
1774 }
1775 
1776 static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1777 {
1778     TCGv_i64 t = tcg_temp_new_i64();
1779     gen_uabd_i64(t, a, b);
1780     tcg_gen_add_i64(d, d, t);
1781 }
1782 
1783 static void gen_uaba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1784 {
1785     TCGv_vec t = tcg_temp_new_vec_matching(d);
1786     gen_uabd_vec(vece, t, a, b);
1787     tcg_gen_add_vec(vece, d, d, t);
1788 }
1789 
1790 void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1791                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1792 {
1793     static const TCGOpcode vecop_list[] = {
1794         INDEX_op_sub_vec, INDEX_op_add_vec,
1795         INDEX_op_umin_vec, INDEX_op_umax_vec, 0
1796     };
1797     static const GVecGen3 ops[4] = {
1798         { .fniv = gen_uaba_vec,
1799           .fno = gen_helper_gvec_uaba_b,
1800           .opt_opc = vecop_list,
1801           .load_dest = true,
1802           .vece = MO_8 },
1803         { .fniv = gen_uaba_vec,
1804           .fno = gen_helper_gvec_uaba_h,
1805           .opt_opc = vecop_list,
1806           .load_dest = true,
1807           .vece = MO_16 },
1808         { .fni4 = gen_uaba_i32,
1809           .fniv = gen_uaba_vec,
1810           .fno = gen_helper_gvec_uaba_s,
1811           .opt_opc = vecop_list,
1812           .load_dest = true,
1813           .vece = MO_32 },
1814         { .fni8 = gen_uaba_i64,
1815           .fniv = gen_uaba_vec,
1816           .fno = gen_helper_gvec_uaba_d,
1817           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1818           .opt_opc = vecop_list,
1819           .load_dest = true,
1820           .vece = MO_64 },
1821     };
1822     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1823 }
1824 
1825 void gen_gvec_addp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1826                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1827 {
1828     static gen_helper_gvec_3 * const fns[4] = {
1829         gen_helper_gvec_addp_b,
1830         gen_helper_gvec_addp_h,
1831         gen_helper_gvec_addp_s,
1832         gen_helper_gvec_addp_d,
1833     };
1834     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1835 }
1836 
1837 void gen_gvec_smaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1838                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1839 {
1840     static gen_helper_gvec_3 * const fns[4] = {
1841         gen_helper_gvec_smaxp_b,
1842         gen_helper_gvec_smaxp_h,
1843         gen_helper_gvec_smaxp_s,
1844     };
1845     tcg_debug_assert(vece <= MO_32);
1846     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1847 }
1848 
1849 void gen_gvec_sminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1850                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1851 {
1852     static gen_helper_gvec_3 * const fns[4] = {
1853         gen_helper_gvec_sminp_b,
1854         gen_helper_gvec_sminp_h,
1855         gen_helper_gvec_sminp_s,
1856     };
1857     tcg_debug_assert(vece <= MO_32);
1858     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1859 }
1860 
1861 void gen_gvec_umaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1862                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1863 {
1864     static gen_helper_gvec_3 * const fns[4] = {
1865         gen_helper_gvec_umaxp_b,
1866         gen_helper_gvec_umaxp_h,
1867         gen_helper_gvec_umaxp_s,
1868     };
1869     tcg_debug_assert(vece <= MO_32);
1870     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1871 }
1872 
1873 void gen_gvec_uminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1874                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1875 {
1876     static gen_helper_gvec_3 * const fns[4] = {
1877         gen_helper_gvec_uminp_b,
1878         gen_helper_gvec_uminp_h,
1879         gen_helper_gvec_uminp_s,
1880     };
1881     tcg_debug_assert(vece <= MO_32);
1882     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1883 }
1884 
1885 static void gen_shadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1886 {
1887     TCGv_i64 t = tcg_temp_new_i64();
1888 
1889     tcg_gen_and_i64(t, a, b);
1890     tcg_gen_vec_sar8i_i64(a, a, 1);
1891     tcg_gen_vec_sar8i_i64(b, b, 1);
1892     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
1893     tcg_gen_vec_add8_i64(d, a, b);
1894     tcg_gen_vec_add8_i64(d, d, t);
1895 }
1896 
1897 static void gen_shadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1898 {
1899     TCGv_i64 t = tcg_temp_new_i64();
1900 
1901     tcg_gen_and_i64(t, a, b);
1902     tcg_gen_vec_sar16i_i64(a, a, 1);
1903     tcg_gen_vec_sar16i_i64(b, b, 1);
1904     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
1905     tcg_gen_vec_add16_i64(d, a, b);
1906     tcg_gen_vec_add16_i64(d, d, t);
1907 }
1908 
1909 static void gen_shadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1910 {
1911     TCGv_i32 t = tcg_temp_new_i32();
1912 
1913     tcg_gen_and_i32(t, a, b);
1914     tcg_gen_sari_i32(a, a, 1);
1915     tcg_gen_sari_i32(b, b, 1);
1916     tcg_gen_andi_i32(t, t, 1);
1917     tcg_gen_add_i32(d, a, b);
1918     tcg_gen_add_i32(d, d, t);
1919 }
1920 
1921 static void gen_shadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1922 {
1923     TCGv_vec t = tcg_temp_new_vec_matching(d);
1924 
1925     tcg_gen_and_vec(vece, t, a, b);
1926     tcg_gen_sari_vec(vece, a, a, 1);
1927     tcg_gen_sari_vec(vece, b, b, 1);
1928     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
1929     tcg_gen_add_vec(vece, d, a, b);
1930     tcg_gen_add_vec(vece, d, d, t);
1931 }
1932 
1933 void gen_gvec_shadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1934                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1935 {
1936     static const TCGOpcode vecop_list[] = {
1937         INDEX_op_sari_vec, INDEX_op_add_vec, 0
1938     };
1939     static const GVecGen3 g[] = {
1940         { .fni8 = gen_shadd8_i64,
1941           .fniv = gen_shadd_vec,
1942           .opt_opc = vecop_list,
1943           .vece = MO_8 },
1944         { .fni8 = gen_shadd16_i64,
1945           .fniv = gen_shadd_vec,
1946           .opt_opc = vecop_list,
1947           .vece = MO_16 },
1948         { .fni4 = gen_shadd_i32,
1949           .fniv = gen_shadd_vec,
1950           .opt_opc = vecop_list,
1951           .vece = MO_32 },
1952     };
1953     tcg_debug_assert(vece <= MO_32);
1954     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
1955 }
1956 
1957 static void gen_uhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1958 {
1959     TCGv_i64 t = tcg_temp_new_i64();
1960 
1961     tcg_gen_and_i64(t, a, b);
1962     tcg_gen_vec_shr8i_i64(a, a, 1);
1963     tcg_gen_vec_shr8i_i64(b, b, 1);
1964     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
1965     tcg_gen_vec_add8_i64(d, a, b);
1966     tcg_gen_vec_add8_i64(d, d, t);
1967 }
1968 
1969 static void gen_uhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1970 {
1971     TCGv_i64 t = tcg_temp_new_i64();
1972 
1973     tcg_gen_and_i64(t, a, b);
1974     tcg_gen_vec_shr16i_i64(a, a, 1);
1975     tcg_gen_vec_shr16i_i64(b, b, 1);
1976     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
1977     tcg_gen_vec_add16_i64(d, a, b);
1978     tcg_gen_vec_add16_i64(d, d, t);
1979 }
1980 
1981 static void gen_uhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1982 {
1983     TCGv_i32 t = tcg_temp_new_i32();
1984 
1985     tcg_gen_and_i32(t, a, b);
1986     tcg_gen_shri_i32(a, a, 1);
1987     tcg_gen_shri_i32(b, b, 1);
1988     tcg_gen_andi_i32(t, t, 1);
1989     tcg_gen_add_i32(d, a, b);
1990     tcg_gen_add_i32(d, d, t);
1991 }
1992 
1993 static void gen_uhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1994 {
1995     TCGv_vec t = tcg_temp_new_vec_matching(d);
1996 
1997     tcg_gen_and_vec(vece, t, a, b);
1998     tcg_gen_shri_vec(vece, a, a, 1);
1999     tcg_gen_shri_vec(vece, b, b, 1);
2000     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
2001     tcg_gen_add_vec(vece, d, a, b);
2002     tcg_gen_add_vec(vece, d, d, t);
2003 }
2004 
2005 void gen_gvec_uhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
2006                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
2007 {
2008     static const TCGOpcode vecop_list[] = {
2009         INDEX_op_shri_vec, INDEX_op_add_vec, 0
2010     };
2011     static const GVecGen3 g[] = {
2012         { .fni8 = gen_uhadd8_i64,
2013           .fniv = gen_uhadd_vec,
2014           .opt_opc = vecop_list,
2015           .vece = MO_8 },
2016         { .fni8 = gen_uhadd16_i64,
2017           .fniv = gen_uhadd_vec,
2018           .opt_opc = vecop_list,
2019           .vece = MO_16 },
2020         { .fni4 = gen_uhadd_i32,
2021           .fniv = gen_uhadd_vec,
2022           .opt_opc = vecop_list,
2023           .vece = MO_32 },
2024     };
2025     tcg_debug_assert(vece <= MO_32);
2026     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2027 }
2028 
2029 static void gen_shsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2030 {
2031     TCGv_i64 t = tcg_temp_new_i64();
2032 
2033     tcg_gen_andc_i64(t, b, a);
2034     tcg_gen_vec_sar8i_i64(a, a, 1);
2035     tcg_gen_vec_sar8i_i64(b, b, 1);
2036     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
2037     tcg_gen_vec_sub8_i64(d, a, b);
2038     tcg_gen_vec_sub8_i64(d, d, t);
2039 }
2040 
2041 static void gen_shsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2042 {
2043     TCGv_i64 t = tcg_temp_new_i64();
2044 
2045     tcg_gen_andc_i64(t, b, a);
2046     tcg_gen_vec_sar16i_i64(a, a, 1);
2047     tcg_gen_vec_sar16i_i64(b, b, 1);
2048     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
2049     tcg_gen_vec_sub16_i64(d, a, b);
2050     tcg_gen_vec_sub16_i64(d, d, t);
2051 }
2052 
2053 static void gen_shsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2054 {
2055     TCGv_i32 t = tcg_temp_new_i32();
2056 
2057     tcg_gen_andc_i32(t, b, a);
2058     tcg_gen_sari_i32(a, a, 1);
2059     tcg_gen_sari_i32(b, b, 1);
2060     tcg_gen_andi_i32(t, t, 1);
2061     tcg_gen_sub_i32(d, a, b);
2062     tcg_gen_sub_i32(d, d, t);
2063 }
2064 
2065 static void gen_shsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
2066 {
2067     TCGv_vec t = tcg_temp_new_vec_matching(d);
2068 
2069     tcg_gen_andc_vec(vece, t, b, a);
2070     tcg_gen_sari_vec(vece, a, a, 1);
2071     tcg_gen_sari_vec(vece, b, b, 1);
2072     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
2073     tcg_gen_sub_vec(vece, d, a, b);
2074     tcg_gen_sub_vec(vece, d, d, t);
2075 }
2076 
2077 void gen_gvec_shsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
2078                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
2079 {
2080     static const TCGOpcode vecop_list[] = {
2081         INDEX_op_sari_vec, INDEX_op_sub_vec, 0
2082     };
2083     static const GVecGen3 g[4] = {
2084         { .fni8 = gen_shsub8_i64,
2085           .fniv = gen_shsub_vec,
2086           .opt_opc = vecop_list,
2087           .vece = MO_8 },
2088         { .fni8 = gen_shsub16_i64,
2089           .fniv = gen_shsub_vec,
2090           .opt_opc = vecop_list,
2091           .vece = MO_16 },
2092         { .fni4 = gen_shsub_i32,
2093           .fniv = gen_shsub_vec,
2094           .opt_opc = vecop_list,
2095           .vece = MO_32 },
2096     };
2097     assert(vece <= MO_32);
2098     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2099 }
2100 
2101 static void gen_uhsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2102 {
2103     TCGv_i64 t = tcg_temp_new_i64();
2104 
2105     tcg_gen_andc_i64(t, b, a);
2106     tcg_gen_vec_shr8i_i64(a, a, 1);
2107     tcg_gen_vec_shr8i_i64(b, b, 1);
2108     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
2109     tcg_gen_vec_sub8_i64(d, a, b);
2110     tcg_gen_vec_sub8_i64(d, d, t);
2111 }
2112 
2113 static void gen_uhsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2114 {
2115     TCGv_i64 t = tcg_temp_new_i64();
2116 
2117     tcg_gen_andc_i64(t, b, a);
2118     tcg_gen_vec_shr16i_i64(a, a, 1);
2119     tcg_gen_vec_shr16i_i64(b, b, 1);
2120     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
2121     tcg_gen_vec_sub16_i64(d, a, b);
2122     tcg_gen_vec_sub16_i64(d, d, t);
2123 }
2124 
2125 static void gen_uhsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2126 {
2127     TCGv_i32 t = tcg_temp_new_i32();
2128 
2129     tcg_gen_andc_i32(t, b, a);
2130     tcg_gen_shri_i32(a, a, 1);
2131     tcg_gen_shri_i32(b, b, 1);
2132     tcg_gen_andi_i32(t, t, 1);
2133     tcg_gen_sub_i32(d, a, b);
2134     tcg_gen_sub_i32(d, d, t);
2135 }
2136 
2137 static void gen_uhsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
2138 {
2139     TCGv_vec t = tcg_temp_new_vec_matching(d);
2140 
2141     tcg_gen_andc_vec(vece, t, b, a);
2142     tcg_gen_shri_vec(vece, a, a, 1);
2143     tcg_gen_shri_vec(vece, b, b, 1);
2144     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
2145     tcg_gen_sub_vec(vece, d, a, b);
2146     tcg_gen_sub_vec(vece, d, d, t);
2147 }
2148 
2149 void gen_gvec_uhsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
2150                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
2151 {
2152     static const TCGOpcode vecop_list[] = {
2153         INDEX_op_shri_vec, INDEX_op_sub_vec, 0
2154     };
2155     static const GVecGen3 g[4] = {
2156         { .fni8 = gen_uhsub8_i64,
2157           .fniv = gen_uhsub_vec,
2158           .opt_opc = vecop_list,
2159           .vece = MO_8 },
2160         { .fni8 = gen_uhsub16_i64,
2161           .fniv = gen_uhsub_vec,
2162           .opt_opc = vecop_list,
2163           .vece = MO_16 },
2164         { .fni4 = gen_uhsub_i32,
2165           .fniv = gen_uhsub_vec,
2166           .opt_opc = vecop_list,
2167           .vece = MO_32 },
2168     };
2169     assert(vece <= MO_32);
2170     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2171 }
2172 
2173 static void gen_srhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2174 {
2175     TCGv_i64 t = tcg_temp_new_i64();
2176 
2177     tcg_gen_or_i64(t, a, b);
2178     tcg_gen_vec_sar8i_i64(a, a, 1);
2179     tcg_gen_vec_sar8i_i64(b, b, 1);
2180     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
2181     tcg_gen_vec_add8_i64(d, a, b);
2182     tcg_gen_vec_add8_i64(d, d, t);
2183 }
2184 
2185 static void gen_srhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2186 {
2187     TCGv_i64 t = tcg_temp_new_i64();
2188 
2189     tcg_gen_or_i64(t, a, b);
2190     tcg_gen_vec_sar16i_i64(a, a, 1);
2191     tcg_gen_vec_sar16i_i64(b, b, 1);
2192     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
2193     tcg_gen_vec_add16_i64(d, a, b);
2194     tcg_gen_vec_add16_i64(d, d, t);
2195 }
2196 
2197 static void gen_srhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2198 {
2199     TCGv_i32 t = tcg_temp_new_i32();
2200 
2201     tcg_gen_or_i32(t, a, b);
2202     tcg_gen_sari_i32(a, a, 1);
2203     tcg_gen_sari_i32(b, b, 1);
2204     tcg_gen_andi_i32(t, t, 1);
2205     tcg_gen_add_i32(d, a, b);
2206     tcg_gen_add_i32(d, d, t);
2207 }
2208 
2209 static void gen_srhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
2210 {
2211     TCGv_vec t = tcg_temp_new_vec_matching(d);
2212 
2213     tcg_gen_or_vec(vece, t, a, b);
2214     tcg_gen_sari_vec(vece, a, a, 1);
2215     tcg_gen_sari_vec(vece, b, b, 1);
2216     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
2217     tcg_gen_add_vec(vece, d, a, b);
2218     tcg_gen_add_vec(vece, d, d, t);
2219 }
2220 
2221 void gen_gvec_srhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
2222                      uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
2223 {
2224     static const TCGOpcode vecop_list[] = {
2225         INDEX_op_sari_vec, INDEX_op_add_vec, 0
2226     };
2227     static const GVecGen3 g[] = {
2228         { .fni8 = gen_srhadd8_i64,
2229           .fniv = gen_srhadd_vec,
2230           .opt_opc = vecop_list,
2231           .vece = MO_8 },
2232         { .fni8 = gen_srhadd16_i64,
2233           .fniv = gen_srhadd_vec,
2234           .opt_opc = vecop_list,
2235           .vece = MO_16 },
2236         { .fni4 = gen_srhadd_i32,
2237           .fniv = gen_srhadd_vec,
2238           .opt_opc = vecop_list,
2239           .vece = MO_32 },
2240     };
2241     assert(vece <= MO_32);
2242     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2243 }
2244 
2245 static void gen_urhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2246 {
2247     TCGv_i64 t = tcg_temp_new_i64();
2248 
2249     tcg_gen_or_i64(t, a, b);
2250     tcg_gen_vec_shr8i_i64(a, a, 1);
2251     tcg_gen_vec_shr8i_i64(b, b, 1);
2252     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
2253     tcg_gen_vec_add8_i64(d, a, b);
2254     tcg_gen_vec_add8_i64(d, d, t);
2255 }
2256 
2257 static void gen_urhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2258 {
2259     TCGv_i64 t = tcg_temp_new_i64();
2260 
2261     tcg_gen_or_i64(t, a, b);
2262     tcg_gen_vec_shr16i_i64(a, a, 1);
2263     tcg_gen_vec_shr16i_i64(b, b, 1);
2264     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
2265     tcg_gen_vec_add16_i64(d, a, b);
2266     tcg_gen_vec_add16_i64(d, d, t);
2267 }
2268 
2269 static void gen_urhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2270 {
2271     TCGv_i32 t = tcg_temp_new_i32();
2272 
2273     tcg_gen_or_i32(t, a, b);
2274     tcg_gen_shri_i32(a, a, 1);
2275     tcg_gen_shri_i32(b, b, 1);
2276     tcg_gen_andi_i32(t, t, 1);
2277     tcg_gen_add_i32(d, a, b);
2278     tcg_gen_add_i32(d, d, t);
2279 }
2280 
2281 static void gen_urhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
2282 {
2283     TCGv_vec t = tcg_temp_new_vec_matching(d);
2284 
2285     tcg_gen_or_vec(vece, t, a, b);
2286     tcg_gen_shri_vec(vece, a, a, 1);
2287     tcg_gen_shri_vec(vece, b, b, 1);
2288     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
2289     tcg_gen_add_vec(vece, d, a, b);
2290     tcg_gen_add_vec(vece, d, d, t);
2291 }
2292 
2293 void gen_gvec_urhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
2294                      uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
2295 {
2296     static const TCGOpcode vecop_list[] = {
2297         INDEX_op_shri_vec, INDEX_op_add_vec, 0
2298     };
2299     static const GVecGen3 g[] = {
2300         { .fni8 = gen_urhadd8_i64,
2301           .fniv = gen_urhadd_vec,
2302           .opt_opc = vecop_list,
2303           .vece = MO_8 },
2304         { .fni8 = gen_urhadd16_i64,
2305           .fniv = gen_urhadd_vec,
2306           .opt_opc = vecop_list,
2307           .vece = MO_16 },
2308         { .fni4 = gen_urhadd_i32,
2309           .fniv = gen_urhadd_vec,
2310           .opt_opc = vecop_list,
2311           .vece = MO_32 },
2312     };
2313     assert(vece <= MO_32);
2314     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2315 }
2316