xref: /openbmc/qemu/target/arm/tcg/gengvec.c (revision 8d3dfb6205a9e00dff30c09e4f6f0d274a090dbe)
1 /*
2  *  ARM generic vector expansion
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *  Copyright (c) 2005-2007 CodeSourcery
6  *  Copyright (c) 2007 OpenedHand, Ltd.
7  *
8  * This library is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * This library is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20  */
21 
22 #include "qemu/osdep.h"
23 #include "translate.h"
24 
25 
26 static void gen_gvec_fn3_qc(uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs,
27                             uint32_t opr_sz, uint32_t max_sz,
28                             gen_helper_gvec_3_ptr *fn)
29 {
30     TCGv_ptr qc_ptr = tcg_temp_new_ptr();
31 
32     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
33     tcg_gen_addi_ptr(qc_ptr, tcg_env, offsetof(CPUARMState, vfp.qc));
34     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, qc_ptr,
35                        opr_sz, max_sz, 0, fn);
36 }
37 
38 void gen_gvec_sqdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
39                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
40 {
41     static gen_helper_gvec_3_ptr * const fns[2] = {
42         gen_helper_neon_sqdmulh_h, gen_helper_neon_sqdmulh_s
43     };
44     tcg_debug_assert(vece >= 1 && vece <= 2);
45     gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
46 }
47 
48 void gen_gvec_sqrdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
49                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
50 {
51     static gen_helper_gvec_3_ptr * const fns[2] = {
52         gen_helper_neon_sqrdmulh_h, gen_helper_neon_sqrdmulh_s
53     };
54     tcg_debug_assert(vece >= 1 && vece <= 2);
55     gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
56 }
57 
58 void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
59                           uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
60 {
61     static gen_helper_gvec_3_ptr * const fns[2] = {
62         gen_helper_gvec_qrdmlah_s16, gen_helper_gvec_qrdmlah_s32
63     };
64     tcg_debug_assert(vece >= 1 && vece <= 2);
65     gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
66 }
67 
68 void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
69                           uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
70 {
71     static gen_helper_gvec_3_ptr * const fns[2] = {
72         gen_helper_gvec_qrdmlsh_s16, gen_helper_gvec_qrdmlsh_s32
73     };
74     tcg_debug_assert(vece >= 1 && vece <= 2);
75     gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
76 }
77 
78 #define GEN_CMP0(NAME, COND)                              \
79     void NAME(unsigned vece, uint32_t d, uint32_t m,      \
80               uint32_t opr_sz, uint32_t max_sz)           \
81     { tcg_gen_gvec_cmpi(COND, vece, d, m, 0, opr_sz, max_sz); }
82 
83 GEN_CMP0(gen_gvec_ceq0, TCG_COND_EQ)
84 GEN_CMP0(gen_gvec_cle0, TCG_COND_LE)
85 GEN_CMP0(gen_gvec_cge0, TCG_COND_GE)
86 GEN_CMP0(gen_gvec_clt0, TCG_COND_LT)
87 GEN_CMP0(gen_gvec_cgt0, TCG_COND_GT)
88 
89 #undef GEN_CMP0
90 
/*
 * Signed (arithmetic) shift right by immediate.  A shift count of the
 * element size or more is clamped to esize - 1, which yields all sign bits.
 */
void gen_gvec_sshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    const int64_t max_shift = (8 << vece) - 1;

    if (shift > max_shift) {
        shift = max_shift;
    }
    tcg_gen_gvec_sari(vece, rd_ofs, rm_ofs, shift, opr_sz, max_sz);
}
98 
/*
 * Unsigned (logical) shift right by immediate.  A shift count of the
 * element size or more produces an all-zero destination.
 */
void gen_gvec_ushr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    const int esize = 8 << vece;

    if (shift < esize) {
        tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift, opr_sz, max_sz);
        return;
    }

    /* Out of range: every bit is shifted out, so store zero. */
    tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
}
109 
110 static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
111 {
112     tcg_gen_vec_sar8i_i64(a, a, shift);
113     tcg_gen_vec_add8_i64(d, d, a);
114 }
115 
116 static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
117 {
118     tcg_gen_vec_sar16i_i64(a, a, shift);
119     tcg_gen_vec_add16_i64(d, d, a);
120 }
121 
122 static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
123 {
124     tcg_gen_sari_i32(a, a, shift);
125     tcg_gen_add_i32(d, d, a);
126 }
127 
128 static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
129 {
130     tcg_gen_sari_i64(a, a, shift);
131     tcg_gen_add_i64(d, d, a);
132 }
133 
134 static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
135 {
136     tcg_gen_sari_vec(vece, a, a, sh);
137     tcg_gen_add_vec(vece, d, d, a);
138 }
139 
140 void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
141                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
142 {
143     static const TCGOpcode vecop_list[] = {
144         INDEX_op_sari_vec, INDEX_op_add_vec, 0
145     };
146     static const GVecGen2i ops[4] = {
147         { .fni8 = gen_ssra8_i64,
148           .fniv = gen_ssra_vec,
149           .fno = gen_helper_gvec_ssra_b,
150           .load_dest = true,
151           .opt_opc = vecop_list,
152           .vece = MO_8 },
153         { .fni8 = gen_ssra16_i64,
154           .fniv = gen_ssra_vec,
155           .fno = gen_helper_gvec_ssra_h,
156           .load_dest = true,
157           .opt_opc = vecop_list,
158           .vece = MO_16 },
159         { .fni4 = gen_ssra32_i32,
160           .fniv = gen_ssra_vec,
161           .fno = gen_helper_gvec_ssra_s,
162           .load_dest = true,
163           .opt_opc = vecop_list,
164           .vece = MO_32 },
165         { .fni8 = gen_ssra64_i64,
166           .fniv = gen_ssra_vec,
167           .fno = gen_helper_gvec_ssra_d,
168           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
169           .opt_opc = vecop_list,
170           .load_dest = true,
171           .vece = MO_64 },
172     };
173 
174     /* tszimm encoding produces immediates in the range [1..esize]. */
175     tcg_debug_assert(shift > 0);
176     tcg_debug_assert(shift <= (8 << vece));
177 
178     /*
179      * Shifts larger than the element size are architecturally valid.
180      * Signed results in all sign bits.
181      */
182     shift = MIN(shift, (8 << vece) - 1);
183     tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
184 }
185 
186 static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
187 {
188     tcg_gen_vec_shr8i_i64(a, a, shift);
189     tcg_gen_vec_add8_i64(d, d, a);
190 }
191 
192 static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
193 {
194     tcg_gen_vec_shr16i_i64(a, a, shift);
195     tcg_gen_vec_add16_i64(d, d, a);
196 }
197 
198 static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
199 {
200     tcg_gen_shri_i32(a, a, shift);
201     tcg_gen_add_i32(d, d, a);
202 }
203 
204 static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
205 {
206     tcg_gen_shri_i64(a, a, shift);
207     tcg_gen_add_i64(d, d, a);
208 }
209 
210 static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
211 {
212     tcg_gen_shri_vec(vece, a, a, sh);
213     tcg_gen_add_vec(vece, d, d, a);
214 }
215 
216 void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
217                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
218 {
219     static const TCGOpcode vecop_list[] = {
220         INDEX_op_shri_vec, INDEX_op_add_vec, 0
221     };
222     static const GVecGen2i ops[4] = {
223         { .fni8 = gen_usra8_i64,
224           .fniv = gen_usra_vec,
225           .fno = gen_helper_gvec_usra_b,
226           .load_dest = true,
227           .opt_opc = vecop_list,
228           .vece = MO_8, },
229         { .fni8 = gen_usra16_i64,
230           .fniv = gen_usra_vec,
231           .fno = gen_helper_gvec_usra_h,
232           .load_dest = true,
233           .opt_opc = vecop_list,
234           .vece = MO_16, },
235         { .fni4 = gen_usra32_i32,
236           .fniv = gen_usra_vec,
237           .fno = gen_helper_gvec_usra_s,
238           .load_dest = true,
239           .opt_opc = vecop_list,
240           .vece = MO_32, },
241         { .fni8 = gen_usra64_i64,
242           .fniv = gen_usra_vec,
243           .fno = gen_helper_gvec_usra_d,
244           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
245           .load_dest = true,
246           .opt_opc = vecop_list,
247           .vece = MO_64, },
248     };
249 
250     /* tszimm encoding produces immediates in the range [1..esize]. */
251     tcg_debug_assert(shift > 0);
252     tcg_debug_assert(shift <= (8 << vece));
253 
254     /*
255      * Shifts larger than the element size are architecturally valid.
256      * Unsigned results in all zeros as input to accumulate: nop.
257      */
258     if (shift < (8 << vece)) {
259         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
260     } else {
261         /* Nop, but we do need to clear the tail. */
262         tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
263     }
264 }
265 
266 /*
267  * Shift one less than the requested amount, and the low bit is
268  * the rounding bit.  For the 8 and 16-bit operations, because we
269  * mask the low bit, we can perform a normal integer shift instead
270  * of a vector shift.
271  */
272 static void gen_srshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
273 {
274     TCGv_i64 t = tcg_temp_new_i64();
275 
276     tcg_gen_shri_i64(t, a, sh - 1);
277     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
278     tcg_gen_vec_sar8i_i64(d, a, sh);
279     tcg_gen_vec_add8_i64(d, d, t);
280 }
281 
282 static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
283 {
284     TCGv_i64 t = tcg_temp_new_i64();
285 
286     tcg_gen_shri_i64(t, a, sh - 1);
287     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
288     tcg_gen_vec_sar16i_i64(d, a, sh);
289     tcg_gen_vec_add16_i64(d, d, t);
290 }
291 
292 void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
293 {
294     TCGv_i32 t;
295 
296     /* Handle shift by the input size for the benefit of trans_SRSHR_ri */
297     if (sh == 32) {
298         tcg_gen_movi_i32(d, 0);
299         return;
300     }
301     t = tcg_temp_new_i32();
302     tcg_gen_extract_i32(t, a, sh - 1, 1);
303     tcg_gen_sari_i32(d, a, sh);
304     tcg_gen_add_i32(d, d, t);
305 }
306 
307 void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
308 {
309     TCGv_i64 t = tcg_temp_new_i64();
310 
311     tcg_gen_extract_i64(t, a, sh - 1, 1);
312     tcg_gen_sari_i64(d, a, sh);
313     tcg_gen_add_i64(d, d, t);
314 }
315 
316 static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
317 {
318     TCGv_vec t = tcg_temp_new_vec_matching(d);
319     TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);
320 
321     tcg_gen_shri_vec(vece, t, a, sh - 1);
322     tcg_gen_and_vec(vece, t, t, ones);
323     tcg_gen_sari_vec(vece, d, a, sh);
324     tcg_gen_add_vec(vece, d, d, t);
325 }
326 
327 void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
328                     int64_t shift, uint32_t opr_sz, uint32_t max_sz)
329 {
330     static const TCGOpcode vecop_list[] = {
331         INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
332     };
333     static const GVecGen2i ops[4] = {
334         { .fni8 = gen_srshr8_i64,
335           .fniv = gen_srshr_vec,
336           .fno = gen_helper_gvec_srshr_b,
337           .opt_opc = vecop_list,
338           .vece = MO_8 },
339         { .fni8 = gen_srshr16_i64,
340           .fniv = gen_srshr_vec,
341           .fno = gen_helper_gvec_srshr_h,
342           .opt_opc = vecop_list,
343           .vece = MO_16 },
344         { .fni4 = gen_srshr32_i32,
345           .fniv = gen_srshr_vec,
346           .fno = gen_helper_gvec_srshr_s,
347           .opt_opc = vecop_list,
348           .vece = MO_32 },
349         { .fni8 = gen_srshr64_i64,
350           .fniv = gen_srshr_vec,
351           .fno = gen_helper_gvec_srshr_d,
352           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
353           .opt_opc = vecop_list,
354           .vece = MO_64 },
355     };
356 
357     /* tszimm encoding produces immediates in the range [1..esize] */
358     tcg_debug_assert(shift > 0);
359     tcg_debug_assert(shift <= (8 << vece));
360 
361     if (shift == (8 << vece)) {
362         /*
363          * Shifts larger than the element size are architecturally valid.
364          * Signed results in all sign bits.  With rounding, this produces
365          *   (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
366          * I.e. always zero.
367          */
368         tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
369     } else {
370         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
371     }
372 }
373 
374 static void gen_srsra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
375 {
376     TCGv_i64 t = tcg_temp_new_i64();
377 
378     gen_srshr8_i64(t, a, sh);
379     tcg_gen_vec_add8_i64(d, d, t);
380 }
381 
382 static void gen_srsra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
383 {
384     TCGv_i64 t = tcg_temp_new_i64();
385 
386     gen_srshr16_i64(t, a, sh);
387     tcg_gen_vec_add16_i64(d, d, t);
388 }
389 
390 static void gen_srsra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
391 {
392     TCGv_i32 t = tcg_temp_new_i32();
393 
394     gen_srshr32_i32(t, a, sh);
395     tcg_gen_add_i32(d, d, t);
396 }
397 
398 static void gen_srsra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
399 {
400     TCGv_i64 t = tcg_temp_new_i64();
401 
402     gen_srshr64_i64(t, a, sh);
403     tcg_gen_add_i64(d, d, t);
404 }
405 
406 static void gen_srsra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
407 {
408     TCGv_vec t = tcg_temp_new_vec_matching(d);
409 
410     gen_srshr_vec(vece, t, a, sh);
411     tcg_gen_add_vec(vece, d, d, t);
412 }
413 
414 void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
415                     int64_t shift, uint32_t opr_sz, uint32_t max_sz)
416 {
417     static const TCGOpcode vecop_list[] = {
418         INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
419     };
420     static const GVecGen2i ops[4] = {
421         { .fni8 = gen_srsra8_i64,
422           .fniv = gen_srsra_vec,
423           .fno = gen_helper_gvec_srsra_b,
424           .opt_opc = vecop_list,
425           .load_dest = true,
426           .vece = MO_8 },
427         { .fni8 = gen_srsra16_i64,
428           .fniv = gen_srsra_vec,
429           .fno = gen_helper_gvec_srsra_h,
430           .opt_opc = vecop_list,
431           .load_dest = true,
432           .vece = MO_16 },
433         { .fni4 = gen_srsra32_i32,
434           .fniv = gen_srsra_vec,
435           .fno = gen_helper_gvec_srsra_s,
436           .opt_opc = vecop_list,
437           .load_dest = true,
438           .vece = MO_32 },
439         { .fni8 = gen_srsra64_i64,
440           .fniv = gen_srsra_vec,
441           .fno = gen_helper_gvec_srsra_d,
442           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
443           .opt_opc = vecop_list,
444           .load_dest = true,
445           .vece = MO_64 },
446     };
447 
448     /* tszimm encoding produces immediates in the range [1..esize] */
449     tcg_debug_assert(shift > 0);
450     tcg_debug_assert(shift <= (8 << vece));
451 
452     /*
453      * Shifts larger than the element size are architecturally valid.
454      * Signed results in all sign bits.  With rounding, this produces
455      *   (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
456      * I.e. always zero.  With accumulation, this leaves D unchanged.
457      */
458     if (shift == (8 << vece)) {
459         /* Nop, but we do need to clear the tail. */
460         tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
461     } else {
462         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
463     }
464 }
465 
466 static void gen_urshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
467 {
468     TCGv_i64 t = tcg_temp_new_i64();
469 
470     tcg_gen_shri_i64(t, a, sh - 1);
471     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
472     tcg_gen_vec_shr8i_i64(d, a, sh);
473     tcg_gen_vec_add8_i64(d, d, t);
474 }
475 
476 static void gen_urshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
477 {
478     TCGv_i64 t = tcg_temp_new_i64();
479 
480     tcg_gen_shri_i64(t, a, sh - 1);
481     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
482     tcg_gen_vec_shr16i_i64(d, a, sh);
483     tcg_gen_vec_add16_i64(d, d, t);
484 }
485 
486 void gen_urshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
487 {
488     TCGv_i32 t;
489 
490     /* Handle shift by the input size for the benefit of trans_URSHR_ri */
491     if (sh == 32) {
492         tcg_gen_extract_i32(d, a, sh - 1, 1);
493         return;
494     }
495     t = tcg_temp_new_i32();
496     tcg_gen_extract_i32(t, a, sh - 1, 1);
497     tcg_gen_shri_i32(d, a, sh);
498     tcg_gen_add_i32(d, d, t);
499 }
500 
501 void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
502 {
503     TCGv_i64 t = tcg_temp_new_i64();
504 
505     tcg_gen_extract_i64(t, a, sh - 1, 1);
506     tcg_gen_shri_i64(d, a, sh);
507     tcg_gen_add_i64(d, d, t);
508 }
509 
510 static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift)
511 {
512     TCGv_vec t = tcg_temp_new_vec_matching(d);
513     TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);
514 
515     tcg_gen_shri_vec(vece, t, a, shift - 1);
516     tcg_gen_and_vec(vece, t, t, ones);
517     tcg_gen_shri_vec(vece, d, a, shift);
518     tcg_gen_add_vec(vece, d, d, t);
519 }
520 
521 void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
522                     int64_t shift, uint32_t opr_sz, uint32_t max_sz)
523 {
524     static const TCGOpcode vecop_list[] = {
525         INDEX_op_shri_vec, INDEX_op_add_vec, 0
526     };
527     static const GVecGen2i ops[4] = {
528         { .fni8 = gen_urshr8_i64,
529           .fniv = gen_urshr_vec,
530           .fno = gen_helper_gvec_urshr_b,
531           .opt_opc = vecop_list,
532           .vece = MO_8 },
533         { .fni8 = gen_urshr16_i64,
534           .fniv = gen_urshr_vec,
535           .fno = gen_helper_gvec_urshr_h,
536           .opt_opc = vecop_list,
537           .vece = MO_16 },
538         { .fni4 = gen_urshr32_i32,
539           .fniv = gen_urshr_vec,
540           .fno = gen_helper_gvec_urshr_s,
541           .opt_opc = vecop_list,
542           .vece = MO_32 },
543         { .fni8 = gen_urshr64_i64,
544           .fniv = gen_urshr_vec,
545           .fno = gen_helper_gvec_urshr_d,
546           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
547           .opt_opc = vecop_list,
548           .vece = MO_64 },
549     };
550 
551     /* tszimm encoding produces immediates in the range [1..esize] */
552     tcg_debug_assert(shift > 0);
553     tcg_debug_assert(shift <= (8 << vece));
554 
555     if (shift == (8 << vece)) {
556         /*
557          * Shifts larger than the element size are architecturally valid.
558          * Unsigned results in zero.  With rounding, this produces a
559          * copy of the most significant bit.
560          */
561         tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift - 1, opr_sz, max_sz);
562     } else {
563         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
564     }
565 }
566 
567 static void gen_ursra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
568 {
569     TCGv_i64 t = tcg_temp_new_i64();
570 
571     if (sh == 8) {
572         tcg_gen_vec_shr8i_i64(t, a, 7);
573     } else {
574         gen_urshr8_i64(t, a, sh);
575     }
576     tcg_gen_vec_add8_i64(d, d, t);
577 }
578 
579 static void gen_ursra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
580 {
581     TCGv_i64 t = tcg_temp_new_i64();
582 
583     if (sh == 16) {
584         tcg_gen_vec_shr16i_i64(t, a, 15);
585     } else {
586         gen_urshr16_i64(t, a, sh);
587     }
588     tcg_gen_vec_add16_i64(d, d, t);
589 }
590 
591 static void gen_ursra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
592 {
593     TCGv_i32 t = tcg_temp_new_i32();
594 
595     if (sh == 32) {
596         tcg_gen_shri_i32(t, a, 31);
597     } else {
598         gen_urshr32_i32(t, a, sh);
599     }
600     tcg_gen_add_i32(d, d, t);
601 }
602 
603 static void gen_ursra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
604 {
605     TCGv_i64 t = tcg_temp_new_i64();
606 
607     if (sh == 64) {
608         tcg_gen_shri_i64(t, a, 63);
609     } else {
610         gen_urshr64_i64(t, a, sh);
611     }
612     tcg_gen_add_i64(d, d, t);
613 }
614 
615 static void gen_ursra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
616 {
617     TCGv_vec t = tcg_temp_new_vec_matching(d);
618 
619     if (sh == (8 << vece)) {
620         tcg_gen_shri_vec(vece, t, a, sh - 1);
621     } else {
622         gen_urshr_vec(vece, t, a, sh);
623     }
624     tcg_gen_add_vec(vece, d, d, t);
625 }
626 
627 void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
628                     int64_t shift, uint32_t opr_sz, uint32_t max_sz)
629 {
630     static const TCGOpcode vecop_list[] = {
631         INDEX_op_shri_vec, INDEX_op_add_vec, 0
632     };
633     static const GVecGen2i ops[4] = {
634         { .fni8 = gen_ursra8_i64,
635           .fniv = gen_ursra_vec,
636           .fno = gen_helper_gvec_ursra_b,
637           .opt_opc = vecop_list,
638           .load_dest = true,
639           .vece = MO_8 },
640         { .fni8 = gen_ursra16_i64,
641           .fniv = gen_ursra_vec,
642           .fno = gen_helper_gvec_ursra_h,
643           .opt_opc = vecop_list,
644           .load_dest = true,
645           .vece = MO_16 },
646         { .fni4 = gen_ursra32_i32,
647           .fniv = gen_ursra_vec,
648           .fno = gen_helper_gvec_ursra_s,
649           .opt_opc = vecop_list,
650           .load_dest = true,
651           .vece = MO_32 },
652         { .fni8 = gen_ursra64_i64,
653           .fniv = gen_ursra_vec,
654           .fno = gen_helper_gvec_ursra_d,
655           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
656           .opt_opc = vecop_list,
657           .load_dest = true,
658           .vece = MO_64 },
659     };
660 
661     /* tszimm encoding produces immediates in the range [1..esize] */
662     tcg_debug_assert(shift > 0);
663     tcg_debug_assert(shift <= (8 << vece));
664 
665     tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
666 }
667 
668 static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
669 {
670     uint64_t mask = dup_const(MO_8, 0xff >> shift);
671     TCGv_i64 t = tcg_temp_new_i64();
672 
673     tcg_gen_shri_i64(t, a, shift);
674     tcg_gen_andi_i64(t, t, mask);
675     tcg_gen_andi_i64(d, d, ~mask);
676     tcg_gen_or_i64(d, d, t);
677 }
678 
679 static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
680 {
681     uint64_t mask = dup_const(MO_16, 0xffff >> shift);
682     TCGv_i64 t = tcg_temp_new_i64();
683 
684     tcg_gen_shri_i64(t, a, shift);
685     tcg_gen_andi_i64(t, t, mask);
686     tcg_gen_andi_i64(d, d, ~mask);
687     tcg_gen_or_i64(d, d, t);
688 }
689 
690 static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
691 {
692     tcg_gen_shri_i32(a, a, shift);
693     tcg_gen_deposit_i32(d, d, a, 0, 32 - shift);
694 }
695 
696 static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
697 {
698     tcg_gen_shri_i64(a, a, shift);
699     tcg_gen_deposit_i64(d, d, a, 0, 64 - shift);
700 }
701 
702 static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
703 {
704     TCGv_vec t = tcg_temp_new_vec_matching(d);
705     int64_t mi = MAKE_64BIT_MASK((8 << vece) - sh, sh);
706     TCGv_vec m = tcg_constant_vec_matching(d, vece, mi);
707 
708     tcg_gen_shri_vec(vece, t, a, sh);
709     tcg_gen_and_vec(vece, d, d, m);
710     tcg_gen_or_vec(vece, d, d, t);
711 }
712 
713 void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
714                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
715 {
716     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
717     const GVecGen2i ops[4] = {
718         { .fni8 = gen_shr8_ins_i64,
719           .fniv = gen_shr_ins_vec,
720           .fno = gen_helper_gvec_sri_b,
721           .load_dest = true,
722           .opt_opc = vecop_list,
723           .vece = MO_8 },
724         { .fni8 = gen_shr16_ins_i64,
725           .fniv = gen_shr_ins_vec,
726           .fno = gen_helper_gvec_sri_h,
727           .load_dest = true,
728           .opt_opc = vecop_list,
729           .vece = MO_16 },
730         { .fni4 = gen_shr32_ins_i32,
731           .fniv = gen_shr_ins_vec,
732           .fno = gen_helper_gvec_sri_s,
733           .load_dest = true,
734           .opt_opc = vecop_list,
735           .vece = MO_32 },
736         { .fni8 = gen_shr64_ins_i64,
737           .fniv = gen_shr_ins_vec,
738           .fno = gen_helper_gvec_sri_d,
739           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
740           .load_dest = true,
741           .opt_opc = vecop_list,
742           .vece = MO_64 },
743     };
744 
745     /* tszimm encoding produces immediates in the range [1..esize]. */
746     tcg_debug_assert(shift > 0);
747     tcg_debug_assert(shift <= (8 << vece));
748 
749     /* Shift of esize leaves destination unchanged. */
750     if (shift < (8 << vece)) {
751         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
752     } else {
753         /* Nop, but we do need to clear the tail. */
754         tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
755     }
756 }
757 
758 static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
759 {
760     uint64_t mask = dup_const(MO_8, 0xff << shift);
761     TCGv_i64 t = tcg_temp_new_i64();
762 
763     tcg_gen_shli_i64(t, a, shift);
764     tcg_gen_andi_i64(t, t, mask);
765     tcg_gen_andi_i64(d, d, ~mask);
766     tcg_gen_or_i64(d, d, t);
767 }
768 
769 static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
770 {
771     uint64_t mask = dup_const(MO_16, 0xffff << shift);
772     TCGv_i64 t = tcg_temp_new_i64();
773 
774     tcg_gen_shli_i64(t, a, shift);
775     tcg_gen_andi_i64(t, t, mask);
776     tcg_gen_andi_i64(d, d, ~mask);
777     tcg_gen_or_i64(d, d, t);
778 }
779 
780 static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
781 {
782     tcg_gen_deposit_i32(d, d, a, shift, 32 - shift);
783 }
784 
785 static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
786 {
787     tcg_gen_deposit_i64(d, d, a, shift, 64 - shift);
788 }
789 
790 static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
791 {
792     TCGv_vec t = tcg_temp_new_vec_matching(d);
793     TCGv_vec m = tcg_constant_vec_matching(d, vece, MAKE_64BIT_MASK(0, sh));
794 
795     tcg_gen_shli_vec(vece, t, a, sh);
796     tcg_gen_and_vec(vece, d, d, m);
797     tcg_gen_or_vec(vece, d, d, t);
798 }
799 
800 void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
801                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
802 {
803     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
804     const GVecGen2i ops[4] = {
805         { .fni8 = gen_shl8_ins_i64,
806           .fniv = gen_shl_ins_vec,
807           .fno = gen_helper_gvec_sli_b,
808           .load_dest = true,
809           .opt_opc = vecop_list,
810           .vece = MO_8 },
811         { .fni8 = gen_shl16_ins_i64,
812           .fniv = gen_shl_ins_vec,
813           .fno = gen_helper_gvec_sli_h,
814           .load_dest = true,
815           .opt_opc = vecop_list,
816           .vece = MO_16 },
817         { .fni4 = gen_shl32_ins_i32,
818           .fniv = gen_shl_ins_vec,
819           .fno = gen_helper_gvec_sli_s,
820           .load_dest = true,
821           .opt_opc = vecop_list,
822           .vece = MO_32 },
823         { .fni8 = gen_shl64_ins_i64,
824           .fniv = gen_shl_ins_vec,
825           .fno = gen_helper_gvec_sli_d,
826           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
827           .load_dest = true,
828           .opt_opc = vecop_list,
829           .vece = MO_64 },
830     };
831 
832     /* tszimm encoding produces immediates in the range [0..esize-1]. */
833     tcg_debug_assert(shift >= 0);
834     tcg_debug_assert(shift < (8 << vece));
835 
836     if (shift == 0) {
837         tcg_gen_gvec_mov(vece, rd_ofs, rm_ofs, opr_sz, max_sz);
838     } else {
839         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
840     }
841 }
842 
843 static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
844 {
845     gen_helper_neon_mul_u8(a, a, b);
846     gen_helper_neon_add_u8(d, d, a);
847 }
848 
849 static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
850 {
851     gen_helper_neon_mul_u8(a, a, b);
852     gen_helper_neon_sub_u8(d, d, a);
853 }
854 
855 static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
856 {
857     gen_helper_neon_mul_u16(a, a, b);
858     gen_helper_neon_add_u16(d, d, a);
859 }
860 
861 static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
862 {
863     gen_helper_neon_mul_u16(a, a, b);
864     gen_helper_neon_sub_u16(d, d, a);
865 }
866 
867 static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
868 {
869     tcg_gen_mul_i32(a, a, b);
870     tcg_gen_add_i32(d, d, a);
871 }
872 
873 static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
874 {
875     tcg_gen_mul_i32(a, a, b);
876     tcg_gen_sub_i32(d, d, a);
877 }
878 
879 static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
880 {
881     tcg_gen_mul_i64(a, a, b);
882     tcg_gen_add_i64(d, d, a);
883 }
884 
885 static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
886 {
887     tcg_gen_mul_i64(a, a, b);
888     tcg_gen_sub_i64(d, d, a);
889 }
890 
891 static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
892 {
893     tcg_gen_mul_vec(vece, a, a, b);
894     tcg_gen_add_vec(vece, d, d, a);
895 }
896 
897 static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
898 {
899     tcg_gen_mul_vec(vece, a, a, b);
900     tcg_gen_sub_vec(vece, d, d, a);
901 }
902 
903 /* Note that while NEON does not support VMLA and VMLS as 64-bit ops,
904  * these tables are shared with AArch64 which does support them.
905  */
906 void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
907                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
908 {
909     static const TCGOpcode vecop_list[] = {
910         INDEX_op_mul_vec, INDEX_op_add_vec, 0
911     };
912     static const GVecGen3 ops[4] = {
913         { .fni4 = gen_mla8_i32,
914           .fniv = gen_mla_vec,
915           .load_dest = true,
916           .opt_opc = vecop_list,
917           .vece = MO_8 },
918         { .fni4 = gen_mla16_i32,
919           .fniv = gen_mla_vec,
920           .load_dest = true,
921           .opt_opc = vecop_list,
922           .vece = MO_16 },
923         { .fni4 = gen_mla32_i32,
924           .fniv = gen_mla_vec,
925           .load_dest = true,
926           .opt_opc = vecop_list,
927           .vece = MO_32 },
928         { .fni8 = gen_mla64_i64,
929           .fniv = gen_mla_vec,
930           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
931           .load_dest = true,
932           .opt_opc = vecop_list,
933           .vece = MO_64 },
934     };
935     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
936 }
937 
938 void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
939                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
940 {
941     static const TCGOpcode vecop_list[] = {
942         INDEX_op_mul_vec, INDEX_op_sub_vec, 0
943     };
944     static const GVecGen3 ops[4] = {
945         { .fni4 = gen_mls8_i32,
946           .fniv = gen_mls_vec,
947           .load_dest = true,
948           .opt_opc = vecop_list,
949           .vece = MO_8 },
950         { .fni4 = gen_mls16_i32,
951           .fniv = gen_mls_vec,
952           .load_dest = true,
953           .opt_opc = vecop_list,
954           .vece = MO_16 },
955         { .fni4 = gen_mls32_i32,
956           .fniv = gen_mls_vec,
957           .load_dest = true,
958           .opt_opc = vecop_list,
959           .vece = MO_32 },
960         { .fni8 = gen_mls64_i64,
961           .fniv = gen_mls_vec,
962           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
963           .load_dest = true,
964           .opt_opc = vecop_list,
965           .vece = MO_64 },
966     };
967     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
968 }
969 
970 /* CMTST : test is "if (X & Y != 0)". */
971 static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
972 {
973     tcg_gen_negsetcond_i32(TCG_COND_TSTNE, d, a, b);
974 }
975 
976 void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
977 {
978     tcg_gen_negsetcond_i64(TCG_COND_TSTNE, d, a, b);
979 }
980 
981 static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
982 {
983     tcg_gen_cmp_vec(TCG_COND_TSTNE, vece, d, a, b);
984 }
985 
986 void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
987                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
988 {
989     static const TCGOpcode vecop_list[] = { INDEX_op_cmp_vec, 0 };
990     static const GVecGen3 ops[4] = {
991         { .fni4 = gen_helper_neon_tst_u8,
992           .fniv = gen_cmtst_vec,
993           .opt_opc = vecop_list,
994           .vece = MO_8 },
995         { .fni4 = gen_helper_neon_tst_u16,
996           .fniv = gen_cmtst_vec,
997           .opt_opc = vecop_list,
998           .vece = MO_16 },
999         { .fni4 = gen_cmtst_i32,
1000           .fniv = gen_cmtst_vec,
1001           .opt_opc = vecop_list,
1002           .vece = MO_32 },
1003         { .fni8 = gen_cmtst_i64,
1004           .fniv = gen_cmtst_vec,
1005           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1006           .opt_opc = vecop_list,
1007           .vece = MO_64 },
1008     };
1009     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1010 }
1011 
1012 void gen_ushl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
1013 {
1014     TCGv_i32 lval = tcg_temp_new_i32();
1015     TCGv_i32 rval = tcg_temp_new_i32();
1016     TCGv_i32 lsh = tcg_temp_new_i32();
1017     TCGv_i32 rsh = tcg_temp_new_i32();
1018     TCGv_i32 zero = tcg_constant_i32(0);
1019     TCGv_i32 max = tcg_constant_i32(32);
1020 
1021     /*
1022      * Rely on the TCG guarantee that out of range shifts produce
1023      * unspecified results, not undefined behaviour (i.e. no trap).
1024      * Discard out-of-range results after the fact.
1025      */
1026     tcg_gen_ext8s_i32(lsh, shift);
1027     tcg_gen_neg_i32(rsh, lsh);
1028     tcg_gen_shl_i32(lval, src, lsh);
1029     tcg_gen_shr_i32(rval, src, rsh);
1030     tcg_gen_movcond_i32(TCG_COND_LTU, dst, lsh, max, lval, zero);
1031     tcg_gen_movcond_i32(TCG_COND_LTU, dst, rsh, max, rval, dst);
1032 }
1033 
1034 void gen_ushl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
1035 {
1036     TCGv_i64 lval = tcg_temp_new_i64();
1037     TCGv_i64 rval = tcg_temp_new_i64();
1038     TCGv_i64 lsh = tcg_temp_new_i64();
1039     TCGv_i64 rsh = tcg_temp_new_i64();
1040     TCGv_i64 zero = tcg_constant_i64(0);
1041     TCGv_i64 max = tcg_constant_i64(64);
1042 
1043     /*
1044      * Rely on the TCG guarantee that out of range shifts produce
1045      * unspecified results, not undefined behaviour (i.e. no trap).
1046      * Discard out-of-range results after the fact.
1047      */
1048     tcg_gen_ext8s_i64(lsh, shift);
1049     tcg_gen_neg_i64(rsh, lsh);
1050     tcg_gen_shl_i64(lval, src, lsh);
1051     tcg_gen_shr_i64(rval, src, rsh);
1052     tcg_gen_movcond_i64(TCG_COND_LTU, dst, lsh, max, lval, zero);
1053     tcg_gen_movcond_i64(TCG_COND_LTU, dst, rsh, max, rval, dst);
1054 }
1055 
1056 static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
1057                          TCGv_vec src, TCGv_vec shift)
1058 {
1059     TCGv_vec lval = tcg_temp_new_vec_matching(dst);
1060     TCGv_vec rval = tcg_temp_new_vec_matching(dst);
1061     TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
1062     TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
1063     TCGv_vec max, zero;
1064 
1065     tcg_gen_neg_vec(vece, rsh, shift);
1066     if (vece == MO_8) {
1067         tcg_gen_mov_vec(lsh, shift);
1068     } else {
1069         TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
1070         tcg_gen_and_vec(vece, lsh, shift, msk);
1071         tcg_gen_and_vec(vece, rsh, rsh, msk);
1072     }
1073 
1074     /*
1075      * Rely on the TCG guarantee that out of range shifts produce
1076      * unspecified results, not undefined behaviour (i.e. no trap).
1077      * Discard out-of-range results after the fact.
1078      */
1079     tcg_gen_shlv_vec(vece, lval, src, lsh);
1080     tcg_gen_shrv_vec(vece, rval, src, rsh);
1081 
1082     /*
1083      * The choice of GE (signed) and GEU (unsigned) are biased toward
1084      * the instructions of the x86_64 host.  For MO_8, the whole byte
1085      * is significant so we must use an unsigned compare; otherwise we
1086      * have already masked to a byte and so a signed compare works.
1087      * Other tcg hosts have a full set of comparisons and do not care.
1088      */
1089     zero = tcg_constant_vec_matching(dst, vece, 0);
1090     max = tcg_constant_vec_matching(dst, vece, 8 << vece);
1091     if (vece == MO_8) {
1092         tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, lval, lsh, max, zero, lval);
1093         tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, rval, rsh, max, zero, rval);
1094     } else {
1095         tcg_gen_cmpsel_vec(TCG_COND_GE, vece, lval, lsh, max, zero, lval);
1096         tcg_gen_cmpsel_vec(TCG_COND_GE, vece, rval, rsh, max, zero, rval);
1097     }
1098     tcg_gen_or_vec(vece, dst, lval, rval);
1099 }
1100 
1101 void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1102                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1103 {
1104     static const TCGOpcode vecop_list[] = {
1105         INDEX_op_neg_vec, INDEX_op_shlv_vec,
1106         INDEX_op_shrv_vec, INDEX_op_cmpsel_vec, 0
1107     };
1108     static const GVecGen3 ops[4] = {
1109         { .fniv = gen_ushl_vec,
1110           .fno = gen_helper_gvec_ushl_b,
1111           .opt_opc = vecop_list,
1112           .vece = MO_8 },
1113         { .fniv = gen_ushl_vec,
1114           .fno = gen_helper_gvec_ushl_h,
1115           .opt_opc = vecop_list,
1116           .vece = MO_16 },
1117         { .fni4 = gen_ushl_i32,
1118           .fniv = gen_ushl_vec,
1119           .opt_opc = vecop_list,
1120           .vece = MO_32 },
1121         { .fni8 = gen_ushl_i64,
1122           .fniv = gen_ushl_vec,
1123           .opt_opc = vecop_list,
1124           .vece = MO_64 },
1125     };
1126     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1127 }
1128 
1129 void gen_sshl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
1130 {
1131     TCGv_i32 lval = tcg_temp_new_i32();
1132     TCGv_i32 rval = tcg_temp_new_i32();
1133     TCGv_i32 lsh = tcg_temp_new_i32();
1134     TCGv_i32 rsh = tcg_temp_new_i32();
1135     TCGv_i32 zero = tcg_constant_i32(0);
1136     TCGv_i32 max = tcg_constant_i32(31);
1137 
1138     /*
1139      * Rely on the TCG guarantee that out of range shifts produce
1140      * unspecified results, not undefined behaviour (i.e. no trap).
1141      * Discard out-of-range results after the fact.
1142      */
1143     tcg_gen_ext8s_i32(lsh, shift);
1144     tcg_gen_neg_i32(rsh, lsh);
1145     tcg_gen_shl_i32(lval, src, lsh);
1146     tcg_gen_umin_i32(rsh, rsh, max);
1147     tcg_gen_sar_i32(rval, src, rsh);
1148     tcg_gen_movcond_i32(TCG_COND_LEU, lval, lsh, max, lval, zero);
1149     tcg_gen_movcond_i32(TCG_COND_LT, dst, lsh, zero, rval, lval);
1150 }
1151 
1152 void gen_sshl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
1153 {
1154     TCGv_i64 lval = tcg_temp_new_i64();
1155     TCGv_i64 rval = tcg_temp_new_i64();
1156     TCGv_i64 lsh = tcg_temp_new_i64();
1157     TCGv_i64 rsh = tcg_temp_new_i64();
1158     TCGv_i64 zero = tcg_constant_i64(0);
1159     TCGv_i64 max = tcg_constant_i64(63);
1160 
1161     /*
1162      * Rely on the TCG guarantee that out of range shifts produce
1163      * unspecified results, not undefined behaviour (i.e. no trap).
1164      * Discard out-of-range results after the fact.
1165      */
1166     tcg_gen_ext8s_i64(lsh, shift);
1167     tcg_gen_neg_i64(rsh, lsh);
1168     tcg_gen_shl_i64(lval, src, lsh);
1169     tcg_gen_umin_i64(rsh, rsh, max);
1170     tcg_gen_sar_i64(rval, src, rsh);
1171     tcg_gen_movcond_i64(TCG_COND_LEU, lval, lsh, max, lval, zero);
1172     tcg_gen_movcond_i64(TCG_COND_LT, dst, lsh, zero, rval, lval);
1173 }
1174 
1175 static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
1176                          TCGv_vec src, TCGv_vec shift)
1177 {
1178     TCGv_vec lval = tcg_temp_new_vec_matching(dst);
1179     TCGv_vec rval = tcg_temp_new_vec_matching(dst);
1180     TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
1181     TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
1182     TCGv_vec max, zero;
1183 
1184     /*
1185      * Rely on the TCG guarantee that out of range shifts produce
1186      * unspecified results, not undefined behaviour (i.e. no trap).
1187      * Discard out-of-range results after the fact.
1188      */
1189     tcg_gen_neg_vec(vece, rsh, shift);
1190     if (vece == MO_8) {
1191         tcg_gen_mov_vec(lsh, shift);
1192     } else {
1193         TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
1194         tcg_gen_and_vec(vece, lsh, shift, msk);
1195         tcg_gen_and_vec(vece, rsh, rsh, msk);
1196     }
1197 
1198     /* Bound rsh so out of bound right shift gets -1.  */
1199     max = tcg_constant_vec_matching(dst, vece, (8 << vece) - 1);
1200     tcg_gen_umin_vec(vece, rsh, rsh, max);
1201 
1202     tcg_gen_shlv_vec(vece, lval, src, lsh);
1203     tcg_gen_sarv_vec(vece, rval, src, rsh);
1204 
1205     /* Select in-bound left shift.  */
1206     zero = tcg_constant_vec_matching(dst, vece, 0);
1207     tcg_gen_cmpsel_vec(TCG_COND_GT, vece, lval, lsh, max, zero, lval);
1208 
1209     /* Select between left and right shift.  */
1210     if (vece == MO_8) {
1211         tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, zero, rval, lval);
1212     } else {
1213         TCGv_vec sgn = tcg_constant_vec_matching(dst, vece, 0x80);
1214         tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, sgn, lval, rval);
1215     }
1216 }
1217 
1218 void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1219                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1220 {
1221     static const TCGOpcode vecop_list[] = {
1222         INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec,
1223         INDEX_op_sarv_vec, INDEX_op_cmpsel_vec, 0
1224     };
1225     static const GVecGen3 ops[4] = {
1226         { .fniv = gen_sshl_vec,
1227           .fno = gen_helper_gvec_sshl_b,
1228           .opt_opc = vecop_list,
1229           .vece = MO_8 },
1230         { .fniv = gen_sshl_vec,
1231           .fno = gen_helper_gvec_sshl_h,
1232           .opt_opc = vecop_list,
1233           .vece = MO_16 },
1234         { .fni4 = gen_sshl_i32,
1235           .fniv = gen_sshl_vec,
1236           .opt_opc = vecop_list,
1237           .vece = MO_32 },
1238         { .fni8 = gen_sshl_i64,
1239           .fniv = gen_sshl_vec,
1240           .opt_opc = vecop_list,
1241           .vece = MO_64 },
1242     };
1243     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1244 }
1245 
1246 void gen_gvec_srshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1247                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1248 {
1249     static gen_helper_gvec_3 * const fns[] = {
1250         gen_helper_gvec_srshl_b, gen_helper_gvec_srshl_h,
1251         gen_helper_gvec_srshl_s, gen_helper_gvec_srshl_d,
1252     };
1253     tcg_debug_assert(vece <= MO_64);
1254     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1255 }
1256 
1257 void gen_gvec_urshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1258                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1259 {
1260     static gen_helper_gvec_3 * const fns[] = {
1261         gen_helper_gvec_urshl_b, gen_helper_gvec_urshl_h,
1262         gen_helper_gvec_urshl_s, gen_helper_gvec_urshl_d,
1263     };
1264     tcg_debug_assert(vece <= MO_64);
1265     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1266 }
1267 
1268 void gen_neon_sqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1269                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1270 {
1271     static gen_helper_gvec_3_ptr * const fns[] = {
1272         gen_helper_neon_sqshl_b, gen_helper_neon_sqshl_h,
1273         gen_helper_neon_sqshl_s, gen_helper_neon_sqshl_d,
1274     };
1275     tcg_debug_assert(vece <= MO_64);
1276     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1277                        opr_sz, max_sz, 0, fns[vece]);
1278 }
1279 
1280 void gen_neon_uqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1281                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1282 {
1283     static gen_helper_gvec_3_ptr * const fns[] = {
1284         gen_helper_neon_uqshl_b, gen_helper_neon_uqshl_h,
1285         gen_helper_neon_uqshl_s, gen_helper_neon_uqshl_d,
1286     };
1287     tcg_debug_assert(vece <= MO_64);
1288     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1289                        opr_sz, max_sz, 0, fns[vece]);
1290 }
1291 
1292 void gen_neon_sqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1293                      uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1294 {
1295     static gen_helper_gvec_3_ptr * const fns[] = {
1296         gen_helper_neon_sqrshl_b, gen_helper_neon_sqrshl_h,
1297         gen_helper_neon_sqrshl_s, gen_helper_neon_sqrshl_d,
1298     };
1299     tcg_debug_assert(vece <= MO_64);
1300     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1301                        opr_sz, max_sz, 0, fns[vece]);
1302 }
1303 
1304 void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1305                      uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1306 {
1307     static gen_helper_gvec_3_ptr * const fns[] = {
1308         gen_helper_neon_uqrshl_b, gen_helper_neon_uqrshl_h,
1309         gen_helper_neon_uqrshl_s, gen_helper_neon_uqrshl_d,
1310     };
1311     tcg_debug_assert(vece <= MO_64);
1312     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1313                        opr_sz, max_sz, 0, fns[vece]);
1314 }
1315 
1316 void gen_neon_sqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1317                      int64_t c, uint32_t opr_sz, uint32_t max_sz)
1318 {
1319     static gen_helper_gvec_2_ptr * const fns[] = {
1320         gen_helper_neon_sqshli_b, gen_helper_neon_sqshli_h,
1321         gen_helper_neon_sqshli_s, gen_helper_neon_sqshli_d,
1322     };
1323     tcg_debug_assert(vece <= MO_64);
1324     tcg_debug_assert(c >= 0 && c <= (8 << vece));
1325     tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]);
1326 }
1327 
1328 void gen_neon_uqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1329                      int64_t c, uint32_t opr_sz, uint32_t max_sz)
1330 {
1331     static gen_helper_gvec_2_ptr * const fns[] = {
1332         gen_helper_neon_uqshli_b, gen_helper_neon_uqshli_h,
1333         gen_helper_neon_uqshli_s, gen_helper_neon_uqshli_d,
1334     };
1335     tcg_debug_assert(vece <= MO_64);
1336     tcg_debug_assert(c >= 0 && c <= (8 << vece));
1337     tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]);
1338 }
1339 
1340 void gen_neon_sqshlui(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1341                       int64_t c, uint32_t opr_sz, uint32_t max_sz)
1342 {
1343     static gen_helper_gvec_2_ptr * const fns[] = {
1344         gen_helper_neon_sqshlui_b, gen_helper_neon_sqshlui_h,
1345         gen_helper_neon_sqshlui_s, gen_helper_neon_sqshlui_d,
1346     };
1347     tcg_debug_assert(vece <= MO_64);
1348     tcg_debug_assert(c >= 0 && c <= (8 << vece));
1349     tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]);
1350 }
1351 
1352 void gen_uqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1353 {
1354     uint64_t max = MAKE_64BIT_MASK(0, 8 << esz);
1355     TCGv_i64 tmp = tcg_temp_new_i64();
1356 
1357     tcg_gen_add_i64(tmp, a, b);
1358     tcg_gen_umin_i64(res, tmp, tcg_constant_i64(max));
1359     tcg_gen_xor_i64(tmp, tmp, res);
1360     tcg_gen_or_i64(qc, qc, tmp);
1361 }
1362 
1363 void gen_uqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1364 {
1365     TCGv_i64 t = tcg_temp_new_i64();
1366 
1367     tcg_gen_add_i64(t, a, b);
1368     tcg_gen_movcond_i64(TCG_COND_LTU, res, t, a,
1369                         tcg_constant_i64(UINT64_MAX), t);
1370     tcg_gen_xor_i64(t, t, res);
1371     tcg_gen_or_i64(qc, qc, t);
1372 }
1373 
1374 static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1375                           TCGv_vec a, TCGv_vec b)
1376 {
1377     TCGv_vec x = tcg_temp_new_vec_matching(t);
1378     tcg_gen_add_vec(vece, x, a, b);
1379     tcg_gen_usadd_vec(vece, t, a, b);
1380     tcg_gen_xor_vec(vece, x, x, t);
1381     tcg_gen_or_vec(vece, qc, qc, x);
1382 }
1383 
1384 void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1385                        uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1386 {
1387     static const TCGOpcode vecop_list[] = {
1388         INDEX_op_usadd_vec, INDEX_op_add_vec, 0
1389     };
1390     static const GVecGen4 ops[4] = {
1391         { .fniv = gen_uqadd_vec,
1392           .fno = gen_helper_gvec_uqadd_b,
1393           .write_aofs = true,
1394           .opt_opc = vecop_list,
1395           .vece = MO_8 },
1396         { .fniv = gen_uqadd_vec,
1397           .fno = gen_helper_gvec_uqadd_h,
1398           .write_aofs = true,
1399           .opt_opc = vecop_list,
1400           .vece = MO_16 },
1401         { .fniv = gen_uqadd_vec,
1402           .fno = gen_helper_gvec_uqadd_s,
1403           .write_aofs = true,
1404           .opt_opc = vecop_list,
1405           .vece = MO_32 },
1406         { .fniv = gen_uqadd_vec,
1407           .fni8 = gen_uqadd_d,
1408           .fno = gen_helper_gvec_uqadd_d,
1409           .write_aofs = true,
1410           .opt_opc = vecop_list,
1411           .vece = MO_64 },
1412     };
1413 
1414     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1415     tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1416                    rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1417 }
1418 
1419 void gen_sqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1420 {
1421     int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
1422     int64_t min = -1ll - max;
1423     TCGv_i64 tmp = tcg_temp_new_i64();
1424 
1425     tcg_gen_add_i64(tmp, a, b);
1426     tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
1427     tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
1428     tcg_gen_xor_i64(tmp, tmp, res);
1429     tcg_gen_or_i64(qc, qc, tmp);
1430 }
1431 
1432 void gen_sqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1433 {
1434     TCGv_i64 t0 = tcg_temp_new_i64();
1435     TCGv_i64 t1 = tcg_temp_new_i64();
1436     TCGv_i64 t2 = tcg_temp_new_i64();
1437 
1438     tcg_gen_add_i64(t0, a, b);
1439 
1440     /* Compute signed overflow indication into T1 */
1441     tcg_gen_xor_i64(t1, a, b);
1442     tcg_gen_xor_i64(t2, t0, a);
1443     tcg_gen_andc_i64(t1, t2, t1);
1444 
1445     /* Compute saturated value into T2 */
1446     tcg_gen_sari_i64(t2, a, 63);
1447     tcg_gen_xori_i64(t2, t2, INT64_MAX);
1448 
1449     tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
1450     tcg_gen_xor_i64(t0, t0, res);
1451     tcg_gen_or_i64(qc, qc, t0);
1452 }
1453 
1454 static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1455                           TCGv_vec a, TCGv_vec b)
1456 {
1457     TCGv_vec x = tcg_temp_new_vec_matching(t);
1458     tcg_gen_add_vec(vece, x, a, b);
1459     tcg_gen_ssadd_vec(vece, t, a, b);
1460     tcg_gen_xor_vec(vece, x, x, t);
1461     tcg_gen_or_vec(vece, qc, qc, x);
1462 }
1463 
1464 void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1465                        uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1466 {
1467     static const TCGOpcode vecop_list[] = {
1468         INDEX_op_ssadd_vec, INDEX_op_add_vec, 0
1469     };
1470     static const GVecGen4 ops[4] = {
1471         { .fniv = gen_sqadd_vec,
1472           .fno = gen_helper_gvec_sqadd_b,
1473           .opt_opc = vecop_list,
1474           .write_aofs = true,
1475           .vece = MO_8 },
1476         { .fniv = gen_sqadd_vec,
1477           .fno = gen_helper_gvec_sqadd_h,
1478           .opt_opc = vecop_list,
1479           .write_aofs = true,
1480           .vece = MO_16 },
1481         { .fniv = gen_sqadd_vec,
1482           .fno = gen_helper_gvec_sqadd_s,
1483           .opt_opc = vecop_list,
1484           .write_aofs = true,
1485           .vece = MO_32 },
1486         { .fniv = gen_sqadd_vec,
1487           .fni8 = gen_sqadd_d,
1488           .fno = gen_helper_gvec_sqadd_d,
1489           .opt_opc = vecop_list,
1490           .write_aofs = true,
1491           .vece = MO_64 },
1492     };
1493 
1494     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1495     tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1496                    rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1497 }
1498 
1499 void gen_uqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1500 {
1501     TCGv_i64 tmp = tcg_temp_new_i64();
1502 
1503     tcg_gen_sub_i64(tmp, a, b);
1504     tcg_gen_smax_i64(res, tmp, tcg_constant_i64(0));
1505     tcg_gen_xor_i64(tmp, tmp, res);
1506     tcg_gen_or_i64(qc, qc, tmp);
1507 }
1508 
1509 void gen_uqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1510 {
1511     TCGv_i64 t = tcg_temp_new_i64();
1512 
1513     tcg_gen_sub_i64(t, a, b);
1514     tcg_gen_movcond_i64(TCG_COND_LTU, res, a, b, tcg_constant_i64(0), t);
1515     tcg_gen_xor_i64(t, t, res);
1516     tcg_gen_or_i64(qc, qc, t);
1517 }
1518 
1519 static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1520                           TCGv_vec a, TCGv_vec b)
1521 {
1522     TCGv_vec x = tcg_temp_new_vec_matching(t);
1523     tcg_gen_sub_vec(vece, x, a, b);
1524     tcg_gen_ussub_vec(vece, t, a, b);
1525     tcg_gen_xor_vec(vece, x, x, t);
1526     tcg_gen_or_vec(vece, qc, qc, x);
1527 }
1528 
1529 void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1530                        uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1531 {
1532     static const TCGOpcode vecop_list[] = {
1533         INDEX_op_ussub_vec, INDEX_op_sub_vec, 0
1534     };
1535     static const GVecGen4 ops[4] = {
1536         { .fniv = gen_uqsub_vec,
1537           .fno = gen_helper_gvec_uqsub_b,
1538           .opt_opc = vecop_list,
1539           .write_aofs = true,
1540           .vece = MO_8 },
1541         { .fniv = gen_uqsub_vec,
1542           .fno = gen_helper_gvec_uqsub_h,
1543           .opt_opc = vecop_list,
1544           .write_aofs = true,
1545           .vece = MO_16 },
1546         { .fniv = gen_uqsub_vec,
1547           .fno = gen_helper_gvec_uqsub_s,
1548           .opt_opc = vecop_list,
1549           .write_aofs = true,
1550           .vece = MO_32 },
1551         { .fniv = gen_uqsub_vec,
1552           .fni8 = gen_uqsub_d,
1553           .fno = gen_helper_gvec_uqsub_d,
1554           .opt_opc = vecop_list,
1555           .write_aofs = true,
1556           .vece = MO_64 },
1557     };
1558 
1559     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1560     tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1561                    rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1562 }
1563 
1564 void gen_sqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1565 {
1566     int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
1567     int64_t min = -1ll - max;
1568     TCGv_i64 tmp = tcg_temp_new_i64();
1569 
1570     tcg_gen_sub_i64(tmp, a, b);
1571     tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
1572     tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
1573     tcg_gen_xor_i64(tmp, tmp, res);
1574     tcg_gen_or_i64(qc, qc, tmp);
1575 }
1576 
1577 void gen_sqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1578 {
1579     TCGv_i64 t0 = tcg_temp_new_i64();
1580     TCGv_i64 t1 = tcg_temp_new_i64();
1581     TCGv_i64 t2 = tcg_temp_new_i64();
1582 
1583     tcg_gen_sub_i64(t0, a, b);
1584 
1585     /* Compute signed overflow indication into T1 */
1586     tcg_gen_xor_i64(t1, a, b);
1587     tcg_gen_xor_i64(t2, t0, a);
1588     tcg_gen_and_i64(t1, t1, t2);
1589 
1590     /* Compute saturated value into T2 */
1591     tcg_gen_sari_i64(t2, a, 63);
1592     tcg_gen_xori_i64(t2, t2, INT64_MAX);
1593 
1594     tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
1595     tcg_gen_xor_i64(t0, t0, res);
1596     tcg_gen_or_i64(qc, qc, t0);
1597 }
1598 
1599 static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1600                           TCGv_vec a, TCGv_vec b)
1601 {
1602     TCGv_vec x = tcg_temp_new_vec_matching(t);
1603     tcg_gen_sub_vec(vece, x, a, b);
1604     tcg_gen_sssub_vec(vece, t, a, b);
1605     tcg_gen_xor_vec(vece, x, x, t);
1606     tcg_gen_or_vec(vece, qc, qc, x);
1607 }
1608 
1609 void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1610                        uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1611 {
1612     static const TCGOpcode vecop_list[] = {
1613         INDEX_op_sssub_vec, INDEX_op_sub_vec, 0
1614     };
1615     static const GVecGen4 ops[4] = {
1616         { .fniv = gen_sqsub_vec,
1617           .fno = gen_helper_gvec_sqsub_b,
1618           .opt_opc = vecop_list,
1619           .write_aofs = true,
1620           .vece = MO_8 },
1621         { .fniv = gen_sqsub_vec,
1622           .fno = gen_helper_gvec_sqsub_h,
1623           .opt_opc = vecop_list,
1624           .write_aofs = true,
1625           .vece = MO_16 },
1626         { .fniv = gen_sqsub_vec,
1627           .fno = gen_helper_gvec_sqsub_s,
1628           .opt_opc = vecop_list,
1629           .write_aofs = true,
1630           .vece = MO_32 },
1631         { .fniv = gen_sqsub_vec,
1632           .fni8 = gen_sqsub_d,
1633           .fno = gen_helper_gvec_sqsub_d,
1634           .opt_opc = vecop_list,
1635           .write_aofs = true,
1636           .vece = MO_64 },
1637     };
1638 
1639     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1640     tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1641                    rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1642 }
1643 
1644 static void gen_sabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1645 {
1646     TCGv_i32 t = tcg_temp_new_i32();
1647 
1648     tcg_gen_sub_i32(t, a, b);
1649     tcg_gen_sub_i32(d, b, a);
1650     tcg_gen_movcond_i32(TCG_COND_LT, d, a, b, d, t);
1651 }
1652 
1653 static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1654 {
1655     TCGv_i64 t = tcg_temp_new_i64();
1656 
1657     tcg_gen_sub_i64(t, a, b);
1658     tcg_gen_sub_i64(d, b, a);
1659     tcg_gen_movcond_i64(TCG_COND_LT, d, a, b, d, t);
1660 }
1661 
1662 static void gen_sabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1663 {
1664     TCGv_vec t = tcg_temp_new_vec_matching(d);
1665 
1666     tcg_gen_smin_vec(vece, t, a, b);
1667     tcg_gen_smax_vec(vece, d, a, b);
1668     tcg_gen_sub_vec(vece, d, d, t);
1669 }
1670 
1671 void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1672                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1673 {
1674     static const TCGOpcode vecop_list[] = {
1675         INDEX_op_sub_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0
1676     };
1677     static const GVecGen3 ops[4] = {
1678         { .fniv = gen_sabd_vec,
1679           .fno = gen_helper_gvec_sabd_b,
1680           .opt_opc = vecop_list,
1681           .vece = MO_8 },
1682         { .fniv = gen_sabd_vec,
1683           .fno = gen_helper_gvec_sabd_h,
1684           .opt_opc = vecop_list,
1685           .vece = MO_16 },
1686         { .fni4 = gen_sabd_i32,
1687           .fniv = gen_sabd_vec,
1688           .fno = gen_helper_gvec_sabd_s,
1689           .opt_opc = vecop_list,
1690           .vece = MO_32 },
1691         { .fni8 = gen_sabd_i64,
1692           .fniv = gen_sabd_vec,
1693           .fno = gen_helper_gvec_sabd_d,
1694           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1695           .opt_opc = vecop_list,
1696           .vece = MO_64 },
1697     };
1698     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1699 }
1700 
1701 static void gen_uabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1702 {
1703     TCGv_i32 t = tcg_temp_new_i32();
1704 
1705     tcg_gen_sub_i32(t, a, b);
1706     tcg_gen_sub_i32(d, b, a);
1707     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, d, t);
1708 }
1709 
1710 static void gen_uabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1711 {
1712     TCGv_i64 t = tcg_temp_new_i64();
1713 
1714     tcg_gen_sub_i64(t, a, b);
1715     tcg_gen_sub_i64(d, b, a);
1716     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, d, t);
1717 }
1718 
1719 static void gen_uabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1720 {
1721     TCGv_vec t = tcg_temp_new_vec_matching(d);
1722 
1723     tcg_gen_umin_vec(vece, t, a, b);
1724     tcg_gen_umax_vec(vece, d, a, b);
1725     tcg_gen_sub_vec(vece, d, d, t);
1726 }
1727 
1728 void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1729                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1730 {
1731     static const TCGOpcode vecop_list[] = {
1732         INDEX_op_sub_vec, INDEX_op_umin_vec, INDEX_op_umax_vec, 0
1733     };
1734     static const GVecGen3 ops[4] = {
1735         { .fniv = gen_uabd_vec,
1736           .fno = gen_helper_gvec_uabd_b,
1737           .opt_opc = vecop_list,
1738           .vece = MO_8 },
1739         { .fniv = gen_uabd_vec,
1740           .fno = gen_helper_gvec_uabd_h,
1741           .opt_opc = vecop_list,
1742           .vece = MO_16 },
1743         { .fni4 = gen_uabd_i32,
1744           .fniv = gen_uabd_vec,
1745           .fno = gen_helper_gvec_uabd_s,
1746           .opt_opc = vecop_list,
1747           .vece = MO_32 },
1748         { .fni8 = gen_uabd_i64,
1749           .fniv = gen_uabd_vec,
1750           .fno = gen_helper_gvec_uabd_d,
1751           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1752           .opt_opc = vecop_list,
1753           .vece = MO_64 },
1754     };
1755     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1756 }
1757 
1758 static void gen_saba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1759 {
1760     TCGv_i32 t = tcg_temp_new_i32();
1761     gen_sabd_i32(t, a, b);
1762     tcg_gen_add_i32(d, d, t);
1763 }
1764 
1765 static void gen_saba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1766 {
1767     TCGv_i64 t = tcg_temp_new_i64();
1768     gen_sabd_i64(t, a, b);
1769     tcg_gen_add_i64(d, d, t);
1770 }
1771 
1772 static void gen_saba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1773 {
1774     TCGv_vec t = tcg_temp_new_vec_matching(d);
1775     gen_sabd_vec(vece, t, a, b);
1776     tcg_gen_add_vec(vece, d, d, t);
1777 }
1778 
1779 void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1780                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1781 {
1782     static const TCGOpcode vecop_list[] = {
1783         INDEX_op_sub_vec, INDEX_op_add_vec,
1784         INDEX_op_smin_vec, INDEX_op_smax_vec, 0
1785     };
1786     static const GVecGen3 ops[4] = {
1787         { .fniv = gen_saba_vec,
1788           .fno = gen_helper_gvec_saba_b,
1789           .opt_opc = vecop_list,
1790           .load_dest = true,
1791           .vece = MO_8 },
1792         { .fniv = gen_saba_vec,
1793           .fno = gen_helper_gvec_saba_h,
1794           .opt_opc = vecop_list,
1795           .load_dest = true,
1796           .vece = MO_16 },
1797         { .fni4 = gen_saba_i32,
1798           .fniv = gen_saba_vec,
1799           .fno = gen_helper_gvec_saba_s,
1800           .opt_opc = vecop_list,
1801           .load_dest = true,
1802           .vece = MO_32 },
1803         { .fni8 = gen_saba_i64,
1804           .fniv = gen_saba_vec,
1805           .fno = gen_helper_gvec_saba_d,
1806           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1807           .opt_opc = vecop_list,
1808           .load_dest = true,
1809           .vece = MO_64 },
1810     };
1811     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1812 }
1813 
1814 static void gen_uaba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1815 {
1816     TCGv_i32 t = tcg_temp_new_i32();
1817     gen_uabd_i32(t, a, b);
1818     tcg_gen_add_i32(d, d, t);
1819 }
1820 
1821 static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1822 {
1823     TCGv_i64 t = tcg_temp_new_i64();
1824     gen_uabd_i64(t, a, b);
1825     tcg_gen_add_i64(d, d, t);
1826 }
1827 
1828 static void gen_uaba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1829 {
1830     TCGv_vec t = tcg_temp_new_vec_matching(d);
1831     gen_uabd_vec(vece, t, a, b);
1832     tcg_gen_add_vec(vece, d, d, t);
1833 }
1834 
1835 void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1836                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1837 {
1838     static const TCGOpcode vecop_list[] = {
1839         INDEX_op_sub_vec, INDEX_op_add_vec,
1840         INDEX_op_umin_vec, INDEX_op_umax_vec, 0
1841     };
1842     static const GVecGen3 ops[4] = {
1843         { .fniv = gen_uaba_vec,
1844           .fno = gen_helper_gvec_uaba_b,
1845           .opt_opc = vecop_list,
1846           .load_dest = true,
1847           .vece = MO_8 },
1848         { .fniv = gen_uaba_vec,
1849           .fno = gen_helper_gvec_uaba_h,
1850           .opt_opc = vecop_list,
1851           .load_dest = true,
1852           .vece = MO_16 },
1853         { .fni4 = gen_uaba_i32,
1854           .fniv = gen_uaba_vec,
1855           .fno = gen_helper_gvec_uaba_s,
1856           .opt_opc = vecop_list,
1857           .load_dest = true,
1858           .vece = MO_32 },
1859         { .fni8 = gen_uaba_i64,
1860           .fniv = gen_uaba_vec,
1861           .fno = gen_helper_gvec_uaba_d,
1862           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1863           .opt_opc = vecop_list,
1864           .load_dest = true,
1865           .vece = MO_64 },
1866     };
1867     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1868 }
1869 
1870 void gen_gvec_addp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1871                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1872 {
1873     static gen_helper_gvec_3 * const fns[4] = {
1874         gen_helper_gvec_addp_b,
1875         gen_helper_gvec_addp_h,
1876         gen_helper_gvec_addp_s,
1877         gen_helper_gvec_addp_d,
1878     };
1879     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1880 }
1881 
1882 void gen_gvec_smaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1883                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1884 {
1885     static gen_helper_gvec_3 * const fns[4] = {
1886         gen_helper_gvec_smaxp_b,
1887         gen_helper_gvec_smaxp_h,
1888         gen_helper_gvec_smaxp_s,
1889     };
1890     tcg_debug_assert(vece <= MO_32);
1891     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1892 }
1893 
1894 void gen_gvec_sminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1895                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1896 {
1897     static gen_helper_gvec_3 * const fns[4] = {
1898         gen_helper_gvec_sminp_b,
1899         gen_helper_gvec_sminp_h,
1900         gen_helper_gvec_sminp_s,
1901     };
1902     tcg_debug_assert(vece <= MO_32);
1903     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1904 }
1905 
1906 void gen_gvec_umaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1907                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1908 {
1909     static gen_helper_gvec_3 * const fns[4] = {
1910         gen_helper_gvec_umaxp_b,
1911         gen_helper_gvec_umaxp_h,
1912         gen_helper_gvec_umaxp_s,
1913     };
1914     tcg_debug_assert(vece <= MO_32);
1915     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1916 }
1917 
1918 void gen_gvec_uminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1919                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1920 {
1921     static gen_helper_gvec_3 * const fns[4] = {
1922         gen_helper_gvec_uminp_b,
1923         gen_helper_gvec_uminp_h,
1924         gen_helper_gvec_uminp_s,
1925     };
1926     tcg_debug_assert(vece <= MO_32);
1927     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1928 }
1929 
1930 static void gen_shadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1931 {
1932     TCGv_i64 t = tcg_temp_new_i64();
1933 
1934     tcg_gen_and_i64(t, a, b);
1935     tcg_gen_vec_sar8i_i64(a, a, 1);
1936     tcg_gen_vec_sar8i_i64(b, b, 1);
1937     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
1938     tcg_gen_vec_add8_i64(d, a, b);
1939     tcg_gen_vec_add8_i64(d, d, t);
1940 }
1941 
1942 static void gen_shadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1943 {
1944     TCGv_i64 t = tcg_temp_new_i64();
1945 
1946     tcg_gen_and_i64(t, a, b);
1947     tcg_gen_vec_sar16i_i64(a, a, 1);
1948     tcg_gen_vec_sar16i_i64(b, b, 1);
1949     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
1950     tcg_gen_vec_add16_i64(d, a, b);
1951     tcg_gen_vec_add16_i64(d, d, t);
1952 }
1953 
1954 static void gen_shadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1955 {
1956     TCGv_i32 t = tcg_temp_new_i32();
1957 
1958     tcg_gen_and_i32(t, a, b);
1959     tcg_gen_sari_i32(a, a, 1);
1960     tcg_gen_sari_i32(b, b, 1);
1961     tcg_gen_andi_i32(t, t, 1);
1962     tcg_gen_add_i32(d, a, b);
1963     tcg_gen_add_i32(d, d, t);
1964 }
1965 
1966 static void gen_shadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1967 {
1968     TCGv_vec t = tcg_temp_new_vec_matching(d);
1969 
1970     tcg_gen_and_vec(vece, t, a, b);
1971     tcg_gen_sari_vec(vece, a, a, 1);
1972     tcg_gen_sari_vec(vece, b, b, 1);
1973     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
1974     tcg_gen_add_vec(vece, d, a, b);
1975     tcg_gen_add_vec(vece, d, d, t);
1976 }
1977 
1978 void gen_gvec_shadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1979                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1980 {
1981     static const TCGOpcode vecop_list[] = {
1982         INDEX_op_sari_vec, INDEX_op_add_vec, 0
1983     };
1984     static const GVecGen3 g[] = {
1985         { .fni8 = gen_shadd8_i64,
1986           .fniv = gen_shadd_vec,
1987           .opt_opc = vecop_list,
1988           .vece = MO_8 },
1989         { .fni8 = gen_shadd16_i64,
1990           .fniv = gen_shadd_vec,
1991           .opt_opc = vecop_list,
1992           .vece = MO_16 },
1993         { .fni4 = gen_shadd_i32,
1994           .fniv = gen_shadd_vec,
1995           .opt_opc = vecop_list,
1996           .vece = MO_32 },
1997     };
1998     tcg_debug_assert(vece <= MO_32);
1999     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2000 }
2001 
2002 static void gen_uhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2003 {
2004     TCGv_i64 t = tcg_temp_new_i64();
2005 
2006     tcg_gen_and_i64(t, a, b);
2007     tcg_gen_vec_shr8i_i64(a, a, 1);
2008     tcg_gen_vec_shr8i_i64(b, b, 1);
2009     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
2010     tcg_gen_vec_add8_i64(d, a, b);
2011     tcg_gen_vec_add8_i64(d, d, t);
2012 }
2013 
2014 static void gen_uhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2015 {
2016     TCGv_i64 t = tcg_temp_new_i64();
2017 
2018     tcg_gen_and_i64(t, a, b);
2019     tcg_gen_vec_shr16i_i64(a, a, 1);
2020     tcg_gen_vec_shr16i_i64(b, b, 1);
2021     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
2022     tcg_gen_vec_add16_i64(d, a, b);
2023     tcg_gen_vec_add16_i64(d, d, t);
2024 }
2025 
2026 static void gen_uhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2027 {
2028     TCGv_i32 t = tcg_temp_new_i32();
2029 
2030     tcg_gen_and_i32(t, a, b);
2031     tcg_gen_shri_i32(a, a, 1);
2032     tcg_gen_shri_i32(b, b, 1);
2033     tcg_gen_andi_i32(t, t, 1);
2034     tcg_gen_add_i32(d, a, b);
2035     tcg_gen_add_i32(d, d, t);
2036 }
2037 
2038 static void gen_uhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
2039 {
2040     TCGv_vec t = tcg_temp_new_vec_matching(d);
2041 
2042     tcg_gen_and_vec(vece, t, a, b);
2043     tcg_gen_shri_vec(vece, a, a, 1);
2044     tcg_gen_shri_vec(vece, b, b, 1);
2045     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
2046     tcg_gen_add_vec(vece, d, a, b);
2047     tcg_gen_add_vec(vece, d, d, t);
2048 }
2049 
2050 void gen_gvec_uhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
2051                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
2052 {
2053     static const TCGOpcode vecop_list[] = {
2054         INDEX_op_shri_vec, INDEX_op_add_vec, 0
2055     };
2056     static const GVecGen3 g[] = {
2057         { .fni8 = gen_uhadd8_i64,
2058           .fniv = gen_uhadd_vec,
2059           .opt_opc = vecop_list,
2060           .vece = MO_8 },
2061         { .fni8 = gen_uhadd16_i64,
2062           .fniv = gen_uhadd_vec,
2063           .opt_opc = vecop_list,
2064           .vece = MO_16 },
2065         { .fni4 = gen_uhadd_i32,
2066           .fniv = gen_uhadd_vec,
2067           .opt_opc = vecop_list,
2068           .vece = MO_32 },
2069     };
2070     tcg_debug_assert(vece <= MO_32);
2071     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2072 }
2073 
2074 static void gen_shsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2075 {
2076     TCGv_i64 t = tcg_temp_new_i64();
2077 
2078     tcg_gen_andc_i64(t, b, a);
2079     tcg_gen_vec_sar8i_i64(a, a, 1);
2080     tcg_gen_vec_sar8i_i64(b, b, 1);
2081     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
2082     tcg_gen_vec_sub8_i64(d, a, b);
2083     tcg_gen_vec_sub8_i64(d, d, t);
2084 }
2085 
2086 static void gen_shsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2087 {
2088     TCGv_i64 t = tcg_temp_new_i64();
2089 
2090     tcg_gen_andc_i64(t, b, a);
2091     tcg_gen_vec_sar16i_i64(a, a, 1);
2092     tcg_gen_vec_sar16i_i64(b, b, 1);
2093     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
2094     tcg_gen_vec_sub16_i64(d, a, b);
2095     tcg_gen_vec_sub16_i64(d, d, t);
2096 }
2097 
2098 static void gen_shsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2099 {
2100     TCGv_i32 t = tcg_temp_new_i32();
2101 
2102     tcg_gen_andc_i32(t, b, a);
2103     tcg_gen_sari_i32(a, a, 1);
2104     tcg_gen_sari_i32(b, b, 1);
2105     tcg_gen_andi_i32(t, t, 1);
2106     tcg_gen_sub_i32(d, a, b);
2107     tcg_gen_sub_i32(d, d, t);
2108 }
2109 
2110 static void gen_shsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
2111 {
2112     TCGv_vec t = tcg_temp_new_vec_matching(d);
2113 
2114     tcg_gen_andc_vec(vece, t, b, a);
2115     tcg_gen_sari_vec(vece, a, a, 1);
2116     tcg_gen_sari_vec(vece, b, b, 1);
2117     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
2118     tcg_gen_sub_vec(vece, d, a, b);
2119     tcg_gen_sub_vec(vece, d, d, t);
2120 }
2121 
2122 void gen_gvec_shsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
2123                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
2124 {
2125     static const TCGOpcode vecop_list[] = {
2126         INDEX_op_sari_vec, INDEX_op_sub_vec, 0
2127     };
2128     static const GVecGen3 g[4] = {
2129         { .fni8 = gen_shsub8_i64,
2130           .fniv = gen_shsub_vec,
2131           .opt_opc = vecop_list,
2132           .vece = MO_8 },
2133         { .fni8 = gen_shsub16_i64,
2134           .fniv = gen_shsub_vec,
2135           .opt_opc = vecop_list,
2136           .vece = MO_16 },
2137         { .fni4 = gen_shsub_i32,
2138           .fniv = gen_shsub_vec,
2139           .opt_opc = vecop_list,
2140           .vece = MO_32 },
2141     };
2142     assert(vece <= MO_32);
2143     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2144 }
2145 
2146 static void gen_uhsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2147 {
2148     TCGv_i64 t = tcg_temp_new_i64();
2149 
2150     tcg_gen_andc_i64(t, b, a);
2151     tcg_gen_vec_shr8i_i64(a, a, 1);
2152     tcg_gen_vec_shr8i_i64(b, b, 1);
2153     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
2154     tcg_gen_vec_sub8_i64(d, a, b);
2155     tcg_gen_vec_sub8_i64(d, d, t);
2156 }
2157 
2158 static void gen_uhsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2159 {
2160     TCGv_i64 t = tcg_temp_new_i64();
2161 
2162     tcg_gen_andc_i64(t, b, a);
2163     tcg_gen_vec_shr16i_i64(a, a, 1);
2164     tcg_gen_vec_shr16i_i64(b, b, 1);
2165     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
2166     tcg_gen_vec_sub16_i64(d, a, b);
2167     tcg_gen_vec_sub16_i64(d, d, t);
2168 }
2169 
2170 static void gen_uhsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2171 {
2172     TCGv_i32 t = tcg_temp_new_i32();
2173 
2174     tcg_gen_andc_i32(t, b, a);
2175     tcg_gen_shri_i32(a, a, 1);
2176     tcg_gen_shri_i32(b, b, 1);
2177     tcg_gen_andi_i32(t, t, 1);
2178     tcg_gen_sub_i32(d, a, b);
2179     tcg_gen_sub_i32(d, d, t);
2180 }
2181 
2182 static void gen_uhsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
2183 {
2184     TCGv_vec t = tcg_temp_new_vec_matching(d);
2185 
2186     tcg_gen_andc_vec(vece, t, b, a);
2187     tcg_gen_shri_vec(vece, a, a, 1);
2188     tcg_gen_shri_vec(vece, b, b, 1);
2189     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
2190     tcg_gen_sub_vec(vece, d, a, b);
2191     tcg_gen_sub_vec(vece, d, d, t);
2192 }
2193 
2194 void gen_gvec_uhsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
2195                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
2196 {
2197     static const TCGOpcode vecop_list[] = {
2198         INDEX_op_shri_vec, INDEX_op_sub_vec, 0
2199     };
2200     static const GVecGen3 g[4] = {
2201         { .fni8 = gen_uhsub8_i64,
2202           .fniv = gen_uhsub_vec,
2203           .opt_opc = vecop_list,
2204           .vece = MO_8 },
2205         { .fni8 = gen_uhsub16_i64,
2206           .fniv = gen_uhsub_vec,
2207           .opt_opc = vecop_list,
2208           .vece = MO_16 },
2209         { .fni4 = gen_uhsub_i32,
2210           .fniv = gen_uhsub_vec,
2211           .opt_opc = vecop_list,
2212           .vece = MO_32 },
2213     };
2214     assert(vece <= MO_32);
2215     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2216 }
2217 
2218 static void gen_srhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2219 {
2220     TCGv_i64 t = tcg_temp_new_i64();
2221 
2222     tcg_gen_or_i64(t, a, b);
2223     tcg_gen_vec_sar8i_i64(a, a, 1);
2224     tcg_gen_vec_sar8i_i64(b, b, 1);
2225     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
2226     tcg_gen_vec_add8_i64(d, a, b);
2227     tcg_gen_vec_add8_i64(d, d, t);
2228 }
2229 
2230 static void gen_srhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2231 {
2232     TCGv_i64 t = tcg_temp_new_i64();
2233 
2234     tcg_gen_or_i64(t, a, b);
2235     tcg_gen_vec_sar16i_i64(a, a, 1);
2236     tcg_gen_vec_sar16i_i64(b, b, 1);
2237     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
2238     tcg_gen_vec_add16_i64(d, a, b);
2239     tcg_gen_vec_add16_i64(d, d, t);
2240 }
2241 
2242 static void gen_srhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2243 {
2244     TCGv_i32 t = tcg_temp_new_i32();
2245 
2246     tcg_gen_or_i32(t, a, b);
2247     tcg_gen_sari_i32(a, a, 1);
2248     tcg_gen_sari_i32(b, b, 1);
2249     tcg_gen_andi_i32(t, t, 1);
2250     tcg_gen_add_i32(d, a, b);
2251     tcg_gen_add_i32(d, d, t);
2252 }
2253 
2254 static void gen_srhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
2255 {
2256     TCGv_vec t = tcg_temp_new_vec_matching(d);
2257 
2258     tcg_gen_or_vec(vece, t, a, b);
2259     tcg_gen_sari_vec(vece, a, a, 1);
2260     tcg_gen_sari_vec(vece, b, b, 1);
2261     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
2262     tcg_gen_add_vec(vece, d, a, b);
2263     tcg_gen_add_vec(vece, d, d, t);
2264 }
2265 
2266 void gen_gvec_srhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
2267                      uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
2268 {
2269     static const TCGOpcode vecop_list[] = {
2270         INDEX_op_sari_vec, INDEX_op_add_vec, 0
2271     };
2272     static const GVecGen3 g[] = {
2273         { .fni8 = gen_srhadd8_i64,
2274           .fniv = gen_srhadd_vec,
2275           .opt_opc = vecop_list,
2276           .vece = MO_8 },
2277         { .fni8 = gen_srhadd16_i64,
2278           .fniv = gen_srhadd_vec,
2279           .opt_opc = vecop_list,
2280           .vece = MO_16 },
2281         { .fni4 = gen_srhadd_i32,
2282           .fniv = gen_srhadd_vec,
2283           .opt_opc = vecop_list,
2284           .vece = MO_32 },
2285     };
2286     assert(vece <= MO_32);
2287     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2288 }
2289 
2290 static void gen_urhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2291 {
2292     TCGv_i64 t = tcg_temp_new_i64();
2293 
2294     tcg_gen_or_i64(t, a, b);
2295     tcg_gen_vec_shr8i_i64(a, a, 1);
2296     tcg_gen_vec_shr8i_i64(b, b, 1);
2297     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
2298     tcg_gen_vec_add8_i64(d, a, b);
2299     tcg_gen_vec_add8_i64(d, d, t);
2300 }
2301 
2302 static void gen_urhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2303 {
2304     TCGv_i64 t = tcg_temp_new_i64();
2305 
2306     tcg_gen_or_i64(t, a, b);
2307     tcg_gen_vec_shr16i_i64(a, a, 1);
2308     tcg_gen_vec_shr16i_i64(b, b, 1);
2309     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
2310     tcg_gen_vec_add16_i64(d, a, b);
2311     tcg_gen_vec_add16_i64(d, d, t);
2312 }
2313 
2314 static void gen_urhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2315 {
2316     TCGv_i32 t = tcg_temp_new_i32();
2317 
2318     tcg_gen_or_i32(t, a, b);
2319     tcg_gen_shri_i32(a, a, 1);
2320     tcg_gen_shri_i32(b, b, 1);
2321     tcg_gen_andi_i32(t, t, 1);
2322     tcg_gen_add_i32(d, a, b);
2323     tcg_gen_add_i32(d, d, t);
2324 }
2325 
2326 static void gen_urhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
2327 {
2328     TCGv_vec t = tcg_temp_new_vec_matching(d);
2329 
2330     tcg_gen_or_vec(vece, t, a, b);
2331     tcg_gen_shri_vec(vece, a, a, 1);
2332     tcg_gen_shri_vec(vece, b, b, 1);
2333     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
2334     tcg_gen_add_vec(vece, d, a, b);
2335     tcg_gen_add_vec(vece, d, d, t);
2336 }
2337 
2338 void gen_gvec_urhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
2339                      uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
2340 {
2341     static const TCGOpcode vecop_list[] = {
2342         INDEX_op_shri_vec, INDEX_op_add_vec, 0
2343     };
2344     static const GVecGen3 g[] = {
2345         { .fni8 = gen_urhadd8_i64,
2346           .fniv = gen_urhadd_vec,
2347           .opt_opc = vecop_list,
2348           .vece = MO_8 },
2349         { .fni8 = gen_urhadd16_i64,
2350           .fniv = gen_urhadd_vec,
2351           .opt_opc = vecop_list,
2352           .vece = MO_16 },
2353         { .fni4 = gen_urhadd_i32,
2354           .fniv = gen_urhadd_vec,
2355           .opt_opc = vecop_list,
2356           .vece = MO_32 },
2357     };
2358     assert(vece <= MO_32);
2359     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2360 }
2361