/*
 *  ARM generic vector expansion
 *
 *  Copyright (c) 2003 Fabrice Bellard
 *  Copyright (c) 2005-2007 CodeSourcery
 *  Copyright (c) 2007 OpenedHand, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "translate.h"

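/*
 * Expand a three-operand operation whose helper also updates QC: a
 * pointer to vfp.qc is passed as the extra argument so the helper
 * can record saturation.
 */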
static void gen_gvec_fn3_qc(uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs,
                            uint32_t opr_sz, uint32_t max_sz,
                            gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr qc_ptr = tcg_temp_new_ptr();

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_addi_ptr(qc_ptr, tcg_env, offsetof(CPUARMState, vfp.qc));
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, qc_ptr,
                       opr_sz, max_sz, 0, fn);
}

void gen_gvec_sqdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                         uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_neon_sqdmulh_h, gen_helper_neon_sqdmulh_s
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_neon_sqrdmulh_h, gen_helper_neon_sqrdmulh_s
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_gvec_qrdmlah_s16, gen_helper_gvec_qrdmlah_s32
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_gvec_qrdmlsh_s16, gen_helper_gvec_qrdmlsh_s32
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

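/*
 * Expand a comparison of each element against zero.  Per gvec compare
 * semantics, an element for which the condition holds becomes all ones,
 * otherwise all zeros; e.g. the TCG_COND_EQ instance defines
 * gen_gvec_ceq0.
 */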
#define GEN_CMP0(NAME, COND)                              \
    void NAME(unsigned vece, uint32_t d, uint32_t m,      \
              uint32_t opr_sz, uint32_t max_sz)           \
    { tcg_gen_gvec_cmpi(COND, vece, d, m, 0, opr_sz, max_sz); }

GEN_CMP0(gen_gvec_ceq0, TCG_COND_EQ)
GEN_CMP0(gen_gvec_cle0, TCG_COND_LE)
GEN_CMP0(gen_gvec_cge0, TCG_COND_GE)
GEN_CMP0(gen_gvec_clt0, TCG_COND_LT)
GEN_CMP0(gen_gvec_cgt0, TCG_COND_GT)

#undef GEN_CMP0

void gen_gvec_sshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    /* Signed shift out of range results in all-sign-bits */
    shift = MIN(shift, (8 << vece) - 1);
    tcg_gen_gvec_sari(vece, rd_ofs, rm_ofs, shift, opr_sz, max_sz);
}

void gen_gvec_ushr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    /* Unsigned shift out of range results in all-zero-bits */
    if (shift >= (8 << vece)) {
        tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
    } else {
        tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift, opr_sz, max_sz);
    }
}

static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_sar8i_i64(a, a, shift);
    tcg_gen_vec_add8_i64(d, d, a);
}

static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_sar16i_i64(a, a, shift);
    tcg_gen_vec_add16_i64(d, d, a);
}

static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_sari_i32(a, a, shift);
    tcg_gen_add_i32(d, d, a);
}

static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_sari_i64(a, a, shift);
    tcg_gen_add_i64(d, d, a);
}

static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    tcg_gen_sari_vec(vece, a, a, sh);
    tcg_gen_add_vec(vece, d, d, a);
}

void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_ssra8_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_ssra16_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_ssra32_i32,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_ssra64_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Signed results in all sign bits.
     */
    shift = MIN(shift, (8 << vece) - 1);
    tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
}

static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_shr8i_i64(a, a, shift);
    tcg_gen_vec_add8_i64(d, d, a);
}

static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_shr16i_i64(a, a, shift);
    tcg_gen_vec_add16_i64(d, d, a);
}

static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_shri_i32(a, a, shift);
    tcg_gen_add_i32(d, d, a);
}

static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_shri_i64(a, a, shift);
    tcg_gen_add_i64(d, d, a);
}

static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    tcg_gen_shri_vec(vece, a, a, sh);
    tcg_gen_add_vec(vece, d, d, a);
}

void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_usra8_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8, },
        { .fni8 = gen_usra16_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16, },
        { .fni4 = gen_usra32_i32,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32, },
        { .fni8 = gen_usra64_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64, },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Unsigned results in all zeros as input to accumulate: nop.
     */
    if (shift < (8 << vece)) {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    } else {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    }
}

/*
 * Shift one less than the requested amount, and the low bit is
 * the rounding bit.  For the 8 and 16-bit operations, because we
 * mask the low bit, we can perform a normal integer shift instead
 * of a vector shift.
 */
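/*
 * E.g. with sh = 3 the rounded result is (a + 4) >> 3, computed here
 * as (a >> 3) + ((a >> 2) & 1); for a = 12 both forms give 2.
 */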
static void gen_srshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sar8i_i64(d, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sar16i_i64(d, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t;

    /* Handle shift by the input size for the benefit of trans_SRSHR_ri */
    if (sh == 32) {
        tcg_gen_movi_i32(d, 0);
        return;
    }
    t = tcg_temp_new_i32();
    tcg_gen_extract_i32(t, a, sh - 1, 1);
    tcg_gen_sari_i32(d, a, sh);
    tcg_gen_add_i32(d, d, t);
}

void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_extract_i64(t, a, sh - 1, 1);
    tcg_gen_sari_i64(d, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);

    tcg_gen_shri_vec(vece, t, a, sh - 1);
    tcg_gen_and_vec(vece, t, t, ones);
    tcg_gen_sari_vec(vece, d, a, sh);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_srshr8_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_srshr16_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_srshr32_i32,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_srshr64_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    if (shift == (8 << vece)) {
        /*
         * Shifts larger than the element size are architecturally valid.
         * Signed results in all sign bits.  With rounding, this produces
         *   (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
         * I.e. always zero.
         */
        tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

static void gen_srsra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr8_i64(t, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srsra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr16_i64(t, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_srsra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t = tcg_temp_new_i32();

    gen_srshr32_i32(t, a, sh);
    tcg_gen_add_i32(d, d, t);
}

static void gen_srsra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr64_i64(t, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_srsra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    gen_srshr_vec(vece, t, a, sh);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_srsra8_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fni8 = gen_srsra16_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_srsra32_i32,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_srsra64_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Signed results in all sign bits.  With rounding, this produces
     *   (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
     * I.e. always zero.  With accumulation, this leaves D unchanged.
     */
    if (shift == (8 << vece)) {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

static void gen_urshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_shr8i_i64(d, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_urshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_shr16i_i64(d, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

void gen_urshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t;

    /* Handle shift by the input size for the benefit of trans_URSHR_ri */
    if (sh == 32) {
        tcg_gen_extract_i32(d, a, sh - 1, 1);
        return;
    }
    t = tcg_temp_new_i32();
    tcg_gen_extract_i32(t, a, sh - 1, 1);
    tcg_gen_shri_i32(d, a, sh);
    tcg_gen_add_i32(d, d, t);
}

void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_extract_i64(t, a, sh - 1, 1);
    tcg_gen_shri_i64(d, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);

    tcg_gen_shri_vec(vece, t, a, shift - 1);
    tcg_gen_and_vec(vece, t, t, ones);
    tcg_gen_shri_vec(vece, d, a, shift);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_urshr8_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_urshr16_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_urshr32_i32,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_urshr64_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    if (shift == (8 << vece)) {
        /*
         * Shifts larger than the element size are architecturally valid.
         * Unsigned results in zero.  With rounding, this produces a
         * copy of the most significant bit.
         */
        tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift - 1, opr_sz, max_sz);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

static void gen_ursra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 8) {
        tcg_gen_vec_shr8i_i64(t, a, 7);
    } else {
        gen_urshr8_i64(t, a, sh);
    }
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_ursra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 16) {
        tcg_gen_vec_shr16i_i64(t, a, 15);
    } else {
        gen_urshr16_i64(t, a, sh);
    }
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_ursra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t = tcg_temp_new_i32();

    if (sh == 32) {
        tcg_gen_shri_i32(t, a, 31);
    } else {
        gen_urshr32_i32(t, a, sh);
    }
    tcg_gen_add_i32(d, d, t);
}

static void gen_ursra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 64) {
        tcg_gen_shri_i64(t, a, 63);
    } else {
        gen_urshr64_i64(t, a, sh);
    }
    tcg_gen_add_i64(d, d, t);
}

static void gen_ursra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    if (sh == (8 << vece)) {
        tcg_gen_shri_vec(vece, t, a, sh - 1);
    } else {
        gen_urshr_vec(vece, t, a, sh);
    }
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_ursra8_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fni8 = gen_ursra16_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_ursra32_i32,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_ursra64_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
}

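/*
 * Shift-right-and-insert (SRI): only the low esize - shift bits of
 * each destination element are replaced; the mask preserves the top
 * SHIFT bits.  E.g. for MO_8 with shift 3, bits [4:0] come from the
 * shifted source and bits [7:5] of the destination are kept.
 */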
static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_8, 0xff >> shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_16, 0xffff >> shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_shri_i32(a, a, shift);
    tcg_gen_deposit_i32(d, d, a, 0, 32 - shift);
}

static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_shri_i64(a, a, shift);
    tcg_gen_deposit_i64(d, d, a, 0, 64 - shift);
}

static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    int64_t mi = MAKE_64BIT_MASK((8 << vece) - sh, sh);
    TCGv_vec m = tcg_constant_vec_matching(d, vece, mi);

    tcg_gen_shri_vec(vece, t, a, sh);
    tcg_gen_and_vec(vece, d, d, m);
    tcg_gen_or_vec(vece, d, d, t);
}

void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                  int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
    const GVecGen2i ops[4] = {
        { .fni8 = gen_shr8_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shr16_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shr32_ins_i32,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_shr64_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /* Shift of esize leaves destination unchanged. */
    if (shift < (8 << vece)) {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    } else {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    }
}

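/*
 * Shift-left-and-insert (SLI) is the mirror image: the low SHIFT bits
 * of each destination element are preserved and the remaining bits
 * are replaced by the left-shifted source.
 */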
static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_8, 0xff << shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shli_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_16, 0xffff << shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shli_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_deposit_i32(d, d, a, shift, 32 - shift);
}

static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_deposit_i64(d, d, a, shift, 64 - shift);
}

static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec m = tcg_constant_vec_matching(d, vece, MAKE_64BIT_MASK(0, sh));

    tcg_gen_shli_vec(vece, t, a, sh);
    tcg_gen_and_vec(vece, d, d, m);
    tcg_gen_or_vec(vece, d, d, t);
}

void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                  int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
    const GVecGen2i ops[4] = {
        { .fni8 = gen_shl8_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shl16_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shl32_ins_i32,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_shl64_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [0..esize-1]. */
    tcg_debug_assert(shift >= 0);
    tcg_debug_assert(shift < (8 << vece));

    if (shift == 0) {
        tcg_gen_gvec_mov(vece, rd_ofs, rm_ofs, opr_sz, max_sz);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u8(a, a, b);
    gen_helper_neon_add_u8(d, d, a);
}

static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u8(a, a, b);
    gen_helper_neon_sub_u8(d, d, a);
}

static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u16(a, a, b);
    gen_helper_neon_add_u16(d, d, a);
}

static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u16(a, a, b);
    gen_helper_neon_sub_u16(d, d, a);
}

static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_mul_i32(a, a, b);
    tcg_gen_add_i32(d, d, a);
}

static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_mul_i32(a, a, b);
    tcg_gen_sub_i32(d, d, a);
}

static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_mul_i64(a, a, b);
    tcg_gen_add_i64(d, d, a);
}

static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_mul_i64(a, a, b);
    tcg_gen_sub_i64(d, d, a);
}

static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mul_vec(vece, a, a, b);
    tcg_gen_add_vec(vece, d, d, a);
}

static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mul_vec(vece, a, a, b);
    tcg_gen_sub_vec(vece, d, d, a);
}

/*
 * Note that while NEON does not support VMLA and VMLS as 64-bit ops,
 * these tables are shared with AArch64 which does support them.
 */
void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                  uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_mul_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_mla8_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_mla16_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_mla32_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_mla64_i64,
          .fniv = gen_mla_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                  uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_mul_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_mls8_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_mls16_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_mls32_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_mls64_i64,
          .fniv = gen_mls_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

/* CMTST: test is "(X & Y) != 0". */
static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_negsetcond_i32(TCG_COND_TSTNE, d, a, b);
}

void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_negsetcond_i64(TCG_COND_TSTNE, d, a, b);
}

static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_cmp_vec(TCG_COND_TSTNE, vece, d, a, b);
}

void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_cmp_vec, 0 };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_helper_neon_tst_u8,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_helper_neon_tst_u16,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_cmtst_i32,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_cmtst_i64,
          .fniv = gen_cmtst_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

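/*
 * Variable shifts (USHL): the shift count is taken from the signed low
 * byte of the shift operand, and a negative count shifts right instead.
 * E.g. a count of 0xfe (-2) shifts right by 2, and any count whose
 * magnitude is >= the element size produces zero.
 */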
void gen_ushl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
{
    TCGv_i32 lval = tcg_temp_new_i32();
    TCGv_i32 rval = tcg_temp_new_i32();
    TCGv_i32 lsh = tcg_temp_new_i32();
    TCGv_i32 rsh = tcg_temp_new_i32();
    TCGv_i32 zero = tcg_constant_i32(0);
    TCGv_i32 max = tcg_constant_i32(32);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i32(lsh, shift);
    tcg_gen_neg_i32(rsh, lsh);
    tcg_gen_shl_i32(lval, src, lsh);
    tcg_gen_shr_i32(rval, src, rsh);
    tcg_gen_movcond_i32(TCG_COND_LTU, dst, lsh, max, lval, zero);
    tcg_gen_movcond_i32(TCG_COND_LTU, dst, rsh, max, rval, dst);
}

void gen_ushl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
{
    TCGv_i64 lval = tcg_temp_new_i64();
    TCGv_i64 rval = tcg_temp_new_i64();
    TCGv_i64 lsh = tcg_temp_new_i64();
    TCGv_i64 rsh = tcg_temp_new_i64();
    TCGv_i64 zero = tcg_constant_i64(0);
    TCGv_i64 max = tcg_constant_i64(64);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i64(lsh, shift);
    tcg_gen_neg_i64(rsh, lsh);
    tcg_gen_shl_i64(lval, src, lsh);
    tcg_gen_shr_i64(rval, src, rsh);
    tcg_gen_movcond_i64(TCG_COND_LTU, dst, lsh, max, lval, zero);
    tcg_gen_movcond_i64(TCG_COND_LTU, dst, rsh, max, rval, dst);
}

static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
                         TCGv_vec src, TCGv_vec shift)
{
    TCGv_vec lval = tcg_temp_new_vec_matching(dst);
    TCGv_vec rval = tcg_temp_new_vec_matching(dst);
    TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec max, zero;

    tcg_gen_neg_vec(vece, rsh, shift);
    if (vece == MO_8) {
        tcg_gen_mov_vec(lsh, shift);
    } else {
        TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
        tcg_gen_and_vec(vece, lsh, shift, msk);
        tcg_gen_and_vec(vece, rsh, rsh, msk);
    }

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_shlv_vec(vece, lval, src, lsh);
    tcg_gen_shrv_vec(vece, rval, src, rsh);

    /*
     * The choice of GE (signed) and GEU (unsigned) is biased toward
     * the instructions of the x86_64 host.  For MO_8, the whole byte
     * is significant so we must use an unsigned compare; otherwise we
     * have already masked to a byte and so a signed compare works.
     * Other tcg hosts have a full set of comparisons and do not care.
     */
    zero = tcg_constant_vec_matching(dst, vece, 0);
    max = tcg_constant_vec_matching(dst, vece, 8 << vece);
    if (vece == MO_8) {
        tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, lval, lsh, max, zero, lval);
        tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, rval, rsh, max, zero, rval);
    } else {
        tcg_gen_cmpsel_vec(TCG_COND_GE, vece, lval, lsh, max, zero, lval);
        tcg_gen_cmpsel_vec(TCG_COND_GE, vece, rval, rsh, max, zero, rval);
    }
    tcg_gen_or_vec(vece, dst, lval, rval);
}

void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_neg_vec, INDEX_op_shlv_vec,
        INDEX_op_shrv_vec, INDEX_op_cmpsel_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_ushl_vec,
          .fno = gen_helper_gvec_ushl_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_ushl_vec,
          .fno = gen_helper_gvec_ushl_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_ushl_i32,
          .fniv = gen_ushl_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_ushl_i64,
          .fniv = gen_ushl_vec,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

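/*
 * Signed variable shift (SSHL): an out-of-range right shift must still
 * produce the sign fill, so the negated count is clamped to esize - 1
 * before the arithmetic shift instead of being discarded afterwards.
 */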
void gen_sshl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
{
    TCGv_i32 lval = tcg_temp_new_i32();
    TCGv_i32 rval = tcg_temp_new_i32();
    TCGv_i32 lsh = tcg_temp_new_i32();
    TCGv_i32 rsh = tcg_temp_new_i32();
    TCGv_i32 zero = tcg_constant_i32(0);
    TCGv_i32 max = tcg_constant_i32(31);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i32(lsh, shift);
    tcg_gen_neg_i32(rsh, lsh);
    tcg_gen_shl_i32(lval, src, lsh);
    tcg_gen_umin_i32(rsh, rsh, max);
    tcg_gen_sar_i32(rval, src, rsh);
    tcg_gen_movcond_i32(TCG_COND_LEU, lval, lsh, max, lval, zero);
    tcg_gen_movcond_i32(TCG_COND_LT, dst, lsh, zero, rval, lval);
}

void gen_sshl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
{
    TCGv_i64 lval = tcg_temp_new_i64();
    TCGv_i64 rval = tcg_temp_new_i64();
    TCGv_i64 lsh = tcg_temp_new_i64();
    TCGv_i64 rsh = tcg_temp_new_i64();
    TCGv_i64 zero = tcg_constant_i64(0);
    TCGv_i64 max = tcg_constant_i64(63);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i64(lsh, shift);
    tcg_gen_neg_i64(rsh, lsh);
    tcg_gen_shl_i64(lval, src, lsh);
    tcg_gen_umin_i64(rsh, rsh, max);
    tcg_gen_sar_i64(rval, src, rsh);
    tcg_gen_movcond_i64(TCG_COND_LEU, lval, lsh, max, lval, zero);
    tcg_gen_movcond_i64(TCG_COND_LT, dst, lsh, zero, rval, lval);
}

static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
                         TCGv_vec src, TCGv_vec shift)
{
    TCGv_vec lval = tcg_temp_new_vec_matching(dst);
    TCGv_vec rval = tcg_temp_new_vec_matching(dst);
    TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec max, zero;

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_neg_vec(vece, rsh, shift);
    if (vece == MO_8) {
        tcg_gen_mov_vec(lsh, shift);
    } else {
        TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
        tcg_gen_and_vec(vece, lsh, shift, msk);
        tcg_gen_and_vec(vece, rsh, rsh, msk);
    }

    /* Bound rsh so an out-of-range right shift yields -1.  */
    max = tcg_constant_vec_matching(dst, vece, (8 << vece) - 1);
    tcg_gen_umin_vec(vece, rsh, rsh, max);

    tcg_gen_shlv_vec(vece, lval, src, lsh);
    tcg_gen_sarv_vec(vece, rval, src, rsh);

    /* Select in-bound left shift.  */
    zero = tcg_constant_vec_matching(dst, vece, 0);
    tcg_gen_cmpsel_vec(TCG_COND_GT, vece, lval, lsh, max, zero, lval);

    /* Select between left and right shift.  */
    if (vece == MO_8) {
        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, zero, rval, lval);
    } else {
        TCGv_vec sgn = tcg_constant_vec_matching(dst, vece, 0x80);
        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, sgn, lval, rval);
    }
}

void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec,
        INDEX_op_sarv_vec, INDEX_op_cmpsel_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_sshl_vec,
          .fno = gen_helper_gvec_sshl_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_sshl_vec,
          .fno = gen_helper_gvec_sshl_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_sshl_i32,
          .fniv = gen_sshl_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_sshl_i64,
          .fniv = gen_sshl_vec,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

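/*
 * The rounding variants of the variable shifts have no inline
 * expansion; they are always expanded out-of-line via helpers.
 */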
void gen_gvec_srshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[] = {
        gen_helper_gvec_srshl_b, gen_helper_gvec_srshl_h,
        gen_helper_gvec_srshl_s, gen_helper_gvec_srshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_urshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[] = {
        gen_helper_gvec_urshl_b, gen_helper_gvec_urshl_h,
        gen_helper_gvec_urshl_s, gen_helper_gvec_urshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_sqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_sqshl_b, gen_helper_neon_sqshl_h,
        gen_helper_neon_sqshl_s, gen_helper_neon_sqshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_uqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_uqshl_b, gen_helper_neon_uqshl_h,
        gen_helper_neon_uqshl_s, gen_helper_neon_uqshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_sqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_sqrshl_b, gen_helper_neon_sqrshl_h,
        gen_helper_neon_sqrshl_s, gen_helper_neon_sqrshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_uqrshl_b, gen_helper_neon_uqrshl_h,
        gen_helper_neon_uqrshl_s, gen_helper_neon_uqrshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

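/*
 * Saturating arithmetic with a sticky QC flag: the unsaturated and
 * saturated results are XORed, and any nonzero difference is ORed
 * into QC.  E.g. for MO_8, 200 + 100 = 300 saturates to 255, and
 * 300 ^ 255 != 0 marks the saturation.
 */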
void gen_uqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    uint64_t max = MAKE_64BIT_MASK(0, 8 << esz);
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_add_i64(tmp, a, b);
    tcg_gen_umin_i64(res, tmp, tcg_constant_i64(max));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_uqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_add_i64(t, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, res, t, a,
                        tcg_constant_i64(UINT64_MAX), t);
    tcg_gen_xor_i64(t, t, res);
    tcg_gen_or_i64(qc, qc, t);
}

static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_add_vec(vece, x, a, b);
    tcg_gen_usadd_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_usadd_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_b,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_h,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_s,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fniv = gen_uqadd_vec,
          .fni8 = gen_uqadd_d,
          .fno = gen_helper_gvec_uqadd_d,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_sqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
    int64_t min = -1ll - max;
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_add_i64(tmp, a, b);
    tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
    tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

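/*
 * For the 64-bit case there is no wider type to hold the unsaturated
 * sum.  Signed addition overflows iff the operands have the same sign
 * and the sum's sign differs, i.e. ~(a ^ b) & (a ^ sum) is negative;
 * the saturated value, INT64_MAX or INT64_MIN according to the sign
 * of A, is (a >> 63) ^ INT64_MAX.
 */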
void gen_sqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_add_i64(t0, a, b);

    /* Compute signed overflow indication into T1 */
    tcg_gen_xor_i64(t1, a, b);
    tcg_gen_xor_i64(t2, t0, a);
    tcg_gen_andc_i64(t1, t2, t1);

    /* Compute saturated value into T2 */
    tcg_gen_sari_i64(t2, a, 63);
    tcg_gen_xori_i64(t2, t2, INT64_MAX);

    tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
    tcg_gen_xor_i64(t0, t0, res);
    tcg_gen_or_i64(qc, qc, t0);
}

static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_add_vec(vece, x, a, b);
    tcg_gen_ssadd_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_ssadd_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_sqadd_vec,
          .fni8 = gen_sqadd_d,
          .fno = gen_helper_gvec_sqadd_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

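/* E.g. for MO_16, 5 - 10 = -5 is clamped to 0, and -5 ^ 0 != 0 sets QC. */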
void gen_uqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_sub_i64(tmp, a, b);
    tcg_gen_smax_i64(res, tmp, tcg_constant_i64(0));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_uqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, res, a, b, tcg_constant_i64(0), t);
    tcg_gen_xor_i64(t, t, res);
    tcg_gen_or_i64(qc, qc, t);
}

static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_sub_vec(vece, x, a, b);
    tcg_gen_ussub_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_ussub_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_uqsub_vec,
          .fni8 = gen_uqsub_d,
          .fno = gen_helper_gvec_uqsub_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_sqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
    int64_t min = -1ll - max;
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_sub_i64(tmp, a, b);
    tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
    tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

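/*
 * For subtraction the overflow condition flips: it overflows iff the
 * operands have opposite signs and the difference's sign differs from
 * A, i.e. (a ^ b) & (a ^ diff) is negative.
 */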
1541 void gen_sqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1542 {
1543     TCGv_i64 t0 = tcg_temp_new_i64();
1544     TCGv_i64 t1 = tcg_temp_new_i64();
1545     TCGv_i64 t2 = tcg_temp_new_i64();
1546 
1547     tcg_gen_sub_i64(t0, a, b);
1548 
1549     /* Compute signed overflow indication into T1 */
1550     tcg_gen_xor_i64(t1, a, b);
1551     tcg_gen_xor_i64(t2, t0, a);
1552     tcg_gen_and_i64(t1, t1, t2);
1553 
1554     /* Compute saturated value into T2 */
1555     tcg_gen_sari_i64(t2, a, 63);
1556     tcg_gen_xori_i64(t2, t2, INT64_MAX);
1557 
1558     tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
1559     tcg_gen_xor_i64(t0, t0, res);
1560     tcg_gen_or_i64(qc, qc, t0);
1561 }
1562 
1563 static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1564                           TCGv_vec a, TCGv_vec b)
1565 {
1566     TCGv_vec x = tcg_temp_new_vec_matching(t);
1567     tcg_gen_sub_vec(vece, x, a, b);
1568     tcg_gen_sssub_vec(vece, t, a, b);
1569     tcg_gen_xor_vec(vece, x, x, t);
1570     tcg_gen_or_vec(vece, qc, qc, x);
1571 }
1572 
void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sssub_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_sqsub_vec,
          .fni8 = gen_sqsub_d,
          .fno = gen_helper_gvec_sqsub_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

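/*
 * Signed absolute difference: compute both a - b and b - a, then
 * select the non-negative one with a conditional move.
 */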
static void gen_sabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_sub_i32(t, a, b);
    tcg_gen_sub_i32(d, b, a);
    tcg_gen_movcond_i32(TCG_COND_LT, d, a, b, d, t);
}

static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_sub_i64(d, b, a);
    tcg_gen_movcond_i64(TCG_COND_LT, d, a, b, d, t);
}

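/* For vectors, abs(a - b) == max(a, b) - min(a, b).  */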
static void gen_sabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_smin_vec(vece, t, a, b);
    tcg_gen_smax_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_sabd_i32,
          .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_sabd_i64,
          .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

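/* Unsigned absolute difference: as above, with unsigned comparisons.  */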
static void gen_uabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_sub_i32(t, a, b);
    tcg_gen_sub_i32(d, b, a);
    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, d, t);
}

static void gen_uabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_sub_i64(d, b, a);
    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, d, t);
}

static void gen_uabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_umin_vec(vece, t, a, b);
    tcg_gen_umax_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_umin_vec, INDEX_op_umax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uabd_i32,
          .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_uabd_i64,
          .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

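/*
 * Signed absolute difference and accumulate: d += abs(a - b).
 * The expanders reuse the SABD pieces and add into the destination,
 * which is therefore marked .load_dest below.
 */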
static void gen_saba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();
    gen_sabd_i32(t, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_saba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();
    gen_sabd_i64(t, a, b);
    tcg_gen_add_i64(d, d, t);
}

static void gen_saba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    gen_sabd_vec(vece, t, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_add_vec,
        INDEX_op_smin_vec, INDEX_op_smax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_saba_i32,
          .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_saba_i64,
          .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

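/* Unsigned absolute difference and accumulate: d += abs(a - b).  */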
static void gen_uaba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();
    gen_uabd_i32(t, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();
    gen_uabd_i64(t, a, b);
    tcg_gen_add_i64(d, d, t);
}

static void gen_uaba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    gen_uabd_vec(vece, t, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_add_vec,
        INDEX_op_umin_vec, INDEX_op_umax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_uaba_i32,
          .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_uaba_i64,
          .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

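/*
 * Pairwise operations have no simple element-wise expansion, so they
 * are always expanded through out-of-line helpers.  Of these, only
 * ADDP provides a 64-bit element size.
 */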
void gen_gvec_addp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_addp_b,
        gen_helper_gvec_addp_h,
        gen_helper_gvec_addp_s,
        gen_helper_gvec_addp_d,
    };
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_smaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_smaxp_b,
        gen_helper_gvec_smaxp_h,
        gen_helper_gvec_smaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_sminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_sminp_b,
        gen_helper_gvec_sminp_h,
        gen_helper_gvec_sminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_umaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_umaxp_b,
        gen_helper_gvec_umaxp_h,
        gen_helper_gvec_umaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_uminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_uminp_b,
        gen_helper_gvec_uminp_h,
        gen_helper_gvec_uminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

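/*
 * Signed halving add.  To avoid widening beyond the element size,
 * use the identity
 *   (a + b) >> 1 == (a >> 1) + (b >> 1) + (a & b & 1),
 * where the final term re-adds the carry out of the low bit, which
 * is set only when both low bits are set.
 */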
static void gen_shadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_shadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_shadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_and_i32(t, a, b);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_shadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_and_vec(vece, t, a, b);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_shadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_shadd8_i64,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shadd16_i64,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shadd_i32,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

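/*
 * Unsigned halving add: the same identity as SHADD, with logical
 * rather than arithmetic right shifts.
 */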
static void gen_uhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_uhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_uhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_and_i32(t, a, b);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_uhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_and_vec(vece, t, a, b);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_uhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_uhadd8_i64,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_uhadd16_i64,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uhadd_i32,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

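/*
 * Signed halving subtract, using the identity
 *   (a - b) >> 1 == (a >> 1) - (b >> 1) - (~a & b & 1),
 * where the final term subtracts the borrow out of the low bit,
 * which occurs only when the low bit of a is clear and that of b set.
 */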
static void gen_shsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sub8_i64(d, a, b);
    tcg_gen_vec_sub8_i64(d, d, t);
}

static void gen_shsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sub16_i64(d, a, b);
    tcg_gen_vec_sub16_i64(d, d, t);
}

static void gen_shsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andc_i32(t, b, a);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_sub_i32(d, d, t);
}

static void gen_shsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_andc_vec(vece, t, b, a);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_sub_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_shsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_shsub8_i64,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shsub16_i64,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shsub_i32,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

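/* Unsigned halving subtract: as SHSUB, with logical shifts.  */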
static void gen_uhsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sub8_i64(d, a, b);
    tcg_gen_vec_sub8_i64(d, d, t);
}

static void gen_uhsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sub16_i64(d, a, b);
    tcg_gen_vec_sub16_i64(d, d, t);
}

static void gen_uhsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andc_i32(t, b, a);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_sub_i32(d, d, t);
}

static void gen_uhsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_andc_vec(vece, t, b, a);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_sub_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_uhsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_uhsub8_i64,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_uhsub16_i64,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uhsub_i32,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

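/*
 * Signed rounding halving add, using the identity
 *   (a + b + 1) >> 1 == (a >> 1) + (b >> 1) + ((a | b) & 1):
 * with the rounding increment, the carry out of the low bit is set
 * whenever either low bit is set.
 */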
static void gen_srhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_srhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_or_i32(t, a, b);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_srhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_or_vec(vece, t, a, b);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_srhadd8_i64,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_srhadd16_i64,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_srhadd_i32,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

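/* Unsigned rounding halving add: as SRHADD, with logical shifts.  */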
static void gen_urhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_urhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_urhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_or_i32(t, a, b);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_urhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_or_vec(vece, t, a, b);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_urhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_urhadd8_i64,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_urhadd16_i64,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_urhadd_i32,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}