xref: /openbmc/qemu/target/arm/tcg/gengvec.c (revision 143e179c84415ce5b0b38274c39a7955d75f191c)
1 /*
2  *  ARM generic vector expansion
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *  Copyright (c) 2005-2007 CodeSourcery
6  *  Copyright (c) 2007 OpenedHand, Ltd.
7  *
8  * This library is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * This library is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20  */
21 
22 #include "qemu/osdep.h"
23 #include "translate.h"
24 
25 
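/*
 * Helper for the saturating-multiply expanders below: pass a pointer to
 * vfp.qc as the extra argument so the out-of-line helpers can accumulate
 * the cumulative saturation (QC) flag.  The assert bounds the operation
 * size by the size of the qc field.
 */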
26 static void gen_gvec_fn3_qc(uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs,
27                             uint32_t opr_sz, uint32_t max_sz,
28                             gen_helper_gvec_3_ptr *fn)
29 {
30     TCGv_ptr qc_ptr = tcg_temp_new_ptr();
31 
32     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
33     tcg_gen_addi_ptr(qc_ptr, tcg_env, offsetof(CPUARMState, vfp.qc));
34     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, qc_ptr,
35                        opr_sz, max_sz, 0, fn);
36 }
37 
38 void gen_gvec_sqdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
39                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
40 {
41     static gen_helper_gvec_3_ptr * const fns[2] = {
42         gen_helper_neon_sqdmulh_h, gen_helper_neon_sqdmulh_s
43     };
44     tcg_debug_assert(vece >= 1 && vece <= 2);
45     gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
46 }
47 
48 void gen_gvec_sqrdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
49                           uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
50 {
51     static gen_helper_gvec_3_ptr * const fns[2] = {
52         gen_helper_neon_sqrdmulh_h, gen_helper_neon_sqrdmulh_s
53     };
54     tcg_debug_assert(vece >= 1 && vece <= 2);
55     gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
56 }
57 
58 void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
59                           uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
60 {
61     static gen_helper_gvec_3_ptr * const fns[2] = {
62         gen_helper_gvec_qrdmlah_s16, gen_helper_gvec_qrdmlah_s32
63     };
64     tcg_debug_assert(vece >= 1 && vece <= 2);
65     gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
66 }
67 
68 void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
69                           uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
70 {
71     static gen_helper_gvec_3_ptr * const fns[2] = {
72         gen_helper_gvec_qrdmlsh_s16, gen_helper_gvec_qrdmlsh_s32
73     };
74     tcg_debug_assert(vece >= 1 && vece <= 2);
75     gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
76 }
77 
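/*
 * Compare-against-zero expanders.  Each GEN_CMP0 invocation below defines
 * a thin wrapper around tcg_gen_gvec_cmpi with a zero immediate; e.g.
 * GEN_CMP0(gen_gvec_ceq0, TCG_COND_EQ) expands to a function that emits
 * tcg_gen_gvec_cmpi(TCG_COND_EQ, vece, d, m, 0, opr_sz, max_sz).
 */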
78 #define GEN_CMP0(NAME, COND)                              \
79     void NAME(unsigned vece, uint32_t d, uint32_t m,      \
80               uint32_t opr_sz, uint32_t max_sz)           \
81     { tcg_gen_gvec_cmpi(COND, vece, d, m, 0, opr_sz, max_sz); }
82 
83 GEN_CMP0(gen_gvec_ceq0, TCG_COND_EQ)
84 GEN_CMP0(gen_gvec_cle0, TCG_COND_LE)
85 GEN_CMP0(gen_gvec_cge0, TCG_COND_GE)
86 GEN_CMP0(gen_gvec_clt0, TCG_COND_LT)
87 GEN_CMP0(gen_gvec_cgt0, TCG_COND_GT)
88 
89 #undef GEN_CMP0
90 
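/*
 * SSRA (signed shift right and accumulate) per-element expanders.  The
 * gvec framework reloads the source for each element, so 'a' can be
 * shifted in place and then added into the destination 'd'.
 */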
91 static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
92 {
93     tcg_gen_vec_sar8i_i64(a, a, shift);
94     tcg_gen_vec_add8_i64(d, d, a);
95 }
96 
97 static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
98 {
99     tcg_gen_vec_sar16i_i64(a, a, shift);
100     tcg_gen_vec_add16_i64(d, d, a);
101 }
102 
103 static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
104 {
105     tcg_gen_sari_i32(a, a, shift);
106     tcg_gen_add_i32(d, d, a);
107 }
108 
109 static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
110 {
111     tcg_gen_sari_i64(a, a, shift);
112     tcg_gen_add_i64(d, d, a);
113 }
114 
115 static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
116 {
117     tcg_gen_sari_vec(vece, a, a, sh);
118     tcg_gen_add_vec(vece, d, d, a);
119 }
120 
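/*
 * One GVecGen2i entry per element size: .fni8/.fni4 give the inline
 * integer expansion, .fniv the host-vector expansion, and .fno the
 * out-of-line helper; .load_dest makes the old destination an input so
 * the shifted value accumulates into it.  The same table layout is used
 * by the other shift-and-accumulate/insert expanders in this file.
 */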
121 void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
122                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
123 {
124     static const TCGOpcode vecop_list[] = {
125         INDEX_op_sari_vec, INDEX_op_add_vec, 0
126     };
127     static const GVecGen2i ops[4] = {
128         { .fni8 = gen_ssra8_i64,
129           .fniv = gen_ssra_vec,
130           .fno = gen_helper_gvec_ssra_b,
131           .load_dest = true,
132           .opt_opc = vecop_list,
133           .vece = MO_8 },
134         { .fni8 = gen_ssra16_i64,
135           .fniv = gen_ssra_vec,
136           .fno = gen_helper_gvec_ssra_h,
137           .load_dest = true,
138           .opt_opc = vecop_list,
139           .vece = MO_16 },
140         { .fni4 = gen_ssra32_i32,
141           .fniv = gen_ssra_vec,
142           .fno = gen_helper_gvec_ssra_s,
143           .load_dest = true,
144           .opt_opc = vecop_list,
145           .vece = MO_32 },
146         { .fni8 = gen_ssra64_i64,
147           .fniv = gen_ssra_vec,
148           .fno = gen_helper_gvec_ssra_d,
149           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
150           .opt_opc = vecop_list,
151           .load_dest = true,
152           .vece = MO_64 },
153     };
154 
155     /* tszimm encoding produces immediates in the range [1..esize]. */
156     tcg_debug_assert(shift > 0);
157     tcg_debug_assert(shift <= (8 << vece));
158 
159     /*
160      * Shifts larger than the element size are architecturally valid.
161      * A signed shift of that size yields all sign bits.
162      */
163     shift = MIN(shift, (8 << vece) - 1);
164     tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
165 }
166 
167 static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
168 {
169     tcg_gen_vec_shr8i_i64(a, a, shift);
170     tcg_gen_vec_add8_i64(d, d, a);
171 }
172 
173 static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
174 {
175     tcg_gen_vec_shr16i_i64(a, a, shift);
176     tcg_gen_vec_add16_i64(d, d, a);
177 }
178 
179 static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
180 {
181     tcg_gen_shri_i32(a, a, shift);
182     tcg_gen_add_i32(d, d, a);
183 }
184 
185 static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
186 {
187     tcg_gen_shri_i64(a, a, shift);
188     tcg_gen_add_i64(d, d, a);
189 }
190 
191 static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
192 {
193     tcg_gen_shri_vec(vece, a, a, sh);
194     tcg_gen_add_vec(vece, d, d, a);
195 }
196 
197 void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
198                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
199 {
200     static const TCGOpcode vecop_list[] = {
201         INDEX_op_shri_vec, INDEX_op_add_vec, 0
202     };
203     static const GVecGen2i ops[4] = {
204         { .fni8 = gen_usra8_i64,
205           .fniv = gen_usra_vec,
206           .fno = gen_helper_gvec_usra_b,
207           .load_dest = true,
208           .opt_opc = vecop_list,
209           .vece = MO_8, },
210         { .fni8 = gen_usra16_i64,
211           .fniv = gen_usra_vec,
212           .fno = gen_helper_gvec_usra_h,
213           .load_dest = true,
214           .opt_opc = vecop_list,
215           .vece = MO_16, },
216         { .fni4 = gen_usra32_i32,
217           .fniv = gen_usra_vec,
218           .fno = gen_helper_gvec_usra_s,
219           .load_dest = true,
220           .opt_opc = vecop_list,
221           .vece = MO_32, },
222         { .fni8 = gen_usra64_i64,
223           .fniv = gen_usra_vec,
224           .fno = gen_helper_gvec_usra_d,
225           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
226           .load_dest = true,
227           .opt_opc = vecop_list,
228           .vece = MO_64, },
229     };
230 
231     /* tszimm encoding produces immediates in the range [1..esize]. */
232     tcg_debug_assert(shift > 0);
233     tcg_debug_assert(shift <= (8 << vece));
234 
235     /*
236      * Shifts larger than the element size are architecturally valid.
237      * Unsigned results in all zeros as input to accumulate: nop.
238      * An unsigned shift of that size yields all zeros as input to the accumulate: a nop.
239     if (shift < (8 << vece)) {
240         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
241     } else {
242         /* Nop, but we do need to clear the tail. */
243         tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
244     }
245 }
246 
247 /*
248  * Shift one less than the requested amount, and the low bit is
249  * the rounding bit.  For the 8 and 16-bit operations, because we
250  * mask the low bit, we can perform a normal integer shift instead
251  * of a vector shift.
252  */
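/*
 * For example, with sh == 2 on one lane: 6 becomes (6 >> 2) + ((6 >> 1) & 1)
 * == 1 + 1 == 2, and the signed byte -5 (0xfb) becomes (-5 >> 2) +
 * ((0xfb >> 1) & 1) == -2 + 1 == -1, matching the architectural
 * round-to-nearest behaviour of the rounding shifts.
 */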
253 static void gen_srshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
254 {
255     TCGv_i64 t = tcg_temp_new_i64();
256 
257     tcg_gen_shri_i64(t, a, sh - 1);
258     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
259     tcg_gen_vec_sar8i_i64(d, a, sh);
260     tcg_gen_vec_add8_i64(d, d, t);
261 }
262 
263 static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
264 {
265     TCGv_i64 t = tcg_temp_new_i64();
266 
267     tcg_gen_shri_i64(t, a, sh - 1);
268     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
269     tcg_gen_vec_sar16i_i64(d, a, sh);
270     tcg_gen_vec_add16_i64(d, d, t);
271 }
272 
273 void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
274 {
275     TCGv_i32 t;
276 
277     /* Handle shift by the input size for the benefit of trans_SRSHR_ri */
278     if (sh == 32) {
279         tcg_gen_movi_i32(d, 0);
280         return;
281     }
282     t = tcg_temp_new_i32();
283     tcg_gen_extract_i32(t, a, sh - 1, 1);
284     tcg_gen_sari_i32(d, a, sh);
285     tcg_gen_add_i32(d, d, t);
286 }
287 
288 void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
289 {
290     TCGv_i64 t = tcg_temp_new_i64();
291 
292     tcg_gen_extract_i64(t, a, sh - 1, 1);
293     tcg_gen_sari_i64(d, a, sh);
294     tcg_gen_add_i64(d, d, t);
295 }
296 
297 static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
298 {
299     TCGv_vec t = tcg_temp_new_vec_matching(d);
300     TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);
301 
302     tcg_gen_shri_vec(vece, t, a, sh - 1);
303     tcg_gen_and_vec(vece, t, t, ones);
304     tcg_gen_sari_vec(vece, d, a, sh);
305     tcg_gen_add_vec(vece, d, d, t);
306 }
307 
308 void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
309                     int64_t shift, uint32_t opr_sz, uint32_t max_sz)
310 {
311     static const TCGOpcode vecop_list[] = {
312         INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
313     };
314     static const GVecGen2i ops[4] = {
315         { .fni8 = gen_srshr8_i64,
316           .fniv = gen_srshr_vec,
317           .fno = gen_helper_gvec_srshr_b,
318           .opt_opc = vecop_list,
319           .vece = MO_8 },
320         { .fni8 = gen_srshr16_i64,
321           .fniv = gen_srshr_vec,
322           .fno = gen_helper_gvec_srshr_h,
323           .opt_opc = vecop_list,
324           .vece = MO_16 },
325         { .fni4 = gen_srshr32_i32,
326           .fniv = gen_srshr_vec,
327           .fno = gen_helper_gvec_srshr_s,
328           .opt_opc = vecop_list,
329           .vece = MO_32 },
330         { .fni8 = gen_srshr64_i64,
331           .fniv = gen_srshr_vec,
332           .fno = gen_helper_gvec_srshr_d,
333           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
334           .opt_opc = vecop_list,
335           .vece = MO_64 },
336     };
337 
338     /* tszimm encoding produces immediates in the range [1..esize] */
339     tcg_debug_assert(shift > 0);
340     tcg_debug_assert(shift <= (8 << vece));
341 
342     if (shift == (8 << vece)) {
343         /*
344          * Shifts larger than the element size are architecturally valid.
345      * A signed shift of that size yields all sign bits.  With rounding, this produces
346          *   (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
347          * I.e. always zero.
348          */
349         tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
350     } else {
351         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
352     }
353 }
354 
355 static void gen_srsra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
356 {
357     TCGv_i64 t = tcg_temp_new_i64();
358 
359     gen_srshr8_i64(t, a, sh);
360     tcg_gen_vec_add8_i64(d, d, t);
361 }
362 
363 static void gen_srsra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
364 {
365     TCGv_i64 t = tcg_temp_new_i64();
366 
367     gen_srshr16_i64(t, a, sh);
368     tcg_gen_vec_add16_i64(d, d, t);
369 }
370 
371 static void gen_srsra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
372 {
373     TCGv_i32 t = tcg_temp_new_i32();
374 
375     gen_srshr32_i32(t, a, sh);
376     tcg_gen_add_i32(d, d, t);
377 }
378 
379 static void gen_srsra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
380 {
381     TCGv_i64 t = tcg_temp_new_i64();
382 
383     gen_srshr64_i64(t, a, sh);
384     tcg_gen_add_i64(d, d, t);
385 }
386 
387 static void gen_srsra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
388 {
389     TCGv_vec t = tcg_temp_new_vec_matching(d);
390 
391     gen_srshr_vec(vece, t, a, sh);
392     tcg_gen_add_vec(vece, d, d, t);
393 }
394 
395 void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
396                     int64_t shift, uint32_t opr_sz, uint32_t max_sz)
397 {
398     static const TCGOpcode vecop_list[] = {
399         INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
400     };
401     static const GVecGen2i ops[4] = {
402         { .fni8 = gen_srsra8_i64,
403           .fniv = gen_srsra_vec,
404           .fno = gen_helper_gvec_srsra_b,
405           .opt_opc = vecop_list,
406           .load_dest = true,
407           .vece = MO_8 },
408         { .fni8 = gen_srsra16_i64,
409           .fniv = gen_srsra_vec,
410           .fno = gen_helper_gvec_srsra_h,
411           .opt_opc = vecop_list,
412           .load_dest = true,
413           .vece = MO_16 },
414         { .fni4 = gen_srsra32_i32,
415           .fniv = gen_srsra_vec,
416           .fno = gen_helper_gvec_srsra_s,
417           .opt_opc = vecop_list,
418           .load_dest = true,
419           .vece = MO_32 },
420         { .fni8 = gen_srsra64_i64,
421           .fniv = gen_srsra_vec,
422           .fno = gen_helper_gvec_srsra_d,
423           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
424           .opt_opc = vecop_list,
425           .load_dest = true,
426           .vece = MO_64 },
427     };
428 
429     /* tszimm encoding produces immediates in the range [1..esize] */
430     tcg_debug_assert(shift > 0);
431     tcg_debug_assert(shift <= (8 << vece));
432 
433     /*
434      * Shifts larger than the element size are architecturally valid.
435      * A signed shift of that size yields all sign bits.  With rounding, this produces
436      *   (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
437      * I.e. always zero.  With accumulation, this leaves D unchanged.
438      */
439     if (shift == (8 << vece)) {
440         /* Nop, but we do need to clear the tail. */
441         tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
442     } else {
443         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
444     }
445 }
446 
447 static void gen_urshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
448 {
449     TCGv_i64 t = tcg_temp_new_i64();
450 
451     tcg_gen_shri_i64(t, a, sh - 1);
452     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
453     tcg_gen_vec_shr8i_i64(d, a, sh);
454     tcg_gen_vec_add8_i64(d, d, t);
455 }
456 
457 static void gen_urshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
458 {
459     TCGv_i64 t = tcg_temp_new_i64();
460 
461     tcg_gen_shri_i64(t, a, sh - 1);
462     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
463     tcg_gen_vec_shr16i_i64(d, a, sh);
464     tcg_gen_vec_add16_i64(d, d, t);
465 }
466 
467 void gen_urshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
468 {
469     TCGv_i32 t;
470 
471     /* Handle shift by the input size for the benefit of trans_URSHR_ri */
472     if (sh == 32) {
473         tcg_gen_extract_i32(d, a, sh - 1, 1);
474         return;
475     }
476     t = tcg_temp_new_i32();
477     tcg_gen_extract_i32(t, a, sh - 1, 1);
478     tcg_gen_shri_i32(d, a, sh);
479     tcg_gen_add_i32(d, d, t);
480 }
481 
482 void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
483 {
484     TCGv_i64 t = tcg_temp_new_i64();
485 
486     tcg_gen_extract_i64(t, a, sh - 1, 1);
487     tcg_gen_shri_i64(d, a, sh);
488     tcg_gen_add_i64(d, d, t);
489 }
490 
491 static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift)
492 {
493     TCGv_vec t = tcg_temp_new_vec_matching(d);
494     TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);
495 
496     tcg_gen_shri_vec(vece, t, a, shift - 1);
497     tcg_gen_and_vec(vece, t, t, ones);
498     tcg_gen_shri_vec(vece, d, a, shift);
499     tcg_gen_add_vec(vece, d, d, t);
500 }
501 
502 void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
503                     int64_t shift, uint32_t opr_sz, uint32_t max_sz)
504 {
505     static const TCGOpcode vecop_list[] = {
506         INDEX_op_shri_vec, INDEX_op_add_vec, 0
507     };
508     static const GVecGen2i ops[4] = {
509         { .fni8 = gen_urshr8_i64,
510           .fniv = gen_urshr_vec,
511           .fno = gen_helper_gvec_urshr_b,
512           .opt_opc = vecop_list,
513           .vece = MO_8 },
514         { .fni8 = gen_urshr16_i64,
515           .fniv = gen_urshr_vec,
516           .fno = gen_helper_gvec_urshr_h,
517           .opt_opc = vecop_list,
518           .vece = MO_16 },
519         { .fni4 = gen_urshr32_i32,
520           .fniv = gen_urshr_vec,
521           .fno = gen_helper_gvec_urshr_s,
522           .opt_opc = vecop_list,
523           .vece = MO_32 },
524         { .fni8 = gen_urshr64_i64,
525           .fniv = gen_urshr_vec,
526           .fno = gen_helper_gvec_urshr_d,
527           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
528           .opt_opc = vecop_list,
529           .vece = MO_64 },
530     };
531 
532     /* tszimm encoding produces immediates in the range [1..esize] */
533     tcg_debug_assert(shift > 0);
534     tcg_debug_assert(shift <= (8 << vece));
535 
536     if (shift == (8 << vece)) {
537         /*
538          * Shifts larger than the element size are architecturally valid.
539      * An unsigned shift of that size yields zero.  With rounding, this produces a
540          * copy of the most significant bit.
541          */
542         tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift - 1, opr_sz, max_sz);
543     } else {
544         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
545     }
546 }
547 
548 static void gen_ursra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
549 {
550     TCGv_i64 t = tcg_temp_new_i64();
551 
552     if (sh == 8) {
553         tcg_gen_vec_shr8i_i64(t, a, 7);
554     } else {
555         gen_urshr8_i64(t, a, sh);
556     }
557     tcg_gen_vec_add8_i64(d, d, t);
558 }
559 
560 static void gen_ursra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
561 {
562     TCGv_i64 t = tcg_temp_new_i64();
563 
564     if (sh == 16) {
565         tcg_gen_vec_shr16i_i64(t, a, 15);
566     } else {
567         gen_urshr16_i64(t, a, sh);
568     }
569     tcg_gen_vec_add16_i64(d, d, t);
570 }
571 
572 static void gen_ursra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
573 {
574     TCGv_i32 t = tcg_temp_new_i32();
575 
576     if (sh == 32) {
577         tcg_gen_shri_i32(t, a, 31);
578     } else {
579         gen_urshr32_i32(t, a, sh);
580     }
581     tcg_gen_add_i32(d, d, t);
582 }
583 
584 static void gen_ursra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
585 {
586     TCGv_i64 t = tcg_temp_new_i64();
587 
588     if (sh == 64) {
589         tcg_gen_shri_i64(t, a, 63);
590     } else {
591         gen_urshr64_i64(t, a, sh);
592     }
593     tcg_gen_add_i64(d, d, t);
594 }
595 
596 static void gen_ursra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
597 {
598     TCGv_vec t = tcg_temp_new_vec_matching(d);
599 
600     if (sh == (8 << vece)) {
601         tcg_gen_shri_vec(vece, t, a, sh - 1);
602     } else {
603         gen_urshr_vec(vece, t, a, sh);
604     }
605     tcg_gen_add_vec(vece, d, d, t);
606 }
607 
608 void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
609                     int64_t shift, uint32_t opr_sz, uint32_t max_sz)
610 {
611     static const TCGOpcode vecop_list[] = {
612         INDEX_op_shri_vec, INDEX_op_add_vec, 0
613     };
614     static const GVecGen2i ops[4] = {
615         { .fni8 = gen_ursra8_i64,
616           .fniv = gen_ursra_vec,
617           .fno = gen_helper_gvec_ursra_b,
618           .opt_opc = vecop_list,
619           .load_dest = true,
620           .vece = MO_8 },
621         { .fni8 = gen_ursra16_i64,
622           .fniv = gen_ursra_vec,
623           .fno = gen_helper_gvec_ursra_h,
624           .opt_opc = vecop_list,
625           .load_dest = true,
626           .vece = MO_16 },
627         { .fni4 = gen_ursra32_i32,
628           .fniv = gen_ursra_vec,
629           .fno = gen_helper_gvec_ursra_s,
630           .opt_opc = vecop_list,
631           .load_dest = true,
632           .vece = MO_32 },
633         { .fni8 = gen_ursra64_i64,
634           .fniv = gen_ursra_vec,
635           .fno = gen_helper_gvec_ursra_d,
636           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
637           .opt_opc = vecop_list,
638           .load_dest = true,
639           .vece = MO_64 },
640     };
641 
642     /* tszimm encoding produces immediates in the range [1..esize] */
643     tcg_debug_assert(shift > 0);
644     tcg_debug_assert(shift <= (8 << vece));
645 
646     tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
647 }
648 
649 static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
650 {
651     uint64_t mask = dup_const(MO_8, 0xff >> shift);
652     TCGv_i64 t = tcg_temp_new_i64();
653 
654     tcg_gen_shri_i64(t, a, shift);
655     tcg_gen_andi_i64(t, t, mask);
656     tcg_gen_andi_i64(d, d, ~mask);
657     tcg_gen_or_i64(d, d, t);
658 }
659 
660 static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
661 {
662     uint64_t mask = dup_const(MO_16, 0xffff >> shift);
663     TCGv_i64 t = tcg_temp_new_i64();
664 
665     tcg_gen_shri_i64(t, a, shift);
666     tcg_gen_andi_i64(t, t, mask);
667     tcg_gen_andi_i64(d, d, ~mask);
668     tcg_gen_or_i64(d, d, t);
669 }
670 
671 static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
672 {
673     tcg_gen_shri_i32(a, a, shift);
674     tcg_gen_deposit_i32(d, d, a, 0, 32 - shift);
675 }
676 
677 static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
678 {
679     tcg_gen_shri_i64(a, a, shift);
680     tcg_gen_deposit_i64(d, d, a, 0, 64 - shift);
681 }
682 
683 static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
684 {
685     TCGv_vec t = tcg_temp_new_vec_matching(d);
686     int64_t mi = MAKE_64BIT_MASK((8 << vece) - sh, sh);
687     TCGv_vec m = tcg_constant_vec_matching(d, vece, mi);
688 
689     tcg_gen_shri_vec(vece, t, a, sh);
690     tcg_gen_and_vec(vece, d, d, m);
691     tcg_gen_or_vec(vece, d, d, t);
692 }
693 
694 void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
695                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
696 {
697     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
698     const GVecGen2i ops[4] = {
699         { .fni8 = gen_shr8_ins_i64,
700           .fniv = gen_shr_ins_vec,
701           .fno = gen_helper_gvec_sri_b,
702           .load_dest = true,
703           .opt_opc = vecop_list,
704           .vece = MO_8 },
705         { .fni8 = gen_shr16_ins_i64,
706           .fniv = gen_shr_ins_vec,
707           .fno = gen_helper_gvec_sri_h,
708           .load_dest = true,
709           .opt_opc = vecop_list,
710           .vece = MO_16 },
711         { .fni4 = gen_shr32_ins_i32,
712           .fniv = gen_shr_ins_vec,
713           .fno = gen_helper_gvec_sri_s,
714           .load_dest = true,
715           .opt_opc = vecop_list,
716           .vece = MO_32 },
717         { .fni8 = gen_shr64_ins_i64,
718           .fniv = gen_shr_ins_vec,
719           .fno = gen_helper_gvec_sri_d,
720           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
721           .load_dest = true,
722           .opt_opc = vecop_list,
723           .vece = MO_64 },
724     };
725 
726     /* tszimm encoding produces immediates in the range [1..esize]. */
727     tcg_debug_assert(shift > 0);
728     tcg_debug_assert(shift <= (8 << vece));
729 
730     /* Shift of esize leaves destination unchanged. */
731     if (shift < (8 << vece)) {
732         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
733     } else {
734         /* Nop, but we do need to clear the tail. */
735         tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
736     }
737 }
738 
739 static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
740 {
741     uint64_t mask = dup_const(MO_8, 0xff << shift);
742     TCGv_i64 t = tcg_temp_new_i64();
743 
744     tcg_gen_shli_i64(t, a, shift);
745     tcg_gen_andi_i64(t, t, mask);
746     tcg_gen_andi_i64(d, d, ~mask);
747     tcg_gen_or_i64(d, d, t);
748 }
749 
750 static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
751 {
752     uint64_t mask = dup_const(MO_16, 0xffff << shift);
753     TCGv_i64 t = tcg_temp_new_i64();
754 
755     tcg_gen_shli_i64(t, a, shift);
756     tcg_gen_andi_i64(t, t, mask);
757     tcg_gen_andi_i64(d, d, ~mask);
758     tcg_gen_or_i64(d, d, t);
759 }
760 
761 static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
762 {
763     tcg_gen_deposit_i32(d, d, a, shift, 32 - shift);
764 }
765 
766 static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
767 {
768     tcg_gen_deposit_i64(d, d, a, shift, 64 - shift);
769 }
770 
771 static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
772 {
773     TCGv_vec t = tcg_temp_new_vec_matching(d);
774     TCGv_vec m = tcg_constant_vec_matching(d, vece, MAKE_64BIT_MASK(0, sh));
775 
776     tcg_gen_shli_vec(vece, t, a, sh);
777     tcg_gen_and_vec(vece, d, d, m);
778     tcg_gen_or_vec(vece, d, d, t);
779 }
780 
781 void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
782                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
783 {
784     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
785     const GVecGen2i ops[4] = {
786         { .fni8 = gen_shl8_ins_i64,
787           .fniv = gen_shl_ins_vec,
788           .fno = gen_helper_gvec_sli_b,
789           .load_dest = true,
790           .opt_opc = vecop_list,
791           .vece = MO_8 },
792         { .fni8 = gen_shl16_ins_i64,
793           .fniv = gen_shl_ins_vec,
794           .fno = gen_helper_gvec_sli_h,
795           .load_dest = true,
796           .opt_opc = vecop_list,
797           .vece = MO_16 },
798         { .fni4 = gen_shl32_ins_i32,
799           .fniv = gen_shl_ins_vec,
800           .fno = gen_helper_gvec_sli_s,
801           .load_dest = true,
802           .opt_opc = vecop_list,
803           .vece = MO_32 },
804         { .fni8 = gen_shl64_ins_i64,
805           .fniv = gen_shl_ins_vec,
806           .fno = gen_helper_gvec_sli_d,
807           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
808           .load_dest = true,
809           .opt_opc = vecop_list,
810           .vece = MO_64 },
811     };
812 
813     /* tszimm encoding produces immediates in the range [0..esize-1]. */
814     tcg_debug_assert(shift >= 0);
815     tcg_debug_assert(shift < (8 << vece));
816 
817     if (shift == 0) {
818         tcg_gen_gvec_mov(vece, rd_ofs, rm_ofs, opr_sz, max_sz);
819     } else {
820         tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
821     }
822 }
823 
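/*
 * MLA/MLS per-element expanders.  The 8- and 16-bit fallbacks reuse the
 * packed Neon helpers, which operate on four u8 or two u16 lanes within
 * each 32-bit word; the 32- and 64-bit cases need only plain TCG
 * multiply and add/sub.
 */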
824 static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
825 {
826     gen_helper_neon_mul_u8(a, a, b);
827     gen_helper_neon_add_u8(d, d, a);
828 }
829 
830 static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
831 {
832     gen_helper_neon_mul_u8(a, a, b);
833     gen_helper_neon_sub_u8(d, d, a);
834 }
835 
836 static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
837 {
838     gen_helper_neon_mul_u16(a, a, b);
839     gen_helper_neon_add_u16(d, d, a);
840 }
841 
842 static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
843 {
844     gen_helper_neon_mul_u16(a, a, b);
845     gen_helper_neon_sub_u16(d, d, a);
846 }
847 
848 static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
849 {
850     tcg_gen_mul_i32(a, a, b);
851     tcg_gen_add_i32(d, d, a);
852 }
853 
854 static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
855 {
856     tcg_gen_mul_i32(a, a, b);
857     tcg_gen_sub_i32(d, d, a);
858 }
859 
860 static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
861 {
862     tcg_gen_mul_i64(a, a, b);
863     tcg_gen_add_i64(d, d, a);
864 }
865 
866 static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
867 {
868     tcg_gen_mul_i64(a, a, b);
869     tcg_gen_sub_i64(d, d, a);
870 }
871 
872 static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
873 {
874     tcg_gen_mul_vec(vece, a, a, b);
875     tcg_gen_add_vec(vece, d, d, a);
876 }
877 
878 static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
879 {
880     tcg_gen_mul_vec(vece, a, a, b);
881     tcg_gen_sub_vec(vece, d, d, a);
882 }
883 
884 /* Note that while NEON does not support VMLA and VMLS as 64-bit ops,
885  * these tables are shared with AArch64, which does support them.
886  */
887 void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
888                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
889 {
890     static const TCGOpcode vecop_list[] = {
891         INDEX_op_mul_vec, INDEX_op_add_vec, 0
892     };
893     static const GVecGen3 ops[4] = {
894         { .fni4 = gen_mla8_i32,
895           .fniv = gen_mla_vec,
896           .load_dest = true,
897           .opt_opc = vecop_list,
898           .vece = MO_8 },
899         { .fni4 = gen_mla16_i32,
900           .fniv = gen_mla_vec,
901           .load_dest = true,
902           .opt_opc = vecop_list,
903           .vece = MO_16 },
904         { .fni4 = gen_mla32_i32,
905           .fniv = gen_mla_vec,
906           .load_dest = true,
907           .opt_opc = vecop_list,
908           .vece = MO_32 },
909         { .fni8 = gen_mla64_i64,
910           .fniv = gen_mla_vec,
911           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
912           .load_dest = true,
913           .opt_opc = vecop_list,
914           .vece = MO_64 },
915     };
916     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
917 }
918 
919 void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
920                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
921 {
922     static const TCGOpcode vecop_list[] = {
923         INDEX_op_mul_vec, INDEX_op_sub_vec, 0
924     };
925     static const GVecGen3 ops[4] = {
926         { .fni4 = gen_mls8_i32,
927           .fniv = gen_mls_vec,
928           .load_dest = true,
929           .opt_opc = vecop_list,
930           .vece = MO_8 },
931         { .fni4 = gen_mls16_i32,
932           .fniv = gen_mls_vec,
933           .load_dest = true,
934           .opt_opc = vecop_list,
935           .vece = MO_16 },
936         { .fni4 = gen_mls32_i32,
937           .fniv = gen_mls_vec,
938           .load_dest = true,
939           .opt_opc = vecop_list,
940           .vece = MO_32 },
941         { .fni8 = gen_mls64_i64,
942           .fniv = gen_mls_vec,
943           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
944           .load_dest = true,
945           .opt_opc = vecop_list,
946           .vece = MO_64 },
947     };
948     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
949 }
950 
951 /* CMTST: test is "if ((X & Y) != 0)"; true elements become all ones. */
952 static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
953 {
954     tcg_gen_negsetcond_i32(TCG_COND_TSTNE, d, a, b);
955 }
956 
957 void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
958 {
959     tcg_gen_negsetcond_i64(TCG_COND_TSTNE, d, a, b);
960 }
961 
962 static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
963 {
964     tcg_gen_cmp_vec(TCG_COND_TSTNE, vece, d, a, b);
965 }
966 
967 void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
968                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
969 {
970     static const TCGOpcode vecop_list[] = { INDEX_op_cmp_vec, 0 };
971     static const GVecGen3 ops[4] = {
972         { .fni4 = gen_helper_neon_tst_u8,
973           .fniv = gen_cmtst_vec,
974           .opt_opc = vecop_list,
975           .vece = MO_8 },
976         { .fni4 = gen_helper_neon_tst_u16,
977           .fniv = gen_cmtst_vec,
978           .opt_opc = vecop_list,
979           .vece = MO_16 },
980         { .fni4 = gen_cmtst_i32,
981           .fniv = gen_cmtst_vec,
982           .opt_opc = vecop_list,
983           .vece = MO_32 },
984         { .fni8 = gen_cmtst_i64,
985           .fniv = gen_cmtst_vec,
986           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
987           .opt_opc = vecop_list,
988           .vece = MO_64 },
989     };
990     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
991 }
992 
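/*
 * USHL with a register shift count: the count is the signed low byte of
 * each element, so a positive count shifts left and a negative count
 * shifts right.  Both candidate results are computed and the out-of-range
 * one is discarded with movcond, which also yields zero when the count's
 * magnitude reaches the element size.
 */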
993 void gen_ushl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
994 {
995     TCGv_i32 lval = tcg_temp_new_i32();
996     TCGv_i32 rval = tcg_temp_new_i32();
997     TCGv_i32 lsh = tcg_temp_new_i32();
998     TCGv_i32 rsh = tcg_temp_new_i32();
999     TCGv_i32 zero = tcg_constant_i32(0);
1000     TCGv_i32 max = tcg_constant_i32(32);
1001 
1002     /*
1003      * Rely on the TCG guarantee that out of range shifts produce
1004      * unspecified results, not undefined behaviour (i.e. no trap).
1005      * Discard out-of-range results after the fact.
1006      */
1007     tcg_gen_ext8s_i32(lsh, shift);
1008     tcg_gen_neg_i32(rsh, lsh);
1009     tcg_gen_shl_i32(lval, src, lsh);
1010     tcg_gen_shr_i32(rval, src, rsh);
1011     tcg_gen_movcond_i32(TCG_COND_LTU, dst, lsh, max, lval, zero);
1012     tcg_gen_movcond_i32(TCG_COND_LTU, dst, rsh, max, rval, dst);
1013 }
1014 
1015 void gen_ushl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
1016 {
1017     TCGv_i64 lval = tcg_temp_new_i64();
1018     TCGv_i64 rval = tcg_temp_new_i64();
1019     TCGv_i64 lsh = tcg_temp_new_i64();
1020     TCGv_i64 rsh = tcg_temp_new_i64();
1021     TCGv_i64 zero = tcg_constant_i64(0);
1022     TCGv_i64 max = tcg_constant_i64(64);
1023 
1024     /*
1025      * Rely on the TCG guarantee that out of range shifts produce
1026      * unspecified results, not undefined behaviour (i.e. no trap).
1027      * Discard out-of-range results after the fact.
1028      */
1029     tcg_gen_ext8s_i64(lsh, shift);
1030     tcg_gen_neg_i64(rsh, lsh);
1031     tcg_gen_shl_i64(lval, src, lsh);
1032     tcg_gen_shr_i64(rval, src, rsh);
1033     tcg_gen_movcond_i64(TCG_COND_LTU, dst, lsh, max, lval, zero);
1034     tcg_gen_movcond_i64(TCG_COND_LTU, dst, rsh, max, rval, dst);
1035 }
1036 
1037 static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
1038                          TCGv_vec src, TCGv_vec shift)
1039 {
1040     TCGv_vec lval = tcg_temp_new_vec_matching(dst);
1041     TCGv_vec rval = tcg_temp_new_vec_matching(dst);
1042     TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
1043     TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
1044     TCGv_vec max;
1045 
1046     tcg_gen_neg_vec(vece, rsh, shift);
1047     if (vece == MO_8) {
1048         tcg_gen_mov_vec(lsh, shift);
1049     } else {
1050         TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
1051         tcg_gen_and_vec(vece, lsh, shift, msk);
1052         tcg_gen_and_vec(vece, rsh, rsh, msk);
1053     }
1054 
1055     /*
1056      * Rely on the TCG guarantee that out of range shifts produce
1057      * unspecified results, not undefined behaviour (i.e. no trap).
1058      * Discard out-of-range results after the fact.
1059      */
1060     tcg_gen_shlv_vec(vece, lval, src, lsh);
1061     tcg_gen_shrv_vec(vece, rval, src, rsh);
1062 
1063     /*
1064      * The choice of LT (signed) and GEU (unsigned) is biased toward
1065      * the instructions of the x86_64 host.  For MO_8, the whole byte
1066      * is significant so we must use an unsigned compare; otherwise we
1067      * have already masked to a byte and so a signed compare works.
1068      * Other tcg hosts have a full set of comparisons and do not care.
1069      */
1070     max = tcg_constant_vec_matching(dst, vece, 8 << vece);
1071     if (vece == MO_8) {
1072         tcg_gen_cmp_vec(TCG_COND_GEU, vece, lsh, lsh, max);
1073         tcg_gen_cmp_vec(TCG_COND_GEU, vece, rsh, rsh, max);
1074         tcg_gen_andc_vec(vece, lval, lval, lsh);
1075         tcg_gen_andc_vec(vece, rval, rval, rsh);
1076     } else {
1077         tcg_gen_cmp_vec(TCG_COND_LT, vece, lsh, lsh, max);
1078         tcg_gen_cmp_vec(TCG_COND_LT, vece, rsh, rsh, max);
1079         tcg_gen_and_vec(vece, lval, lval, lsh);
1080         tcg_gen_and_vec(vece, rval, rval, rsh);
1081     }
1082     tcg_gen_or_vec(vece, dst, lval, rval);
1083 }
1084 
1085 void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1086                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1087 {
1088     static const TCGOpcode vecop_list[] = {
1089         INDEX_op_neg_vec, INDEX_op_shlv_vec,
1090         INDEX_op_shrv_vec, INDEX_op_cmp_vec, 0
1091     };
1092     static const GVecGen3 ops[4] = {
1093         { .fniv = gen_ushl_vec,
1094           .fno = gen_helper_gvec_ushl_b,
1095           .opt_opc = vecop_list,
1096           .vece = MO_8 },
1097         { .fniv = gen_ushl_vec,
1098           .fno = gen_helper_gvec_ushl_h,
1099           .opt_opc = vecop_list,
1100           .vece = MO_16 },
1101         { .fni4 = gen_ushl_i32,
1102           .fniv = gen_ushl_vec,
1103           .opt_opc = vecop_list,
1104           .vece = MO_32 },
1105         { .fni8 = gen_ushl_i64,
1106           .fniv = gen_ushl_vec,
1107           .opt_opc = vecop_list,
1108           .vece = MO_64 },
1109     };
1110     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1111 }
1112 
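/*
 * SSHL is the signed counterpart: right shifts are arithmetic, and the
 * negated count is clamped with umin to esize-1 so an over-large right
 * shift still produces all sign bits.  The movcond pair then zeroes
 * over-large left shifts and selects between the left- and right-shift
 * results on the sign of the count.
 */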
1113 void gen_sshl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
1114 {
1115     TCGv_i32 lval = tcg_temp_new_i32();
1116     TCGv_i32 rval = tcg_temp_new_i32();
1117     TCGv_i32 lsh = tcg_temp_new_i32();
1118     TCGv_i32 rsh = tcg_temp_new_i32();
1119     TCGv_i32 zero = tcg_constant_i32(0);
1120     TCGv_i32 max = tcg_constant_i32(31);
1121 
1122     /*
1123      * Rely on the TCG guarantee that out of range shifts produce
1124      * unspecified results, not undefined behaviour (i.e. no trap).
1125      * Discard out-of-range results after the fact.
1126      */
1127     tcg_gen_ext8s_i32(lsh, shift);
1128     tcg_gen_neg_i32(rsh, lsh);
1129     tcg_gen_shl_i32(lval, src, lsh);
1130     tcg_gen_umin_i32(rsh, rsh, max);
1131     tcg_gen_sar_i32(rval, src, rsh);
1132     tcg_gen_movcond_i32(TCG_COND_LEU, lval, lsh, max, lval, zero);
1133     tcg_gen_movcond_i32(TCG_COND_LT, dst, lsh, zero, rval, lval);
1134 }
1135 
1136 void gen_sshl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
1137 {
1138     TCGv_i64 lval = tcg_temp_new_i64();
1139     TCGv_i64 rval = tcg_temp_new_i64();
1140     TCGv_i64 lsh = tcg_temp_new_i64();
1141     TCGv_i64 rsh = tcg_temp_new_i64();
1142     TCGv_i64 zero = tcg_constant_i64(0);
1143     TCGv_i64 max = tcg_constant_i64(63);
1144 
1145     /*
1146      * Rely on the TCG guarantee that out of range shifts produce
1147      * unspecified results, not undefined behaviour (i.e. no trap).
1148      * Discard out-of-range results after the fact.
1149      */
1150     tcg_gen_ext8s_i64(lsh, shift);
1151     tcg_gen_neg_i64(rsh, lsh);
1152     tcg_gen_shl_i64(lval, src, lsh);
1153     tcg_gen_umin_i64(rsh, rsh, max);
1154     tcg_gen_sar_i64(rval, src, rsh);
1155     tcg_gen_movcond_i64(TCG_COND_LEU, lval, lsh, max, lval, zero);
1156     tcg_gen_movcond_i64(TCG_COND_LT, dst, lsh, zero, rval, lval);
1157 }
1158 
1159 static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
1160                          TCGv_vec src, TCGv_vec shift)
1161 {
1162     TCGv_vec lval = tcg_temp_new_vec_matching(dst);
1163     TCGv_vec rval = tcg_temp_new_vec_matching(dst);
1164     TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
1165     TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
1166     TCGv_vec tmp = tcg_temp_new_vec_matching(dst);
1167     TCGv_vec max, zero;
1168 
1169     /*
1170      * Rely on the TCG guarantee that out of range shifts produce
1171      * unspecified results, not undefined behaviour (i.e. no trap).
1172      * Discard out-of-range results after the fact.
1173      */
1174     tcg_gen_neg_vec(vece, rsh, shift);
1175     if (vece == MO_8) {
1176         tcg_gen_mov_vec(lsh, shift);
1177     } else {
1178         TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
1179         tcg_gen_and_vec(vece, lsh, shift, msk);
1180         tcg_gen_and_vec(vece, rsh, rsh, msk);
1181     }
1182 
1183     /* Bound rsh so that an out-of-range right shift yields -1 (all sign bits).  */
1184     max = tcg_constant_vec_matching(dst, vece, (8 << vece) - 1);
1185     tcg_gen_umin_vec(vece, rsh, rsh, max);
1186     tcg_gen_cmp_vec(TCG_COND_GT, vece, tmp, lsh, max);
1187 
1188     tcg_gen_shlv_vec(vece, lval, src, lsh);
1189     tcg_gen_sarv_vec(vece, rval, src, rsh);
1190 
1191     /* Select in-bound left shift.  */
1192     tcg_gen_andc_vec(vece, lval, lval, tmp);
1193 
1194     /* Select between left and right shift.  */
1195     zero = tcg_constant_vec_matching(dst, vece, 0);
1196     if (vece == MO_8) {
1197         tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, zero, rval, lval);
1198     } else {
1199         TCGv_vec sgn = tcg_constant_vec_matching(dst, vece, 0x80);
1200         tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, sgn, lval, rval);
1201     }
1202 }
1203 
1204 void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1205                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1206 {
1207     static const TCGOpcode vecop_list[] = {
1208         INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec,
1209         INDEX_op_sarv_vec, INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0
1210     };
1211     static const GVecGen3 ops[4] = {
1212         { .fniv = gen_sshl_vec,
1213           .fno = gen_helper_gvec_sshl_b,
1214           .opt_opc = vecop_list,
1215           .vece = MO_8 },
1216         { .fniv = gen_sshl_vec,
1217           .fno = gen_helper_gvec_sshl_h,
1218           .opt_opc = vecop_list,
1219           .vece = MO_16 },
1220         { .fni4 = gen_sshl_i32,
1221           .fniv = gen_sshl_vec,
1222           .opt_opc = vecop_list,
1223           .vece = MO_32 },
1224         { .fni8 = gen_sshl_i64,
1225           .fniv = gen_sshl_vec,
1226           .opt_opc = vecop_list,
1227           .vece = MO_64 },
1228     };
1229     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1230 }
1231 
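/*
 * The rounding and saturating register-shift variants are expanded only
 * via out-of-line helpers; the saturating ones additionally pass tcg_env
 * so the helpers can update the QC flag.
 */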
1232 void gen_gvec_srshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1233                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1234 {
1235     static gen_helper_gvec_3 * const fns[] = {
1236         gen_helper_gvec_srshl_b, gen_helper_gvec_srshl_h,
1237         gen_helper_gvec_srshl_s, gen_helper_gvec_srshl_d,
1238     };
1239     tcg_debug_assert(vece <= MO_64);
1240     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1241 }
1242 
1243 void gen_gvec_urshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1244                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1245 {
1246     static gen_helper_gvec_3 * const fns[] = {
1247         gen_helper_gvec_urshl_b, gen_helper_gvec_urshl_h,
1248         gen_helper_gvec_urshl_s, gen_helper_gvec_urshl_d,
1249     };
1250     tcg_debug_assert(vece <= MO_64);
1251     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1252 }
1253 
1254 void gen_neon_sqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1255                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1256 {
1257     static gen_helper_gvec_3_ptr * const fns[] = {
1258         gen_helper_neon_sqshl_b, gen_helper_neon_sqshl_h,
1259         gen_helper_neon_sqshl_s, gen_helper_neon_sqshl_d,
1260     };
1261     tcg_debug_assert(vece <= MO_64);
1262     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1263                        opr_sz, max_sz, 0, fns[vece]);
1264 }
1265 
1266 void gen_neon_uqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1267                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1268 {
1269     static gen_helper_gvec_3_ptr * const fns[] = {
1270         gen_helper_neon_uqshl_b, gen_helper_neon_uqshl_h,
1271         gen_helper_neon_uqshl_s, gen_helper_neon_uqshl_d,
1272     };
1273     tcg_debug_assert(vece <= MO_64);
1274     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1275                        opr_sz, max_sz, 0, fns[vece]);
1276 }
1277 
1278 void gen_neon_sqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1279                      uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1280 {
1281     static gen_helper_gvec_3_ptr * const fns[] = {
1282         gen_helper_neon_sqrshl_b, gen_helper_neon_sqrshl_h,
1283         gen_helper_neon_sqrshl_s, gen_helper_neon_sqrshl_d,
1284     };
1285     tcg_debug_assert(vece <= MO_64);
1286     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1287                        opr_sz, max_sz, 0, fns[vece]);
1288 }
1289 
1290 void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1291                      uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1292 {
1293     static gen_helper_gvec_3_ptr * const fns[] = {
1294         gen_helper_neon_uqrshl_b, gen_helper_neon_uqrshl_h,
1295         gen_helper_neon_uqrshl_s, gen_helper_neon_uqrshl_d,
1296     };
1297     tcg_debug_assert(vece <= MO_64);
1298     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1299                        opr_sz, max_sz, 0, fns[vece]);
1300 }
1301 
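/*
 * Saturating arithmetic with QC accumulation.  The common pattern is to
 * compute both the wrapped and the saturated result, XOR them (non-zero
 * iff saturation occurred) and OR that into 'qc'.  For the byte/half/word
 * cases the 64-bit inputs are expected to hold the zero- or sign-extended
 * element, so e.g. a uqadd of 0xff + 0x02 at MO_8 gives tmp == 0x101,
 * res == 0xff, and the XOR of the two sets QC.
 */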
1302 void gen_uqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1303 {
1304     uint64_t max = MAKE_64BIT_MASK(0, 8 << esz);
1305     TCGv_i64 tmp = tcg_temp_new_i64();
1306 
1307     tcg_gen_add_i64(tmp, a, b);
1308     tcg_gen_umin_i64(res, tmp, tcg_constant_i64(max));
1309     tcg_gen_xor_i64(tmp, tmp, res);
1310     tcg_gen_or_i64(qc, qc, tmp);
1311 }
1312 
1313 void gen_uqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1314 {
1315     TCGv_i64 t = tcg_temp_new_i64();
1316 
1317     tcg_gen_add_i64(t, a, b);
1318     tcg_gen_movcond_i64(TCG_COND_LTU, res, t, a,
1319                         tcg_constant_i64(UINT64_MAX), t);
1320     tcg_gen_xor_i64(t, t, res);
1321     tcg_gen_or_i64(qc, qc, t);
1322 }
1323 
1324 static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1325                           TCGv_vec a, TCGv_vec b)
1326 {
1327     TCGv_vec x = tcg_temp_new_vec_matching(t);
1328     tcg_gen_add_vec(vece, x, a, b);
1329     tcg_gen_usadd_vec(vece, t, a, b);
1330     tcg_gen_xor_vec(vece, x, x, t);
1331     tcg_gen_or_vec(vece, qc, qc, x);
1332 }
1333 
1334 void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1335                        uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1336 {
1337     static const TCGOpcode vecop_list[] = {
1338         INDEX_op_usadd_vec, INDEX_op_add_vec, 0
1339     };
1340     static const GVecGen4 ops[4] = {
1341         { .fniv = gen_uqadd_vec,
1342           .fno = gen_helper_gvec_uqadd_b,
1343           .write_aofs = true,
1344           .opt_opc = vecop_list,
1345           .vece = MO_8 },
1346         { .fniv = gen_uqadd_vec,
1347           .fno = gen_helper_gvec_uqadd_h,
1348           .write_aofs = true,
1349           .opt_opc = vecop_list,
1350           .vece = MO_16 },
1351         { .fniv = gen_uqadd_vec,
1352           .fno = gen_helper_gvec_uqadd_s,
1353           .write_aofs = true,
1354           .opt_opc = vecop_list,
1355           .vece = MO_32 },
1356         { .fniv = gen_uqadd_vec,
1357           .fni8 = gen_uqadd_d,
1358           .fno = gen_helper_gvec_uqadd_d,
1359           .write_aofs = true,
1360           .opt_opc = vecop_list,
1361           .vece = MO_64 },
1362     };
1363 
1364     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1365     tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1366                    rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1367 }
1368 
1369 void gen_sqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1370 {
1371     int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
1372     int64_t min = -1ll - max;
1373     TCGv_i64 tmp = tcg_temp_new_i64();
1374 
1375     tcg_gen_add_i64(tmp, a, b);
1376     tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
1377     tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
1378     tcg_gen_xor_i64(tmp, tmp, res);
1379     tcg_gen_or_i64(qc, qc, tmp);
1380 }
1381 
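/*
 * 64-bit signed saturating add: overflow occurred iff the operands have
 * the same sign and the result's sign differs from it, i.e.
 * ~(a ^ b) & ((a + b) ^ a) has its top bit set.  On overflow the result
 * saturates to INT64_MAX or INT64_MIN according to the sign of 'a',
 * which is what the sari/xori pair computes into t2.
 */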
1382 void gen_sqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1383 {
1384     TCGv_i64 t0 = tcg_temp_new_i64();
1385     TCGv_i64 t1 = tcg_temp_new_i64();
1386     TCGv_i64 t2 = tcg_temp_new_i64();
1387 
1388     tcg_gen_add_i64(t0, a, b);
1389 
1390     /* Compute signed overflow indication into T1 */
1391     tcg_gen_xor_i64(t1, a, b);
1392     tcg_gen_xor_i64(t2, t0, a);
1393     tcg_gen_andc_i64(t1, t2, t1);
1394 
1395     /* Compute saturated value into T2 */
1396     tcg_gen_sari_i64(t2, a, 63);
1397     tcg_gen_xori_i64(t2, t2, INT64_MAX);
1398 
1399     tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
1400     tcg_gen_xor_i64(t0, t0, res);
1401     tcg_gen_or_i64(qc, qc, t0);
1402 }
1403 
1404 static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1405                           TCGv_vec a, TCGv_vec b)
1406 {
1407     TCGv_vec x = tcg_temp_new_vec_matching(t);
1408     tcg_gen_add_vec(vece, x, a, b);
1409     tcg_gen_ssadd_vec(vece, t, a, b);
1410     tcg_gen_xor_vec(vece, x, x, t);
1411     tcg_gen_or_vec(vece, qc, qc, x);
1412 }
1413 
1414 void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1415                        uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1416 {
1417     static const TCGOpcode vecop_list[] = {
1418         INDEX_op_ssadd_vec, INDEX_op_add_vec, 0
1419     };
1420     static const GVecGen4 ops[4] = {
1421         { .fniv = gen_sqadd_vec,
1422           .fno = gen_helper_gvec_sqadd_b,
1423           .opt_opc = vecop_list,
1424           .write_aofs = true,
1425           .vece = MO_8 },
1426         { .fniv = gen_sqadd_vec,
1427           .fno = gen_helper_gvec_sqadd_h,
1428           .opt_opc = vecop_list,
1429           .write_aofs = true,
1430           .vece = MO_16 },
1431         { .fniv = gen_sqadd_vec,
1432           .fno = gen_helper_gvec_sqadd_s,
1433           .opt_opc = vecop_list,
1434           .write_aofs = true,
1435           .vece = MO_32 },
1436         { .fniv = gen_sqadd_vec,
1437           .fni8 = gen_sqadd_d,
1438           .fno = gen_helper_gvec_sqadd_d,
1439           .opt_opc = vecop_list,
1440           .write_aofs = true,
1441           .vece = MO_64 },
1442     };
1443 
1444     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1445     tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1446                    rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1447 }
1448 
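/*
 * Unsigned saturating subtract: for the byte/half/word cases the 64-bit
 * difference keeps its true sign, so clamping with smax against zero
 * saturates to 0; the 64-bit case substitutes zero directly when a < b.
 */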
1449 void gen_uqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1450 {
1451     TCGv_i64 tmp = tcg_temp_new_i64();
1452 
1453     tcg_gen_sub_i64(tmp, a, b);
1454     tcg_gen_smax_i64(res, tmp, tcg_constant_i64(0));
1455     tcg_gen_xor_i64(tmp, tmp, res);
1456     tcg_gen_or_i64(qc, qc, tmp);
1457 }
1458 
1459 void gen_uqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1460 {
1461     TCGv_i64 t = tcg_temp_new_i64();
1462 
1463     tcg_gen_sub_i64(t, a, b);
1464     tcg_gen_movcond_i64(TCG_COND_LTU, res, a, b, tcg_constant_i64(0), t);
1465     tcg_gen_xor_i64(t, t, res);
1466     tcg_gen_or_i64(qc, qc, t);
1467 }
1468 
1469 static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1470                           TCGv_vec a, TCGv_vec b)
1471 {
1472     TCGv_vec x = tcg_temp_new_vec_matching(t);
1473     tcg_gen_sub_vec(vece, x, a, b);
1474     tcg_gen_ussub_vec(vece, t, a, b);
1475     tcg_gen_xor_vec(vece, x, x, t);
1476     tcg_gen_or_vec(vece, qc, qc, x);
1477 }
1478 
1479 void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1480                        uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1481 {
1482     static const TCGOpcode vecop_list[] = {
1483         INDEX_op_ussub_vec, INDEX_op_sub_vec, 0
1484     };
1485     static const GVecGen4 ops[4] = {
1486         { .fniv = gen_uqsub_vec,
1487           .fno = gen_helper_gvec_uqsub_b,
1488           .opt_opc = vecop_list,
1489           .write_aofs = true,
1490           .vece = MO_8 },
1491         { .fniv = gen_uqsub_vec,
1492           .fno = gen_helper_gvec_uqsub_h,
1493           .opt_opc = vecop_list,
1494           .write_aofs = true,
1495           .vece = MO_16 },
1496         { .fniv = gen_uqsub_vec,
1497           .fno = gen_helper_gvec_uqsub_s,
1498           .opt_opc = vecop_list,
1499           .write_aofs = true,
1500           .vece = MO_32 },
1501         { .fniv = gen_uqsub_vec,
1502           .fni8 = gen_uqsub_d,
1503           .fno = gen_helper_gvec_uqsub_d,
1504           .opt_opc = vecop_list,
1505           .write_aofs = true,
1506           .vece = MO_64 },
1507     };
1508 
1509     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1510     tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1511                    rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1512 }
1513 
1514 void gen_sqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1515 {
1516     int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
1517     int64_t min = -1ll - max;
1518     TCGv_i64 tmp = tcg_temp_new_i64();
1519 
1520     tcg_gen_sub_i64(tmp, a, b);
1521     tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
1522     tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
1523     tcg_gen_xor_i64(tmp, tmp, res);
1524     tcg_gen_or_i64(qc, qc, tmp);
1525 }
1526 
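/*
 * 64-bit signed saturating subtract: overflow occurred iff the operands
 * differ in sign and the result's sign differs from 'a', i.e.
 * (a ^ b) & ((a - b) ^ a) has its top bit set; the saturated value is
 * then chosen from the sign of 'a' exactly as in gen_sqadd_d above.
 */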
1527 void gen_sqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1528 {
1529     TCGv_i64 t0 = tcg_temp_new_i64();
1530     TCGv_i64 t1 = tcg_temp_new_i64();
1531     TCGv_i64 t2 = tcg_temp_new_i64();
1532 
1533     tcg_gen_sub_i64(t0, a, b);
1534 
1535     /* Compute signed overflow indication into T1 */
1536     tcg_gen_xor_i64(t1, a, b);
1537     tcg_gen_xor_i64(t2, t0, a);
1538     tcg_gen_and_i64(t1, t1, t2);
1539 
1540     /* Compute saturated value into T2 */
1541     tcg_gen_sari_i64(t2, a, 63);
1542     tcg_gen_xori_i64(t2, t2, INT64_MAX);
1543 
1544     tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
1545     tcg_gen_xor_i64(t0, t0, res);
1546     tcg_gen_or_i64(qc, qc, t0);
1547 }
1548 
1549 static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1550                           TCGv_vec a, TCGv_vec b)
1551 {
1552     TCGv_vec x = tcg_temp_new_vec_matching(t);
1553     tcg_gen_sub_vec(vece, x, a, b);
1554     tcg_gen_sssub_vec(vece, t, a, b);
1555     tcg_gen_xor_vec(vece, x, x, t);
1556     tcg_gen_or_vec(vece, qc, qc, x);
1557 }
1558 
1559 void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1560                        uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1561 {
1562     static const TCGOpcode vecop_list[] = {
1563         INDEX_op_sssub_vec, INDEX_op_sub_vec, 0
1564     };
1565     static const GVecGen4 ops[4] = {
1566         { .fniv = gen_sqsub_vec,
1567           .fno = gen_helper_gvec_sqsub_b,
1568           .opt_opc = vecop_list,
1569           .write_aofs = true,
1570           .vece = MO_8 },
1571         { .fniv = gen_sqsub_vec,
1572           .fno = gen_helper_gvec_sqsub_h,
1573           .opt_opc = vecop_list,
1574           .write_aofs = true,
1575           .vece = MO_16 },
1576         { .fniv = gen_sqsub_vec,
1577           .fno = gen_helper_gvec_sqsub_s,
1578           .opt_opc = vecop_list,
1579           .write_aofs = true,
1580           .vece = MO_32 },
1581         { .fniv = gen_sqsub_vec,
1582           .fni8 = gen_sqsub_d,
1583           .fno = gen_helper_gvec_sqsub_d,
1584           .opt_opc = vecop_list,
1585           .write_aofs = true,
1586           .vece = MO_64 },
1587     };
1588 
1589     tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1590     tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1591                    rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1592 }
1593 
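/*
 * Signed absolute difference: d = |a - b|.  The scalar forms select
 * between (a - b) and (b - a) with a signed compare; the vector form
 * computes smax(a, b) - smin(a, b).
 */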
1594 static void gen_sabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1595 {
1596     TCGv_i32 t = tcg_temp_new_i32();
1597 
1598     tcg_gen_sub_i32(t, a, b);
1599     tcg_gen_sub_i32(d, b, a);
1600     tcg_gen_movcond_i32(TCG_COND_LT, d, a, b, d, t);
1601 }
1602 
1603 static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1604 {
1605     TCGv_i64 t = tcg_temp_new_i64();
1606 
1607     tcg_gen_sub_i64(t, a, b);
1608     tcg_gen_sub_i64(d, b, a);
1609     tcg_gen_movcond_i64(TCG_COND_LT, d, a, b, d, t);
1610 }
1611 
1612 static void gen_sabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1613 {
1614     TCGv_vec t = tcg_temp_new_vec_matching(d);
1615 
1616     tcg_gen_smin_vec(vece, t, a, b);
1617     tcg_gen_smax_vec(vece, d, a, b);
1618     tcg_gen_sub_vec(vece, d, d, t);
1619 }
1620 
1621 void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1622                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1623 {
1624     static const TCGOpcode vecop_list[] = {
1625         INDEX_op_sub_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0
1626     };
1627     static const GVecGen3 ops[4] = {
1628         { .fniv = gen_sabd_vec,
1629           .fno = gen_helper_gvec_sabd_b,
1630           .opt_opc = vecop_list,
1631           .vece = MO_8 },
1632         { .fniv = gen_sabd_vec,
1633           .fno = gen_helper_gvec_sabd_h,
1634           .opt_opc = vecop_list,
1635           .vece = MO_16 },
1636         { .fni4 = gen_sabd_i32,
1637           .fniv = gen_sabd_vec,
1638           .fno = gen_helper_gvec_sabd_s,
1639           .opt_opc = vecop_list,
1640           .vece = MO_32 },
1641         { .fni8 = gen_sabd_i64,
1642           .fniv = gen_sabd_vec,
1643           .fno = gen_helper_gvec_sabd_d,
1644           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1645           .opt_opc = vecop_list,
1646           .vece = MO_64 },
1647     };
1648     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1649 }
1650 
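/*
 * Unsigned absolute difference: as for SABD above, but using unsigned
 * compares and umax/umin.
 */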
1651 static void gen_uabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1652 {
1653     TCGv_i32 t = tcg_temp_new_i32();
1654 
1655     tcg_gen_sub_i32(t, a, b);
1656     tcg_gen_sub_i32(d, b, a);
1657     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, d, t);
1658 }
1659 
1660 static void gen_uabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1661 {
1662     TCGv_i64 t = tcg_temp_new_i64();
1663 
1664     tcg_gen_sub_i64(t, a, b);
1665     tcg_gen_sub_i64(d, b, a);
1666     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, d, t);
1667 }
1668 
1669 static void gen_uabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1670 {
1671     TCGv_vec t = tcg_temp_new_vec_matching(d);
1672 
1673     tcg_gen_umin_vec(vece, t, a, b);
1674     tcg_gen_umax_vec(vece, d, a, b);
1675     tcg_gen_sub_vec(vece, d, d, t);
1676 }
1677 
1678 void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1679                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1680 {
1681     static const TCGOpcode vecop_list[] = {
1682         INDEX_op_sub_vec, INDEX_op_umin_vec, INDEX_op_umax_vec, 0
1683     };
1684     static const GVecGen3 ops[4] = {
1685         { .fniv = gen_uabd_vec,
1686           .fno = gen_helper_gvec_uabd_b,
1687           .opt_opc = vecop_list,
1688           .vece = MO_8 },
1689         { .fniv = gen_uabd_vec,
1690           .fno = gen_helper_gvec_uabd_h,
1691           .opt_opc = vecop_list,
1692           .vece = MO_16 },
1693         { .fni4 = gen_uabd_i32,
1694           .fniv = gen_uabd_vec,
1695           .fno = gen_helper_gvec_uabd_s,
1696           .opt_opc = vecop_list,
1697           .vece = MO_32 },
1698         { .fni8 = gen_uabd_i64,
1699           .fniv = gen_uabd_vec,
1700           .fno = gen_helper_gvec_uabd_d,
1701           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1702           .opt_opc = vecop_list,
1703           .vece = MO_64 },
1704     };
1705     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1706 }
1707 
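/* Signed absolute difference and accumulate: d += |a - b|. */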
1708 static void gen_saba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1709 {
1710     TCGv_i32 t = tcg_temp_new_i32();
1711     gen_sabd_i32(t, a, b);
1712     tcg_gen_add_i32(d, d, t);
1713 }
1714 
1715 static void gen_saba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1716 {
1717     TCGv_i64 t = tcg_temp_new_i64();
1718     gen_sabd_i64(t, a, b);
1719     tcg_gen_add_i64(d, d, t);
1720 }
1721 
1722 static void gen_saba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1723 {
1724     TCGv_vec t = tcg_temp_new_vec_matching(d);
1725     gen_sabd_vec(vece, t, a, b);
1726     tcg_gen_add_vec(vece, d, d, t);
1727 }
1728 
1729 void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1730                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1731 {
1732     static const TCGOpcode vecop_list[] = {
1733         INDEX_op_sub_vec, INDEX_op_add_vec,
1734         INDEX_op_smin_vec, INDEX_op_smax_vec, 0
1735     };
1736     static const GVecGen3 ops[4] = {
1737         { .fniv = gen_saba_vec,
1738           .fno = gen_helper_gvec_saba_b,
1739           .opt_opc = vecop_list,
1740           .load_dest = true,
1741           .vece = MO_8 },
1742         { .fniv = gen_saba_vec,
1743           .fno = gen_helper_gvec_saba_h,
1744           .opt_opc = vecop_list,
1745           .load_dest = true,
1746           .vece = MO_16 },
1747         { .fni4 = gen_saba_i32,
1748           .fniv = gen_saba_vec,
1749           .fno = gen_helper_gvec_saba_s,
1750           .opt_opc = vecop_list,
1751           .load_dest = true,
1752           .vece = MO_32 },
1753         { .fni8 = gen_saba_i64,
1754           .fniv = gen_saba_vec,
1755           .fno = gen_helper_gvec_saba_d,
1756           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1757           .opt_opc = vecop_list,
1758           .load_dest = true,
1759           .vece = MO_64 },
1760     };
1761     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1762 }
1763 
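/* Unsigned absolute difference and accumulate: d += |a - b|. */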
1764 static void gen_uaba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1765 {
1766     TCGv_i32 t = tcg_temp_new_i32();
1767     gen_uabd_i32(t, a, b);
1768     tcg_gen_add_i32(d, d, t);
1769 }
1770 
1771 static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1772 {
1773     TCGv_i64 t = tcg_temp_new_i64();
1774     gen_uabd_i64(t, a, b);
1775     tcg_gen_add_i64(d, d, t);
1776 }
1777 
1778 static void gen_uaba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1779 {
1780     TCGv_vec t = tcg_temp_new_vec_matching(d);
1781     gen_uabd_vec(vece, t, a, b);
1782     tcg_gen_add_vec(vece, d, d, t);
1783 }
1784 
1785 void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1786                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1787 {
1788     static const TCGOpcode vecop_list[] = {
1789         INDEX_op_sub_vec, INDEX_op_add_vec,
1790         INDEX_op_umin_vec, INDEX_op_umax_vec, 0
1791     };
1792     static const GVecGen3 ops[4] = {
1793         { .fniv = gen_uaba_vec,
1794           .fno = gen_helper_gvec_uaba_b,
1795           .opt_opc = vecop_list,
1796           .load_dest = true,
1797           .vece = MO_8 },
1798         { .fniv = gen_uaba_vec,
1799           .fno = gen_helper_gvec_uaba_h,
1800           .opt_opc = vecop_list,
1801           .load_dest = true,
1802           .vece = MO_16 },
1803         { .fni4 = gen_uaba_i32,
1804           .fniv = gen_uaba_vec,
1805           .fno = gen_helper_gvec_uaba_s,
1806           .opt_opc = vecop_list,
1807           .load_dest = true,
1808           .vece = MO_32 },
1809         { .fni8 = gen_uaba_i64,
1810           .fniv = gen_uaba_vec,
1811           .fno = gen_helper_gvec_uaba_d,
1812           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1813           .opt_opc = vecop_list,
1814           .load_dest = true,
1815           .vece = MO_64 },
1816     };
1817     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1818 }
1819 
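/*
 * Pairwise operations (ADDP, SMAXP, SMINP, UMAXP, UMINP): each result
 * element combines one adjacent pair of source elements.  These are
 * expanded only via out-of-line helpers; the max/min forms have no
 * 64-bit element size.
 */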
1820 void gen_gvec_addp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1821                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1822 {
1823     static gen_helper_gvec_3 * const fns[4] = {
1824         gen_helper_gvec_addp_b,
1825         gen_helper_gvec_addp_h,
1826         gen_helper_gvec_addp_s,
1827         gen_helper_gvec_addp_d,
1828     };
1829     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1830 }
1831 
1832 void gen_gvec_smaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1833                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1834 {
1835     static gen_helper_gvec_3 * const fns[4] = {
1836         gen_helper_gvec_smaxp_b,
1837         gen_helper_gvec_smaxp_h,
1838         gen_helper_gvec_smaxp_s,
1839     };
1840     tcg_debug_assert(vece <= MO_32);
1841     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1842 }
1843 
1844 void gen_gvec_sminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1845                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1846 {
1847     static gen_helper_gvec_3 * const fns[4] = {
1848         gen_helper_gvec_sminp_b,
1849         gen_helper_gvec_sminp_h,
1850         gen_helper_gvec_sminp_s,
1851     };
1852     tcg_debug_assert(vece <= MO_32);
1853     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1854 }
1855 
1856 void gen_gvec_umaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1857                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1858 {
1859     static gen_helper_gvec_3 * const fns[4] = {
1860         gen_helper_gvec_umaxp_b,
1861         gen_helper_gvec_umaxp_h,
1862         gen_helper_gvec_umaxp_s,
1863     };
1864     tcg_debug_assert(vece <= MO_32);
1865     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1866 }
1867 
1868 void gen_gvec_uminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1869                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1870 {
1871     static gen_helper_gvec_3 * const fns[4] = {
1872         gen_helper_gvec_uminp_b,
1873         gen_helper_gvec_uminp_h,
1874         gen_helper_gvec_uminp_s,
1875     };
1876     tcg_debug_assert(vece <= MO_32);
1877     tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1878 }
1879 
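/*
 * Signed halving add: d = (a + b) >> 1, computed without widening as
 * (a >> 1) + (b >> 1) + (a & b & 1).
 */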
1880 static void gen_shadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1881 {
1882     TCGv_i64 t = tcg_temp_new_i64();
1883 
1884     tcg_gen_and_i64(t, a, b);
1885     tcg_gen_vec_sar8i_i64(a, a, 1);
1886     tcg_gen_vec_sar8i_i64(b, b, 1);
1887     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
1888     tcg_gen_vec_add8_i64(d, a, b);
1889     tcg_gen_vec_add8_i64(d, d, t);
1890 }
1891 
1892 static void gen_shadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1893 {
1894     TCGv_i64 t = tcg_temp_new_i64();
1895 
1896     tcg_gen_and_i64(t, a, b);
1897     tcg_gen_vec_sar16i_i64(a, a, 1);
1898     tcg_gen_vec_sar16i_i64(b, b, 1);
1899     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
1900     tcg_gen_vec_add16_i64(d, a, b);
1901     tcg_gen_vec_add16_i64(d, d, t);
1902 }
1903 
1904 static void gen_shadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1905 {
1906     TCGv_i32 t = tcg_temp_new_i32();
1907 
1908     tcg_gen_and_i32(t, a, b);
1909     tcg_gen_sari_i32(a, a, 1);
1910     tcg_gen_sari_i32(b, b, 1);
1911     tcg_gen_andi_i32(t, t, 1);
1912     tcg_gen_add_i32(d, a, b);
1913     tcg_gen_add_i32(d, d, t);
1914 }
1915 
1916 static void gen_shadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1917 {
1918     TCGv_vec t = tcg_temp_new_vec_matching(d);
1919 
1920     tcg_gen_and_vec(vece, t, a, b);
1921     tcg_gen_sari_vec(vece, a, a, 1);
1922     tcg_gen_sari_vec(vece, b, b, 1);
1923     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
1924     tcg_gen_add_vec(vece, d, a, b);
1925     tcg_gen_add_vec(vece, d, d, t);
1926 }
1927 
1928 void gen_gvec_shadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1929                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1930 {
1931     static const TCGOpcode vecop_list[] = {
1932         INDEX_op_sari_vec, INDEX_op_add_vec, 0
1933     };
1934     static const GVecGen3 g[] = {
1935         { .fni8 = gen_shadd8_i64,
1936           .fniv = gen_shadd_vec,
1937           .opt_opc = vecop_list,
1938           .vece = MO_8 },
1939         { .fni8 = gen_shadd16_i64,
1940           .fniv = gen_shadd_vec,
1941           .opt_opc = vecop_list,
1942           .vece = MO_16 },
1943         { .fni4 = gen_shadd_i32,
1944           .fniv = gen_shadd_vec,
1945           .opt_opc = vecop_list,
1946           .vece = MO_32 },
1947     };
1948     tcg_debug_assert(vece <= MO_32);
1949     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
1950 }
1951 
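/*
 * Unsigned halving add: d = (a + b) >> 1, as for SHADD but with
 * logical rather than arithmetic shifts.
 */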
1952 static void gen_uhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1953 {
1954     TCGv_i64 t = tcg_temp_new_i64();
1955 
1956     tcg_gen_and_i64(t, a, b);
1957     tcg_gen_vec_shr8i_i64(a, a, 1);
1958     tcg_gen_vec_shr8i_i64(b, b, 1);
1959     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
1960     tcg_gen_vec_add8_i64(d, a, b);
1961     tcg_gen_vec_add8_i64(d, d, t);
1962 }
1963 
1964 static void gen_uhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1965 {
1966     TCGv_i64 t = tcg_temp_new_i64();
1967 
1968     tcg_gen_and_i64(t, a, b);
1969     tcg_gen_vec_shr16i_i64(a, a, 1);
1970     tcg_gen_vec_shr16i_i64(b, b, 1);
1971     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
1972     tcg_gen_vec_add16_i64(d, a, b);
1973     tcg_gen_vec_add16_i64(d, d, t);
1974 }
1975 
1976 static void gen_uhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1977 {
1978     TCGv_i32 t = tcg_temp_new_i32();
1979 
1980     tcg_gen_and_i32(t, a, b);
1981     tcg_gen_shri_i32(a, a, 1);
1982     tcg_gen_shri_i32(b, b, 1);
1983     tcg_gen_andi_i32(t, t, 1);
1984     tcg_gen_add_i32(d, a, b);
1985     tcg_gen_add_i32(d, d, t);
1986 }
1987 
1988 static void gen_uhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1989 {
1990     TCGv_vec t = tcg_temp_new_vec_matching(d);
1991 
1992     tcg_gen_and_vec(vece, t, a, b);
1993     tcg_gen_shri_vec(vece, a, a, 1);
1994     tcg_gen_shri_vec(vece, b, b, 1);
1995     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
1996     tcg_gen_add_vec(vece, d, a, b);
1997     tcg_gen_add_vec(vece, d, d, t);
1998 }
1999 
2000 void gen_gvec_uhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
2001                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
2002 {
2003     static const TCGOpcode vecop_list[] = {
2004         INDEX_op_shri_vec, INDEX_op_add_vec, 0
2005     };
2006     static const GVecGen3 g[] = {
2007         { .fni8 = gen_uhadd8_i64,
2008           .fniv = gen_uhadd_vec,
2009           .opt_opc = vecop_list,
2010           .vece = MO_8 },
2011         { .fni8 = gen_uhadd16_i64,
2012           .fniv = gen_uhadd_vec,
2013           .opt_opc = vecop_list,
2014           .vece = MO_16 },
2015         { .fni4 = gen_uhadd_i32,
2016           .fniv = gen_uhadd_vec,
2017           .opt_opc = vecop_list,
2018           .vece = MO_32 },
2019     };
2020     tcg_debug_assert(vece <= MO_32);
2021     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2022 }
2023 
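/*
 * Signed halving subtract: d = (a - b) >> 1, computed without widening
 * as (a >> 1) - (b >> 1) - (~a & b & 1).
 */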
2024 static void gen_shsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2025 {
2026     TCGv_i64 t = tcg_temp_new_i64();
2027 
2028     tcg_gen_andc_i64(t, b, a);
2029     tcg_gen_vec_sar8i_i64(a, a, 1);
2030     tcg_gen_vec_sar8i_i64(b, b, 1);
2031     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
2032     tcg_gen_vec_sub8_i64(d, a, b);
2033     tcg_gen_vec_sub8_i64(d, d, t);
2034 }
2035 
2036 static void gen_shsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2037 {
2038     TCGv_i64 t = tcg_temp_new_i64();
2039 
2040     tcg_gen_andc_i64(t, b, a);
2041     tcg_gen_vec_sar16i_i64(a, a, 1);
2042     tcg_gen_vec_sar16i_i64(b, b, 1);
2043     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
2044     tcg_gen_vec_sub16_i64(d, a, b);
2045     tcg_gen_vec_sub16_i64(d, d, t);
2046 }
2047 
2048 static void gen_shsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2049 {
2050     TCGv_i32 t = tcg_temp_new_i32();
2051 
2052     tcg_gen_andc_i32(t, b, a);
2053     tcg_gen_sari_i32(a, a, 1);
2054     tcg_gen_sari_i32(b, b, 1);
2055     tcg_gen_andi_i32(t, t, 1);
2056     tcg_gen_sub_i32(d, a, b);
2057     tcg_gen_sub_i32(d, d, t);
2058 }
2059 
2060 static void gen_shsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
2061 {
2062     TCGv_vec t = tcg_temp_new_vec_matching(d);
2063 
2064     tcg_gen_andc_vec(vece, t, b, a);
2065     tcg_gen_sari_vec(vece, a, a, 1);
2066     tcg_gen_sari_vec(vece, b, b, 1);
2067     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
2068     tcg_gen_sub_vec(vece, d, a, b);
2069     tcg_gen_sub_vec(vece, d, d, t);
2070 }
2071 
2072 void gen_gvec_shsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
2073                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
2074 {
2075     static const TCGOpcode vecop_list[] = {
2076         INDEX_op_sari_vec, INDEX_op_sub_vec, 0
2077     };
2078     static const GVecGen3 g[4] = {
2079         { .fni8 = gen_shsub8_i64,
2080           .fniv = gen_shsub_vec,
2081           .opt_opc = vecop_list,
2082           .vece = MO_8 },
2083         { .fni8 = gen_shsub16_i64,
2084           .fniv = gen_shsub_vec,
2085           .opt_opc = vecop_list,
2086           .vece = MO_16 },
2087         { .fni4 = gen_shsub_i32,
2088           .fniv = gen_shsub_vec,
2089           .opt_opc = vecop_list,
2090           .vece = MO_32 },
2091     };
2092     assert(vece <= MO_32);
2093     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2094 }
2095 
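/*
 * Unsigned halving subtract: d = (a - b) >> 1, as for SHSUB but with
 * logical rather than arithmetic shifts.
 */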
2096 static void gen_uhsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2097 {
2098     TCGv_i64 t = tcg_temp_new_i64();
2099 
2100     tcg_gen_andc_i64(t, b, a);
2101     tcg_gen_vec_shr8i_i64(a, a, 1);
2102     tcg_gen_vec_shr8i_i64(b, b, 1);
2103     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
2104     tcg_gen_vec_sub8_i64(d, a, b);
2105     tcg_gen_vec_sub8_i64(d, d, t);
2106 }
2107 
2108 static void gen_uhsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2109 {
2110     TCGv_i64 t = tcg_temp_new_i64();
2111 
2112     tcg_gen_andc_i64(t, b, a);
2113     tcg_gen_vec_shr16i_i64(a, a, 1);
2114     tcg_gen_vec_shr16i_i64(b, b, 1);
2115     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
2116     tcg_gen_vec_sub16_i64(d, a, b);
2117     tcg_gen_vec_sub16_i64(d, d, t);
2118 }
2119 
2120 static void gen_uhsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2121 {
2122     TCGv_i32 t = tcg_temp_new_i32();
2123 
2124     tcg_gen_andc_i32(t, b, a);
2125     tcg_gen_shri_i32(a, a, 1);
2126     tcg_gen_shri_i32(b, b, 1);
2127     tcg_gen_andi_i32(t, t, 1);
2128     tcg_gen_sub_i32(d, a, b);
2129     tcg_gen_sub_i32(d, d, t);
2130 }
2131 
2132 static void gen_uhsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
2133 {
2134     TCGv_vec t = tcg_temp_new_vec_matching(d);
2135 
2136     tcg_gen_andc_vec(vece, t, b, a);
2137     tcg_gen_shri_vec(vece, a, a, 1);
2138     tcg_gen_shri_vec(vece, b, b, 1);
2139     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
2140     tcg_gen_sub_vec(vece, d, a, b);
2141     tcg_gen_sub_vec(vece, d, d, t);
2142 }
2143 
2144 void gen_gvec_uhsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
2145                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
2146 {
2147     static const TCGOpcode vecop_list[] = {
2148         INDEX_op_shri_vec, INDEX_op_sub_vec, 0
2149     };
2150     static const GVecGen3 g[4] = {
2151         { .fni8 = gen_uhsub8_i64,
2152           .fniv = gen_uhsub_vec,
2153           .opt_opc = vecop_list,
2154           .vece = MO_8 },
2155         { .fni8 = gen_uhsub16_i64,
2156           .fniv = gen_uhsub_vec,
2157           .opt_opc = vecop_list,
2158           .vece = MO_16 },
2159         { .fni4 = gen_uhsub_i32,
2160           .fniv = gen_uhsub_vec,
2161           .opt_opc = vecop_list,
2162           .vece = MO_32 },
2163     };
2164     assert(vece <= MO_32);
2165     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2166 }
2167 
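/*
 * Signed rounding halving add: d = (a + b + 1) >> 1, computed without
 * widening as (a >> 1) + (b >> 1) + ((a | b) & 1).
 */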
2168 static void gen_srhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2169 {
2170     TCGv_i64 t = tcg_temp_new_i64();
2171 
2172     tcg_gen_or_i64(t, a, b);
2173     tcg_gen_vec_sar8i_i64(a, a, 1);
2174     tcg_gen_vec_sar8i_i64(b, b, 1);
2175     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
2176     tcg_gen_vec_add8_i64(d, a, b);
2177     tcg_gen_vec_add8_i64(d, d, t);
2178 }
2179 
2180 static void gen_srhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2181 {
2182     TCGv_i64 t = tcg_temp_new_i64();
2183 
2184     tcg_gen_or_i64(t, a, b);
2185     tcg_gen_vec_sar16i_i64(a, a, 1);
2186     tcg_gen_vec_sar16i_i64(b, b, 1);
2187     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
2188     tcg_gen_vec_add16_i64(d, a, b);
2189     tcg_gen_vec_add16_i64(d, d, t);
2190 }
2191 
2192 static void gen_srhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2193 {
2194     TCGv_i32 t = tcg_temp_new_i32();
2195 
2196     tcg_gen_or_i32(t, a, b);
2197     tcg_gen_sari_i32(a, a, 1);
2198     tcg_gen_sari_i32(b, b, 1);
2199     tcg_gen_andi_i32(t, t, 1);
2200     tcg_gen_add_i32(d, a, b);
2201     tcg_gen_add_i32(d, d, t);
2202 }
2203 
2204 static void gen_srhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
2205 {
2206     TCGv_vec t = tcg_temp_new_vec_matching(d);
2207 
2208     tcg_gen_or_vec(vece, t, a, b);
2209     tcg_gen_sari_vec(vece, a, a, 1);
2210     tcg_gen_sari_vec(vece, b, b, 1);
2211     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
2212     tcg_gen_add_vec(vece, d, a, b);
2213     tcg_gen_add_vec(vece, d, d, t);
2214 }
2215 
2216 void gen_gvec_srhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
2217                      uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
2218 {
2219     static const TCGOpcode vecop_list[] = {
2220         INDEX_op_sari_vec, INDEX_op_add_vec, 0
2221     };
2222     static const GVecGen3 g[] = {
2223         { .fni8 = gen_srhadd8_i64,
2224           .fniv = gen_srhadd_vec,
2225           .opt_opc = vecop_list,
2226           .vece = MO_8 },
2227         { .fni8 = gen_srhadd16_i64,
2228           .fniv = gen_srhadd_vec,
2229           .opt_opc = vecop_list,
2230           .vece = MO_16 },
2231         { .fni4 = gen_srhadd_i32,
2232           .fniv = gen_srhadd_vec,
2233           .opt_opc = vecop_list,
2234           .vece = MO_32 },
2235     };
2236     assert(vece <= MO_32);
2237     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2238 }
2239 
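/*
 * Unsigned rounding halving add: d = (a + b + 1) >> 1, as for SRHADD
 * but with logical rather than arithmetic shifts.
 */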
2240 static void gen_urhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2241 {
2242     TCGv_i64 t = tcg_temp_new_i64();
2243 
2244     tcg_gen_or_i64(t, a, b);
2245     tcg_gen_vec_shr8i_i64(a, a, 1);
2246     tcg_gen_vec_shr8i_i64(b, b, 1);
2247     tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
2248     tcg_gen_vec_add8_i64(d, a, b);
2249     tcg_gen_vec_add8_i64(d, d, t);
2250 }
2251 
2252 static void gen_urhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2253 {
2254     TCGv_i64 t = tcg_temp_new_i64();
2255 
2256     tcg_gen_or_i64(t, a, b);
2257     tcg_gen_vec_shr16i_i64(a, a, 1);
2258     tcg_gen_vec_shr16i_i64(b, b, 1);
2259     tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
2260     tcg_gen_vec_add16_i64(d, a, b);
2261     tcg_gen_vec_add16_i64(d, d, t);
2262 }
2263 
2264 static void gen_urhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2265 {
2266     TCGv_i32 t = tcg_temp_new_i32();
2267 
2268     tcg_gen_or_i32(t, a, b);
2269     tcg_gen_shri_i32(a, a, 1);
2270     tcg_gen_shri_i32(b, b, 1);
2271     tcg_gen_andi_i32(t, t, 1);
2272     tcg_gen_add_i32(d, a, b);
2273     tcg_gen_add_i32(d, d, t);
2274 }
2275 
2276 static void gen_urhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
2277 {
2278     TCGv_vec t = tcg_temp_new_vec_matching(d);
2279 
2280     tcg_gen_or_vec(vece, t, a, b);
2281     tcg_gen_shri_vec(vece, a, a, 1);
2282     tcg_gen_shri_vec(vece, b, b, 1);
2283     tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
2284     tcg_gen_add_vec(vece, d, a, b);
2285     tcg_gen_add_vec(vece, d, d, t);
2286 }
2287 
2288 void gen_gvec_urhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
2289                      uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
2290 {
2291     static const TCGOpcode vecop_list[] = {
2292         INDEX_op_shri_vec, INDEX_op_add_vec, 0
2293     };
2294     static const GVecGen3 g[] = {
2295         { .fni8 = gen_urhadd8_i64,
2296           .fniv = gen_urhadd_vec,
2297           .opt_opc = vecop_list,
2298           .vece = MO_8 },
2299         { .fni8 = gen_urhadd16_i64,
2300           .fniv = gen_urhadd_vec,
2301           .opt_opc = vecop_list,
2302           .vece = MO_16 },
2303         { .fni4 = gen_urhadd_i32,
2304           .fniv = gen_urhadd_vec,
2305           .opt_opc = vecop_list,
2306           .vece = MO_32 },
2307     };
2308     assert(vece <= MO_32);
2309     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
2310 }
2311