xref: /openbmc/qemu/tcg/tcg-op-gvec.c (revision f7ceab1e)
1 /*
2  * Generic vector operation expansion
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "tcg/tcg.h"
22 #include "tcg/tcg-temp-internal.h"
23 #include "tcg/tcg-op-common.h"
24 #include "tcg/tcg-op-gvec-common.h"
25 #include "tcg/tcg-gvec-desc.h"
26 
27 #define MAX_UNROLL  4
28 
29 #ifdef CONFIG_DEBUG_TCG
30 static const TCGOpcode vecop_list_empty[1] = { 0 };
31 #else
32 #define vecop_list_empty NULL
33 #endif
34 
35 
36 /* Verify vector size and alignment rules.  OFS should be the OR of all
37    of the operand offsets so that we can check them all at once.  */
38 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
39 {
40     uint32_t max_align;
41 
42     switch (oprsz) {
43     case 8:
44     case 16:
45     case 32:
46         tcg_debug_assert(oprsz <= maxsz);
47         break;
48     default:
49         tcg_debug_assert(oprsz == maxsz);
50         break;
51     }
52     tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
53 
54     max_align = maxsz >= 16 ? 15 : 7;
55     tcg_debug_assert((maxsz & max_align) == 0);
56     tcg_debug_assert((ofs & max_align) == 0);
57 }
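/*
 * Illustrative example (editorial note, not part of the original source):
 * oprsz == 8 with maxsz == 32 passes these checks as long as every operand
 * offset is a multiple of 16, since maxsz >= 16 selects max_align == 15.
 * An oprsz of 24, by contrast, is only accepted when oprsz == maxsz.
 */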
58 
59 /* Verify vector overlap rules for two operands.  */
60 static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
61 {
62     tcg_debug_assert(d == a || d + s <= a || a + s <= d);
63 }
64 
65 /* Verify vector overlap rules for three operands.  */
66 static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
67 {
68     check_overlap_2(d, a, s);
69     check_overlap_2(d, b, s);
70     check_overlap_2(a, b, s);
71 }
72 
73 /* Verify vector overlap rules for four operands.  */
74 static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
75                             uint32_t c, uint32_t s)
76 {
77     check_overlap_2(d, a, s);
78     check_overlap_2(d, b, s);
79     check_overlap_2(d, c, s);
80     check_overlap_2(a, b, s);
81     check_overlap_2(a, c, s);
82     check_overlap_2(b, c, s);
83 }
84 
85 /* Create a descriptor from components.  */
86 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
87 {
88     uint32_t desc = 0;
89 
90     check_size_align(oprsz, maxsz, 0);
91 
92     /*
93      * We want to check that 'data' will fit into SIMD_DATA_BITS.
94      * However, some callers want to treat the data as a signed
95      * value (which they can later get back with simd_data())
96      * and some want to treat it as an unsigned value.
97      * So here we assert only that the data will fit into the
98      * field in at least one way. This means that some invalid
99      * values from the caller will not be detected, e.g. if the
100      * caller wants to handle the value as a signed integer but
101      * incorrectly passes us 1 << (SIMD_DATA_BITS - 1).
102      */
103     tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS) ||
104                      data == extract32(data, 0, SIMD_DATA_BITS));
105 
106     oprsz = (oprsz / 8) - 1;
107     maxsz = (maxsz / 8) - 1;
108 
109     /*
110      * We have just asserted in check_size_align that either
111      * oprsz is {8,16,32} or matches maxsz.  Encode the final
112      * case with '2', as that would otherwise map to 24.
113      */
114     if (oprsz == maxsz) {
115         oprsz = 2;
116     }
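    /*
     * Worked example (editorial, not in the original source): for
     * oprsz == 16 and maxsz == 64 the encoded fields become
     * (16 / 8) - 1 == 1 and (64 / 8) - 1 == 7; when oprsz == maxsz,
     * the oprsz field is stored as 2 regardless of the actual size.
     */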
117 
118     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
119     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
120     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
121 
122     return desc;
123 }
124 
125 /* Generate a call to a gvec-style helper with two vector operands.  */
126 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
127                         uint32_t oprsz, uint32_t maxsz, int32_t data,
128                         gen_helper_gvec_2 *fn)
129 {
130     TCGv_ptr a0, a1;
131     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
132 
133     a0 = tcg_temp_ebb_new_ptr();
134     a1 = tcg_temp_ebb_new_ptr();
135 
136     tcg_gen_addi_ptr(a0, tcg_env, dofs);
137     tcg_gen_addi_ptr(a1, tcg_env, aofs);
138 
139     fn(a0, a1, desc);
140 
141     tcg_temp_free_ptr(a0);
142     tcg_temp_free_ptr(a1);
143 }
144 
145 /* Generate a call to a gvec-style helper with two vector operands
146    and one scalar operand.  */
147 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
148                          uint32_t oprsz, uint32_t maxsz, int32_t data,
149                          gen_helper_gvec_2i *fn)
150 {
151     TCGv_ptr a0, a1;
152     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
153 
154     a0 = tcg_temp_ebb_new_ptr();
155     a1 = tcg_temp_ebb_new_ptr();
156 
157     tcg_gen_addi_ptr(a0, tcg_env, dofs);
158     tcg_gen_addi_ptr(a1, tcg_env, aofs);
159 
160     fn(a0, a1, c, desc);
161 
162     tcg_temp_free_ptr(a0);
163     tcg_temp_free_ptr(a1);
164 }
165 
166 /* Generate a call to a gvec-style helper with three vector operands.  */
167 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
168                         uint32_t oprsz, uint32_t maxsz, int32_t data,
169                         gen_helper_gvec_3 *fn)
170 {
171     TCGv_ptr a0, a1, a2;
172     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
173 
174     a0 = tcg_temp_ebb_new_ptr();
175     a1 = tcg_temp_ebb_new_ptr();
176     a2 = tcg_temp_ebb_new_ptr();
177 
178     tcg_gen_addi_ptr(a0, tcg_env, dofs);
179     tcg_gen_addi_ptr(a1, tcg_env, aofs);
180     tcg_gen_addi_ptr(a2, tcg_env, bofs);
181 
182     fn(a0, a1, a2, desc);
183 
184     tcg_temp_free_ptr(a0);
185     tcg_temp_free_ptr(a1);
186     tcg_temp_free_ptr(a2);
187 }
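/*
 * Illustrative usage (editorial sketch; "gen_helper_my_op3" is a
 * hypothetical helper name, not one defined by QEMU): a front end that
 * implements a three-operand element-wise operation purely out of line
 * could emit
 *
 *     tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0,
 *                        gen_helper_my_op3);
 *
 * and the helper then receives pointers to the three vector slots within
 * tcg_env, plus the descriptor built by simd_desc().
 */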
188 
189 /* Generate a call to a gvec-style helper with four vector operands.  */
190 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
191                         uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
192                         int32_t data, gen_helper_gvec_4 *fn)
193 {
194     TCGv_ptr a0, a1, a2, a3;
195     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
196 
197     a0 = tcg_temp_ebb_new_ptr();
198     a1 = tcg_temp_ebb_new_ptr();
199     a2 = tcg_temp_ebb_new_ptr();
200     a3 = tcg_temp_ebb_new_ptr();
201 
202     tcg_gen_addi_ptr(a0, tcg_env, dofs);
203     tcg_gen_addi_ptr(a1, tcg_env, aofs);
204     tcg_gen_addi_ptr(a2, tcg_env, bofs);
205     tcg_gen_addi_ptr(a3, tcg_env, cofs);
206 
207     fn(a0, a1, a2, a3, desc);
208 
209     tcg_temp_free_ptr(a0);
210     tcg_temp_free_ptr(a1);
211     tcg_temp_free_ptr(a2);
212     tcg_temp_free_ptr(a3);
213 }
214 
215 /* Generate a call to a gvec-style helper with five vector operands.  */
216 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
217                         uint32_t cofs, uint32_t xofs, uint32_t oprsz,
218                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
219 {
220     TCGv_ptr a0, a1, a2, a3, a4;
221     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
222 
223     a0 = tcg_temp_ebb_new_ptr();
224     a1 = tcg_temp_ebb_new_ptr();
225     a2 = tcg_temp_ebb_new_ptr();
226     a3 = tcg_temp_ebb_new_ptr();
227     a4 = tcg_temp_ebb_new_ptr();
228 
229     tcg_gen_addi_ptr(a0, tcg_env, dofs);
230     tcg_gen_addi_ptr(a1, tcg_env, aofs);
231     tcg_gen_addi_ptr(a2, tcg_env, bofs);
232     tcg_gen_addi_ptr(a3, tcg_env, cofs);
233     tcg_gen_addi_ptr(a4, tcg_env, xofs);
234 
235     fn(a0, a1, a2, a3, a4, desc);
236 
237     tcg_temp_free_ptr(a0);
238     tcg_temp_free_ptr(a1);
239     tcg_temp_free_ptr(a2);
240     tcg_temp_free_ptr(a3);
241     tcg_temp_free_ptr(a4);
242 }
243 
244 /* Generate a call to a gvec-style helper with two vector operands
245    and an extra pointer operand.  */
246 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
247                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
248                         int32_t data, gen_helper_gvec_2_ptr *fn)
249 {
250     TCGv_ptr a0, a1;
251     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
252 
253     a0 = tcg_temp_ebb_new_ptr();
254     a1 = tcg_temp_ebb_new_ptr();
255 
256     tcg_gen_addi_ptr(a0, tcg_env, dofs);
257     tcg_gen_addi_ptr(a1, tcg_env, aofs);
258 
259     fn(a0, a1, ptr, desc);
260 
261     tcg_temp_free_ptr(a0);
262     tcg_temp_free_ptr(a1);
263 }
264 
265 /* Generate a call to a gvec-style helper with three vector operands
266    and an extra pointer operand.  */
267 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
268                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
269                         int32_t data, gen_helper_gvec_3_ptr *fn)
270 {
271     TCGv_ptr a0, a1, a2;
272     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
273 
274     a0 = tcg_temp_ebb_new_ptr();
275     a1 = tcg_temp_ebb_new_ptr();
276     a2 = tcg_temp_ebb_new_ptr();
277 
278     tcg_gen_addi_ptr(a0, tcg_env, dofs);
279     tcg_gen_addi_ptr(a1, tcg_env, aofs);
280     tcg_gen_addi_ptr(a2, tcg_env, bofs);
281 
282     fn(a0, a1, a2, ptr, desc);
283 
284     tcg_temp_free_ptr(a0);
285     tcg_temp_free_ptr(a1);
286     tcg_temp_free_ptr(a2);
287 }
288 
289 /* Generate a call to a gvec-style helper with four vector operands
290    and an extra pointer operand.  */
291 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
292                         uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
293                         uint32_t maxsz, int32_t data,
294                         gen_helper_gvec_4_ptr *fn)
295 {
296     TCGv_ptr a0, a1, a2, a3;
297     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
298 
299     a0 = tcg_temp_ebb_new_ptr();
300     a1 = tcg_temp_ebb_new_ptr();
301     a2 = tcg_temp_ebb_new_ptr();
302     a3 = tcg_temp_ebb_new_ptr();
303 
304     tcg_gen_addi_ptr(a0, tcg_env, dofs);
305     tcg_gen_addi_ptr(a1, tcg_env, aofs);
306     tcg_gen_addi_ptr(a2, tcg_env, bofs);
307     tcg_gen_addi_ptr(a3, tcg_env, cofs);
308 
309     fn(a0, a1, a2, a3, ptr, desc);
310 
311     tcg_temp_free_ptr(a0);
312     tcg_temp_free_ptr(a1);
313     tcg_temp_free_ptr(a2);
314     tcg_temp_free_ptr(a3);
315 }
316 
317 /* Generate a call to a gvec-style helper with five vector operands
318    and an extra pointer operand.  */
319 void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
320                         uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
321                         uint32_t oprsz, uint32_t maxsz, int32_t data,
322                         gen_helper_gvec_5_ptr *fn)
323 {
324     TCGv_ptr a0, a1, a2, a3, a4;
325     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
326 
327     a0 = tcg_temp_ebb_new_ptr();
328     a1 = tcg_temp_ebb_new_ptr();
329     a2 = tcg_temp_ebb_new_ptr();
330     a3 = tcg_temp_ebb_new_ptr();
331     a4 = tcg_temp_ebb_new_ptr();
332 
333     tcg_gen_addi_ptr(a0, tcg_env, dofs);
334     tcg_gen_addi_ptr(a1, tcg_env, aofs);
335     tcg_gen_addi_ptr(a2, tcg_env, bofs);
336     tcg_gen_addi_ptr(a3, tcg_env, cofs);
337     tcg_gen_addi_ptr(a4, tcg_env, eofs);
338 
339     fn(a0, a1, a2, a3, a4, ptr, desc);
340 
341     tcg_temp_free_ptr(a0);
342     tcg_temp_free_ptr(a1);
343     tcg_temp_free_ptr(a2);
344     tcg_temp_free_ptr(a3);
345     tcg_temp_free_ptr(a4);
346 }
347 
348 /* Return true if we want to implement something of OPRSZ bytes
349    in units of LNSZ.  This limits the expansion of inline code.  */
350 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
351 {
352     uint32_t q, r;
353 
354     if (oprsz < lnsz) {
355         return false;
356     }
357 
358     q = oprsz / lnsz;
359     r = oprsz % lnsz;
360     tcg_debug_assert((r & 7) == 0);
361 
362     if (lnsz < 16) {
363         /* For sizes below 16, accept no remainder. */
364         if (r != 0) {
365             return false;
366         }
367     } else {
368         /*
369          * Recall that ARM SVE allows vector sizes that are not a
370          * power of 2, but always a multiple of 16.  The intent is
371          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
372          * In addition, expand_clr needs to handle a multiple of 8.
373          * Thus we can handle the tail with one more operation per
374          * diminishing power of 2.
375          */
376         q += ctpop32(r);
377     }
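    /*
     * Worked example (editorial, not in the original source): with
     * oprsz == 80 and lnsz == 32 we get q == 2 and r == 16, and
     * ctpop32(16) == 1 brings q to 3, which is within MAX_UNROLL,
     * so the 80-byte operation is accepted for inline expansion.
     */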
378 
379     return q <= MAX_UNROLL;
380 }
381 
382 static void expand_clr(uint32_t dofs, uint32_t maxsz);
383 
384 /* Duplicate C as per VECE.  */
385 uint64_t (dup_const)(unsigned vece, uint64_t c)
386 {
387     switch (vece) {
388     case MO_8:
389         return 0x0101010101010101ull * (uint8_t)c;
390     case MO_16:
391         return 0x0001000100010001ull * (uint16_t)c;
392     case MO_32:
393         return 0x0000000100000001ull * (uint32_t)c;
394     case MO_64:
395         return c;
396     default:
397         g_assert_not_reached();
398     }
399 }
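/*
 * Editorial examples (not part of the original source):
 * dup_const(MO_16, 0x1234) yields 0x1234123412341234ull, and
 * dup_const(MO_8, 0xff) yields 0xffffffffffffffffull.
 */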
400 
401 /* Duplicate IN into OUT as per VECE.  */
402 void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
403 {
404     switch (vece) {
405     case MO_8:
406         tcg_gen_ext8u_i32(out, in);
407         tcg_gen_muli_i32(out, out, 0x01010101);
408         break;
409     case MO_16:
410         tcg_gen_deposit_i32(out, in, in, 16, 16);
411         break;
412     case MO_32:
413         tcg_gen_mov_i32(out, in);
414         break;
415     default:
416         g_assert_not_reached();
417     }
418 }
419 
420 void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
421 {
422     switch (vece) {
423     case MO_8:
424         tcg_gen_ext8u_i64(out, in);
425         tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
426         break;
427     case MO_16:
428         tcg_gen_ext16u_i64(out, in);
429         tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
430         break;
431     case MO_32:
432         tcg_gen_deposit_i64(out, in, in, 32, 32);
433         break;
434     case MO_64:
435         tcg_gen_mov_i64(out, in);
436         break;
437     default:
438         g_assert_not_reached();
439     }
440 }
441 
442 /* Select a supported vector type for implementing an operation on SIZE
443  * bytes.  If OP is 0, assume that the real operation to be performed is
444  * required by all backends.  Otherwise, make sure that OP can be performed
445  * on elements of size VECE in the selected type.  Do not select V64 if
446  * PREFER_I64 is true.  Return 0 if no vector type is selected.
447  */
448 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
449                                   uint32_t size, bool prefer_i64)
450 {
451     /*
452      * Recall that ARM SVE allows vector sizes that are not a
453      * power of 2, but always a multiple of 16.  The intent is
454      * that e.g. size == 80 would be expanded with 2x32 + 1x16.
455      * It is hard to imagine a case in which v256 is supported
456      * but v128 is not, but check anyway.
457      * In addition, expand_clr needs to handle a multiple of 8.
458      */
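    /*
     * Editorial example (not in the original source): for size == 24,
     * V256 is never chosen (check_size_impl(24, 32) fails), and V128 is
     * chosen only when the backend can also emit the operation on V64,
     * since the trailing 8 bytes (size & 8) must be handled as well.
     */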
459     if (TCG_TARGET_HAS_v256 &&
460         check_size_impl(size, 32) &&
461         tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
462         (!(size & 16) ||
463          (TCG_TARGET_HAS_v128 &&
464           tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
465         (!(size & 8) ||
466          (TCG_TARGET_HAS_v64 &&
467           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
468         return TCG_TYPE_V256;
469     }
470     if (TCG_TARGET_HAS_v128 &&
471         check_size_impl(size, 16) &&
472         tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
473         (!(size & 8) ||
474          (TCG_TARGET_HAS_v64 &&
475           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
476         return TCG_TYPE_V128;
477     }
478     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
479         && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
480         return TCG_TYPE_V64;
481     }
482     return 0;
483 }
484 
485 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
486                          uint32_t maxsz, TCGv_vec t_vec)
487 {
488     uint32_t i = 0;
489 
490     tcg_debug_assert(oprsz >= 8);
491 
492     /*
493      * This may be expand_clr for the tail of an operation, e.g.
494      * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
495      * are misaligned wrt the maximum vector size, so do that first.
496      */
497     if (dofs & 8) {
498         tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V64);
499         i += 8;
500     }
501 
502     switch (type) {
503     case TCG_TYPE_V256:
504         /*
505          * Recall that ARM SVE allows vector sizes that are not a
506          * power of 2, but always a multiple of 16.  The intent is
507          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
508          */
509         for (; i + 32 <= oprsz; i += 32) {
510             tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V256);
511         }
512         /* fallthru */
513     case TCG_TYPE_V128:
514         for (; i + 16 <= oprsz; i += 16) {
515             tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V128);
516         }
517         break;
518     case TCG_TYPE_V64:
519         for (; i < oprsz; i += 8) {
520             tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V64);
521         }
522         break;
523     default:
524         g_assert_not_reached();
525     }
526 
527     if (oprsz < maxsz) {
528         expand_clr(dofs + oprsz, maxsz - oprsz);
529     }
530 }
531 
532 /* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
533  * Only one of IN_32 or IN_64 may be set;
534  * IN_C is used if IN_32 and IN_64 are unset.
535  */
536 static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
537                    uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
538                    uint64_t in_c)
539 {
540     TCGType type;
541     TCGv_i64 t_64;
542     TCGv_i32 t_32, t_desc;
543     TCGv_ptr t_ptr;
544     uint32_t i;
545 
546     assert(vece <= (in_32 ? MO_32 : MO_64));
547     assert(in_32 == NULL || in_64 == NULL);
548 
549     /* If we're storing 0, expand oprsz to maxsz.  */
550     if (in_32 == NULL && in_64 == NULL) {
551         in_c = dup_const(vece, in_c);
552         if (in_c == 0) {
553             oprsz = maxsz;
554             vece = MO_8;
555         } else if (in_c == dup_const(MO_8, in_c)) {
556             vece = MO_8;
557         }
558     }
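    /*
     * Editorial example (not in the original source): a constant such as
     * in_c == 0x4040404040404040 at vece == MO_64 is reduced to MO_8 here,
     * since every byte is identical; a zero constant additionally widens
     * oprsz to maxsz, so no separate tail clear is needed afterwards.
     */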
559 
560     /* Implement inline with a vector type, if possible.
561      * Prefer integer when 64-bit host and no variable dup.
562      */
563     type = choose_vector_type(NULL, vece, oprsz,
564                               (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
565                                && (in_64 == NULL || vece == MO_64)));
566     if (type != 0) {
567         TCGv_vec t_vec = tcg_temp_new_vec(type);
568 
569         if (in_32) {
570             tcg_gen_dup_i32_vec(vece, t_vec, in_32);
571         } else if (in_64) {
572             tcg_gen_dup_i64_vec(vece, t_vec, in_64);
573         } else {
574             tcg_gen_dupi_vec(vece, t_vec, in_c);
575         }
576         do_dup_store(type, dofs, oprsz, maxsz, t_vec);
577         return;
578     }
579 
580     /* Otherwise, inline with an integer type, unless "large".  */
581     if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
582         t_64 = NULL;
583         t_32 = NULL;
584 
585         if (in_32) {
586             /* We are given a 32-bit variable input.  For a 64-bit host,
587                use a 64-bit operation unless the 32-bit operation would
588                be simple enough.  */
589             if (TCG_TARGET_REG_BITS == 64
590                 && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
591                 t_64 = tcg_temp_ebb_new_i64();
592                 tcg_gen_extu_i32_i64(t_64, in_32);
593                 tcg_gen_dup_i64(vece, t_64, t_64);
594             } else {
595                 t_32 = tcg_temp_ebb_new_i32();
596                 tcg_gen_dup_i32(vece, t_32, in_32);
597             }
598         } else if (in_64) {
599             /* We are given a 64-bit variable input.  */
600             t_64 = tcg_temp_ebb_new_i64();
601             tcg_gen_dup_i64(vece, t_64, in_64);
602         } else {
603             /* We are given a constant input.  */
604             /* For 64-bit hosts, use 64-bit constants for "simple" constants
605                or when we'd need too many 32-bit stores, or when a 64-bit
606                constant is really required.  */
607             if (vece == MO_64
608                 || (TCG_TARGET_REG_BITS == 64
609                     && (in_c == 0 || in_c == -1
610                         || !check_size_impl(oprsz, 4)))) {
611                 t_64 = tcg_constant_i64(in_c);
612             } else {
613                 t_32 = tcg_constant_i32(in_c);
614             }
615         }
616 
617         /* Implement inline if we picked an implementation size above.  */
618         if (t_32) {
619             for (i = 0; i < oprsz; i += 4) {
620                 tcg_gen_st_i32(t_32, tcg_env, dofs + i);
621             }
622             tcg_temp_free_i32(t_32);
623             goto done;
624         }
625         if (t_64) {
626             for (i = 0; i < oprsz; i += 8) {
627                 tcg_gen_st_i64(t_64, tcg_env, dofs + i);
628             }
629             tcg_temp_free_i64(t_64);
630             goto done;
631         }
632     }
633 
634     /* Otherwise implement out of line.  */
635     t_ptr = tcg_temp_ebb_new_ptr();
636     tcg_gen_addi_ptr(t_ptr, tcg_env, dofs);
637 
638     /*
639      * This may be expand_clr for the tail of an operation, e.g.
640      * oprsz == 8 && maxsz == 64.  The size of the clear is misaligned
641      * wrt simd_desc and will assert.  Simply pass all replicated byte
642      * stores through to memset.
643      */
644     if (oprsz == maxsz && vece == MO_8) {
645         TCGv_ptr t_size = tcg_constant_ptr(oprsz);
646         TCGv_i32 t_val;
647 
648         if (in_32) {
649             t_val = in_32;
650         } else if (in_64) {
651             t_val = tcg_temp_ebb_new_i32();
652             tcg_gen_extrl_i64_i32(t_val, in_64);
653         } else {
654             t_val = tcg_constant_i32(in_c);
655         }
656         gen_helper_memset(t_ptr, t_ptr, t_val, t_size);
657 
658         if (in_64) {
659             tcg_temp_free_i32(t_val);
660         }
661         tcg_temp_free_ptr(t_ptr);
662         return;
663     }
664 
665     t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0));
666 
667     if (vece == MO_64) {
668         if (in_64) {
669             gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
670         } else {
671             t_64 = tcg_constant_i64(in_c);
672             gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
673         }
674     } else {
675         typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
676         static dup_fn * const fns[3] = {
677             gen_helper_gvec_dup8,
678             gen_helper_gvec_dup16,
679             gen_helper_gvec_dup32
680         };
681 
682         if (in_32) {
683             fns[vece](t_ptr, t_desc, in_32);
684         } else if (in_64) {
685             t_32 = tcg_temp_ebb_new_i32();
686             tcg_gen_extrl_i64_i32(t_32, in_64);
687             fns[vece](t_ptr, t_desc, t_32);
688             tcg_temp_free_i32(t_32);
689         } else {
690             if (vece == MO_8) {
691                 in_c &= 0xff;
692             } else if (vece == MO_16) {
693                 in_c &= 0xffff;
694             }
695             t_32 = tcg_constant_i32(in_c);
696             fns[vece](t_ptr, t_desc, t_32);
697         }
698     }
699 
700     tcg_temp_free_ptr(t_ptr);
701     return;
702 
703  done:
704     if (oprsz < maxsz) {
705         expand_clr(dofs + oprsz, maxsz - oprsz);
706     }
707 }
708 
709 /* Likewise, but with zero.  */
710 static void expand_clr(uint32_t dofs, uint32_t maxsz)
711 {
712     do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
713 }
714 
715 /* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
716 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
717                          bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
718 {
719     TCGv_i32 t0 = tcg_temp_new_i32();
720     TCGv_i32 t1 = tcg_temp_new_i32();
721     uint32_t i;
722 
723     for (i = 0; i < oprsz; i += 4) {
724         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
725         if (load_dest) {
726             tcg_gen_ld_i32(t1, tcg_env, dofs + i);
727         }
728         fni(t1, t0);
729         tcg_gen_st_i32(t1, tcg_env, dofs + i);
730     }
731     tcg_temp_free_i32(t0);
732     tcg_temp_free_i32(t1);
733 }
734 
735 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
736                           int32_t c, bool load_dest,
737                           void (*fni)(TCGv_i32, TCGv_i32, int32_t))
738 {
739     TCGv_i32 t0 = tcg_temp_new_i32();
740     TCGv_i32 t1 = tcg_temp_new_i32();
741     uint32_t i;
742 
743     for (i = 0; i < oprsz; i += 4) {
744         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
745         if (load_dest) {
746             tcg_gen_ld_i32(t1, tcg_env, dofs + i);
747         }
748         fni(t1, t0, c);
749         tcg_gen_st_i32(t1, tcg_env, dofs + i);
750     }
751     tcg_temp_free_i32(t0);
752     tcg_temp_free_i32(t1);
753 }
754 
755 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
756                           TCGv_i32 c, bool scalar_first,
757                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
758 {
759     TCGv_i32 t0 = tcg_temp_new_i32();
760     TCGv_i32 t1 = tcg_temp_new_i32();
761     uint32_t i;
762 
763     for (i = 0; i < oprsz; i += 4) {
764         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
765         if (scalar_first) {
766             fni(t1, c, t0);
767         } else {
768             fni(t1, t0, c);
769         }
770         tcg_gen_st_i32(t1, tcg_env, dofs + i);
771     }
772     tcg_temp_free_i32(t0);
773     tcg_temp_free_i32(t1);
774 }
775 
776 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
777 static void expand_3_i32(uint32_t dofs, uint32_t aofs,
778                          uint32_t bofs, uint32_t oprsz, bool load_dest,
779                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
780 {
781     TCGv_i32 t0 = tcg_temp_new_i32();
782     TCGv_i32 t1 = tcg_temp_new_i32();
783     TCGv_i32 t2 = tcg_temp_new_i32();
784     uint32_t i;
785 
786     for (i = 0; i < oprsz; i += 4) {
787         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
788         tcg_gen_ld_i32(t1, tcg_env, bofs + i);
789         if (load_dest) {
790             tcg_gen_ld_i32(t2, tcg_env, dofs + i);
791         }
792         fni(t2, t0, t1);
793         tcg_gen_st_i32(t2, tcg_env, dofs + i);
794     }
795     tcg_temp_free_i32(t2);
796     tcg_temp_free_i32(t1);
797     tcg_temp_free_i32(t0);
798 }
799 
800 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
801                           uint32_t oprsz, int32_t c,
802                           bool load_dest, bool write_aofs,
803                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
804 {
805     TCGv_i32 t0 = tcg_temp_new_i32();
806     TCGv_i32 t1 = tcg_temp_new_i32();
807     TCGv_i32 t2 = tcg_temp_new_i32();
808     uint32_t i;
809 
810     for (i = 0; i < oprsz; i += 4) {
811         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
812         tcg_gen_ld_i32(t1, tcg_env, bofs + i);
813         if (load_dest) {
814             tcg_gen_ld_i32(t2, tcg_env, dofs + i);
815         }
816         fni(t2, t0, t1, c);
817         tcg_gen_st_i32(t2, tcg_env, dofs + i);
818         if (write_aofs) {
819             tcg_gen_st_i32(t0, tcg_env, aofs + i);
820         }
821     }
822     tcg_temp_free_i32(t0);
823     tcg_temp_free_i32(t1);
824     tcg_temp_free_i32(t2);
825 }
826 
827 /* Expand OPSZ bytes worth of four-operand operations using i32 elements.  */
828 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
829                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
830                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
831 {
832     TCGv_i32 t0 = tcg_temp_new_i32();
833     TCGv_i32 t1 = tcg_temp_new_i32();
834     TCGv_i32 t2 = tcg_temp_new_i32();
835     TCGv_i32 t3 = tcg_temp_new_i32();
836     uint32_t i;
837 
838     for (i = 0; i < oprsz; i += 4) {
839         tcg_gen_ld_i32(t1, tcg_env, aofs + i);
840         tcg_gen_ld_i32(t2, tcg_env, bofs + i);
841         tcg_gen_ld_i32(t3, tcg_env, cofs + i);
842         fni(t0, t1, t2, t3);
843         tcg_gen_st_i32(t0, tcg_env, dofs + i);
844         if (write_aofs) {
845             tcg_gen_st_i32(t1, tcg_env, aofs + i);
846         }
847     }
848     tcg_temp_free_i32(t3);
849     tcg_temp_free_i32(t2);
850     tcg_temp_free_i32(t1);
851     tcg_temp_free_i32(t0);
852 }
853 
854 static void expand_4i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
855                           uint32_t cofs, uint32_t oprsz, int32_t c,
856                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32,
857                                       int32_t))
858 {
859     TCGv_i32 t0 = tcg_temp_new_i32();
860     TCGv_i32 t1 = tcg_temp_new_i32();
861     TCGv_i32 t2 = tcg_temp_new_i32();
862     TCGv_i32 t3 = tcg_temp_new_i32();
863     uint32_t i;
864 
865     for (i = 0; i < oprsz; i += 4) {
866         tcg_gen_ld_i32(t1, tcg_env, aofs + i);
867         tcg_gen_ld_i32(t2, tcg_env, bofs + i);
868         tcg_gen_ld_i32(t3, tcg_env, cofs + i);
869         fni(t0, t1, t2, t3, c);
870         tcg_gen_st_i32(t0, tcg_env, dofs + i);
871     }
872     tcg_temp_free_i32(t3);
873     tcg_temp_free_i32(t2);
874     tcg_temp_free_i32(t1);
875     tcg_temp_free_i32(t0);
876 }
877 
878 /* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
879 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
880                          bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
881 {
882     TCGv_i64 t0 = tcg_temp_new_i64();
883     TCGv_i64 t1 = tcg_temp_new_i64();
884     uint32_t i;
885 
886     for (i = 0; i < oprsz; i += 8) {
887         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
888         if (load_dest) {
889             tcg_gen_ld_i64(t1, tcg_env, dofs + i);
890         }
891         fni(t1, t0);
892         tcg_gen_st_i64(t1, tcg_env, dofs + i);
893     }
894     tcg_temp_free_i64(t0);
895     tcg_temp_free_i64(t1);
896 }
897 
898 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
899                           int64_t c, bool load_dest,
900                           void (*fni)(TCGv_i64, TCGv_i64, int64_t))
901 {
902     TCGv_i64 t0 = tcg_temp_new_i64();
903     TCGv_i64 t1 = tcg_temp_new_i64();
904     uint32_t i;
905 
906     for (i = 0; i < oprsz; i += 8) {
907         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
908         if (load_dest) {
909             tcg_gen_ld_i64(t1, tcg_env, dofs + i);
910         }
911         fni(t1, t0, c);
912         tcg_gen_st_i64(t1, tcg_env, dofs + i);
913     }
914     tcg_temp_free_i64(t0);
915     tcg_temp_free_i64(t1);
916 }
917 
918 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
919                           TCGv_i64 c, bool scalar_first,
920                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
921 {
922     TCGv_i64 t0 = tcg_temp_new_i64();
923     TCGv_i64 t1 = tcg_temp_new_i64();
924     uint32_t i;
925 
926     for (i = 0; i < oprsz; i += 8) {
927         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
928         if (scalar_first) {
929             fni(t1, c, t0);
930         } else {
931             fni(t1, t0, c);
932         }
933         tcg_gen_st_i64(t1, tcg_env, dofs + i);
934     }
935     tcg_temp_free_i64(t0);
936     tcg_temp_free_i64(t1);
937 }
938 
939 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
940 static void expand_3_i64(uint32_t dofs, uint32_t aofs,
941                          uint32_t bofs, uint32_t oprsz, bool load_dest,
942                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
943 {
944     TCGv_i64 t0 = tcg_temp_new_i64();
945     TCGv_i64 t1 = tcg_temp_new_i64();
946     TCGv_i64 t2 = tcg_temp_new_i64();
947     uint32_t i;
948 
949     for (i = 0; i < oprsz; i += 8) {
950         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
951         tcg_gen_ld_i64(t1, tcg_env, bofs + i);
952         if (load_dest) {
953             tcg_gen_ld_i64(t2, tcg_env, dofs + i);
954         }
955         fni(t2, t0, t1);
956         tcg_gen_st_i64(t2, tcg_env, dofs + i);
957     }
958     tcg_temp_free_i64(t2);
959     tcg_temp_free_i64(t1);
960     tcg_temp_free_i64(t0);
961 }
962 
963 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
964                           uint32_t oprsz, int64_t c,
965                           bool load_dest, bool write_aofs,
966                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
967 {
968     TCGv_i64 t0 = tcg_temp_new_i64();
969     TCGv_i64 t1 = tcg_temp_new_i64();
970     TCGv_i64 t2 = tcg_temp_new_i64();
971     uint32_t i;
972 
973     for (i = 0; i < oprsz; i += 8) {
974         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
975         tcg_gen_ld_i64(t1, tcg_env, bofs + i);
976         if (load_dest) {
977             tcg_gen_ld_i64(t2, tcg_env, dofs + i);
978         }
979         fni(t2, t0, t1, c);
980         tcg_gen_st_i64(t2, tcg_env, dofs + i);
981         if (write_aofs) {
982             tcg_gen_st_i64(t0, tcg_env, aofs + i);
983         }
984     }
985     tcg_temp_free_i64(t0);
986     tcg_temp_free_i64(t1);
987     tcg_temp_free_i64(t2);
988 }
989 
990 /* Expand OPSZ bytes worth of four-operand operations using i64 elements.  */
991 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
992                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
993                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
994 {
995     TCGv_i64 t0 = tcg_temp_new_i64();
996     TCGv_i64 t1 = tcg_temp_new_i64();
997     TCGv_i64 t2 = tcg_temp_new_i64();
998     TCGv_i64 t3 = tcg_temp_new_i64();
999     uint32_t i;
1000 
1001     for (i = 0; i < oprsz; i += 8) {
1002         tcg_gen_ld_i64(t1, tcg_env, aofs + i);
1003         tcg_gen_ld_i64(t2, tcg_env, bofs + i);
1004         tcg_gen_ld_i64(t3, tcg_env, cofs + i);
1005         fni(t0, t1, t2, t3);
1006         tcg_gen_st_i64(t0, tcg_env, dofs + i);
1007         if (write_aofs) {
1008             tcg_gen_st_i64(t1, tcg_env, aofs + i);
1009         }
1010     }
1011     tcg_temp_free_i64(t3);
1012     tcg_temp_free_i64(t2);
1013     tcg_temp_free_i64(t1);
1014     tcg_temp_free_i64(t0);
1015 }
1016 
1017 static void expand_4i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1018                           uint32_t cofs, uint32_t oprsz, int64_t c,
1019                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64,
1020                                       int64_t))
1021 {
1022     TCGv_i64 t0 = tcg_temp_new_i64();
1023     TCGv_i64 t1 = tcg_temp_new_i64();
1024     TCGv_i64 t2 = tcg_temp_new_i64();
1025     TCGv_i64 t3 = tcg_temp_new_i64();
1026     uint32_t i;
1027 
1028     for (i = 0; i < oprsz; i += 8) {
1029         tcg_gen_ld_i64(t1, tcg_env, aofs + i);
1030         tcg_gen_ld_i64(t2, tcg_env, bofs + i);
1031         tcg_gen_ld_i64(t3, tcg_env, cofs + i);
1032         fni(t0, t1, t2, t3, c);
1033         tcg_gen_st_i64(t0, tcg_env, dofs + i);
1034     }
1035     tcg_temp_free_i64(t3);
1036     tcg_temp_free_i64(t2);
1037     tcg_temp_free_i64(t1);
1038     tcg_temp_free_i64(t0);
1039 }
1040 
1041 /* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
1042 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1043                          uint32_t oprsz, uint32_t tysz, TCGType type,
1044                          bool load_dest,
1045                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
1046 {
1047     for (uint32_t i = 0; i < oprsz; i += tysz) {
1048         TCGv_vec t0 = tcg_temp_new_vec(type);
1049         TCGv_vec t1 = tcg_temp_new_vec(type);
1050 
1051         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1052         if (load_dest) {
1053             tcg_gen_ld_vec(t1, tcg_env, dofs + i);
1054         }
1055         fni(vece, t1, t0);
1056         tcg_gen_st_vec(t1, tcg_env, dofs + i);
1057     }
1058 }
1059 
1060 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
1061    using host vectors.  */
1062 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1063                           uint32_t oprsz, uint32_t tysz, TCGType type,
1064                           int64_t c, bool load_dest,
1065                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
1066 {
1067     for (uint32_t i = 0; i < oprsz; i += tysz) {
1068         TCGv_vec t0 = tcg_temp_new_vec(type);
1069         TCGv_vec t1 = tcg_temp_new_vec(type);
1070 
1071         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1072         if (load_dest) {
1073             tcg_gen_ld_vec(t1, tcg_env, dofs + i);
1074         }
1075         fni(vece, t1, t0, c);
1076         tcg_gen_st_vec(t1, tcg_env, dofs + i);
1077     }
1078 }
1079 
1080 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1081                           uint32_t oprsz, uint32_t tysz, TCGType type,
1082                           TCGv_vec c, bool scalar_first,
1083                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1084 {
1085     for (uint32_t i = 0; i < oprsz; i += tysz) {
1086         TCGv_vec t0 = tcg_temp_new_vec(type);
1087         TCGv_vec t1 = tcg_temp_new_vec(type);
1088 
1089         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1090         if (scalar_first) {
1091             fni(vece, t1, c, t0);
1092         } else {
1093             fni(vece, t1, t0, c);
1094         }
1095         tcg_gen_st_vec(t1, tcg_env, dofs + i);
1096     }
1097 }
1098 
1099 /* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
1100 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1101                          uint32_t bofs, uint32_t oprsz,
1102                          uint32_t tysz, TCGType type, bool load_dest,
1103                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1104 {
1105     for (uint32_t i = 0; i < oprsz; i += tysz) {
1106         TCGv_vec t0 = tcg_temp_new_vec(type);
1107         TCGv_vec t1 = tcg_temp_new_vec(type);
1108         TCGv_vec t2 = tcg_temp_new_vec(type);
1109 
1110         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1111         tcg_gen_ld_vec(t1, tcg_env, bofs + i);
1112         if (load_dest) {
1113             tcg_gen_ld_vec(t2, tcg_env, dofs + i);
1114         }
1115         fni(vece, t2, t0, t1);
1116         tcg_gen_st_vec(t2, tcg_env, dofs + i);
1117     }
1118 }
1119 
1120 /*
1121  * Expand OPSZ bytes worth of three-vector operands and an immediate operand
1122  * using host vectors.
1123  */
1124 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1125                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
1126                           TCGType type, int64_t c,
1127                           bool load_dest, bool write_aofs,
1128                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
1129                                       int64_t))
1130 {
1131     for (uint32_t i = 0; i < oprsz; i += tysz) {
1132         TCGv_vec t0 = tcg_temp_new_vec(type);
1133         TCGv_vec t1 = tcg_temp_new_vec(type);
1134         TCGv_vec t2 = tcg_temp_new_vec(type);
1135 
1136         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1137         tcg_gen_ld_vec(t1, tcg_env, bofs + i);
1138         if (load_dest) {
1139             tcg_gen_ld_vec(t2, tcg_env, dofs + i);
1140         }
1141         fni(vece, t2, t0, t1, c);
1142         tcg_gen_st_vec(t2, tcg_env, dofs + i);
1143         if (write_aofs) {
1144             tcg_gen_st_vec(t0, tcg_env, aofs + i);
1145         }
1146     }
1147 }
1148 
1149 /* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
1150 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1151                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1152                          uint32_t tysz, TCGType type, bool write_aofs,
1153                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1154                                      TCGv_vec, TCGv_vec))
1155 {
1156     for (uint32_t i = 0; i < oprsz; i += tysz) {
1157         TCGv_vec t0 = tcg_temp_new_vec(type);
1158         TCGv_vec t1 = tcg_temp_new_vec(type);
1159         TCGv_vec t2 = tcg_temp_new_vec(type);
1160         TCGv_vec t3 = tcg_temp_new_vec(type);
1161 
1162         tcg_gen_ld_vec(t1, tcg_env, aofs + i);
1163         tcg_gen_ld_vec(t2, tcg_env, bofs + i);
1164         tcg_gen_ld_vec(t3, tcg_env, cofs + i);
1165         fni(vece, t0, t1, t2, t3);
1166         tcg_gen_st_vec(t0, tcg_env, dofs + i);
1167         if (write_aofs) {
1168             tcg_gen_st_vec(t1, tcg_env, aofs + i);
1169         }
1170     }
1171 }
1172 
1173 /*
1174  * Expand OPSZ bytes worth of four-vector operands and an immediate operand
1175  * using host vectors.
1176  */
1177 static void expand_4i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1178                           uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1179                           uint32_t tysz, TCGType type, int64_t c,
1180                           void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1181                                      TCGv_vec, TCGv_vec, int64_t))
1182 {
1183     for (uint32_t i = 0; i < oprsz; i += tysz) {
1184         TCGv_vec t0 = tcg_temp_new_vec(type);
1185         TCGv_vec t1 = tcg_temp_new_vec(type);
1186         TCGv_vec t2 = tcg_temp_new_vec(type);
1187         TCGv_vec t3 = tcg_temp_new_vec(type);
1188 
1189         tcg_gen_ld_vec(t1, tcg_env, aofs + i);
1190         tcg_gen_ld_vec(t2, tcg_env, bofs + i);
1191         tcg_gen_ld_vec(t3, tcg_env, cofs + i);
1192         fni(vece, t0, t1, t2, t3, c);
1193         tcg_gen_st_vec(t0, tcg_env, dofs + i);
1194     }
1195 }
1196 
1197 /* Expand a vector two-operand operation.  */
1198 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
1199                     uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
1200 {
1201     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1202     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1203     TCGType type;
1204     uint32_t some;
1205 
1206     check_size_align(oprsz, maxsz, dofs | aofs);
1207     check_overlap_2(dofs, aofs, maxsz);
1208 
1209     type = 0;
1210     if (g->fniv) {
1211         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1212     }
1213     switch (type) {
1214     case TCG_TYPE_V256:
1215         /* Recall that ARM SVE allows vector sizes that are not a
1216          * power of 2, but always a multiple of 16.  The intent is
1217          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1218          */
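        /*
         * Editorial example (not in the original source): with oprsz == 80,
         * 'some' becomes 64 and is expanded as two V256 operations; the
         * remaining 16 bytes then fall through to the V128 case below.
         */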
1219         some = QEMU_ALIGN_DOWN(oprsz, 32);
1220         expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1221                      g->load_dest, g->fniv);
1222         if (some == oprsz) {
1223             break;
1224         }
1225         dofs += some;
1226         aofs += some;
1227         oprsz -= some;
1228         maxsz -= some;
1229         /* fallthru */
1230     case TCG_TYPE_V128:
1231         expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1232                      g->load_dest, g->fniv);
1233         break;
1234     case TCG_TYPE_V64:
1235         expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1236                      g->load_dest, g->fniv);
1237         break;
1238 
1239     case 0:
1240         if (g->fni8 && check_size_impl(oprsz, 8)) {
1241             expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
1242         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1243             expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
1244         } else {
1245             assert(g->fno != NULL);
1246             tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
1247             oprsz = maxsz;
1248         }
1249         break;
1250 
1251     default:
1252         g_assert_not_reached();
1253     }
1254     tcg_swap_vecop_list(hold_list);
1255 
1256     if (oprsz < maxsz) {
1257         expand_clr(dofs + oprsz, maxsz - oprsz);
1258     }
1259 }
1260 
1261 /* Expand a vector operation with two vectors and an immediate.  */
1262 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1263                      uint32_t maxsz, int64_t c, const GVecGen2i *g)
1264 {
1265     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1266     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1267     TCGType type;
1268     uint32_t some;
1269 
1270     check_size_align(oprsz, maxsz, dofs | aofs);
1271     check_overlap_2(dofs, aofs, maxsz);
1272 
1273     type = 0;
1274     if (g->fniv) {
1275         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1276     }
1277     switch (type) {
1278     case TCG_TYPE_V256:
1279         /* Recall that ARM SVE allows vector sizes that are not a
1280          * power of 2, but always a multiple of 16.  The intent is
1281          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1282          */
1283         some = QEMU_ALIGN_DOWN(oprsz, 32);
1284         expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1285                       c, g->load_dest, g->fniv);
1286         if (some == oprsz) {
1287             break;
1288         }
1289         dofs += some;
1290         aofs += some;
1291         oprsz -= some;
1292         maxsz -= some;
1293         /* fallthru */
1294     case TCG_TYPE_V128:
1295         expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1296                       c, g->load_dest, g->fniv);
1297         break;
1298     case TCG_TYPE_V64:
1299         expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1300                       c, g->load_dest, g->fniv);
1301         break;
1302 
1303     case 0:
1304         if (g->fni8 && check_size_impl(oprsz, 8)) {
1305             expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1306         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1307             expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1308         } else {
1309             if (g->fno) {
1310                 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1311             } else {
1312                 TCGv_i64 tcg_c = tcg_constant_i64(c);
1313                 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1314                                     maxsz, c, g->fnoi);
1315             }
1316             oprsz = maxsz;
1317         }
1318         break;
1319 
1320     default:
1321         g_assert_not_reached();
1322     }
1323     tcg_swap_vecop_list(hold_list);
1324 
1325     if (oprsz < maxsz) {
1326         expand_clr(dofs + oprsz, maxsz - oprsz);
1327     }
1328 }
1329 
1330 /* Expand a vector operation with two vectors and a scalar.  */
1331 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1332                      uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1333 {
1334     TCGType type;
1335 
1336     check_size_align(oprsz, maxsz, dofs | aofs);
1337     check_overlap_2(dofs, aofs, maxsz);
1338 
1339     type = 0;
1340     if (g->fniv) {
1341         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1342     }
1343     if (type != 0) {
1344         const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1345         const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1346         TCGv_vec t_vec = tcg_temp_new_vec(type);
1347         uint32_t some;
1348 
1349         tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1350 
1351         switch (type) {
1352         case TCG_TYPE_V256:
1353             /* Recall that ARM SVE allows vector sizes that are not a
1354              * power of 2, but always a multiple of 16.  The intent is
1355              * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1356              */
1357             some = QEMU_ALIGN_DOWN(oprsz, 32);
1358             expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1359                           t_vec, g->scalar_first, g->fniv);
1360             if (some == oprsz) {
1361                 break;
1362             }
1363             dofs += some;
1364             aofs += some;
1365             oprsz -= some;
1366             maxsz -= some;
1367             /* fallthru */
1368 
1369         case TCG_TYPE_V128:
1370             expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1371                           t_vec, g->scalar_first, g->fniv);
1372             break;
1373 
1374         case TCG_TYPE_V64:
1375             expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1376                           t_vec, g->scalar_first, g->fniv);
1377             break;
1378 
1379         default:
1380             g_assert_not_reached();
1381         }
1382         tcg_temp_free_vec(t_vec);
1383         tcg_swap_vecop_list(hold_list);
1384     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1385         TCGv_i64 t64 = tcg_temp_new_i64();
1386 
1387         tcg_gen_dup_i64(g->vece, t64, c);
1388         expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1389         tcg_temp_free_i64(t64);
1390     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1391         TCGv_i32 t32 = tcg_temp_new_i32();
1392 
1393         tcg_gen_extrl_i64_i32(t32, c);
1394         tcg_gen_dup_i32(g->vece, t32, t32);
1395         expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1396         tcg_temp_free_i32(t32);
1397     } else {
1398         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1399         return;
1400     }
1401 
1402     if (oprsz < maxsz) {
1403         expand_clr(dofs + oprsz, maxsz - oprsz);
1404     }
1405 }
1406 
1407 /* Expand a vector three-operand operation.  */
1408 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1409                     uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1410 {
1411     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1412     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1413     TCGType type;
1414     uint32_t some;
1415 
1416     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1417     check_overlap_3(dofs, aofs, bofs, maxsz);
1418 
1419     type = 0;
1420     if (g->fniv) {
1421         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1422     }
1423     switch (type) {
1424     case TCG_TYPE_V256:
1425         /* Recall that ARM SVE allows vector sizes that are not a
1426          * power of 2, but always a multiple of 16.  The intent is
1427          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1428          */
1429         some = QEMU_ALIGN_DOWN(oprsz, 32);
1430         expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1431                      g->load_dest, g->fniv);
1432         if (some == oprsz) {
1433             break;
1434         }
1435         dofs += some;
1436         aofs += some;
1437         bofs += some;
1438         oprsz -= some;
1439         maxsz -= some;
1440         /* fallthru */
1441     case TCG_TYPE_V128:
1442         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1443                      g->load_dest, g->fniv);
1444         break;
1445     case TCG_TYPE_V64:
1446         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1447                      g->load_dest, g->fniv);
1448         break;
1449 
1450     case 0:
1451         if (g->fni8 && check_size_impl(oprsz, 8)) {
1452             expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1453         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1454             expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1455         } else {
1456             assert(g->fno != NULL);
1457             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
1458                                maxsz, g->data, g->fno);
1459             oprsz = maxsz;
1460         }
1461         break;
1462 
1463     default:
1464         g_assert_not_reached();
1465     }
1466     tcg_swap_vecop_list(hold_list);
1467 
1468     if (oprsz < maxsz) {
1469         expand_clr(dofs + oprsz, maxsz - oprsz);
1470     }
1471 }
1472 
1473 /* Expand a vector operation with three vectors and an immediate.  */
1474 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1475                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1476                      const GVecGen3i *g)
1477 {
1478     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1479     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1480     TCGType type;
1481     uint32_t some;
1482 
1483     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1484     check_overlap_3(dofs, aofs, bofs, maxsz);
1485 
1486     type = 0;
1487     if (g->fniv) {
1488         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1489     }
1490     switch (type) {
1491     case TCG_TYPE_V256:
1492         /*
1493          * Recall that ARM SVE allows vector sizes that are not a
1494          * power of 2, but always a multiple of 16.  The intent is
1495          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1496          */
1497         some = QEMU_ALIGN_DOWN(oprsz, 32);
1498         expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1499                       c, g->load_dest, g->write_aofs, g->fniv);
1500         if (some == oprsz) {
1501             break;
1502         }
1503         dofs += some;
1504         aofs += some;
1505         bofs += some;
1506         oprsz -= some;
1507         maxsz -= some;
1508         /* fallthru */
1509     case TCG_TYPE_V128:
1510         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1511                       c, g->load_dest, g->write_aofs, g->fniv);
1512         break;
1513     case TCG_TYPE_V64:
1514         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1515                       c, g->load_dest, g->write_aofs, g->fniv);
1516         break;
1517 
1518     case 0:
1519         if (g->fni8 && check_size_impl(oprsz, 8)) {
1520             expand_3i_i64(dofs, aofs, bofs, oprsz, c,
1521                           g->load_dest, g->write_aofs, g->fni8);
1522         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1523             expand_3i_i32(dofs, aofs, bofs, oprsz, c,
1524                           g->load_dest, g->write_aofs, g->fni4);
1525         } else {
1526             assert(g->fno != NULL);
1527             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
1528             oprsz = maxsz;
1529         }
1530         break;
1531 
1532     default:
1533         g_assert_not_reached();
1534     }
1535     tcg_swap_vecop_list(hold_list);
1536 
1537     if (oprsz < maxsz) {
1538         expand_clr(dofs + oprsz, maxsz - oprsz);
1539     }
1540 }
1541 
1542 /* Expand a vector four-operand operation.  */
1543 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1544                     uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1545 {
1546     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1547     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1548     TCGType type;
1549     uint32_t some;
1550 
1551     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1552     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1553 
1554     type = 0;
1555     if (g->fniv) {
1556         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1557     }
1558     switch (type) {
1559     case TCG_TYPE_V256:
1560         /* Recall that ARM SVE allows vector sizes that are not a
1561          * power of 2, but always a multiple of 16.  The intent is
1562          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1563          */
1564         some = QEMU_ALIGN_DOWN(oprsz, 32);
1565         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1566                      32, TCG_TYPE_V256, g->write_aofs, g->fniv);
1567         if (some == oprsz) {
1568             break;
1569         }
1570         dofs += some;
1571         aofs += some;
1572         bofs += some;
1573         cofs += some;
1574         oprsz -= some;
1575         maxsz -= some;
1576         /* fallthru */
1577     case TCG_TYPE_V128:
1578         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1579                      16, TCG_TYPE_V128, g->write_aofs, g->fniv);
1580         break;
1581     case TCG_TYPE_V64:
1582         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1583                      8, TCG_TYPE_V64, g->write_aofs, g->fniv);
1584         break;
1585 
1586     case 0:
1587         if (g->fni8 && check_size_impl(oprsz, 8)) {
1588             expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
1589                          g->write_aofs, g->fni8);
1590         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1591             expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
1592                          g->write_aofs, g->fni4);
1593         } else {
1594             assert(g->fno != NULL);
1595             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1596                                oprsz, maxsz, g->data, g->fno);
1597             oprsz = maxsz;
1598         }
1599         break;
1600 
1601     default:
1602         g_assert_not_reached();
1603     }
1604     tcg_swap_vecop_list(hold_list);
1605 
1606     if (oprsz < maxsz) {
1607         expand_clr(dofs + oprsz, maxsz - oprsz);
1608     }
1609 }
1610 
1611 /* Expand a vector four-operand operation with an immediate.  */
1612 void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1613                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1614                      const GVecGen4i *g)
1615 {
1616     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1617     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1618     TCGType type;
1619     uint32_t some;
1620 
1621     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1622     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1623 
1624     type = 0;
1625     if (g->fniv) {
1626         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1627     }
1628     switch (type) {
1629     case TCG_TYPE_V256:
1630         /*
1631          * Recall that ARM SVE allows vector sizes that are not a
1632          * power of 2, but always a multiple of 16.  The intent is
1633          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1634          */
1635         some = QEMU_ALIGN_DOWN(oprsz, 32);
1636         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, some,
1637                       32, TCG_TYPE_V256, c, g->fniv);
1638         if (some == oprsz) {
1639             break;
1640         }
1641         dofs += some;
1642         aofs += some;
1643         bofs += some;
1644         cofs += some;
1645         oprsz -= some;
1646         maxsz -= some;
1647         /* fallthru */
1648     case TCG_TYPE_V128:
1649         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1650                       16, TCG_TYPE_V128, c, g->fniv);
1651         break;
1652     case TCG_TYPE_V64:
1653         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1654                       8, TCG_TYPE_V64, c, g->fniv);
1655         break;
1656 
1657     case 0:
1658         if (g->fni8 && check_size_impl(oprsz, 8)) {
1659             expand_4i_i64(dofs, aofs, bofs, cofs, oprsz, c, g->fni8);
1660         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1661             expand_4i_i32(dofs, aofs, bofs, cofs, oprsz, c, g->fni4);
1662         } else {
1663             assert(g->fno != NULL);
1664             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1665                                oprsz, maxsz, c, g->fno);
1666             oprsz = maxsz;
1667         }
1668         break;
1669 
1670     default:
1671         g_assert_not_reached();
1672     }
1673     tcg_swap_vecop_list(hold_list);
1674 
1675     if (oprsz < maxsz) {
1676         expand_clr(dofs + oprsz, maxsz - oprsz);
1677     }
1678 }
1679 
1680 /*
1681  * Expand specific vector operations.
1682  */
1683 
1684 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
1685 {
1686     tcg_gen_mov_vec(a, b);
1687 }
1688 
1689 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1690                       uint32_t oprsz, uint32_t maxsz)
1691 {
1692     static const GVecGen2 g = {
1693         .fni8 = tcg_gen_mov_i64,
1694         .fniv = vec_mov2,
1695         .fno = gen_helper_gvec_mov,
1696         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1697     };
1698     if (dofs != aofs) {
1699         tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1700     } else {
1701         check_size_align(oprsz, maxsz, dofs);
1702         if (oprsz < maxsz) {
1703             expand_clr(dofs + oprsz, maxsz - oprsz);
1704         }
1705     }
1706 }
1707 
1708 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
1709                           uint32_t maxsz, TCGv_i32 in)
1710 {
1711     check_size_align(oprsz, maxsz, dofs);
1712     tcg_debug_assert(vece <= MO_32);
1713     do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1714 }
1715 
1716 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
1717                           uint32_t maxsz, TCGv_i64 in)
1718 {
1719     check_size_align(oprsz, maxsz, dofs);
1720     tcg_debug_assert(vece <= MO_64);
1721     do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1722 }
1723 
1724 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1725                           uint32_t oprsz, uint32_t maxsz)
1726 {
1727     check_size_align(oprsz, maxsz, dofs);
1728     if (vece <= MO_64) {
1729         TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
1730         if (type != 0) {
1731             TCGv_vec t_vec = tcg_temp_new_vec(type);
1732             tcg_gen_dup_mem_vec(vece, t_vec, tcg_env, aofs);
1733             do_dup_store(type, dofs, oprsz, maxsz, t_vec);
1734         } else if (vece <= MO_32) {
1735             TCGv_i32 in = tcg_temp_ebb_new_i32();
1736             switch (vece) {
1737             case MO_8:
1738                 tcg_gen_ld8u_i32(in, tcg_env, aofs);
1739                 break;
1740             case MO_16:
1741                 tcg_gen_ld16u_i32(in, tcg_env, aofs);
1742                 break;
1743             default:
1744                 tcg_gen_ld_i32(in, tcg_env, aofs);
1745                 break;
1746             }
1747             do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1748             tcg_temp_free_i32(in);
1749         } else {
1750             TCGv_i64 in = tcg_temp_ebb_new_i64();
1751             tcg_gen_ld_i64(in, tcg_env, aofs);
1752             do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1753             tcg_temp_free_i64(in);
1754         }
1755     } else if (vece == 4) {
1756         /* 128-bit duplicate.  */
1757         int i;
1758 
1759         tcg_debug_assert(oprsz >= 16);
1760         if (TCG_TARGET_HAS_v128) {
1761             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1762 
1763             tcg_gen_ld_vec(in, tcg_env, aofs);
1764             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1765                 tcg_gen_st_vec(in, tcg_env, dofs + i);
1766             }
1767         } else {
1768             TCGv_i64 in0 = tcg_temp_ebb_new_i64();
1769             TCGv_i64 in1 = tcg_temp_ebb_new_i64();
1770 
1771             tcg_gen_ld_i64(in0, tcg_env, aofs);
1772             tcg_gen_ld_i64(in1, tcg_env, aofs + 8);
1773             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1774                 tcg_gen_st_i64(in0, tcg_env, dofs + i);
1775                 tcg_gen_st_i64(in1, tcg_env, dofs + i + 8);
1776             }
1777             tcg_temp_free_i64(in0);
1778             tcg_temp_free_i64(in1);
1779         }
1780         if (oprsz < maxsz) {
1781             expand_clr(dofs + oprsz, maxsz - oprsz);
1782         }
1783     } else if (vece == 5) {
1784         /* 256-bit duplicate.  */
1785         int i;
1786 
1787         tcg_debug_assert(oprsz >= 32);
1788         tcg_debug_assert(oprsz % 32 == 0);
1789         if (TCG_TARGET_HAS_v256) {
1790             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
1791 
1792             tcg_gen_ld_vec(in, tcg_env, aofs);
1793             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1794                 tcg_gen_st_vec(in, tcg_env, dofs + i);
1795             }
1796         } else if (TCG_TARGET_HAS_v128) {
1797             TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
1798             TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
1799 
1800             tcg_gen_ld_vec(in0, tcg_env, aofs);
1801             tcg_gen_ld_vec(in1, tcg_env, aofs + 16);
1802             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1803                 tcg_gen_st_vec(in0, tcg_env, dofs + i);
1804                 tcg_gen_st_vec(in1, tcg_env, dofs + i + 16);
1805             }
1806         } else {
1807             TCGv_i64 in[4];
1808             int j;
1809 
1810             for (j = 0; j < 4; ++j) {
1811                 in[j] = tcg_temp_ebb_new_i64();
1812                 tcg_gen_ld_i64(in[j], tcg_env, aofs + j * 8);
1813             }
1814             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1815                 for (j = 0; j < 4; ++j) {
1816                     tcg_gen_st_i64(in[j], tcg_env, dofs + i + j * 8);
1817                 }
1818             }
1819             for (j = 0; j < 4; ++j) {
1820                 tcg_temp_free_i64(in[j]);
1821             }
1822         }
1823         if (oprsz < maxsz) {
1824             expand_clr(dofs + oprsz, maxsz - oprsz);
1825         }
1826     } else {
1827         g_assert_not_reached();
1828     }
1829 }
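/*
 * Note on the 128- and 256-bit duplicate cases above: the store loops
 * start at index 16 or 32 when dofs == aofs, because the first element
 * is then already in place and only the remaining copies need storing.
 */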
1830 
1831 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
1832                           uint32_t maxsz, uint64_t x)
1833 {
1834     check_size_align(oprsz, maxsz, dofs);
1835     do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
1836 }
1837 
1838 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1839                       uint32_t oprsz, uint32_t maxsz)
1840 {
1841     static const GVecGen2 g = {
1842         .fni8 = tcg_gen_not_i64,
1843         .fniv = tcg_gen_not_vec,
1844         .fno = gen_helper_gvec_not,
1845         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1846     };
1847     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1848 }
1849 
1850 /* Perform a vector addition using normal addition and a mask.  The mask
1851    should be the sign bit of each lane.  This 6-operation form is more
1852    efficient than separate additions when there are 4 or more lanes in
1853    the 64-bit operation.  */
1854 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1855 {
1856     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
1857     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
1858     TCGv_i64 t3 = tcg_temp_ebb_new_i64();
1859 
1860     tcg_gen_andc_i64(t1, a, m);
1861     tcg_gen_andc_i64(t2, b, m);
1862     tcg_gen_xor_i64(t3, a, b);
1863     tcg_gen_add_i64(d, t1, t2);
1864     tcg_gen_and_i64(t3, t3, m);
1865     tcg_gen_xor_i64(d, d, t3);
1866 
1867     tcg_temp_free_i64(t1);
1868     tcg_temp_free_i64(t2);
1869     tcg_temp_free_i64(t3);
1870 }
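/*
 * Illustrative worked example (not in the original source), MO_8 lanes
 * with m = dup(0x80):  a = 0xff, b = 0x01 in one byte lane.
 *   t1 = a & ~m = 0x7f,  t2 = b & ~m = 0x01,  t3 = (a ^ b) & m = 0x80
 *   d  = t1 + t2 = 0x80;  d ^= t3  ->  0x00, i.e. 0xff + 0x01 mod 256.
 * Clearing the sign bit of every lane before the wide add guarantees
 * that no carry can propagate into the neighbouring lane; the final xor
 * then restores the correct top bit of each lane.
 */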
1871 
1872 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1873 {
1874     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
1875     gen_addv_mask(d, a, b, m);
1876 }
1877 
1878 void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1879 {
1880     TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
1881     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
1882     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
1883     TCGv_i32 t3 = tcg_temp_ebb_new_i32();
1884 
1885     tcg_gen_andc_i32(t1, a, m);
1886     tcg_gen_andc_i32(t2, b, m);
1887     tcg_gen_xor_i32(t3, a, b);
1888     tcg_gen_add_i32(d, t1, t2);
1889     tcg_gen_and_i32(t3, t3, m);
1890     tcg_gen_xor_i32(d, d, t3);
1891 
1892     tcg_temp_free_i32(t1);
1893     tcg_temp_free_i32(t2);
1894     tcg_temp_free_i32(t3);
1895 }
1896 
1897 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1898 {
1899     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
1900     gen_addv_mask(d, a, b, m);
1901 }
1902 
1903 void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1904 {
1905     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
1906     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
1907 
1908     tcg_gen_andi_i32(t1, a, ~0xffff);
1909     tcg_gen_add_i32(t2, a, b);
1910     tcg_gen_add_i32(t1, t1, b);
1911     tcg_gen_deposit_i32(d, t1, t2, 0, 16);
1912 
1913     tcg_temp_free_i32(t1);
1914     tcg_temp_free_i32(t2);
1915 }
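/*
 * Two 16-bit lanes in one i32: t2 computes the low lane (any carry into
 * the high half is discarded by the deposit), while t1 computes the high
 * lane from a copy of 'a' with its low half cleared, so b's low half
 * cannot carry into it.  tcg_gen_vec_add32_i64 below uses the same
 * split-and-deposit approach for two 32-bit lanes.
 */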
1916 
1917 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1918 {
1919     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
1920     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
1921 
1922     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1923     tcg_gen_add_i64(t2, a, b);
1924     tcg_gen_add_i64(t1, t1, b);
1925     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1926 
1927     tcg_temp_free_i64(t1);
1928     tcg_temp_free_i64(t2);
1929 }
1930 
1931 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1932 
1933 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1934                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1935 {
1936     static const GVecGen3 g[4] = {
1937         { .fni8 = tcg_gen_vec_add8_i64,
1938           .fniv = tcg_gen_add_vec,
1939           .fno = gen_helper_gvec_add8,
1940           .opt_opc = vecop_list_add,
1941           .vece = MO_8 },
1942         { .fni8 = tcg_gen_vec_add16_i64,
1943           .fniv = tcg_gen_add_vec,
1944           .fno = gen_helper_gvec_add16,
1945           .opt_opc = vecop_list_add,
1946           .vece = MO_16 },
1947         { .fni4 = tcg_gen_add_i32,
1948           .fniv = tcg_gen_add_vec,
1949           .fno = gen_helper_gvec_add32,
1950           .opt_opc = vecop_list_add,
1951           .vece = MO_32 },
1952         { .fni8 = tcg_gen_add_i64,
1953           .fniv = tcg_gen_add_vec,
1954           .fno = gen_helper_gvec_add64,
1955           .opt_opc = vecop_list_add,
1956           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1957           .vece = MO_64 },
1958     };
1959 
1960     tcg_debug_assert(vece <= MO_64);
1961     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1962 }
1963 
1964 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1965                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1966 {
1967     static const GVecGen2s g[4] = {
1968         { .fni8 = tcg_gen_vec_add8_i64,
1969           .fniv = tcg_gen_add_vec,
1970           .fno = gen_helper_gvec_adds8,
1971           .opt_opc = vecop_list_add,
1972           .vece = MO_8 },
1973         { .fni8 = tcg_gen_vec_add16_i64,
1974           .fniv = tcg_gen_add_vec,
1975           .fno = gen_helper_gvec_adds16,
1976           .opt_opc = vecop_list_add,
1977           .vece = MO_16 },
1978         { .fni4 = tcg_gen_add_i32,
1979           .fniv = tcg_gen_add_vec,
1980           .fno = gen_helper_gvec_adds32,
1981           .opt_opc = vecop_list_add,
1982           .vece = MO_32 },
1983         { .fni8 = tcg_gen_add_i64,
1984           .fniv = tcg_gen_add_vec,
1985           .fno = gen_helper_gvec_adds64,
1986           .opt_opc = vecop_list_add,
1987           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1988           .vece = MO_64 },
1989     };
1990 
1991     tcg_debug_assert(vece <= MO_64);
1992     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1993 }
1994 
1995 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
1996                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1997 {
1998     TCGv_i64 tmp = tcg_constant_i64(c);
1999     tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
2000 }
2001 
2002 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
2003 
2004 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
2005                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2006 {
2007     static const GVecGen2s g[4] = {
2008         { .fni8 = tcg_gen_vec_sub8_i64,
2009           .fniv = tcg_gen_sub_vec,
2010           .fno = gen_helper_gvec_subs8,
2011           .opt_opc = vecop_list_sub,
2012           .vece = MO_8 },
2013         { .fni8 = tcg_gen_vec_sub16_i64,
2014           .fniv = tcg_gen_sub_vec,
2015           .fno = gen_helper_gvec_subs16,
2016           .opt_opc = vecop_list_sub,
2017           .vece = MO_16 },
2018         { .fni4 = tcg_gen_sub_i32,
2019           .fniv = tcg_gen_sub_vec,
2020           .fno = gen_helper_gvec_subs32,
2021           .opt_opc = vecop_list_sub,
2022           .vece = MO_32 },
2023         { .fni8 = tcg_gen_sub_i64,
2024           .fniv = tcg_gen_sub_vec,
2025           .fno = gen_helper_gvec_subs64,
2026           .opt_opc = vecop_list_sub,
2027           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2028           .vece = MO_64 },
2029     };
2030 
2031     tcg_debug_assert(vece <= MO_64);
2032     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2033 }
2034 
2035 /* Perform a vector subtraction using normal subtraction and a mask.
2036    Compare gen_addv_mask above.  */
2037 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
2038 {
2039     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2040     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2041     TCGv_i64 t3 = tcg_temp_ebb_new_i64();
2042 
2043     tcg_gen_or_i64(t1, a, m);
2044     tcg_gen_andc_i64(t2, b, m);
2045     tcg_gen_eqv_i64(t3, a, b);
2046     tcg_gen_sub_i64(d, t1, t2);
2047     tcg_gen_and_i64(t3, t3, m);
2048     tcg_gen_xor_i64(d, d, t3);
2049 
2050     tcg_temp_free_i64(t1);
2051     tcg_temp_free_i64(t2);
2052     tcg_temp_free_i64(t3);
2053 }
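/*
 * Illustrative worked example (not in the original source), MO_8 lanes
 * with m = dup(0x80):  a = 0x00, b = 0x01 in one byte lane.
 *   t1 = a | m = 0x80,  t2 = b & ~m = 0x01,  t3 = ~(a ^ b) & m = 0x80
 *   d  = t1 - t2 = 0x7f;  d ^= t3  ->  0xff, i.e. 0x00 - 0x01 mod 256.
 * Forcing the sign bit of each 'a' lane to 1 and clearing it in 'b'
 * ensures no borrow crosses into the neighbouring lane; the xor with
 * the masked eqv restores the correct top bit of each lane.
 */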
2054 
2055 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2056 {
2057     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2058     gen_subv_mask(d, a, b, m);
2059 }
2060 
2061 void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2062 {
2063     TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
2064     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
2065     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
2066     TCGv_i32 t3 = tcg_temp_ebb_new_i32();
2067 
2068     tcg_gen_or_i32(t1, a, m);
2069     tcg_gen_andc_i32(t2, b, m);
2070     tcg_gen_eqv_i32(t3, a, b);
2071     tcg_gen_sub_i32(d, t1, t2);
2072     tcg_gen_and_i32(t3, t3, m);
2073     tcg_gen_xor_i32(d, d, t3);
2074 
2075     tcg_temp_free_i32(t1);
2076     tcg_temp_free_i32(t2);
2077     tcg_temp_free_i32(t3);
2078 }
2079 
2080 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2081 {
2082     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2083     gen_subv_mask(d, a, b, m);
2084 }
2085 
2086 void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2087 {
2088     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
2089     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
2090 
2091     tcg_gen_andi_i32(t1, b, ~0xffff);
2092     tcg_gen_sub_i32(t2, a, b);
2093     tcg_gen_sub_i32(t1, a, t1);
2094     tcg_gen_deposit_i32(d, t1, t2, 0, 16);
2095 
2096     tcg_temp_free_i32(t1);
2097     tcg_temp_free_i32(t2);
2098 }
2099 
2100 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2101 {
2102     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2103     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2104 
2105     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2106     tcg_gen_sub_i64(t2, a, b);
2107     tcg_gen_sub_i64(t1, a, t1);
2108     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2109 
2110     tcg_temp_free_i64(t1);
2111     tcg_temp_free_i64(t2);
2112 }
2113 
2114 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
2115                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2116 {
2117     static const GVecGen3 g[4] = {
2118         { .fni8 = tcg_gen_vec_sub8_i64,
2119           .fniv = tcg_gen_sub_vec,
2120           .fno = gen_helper_gvec_sub8,
2121           .opt_opc = vecop_list_sub,
2122           .vece = MO_8 },
2123         { .fni8 = tcg_gen_vec_sub16_i64,
2124           .fniv = tcg_gen_sub_vec,
2125           .fno = gen_helper_gvec_sub16,
2126           .opt_opc = vecop_list_sub,
2127           .vece = MO_16 },
2128         { .fni4 = tcg_gen_sub_i32,
2129           .fniv = tcg_gen_sub_vec,
2130           .fno = gen_helper_gvec_sub32,
2131           .opt_opc = vecop_list_sub,
2132           .vece = MO_32 },
2133         { .fni8 = tcg_gen_sub_i64,
2134           .fniv = tcg_gen_sub_vec,
2135           .fno = gen_helper_gvec_sub64,
2136           .opt_opc = vecop_list_sub,
2137           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2138           .vece = MO_64 },
2139     };
2140 
2141     tcg_debug_assert(vece <= MO_64);
2142     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2143 }
2144 
2145 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
2146 
2147 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
2148                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2149 {
2150     static const GVecGen3 g[4] = {
2151         { .fniv = tcg_gen_mul_vec,
2152           .fno = gen_helper_gvec_mul8,
2153           .opt_opc = vecop_list_mul,
2154           .vece = MO_8 },
2155         { .fniv = tcg_gen_mul_vec,
2156           .fno = gen_helper_gvec_mul16,
2157           .opt_opc = vecop_list_mul,
2158           .vece = MO_16 },
2159         { .fni4 = tcg_gen_mul_i32,
2160           .fniv = tcg_gen_mul_vec,
2161           .fno = gen_helper_gvec_mul32,
2162           .opt_opc = vecop_list_mul,
2163           .vece = MO_32 },
2164         { .fni8 = tcg_gen_mul_i64,
2165           .fniv = tcg_gen_mul_vec,
2166           .fno = gen_helper_gvec_mul64,
2167           .opt_opc = vecop_list_mul,
2168           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2169           .vece = MO_64 },
2170     };
2171 
2172     tcg_debug_assert(vece <= MO_64);
2173     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2174 }
2175 
2176 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
2177                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2178 {
2179     static const GVecGen2s g[4] = {
2180         { .fniv = tcg_gen_mul_vec,
2181           .fno = gen_helper_gvec_muls8,
2182           .opt_opc = vecop_list_mul,
2183           .vece = MO_8 },
2184         { .fniv = tcg_gen_mul_vec,
2185           .fno = gen_helper_gvec_muls16,
2186           .opt_opc = vecop_list_mul,
2187           .vece = MO_16 },
2188         { .fni4 = tcg_gen_mul_i32,
2189           .fniv = tcg_gen_mul_vec,
2190           .fno = gen_helper_gvec_muls32,
2191           .opt_opc = vecop_list_mul,
2192           .vece = MO_32 },
2193         { .fni8 = tcg_gen_mul_i64,
2194           .fniv = tcg_gen_mul_vec,
2195           .fno = gen_helper_gvec_muls64,
2196           .opt_opc = vecop_list_mul,
2197           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2198           .vece = MO_64 },
2199     };
2200 
2201     tcg_debug_assert(vece <= MO_64);
2202     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2203 }
2204 
2205 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
2206                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2207 {
2208     TCGv_i64 tmp = tcg_constant_i64(c);
2209     tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
2210 }
2211 
2212 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2213                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2214 {
2215     static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
2216     static const GVecGen3 g[4] = {
2217         { .fniv = tcg_gen_ssadd_vec,
2218           .fno = gen_helper_gvec_ssadd8,
2219           .opt_opc = vecop_list,
2220           .vece = MO_8 },
2221         { .fniv = tcg_gen_ssadd_vec,
2222           .fno = gen_helper_gvec_ssadd16,
2223           .opt_opc = vecop_list,
2224           .vece = MO_16 },
2225         { .fniv = tcg_gen_ssadd_vec,
2226           .fno = gen_helper_gvec_ssadd32,
2227           .opt_opc = vecop_list,
2228           .vece = MO_32 },
2229         { .fniv = tcg_gen_ssadd_vec,
2230           .fno = gen_helper_gvec_ssadd64,
2231           .opt_opc = vecop_list,
2232           .vece = MO_64 },
2233     };
2234     tcg_debug_assert(vece <= MO_64);
2235     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2236 }
2237 
2238 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
2239                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2240 {
2241     static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
2242     static const GVecGen3 g[4] = {
2243         { .fniv = tcg_gen_sssub_vec,
2244           .fno = gen_helper_gvec_sssub8,
2245           .opt_opc = vecop_list,
2246           .vece = MO_8 },
2247         { .fniv = tcg_gen_sssub_vec,
2248           .fno = gen_helper_gvec_sssub16,
2249           .opt_opc = vecop_list,
2250           .vece = MO_16 },
2251         { .fniv = tcg_gen_sssub_vec,
2252           .fno = gen_helper_gvec_sssub32,
2253           .opt_opc = vecop_list,
2254           .vece = MO_32 },
2255         { .fniv = tcg_gen_sssub_vec,
2256           .fno = gen_helper_gvec_sssub64,
2257           .opt_opc = vecop_list,
2258           .vece = MO_64 },
2259     };
2260     tcg_debug_assert(vece <= MO_64);
2261     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2262 }
2263 
2264 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2265 {
2266     TCGv_i32 max = tcg_constant_i32(-1);
2267     tcg_gen_add_i32(d, a, b);
2268     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
2269 }
2270 
2271 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2272 {
2273     TCGv_i64 max = tcg_constant_i64(-1);
2274     tcg_gen_add_i64(d, a, b);
2275     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
2276 }
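/*
 * The two helpers above are the integer fallbacks for unsigned
 * saturating addition (used only by the MO_32 and MO_64 entries below):
 * if the wrapped sum is less than one of the operands the addition
 * overflowed, so movcond clamps the result to all-ones.  E.g. for i32,
 * 0xffffffff + 1 wraps to 0, which is < a, giving 0xffffffff.
 */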
2277 
2278 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2279                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2280 {
2281     static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
2282     static const GVecGen3 g[4] = {
2283         { .fniv = tcg_gen_usadd_vec,
2284           .fno = gen_helper_gvec_usadd8,
2285           .opt_opc = vecop_list,
2286           .vece = MO_8 },
2287         { .fniv = tcg_gen_usadd_vec,
2288           .fno = gen_helper_gvec_usadd16,
2289           .opt_opc = vecop_list,
2290           .vece = MO_16 },
2291         { .fni4 = tcg_gen_usadd_i32,
2292           .fniv = tcg_gen_usadd_vec,
2293           .fno = gen_helper_gvec_usadd32,
2294           .opt_opc = vecop_list,
2295           .vece = MO_32 },
2296         { .fni8 = tcg_gen_usadd_i64,
2297           .fniv = tcg_gen_usadd_vec,
2298           .fno = gen_helper_gvec_usadd64,
2299           .opt_opc = vecop_list,
2300           .vece = MO_64 }
2301     };
2302     tcg_debug_assert(vece <= MO_64);
2303     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2304 }
2305 
2306 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2307 {
2308     TCGv_i32 min = tcg_constant_i32(0);
2309     tcg_gen_sub_i32(d, a, b);
2310     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
2311 }
2312 
2313 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2314 {
2315     TCGv_i64 min = tcg_constant_i64(0);
2316     tcg_gen_sub_i64(d, a, b);
2317     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
2318 }
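/*
 * The two helpers above are the integer fallbacks for unsigned
 * saturating subtraction (used only by the MO_32 and MO_64 entries
 * below): if a < b the subtraction would underflow, so movcond clamps
 * the result to zero.  E.g. for i32, 1 - 2 yields 0, not 0xffffffff.
 */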
2319 
2320 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
2321                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2322 {
2323     static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2324     static const GVecGen3 g[4] = {
2325         { .fniv = tcg_gen_ussub_vec,
2326           .fno = gen_helper_gvec_ussub8,
2327           .opt_opc = vecop_list,
2328           .vece = MO_8 },
2329         { .fniv = tcg_gen_ussub_vec,
2330           .fno = gen_helper_gvec_ussub16,
2331           .opt_opc = vecop_list,
2332           .vece = MO_16 },
2333         { .fni4 = tcg_gen_ussub_i32,
2334           .fniv = tcg_gen_ussub_vec,
2335           .fno = gen_helper_gvec_ussub32,
2336           .opt_opc = vecop_list,
2337           .vece = MO_32 },
2338         { .fni8 = tcg_gen_ussub_i64,
2339           .fniv = tcg_gen_ussub_vec,
2340           .fno = gen_helper_gvec_ussub64,
2341           .opt_opc = vecop_list,
2342           .vece = MO_64 }
2343     };
2344     tcg_debug_assert(vece <= MO_64);
2345     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2346 }
2347 
2348 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2349                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2350 {
2351     static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2352     static const GVecGen3 g[4] = {
2353         { .fniv = tcg_gen_smin_vec,
2354           .fno = gen_helper_gvec_smin8,
2355           .opt_opc = vecop_list,
2356           .vece = MO_8 },
2357         { .fniv = tcg_gen_smin_vec,
2358           .fno = gen_helper_gvec_smin16,
2359           .opt_opc = vecop_list,
2360           .vece = MO_16 },
2361         { .fni4 = tcg_gen_smin_i32,
2362           .fniv = tcg_gen_smin_vec,
2363           .fno = gen_helper_gvec_smin32,
2364           .opt_opc = vecop_list,
2365           .vece = MO_32 },
2366         { .fni8 = tcg_gen_smin_i64,
2367           .fniv = tcg_gen_smin_vec,
2368           .fno = gen_helper_gvec_smin64,
2369           .opt_opc = vecop_list,
2370           .vece = MO_64 }
2371     };
2372     tcg_debug_assert(vece <= MO_64);
2373     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2374 }
2375 
2376 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2377                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2378 {
2379     static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2380     static const GVecGen3 g[4] = {
2381         { .fniv = tcg_gen_umin_vec,
2382           .fno = gen_helper_gvec_umin8,
2383           .opt_opc = vecop_list,
2384           .vece = MO_8 },
2385         { .fniv = tcg_gen_umin_vec,
2386           .fno = gen_helper_gvec_umin16,
2387           .opt_opc = vecop_list,
2388           .vece = MO_16 },
2389         { .fni4 = tcg_gen_umin_i32,
2390           .fniv = tcg_gen_umin_vec,
2391           .fno = gen_helper_gvec_umin32,
2392           .opt_opc = vecop_list,
2393           .vece = MO_32 },
2394         { .fni8 = tcg_gen_umin_i64,
2395           .fniv = tcg_gen_umin_vec,
2396           .fno = gen_helper_gvec_umin64,
2397           .opt_opc = vecop_list,
2398           .vece = MO_64 }
2399     };
2400     tcg_debug_assert(vece <= MO_64);
2401     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2402 }
2403 
2404 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2405                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2406 {
2407     static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2408     static const GVecGen3 g[4] = {
2409         { .fniv = tcg_gen_smax_vec,
2410           .fno = gen_helper_gvec_smax8,
2411           .opt_opc = vecop_list,
2412           .vece = MO_8 },
2413         { .fniv = tcg_gen_smax_vec,
2414           .fno = gen_helper_gvec_smax16,
2415           .opt_opc = vecop_list,
2416           .vece = MO_16 },
2417         { .fni4 = tcg_gen_smax_i32,
2418           .fniv = tcg_gen_smax_vec,
2419           .fno = gen_helper_gvec_smax32,
2420           .opt_opc = vecop_list,
2421           .vece = MO_32 },
2422         { .fni8 = tcg_gen_smax_i64,
2423           .fniv = tcg_gen_smax_vec,
2424           .fno = gen_helper_gvec_smax64,
2425           .opt_opc = vecop_list,
2426           .vece = MO_64 }
2427     };
2428     tcg_debug_assert(vece <= MO_64);
2429     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2430 }
2431 
2432 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2433                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2434 {
2435     static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2436     static const GVecGen3 g[4] = {
2437         { .fniv = tcg_gen_umax_vec,
2438           .fno = gen_helper_gvec_umax8,
2439           .opt_opc = vecop_list,
2440           .vece = MO_8 },
2441         { .fniv = tcg_gen_umax_vec,
2442           .fno = gen_helper_gvec_umax16,
2443           .opt_opc = vecop_list,
2444           .vece = MO_16 },
2445         { .fni4 = tcg_gen_umax_i32,
2446           .fniv = tcg_gen_umax_vec,
2447           .fno = gen_helper_gvec_umax32,
2448           .opt_opc = vecop_list,
2449           .vece = MO_32 },
2450         { .fni8 = tcg_gen_umax_i64,
2451           .fniv = tcg_gen_umax_vec,
2452           .fno = gen_helper_gvec_umax64,
2453           .opt_opc = vecop_list,
2454           .vece = MO_64 }
2455     };
2456     tcg_debug_assert(vece <= MO_64);
2457     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2458 }
2459 
2460 /* Perform a vector negation using normal negation and a mask.
2461    Compare gen_subv_mask above.  */
2462 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2463 {
2464     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2465     TCGv_i64 t3 = tcg_temp_ebb_new_i64();
2466 
2467     tcg_gen_andc_i64(t3, m, b);
2468     tcg_gen_andc_i64(t2, b, m);
2469     tcg_gen_sub_i64(d, m, t2);
2470     tcg_gen_xor_i64(d, d, t3);
2471 
2472     tcg_temp_free_i64(t2);
2473     tcg_temp_free_i64(t3);
2474 }
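/*
 * Illustrative worked example (not in the original source), MO_8 lanes
 * with m = dup(0x80):  b = 0x01 in one byte lane.
 *   t3 = m & ~b = 0x80,  t2 = b & ~m = 0x01
 *   d  = m - t2 = 0x7f;  d ^= t3  ->  0xff, i.e. -0x01 mod 256.
 * As with gen_subv_mask, the per-lane subtraction from 'm' cannot borrow
 * across a lane boundary, and the final xor fixes up each lane's sign bit.
 */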
2475 
2476 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2477 {
2478     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2479     gen_negv_mask(d, b, m);
2480 }
2481 
2482 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2483 {
2484     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2485     gen_negv_mask(d, b, m);
2486 }
2487 
2488 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2489 {
2490     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2491     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2492 
2493     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2494     tcg_gen_neg_i64(t2, b);
2495     tcg_gen_neg_i64(t1, t1);
2496     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2497 
2498     tcg_temp_free_i64(t1);
2499     tcg_temp_free_i64(t2);
2500 }
2501 
2502 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2503                       uint32_t oprsz, uint32_t maxsz)
2504 {
2505     static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2506     static const GVecGen2 g[4] = {
2507         { .fni8 = tcg_gen_vec_neg8_i64,
2508           .fniv = tcg_gen_neg_vec,
2509           .fno = gen_helper_gvec_neg8,
2510           .opt_opc = vecop_list,
2511           .vece = MO_8 },
2512         { .fni8 = tcg_gen_vec_neg16_i64,
2513           .fniv = tcg_gen_neg_vec,
2514           .fno = gen_helper_gvec_neg16,
2515           .opt_opc = vecop_list,
2516           .vece = MO_16 },
2517         { .fni4 = tcg_gen_neg_i32,
2518           .fniv = tcg_gen_neg_vec,
2519           .fno = gen_helper_gvec_neg32,
2520           .opt_opc = vecop_list,
2521           .vece = MO_32 },
2522         { .fni8 = tcg_gen_neg_i64,
2523           .fniv = tcg_gen_neg_vec,
2524           .fno = gen_helper_gvec_neg64,
2525           .opt_opc = vecop_list,
2526           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2527           .vece = MO_64 },
2528     };
2529 
2530     tcg_debug_assert(vece <= MO_64);
2531     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2532 }
2533 
2534 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2535 {
2536     TCGv_i64 t = tcg_temp_ebb_new_i64();
2537     int nbit = 8 << vece;
2538 
2539     /* Create -1 for each negative element.  */
2540     tcg_gen_shri_i64(t, b, nbit - 1);
2541     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2542     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2543 
2544     /*
2545      * Invert (via xor -1) and add one.
2546      * Because of the ordering the msb is cleared,
2547      * so we never have carry into the next element.
2548      */
2549     tcg_gen_xor_i64(d, b, t);
2550     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2551     tcg_gen_add_i64(d, d, t);
2552 
2553     tcg_temp_free_i64(t);
2554 }
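/*
 * Illustrative worked example (not in the original source), MO_8 lanes:
 * b = 0xff (-1) in one byte lane.
 *   t = (b >> 7) & 0x01 = 0x01;  t *= 0xff  ->  0xff
 *       (all-ones for negative lanes, zero otherwise)
 *   d = b ^ t = 0x00;  t &= 0x01;  d += t  ->  0x01 = abs(-1).
 * The xor leaves the most significant bit of a negative lane clear, so
 * the final +1 can never carry into the neighbouring lane.
 */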
2555 
2556 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2557 {
2558     gen_absv_mask(d, b, MO_8);
2559 }
2560 
2561 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2562 {
2563     gen_absv_mask(d, b, MO_16);
2564 }
2565 
2566 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2567                       uint32_t oprsz, uint32_t maxsz)
2568 {
2569     static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2570     static const GVecGen2 g[4] = {
2571         { .fni8 = tcg_gen_vec_abs8_i64,
2572           .fniv = tcg_gen_abs_vec,
2573           .fno = gen_helper_gvec_abs8,
2574           .opt_opc = vecop_list,
2575           .vece = MO_8 },
2576         { .fni8 = tcg_gen_vec_abs16_i64,
2577           .fniv = tcg_gen_abs_vec,
2578           .fno = gen_helper_gvec_abs16,
2579           .opt_opc = vecop_list,
2580           .vece = MO_16 },
2581         { .fni4 = tcg_gen_abs_i32,
2582           .fniv = tcg_gen_abs_vec,
2583           .fno = gen_helper_gvec_abs32,
2584           .opt_opc = vecop_list,
2585           .vece = MO_32 },
2586         { .fni8 = tcg_gen_abs_i64,
2587           .fniv = tcg_gen_abs_vec,
2588           .fno = gen_helper_gvec_abs64,
2589           .opt_opc = vecop_list,
2590           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2591           .vece = MO_64 },
2592     };
2593 
2594     tcg_debug_assert(vece <= MO_64);
2595     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2596 }
2597 
2598 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2599                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2600 {
2601     static const GVecGen3 g = {
2602         .fni8 = tcg_gen_and_i64,
2603         .fniv = tcg_gen_and_vec,
2604         .fno = gen_helper_gvec_and,
2605         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2606     };
2607 
2608     if (aofs == bofs) {
2609         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2610     } else {
2611         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2612     }
2613 }
2614 
2615 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2616                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2617 {
2618     static const GVecGen3 g = {
2619         .fni8 = tcg_gen_or_i64,
2620         .fniv = tcg_gen_or_vec,
2621         .fno = gen_helper_gvec_or,
2622         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2623     };
2624 
2625     if (aofs == bofs) {
2626         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2627     } else {
2628         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2629     }
2630 }
2631 
2632 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2633                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2634 {
2635     static const GVecGen3 g = {
2636         .fni8 = tcg_gen_xor_i64,
2637         .fniv = tcg_gen_xor_vec,
2638         .fno = gen_helper_gvec_xor,
2639         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2640     };
2641 
2642     if (aofs == bofs) {
2643         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2644     } else {
2645         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2646     }
2647 }
2648 
2649 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2650                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2651 {
2652     static const GVecGen3 g = {
2653         .fni8 = tcg_gen_andc_i64,
2654         .fniv = tcg_gen_andc_vec,
2655         .fno = gen_helper_gvec_andc,
2656         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2657     };
2658 
2659     if (aofs == bofs) {
2660         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2661     } else {
2662         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2663     }
2664 }
2665 
2666 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2667                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2668 {
2669     static const GVecGen3 g = {
2670         .fni8 = tcg_gen_orc_i64,
2671         .fniv = tcg_gen_orc_vec,
2672         .fno = gen_helper_gvec_orc,
2673         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2674     };
2675 
2676     if (aofs == bofs) {
2677         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2678     } else {
2679         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2680     }
2681 }
2682 
2683 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2684                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2685 {
2686     static const GVecGen3 g = {
2687         .fni8 = tcg_gen_nand_i64,
2688         .fniv = tcg_gen_nand_vec,
2689         .fno = gen_helper_gvec_nand,
2690         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2691     };
2692 
2693     if (aofs == bofs) {
2694         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2695     } else {
2696         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2697     }
2698 }
2699 
2700 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2701                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2702 {
2703     static const GVecGen3 g = {
2704         .fni8 = tcg_gen_nor_i64,
2705         .fniv = tcg_gen_nor_vec,
2706         .fno = gen_helper_gvec_nor,
2707         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2708     };
2709 
2710     if (aofs == bofs) {
2711         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2712     } else {
2713         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2714     }
2715 }
2716 
2717 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2718                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2719 {
2720     static const GVecGen3 g = {
2721         .fni8 = tcg_gen_eqv_i64,
2722         .fniv = tcg_gen_eqv_vec,
2723         .fno = gen_helper_gvec_eqv,
2724         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2725     };
2726 
2727     if (aofs == bofs) {
2728         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2729     } else {
2730         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2731     }
2732 }
2733 
2734 static const GVecGen2s gop_ands = {
2735     .fni8 = tcg_gen_and_i64,
2736     .fniv = tcg_gen_and_vec,
2737     .fno = gen_helper_gvec_ands,
2738     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2739     .vece = MO_64
2740 };
2741 
2742 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2743                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2744 {
2745     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2746     tcg_gen_dup_i64(vece, tmp, c);
2747     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2748     tcg_temp_free_i64(tmp);
2749 }
2750 
2751 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2752                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2753 {
2754     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2755     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2756 }
2757 
2758 void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
2759                         TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2760 {
2761     static GVecGen2s g = {
2762         .fni8 = tcg_gen_andc_i64,
2763         .fniv = tcg_gen_andc_vec,
2764         .fno = gen_helper_gvec_andcs,
2765         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2766         .vece = MO_64
2767     };
2768 
2769     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2770     tcg_gen_dup_i64(vece, tmp, c);
2771     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &g);
2772     tcg_temp_free_i64(tmp);
2773 }
2774 
2775 static const GVecGen2s gop_xors = {
2776     .fni8 = tcg_gen_xor_i64,
2777     .fniv = tcg_gen_xor_vec,
2778     .fno = gen_helper_gvec_xors,
2779     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2780     .vece = MO_64
2781 };
2782 
2783 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2784                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2785 {
2786     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2787     tcg_gen_dup_i64(vece, tmp, c);
2788     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2789     tcg_temp_free_i64(tmp);
2790 }
2791 
2792 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2793                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2794 {
2795     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2796     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2797 }
2798 
2799 static const GVecGen2s gop_ors = {
2800     .fni8 = tcg_gen_or_i64,
2801     .fniv = tcg_gen_or_vec,
2802     .fno = gen_helper_gvec_ors,
2803     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2804     .vece = MO_64
2805 };
2806 
2807 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2808                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2809 {
2810     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2811     tcg_gen_dup_i64(vece, tmp, c);
2812     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2813     tcg_temp_free_i64(tmp);
2814 }
2815 
2816 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2817                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2818 {
2819     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2820     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2821 }
2822 
2823 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2824 {
2825     uint64_t mask = dup_const(MO_8, 0xff << c);
2826     tcg_gen_shli_i64(d, a, c);
2827     tcg_gen_andi_i64(d, d, mask);
2828 }
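/*
 * This and the following constant-shift helpers share one idea: shift
 * the whole 64-bit (or 32-bit) register, then mask away the bits that
 * were shifted in from the neighbouring lane.  E.g. an MO_8 left shift
 * by 1 uses the mask dup(0xfe).
 */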
2829 
2830 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2831 {
2832     uint64_t mask = dup_const(MO_16, 0xffff << c);
2833     tcg_gen_shli_i64(d, a, c);
2834     tcg_gen_andi_i64(d, d, mask);
2835 }
2836 
2837 void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2838 {
2839     uint32_t mask = dup_const(MO_8, 0xff << c);
2840     tcg_gen_shli_i32(d, a, c);
2841     tcg_gen_andi_i32(d, d, mask);
2842 }
2843 
2844 void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2845 {
2846     uint32_t mask = dup_const(MO_16, 0xffff << c);
2847     tcg_gen_shli_i32(d, a, c);
2848     tcg_gen_andi_i32(d, d, mask);
2849 }
2850 
2851 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2852                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2853 {
2854     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2855     static const GVecGen2i g[4] = {
2856         { .fni8 = tcg_gen_vec_shl8i_i64,
2857           .fniv = tcg_gen_shli_vec,
2858           .fno = gen_helper_gvec_shl8i,
2859           .opt_opc = vecop_list,
2860           .vece = MO_8 },
2861         { .fni8 = tcg_gen_vec_shl16i_i64,
2862           .fniv = tcg_gen_shli_vec,
2863           .fno = gen_helper_gvec_shl16i,
2864           .opt_opc = vecop_list,
2865           .vece = MO_16 },
2866         { .fni4 = tcg_gen_shli_i32,
2867           .fniv = tcg_gen_shli_vec,
2868           .fno = gen_helper_gvec_shl32i,
2869           .opt_opc = vecop_list,
2870           .vece = MO_32 },
2871         { .fni8 = tcg_gen_shli_i64,
2872           .fniv = tcg_gen_shli_vec,
2873           .fno = gen_helper_gvec_shl64i,
2874           .opt_opc = vecop_list,
2875           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2876           .vece = MO_64 },
2877     };
2878 
2879     tcg_debug_assert(vece <= MO_64);
2880     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2881     if (shift == 0) {
2882         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2883     } else {
2884         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2885     }
2886 }
2887 
2888 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2889 {
2890     uint64_t mask = dup_const(MO_8, 0xff >> c);
2891     tcg_gen_shri_i64(d, a, c);
2892     tcg_gen_andi_i64(d, d, mask);
2893 }
2894 
2895 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2896 {
2897     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2898     tcg_gen_shri_i64(d, a, c);
2899     tcg_gen_andi_i64(d, d, mask);
2900 }
2901 
2902 void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2903 {
2904     uint32_t mask = dup_const(MO_8, 0xff >> c);
2905     tcg_gen_shri_i32(d, a, c);
2906     tcg_gen_andi_i32(d, d, mask);
2907 }
2908 
2909 void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2910 {
2911     uint32_t mask = dup_const(MO_16, 0xffff >> c);
2912     tcg_gen_shri_i32(d, a, c);
2913     tcg_gen_andi_i32(d, d, mask);
2914 }
2915 
2916 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2917                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2918 {
2919     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2920     static const GVecGen2i g[4] = {
2921         { .fni8 = tcg_gen_vec_shr8i_i64,
2922           .fniv = tcg_gen_shri_vec,
2923           .fno = gen_helper_gvec_shr8i,
2924           .opt_opc = vecop_list,
2925           .vece = MO_8 },
2926         { .fni8 = tcg_gen_vec_shr16i_i64,
2927           .fniv = tcg_gen_shri_vec,
2928           .fno = gen_helper_gvec_shr16i,
2929           .opt_opc = vecop_list,
2930           .vece = MO_16 },
2931         { .fni4 = tcg_gen_shri_i32,
2932           .fniv = tcg_gen_shri_vec,
2933           .fno = gen_helper_gvec_shr32i,
2934           .opt_opc = vecop_list,
2935           .vece = MO_32 },
2936         { .fni8 = tcg_gen_shri_i64,
2937           .fniv = tcg_gen_shri_vec,
2938           .fno = gen_helper_gvec_shr64i,
2939           .opt_opc = vecop_list,
2940           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2941           .vece = MO_64 },
2942     };
2943 
2944     tcg_debug_assert(vece <= MO_64);
2945     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2946     if (shift == 0) {
2947         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2948     } else {
2949         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2950     }
2951 }
2952 
2953 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2954 {
2955     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2956     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2957     TCGv_i64 s = tcg_temp_ebb_new_i64();
2958 
2959     tcg_gen_shri_i64(d, a, c);
2960     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2961     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2962     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2963     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2964     tcg_temp_free_i64(s);
2965 }
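/*
 * Worked example (not in the original source) for the multiply trick
 * above, MO_8 lanes with c = 2 and a lane value of 0x84 (-124):
 *   d = lane >> 2            = 0x21  (plus bits spilled in from the lane
 *                                     above, cleared below by c_mask)
 *   s = d & (0x80 >> 2)      = 0x20  isolated, shifted sign bit
 *   s *= (2 << 2) - 2 = 6    -> 0xc0 sign bit replicated into the top
 *                                     c bit positions of the lane
 *   d = (d & 0x3f) | s       = 0xe1, i.e. 0x84 >> 2 arithmetically.
 * The product never exceeds eight bits, so the multiply cannot spill
 * into the neighbouring lane.
 */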
2966 
2967 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2968 {
2969     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2970     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2971     TCGv_i64 s = tcg_temp_ebb_new_i64();
2972 
2973     tcg_gen_shri_i64(d, a, c);
2974     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2975     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2976     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2977     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2978     tcg_temp_free_i64(s);
2979 }
2980 
2981 void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2982 {
2983     uint32_t s_mask = dup_const(MO_8, 0x80 >> c);
2984     uint32_t c_mask = dup_const(MO_8, 0xff >> c);
2985     TCGv_i32 s = tcg_temp_ebb_new_i32();
2986 
2987     tcg_gen_shri_i32(d, a, c);
2988     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
2989     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
2990     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
2991     tcg_gen_or_i32(d, d, s);         /* include sign extension */
2992     tcg_temp_free_i32(s);
2993 }
2994 
2995 void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2996 {
2997     uint32_t s_mask = dup_const(MO_16, 0x8000 >> c);
2998     uint32_t c_mask = dup_const(MO_16, 0xffff >> c);
2999     TCGv_i32 s = tcg_temp_ebb_new_i32();
3000 
3001     tcg_gen_shri_i32(d, a, c);
3002     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
3003     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
3004     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
3005     tcg_gen_or_i32(d, d, s);         /* include sign extension */
3006     tcg_temp_free_i32(s);
3007 }
3008 
3009 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
3010                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
3011 {
3012     static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
3013     static const GVecGen2i g[4] = {
3014         { .fni8 = tcg_gen_vec_sar8i_i64,
3015           .fniv = tcg_gen_sari_vec,
3016           .fno = gen_helper_gvec_sar8i,
3017           .opt_opc = vecop_list,
3018           .vece = MO_8 },
3019         { .fni8 = tcg_gen_vec_sar16i_i64,
3020           .fniv = tcg_gen_sari_vec,
3021           .fno = gen_helper_gvec_sar16i,
3022           .opt_opc = vecop_list,
3023           .vece = MO_16 },
3024         { .fni4 = tcg_gen_sari_i32,
3025           .fniv = tcg_gen_sari_vec,
3026           .fno = gen_helper_gvec_sar32i,
3027           .opt_opc = vecop_list,
3028           .vece = MO_32 },
3029         { .fni8 = tcg_gen_sari_i64,
3030           .fniv = tcg_gen_sari_vec,
3031           .fno = gen_helper_gvec_sar64i,
3032           .opt_opc = vecop_list,
3033           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3034           .vece = MO_64 },
3035     };
3036 
3037     tcg_debug_assert(vece <= MO_64);
3038     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3039     if (shift == 0) {
3040         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3041     } else {
3042         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3043     }
3044 }
3045 
3046 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3047 {
3048     uint64_t mask = dup_const(MO_8, 0xff << c);
3049 
3050     tcg_gen_shli_i64(d, a, c);
3051     tcg_gen_shri_i64(a, a, 8 - c);
3052     tcg_gen_andi_i64(d, d, mask);
3053     tcg_gen_andi_i64(a, a, ~mask);
3054     tcg_gen_or_i64(d, d, a);
3055 }
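/*
 * Per-lane rotate by a constant: a masked left shift of each lane is
 * combined with a masked right shift by (lane width - c).  Note that
 * this helper also overwrites 'a' with the intermediate right-shift
 * result.
 */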
3056 
3057 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3058 {
3059     uint64_t mask = dup_const(MO_16, 0xffff << c);
3060 
3061     tcg_gen_shli_i64(d, a, c);
3062     tcg_gen_shri_i64(a, a, 16 - c);
3063     tcg_gen_andi_i64(d, d, mask);
3064     tcg_gen_andi_i64(a, a, ~mask);
3065     tcg_gen_or_i64(d, d, a);
3066 }
3067 
3068 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
3069                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
3070 {
3071     static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
3072     static const GVecGen2i g[4] = {
3073         { .fni8 = tcg_gen_vec_rotl8i_i64,
3074           .fniv = tcg_gen_rotli_vec,
3075           .fno = gen_helper_gvec_rotl8i,
3076           .opt_opc = vecop_list,
3077           .vece = MO_8 },
3078         { .fni8 = tcg_gen_vec_rotl16i_i64,
3079           .fniv = tcg_gen_rotli_vec,
3080           .fno = gen_helper_gvec_rotl16i,
3081           .opt_opc = vecop_list,
3082           .vece = MO_16 },
3083         { .fni4 = tcg_gen_rotli_i32,
3084           .fniv = tcg_gen_rotli_vec,
3085           .fno = gen_helper_gvec_rotl32i,
3086           .opt_opc = vecop_list,
3087           .vece = MO_32 },
3088         { .fni8 = tcg_gen_rotli_i64,
3089           .fniv = tcg_gen_rotli_vec,
3090           .fno = gen_helper_gvec_rotl64i,
3091           .opt_opc = vecop_list,
3092           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3093           .vece = MO_64 },
3094     };
3095 
3096     tcg_debug_assert(vece <= MO_64);
3097     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3098     if (shift == 0) {
3099         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3100     } else {
3101         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3102     }
3103 }
3104 
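/*
 * Rotate right by an immediate: implemented as a rotate left by the
 * complementary count.  For example (offsets hypothetical), rotating
 * 16-bit lanes right by 3 expands as
 * tcg_gen_gvec_rotli(MO_16, dofs, aofs, 13, oprsz, maxsz).
 */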
3105 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
3106                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
3107 {
3108     tcg_debug_assert(vece <= MO_64);
3109     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3110     tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
3111                        oprsz, maxsz);
3112 }
3113 
3114 /*
3115  * Specialized generation of vector shifts by a non-constant scalar.
3116  */
3117 
3118 typedef struct {
3119     void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
3120     void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
3121     void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
3122     void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
3123     gen_helper_gvec_2 *fno[4];
3124     TCGOpcode s_list[2];
3125     TCGOpcode v_list[2];
3126 } GVecGen2sh;
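
/*
 * Expansion order used by do_gvec_shifts below: prefer the backend's
 * shift-by-scalar ops (s_list/fniv_s), then shift-by-vector ops
 * (v_list/fniv_v) with the scalar broadcast to a vector, then the
 * integral fni4/fni8 expansion, and finally the out-of-line helpers
 * in fno, which receive the shift count in the descriptor data field.
 */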
3127 
3128 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3129                            uint32_t oprsz, uint32_t tysz, TCGType type,
3130                            TCGv_i32 shift,
3131                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
3132 {
3133     for (uint32_t i = 0; i < oprsz; i += tysz) {
3134         TCGv_vec t0 = tcg_temp_new_vec(type);
3135         TCGv_vec t1 = tcg_temp_new_vec(type);
3136 
3137         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
3138         fni(vece, t1, t0, shift);
3139         tcg_gen_st_vec(t1, tcg_env, dofs + i);
3140     }
3141 }
3142 
3143 static void
3144 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
3145                uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
3146 {
3147     TCGType type;
3148     uint32_t some;
3149 
3150     check_size_align(oprsz, maxsz, dofs | aofs);
3151     check_overlap_2(dofs, aofs, maxsz);
3152 
3153     /* If the backend has a shift-by-scalar expansion, use it.  */
3154     type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
3155     if (type) {
3156         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3157         switch (type) {
3158         case TCG_TYPE_V256:
3159             some = QEMU_ALIGN_DOWN(oprsz, 32);
3160             expand_2sh_vec(vece, dofs, aofs, some, 32,
3161                            TCG_TYPE_V256, shift, g->fniv_s);
3162             if (some == oprsz) {
3163                 break;
3164             }
3165             dofs += some;
3166             aofs += some;
3167             oprsz -= some;
3168             maxsz -= some;
3169             /* fallthru */
3170         case TCG_TYPE_V128:
3171             expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
3172                            TCG_TYPE_V128, shift, g->fniv_s);
3173             break;
3174         case TCG_TYPE_V64:
3175             expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
3176                            TCG_TYPE_V64, shift, g->fniv_s);
3177             break;
3178         default:
3179             g_assert_not_reached();
3180         }
3181         tcg_swap_vecop_list(hold_list);
3182         goto clear_tail;
3183     }
3184 
3185     /* Otherwise, if the backend supports shift-by-vector, use that with the scalar broadcast.  */
3186     type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
3187     if (type) {
3188         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3189         TCGv_vec v_shift = tcg_temp_new_vec(type);
3190 
3191         if (vece == MO_64) {
3192             TCGv_i64 sh64 = tcg_temp_ebb_new_i64();
3193             tcg_gen_extu_i32_i64(sh64, shift);
3194             tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
3195             tcg_temp_free_i64(sh64);
3196         } else {
3197             tcg_gen_dup_i32_vec(vece, v_shift, shift);
3198         }
3199 
3200         switch (type) {
3201         case TCG_TYPE_V256:
3202             some = QEMU_ALIGN_DOWN(oprsz, 32);
3203             expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
3204                           v_shift, false, g->fniv_v);
3205             if (some == oprsz) {
3206                 break;
3207             }
3208             dofs += some;
3209             aofs += some;
3210             oprsz -= some;
3211             maxsz -= some;
3212             /* fallthru */
3213         case TCG_TYPE_V128:
3214             expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
3215                           v_shift, false, g->fniv_v);
3216             break;
3217         case TCG_TYPE_V64:
3218             expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
3219                           v_shift, false, g->fniv_v);
3220             break;
3221         default:
3222             g_assert_not_reached();
3223         }
3224         tcg_temp_free_vec(v_shift);
3225         tcg_swap_vecop_list(hold_list);
3226         goto clear_tail;
3227     }
3228 
3229     /* Otherwise fall back to an integral expansion or the out-of-line helper.  */
3230     if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3231         expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
3232     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3233         TCGv_i64 sh64 = tcg_temp_ebb_new_i64();
3234         tcg_gen_extu_i32_i64(sh64, shift);
3235         expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
3236         tcg_temp_free_i64(sh64);
3237     } else {
3238         TCGv_ptr a0 = tcg_temp_ebb_new_ptr();
3239         TCGv_ptr a1 = tcg_temp_ebb_new_ptr();
3240         TCGv_i32 desc = tcg_temp_ebb_new_i32();
3241 
3242         tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
3243         tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
3244         tcg_gen_addi_ptr(a0, tcg_env, dofs);
3245         tcg_gen_addi_ptr(a1, tcg_env, aofs);
3246 
3247         g->fno[vece](a0, a1, desc);
3248 
3249         tcg_temp_free_ptr(a0);
3250         tcg_temp_free_ptr(a1);
3251         tcg_temp_free_i32(desc);
3252         return;
3253     }
3254 
3255  clear_tail:
3256     if (oprsz < maxsz) {
3257         expand_clr(dofs + oprsz, maxsz - oprsz);
3258     }
3259 }
3260 
3261 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
3262                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3263 {
3264     static const GVecGen2sh g = {
3265         .fni4 = tcg_gen_shl_i32,
3266         .fni8 = tcg_gen_shl_i64,
3267         .fniv_s = tcg_gen_shls_vec,
3268         .fniv_v = tcg_gen_shlv_vec,
3269         .fno = {
3270             gen_helper_gvec_shl8i,
3271             gen_helper_gvec_shl16i,
3272             gen_helper_gvec_shl32i,
3273             gen_helper_gvec_shl64i,
3274         },
3275         .s_list = { INDEX_op_shls_vec, 0 },
3276         .v_list = { INDEX_op_shlv_vec, 0 },
3277     };
3278 
3279     tcg_debug_assert(vece <= MO_64);
3280     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3281 }
3282 
3283 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3284                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3285 {
3286     static const GVecGen2sh g = {
3287         .fni4 = tcg_gen_shr_i32,
3288         .fni8 = tcg_gen_shr_i64,
3289         .fniv_s = tcg_gen_shrs_vec,
3290         .fniv_v = tcg_gen_shrv_vec,
3291         .fno = {
3292             gen_helper_gvec_shr8i,
3293             gen_helper_gvec_shr16i,
3294             gen_helper_gvec_shr32i,
3295             gen_helper_gvec_shr64i,
3296         },
3297         .s_list = { INDEX_op_shrs_vec, 0 },
3298         .v_list = { INDEX_op_shrv_vec, 0 },
3299     };
3300 
3301     tcg_debug_assert(vece <= MO_64);
3302     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3303 }
3304 
3305 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
3306                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3307 {
3308     static const GVecGen2sh g = {
3309         .fni4 = tcg_gen_sar_i32,
3310         .fni8 = tcg_gen_sar_i64,
3311         .fniv_s = tcg_gen_sars_vec,
3312         .fniv_v = tcg_gen_sarv_vec,
3313         .fno = {
3314             gen_helper_gvec_sar8i,
3315             gen_helper_gvec_sar16i,
3316             gen_helper_gvec_sar32i,
3317             gen_helper_gvec_sar64i,
3318         },
3319         .s_list = { INDEX_op_sars_vec, 0 },
3320         .v_list = { INDEX_op_sarv_vec, 0 },
3321     };
3322 
3323     tcg_debug_assert(vece <= MO_64);
3324     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3325 }
3326 
3327 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
3328                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3329 {
3330     static const GVecGen2sh g = {
3331         .fni4 = tcg_gen_rotl_i32,
3332         .fni8 = tcg_gen_rotl_i64,
3333         .fniv_s = tcg_gen_rotls_vec,
3334         .fniv_v = tcg_gen_rotlv_vec,
3335         .fno = {
3336             gen_helper_gvec_rotl8i,
3337             gen_helper_gvec_rotl16i,
3338             gen_helper_gvec_rotl32i,
3339             gen_helper_gvec_rotl64i,
3340         },
3341         .s_list = { INDEX_op_rotls_vec, 0 },
3342         .v_list = { INDEX_op_rotlv_vec, 0 },
3343     };
3344 
3345     tcg_debug_assert(vece <= MO_64);
3346     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3347 }
3348 
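/* Rotate right by a scalar: emitted as a rotate left by (-shift) mod element bits.  */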
3349 void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3350                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3351 {
3352     TCGv_i32 tmp = tcg_temp_ebb_new_i32();
3353 
3354     tcg_gen_neg_i32(tmp, shift);
3355     tcg_gen_andi_i32(tmp, tmp, (8 << vece) - 1);
3356     tcg_gen_gvec_rotls(vece, dofs, aofs, tmp, oprsz, maxsz);
3357     tcg_temp_free_i32(tmp);
3358 }
3359 
3360 /*
3361  * Expand D = A << (B % element bits)
3362  *
3363  * Unlike scalar shifts, where the target front end can easily fold
3364  * the modulo into its own expansion, here we apply it explicitly.
3365  * If the target naturally includes the modulo as part of the
3366  * operation, great!  If the target has some other behaviour for
3367  * out-of-range shifts, then it could not use this function anyway,
3368  * and would need to do its own expansion with custom functions.
3369  */
3370 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
3371                                  TCGv_vec a, TCGv_vec b)
3372 {
3373     TCGv_vec t = tcg_temp_new_vec_matching(d);
3374     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3375 
3376     tcg_gen_and_vec(vece, t, b, m);
3377     tcg_gen_shlv_vec(vece, d, a, t);
3378     tcg_temp_free_vec(t);
3379 }
3380 
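/* The i32/i64 fallbacks apply the same per-element modulo to the count.  */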
3381 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3382 {
3383     TCGv_i32 t = tcg_temp_ebb_new_i32();
3384 
3385     tcg_gen_andi_i32(t, b, 31);
3386     tcg_gen_shl_i32(d, a, t);
3387     tcg_temp_free_i32(t);
3388 }
3389 
3390 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3391 {
3392     TCGv_i64 t = tcg_temp_ebb_new_i64();
3393 
3394     tcg_gen_andi_i64(t, b, 63);
3395     tcg_gen_shl_i64(d, a, t);
3396     tcg_temp_free_i64(t);
3397 }
3398 
3399 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3400                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3401 {
3402     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
3403     static const GVecGen3 g[4] = {
3404         { .fniv = tcg_gen_shlv_mod_vec,
3405           .fno = gen_helper_gvec_shl8v,
3406           .opt_opc = vecop_list,
3407           .vece = MO_8 },
3408         { .fniv = tcg_gen_shlv_mod_vec,
3409           .fno = gen_helper_gvec_shl16v,
3410           .opt_opc = vecop_list,
3411           .vece = MO_16 },
3412         { .fni4 = tcg_gen_shl_mod_i32,
3413           .fniv = tcg_gen_shlv_mod_vec,
3414           .fno = gen_helper_gvec_shl32v,
3415           .opt_opc = vecop_list,
3416           .vece = MO_32 },
3417         { .fni8 = tcg_gen_shl_mod_i64,
3418           .fniv = tcg_gen_shlv_mod_vec,
3419           .fno = gen_helper_gvec_shl64v,
3420           .opt_opc = vecop_list,
3421           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3422           .vece = MO_64 },
3423     };
3424 
3425     tcg_debug_assert(vece <= MO_64);
3426     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3427 }
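
/*
 * Usage sketch (offsets and sizes hypothetical): a guest instruction
 * that shifts each 32-bit lane of A left by the count held in the
 * corresponding lane of B could be emitted as
 * tcg_gen_gvec_shlv(MO_32, dofs, aofs, bofs, oprsz, maxsz);
 * counts are reduced modulo 32 as described above.
 */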
3428 
3429 /*
3430  * Similarly for logical right shifts.
3431  */
3432 
3433 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
3434                                  TCGv_vec a, TCGv_vec b)
3435 {
3436     TCGv_vec t = tcg_temp_new_vec_matching(d);
3437     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3438 
3439     tcg_gen_and_vec(vece, t, b, m);
3440     tcg_gen_shrv_vec(vece, d, a, t);
3441     tcg_temp_free_vec(t);
3442 }
3443 
3444 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3445 {
3446     TCGv_i32 t = tcg_temp_ebb_new_i32();
3447 
3448     tcg_gen_andi_i32(t, b, 31);
3449     tcg_gen_shr_i32(d, a, t);
3450     tcg_temp_free_i32(t);
3451 }
3452 
3453 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3454 {
3455     TCGv_i64 t = tcg_temp_ebb_new_i64();
3456 
3457     tcg_gen_andi_i64(t, b, 63);
3458     tcg_gen_shr_i64(d, a, t);
3459     tcg_temp_free_i64(t);
3460 }
3461 
3462 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3463                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3464 {
3465     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
3466     static const GVecGen3 g[4] = {
3467         { .fniv = tcg_gen_shrv_mod_vec,
3468           .fno = gen_helper_gvec_shr8v,
3469           .opt_opc = vecop_list,
3470           .vece = MO_8 },
3471         { .fniv = tcg_gen_shrv_mod_vec,
3472           .fno = gen_helper_gvec_shr16v,
3473           .opt_opc = vecop_list,
3474           .vece = MO_16 },
3475         { .fni4 = tcg_gen_shr_mod_i32,
3476           .fniv = tcg_gen_shrv_mod_vec,
3477           .fno = gen_helper_gvec_shr32v,
3478           .opt_opc = vecop_list,
3479           .vece = MO_32 },
3480         { .fni8 = tcg_gen_shr_mod_i64,
3481           .fniv = tcg_gen_shrv_mod_vec,
3482           .fno = gen_helper_gvec_shr64v,
3483           .opt_opc = vecop_list,
3484           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3485           .vece = MO_64 },
3486     };
3487 
3488     tcg_debug_assert(vece <= MO_64);
3489     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3490 }
3491 
3492 /*
3493  * Similarly for arithmetic right shifts.
3494  */
3495 
3496 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3497                                  TCGv_vec a, TCGv_vec b)
3498 {
3499     TCGv_vec t = tcg_temp_new_vec_matching(d);
3500     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3501 
3502     tcg_gen_and_vec(vece, t, b, m);
3503     tcg_gen_sarv_vec(vece, d, a, t);
3504     tcg_temp_free_vec(t);
3505 }
3506 
3507 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3508 {
3509     TCGv_i32 t = tcg_temp_ebb_new_i32();
3510 
3511     tcg_gen_andi_i32(t, b, 31);
3512     tcg_gen_sar_i32(d, a, t);
3513     tcg_temp_free_i32(t);
3514 }
3515 
3516 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3517 {
3518     TCGv_i64 t = tcg_temp_ebb_new_i64();
3519 
3520     tcg_gen_andi_i64(t, b, 63);
3521     tcg_gen_sar_i64(d, a, t);
3522     tcg_temp_free_i64(t);
3523 }
3524 
3525 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3526                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3527 {
3528     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3529     static const GVecGen3 g[4] = {
3530         { .fniv = tcg_gen_sarv_mod_vec,
3531           .fno = gen_helper_gvec_sar8v,
3532           .opt_opc = vecop_list,
3533           .vece = MO_8 },
3534         { .fniv = tcg_gen_sarv_mod_vec,
3535           .fno = gen_helper_gvec_sar16v,
3536           .opt_opc = vecop_list,
3537           .vece = MO_16 },
3538         { .fni4 = tcg_gen_sar_mod_i32,
3539           .fniv = tcg_gen_sarv_mod_vec,
3540           .fno = gen_helper_gvec_sar32v,
3541           .opt_opc = vecop_list,
3542           .vece = MO_32 },
3543         { .fni8 = tcg_gen_sar_mod_i64,
3544           .fniv = tcg_gen_sarv_mod_vec,
3545           .fno = gen_helper_gvec_sar64v,
3546           .opt_opc = vecop_list,
3547           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3548           .vece = MO_64 },
3549     };
3550 
3551     tcg_debug_assert(vece <= MO_64);
3552     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3553 }
3554 
3555 /*
3556  * Similarly for rotates.
3557  */
3558 
3559 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
3560                                   TCGv_vec a, TCGv_vec b)
3561 {
3562     TCGv_vec t = tcg_temp_new_vec_matching(d);
3563     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3564 
3565     tcg_gen_and_vec(vece, t, b, m);
3566     tcg_gen_rotlv_vec(vece, d, a, t);
3567     tcg_temp_free_vec(t);
3568 }
3569 
3570 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3571 {
3572     TCGv_i32 t = tcg_temp_ebb_new_i32();
3573 
3574     tcg_gen_andi_i32(t, b, 31);
3575     tcg_gen_rotl_i32(d, a, t);
3576     tcg_temp_free_i32(t);
3577 }
3578 
3579 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3580 {
3581     TCGv_i64 t = tcg_temp_ebb_new_i64();
3582 
3583     tcg_gen_andi_i64(t, b, 63);
3584     tcg_gen_rotl_i64(d, a, t);
3585     tcg_temp_free_i64(t);
3586 }
3587 
3588 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3589                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3590 {
3591     static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
3592     static const GVecGen3 g[4] = {
3593         { .fniv = tcg_gen_rotlv_mod_vec,
3594           .fno = gen_helper_gvec_rotl8v,
3595           .opt_opc = vecop_list,
3596           .vece = MO_8 },
3597         { .fniv = tcg_gen_rotlv_mod_vec,
3598           .fno = gen_helper_gvec_rotl16v,
3599           .opt_opc = vecop_list,
3600           .vece = MO_16 },
3601         { .fni4 = tcg_gen_rotl_mod_i32,
3602           .fniv = tcg_gen_rotlv_mod_vec,
3603           .fno = gen_helper_gvec_rotl32v,
3604           .opt_opc = vecop_list,
3605           .vece = MO_32 },
3606         { .fni8 = tcg_gen_rotl_mod_i64,
3607           .fniv = tcg_gen_rotlv_mod_vec,
3608           .fno = gen_helper_gvec_rotl64v,
3609           .opt_opc = vecop_list,
3610           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3611           .vece = MO_64 },
3612     };
3613 
3614     tcg_debug_assert(vece <= MO_64);
3615     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3616 }
3617 
3618 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
3619                                   TCGv_vec a, TCGv_vec b)
3620 {
3621     TCGv_vec t = tcg_temp_new_vec_matching(d);
3622     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3623 
3624     tcg_gen_and_vec(vece, t, b, m);
3625     tcg_gen_rotrv_vec(vece, d, a, t);
3626     tcg_temp_free_vec(t);
3627 }
3628 
3629 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3630 {
3631     TCGv_i32 t = tcg_temp_ebb_new_i32();
3632 
3633     tcg_gen_andi_i32(t, b, 31);
3634     tcg_gen_rotr_i32(d, a, t);
3635     tcg_temp_free_i32(t);
3636 }
3637 
3638 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3639 {
3640     TCGv_i64 t = tcg_temp_ebb_new_i64();
3641 
3642     tcg_gen_andi_i64(t, b, 63);
3643     tcg_gen_rotr_i64(d, a, t);
3644     tcg_temp_free_i64(t);
3645 }
3646 
3647 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3648                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3649 {
3650     static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
3651     static const GVecGen3 g[4] = {
3652         { .fniv = tcg_gen_rotrv_mod_vec,
3653           .fno = gen_helper_gvec_rotr8v,
3654           .opt_opc = vecop_list,
3655           .vece = MO_8 },
3656         { .fniv = tcg_gen_rotrv_mod_vec,
3657           .fno = gen_helper_gvec_rotr16v,
3658           .opt_opc = vecop_list,
3659           .vece = MO_16 },
3660         { .fni4 = tcg_gen_rotr_mod_i32,
3661           .fniv = tcg_gen_rotrv_mod_vec,
3662           .fno = gen_helper_gvec_rotr32v,
3663           .opt_opc = vecop_list,
3664           .vece = MO_32 },
3665         { .fni8 = tcg_gen_rotr_mod_i64,
3666           .fniv = tcg_gen_rotrv_mod_vec,
3667           .fno = gen_helper_gvec_rotr64v,
3668           .opt_opc = vecop_list,
3669           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3670           .vece = MO_64 },
3671     };
3672 
3673     tcg_debug_assert(vece <= MO_64);
3674     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3675 }
3676 
3677 /* Expand OPRSZ bytes worth of three-operand comparisons using i32 elements.  */
3678 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3679                            uint32_t oprsz, TCGCond cond)
3680 {
3681     TCGv_i32 t0 = tcg_temp_ebb_new_i32();
3682     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
3683     uint32_t i;
3684 
3685     for (i = 0; i < oprsz; i += 4) {
3686         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
3687         tcg_gen_ld_i32(t1, tcg_env, bofs + i);
3688         tcg_gen_negsetcond_i32(cond, t0, t0, t1);
3689         tcg_gen_st_i32(t0, tcg_env, dofs + i);
3690     }
3691     tcg_temp_free_i32(t1);
3692     tcg_temp_free_i32(t0);
3693 }
3694 
3695 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3696                            uint32_t oprsz, TCGCond cond)
3697 {
3698     TCGv_i64 t0 = tcg_temp_ebb_new_i64();
3699     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
3700     uint32_t i;
3701 
3702     for (i = 0; i < oprsz; i += 8) {
3703         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
3704         tcg_gen_ld_i64(t1, tcg_env, bofs + i);
3705         tcg_gen_negsetcond_i64(cond, t0, t0, t1);
3706         tcg_gen_st_i64(t0, tcg_env, dofs + i);
3707     }
3708     tcg_temp_free_i64(t1);
3709     tcg_temp_free_i64(t0);
3710 }
3711 
3712 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3713                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3714                            TCGType type, TCGCond cond)
3715 {
3716     for (uint32_t i = 0; i < oprsz; i += tysz) {
3717         TCGv_vec t0 = tcg_temp_new_vec(type);
3718         TCGv_vec t1 = tcg_temp_new_vec(type);
3719         TCGv_vec t2 = tcg_temp_new_vec(type);
3720 
3721         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
3722         tcg_gen_ld_vec(t1, tcg_env, bofs + i);
3723         tcg_gen_cmp_vec(cond, vece, t2, t0, t1);
3724         tcg_gen_st_vec(t2, tcg_env, dofs + i);
3725     }
3726 }
3727 
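/*
 * Expand a vector comparison: each element of D is set to all ones
 * when COND holds for the corresponding elements of A and B, and to
 * all zeros otherwise (the negsetcond/cmp_vec convention used above).
 */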
3728 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3729                       uint32_t aofs, uint32_t bofs,
3730                       uint32_t oprsz, uint32_t maxsz)
3731 {
3732     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3733     static gen_helper_gvec_3 * const eq_fn[4] = {
3734         gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3735         gen_helper_gvec_eq32, gen_helper_gvec_eq64
3736     };
3737     static gen_helper_gvec_3 * const ne_fn[4] = {
3738         gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3739         gen_helper_gvec_ne32, gen_helper_gvec_ne64
3740     };
3741     static gen_helper_gvec_3 * const lt_fn[4] = {
3742         gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3743         gen_helper_gvec_lt32, gen_helper_gvec_lt64
3744     };
3745     static gen_helper_gvec_3 * const le_fn[4] = {
3746         gen_helper_gvec_le8, gen_helper_gvec_le16,
3747         gen_helper_gvec_le32, gen_helper_gvec_le64
3748     };
3749     static gen_helper_gvec_3 * const ltu_fn[4] = {
3750         gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3751         gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3752     };
3753     static gen_helper_gvec_3 * const leu_fn[4] = {
3754         gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3755         gen_helper_gvec_leu32, gen_helper_gvec_leu64
3756     };
3757     static gen_helper_gvec_3 * const * const fns[16] = {
3758         [TCG_COND_EQ] = eq_fn,
3759         [TCG_COND_NE] = ne_fn,
3760         [TCG_COND_LT] = lt_fn,
3761         [TCG_COND_LE] = le_fn,
3762         [TCG_COND_LTU] = ltu_fn,
3763         [TCG_COND_LEU] = leu_fn,
3764     };
3765 
3766     const TCGOpcode *hold_list;
3767     TCGType type;
3768     uint32_t some;
3769 
3770     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3771     check_overlap_3(dofs, aofs, bofs, maxsz);
3772 
3773     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3774         do_dup(MO_8, dofs, oprsz, maxsz,
3775                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3776         return;
3777     }
3778 
3779     /*
3780      * Implement inline with a vector type, if possible.
3781      * Prefer integer when 64-bit host and 64-bit comparison.
3782      */
3783     hold_list = tcg_swap_vecop_list(cmp_list);
3784     type = choose_vector_type(cmp_list, vece, oprsz,
3785                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3786     switch (type) {
3787     case TCG_TYPE_V256:
3788         /* Recall that ARM SVE allows vector sizes that are not a
3789          * power of 2, but always a multiple of 16.  The intent is
3790          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3791          */
3792         some = QEMU_ALIGN_DOWN(oprsz, 32);
3793         expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3794         if (some == oprsz) {
3795             break;
3796         }
3797         dofs += some;
3798         aofs += some;
3799         bofs += some;
3800         oprsz -= some;
3801         maxsz -= some;
3802         /* fallthru */
3803     case TCG_TYPE_V128:
3804         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3805         break;
3806     case TCG_TYPE_V64:
3807         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3808         break;
3809 
3810     case 0:
3811         if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3812             expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3813         } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3814             expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3815         } else {
3816             gen_helper_gvec_3 * const *fn = fns[cond];
3817 
3818             if (fn == NULL) {
3819                 uint32_t tmp;
3820                 tmp = aofs, aofs = bofs, bofs = tmp;
3821                 cond = tcg_swap_cond(cond);
3822                 fn = fns[cond];
3823                 assert(fn != NULL);
3824             }
3825             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3826             oprsz = maxsz;
3827         }
3828         break;
3829 
3830     default:
3831         g_assert_not_reached();
3832     }
3833     tcg_swap_vecop_list(hold_list);
3834 
3835     if (oprsz < maxsz) {
3836         expand_clr(dofs + oprsz, maxsz - oprsz);
3837     }
3838 }
3839 
3840 static void expand_cmps_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3841                             uint32_t oprsz, uint32_t tysz, TCGType type,
3842                             TCGCond cond, TCGv_vec c)
3843 {
3844     TCGv_vec t0 = tcg_temp_new_vec(type);
3845     TCGv_vec t1 = tcg_temp_new_vec(type);
3846     uint32_t i;
3847 
3848     for (i = 0; i < oprsz; i += tysz) {
3849         tcg_gen_ld_vec(t1, tcg_env, aofs + i);
3850         tcg_gen_cmp_vec(cond, vece, t0, t1, c);
3851         tcg_gen_st_vec(t0, tcg_env, dofs + i);
3852     }
3853 }
3854 
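/* As above, but comparing each element of A against the scalar C.  */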
3855 void tcg_gen_gvec_cmps(TCGCond cond, unsigned vece, uint32_t dofs,
3856                        uint32_t aofs, TCGv_i64 c,
3857                        uint32_t oprsz, uint32_t maxsz)
3858 {
3859     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3860     static gen_helper_gvec_2i * const eq_fn[4] = {
3861         gen_helper_gvec_eqs8, gen_helper_gvec_eqs16,
3862         gen_helper_gvec_eqs32, gen_helper_gvec_eqs64
3863     };
3864     static gen_helper_gvec_2i * const lt_fn[4] = {
3865         gen_helper_gvec_lts8, gen_helper_gvec_lts16,
3866         gen_helper_gvec_lts32, gen_helper_gvec_lts64
3867     };
3868     static gen_helper_gvec_2i * const le_fn[4] = {
3869         gen_helper_gvec_les8, gen_helper_gvec_les16,
3870         gen_helper_gvec_les32, gen_helper_gvec_les64
3871     };
3872     static gen_helper_gvec_2i * const ltu_fn[4] = {
3873         gen_helper_gvec_ltus8, gen_helper_gvec_ltus16,
3874         gen_helper_gvec_ltus32, gen_helper_gvec_ltus64
3875     };
3876     static gen_helper_gvec_2i * const leu_fn[4] = {
3877         gen_helper_gvec_leus8, gen_helper_gvec_leus16,
3878         gen_helper_gvec_leus32, gen_helper_gvec_leus64
3879     };
3880     static gen_helper_gvec_2i * const * const fns[16] = {
3881         [TCG_COND_EQ] = eq_fn,
3882         [TCG_COND_LT] = lt_fn,
3883         [TCG_COND_LE] = le_fn,
3884         [TCG_COND_LTU] = ltu_fn,
3885         [TCG_COND_LEU] = leu_fn,
3886     };
3887 
3888     TCGType type;
3889 
3890     check_size_align(oprsz, maxsz, dofs | aofs);
3891     check_overlap_2(dofs, aofs, maxsz);
3892 
3893     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3894         do_dup(MO_8, dofs, oprsz, maxsz,
3895                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3896         return;
3897     }
3898 
3899     /*
3900      * Implement inline with a vector type, if possible.
3901      * Prefer integer when 64-bit host and 64-bit comparison.
3902      */
3903     type = choose_vector_type(cmp_list, vece, oprsz,
3904                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3905     if (type != 0) {
3906         const TCGOpcode *hold_list = tcg_swap_vecop_list(cmp_list);
3907         TCGv_vec t_vec = tcg_temp_new_vec(type);
3908         uint32_t some;
3909 
3910         tcg_gen_dup_i64_vec(vece, t_vec, c);
3911         switch (type) {
3912         case TCG_TYPE_V256:
3913             some = QEMU_ALIGN_DOWN(oprsz, 32);
3914             expand_cmps_vec(vece, dofs, aofs, some, 32,
3915                             TCG_TYPE_V256, cond, t_vec);
3916             aofs += some;
3917             dofs += some;
3918             oprsz -= some;
3919             maxsz -= some;
3920             /* fallthru */
3921 
3922         case TCG_TYPE_V128:
3923             some = QEMU_ALIGN_DOWN(oprsz, 16);
3924             expand_cmps_vec(vece, dofs, aofs, some, 16,
3925                             TCG_TYPE_V128, cond, t_vec);
3926             break;
3927 
3928         case TCG_TYPE_V64:
3929             some = QEMU_ALIGN_DOWN(oprsz, 8);
3930             expand_cmps_vec(vece, dofs, aofs, some, 8,
3931                             TCG_TYPE_V64, cond, t_vec);
3932             break;
3933 
3934         default:
3935             g_assert_not_reached();
3936         }
3937         tcg_temp_free_vec(t_vec);
3938         tcg_swap_vecop_list(hold_list);
3939     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3940         TCGv_i64 t0 = tcg_temp_ebb_new_i64();
3941         uint32_t i;
3942 
3943         for (i = 0; i < oprsz; i += 8) {
3944             tcg_gen_ld_i64(t0, tcg_env, aofs + i);
3945             tcg_gen_negsetcond_i64(cond, t0, t0, c);
3946             tcg_gen_st_i64(t0, tcg_env, dofs + i);
3947         }
3948         tcg_temp_free_i64(t0);
3949     } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3950         TCGv_i32 t0 = tcg_temp_ebb_new_i32();
3951         TCGv_i32 t1 = tcg_temp_ebb_new_i32();
3952         uint32_t i;
3953 
3954         tcg_gen_extrl_i64_i32(t1, c);
3955         for (i = 0; i < oprsz; i += 4) {
3956             tcg_gen_ld_i32(t0, tcg_env, aofs + i);
3957             tcg_gen_negsetcond_i32(cond, t0, t0, t1);
3958             tcg_gen_st_i32(t0, tcg_env, dofs + i);
3959         }
3960         tcg_temp_free_i32(t0);
3961         tcg_temp_free_i32(t1);
3962     } else {
3963         gen_helper_gvec_2i * const *fn = fns[cond];
3964         bool inv = false;
3965 
3966         if (fn == NULL) {
3967             cond = tcg_invert_cond(cond);
3968             fn = fns[cond];
3969             assert(fn != NULL);
3970             inv = true;
3971         }
3972         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, inv, fn[vece]);
3973         return;
3974     }
3975 
3976     if (oprsz < maxsz) {
3977         expand_clr(dofs + oprsz, maxsz - oprsz);
3978     }
3979 }
3980 
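/*
 * As above, with an immediate operand.  For example (offsets
 * hypothetical), tcg_gen_gvec_cmpi(TCG_COND_EQ, MO_8, dofs, aofs, 0,
 * oprsz, maxsz) sets each byte of D to 0xff where the corresponding
 * byte of A is zero, and to 0x00 elsewhere.
 */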
3981 void tcg_gen_gvec_cmpi(TCGCond cond, unsigned vece, uint32_t dofs,
3982                        uint32_t aofs, int64_t c,
3983                        uint32_t oprsz, uint32_t maxsz)
3984 {
3985     TCGv_i64 tmp = tcg_constant_i64(c);
3986     tcg_gen_gvec_cmps(cond, vece, dofs, aofs, tmp, oprsz, maxsz);
3987 }
3988 
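/* Bitwise select: D = (B & A) | (C & ~A), with A acting as the bit mask.  */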
3989 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3990 {
3991     TCGv_i64 t = tcg_temp_ebb_new_i64();
3992 
3993     tcg_gen_and_i64(t, b, a);
3994     tcg_gen_andc_i64(d, c, a);
3995     tcg_gen_or_i64(d, d, t);
3996     tcg_temp_free_i64(t);
3997 }
3998 
3999 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
4000                          uint32_t bofs, uint32_t cofs,
4001                          uint32_t oprsz, uint32_t maxsz)
4002 {
4003     static const GVecGen4 g = {
4004         .fni8 = tcg_gen_bitsel_i64,
4005         .fniv = tcg_gen_bitsel_vec,
4006         .fno = gen_helper_gvec_bitsel,
4007     };
4008 
4009     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
4010 }
4011