/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "tcg.h"
#include "tcg-op.h"
#include "tcg-op-gvec.h"
#include "tcg-gvec-desc.h"

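/*
 * Inline expansion is capped: an operation is expanded inline only while
 * it needs at most MAX_UNROLL loads/stores of the chosen element size
 * (see check_size_impl below); anything larger goes out of line.
 */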
#define MAX_UNROLL  4

#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif


/* Verify vector size and alignment rules.  OFS should be the OR of all
   of the operand offsets so that we can check them all at once.  */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t opr_align = oprsz >= 16 ? 15 : 7;
    uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
    tcg_debug_assert(oprsz > 0);
    tcg_debug_assert(oprsz <= maxsz);
    tcg_debug_assert((oprsz & opr_align) == 0);
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}
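
/* Concretely: sizes of 8 bytes need only 8-byte alignment, while once
   either size reaches 16 bytes, both sizes must be multiples of 16 and
   every operand offset must be 16-byte aligned.  */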

/* Verify vector overlap rules for two operands.  */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}

/* Verify vector overlap rules for three operands.  */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}

/* Verify vector overlap rules for four operands.  */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}

/* Create a descriptor from components.  */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
    assert(data == sextract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;
    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}
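
/* For example, simd_desc(16, 16, 0) stores both sizes as (16 / 8) - 1 = 1
   in their bit fields; helpers recover the originals at run time with
   simd_oprsz(desc) and simd_maxsz(desc) from tcg-gvec-desc.h.  */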

/* Generate a call to a gvec-style helper with two vector operands.  */
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}
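
/* As an illustration (the helper name here is hypothetical): a target
   that declares DEF_HELPER_FLAGS_3(gvec_frob, TCG_CALL_NO_RWG, void,
   ptr, ptr, i32) could emit

       tcg_gen_gvec_2_ool(dofs, aofs, 16, 16, 0, gen_helper_gvec_frob);

   and helper_gvec_frob() then receives env+dofs, env+aofs and the
   packed descriptor at run time.  */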

/* Generate a call to a gvec-style helper with two vector operands
   and one scalar operand.  */
void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                         uint32_t oprsz, uint32_t maxsz, int32_t data,
                         gen_helper_gvec_2i *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, c, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands.  */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands.  */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with five vector operands.  */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code.  */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    if (oprsz % lnsz == 0) {
        uint32_t lnct = oprsz / lnsz;
        return lnct >= 1 && lnct <= MAX_UNROLL;
    }
    return false;
}
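
/* E.g. with MAX_UNROLL == 4: oprsz == 32 in 8-byte units is 4 lines and
   is expanded inline, oprsz == 40 would take 5 lines and is not, and any
   oprsz not divisible by lnsz is rejected outright.  */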

static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE.  */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}
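
/* E.g. dup_const(MO_8, 0xab) == 0xababababababababull and
   dup_const(MO_16, 0x1234) == 0x1234123412341234ull.  */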

/* Duplicate IN into OUT as per VECE.  */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
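        /* Deposit the low 16 bits of IN into bits [31:16] of OUT,
           replicating the half-word without a multiply.  */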
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  If LIST is NULL, assume that the real operation to be performed
 * is supported by all backends.  Otherwise, make sure that the opcodes in
 * LIST can be performed on elements of size VECE in the selected type.
 * Do not select V64 if PREFER_I64 is true.  Return 0 if no vector type
 * is selected.
 */
static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
                                  uint32_t size, bool prefer_i64)
{
    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * It is hard to imagine a case in which v256 is supported
         * but v128 is not, but check anyway.
         */
        if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece)
            && (size % 32 == 0
                || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) {
            return TCG_TYPE_V256;
        }
    }
    if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
        && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) {
        return TCG_TYPE_V128;
    }
    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
        && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
        return TCG_TYPE_V64;
    }
    return 0;
}
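
/* The search above runs widest-first: e.g. a host with v256 and v128
   support expands size == 32 with a single 256-bit operation, while a
   host with only v128 uses two 128-bit operations for the same size.  */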

static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, TCGv_vec t_vec)
{
    uint32_t i = 0;

    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        for (; i + 32 <= oprsz; i += 32) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
        }
        /* fallthru */
    case TCG_TYPE_V128:
        for (; i + 16 <= oprsz; i += 16) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
        }
        break;
    case TCG_TYPE_V64:
        for (; i < oprsz; i += 8) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
        }
        break;
    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(NULL, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            tcg_gen_dupi_vec(vece, t_vec, in_c);
        }
        do_dup_store(type, dofs, oprsz, maxsz, t_vec);
        tcg_temp_free_vec(t_vec);
        return;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_const_i64(in_c);
            } else {
                t_32 = tcg_const_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_const_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
            tcg_temp_free_i64(t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else {
            t_32 = tcg_temp_new_i32();
            if (in_64) {
                tcg_gen_extrl_i64_i32(t_32, in_64);
            } else if (vece == MO_8) {
                tcg_gen_movi_i32(t_32, in_c & 0xff);
            } else if (vece == MO_16) {
                tcg_gen_movi_i32(t_32, in_c & 0xffff);
            } else {
                tcg_gen_movi_i32(t_32, in_c);
            }
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    tcg_temp_free_i32(t_desc);
    return;

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Likewise, but with zero: clear MAXSZ bytes at DOFS.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}

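/*
 * The expand_<n>[i|s]_{i32,i64,vec} functions below provide the inline
 * expansions: <n> is the operand count, a trailing "i" adds an immediate
 * operand and a trailing "s" a scalar TCG-variable operand.
 */
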
/* Expand OPRSZ bytes worth of two-operand operations using i32 elements.  */
static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i32 c, bool scalar_first,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i32 elements.  */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t2);
}

/* Expand OPRSZ bytes worth of four-operand operations using i32 elements.  */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i32(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i64 elements.  */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i64 elements.  */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

/* Expand OPRSZ bytes worth of four-operand operations using i64 elements.  */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i64(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using host vectors.  */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        fni(vece, t0, t0);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
}

/* Expand OPRSZ bytes worth of two-vector operands and an immediate operand
   using host vectors.  */
static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0, c);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          TCGv_vec c, bool scalar_first,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(vece, t1, c, t0);
        } else {
            fni(vece, t1, t0, c);
        }
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using host vectors.  */
static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/*
 * Expand OPRSZ bytes worth of three-vector operands and an immediate operand
 * using host vectors.
 */
static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t bofs, uint32_t oprsz, uint32_t tysz,
                          TCGType type, int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
                                      int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1, c);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t2);
}

/* Expand OPRSZ bytes worth of four-operand operations using host vectors.  */
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool write_aofs,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
                                     TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    TCGv_vec t3 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
        fni(vece, t0, t1, t2, t3);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_vec(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_vec(t3);
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

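/*
 * The tcg_gen_gvec_<n> expanders below all dispatch the same way: try
 * host vector types from widest to narrowest via choose_vector_type(),
 * then inline expansion on i64 or i32 pieces, then the out-of-line
 * helper; any tail between OPRSZ and MAXSZ is cleared afterwards.
 */
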
/* Expand a vector two-operand operation.  */
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2_i64(dofs, aofs, oprsz, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2_i32(dofs, aofs, oprsz, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and an immediate.  */
void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            if (g->fno) {
                tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
            } else {
                TCGv_i64 tcg_c = tcg_const_i64(c);
                tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
                                    maxsz, c, g->fnoi);
                tcg_temp_free_i64(tcg_c);
            }
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and a scalar.  */
void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
{
    TCGType type;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    if (type != 0) {
        const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
        const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
        TCGv_vec t_vec = tcg_temp_new_vec(type);
        uint32_t some;

        tcg_gen_dup_i64_vec(g->vece, t_vec, c);

        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16.  The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                          t_vec, g->scalar_first, g->fniv);
            if (some == oprsz) {
                break;
            }
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */

        case TCG_TYPE_V128:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          t_vec, g->scalar_first, g->fniv);
            break;

        case TCG_TYPE_V64:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          t_vec, g->scalar_first, g->fniv);
            break;

        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(t_vec);
        tcg_swap_vecop_list(hold_list);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        TCGv_i64 t64 = tcg_temp_new_i64();

        gen_dup_i64(g->vece, t64, c);
        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
        tcg_temp_free_i64(t64);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        TCGv_i32 t32 = tcg_temp_new_i32();

        tcg_gen_extrl_i64_i32(t32, c);
        gen_dup_i32(g->vece, t32, t32);
        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
        tcg_temp_free_i32(t32);
    } else {
        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
        return;
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector three-operand operation.  */
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
                               maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with three vectors and an immediate.  */
void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                     uint32_t oprsz, uint32_t maxsz, int64_t c,
                     const GVecGen3i *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector four-operand operation.  */
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->write_aofs, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->write_aofs, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->write_aofs, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                               oprsz, maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/*
 * Expand specific vector operations.
 */

static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}

void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_mov_i64,
        .fniv = vec_mov2,
        .fno = gen_helper_gvec_mov,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    if (dofs != aofs) {
        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
    } else {
        check_size_align(oprsz, maxsz, dofs);
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}

void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}

void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}

void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t maxsz)
{
    check_size_align(oprsz, maxsz, dofs);
    if (vece <= MO_64) {
        TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
        if (type != 0) {
            TCGv_vec t_vec = tcg_temp_new_vec(type);
            tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
            do_dup_store(type, dofs, oprsz, maxsz, t_vec);
            tcg_temp_free_vec(t_vec);
        } else if (vece <= MO_32) {
            TCGv_i32 in = tcg_temp_new_i32();
            switch (vece) {
            case MO_8:
                tcg_gen_ld8u_i32(in, cpu_env, aofs);
                break;
            case MO_16:
                tcg_gen_ld16u_i32(in, cpu_env, aofs);
                break;
            default:
                tcg_gen_ld_i32(in, cpu_env, aofs);
                break;
            }
            do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
            tcg_temp_free_i32(in);
        } else {
            TCGv_i64 in = tcg_temp_new_i64();
            tcg_gen_ld_i64(in, cpu_env, aofs);
            do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
            tcg_temp_free_i64(in);
        }
    } else {
        /* 128-bit duplicate.  */
        /* ??? Dup to 256-bit vector.  */
        int i;

        tcg_debug_assert(vece == 4);
        tcg_debug_assert(oprsz >= 16);
        if (TCG_TARGET_HAS_v128) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in, cpu_env, aofs);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_vec(in, cpu_env, dofs + i);
            }
            tcg_temp_free_vec(in);
        } else {
            TCGv_i64 in0 = tcg_temp_new_i64();
            TCGv_i64 in1 = tcg_temp_new_i64();

            tcg_gen_ld_i64(in0, cpu_env, aofs);
            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_i64(in0, cpu_env, dofs + i);
                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
            }
            tcg_temp_free_i64(in0);
            tcg_temp_free_i64(in1);
        }
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}

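/* The dup<n>i entry points replicate an immediate of the given element
   size: each is a thin wrapper around do_dup() with no variable input.  */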
void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint32_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint16_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
                        uint32_t maxsz, uint8_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_not_i64,
        .fniv = tcg_gen_not_vec,
        .fno = gen_helper_gvec_not,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
}

/* Perform a vector addition using normal addition and a mask.  The mask
   should be the sign bit of each lane.  This 6-operation form is more
   efficient than separate additions when there are 4 or more lanes in
   the 64-bit operation.  */
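/* Clearing the sign bit of each lane before the addition guarantees that
   no carry propagates across a lane boundary; XORing with (A ^ B) & M
   afterwards reconstructs the correct sign bit of each lane's sum while
   discarding the inter-lane carry.  */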
1555 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1556 {
1557     TCGv_i64 t1 = tcg_temp_new_i64();
1558     TCGv_i64 t2 = tcg_temp_new_i64();
1559     TCGv_i64 t3 = tcg_temp_new_i64();
1560 
1561     tcg_gen_andc_i64(t1, a, m);
1562     tcg_gen_andc_i64(t2, b, m);
1563     tcg_gen_xor_i64(t3, a, b);
1564     tcg_gen_add_i64(d, t1, t2);
1565     tcg_gen_and_i64(t3, t3, m);
1566     tcg_gen_xor_i64(d, d, t3);
1567 
1568     tcg_temp_free_i64(t1);
1569     tcg_temp_free_i64(t2);
1570     tcg_temp_free_i64(t3);
1571 }
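/*
 * Illustrative only: with 8-bit lanes (m = 0x80 per lane), adding
 * lanes a = 0xff and b = 0x01 gives t1 = 0x7f, t2 = 0x01, and
 * t1 + t2 = 0x80 with no carry out of the lane, since both msbs were
 * cleared; xoring in (a ^ b) & m = 0x80 then restores the true sign
 * bit, so the lane wraps to 0x00 without disturbing its neighbour.
 */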
1572 
1573 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1574 {
1575     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1576     gen_addv_mask(d, a, b, m);
1577     tcg_temp_free_i64(m);
1578 }
1579 
1580 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1581 {
1582     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1583     gen_addv_mask(d, a, b, m);
1584     tcg_temp_free_i64(m);
1585 }
1586 
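/*
 * For two 32-bit lanes the mask trick is unnecessary: compute the low
 * lane with a full 64-bit add, compute the high lane with A's low half
 * cleared (so no carry can cross the lane boundary), and merge the
 * results with a deposit.
 */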
1587 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1588 {
1589     TCGv_i64 t1 = tcg_temp_new_i64();
1590     TCGv_i64 t2 = tcg_temp_new_i64();
1591 
1592     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1593     tcg_gen_add_i64(t2, a, b);
1594     tcg_gen_add_i64(t1, t1, b);
1595     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1596 
1597     tcg_temp_free_i64(t1);
1598     tcg_temp_free_i64(t2);
1599 }
1600 
1601 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1602 
1603 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1604                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1605 {
1606     static const GVecGen3 g[4] = {
1607         { .fni8 = tcg_gen_vec_add8_i64,
1608           .fniv = tcg_gen_add_vec,
1609           .fno = gen_helper_gvec_add8,
1610           .opt_opc = vecop_list_add,
1611           .vece = MO_8 },
1612         { .fni8 = tcg_gen_vec_add16_i64,
1613           .fniv = tcg_gen_add_vec,
1614           .fno = gen_helper_gvec_add16,
1615           .opt_opc = vecop_list_add,
1616           .vece = MO_16 },
1617         { .fni4 = tcg_gen_add_i32,
1618           .fniv = tcg_gen_add_vec,
1619           .fno = gen_helper_gvec_add32,
1620           .opt_opc = vecop_list_add,
1621           .vece = MO_32 },
1622         { .fni8 = tcg_gen_add_i64,
1623           .fniv = tcg_gen_add_vec,
1624           .fno = gen_helper_gvec_add64,
1625           .opt_opc = vecop_list_add,
1626           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1627           .vece = MO_64 },
1628     };
1629 
1630     tcg_debug_assert(vece <= MO_64);
1631     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1632 }
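/*
 * Usage sketch (offsets hypothetical): element-wise add of two 16-byte
 * vectors in env, treated as eight 16-bit lanes:
 *
 *     tcg_gen_gvec_add(MO_16, dofs, aofs, bofs, 16, 16);
 */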
1633 
1634 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1635                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1636 {
1637     static const GVecGen2s g[4] = {
1638         { .fni8 = tcg_gen_vec_add8_i64,
1639           .fniv = tcg_gen_add_vec,
1640           .fno = gen_helper_gvec_adds8,
1641           .opt_opc = vecop_list_add,
1642           .vece = MO_8 },
1643         { .fni8 = tcg_gen_vec_add16_i64,
1644           .fniv = tcg_gen_add_vec,
1645           .fno = gen_helper_gvec_adds16,
1646           .opt_opc = vecop_list_add,
1647           .vece = MO_16 },
1648         { .fni4 = tcg_gen_add_i32,
1649           .fniv = tcg_gen_add_vec,
1650           .fno = gen_helper_gvec_adds32,
1651           .opt_opc = vecop_list_add,
1652           .vece = MO_32 },
1653         { .fni8 = tcg_gen_add_i64,
1654           .fniv = tcg_gen_add_vec,
1655           .fno = gen_helper_gvec_adds64,
1656           .opt_opc = vecop_list_add,
1657           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1658           .vece = MO_64 },
1659     };
1660 
1661     tcg_debug_assert(vece <= MO_64);
1662     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1663 }
1664 
1665 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
1666                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1667 {
1668     TCGv_i64 tmp = tcg_const_i64(c);
1669     tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
1670     tcg_temp_free_i64(tmp);
1671 }
1672 
1673 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
1674 
1675 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1676                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1677 {
1678     static const GVecGen2s g[4] = {
1679         { .fni8 = tcg_gen_vec_sub8_i64,
1680           .fniv = tcg_gen_sub_vec,
1681           .fno = gen_helper_gvec_subs8,
1682           .opt_opc = vecop_list_sub,
1683           .vece = MO_8 },
1684         { .fni8 = tcg_gen_vec_sub16_i64,
1685           .fniv = tcg_gen_sub_vec,
1686           .fno = gen_helper_gvec_subs16,
1687           .opt_opc = vecop_list_sub,
1688           .vece = MO_16 },
1689         { .fni4 = tcg_gen_sub_i32,
1690           .fniv = tcg_gen_sub_vec,
1691           .fno = gen_helper_gvec_subs32,
1692           .opt_opc = vecop_list_sub,
1693           .vece = MO_32 },
1694         { .fni8 = tcg_gen_sub_i64,
1695           .fniv = tcg_gen_sub_vec,
1696           .fno = gen_helper_gvec_subs64,
1697           .opt_opc = vecop_list_sub,
1698           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1699           .vece = MO_64 },
1700     };
1701 
1702     tcg_debug_assert(vece <= MO_64);
1703     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1704 }
1705 
1706 /* Perform a vector subtraction using normal subtraction and a mask.
1707    Compare gen_addv_mask above.  */
1708 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1709 {
1710     TCGv_i64 t1 = tcg_temp_new_i64();
1711     TCGv_i64 t2 = tcg_temp_new_i64();
1712     TCGv_i64 t3 = tcg_temp_new_i64();
1713 
1714     tcg_gen_or_i64(t1, a, m);
1715     tcg_gen_andc_i64(t2, b, m);
1716     tcg_gen_eqv_i64(t3, a, b);
1717     tcg_gen_sub_i64(d, t1, t2);
1718     tcg_gen_and_i64(t3, t3, m);
1719     tcg_gen_xor_i64(d, d, t3);
1720 
1721     tcg_temp_free_i64(t1);
1722     tcg_temp_free_i64(t2);
1723     tcg_temp_free_i64(t3);
1724 }
1725 
1726 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1727 {
1728     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1729     gen_subv_mask(d, a, b, m);
1730     tcg_temp_free_i64(m);
1731 }
1732 
1733 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1734 {
1735     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1736     gen_subv_mask(d, a, b, m);
1737     tcg_temp_free_i64(m);
1738 }
1739 
1740 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1741 {
1742     TCGv_i64 t1 = tcg_temp_new_i64();
1743     TCGv_i64 t2 = tcg_temp_new_i64();
1744 
1745     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1746     tcg_gen_sub_i64(t2, a, b);
1747     tcg_gen_sub_i64(t1, a, t1);
1748     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1749 
1750     tcg_temp_free_i64(t1);
1751     tcg_temp_free_i64(t2);
1752 }
1753 
1754 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1755                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1756 {
1757     static const GVecGen3 g[4] = {
1758         { .fni8 = tcg_gen_vec_sub8_i64,
1759           .fniv = tcg_gen_sub_vec,
1760           .fno = gen_helper_gvec_sub8,
1761           .opt_opc = vecop_list_sub,
1762           .vece = MO_8 },
1763         { .fni8 = tcg_gen_vec_sub16_i64,
1764           .fniv = tcg_gen_sub_vec,
1765           .fno = gen_helper_gvec_sub16,
1766           .opt_opc = vecop_list_sub,
1767           .vece = MO_16 },
1768         { .fni4 = tcg_gen_sub_i32,
1769           .fniv = tcg_gen_sub_vec,
1770           .fno = gen_helper_gvec_sub32,
1771           .opt_opc = vecop_list_sub,
1772           .vece = MO_32 },
1773         { .fni8 = tcg_gen_sub_i64,
1774           .fniv = tcg_gen_sub_vec,
1775           .fno = gen_helper_gvec_sub64,
1776           .opt_opc = vecop_list_sub,
1777           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1778           .vece = MO_64 },
1779     };
1780 
1781     tcg_debug_assert(vece <= MO_64);
1782     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1783 }
1784 
1785 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
1786 
1787 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
1788                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1789 {
1790     static const GVecGen3 g[4] = {
1791         { .fniv = tcg_gen_mul_vec,
1792           .fno = gen_helper_gvec_mul8,
1793           .opt_opc = vecop_list_mul,
1794           .vece = MO_8 },
1795         { .fniv = tcg_gen_mul_vec,
1796           .fno = gen_helper_gvec_mul16,
1797           .opt_opc = vecop_list_mul,
1798           .vece = MO_16 },
1799         { .fni4 = tcg_gen_mul_i32,
1800           .fniv = tcg_gen_mul_vec,
1801           .fno = gen_helper_gvec_mul32,
1802           .opt_opc = vecop_list_mul,
1803           .vece = MO_32 },
1804         { .fni8 = tcg_gen_mul_i64,
1805           .fniv = tcg_gen_mul_vec,
1806           .fno = gen_helper_gvec_mul64,
1807           .opt_opc = vecop_list_mul,
1808           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1809           .vece = MO_64 },
1810     };
1811 
1812     tcg_debug_assert(vece <= MO_64);
1813     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1814 }
1815 
1816 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
1817                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1818 {
1819     static const GVecGen2s g[4] = {
1820         { .fniv = tcg_gen_mul_vec,
1821           .fno = gen_helper_gvec_muls8,
1822           .opt_opc = vecop_list_mul,
1823           .vece = MO_8 },
1824         { .fniv = tcg_gen_mul_vec,
1825           .fno = gen_helper_gvec_muls16,
1826           .opt_opc = vecop_list_mul,
1827           .vece = MO_16 },
1828         { .fni4 = tcg_gen_mul_i32,
1829           .fniv = tcg_gen_mul_vec,
1830           .fno = gen_helper_gvec_muls32,
1831           .opt_opc = vecop_list_mul,
1832           .vece = MO_32 },
1833         { .fni8 = tcg_gen_mul_i64,
1834           .fniv = tcg_gen_mul_vec,
1835           .fno = gen_helper_gvec_muls64,
1836           .opt_opc = vecop_list_mul,
1837           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1838           .vece = MO_64 },
1839     };
1840 
1841     tcg_debug_assert(vece <= MO_64);
1842     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1843 }
1844 
1845 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
1846                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1847 {
1848     TCGv_i64 tmp = tcg_const_i64(c);
1849     tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
1850     tcg_temp_free_i64(tmp);
1851 }
1852 
1853 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1854                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1855 {
1856     static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
1857     static const GVecGen3 g[4] = {
1858         { .fniv = tcg_gen_ssadd_vec,
1859           .fno = gen_helper_gvec_ssadd8,
1860           .opt_opc = vecop_list,
1861           .vece = MO_8 },
1862         { .fniv = tcg_gen_ssadd_vec,
1863           .fno = gen_helper_gvec_ssadd16,
1864           .opt_opc = vecop_list,
1865           .vece = MO_16 },
1866         { .fniv = tcg_gen_ssadd_vec,
1867           .fno = gen_helper_gvec_ssadd32,
1868           .opt_opc = vecop_list,
1869           .vece = MO_32 },
1870         { .fniv = tcg_gen_ssadd_vec,
1871           .fno = gen_helper_gvec_ssadd64,
1872           .opt_opc = vecop_list,
1873           .vece = MO_64 },
1874     };
1875     tcg_debug_assert(vece <= MO_64);
1876     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1877 }
1878 
1879 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
1880                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1881 {
1882     static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
1883     static const GVecGen3 g[4] = {
1884         { .fniv = tcg_gen_sssub_vec,
1885           .fno = gen_helper_gvec_sssub8,
1886           .opt_opc = vecop_list,
1887           .vece = MO_8 },
1888         { .fniv = tcg_gen_sssub_vec,
1889           .fno = gen_helper_gvec_sssub16,
1890           .opt_opc = vecop_list,
1891           .vece = MO_16 },
1892         { .fniv = tcg_gen_sssub_vec,
1893           .fno = gen_helper_gvec_sssub32,
1894           .opt_opc = vecop_list,
1895           .vece = MO_32 },
1896         { .fniv = tcg_gen_sssub_vec,
1897           .fno = gen_helper_gvec_sssub64,
1898           .opt_opc = vecop_list,
1899           .vece = MO_64 },
1900     };
1901     tcg_debug_assert(vece <= MO_64);
1902     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1903 }
1904 
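/*
 * Unsigned saturating add: after d = a + b, overflow is detected by
 * d < a (unsigned), in which case the result is clamped to all-ones.
 */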
1905 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1906 {
1907     TCGv_i32 max = tcg_const_i32(-1);
1908     tcg_gen_add_i32(d, a, b);
1909     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
1910     tcg_temp_free_i32(max);
1911 }
1912 
1913 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1914 {
1915     TCGv_i64 max = tcg_const_i64(-1);
1916     tcg_gen_add_i64(d, a, b);
1917     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
1918     tcg_temp_free_i64(max);
1919 }
1920 
1921 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1922                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1923 {
1924     static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
1925     static const GVecGen3 g[4] = {
1926         { .fniv = tcg_gen_usadd_vec,
1927           .fno = gen_helper_gvec_usadd8,
1928           .opt_opc = vecop_list,
1929           .vece = MO_8 },
1930         { .fniv = tcg_gen_usadd_vec,
1931           .fno = gen_helper_gvec_usadd16,
1932           .opt_opc = vecop_list,
1933           .vece = MO_16 },
1934         { .fni4 = tcg_gen_usadd_i32,
1935           .fniv = tcg_gen_usadd_vec,
1936           .fno = gen_helper_gvec_usadd32,
1937           .opt_opc = vecop_list,
1938           .vece = MO_32 },
1939         { .fni8 = tcg_gen_usadd_i64,
1940           .fniv = tcg_gen_usadd_vec,
1941           .fno = gen_helper_gvec_usadd64,
1942           .opt_opc = vecop_list,
1943           .vece = MO_64 }
1944     };
1945     tcg_debug_assert(vece <= MO_64);
1946     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1947 }
1948 
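/*
 * Unsigned saturating subtract: after d = a - b, underflow is detected
 * by a < b (unsigned), in which case the result is clamped to zero.
 */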
1949 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1950 {
1951     TCGv_i32 min = tcg_const_i32(0);
1952     tcg_gen_sub_i32(d, a, b);
1953     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
1954     tcg_temp_free_i32(min);
1955 }
1956 
1957 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1958 {
1959     TCGv_i64 min = tcg_const_i64(0);
1960     tcg_gen_sub_i64(d, a, b);
1961     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
1962     tcg_temp_free_i64(min);
1963 }
1964 
1965 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
1966                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1967 {
1968     static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
1969     static const GVecGen3 g[4] = {
1970         { .fniv = tcg_gen_ussub_vec,
1971           .fno = gen_helper_gvec_ussub8,
1972           .opt_opc = vecop_list,
1973           .vece = MO_8 },
1974         { .fniv = tcg_gen_ussub_vec,
1975           .fno = gen_helper_gvec_ussub16,
1976           .opt_opc = vecop_list,
1977           .vece = MO_16 },
1978         { .fni4 = tcg_gen_ussub_i32,
1979           .fniv = tcg_gen_ussub_vec,
1980           .fno = gen_helper_gvec_ussub32,
1981           .opt_opc = vecop_list,
1982           .vece = MO_32 },
1983         { .fni8 = tcg_gen_ussub_i64,
1984           .fniv = tcg_gen_ussub_vec,
1985           .fno = gen_helper_gvec_ussub64,
1986           .opt_opc = vecop_list,
1987           .vece = MO_64 }
1988     };
1989     tcg_debug_assert(vece <= MO_64);
1990     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1991 }
1992 
1993 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
1994                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1995 {
1996     static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
1997     static const GVecGen3 g[4] = {
1998         { .fniv = tcg_gen_smin_vec,
1999           .fno = gen_helper_gvec_smin8,
2000           .opt_opc = vecop_list,
2001           .vece = MO_8 },
2002         { .fniv = tcg_gen_smin_vec,
2003           .fno = gen_helper_gvec_smin16,
2004           .opt_opc = vecop_list,
2005           .vece = MO_16 },
2006         { .fni4 = tcg_gen_smin_i32,
2007           .fniv = tcg_gen_smin_vec,
2008           .fno = gen_helper_gvec_smin32,
2009           .opt_opc = vecop_list,
2010           .vece = MO_32 },
2011         { .fni8 = tcg_gen_smin_i64,
2012           .fniv = tcg_gen_smin_vec,
2013           .fno = gen_helper_gvec_smin64,
2014           .opt_opc = vecop_list,
2015           .vece = MO_64 }
2016     };
2017     tcg_debug_assert(vece <= MO_64);
2018     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2019 }
2020 
2021 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2022                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2023 {
2024     static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2025     static const GVecGen3 g[4] = {
2026         { .fniv = tcg_gen_umin_vec,
2027           .fno = gen_helper_gvec_umin8,
2028           .opt_opc = vecop_list,
2029           .vece = MO_8 },
2030         { .fniv = tcg_gen_umin_vec,
2031           .fno = gen_helper_gvec_umin16,
2032           .opt_opc = vecop_list,
2033           .vece = MO_16 },
2034         { .fni4 = tcg_gen_umin_i32,
2035           .fniv = tcg_gen_umin_vec,
2036           .fno = gen_helper_gvec_umin32,
2037           .opt_opc = vecop_list,
2038           .vece = MO_32 },
2039         { .fni8 = tcg_gen_umin_i64,
2040           .fniv = tcg_gen_umin_vec,
2041           .fno = gen_helper_gvec_umin64,
2042           .opt_opc = vecop_list,
2043           .vece = MO_64 }
2044     };
2045     tcg_debug_assert(vece <= MO_64);
2046     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2047 }
2048 
2049 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2050                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2051 {
2052     static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2053     static const GVecGen3 g[4] = {
2054         { .fniv = tcg_gen_smax_vec,
2055           .fno = gen_helper_gvec_smax8,
2056           .opt_opc = vecop_list,
2057           .vece = MO_8 },
2058         { .fniv = tcg_gen_smax_vec,
2059           .fno = gen_helper_gvec_smax16,
2060           .opt_opc = vecop_list,
2061           .vece = MO_16 },
2062         { .fni4 = tcg_gen_smax_i32,
2063           .fniv = tcg_gen_smax_vec,
2064           .fno = gen_helper_gvec_smax32,
2065           .opt_opc = vecop_list,
2066           .vece = MO_32 },
2067         { .fni8 = tcg_gen_smax_i64,
2068           .fniv = tcg_gen_smax_vec,
2069           .fno = gen_helper_gvec_smax64,
2070           .opt_opc = vecop_list,
2071           .vece = MO_64 }
2072     };
2073     tcg_debug_assert(vece <= MO_64);
2074     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2075 }
2076 
2077 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2078                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2079 {
2080     static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2081     static const GVecGen3 g[4] = {
2082         { .fniv = tcg_gen_umax_vec,
2083           .fno = gen_helper_gvec_umax8,
2084           .opt_opc = vecop_list,
2085           .vece = MO_8 },
2086         { .fniv = tcg_gen_umax_vec,
2087           .fno = gen_helper_gvec_umax16,
2088           .opt_opc = vecop_list,
2089           .vece = MO_16 },
2090         { .fni4 = tcg_gen_umax_i32,
2091           .fniv = tcg_gen_umax_vec,
2092           .fno = gen_helper_gvec_umax32,
2093           .opt_opc = vecop_list,
2094           .vece = MO_32 },
2095         { .fni8 = tcg_gen_umax_i64,
2096           .fniv = tcg_gen_umax_vec,
2097           .fno = gen_helper_gvec_umax64,
2098           .opt_opc = vecop_list,
2099           .vece = MO_64 }
2100     };
2101     tcg_debug_assert(vece <= MO_64);
2102     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2103 }
2104 
2105 /* Perform a vector negation using normal negation and a mask.
2106    Compare gen_subv_mask above.  */
2107 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2108 {
2109     TCGv_i64 t2 = tcg_temp_new_i64();
2110     TCGv_i64 t3 = tcg_temp_new_i64();
2111 
2112     tcg_gen_andc_i64(t3, m, b);
2113     tcg_gen_andc_i64(t2, b, m);
2114     tcg_gen_sub_i64(d, m, t2);
2115     tcg_gen_xor_i64(d, d, t3);
2116 
2117     tcg_temp_free_i64(t2);
2118     tcg_temp_free_i64(t3);
2119 }
2120 
2121 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2122 {
2123     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
2124     gen_negv_mask(d, b, m);
2125     tcg_temp_free_i64(m);
2126 }
2127 
2128 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2129 {
2130     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
2131     gen_negv_mask(d, b, m);
2132     tcg_temp_free_i64(m);
2133 }
2134 
2135 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2136 {
2137     TCGv_i64 t1 = tcg_temp_new_i64();
2138     TCGv_i64 t2 = tcg_temp_new_i64();
2139 
2140     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2141     tcg_gen_neg_i64(t2, b);
2142     tcg_gen_neg_i64(t1, t1);
2143     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2144 
2145     tcg_temp_free_i64(t1);
2146     tcg_temp_free_i64(t2);
2147 }
2148 
2149 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2150                       uint32_t oprsz, uint32_t maxsz)
2151 {
2152     static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2153     static const GVecGen2 g[4] = {
2154         { .fni8 = tcg_gen_vec_neg8_i64,
2155           .fniv = tcg_gen_neg_vec,
2156           .fno = gen_helper_gvec_neg8,
2157           .opt_opc = vecop_list,
2158           .vece = MO_8 },
2159         { .fni8 = tcg_gen_vec_neg16_i64,
2160           .fniv = tcg_gen_neg_vec,
2161           .fno = gen_helper_gvec_neg16,
2162           .opt_opc = vecop_list,
2163           .vece = MO_16 },
2164         { .fni4 = tcg_gen_neg_i32,
2165           .fniv = tcg_gen_neg_vec,
2166           .fno = gen_helper_gvec_neg32,
2167           .opt_opc = vecop_list,
2168           .vece = MO_32 },
2169         { .fni8 = tcg_gen_neg_i64,
2170           .fniv = tcg_gen_neg_vec,
2171           .fno = gen_helper_gvec_neg64,
2172           .opt_opc = vecop_list,
2173           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2174           .vece = MO_64 },
2175     };
2176 
2177     tcg_debug_assert(vece <= MO_64);
2178     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2179 }
2180 
2181 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2182 {
2183     TCGv_i64 t = tcg_temp_new_i64();
2184     int nbit = 8 << vece;
2185 
2186     /* Create -1 for each negative element.  */
2187     tcg_gen_shri_i64(t, b, nbit - 1);
2188     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2189     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2190 
2191     /*
2192      * Invert (via xor -1) and add one (via sub -1).
2193      * Because of the ordering the msb is cleared,
2194      * so we never have carry into the next element.
2195      */
2196     tcg_gen_xor_i64(d, b, t);
2197     tcg_gen_sub_i64(d, d, t);
2198 
2199     tcg_temp_free_i64(t);
2200 }
2201 
2202 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2203 {
2204     gen_absv_mask(d, b, MO_8);
2205 }
2206 
2207 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2208 {
2209     gen_absv_mask(d, b, MO_16);
2210 }
2211 
2212 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2213                       uint32_t oprsz, uint32_t maxsz)
2214 {
2215     static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2216     static const GVecGen2 g[4] = {
2217         { .fni8 = tcg_gen_vec_abs8_i64,
2218           .fniv = tcg_gen_abs_vec,
2219           .fno = gen_helper_gvec_abs8,
2220           .opt_opc = vecop_list,
2221           .vece = MO_8 },
2222         { .fni8 = tcg_gen_vec_abs16_i64,
2223           .fniv = tcg_gen_abs_vec,
2224           .fno = gen_helper_gvec_abs16,
2225           .opt_opc = vecop_list,
2226           .vece = MO_16 },
2227         { .fni4 = tcg_gen_abs_i32,
2228           .fniv = tcg_gen_abs_vec,
2229           .fno = gen_helper_gvec_abs32,
2230           .opt_opc = vecop_list,
2231           .vece = MO_32 },
2232         { .fni8 = tcg_gen_abs_i64,
2233           .fniv = tcg_gen_abs_vec,
2234           .fno = gen_helper_gvec_abs64,
2235           .opt_opc = vecop_list,
2236           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2237           .vece = MO_64 },
2238     };
2239 
2240     tcg_debug_assert(vece <= MO_64);
2241     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2242 }
2243 
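/*
 * For the two-operand logicals below, aofs == bofs folds to a simpler
 * operation: x AND x = x OR x = x (mov), x XOR x = x ANDC x = 0,
 * x ORC x = x EQV x = -1, and x NAND x = x NOR x = NOT x.
 */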
2244 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2245                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2246 {
2247     static const GVecGen3 g = {
2248         .fni8 = tcg_gen_and_i64,
2249         .fniv = tcg_gen_and_vec,
2250         .fno = gen_helper_gvec_and,
2251         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2252     };
2253 
2254     if (aofs == bofs) {
2255         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2256     } else {
2257         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2258     }
2259 }
2260 
2261 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2262                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2263 {
2264     static const GVecGen3 g = {
2265         .fni8 = tcg_gen_or_i64,
2266         .fniv = tcg_gen_or_vec,
2267         .fno = gen_helper_gvec_or,
2268         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2269     };
2270 
2271     if (aofs == bofs) {
2272         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2273     } else {
2274         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2275     }
2276 }
2277 
2278 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2279                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2280 {
2281     static const GVecGen3 g = {
2282         .fni8 = tcg_gen_xor_i64,
2283         .fniv = tcg_gen_xor_vec,
2284         .fno = gen_helper_gvec_xor,
2285         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2286     };
2287 
2288     if (aofs == bofs) {
2289         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
2290     } else {
2291         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2292     }
2293 }
2294 
2295 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2296                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2297 {
2298     static const GVecGen3 g = {
2299         .fni8 = tcg_gen_andc_i64,
2300         .fniv = tcg_gen_andc_vec,
2301         .fno = gen_helper_gvec_andc,
2302         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2303     };
2304 
2305     if (aofs == bofs) {
2306         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
2307     } else {
2308         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2309     }
2310 }
2311 
2312 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2313                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2314 {
2315     static const GVecGen3 g = {
2316         .fni8 = tcg_gen_orc_i64,
2317         .fniv = tcg_gen_orc_vec,
2318         .fno = gen_helper_gvec_orc,
2319         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2320     };
2321 
2322     if (aofs == bofs) {
2323         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
2324     } else {
2325         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2326     }
2327 }
2328 
2329 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2330                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2331 {
2332     static const GVecGen3 g = {
2333         .fni8 = tcg_gen_nand_i64,
2334         .fniv = tcg_gen_nand_vec,
2335         .fno = gen_helper_gvec_nand,
2336         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2337     };
2338 
2339     if (aofs == bofs) {
2340         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2341     } else {
2342         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2343     }
2344 }
2345 
2346 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2347                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2348 {
2349     static const GVecGen3 g = {
2350         .fni8 = tcg_gen_nor_i64,
2351         .fniv = tcg_gen_nor_vec,
2352         .fno = gen_helper_gvec_nor,
2353         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2354     };
2355 
2356     if (aofs == bofs) {
2357         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2358     } else {
2359         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2360     }
2361 }
2362 
2363 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2364                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2365 {
2366     static const GVecGen3 g = {
2367         .fni8 = tcg_gen_eqv_i64,
2368         .fniv = tcg_gen_eqv_vec,
2369         .fno = gen_helper_gvec_eqv,
2370         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2371     };
2372 
2373     if (aofs == bofs) {
2374         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
2375     } else {
2376         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2377     }
2378 }
2379 
2380 static const GVecGen2s gop_ands = {
2381     .fni8 = tcg_gen_and_i64,
2382     .fniv = tcg_gen_and_vec,
2383     .fno = gen_helper_gvec_ands,
2384     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2385     .vece = MO_64
2386 };
2387 
2388 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2389                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2390 {
2391     TCGv_i64 tmp = tcg_temp_new_i64();
2392     gen_dup_i64(vece, tmp, c);
2393     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2394     tcg_temp_free_i64(tmp);
2395 }
2396 
2397 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2398                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2399 {
2400     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2401     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2402     tcg_temp_free_i64(tmp);
2403 }
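/*
 * Usage sketch (offsets hypothetical): the immediate is replicated per
 * element by dup_const, so this clears the high nibble of every byte
 * of a 16-byte vector:
 *
 *     tcg_gen_gvec_andi(MO_8, dofs, aofs, 0x0f, 16, 16);
 */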
2404 
2405 static const GVecGen2s gop_xors = {
2406     .fni8 = tcg_gen_xor_i64,
2407     .fniv = tcg_gen_xor_vec,
2408     .fno = gen_helper_gvec_xors,
2409     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2410     .vece = MO_64
2411 };
2412 
2413 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2414                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2415 {
2416     TCGv_i64 tmp = tcg_temp_new_i64();
2417     gen_dup_i64(vece, tmp, c);
2418     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2419     tcg_temp_free_i64(tmp);
2420 }
2421 
2422 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2423                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2424 {
2425     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2426     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2427     tcg_temp_free_i64(tmp);
2428 }
2429 
2430 static const GVecGen2s gop_ors = {
2431     .fni8 = tcg_gen_or_i64,
2432     .fniv = tcg_gen_or_vec,
2433     .fno = gen_helper_gvec_ors,
2434     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2435     .vece = MO_64
2436 };
2437 
2438 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2439                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2440 {
2441     TCGv_i64 tmp = tcg_temp_new_i64();
2442     gen_dup_i64(vece, tmp, c);
2443     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2444     tcg_temp_free_i64(tmp);
2445 }
2446 
2447 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2448                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2449 {
2450     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2451     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2452     tcg_temp_free_i64(tmp);
2453 }
2454 
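/*
 * Immediate shifts of sub-word lanes reuse the full 64-bit shift and
 * then mask away any bits that crossed a lane boundary.  For example,
 * with 8-bit lanes and c = 1 the mask is 0xfe per byte, so a byte
 * 0x80 shifts to 0x00 instead of leaking its msb into the next lane.
 */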
2455 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2456 {
2457     uint64_t mask = dup_const(MO_8, 0xff << c);
2458     tcg_gen_shli_i64(d, a, c);
2459     tcg_gen_andi_i64(d, d, mask);
2460 }
2461 
2462 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2463 {
2464     uint64_t mask = dup_const(MO_16, 0xffff << c);
2465     tcg_gen_shli_i64(d, a, c);
2466     tcg_gen_andi_i64(d, d, mask);
2467 }
2468 
2469 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2470                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2471 {
2472     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2473     static const GVecGen2i g[4] = {
2474         { .fni8 = tcg_gen_vec_shl8i_i64,
2475           .fniv = tcg_gen_shli_vec,
2476           .fno = gen_helper_gvec_shl8i,
2477           .opt_opc = vecop_list,
2478           .vece = MO_8 },
2479         { .fni8 = tcg_gen_vec_shl16i_i64,
2480           .fniv = tcg_gen_shli_vec,
2481           .fno = gen_helper_gvec_shl16i,
2482           .opt_opc = vecop_list,
2483           .vece = MO_16 },
2484         { .fni4 = tcg_gen_shli_i32,
2485           .fniv = tcg_gen_shli_vec,
2486           .fno = gen_helper_gvec_shl32i,
2487           .opt_opc = vecop_list,
2488           .vece = MO_32 },
2489         { .fni8 = tcg_gen_shli_i64,
2490           .fniv = tcg_gen_shli_vec,
2491           .fno = gen_helper_gvec_shl64i,
2492           .opt_opc = vecop_list,
2493           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2494           .vece = MO_64 },
2495     };
2496 
2497     tcg_debug_assert(vece <= MO_64);
2498     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2499     if (shift == 0) {
2500         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2501     } else {
2502         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2503     }
2504 }
2505 
2506 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2507 {
2508     uint64_t mask = dup_const(MO_8, 0xff >> c);
2509     tcg_gen_shri_i64(d, a, c);
2510     tcg_gen_andi_i64(d, d, mask);
2511 }
2512 
2513 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2514 {
2515     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2516     tcg_gen_shri_i64(d, a, c);
2517     tcg_gen_andi_i64(d, d, mask);
2518 }
2519 
2520 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2521                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2522 {
2523     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2524     static const GVecGen2i g[4] = {
2525         { .fni8 = tcg_gen_vec_shr8i_i64,
2526           .fniv = tcg_gen_shri_vec,
2527           .fno = gen_helper_gvec_shr8i,
2528           .opt_opc = vecop_list,
2529           .vece = MO_8 },
2530         { .fni8 = tcg_gen_vec_shr16i_i64,
2531           .fniv = tcg_gen_shri_vec,
2532           .fno = gen_helper_gvec_shr16i,
2533           .opt_opc = vecop_list,
2534           .vece = MO_16 },
2535         { .fni4 = tcg_gen_shri_i32,
2536           .fniv = tcg_gen_shri_vec,
2537           .fno = gen_helper_gvec_shr32i,
2538           .opt_opc = vecop_list,
2539           .vece = MO_32 },
2540         { .fni8 = tcg_gen_shri_i64,
2541           .fniv = tcg_gen_shri_vec,
2542           .fno = gen_helper_gvec_shr64i,
2543           .opt_opc = vecop_list,
2544           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2545           .vece = MO_64 },
2546     };
2547 
2548     tcg_debug_assert(vece <= MO_64);
2549     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2550     if (shift == 0) {
2551         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2552     } else {
2553         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2554     }
2555 }
2556 
2557 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2558 {
2559     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2560     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2561     TCGv_i64 s = tcg_temp_new_i64();
2562 
2563     tcg_gen_shri_i64(d, a, c);
2564     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2565     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2566     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2567     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2568     tcg_temp_free_i64(s);
2569 }
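/*
 * Illustrative only: for c = 3 and a lane 0x90 (-112), the logical
 * shift leaves 0x12 in the lane; s_mask = 0x10 isolates the shifted
 * sign bit, the multiply by (2 << 3) - 2 = 14 smears it into 0xe0,
 * and c_mask = 0x1f drops bits that leaked in from the lane above,
 * giving d = 0x12 | 0xe0 = 0xf2 = -14, the correct arithmetic shift.
 */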
2570 
2571 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2572 {
2573     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2574     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2575     TCGv_i64 s = tcg_temp_new_i64();
2576 
2577     tcg_gen_shri_i64(d, a, c);
2578     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2579     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2580     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2581     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2582     tcg_temp_free_i64(s);
2583 }
2584 
2585 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2586                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2587 {
2588     static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
2589     static const GVecGen2i g[4] = {
2590         { .fni8 = tcg_gen_vec_sar8i_i64,
2591           .fniv = tcg_gen_sari_vec,
2592           .fno = gen_helper_gvec_sar8i,
2593           .opt_opc = vecop_list,
2594           .vece = MO_8 },
2595         { .fni8 = tcg_gen_vec_sar16i_i64,
2596           .fniv = tcg_gen_sari_vec,
2597           .fno = gen_helper_gvec_sar16i,
2598           .opt_opc = vecop_list,
2599           .vece = MO_16 },
2600         { .fni4 = tcg_gen_sari_i32,
2601           .fniv = tcg_gen_sari_vec,
2602           .fno = gen_helper_gvec_sar32i,
2603           .opt_opc = vecop_list,
2604           .vece = MO_32 },
2605         { .fni8 = tcg_gen_sari_i64,
2606           .fniv = tcg_gen_sari_vec,
2607           .fno = gen_helper_gvec_sar64i,
2608           .opt_opc = vecop_list,
2609           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2610           .vece = MO_64 },
2611     };
2612 
2613     tcg_debug_assert(vece <= MO_64);
2614     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2615     if (shift == 0) {
2616         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2617     } else {
2618         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2619     }
2620 }
2621 
2622 /*
2623  * Specialized generation of vector shifts by a non-constant scalar.
2624  */
2625 
2626 typedef struct {
2627     void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
2628     void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
2629     void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
2630     void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
2631     gen_helper_gvec_2 *fno[4];
2632     TCGOpcode s_list[2];
2633     TCGOpcode v_list[2];
2634 } GVecGen2sh;
2635 
2636 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2637                            uint32_t oprsz, uint32_t tysz, TCGType type,
2638                            TCGv_i32 shift,
2639                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
2640 {
2641     TCGv_vec t0 = tcg_temp_new_vec(type);
2642     uint32_t i;
2643 
2644     for (i = 0; i < oprsz; i += tysz) {
2645         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2646         fni(vece, t0, t0, shift);
2647         tcg_gen_st_vec(t0, cpu_env, dofs + i);
2648     }
2649     tcg_temp_free_vec(t0);
2650 }
2651 
2652 static void
2653 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
2654                uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
2655 {
2656     TCGType type;
2657     uint32_t some;
2658 
2659     check_size_align(oprsz, maxsz, dofs | aofs);
2660     check_overlap_2(dofs, aofs, maxsz);
2661 
2662     /* If the backend has a scalar expansion, great.  */
2663     type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
2664     if (type) {
2665         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2666         switch (type) {
2667         case TCG_TYPE_V256:
2668             some = QEMU_ALIGN_DOWN(oprsz, 32);
2669             expand_2sh_vec(vece, dofs, aofs, some, 32,
2670                            TCG_TYPE_V256, shift, g->fniv_s);
2671             if (some == oprsz) {
2672                 break;
2673             }
2674             dofs += some;
2675             aofs += some;
2676             oprsz -= some;
2677             maxsz -= some;
2678             /* fallthru */
2679         case TCG_TYPE_V128:
2680             expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
2681                            TCG_TYPE_V128, shift, g->fniv_s);
2682             break;
2683         case TCG_TYPE_V64:
2684             expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
2685                            TCG_TYPE_V64, shift, g->fniv_s);
2686             break;
2687         default:
2688             g_assert_not_reached();
2689         }
2690         tcg_swap_vecop_list(hold_list);
2691         goto clear_tail;
2692     }
2693 
2694     /* If the backend supports variable vector shifts, also cool.  */
2695     type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
2696     if (type) {
2697         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2698         TCGv_vec v_shift = tcg_temp_new_vec(type);
2699 
2700         if (vece == MO_64) {
2701             TCGv_i64 sh64 = tcg_temp_new_i64();
2702             tcg_gen_extu_i32_i64(sh64, shift);
2703             tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
2704             tcg_temp_free_i64(sh64);
2705         } else {
2706             tcg_gen_dup_i32_vec(vece, v_shift, shift);
2707         }
2708 
2709         switch (type) {
2710         case TCG_TYPE_V256:
2711             some = QEMU_ALIGN_DOWN(oprsz, 32);
2712             expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
2713                           v_shift, false, g->fniv_v);
2714             if (some == oprsz) {
2715                 break;
2716             }
2717             dofs += some;
2718             aofs += some;
2719             oprsz -= some;
2720             maxsz -= some;
2721             /* fallthru */
2722         case TCG_TYPE_V128:
2723             expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
2724                           v_shift, false, g->fniv_v);
2725             break;
2726         case TCG_TYPE_V64:
2727             expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
2728                           v_shift, false, g->fniv_v);
2729             break;
2730         default:
2731             g_assert_not_reached();
2732         }
2733         tcg_temp_free_vec(v_shift);
2734         tcg_swap_vecop_list(hold_list);
2735         goto clear_tail;
2736     }
2737 
2738     /* Otherwise fall back to integral... */
2739     if (vece == MO_32 && check_size_impl(oprsz, 4)) {
2740         expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
2741     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
2742         TCGv_i64 sh64 = tcg_temp_new_i64();
2743         tcg_gen_extu_i32_i64(sh64, shift);
2744         expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
2745         tcg_temp_free_i64(sh64);
2746     } else {
2747         TCGv_ptr a0 = tcg_temp_new_ptr();
2748         TCGv_ptr a1 = tcg_temp_new_ptr();
2749         TCGv_i32 desc = tcg_temp_new_i32();
2750 
2751         tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
2752         tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
2753         tcg_gen_addi_ptr(a0, cpu_env, dofs);
2754         tcg_gen_addi_ptr(a1, cpu_env, aofs);
2755 
2756         g->fno[vece](a0, a1, desc);
2757 
2758         tcg_temp_free_ptr(a0);
2759         tcg_temp_free_ptr(a1);
2760         tcg_temp_free_i32(desc);
2761         return;
2762     }
2763 
2764  clear_tail:
2765     if (oprsz < maxsz) {
2766         expand_clr(dofs + oprsz, maxsz - oprsz);
2767     }
2768 }
2769 
2770 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
2771                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2772 {
2773     static const GVecGen2sh g = {
2774         .fni4 = tcg_gen_shl_i32,
2775         .fni8 = tcg_gen_shl_i64,
2776         .fniv_s = tcg_gen_shls_vec,
2777         .fniv_v = tcg_gen_shlv_vec,
2778         .fno = {
2779             gen_helper_gvec_shl8i,
2780             gen_helper_gvec_shl16i,
2781             gen_helper_gvec_shl32i,
2782             gen_helper_gvec_shl64i,
2783         },
2784         .s_list = { INDEX_op_shls_vec, 0 },
2785         .v_list = { INDEX_op_shlv_vec, 0 },
2786     };
2787 
2788     tcg_debug_assert(vece <= MO_64);
2789     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2790 }
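/*
 * Usage sketch (offsets hypothetical): shift every 32-bit lane of a
 * 16-byte vector left by a run-time scalar amount:
 *
 *     tcg_gen_gvec_shls(MO_32, dofs, aofs, shift, 16, 16);
 */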
2791 
2792 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
2793                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2794 {
2795     static const GVecGen2sh g = {
2796         .fni4 = tcg_gen_shr_i32,
2797         .fni8 = tcg_gen_shr_i64,
2798         .fniv_s = tcg_gen_shrs_vec,
2799         .fniv_v = tcg_gen_shrv_vec,
2800         .fno = {
2801             gen_helper_gvec_shr8i,
2802             gen_helper_gvec_shr16i,
2803             gen_helper_gvec_shr32i,
2804             gen_helper_gvec_shr64i,
2805         },
2806         .s_list = { INDEX_op_shrs_vec, 0 },
2807         .v_list = { INDEX_op_shrv_vec, 0 },
2808     };
2809 
2810     tcg_debug_assert(vece <= MO_64);
2811     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2812 }
2813 
2814 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
2815                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2816 {
2817     static const GVecGen2sh g = {
2818         .fni4 = tcg_gen_sar_i32,
2819         .fni8 = tcg_gen_sar_i64,
2820         .fniv_s = tcg_gen_sars_vec,
2821         .fniv_v = tcg_gen_sarv_vec,
2822         .fno = {
2823             gen_helper_gvec_sar8i,
2824             gen_helper_gvec_sar16i,
2825             gen_helper_gvec_sar32i,
2826             gen_helper_gvec_sar64i,
2827         },
2828         .s_list = { INDEX_op_sars_vec, 0 },
2829         .v_list = { INDEX_op_sarv_vec, 0 },
2830     };
2831 
2832     tcg_debug_assert(vece <= MO_64);
2833     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2834 }
2835 
2836 /*
2837  * Expand D = A << (B % element bits)
2838  *
2839  * Unlike scalar shifts, it is not easy for the target front end
2840  * to include the modulo as part of the expansion.  If the target
2841  * naturally includes the modulo as part of the operation, great!
2842  * If the target has some other behaviour for out-of-range shifts,
2843  * then it could not use this function anyway, and would need to
2844  * do its own expansion with custom functions.
2845  */
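/*
 * For example, with MO_32 lanes a per-lane count of 33 is reduced to
 * 33 & 31 = 1 by the masking below.
 */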
2846 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
2847                                  TCGv_vec a, TCGv_vec b)
2848 {
2849     TCGv_vec t = tcg_temp_new_vec_matching(d);
2850 
2851     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
2852     tcg_gen_and_vec(vece, t, t, b);
2853     tcg_gen_shlv_vec(vece, d, a, t);
2854     tcg_temp_free_vec(t);
2855 }
2856 
2857 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2858 {
2859     TCGv_i32 t = tcg_temp_new_i32();
2860 
2861     tcg_gen_andi_i32(t, b, 31);
2862     tcg_gen_shl_i32(d, a, t);
2863     tcg_temp_free_i32(t);
2864 }
2865 
2866 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2867 {
2868     TCGv_i64 t = tcg_temp_new_i64();
2869 
2870     tcg_gen_andi_i64(t, b, 63);
2871     tcg_gen_shl_i64(d, a, t);
2872     tcg_temp_free_i64(t);
2873 }
2874 
2875 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
2876                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2877 {
2878     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
2879     static const GVecGen3 g[4] = {
2880         { .fniv = tcg_gen_shlv_mod_vec,
2881           .fno = gen_helper_gvec_shl8v,
2882           .opt_opc = vecop_list,
2883           .vece = MO_8 },
2884         { .fniv = tcg_gen_shlv_mod_vec,
2885           .fno = gen_helper_gvec_shl16v,
2886           .opt_opc = vecop_list,
2887           .vece = MO_16 },
2888         { .fni4 = tcg_gen_shl_mod_i32,
2889           .fniv = tcg_gen_shlv_mod_vec,
2890           .fno = gen_helper_gvec_shl32v,
2891           .opt_opc = vecop_list,
2892           .vece = MO_32 },
2893         { .fni8 = tcg_gen_shl_mod_i64,
2894           .fniv = tcg_gen_shlv_mod_vec,
2895           .fno = gen_helper_gvec_shl64v,
2896           .opt_opc = vecop_list,
2897           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2898           .vece = MO_64 },
2899     };
2900 
2901     tcg_debug_assert(vece <= MO_64);
2902     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2903 }
2904 
2905 /*
2906  * Similarly for logical right shifts.
2907  */
2908 
2909 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
2910                                  TCGv_vec a, TCGv_vec b)
2911 {
2912     TCGv_vec t = tcg_temp_new_vec_matching(d);
2913 
2914     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
2915     tcg_gen_and_vec(vece, t, t, b);
2916     tcg_gen_shrv_vec(vece, d, a, t);
2917     tcg_temp_free_vec(t);
2918 }
2919 
2920 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2921 {
2922     TCGv_i32 t = tcg_temp_new_i32();
2923 
2924     tcg_gen_andi_i32(t, b, 31);
2925     tcg_gen_shr_i32(d, a, t);
2926     tcg_temp_free_i32(t);
2927 }
2928 
2929 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2930 {
2931     TCGv_i64 t = tcg_temp_new_i64();
2932 
2933     tcg_gen_andi_i64(t, b, 63);
2934     tcg_gen_shr_i64(d, a, t);
2935     tcg_temp_free_i64(t);
2936 }
2937 
2938 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
2939                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2940 {
2941     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
2942     static const GVecGen3 g[4] = {
2943         { .fniv = tcg_gen_shrv_mod_vec,
2944           .fno = gen_helper_gvec_shr8v,
2945           .opt_opc = vecop_list,
2946           .vece = MO_8 },
2947         { .fniv = tcg_gen_shrv_mod_vec,
2948           .fno = gen_helper_gvec_shr16v,
2949           .opt_opc = vecop_list,
2950           .vece = MO_16 },
2951         { .fni4 = tcg_gen_shr_mod_i32,
2952           .fniv = tcg_gen_shrv_mod_vec,
2953           .fno = gen_helper_gvec_shr32v,
2954           .opt_opc = vecop_list,
2955           .vece = MO_32 },
2956         { .fni8 = tcg_gen_shr_mod_i64,
2957           .fniv = tcg_gen_shrv_mod_vec,
2958           .fno = gen_helper_gvec_shr64v,
2959           .opt_opc = vecop_list,
2960           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2961           .vece = MO_64 },
2962     };
2963 
2964     tcg_debug_assert(vece <= MO_64);
2965     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2966 }
2967 
2968 /*
2969  * Similarly for arithmetic right shifts.
2970  */
2971 
2972 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
2973                                  TCGv_vec a, TCGv_vec b)
2974 {
2975     TCGv_vec t = tcg_temp_new_vec_matching(d);
2976 
2977     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
2978     tcg_gen_and_vec(vece, t, t, b);
2979     tcg_gen_sarv_vec(vece, d, a, t);
2980     tcg_temp_free_vec(t);
2981 }
2982 
2983 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2984 {
2985     TCGv_i32 t = tcg_temp_new_i32();
2986 
2987     tcg_gen_andi_i32(t, b, 31);
2988     tcg_gen_sar_i32(d, a, t);
2989     tcg_temp_free_i32(t);
2990 }
2991 
2992 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2993 {
2994     TCGv_i64 t = tcg_temp_new_i64();
2995 
2996     tcg_gen_andi_i64(t, b, 63);
2997     tcg_gen_sar_i64(d, a, t);
2998     tcg_temp_free_i64(t);
2999 }
3000 
3001 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3002                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3003 {
3004     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3005     static const GVecGen3 g[4] = {
3006         { .fniv = tcg_gen_sarv_mod_vec,
3007           .fno = gen_helper_gvec_sar8v,
3008           .opt_opc = vecop_list,
3009           .vece = MO_8 },
3010         { .fniv = tcg_gen_sarv_mod_vec,
3011           .fno = gen_helper_gvec_sar16v,
3012           .opt_opc = vecop_list,
3013           .vece = MO_16 },
3014         { .fni4 = tcg_gen_sar_mod_i32,
3015           .fniv = tcg_gen_sarv_mod_vec,
3016           .fno = gen_helper_gvec_sar32v,
3017           .opt_opc = vecop_list,
3018           .vece = MO_32 },
3019         { .fni8 = tcg_gen_sar_mod_i64,
3020           .fniv = tcg_gen_sarv_mod_vec,
3021           .fno = gen_helper_gvec_sar64v,
3022           .opt_opc = vecop_list,
3023           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3024           .vece = MO_64 },
3025     };
3026 
3027     tcg_debug_assert(vece <= MO_64);
3028     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3029 }
3030 
3031 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
3032 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3033                            uint32_t oprsz, TCGCond cond)
3034 {
3035     TCGv_i32 t0 = tcg_temp_new_i32();
3036     TCGv_i32 t1 = tcg_temp_new_i32();
3037     uint32_t i;
3038 
3039     for (i = 0; i < oprsz; i += 4) {
3040         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
3041         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
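        /* setcond yields 0 or 1; negate it into the 0 / -1 lane mask.  */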
3042         tcg_gen_setcond_i32(cond, t0, t0, t1);
3043         tcg_gen_neg_i32(t0, t0);
3044         tcg_gen_st_i32(t0, cpu_env, dofs + i);
3045     }
3046     tcg_temp_free_i32(t1);
3047     tcg_temp_free_i32(t0);
3048 }
3049 
3050 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3051                            uint32_t oprsz, TCGCond cond)
3052 {
3053     TCGv_i64 t0 = tcg_temp_new_i64();
3054     TCGv_i64 t1 = tcg_temp_new_i64();
3055     uint32_t i;
3056 
3057     for (i = 0; i < oprsz; i += 8) {
3058         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
3059         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
3060         tcg_gen_setcond_i64(cond, t0, t0, t1);
3061         tcg_gen_neg_i64(t0, t0);
3062         tcg_gen_st_i64(t0, cpu_env, dofs + i);
3063     }
3064     tcg_temp_free_i64(t1);
3065     tcg_temp_free_i64(t0);
3066 }
3067 
3068 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3069                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3070                            TCGType type, TCGCond cond)
3071 {
3072     TCGv_vec t0 = tcg_temp_new_vec(type);
3073     TCGv_vec t1 = tcg_temp_new_vec(type);
3074     uint32_t i;
3075 
3076     for (i = 0; i < oprsz; i += tysz) {
3077         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3078         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
3079         tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
3080         tcg_gen_st_vec(t0, cpu_env, dofs + i);
3081     }
3082     tcg_temp_free_vec(t1);
3083     tcg_temp_free_vec(t0);
3084 }
3085 
3086 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3087                       uint32_t aofs, uint32_t bofs,
3088                       uint32_t oprsz, uint32_t maxsz)
3089 {
3090     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3091     static gen_helper_gvec_3 * const eq_fn[4] = {
3092         gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3093         gen_helper_gvec_eq32, gen_helper_gvec_eq64
3094     };
3095     static gen_helper_gvec_3 * const ne_fn[4] = {
3096         gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3097         gen_helper_gvec_ne32, gen_helper_gvec_ne64
3098     };
3099     static gen_helper_gvec_3 * const lt_fn[4] = {
3100         gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3101         gen_helper_gvec_lt32, gen_helper_gvec_lt64
3102     };
3103     static gen_helper_gvec_3 * const le_fn[4] = {
3104         gen_helper_gvec_le8, gen_helper_gvec_le16,
3105         gen_helper_gvec_le32, gen_helper_gvec_le64
3106     };
3107     static gen_helper_gvec_3 * const ltu_fn[4] = {
3108         gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3109         gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3110     };
3111     static gen_helper_gvec_3 * const leu_fn[4] = {
3112         gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3113         gen_helper_gvec_leu32, gen_helper_gvec_leu64
3114     };
3115     static gen_helper_gvec_3 * const * const fns[16] = {
3116         [TCG_COND_EQ] = eq_fn,
3117         [TCG_COND_NE] = ne_fn,
3118         [TCG_COND_LT] = lt_fn,
3119         [TCG_COND_LE] = le_fn,
3120         [TCG_COND_LTU] = ltu_fn,
3121         [TCG_COND_LEU] = leu_fn,
3122     };
3123 
3124     const TCGOpcode *hold_list;
3125     TCGType type;
3126     uint32_t some;
3127 
3128     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3129     check_overlap_3(dofs, aofs, bofs, maxsz);
3130 
3131     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3132         do_dup(MO_8, dofs, oprsz, maxsz,
3133                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3134         return;
3135     }
3136 
3137     /*
3138      * Implement inline with a vector type, if possible.
3139      * Prefer integer when 64-bit host and 64-bit comparison.
3140      */
3141     hold_list = tcg_swap_vecop_list(cmp_list);
3142     type = choose_vector_type(cmp_list, vece, oprsz,
3143                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3144     switch (type) {
3145     case TCG_TYPE_V256:
3146         /* Recall that ARM SVE allows vector sizes that are not a
3147          * power of 2, but always a multiple of 16.  The intent is
3148          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3149          */
3150         some = QEMU_ALIGN_DOWN(oprsz, 32);
3151         expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3152         if (some == oprsz) {
3153             break;
3154         }
3155         dofs += some;
3156         aofs += some;
3157         bofs += some;
3158         oprsz -= some;
3159         maxsz -= some;
3160         /* fallthru */
3161     case TCG_TYPE_V128:
3162         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3163         break;
3164     case TCG_TYPE_V64:
3165         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3166         break;
3167 
3168     case 0:
3169         if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3170             expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3171         } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3172             expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3173         } else {
3174             gen_helper_gvec_3 * const *fn = fns[cond];
3175 
3176             if (fn == NULL) {
3177                 uint32_t tmp;
3178                 tmp = aofs, aofs = bofs, bofs = tmp;
3179                 cond = tcg_swap_cond(cond);
3180                 fn = fns[cond];
3181                 assert(fn != NULL);
3182             }
3183             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3184             oprsz = maxsz;
3185         }
3186         break;
3187 
3188     default:
3189         g_assert_not_reached();
3190     }
3191     tcg_swap_vecop_list(hold_list);
3192 
3193     if (oprsz < maxsz) {
3194         expand_clr(dofs + oprsz, maxsz - oprsz);
3195     }
3196 }
3197 
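/*
 * Bitwise select: each result bit comes from B where the corresponding
 * bit of A is set and from C where it is clear, i.e.
 * D = (B & A) | (C & ~A).
 */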
3198 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3199 {
3200     TCGv_i64 t = tcg_temp_new_i64();
3201 
3202     tcg_gen_and_i64(t, b, a);
3203     tcg_gen_andc_i64(d, c, a);
3204     tcg_gen_or_i64(d, d, t);
3205     tcg_temp_free_i64(t);
3206 }
3207 
3208 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
3209                          uint32_t bofs, uint32_t cofs,
3210                          uint32_t oprsz, uint32_t maxsz)
3211 {
3212     static const GVecGen4 g = {
3213         .fni8 = tcg_gen_bitsel_i64,
3214         .fniv = tcg_gen_bitsel_vec,
3215         .fno = gen_helper_gvec_bitsel,
3216     };
3217 
3218     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
3219 }
3220