xref: /openbmc/qemu/tcg/tcg-op-gvec.c (revision 93062e23)
1 /*
2  * Generic vector operation expansion
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "tcg/tcg.h"
22 #include "tcg/tcg-op.h"
23 #include "tcg/tcg-op-gvec.h"
24 #include "qemu/main-loop.h"
25 #include "tcg/tcg-gvec-desc.h"
26 
27 #define MAX_UNROLL  4
28 
29 #ifdef CONFIG_DEBUG_TCG
30 static const TCGOpcode vecop_list_empty[1] = { 0 };
31 #else
32 #define vecop_list_empty NULL
33 #endif
34 
35 
/*
 * Validate size and alignment constraints for a gvec expansion.
 * OFS is the bitwise OR of every operand offset, so a single
 * alignment test covers all operands at once.
 */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t align_opr = (oprsz >= 16 ? 15 : 7);
    uint32_t align_max = (maxsz >= 16 || oprsz >= 16 ? 15 : 7);

    tcg_debug_assert(oprsz > 0);
    tcg_debug_assert(oprsz <= maxsz);
    tcg_debug_assert((oprsz & align_opr) == 0);
    tcg_debug_assert((maxsz & align_max) == 0);
    tcg_debug_assert((ofs & align_max) == 0);
}
48 
/* Assert that two operands of size S either coincide exactly or are
   fully disjoint.  Partial overlap is not permitted.  */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    bool disjoint = (d + s <= a) || (a + s <= d);

    tcg_debug_assert(d == a || disjoint);
}
54 
/* Apply the pairwise overlap rule to all pairs of three operands.
   The rule is symmetric, so pair ordering is immaterial.  */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(a, b, s);
    check_overlap_2(b, d, s);
}
62 
/* Apply the pairwise overlap rule to all six pairs of four operands.  */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(a, b, s);
    check_overlap_2(b, c, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(d, c, s);
}
74 
75 /* Create a descriptor from components.  */
76 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
77 {
78     uint32_t desc = 0;
79 
80     assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
81     assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
82     assert(data == sextract32(data, 0, SIMD_DATA_BITS));
83 
84     oprsz = (oprsz / 8) - 1;
85     maxsz = (maxsz / 8) - 1;
86     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
87     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
88     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
89 
90     return desc;
91 }
92 
93 /* Generate a call to a gvec-style helper with two vector operands.  */
94 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
95                         uint32_t oprsz, uint32_t maxsz, int32_t data,
96                         gen_helper_gvec_2 *fn)
97 {
98     TCGv_ptr a0, a1;
99     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
100 
101     a0 = tcg_temp_new_ptr();
102     a1 = tcg_temp_new_ptr();
103 
104     tcg_gen_addi_ptr(a0, cpu_env, dofs);
105     tcg_gen_addi_ptr(a1, cpu_env, aofs);
106 
107     fn(a0, a1, desc);
108 
109     tcg_temp_free_ptr(a0);
110     tcg_temp_free_ptr(a1);
111     tcg_temp_free_i32(desc);
112 }
113 
114 /* Generate a call to a gvec-style helper with two vector operands
115    and one scalar operand.  */
116 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
117                          uint32_t oprsz, uint32_t maxsz, int32_t data,
118                          gen_helper_gvec_2i *fn)
119 {
120     TCGv_ptr a0, a1;
121     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
122 
123     a0 = tcg_temp_new_ptr();
124     a1 = tcg_temp_new_ptr();
125 
126     tcg_gen_addi_ptr(a0, cpu_env, dofs);
127     tcg_gen_addi_ptr(a1, cpu_env, aofs);
128 
129     fn(a0, a1, c, desc);
130 
131     tcg_temp_free_ptr(a0);
132     tcg_temp_free_ptr(a1);
133     tcg_temp_free_i32(desc);
134 }
135 
136 /* Generate a call to a gvec-style helper with three vector operands.  */
137 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
138                         uint32_t oprsz, uint32_t maxsz, int32_t data,
139                         gen_helper_gvec_3 *fn)
140 {
141     TCGv_ptr a0, a1, a2;
142     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
143 
144     a0 = tcg_temp_new_ptr();
145     a1 = tcg_temp_new_ptr();
146     a2 = tcg_temp_new_ptr();
147 
148     tcg_gen_addi_ptr(a0, cpu_env, dofs);
149     tcg_gen_addi_ptr(a1, cpu_env, aofs);
150     tcg_gen_addi_ptr(a2, cpu_env, bofs);
151 
152     fn(a0, a1, a2, desc);
153 
154     tcg_temp_free_ptr(a0);
155     tcg_temp_free_ptr(a1);
156     tcg_temp_free_ptr(a2);
157     tcg_temp_free_i32(desc);
158 }
159 
160 /* Generate a call to a gvec-style helper with four vector operands.  */
161 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
162                         uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
163                         int32_t data, gen_helper_gvec_4 *fn)
164 {
165     TCGv_ptr a0, a1, a2, a3;
166     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
167 
168     a0 = tcg_temp_new_ptr();
169     a1 = tcg_temp_new_ptr();
170     a2 = tcg_temp_new_ptr();
171     a3 = tcg_temp_new_ptr();
172 
173     tcg_gen_addi_ptr(a0, cpu_env, dofs);
174     tcg_gen_addi_ptr(a1, cpu_env, aofs);
175     tcg_gen_addi_ptr(a2, cpu_env, bofs);
176     tcg_gen_addi_ptr(a3, cpu_env, cofs);
177 
178     fn(a0, a1, a2, a3, desc);
179 
180     tcg_temp_free_ptr(a0);
181     tcg_temp_free_ptr(a1);
182     tcg_temp_free_ptr(a2);
183     tcg_temp_free_ptr(a3);
184     tcg_temp_free_i32(desc);
185 }
186 
187 /* Generate a call to a gvec-style helper with five vector operands.  */
188 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
189                         uint32_t cofs, uint32_t xofs, uint32_t oprsz,
190                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
191 {
192     TCGv_ptr a0, a1, a2, a3, a4;
193     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
194 
195     a0 = tcg_temp_new_ptr();
196     a1 = tcg_temp_new_ptr();
197     a2 = tcg_temp_new_ptr();
198     a3 = tcg_temp_new_ptr();
199     a4 = tcg_temp_new_ptr();
200 
201     tcg_gen_addi_ptr(a0, cpu_env, dofs);
202     tcg_gen_addi_ptr(a1, cpu_env, aofs);
203     tcg_gen_addi_ptr(a2, cpu_env, bofs);
204     tcg_gen_addi_ptr(a3, cpu_env, cofs);
205     tcg_gen_addi_ptr(a4, cpu_env, xofs);
206 
207     fn(a0, a1, a2, a3, a4, desc);
208 
209     tcg_temp_free_ptr(a0);
210     tcg_temp_free_ptr(a1);
211     tcg_temp_free_ptr(a2);
212     tcg_temp_free_ptr(a3);
213     tcg_temp_free_ptr(a4);
214     tcg_temp_free_i32(desc);
215 }
216 
217 /* Generate a call to a gvec-style helper with three vector operands
218    and an extra pointer operand.  */
219 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
220                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
221                         int32_t data, gen_helper_gvec_2_ptr *fn)
222 {
223     TCGv_ptr a0, a1;
224     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
225 
226     a0 = tcg_temp_new_ptr();
227     a1 = tcg_temp_new_ptr();
228 
229     tcg_gen_addi_ptr(a0, cpu_env, dofs);
230     tcg_gen_addi_ptr(a1, cpu_env, aofs);
231 
232     fn(a0, a1, ptr, desc);
233 
234     tcg_temp_free_ptr(a0);
235     tcg_temp_free_ptr(a1);
236     tcg_temp_free_i32(desc);
237 }
238 
239 /* Generate a call to a gvec-style helper with three vector operands
240    and an extra pointer operand.  */
241 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
242                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
243                         int32_t data, gen_helper_gvec_3_ptr *fn)
244 {
245     TCGv_ptr a0, a1, a2;
246     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
247 
248     a0 = tcg_temp_new_ptr();
249     a1 = tcg_temp_new_ptr();
250     a2 = tcg_temp_new_ptr();
251 
252     tcg_gen_addi_ptr(a0, cpu_env, dofs);
253     tcg_gen_addi_ptr(a1, cpu_env, aofs);
254     tcg_gen_addi_ptr(a2, cpu_env, bofs);
255 
256     fn(a0, a1, a2, ptr, desc);
257 
258     tcg_temp_free_ptr(a0);
259     tcg_temp_free_ptr(a1);
260     tcg_temp_free_ptr(a2);
261     tcg_temp_free_i32(desc);
262 }
263 
264 /* Generate a call to a gvec-style helper with four vector operands
265    and an extra pointer operand.  */
266 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
267                         uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
268                         uint32_t maxsz, int32_t data,
269                         gen_helper_gvec_4_ptr *fn)
270 {
271     TCGv_ptr a0, a1, a2, a3;
272     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
273 
274     a0 = tcg_temp_new_ptr();
275     a1 = tcg_temp_new_ptr();
276     a2 = tcg_temp_new_ptr();
277     a3 = tcg_temp_new_ptr();
278 
279     tcg_gen_addi_ptr(a0, cpu_env, dofs);
280     tcg_gen_addi_ptr(a1, cpu_env, aofs);
281     tcg_gen_addi_ptr(a2, cpu_env, bofs);
282     tcg_gen_addi_ptr(a3, cpu_env, cofs);
283 
284     fn(a0, a1, a2, a3, ptr, desc);
285 
286     tcg_temp_free_ptr(a0);
287     tcg_temp_free_ptr(a1);
288     tcg_temp_free_ptr(a2);
289     tcg_temp_free_ptr(a3);
290     tcg_temp_free_i32(desc);
291 }
292 
293 /* Return true if we want to implement something of OPRSZ bytes
294    in units of LNSZ.  This limits the expansion of inline code.  */
295 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
296 {
297     if (oprsz % lnsz == 0) {
298         uint32_t lnct = oprsz / lnsz;
299         return lnct >= 1 && lnct <= MAX_UNROLL;
300     }
301     return false;
302 }
303 
304 static void expand_clr(uint32_t dofs, uint32_t maxsz);
305 
306 /* Duplicate C as per VECE.  */
307 uint64_t (dup_const)(unsigned vece, uint64_t c)
308 {
309     switch (vece) {
310     case MO_8:
311         return 0x0101010101010101ull * (uint8_t)c;
312     case MO_16:
313         return 0x0001000100010001ull * (uint16_t)c;
314     case MO_32:
315         return 0x0000000100000001ull * (uint32_t)c;
316     case MO_64:
317         return c;
318     default:
319         g_assert_not_reached();
320     }
321 }
322 
/* Duplicate IN into OUT as per VECE.  */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        /* Replicate the low byte into all four byte lanes:
           multiplication by 0x01010101 copies it to each position.  */
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        /* Copy the low 16 bits into the high 16 bits.  */
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        /* Element size equals register size: a plain move suffices.  */
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}
341 
/* Duplicate IN into OUT as per VECE, 64-bit variant.  */
static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        /* Replicate the low byte into all eight byte lanes.  */
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        /* Replicate the low 16 bits into all four halfword lanes.  */
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        /* Copy the low 32 bits into the high 32 bits.  */
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        /* Element size equals register size: a plain move suffices.  */
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}
363 
/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  If OP is 0, assume that the real operation to be performed is
 * required by all backends.  Otherwise, make sure that OP can be performed
 * on elements of size VECE in the selected type.  Do not select V64 if
 * PREFER_I64 is true.  Return 0 if no vector type is selected.
 */
static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
                                  uint32_t size, bool prefer_i64)
{
    /* Try the widest type first, falling back to narrower ones.  */
    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * It is hard to imagine a case in which v256 is supported
         * but v128 is not, but check anyway.
         */
        if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece)
            && (size % 32 == 0
                || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) {
            return TCG_TYPE_V256;
        }
    }
    if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
        && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) {
        return TCG_TYPE_V128;
    }
    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
        && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
        return TCG_TYPE_V64;
    }
    return 0;
}
397 
/* Store T_VEC into [DOFS, DOFS + OPRSZ), using the largest slices
 * permitted by TYPE, then zero any remaining bytes up to MAXSZ.
 */
static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, TCGv_vec t_vec)
{
    uint32_t i = 0;

    /* The cases deliberately fall through so that a leftover tail
       after the wider stores is covered by the narrower ones.  */
    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        for (; i + 32 <= oprsz; i += 32) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
        }
        /* fallthru */
    case TCG_TYPE_V128:
        for (; i + 16 <= oprsz; i += 16) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
        }
        break;
    case TCG_TYPE_V64:
        for (; i < oprsz; i += 8) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
        }
        break;
    default:
        g_assert_not_reached();
    }

    /* Zero the tail between OPRSZ and MAXSZ, if any.  */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
432 
/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    /* The element size must fit within the variable input, if any.  */
    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(NULL, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        /* Broadcast the input into the vector temp by whichever form
           of input we were given.  */
        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            tcg_gen_dupi_vec(vece, t_vec, in_c);
        }
        /* do_dup_store also clears the tail between OPRSZ and MAXSZ.  */
        do_dup_store(type, dofs, oprsz, maxsz, t_vec);
        tcg_temp_free_vec(t_vec);
        return;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_const_i64(in_c);
            } else {
                t_32 = tcg_const_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_const_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
            tcg_temp_free_i64(t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else {
            /* Materialize the input in a 32-bit temp, either by
               truncating IN_64 or by loading the masked constant.  */
            t_32 = tcg_temp_new_i32();
            if (in_64) {
                tcg_gen_extrl_i64_i32(t_32, in_64);
            } else if (vece == MO_8) {
                tcg_gen_movi_i32(t_32, in_c & 0xff);
            } else if (vece == MO_16) {
                tcg_gen_movi_i32(t_32, in_c & 0xffff);
            } else {
                tcg_gen_movi_i32(t_32, in_c);
            }
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    tcg_temp_free_i32(t_desc);
    return;

 done:
    /* The inline paths wrote only OPRSZ bytes; zero the tail to MAXSZ.
       (The out-of-line helpers handle the tail themselves.)  */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
581 
/* Likewise, but with zero.  Passing maxsz for both sizes means the
   entire region is written and no further tail clearing occurs.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}
587 
588 /* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
589 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
590                          void (*fni)(TCGv_i32, TCGv_i32))
591 {
592     TCGv_i32 t0 = tcg_temp_new_i32();
593     uint32_t i;
594 
595     for (i = 0; i < oprsz; i += 4) {
596         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
597         fni(t0, t0);
598         tcg_gen_st_i32(t0, cpu_env, dofs + i);
599     }
600     tcg_temp_free_i32(t0);
601 }
602 
603 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
604                           int32_t c, bool load_dest,
605                           void (*fni)(TCGv_i32, TCGv_i32, int32_t))
606 {
607     TCGv_i32 t0 = tcg_temp_new_i32();
608     TCGv_i32 t1 = tcg_temp_new_i32();
609     uint32_t i;
610 
611     for (i = 0; i < oprsz; i += 4) {
612         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
613         if (load_dest) {
614             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
615         }
616         fni(t1, t0, c);
617         tcg_gen_st_i32(t1, cpu_env, dofs + i);
618     }
619     tcg_temp_free_i32(t0);
620     tcg_temp_free_i32(t1);
621 }
622 
623 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
624                           TCGv_i32 c, bool scalar_first,
625                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
626 {
627     TCGv_i32 t0 = tcg_temp_new_i32();
628     TCGv_i32 t1 = tcg_temp_new_i32();
629     uint32_t i;
630 
631     for (i = 0; i < oprsz; i += 4) {
632         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
633         if (scalar_first) {
634             fni(t1, c, t0);
635         } else {
636             fni(t1, t0, c);
637         }
638         tcg_gen_st_i32(t1, cpu_env, dofs + i);
639     }
640     tcg_temp_free_i32(t0);
641     tcg_temp_free_i32(t1);
642 }
643 
644 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
645 static void expand_3_i32(uint32_t dofs, uint32_t aofs,
646                          uint32_t bofs, uint32_t oprsz, bool load_dest,
647                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
648 {
649     TCGv_i32 t0 = tcg_temp_new_i32();
650     TCGv_i32 t1 = tcg_temp_new_i32();
651     TCGv_i32 t2 = tcg_temp_new_i32();
652     uint32_t i;
653 
654     for (i = 0; i < oprsz; i += 4) {
655         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
656         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
657         if (load_dest) {
658             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
659         }
660         fni(t2, t0, t1);
661         tcg_gen_st_i32(t2, cpu_env, dofs + i);
662     }
663     tcg_temp_free_i32(t2);
664     tcg_temp_free_i32(t1);
665     tcg_temp_free_i32(t0);
666 }
667 
668 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
669                           uint32_t oprsz, int32_t c, bool load_dest,
670                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
671 {
672     TCGv_i32 t0 = tcg_temp_new_i32();
673     TCGv_i32 t1 = tcg_temp_new_i32();
674     TCGv_i32 t2 = tcg_temp_new_i32();
675     uint32_t i;
676 
677     for (i = 0; i < oprsz; i += 4) {
678         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
679         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
680         if (load_dest) {
681             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
682         }
683         fni(t2, t0, t1, c);
684         tcg_gen_st_i32(t2, cpu_env, dofs + i);
685     }
686     tcg_temp_free_i32(t0);
687     tcg_temp_free_i32(t1);
688     tcg_temp_free_i32(t2);
689 }
690 
691 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
692 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
693                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
694                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
695 {
696     TCGv_i32 t0 = tcg_temp_new_i32();
697     TCGv_i32 t1 = tcg_temp_new_i32();
698     TCGv_i32 t2 = tcg_temp_new_i32();
699     TCGv_i32 t3 = tcg_temp_new_i32();
700     uint32_t i;
701 
702     for (i = 0; i < oprsz; i += 4) {
703         tcg_gen_ld_i32(t1, cpu_env, aofs + i);
704         tcg_gen_ld_i32(t2, cpu_env, bofs + i);
705         tcg_gen_ld_i32(t3, cpu_env, cofs + i);
706         fni(t0, t1, t2, t3);
707         tcg_gen_st_i32(t0, cpu_env, dofs + i);
708         if (write_aofs) {
709             tcg_gen_st_i32(t1, cpu_env, aofs + i);
710         }
711     }
712     tcg_temp_free_i32(t3);
713     tcg_temp_free_i32(t2);
714     tcg_temp_free_i32(t1);
715     tcg_temp_free_i32(t0);
716 }
717 
718 /* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
719 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
720                          void (*fni)(TCGv_i64, TCGv_i64))
721 {
722     TCGv_i64 t0 = tcg_temp_new_i64();
723     uint32_t i;
724 
725     for (i = 0; i < oprsz; i += 8) {
726         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
727         fni(t0, t0);
728         tcg_gen_st_i64(t0, cpu_env, dofs + i);
729     }
730     tcg_temp_free_i64(t0);
731 }
732 
733 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
734                           int64_t c, bool load_dest,
735                           void (*fni)(TCGv_i64, TCGv_i64, int64_t))
736 {
737     TCGv_i64 t0 = tcg_temp_new_i64();
738     TCGv_i64 t1 = tcg_temp_new_i64();
739     uint32_t i;
740 
741     for (i = 0; i < oprsz; i += 8) {
742         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
743         if (load_dest) {
744             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
745         }
746         fni(t1, t0, c);
747         tcg_gen_st_i64(t1, cpu_env, dofs + i);
748     }
749     tcg_temp_free_i64(t0);
750     tcg_temp_free_i64(t1);
751 }
752 
753 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
754                           TCGv_i64 c, bool scalar_first,
755                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
756 {
757     TCGv_i64 t0 = tcg_temp_new_i64();
758     TCGv_i64 t1 = tcg_temp_new_i64();
759     uint32_t i;
760 
761     for (i = 0; i < oprsz; i += 8) {
762         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
763         if (scalar_first) {
764             fni(t1, c, t0);
765         } else {
766             fni(t1, t0, c);
767         }
768         tcg_gen_st_i64(t1, cpu_env, dofs + i);
769     }
770     tcg_temp_free_i64(t0);
771     tcg_temp_free_i64(t1);
772 }
773 
774 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
775 static void expand_3_i64(uint32_t dofs, uint32_t aofs,
776                          uint32_t bofs, uint32_t oprsz, bool load_dest,
777                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
778 {
779     TCGv_i64 t0 = tcg_temp_new_i64();
780     TCGv_i64 t1 = tcg_temp_new_i64();
781     TCGv_i64 t2 = tcg_temp_new_i64();
782     uint32_t i;
783 
784     for (i = 0; i < oprsz; i += 8) {
785         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
786         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
787         if (load_dest) {
788             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
789         }
790         fni(t2, t0, t1);
791         tcg_gen_st_i64(t2, cpu_env, dofs + i);
792     }
793     tcg_temp_free_i64(t2);
794     tcg_temp_free_i64(t1);
795     tcg_temp_free_i64(t0);
796 }
797 
798 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
799                           uint32_t oprsz, int64_t c, bool load_dest,
800                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
801 {
802     TCGv_i64 t0 = tcg_temp_new_i64();
803     TCGv_i64 t1 = tcg_temp_new_i64();
804     TCGv_i64 t2 = tcg_temp_new_i64();
805     uint32_t i;
806 
807     for (i = 0; i < oprsz; i += 8) {
808         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
809         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
810         if (load_dest) {
811             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
812         }
813         fni(t2, t0, t1, c);
814         tcg_gen_st_i64(t2, cpu_env, dofs + i);
815     }
816     tcg_temp_free_i64(t0);
817     tcg_temp_free_i64(t1);
818     tcg_temp_free_i64(t2);
819 }
820 
821 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
822 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
823                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
824                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
825 {
826     TCGv_i64 t0 = tcg_temp_new_i64();
827     TCGv_i64 t1 = tcg_temp_new_i64();
828     TCGv_i64 t2 = tcg_temp_new_i64();
829     TCGv_i64 t3 = tcg_temp_new_i64();
830     uint32_t i;
831 
832     for (i = 0; i < oprsz; i += 8) {
833         tcg_gen_ld_i64(t1, cpu_env, aofs + i);
834         tcg_gen_ld_i64(t2, cpu_env, bofs + i);
835         tcg_gen_ld_i64(t3, cpu_env, cofs + i);
836         fni(t0, t1, t2, t3);
837         tcg_gen_st_i64(t0, cpu_env, dofs + i);
838         if (write_aofs) {
839             tcg_gen_st_i64(t1, cpu_env, aofs + i);
840         }
841     }
842     tcg_temp_free_i64(t3);
843     tcg_temp_free_i64(t2);
844     tcg_temp_free_i64(t1);
845     tcg_temp_free_i64(t0);
846 }
847 
848 /* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
849 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
850                          uint32_t oprsz, uint32_t tysz, TCGType type,
851                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
852 {
853     TCGv_vec t0 = tcg_temp_new_vec(type);
854     uint32_t i;
855 
856     for (i = 0; i < oprsz; i += tysz) {
857         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
858         fni(vece, t0, t0);
859         tcg_gen_st_vec(t0, cpu_env, dofs + i);
860     }
861     tcg_temp_free_vec(t0);
862 }
863 
864 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
865    using host vectors.  */
866 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
867                           uint32_t oprsz, uint32_t tysz, TCGType type,
868                           int64_t c, bool load_dest,
869                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
870 {
871     TCGv_vec t0 = tcg_temp_new_vec(type);
872     TCGv_vec t1 = tcg_temp_new_vec(type);
873     uint32_t i;
874 
875     for (i = 0; i < oprsz; i += tysz) {
876         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
877         if (load_dest) {
878             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
879         }
880         fni(vece, t1, t0, c);
881         tcg_gen_st_vec(t1, cpu_env, dofs + i);
882     }
883     tcg_temp_free_vec(t0);
884     tcg_temp_free_vec(t1);
885 }
886 
887 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
888                           uint32_t oprsz, uint32_t tysz, TCGType type,
889                           TCGv_vec c, bool scalar_first,
890                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
891 {
892     TCGv_vec t0 = tcg_temp_new_vec(type);
893     TCGv_vec t1 = tcg_temp_new_vec(type);
894     uint32_t i;
895 
896     for (i = 0; i < oprsz; i += tysz) {
897         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
898         if (scalar_first) {
899             fni(vece, t1, c, t0);
900         } else {
901             fni(vece, t1, t0, c);
902         }
903         tcg_gen_st_vec(t1, cpu_env, dofs + i);
904     }
905     tcg_temp_free_vec(t0);
906     tcg_temp_free_vec(t1);
907 }
908 
909 /* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
910 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
911                          uint32_t bofs, uint32_t oprsz,
912                          uint32_t tysz, TCGType type, bool load_dest,
913                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
914 {
915     TCGv_vec t0 = tcg_temp_new_vec(type);
916     TCGv_vec t1 = tcg_temp_new_vec(type);
917     TCGv_vec t2 = tcg_temp_new_vec(type);
918     uint32_t i;
919 
920     for (i = 0; i < oprsz; i += tysz) {
921         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
922         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
923         if (load_dest) {
924             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
925         }
926         fni(vece, t2, t0, t1);
927         tcg_gen_st_vec(t2, cpu_env, dofs + i);
928     }
929     tcg_temp_free_vec(t2);
930     tcg_temp_free_vec(t1);
931     tcg_temp_free_vec(t0);
932 }
933 
934 /*
935  * Expand OPSZ bytes worth of three-vector operands and an immediate operand
936  * using host vectors.
937  */
938 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
939                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
940                           TCGType type, int64_t c, bool load_dest,
941                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
942                                       int64_t))
943 {
944     TCGv_vec t0 = tcg_temp_new_vec(type);
945     TCGv_vec t1 = tcg_temp_new_vec(type);
946     TCGv_vec t2 = tcg_temp_new_vec(type);
947     uint32_t i;
948 
949     for (i = 0; i < oprsz; i += tysz) {
950         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
951         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
952         if (load_dest) {
953             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
954         }
955         fni(vece, t2, t0, t1, c);
956         tcg_gen_st_vec(t2, cpu_env, dofs + i);
957     }
958     tcg_temp_free_vec(t0);
959     tcg_temp_free_vec(t1);
960     tcg_temp_free_vec(t2);
961 }
962 
963 /* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
964 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
965                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
966                          uint32_t tysz, TCGType type, bool write_aofs,
967                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
968                                      TCGv_vec, TCGv_vec))
969 {
970     TCGv_vec t0 = tcg_temp_new_vec(type);
971     TCGv_vec t1 = tcg_temp_new_vec(type);
972     TCGv_vec t2 = tcg_temp_new_vec(type);
973     TCGv_vec t3 = tcg_temp_new_vec(type);
974     uint32_t i;
975 
976     for (i = 0; i < oprsz; i += tysz) {
977         tcg_gen_ld_vec(t1, cpu_env, aofs + i);
978         tcg_gen_ld_vec(t2, cpu_env, bofs + i);
979         tcg_gen_ld_vec(t3, cpu_env, cofs + i);
980         fni(vece, t0, t1, t2, t3);
981         tcg_gen_st_vec(t0, cpu_env, dofs + i);
982         if (write_aofs) {
983             tcg_gen_st_vec(t1, cpu_env, aofs + i);
984         }
985     }
986     tcg_temp_free_vec(t3);
987     tcg_temp_free_vec(t2);
988     tcg_temp_free_vec(t1);
989     tcg_temp_free_vec(t0);
990 }
991 
/* Expand a vector two-operand operation.
 *
 * Chooses the best available expansion for the operation described by G:
 * host vector ops if supported, otherwise inline 64-/32-bit integer ops,
 * otherwise an out-of-line helper call.  Bytes between OPRSZ and MAXSZ of
 * the destination are cleared.
 */
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    /* Install the op's preferred vector opcode list for the duration of
       the expansion; restored before returning.  */
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    /* type == 0 selects the non-vector fallbacks in the switch below.  */
    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2_i64(dofs, aofs, oprsz, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2_i32(dofs, aofs, oprsz, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
            /* The helper covers the whole maxsz, so skip the tail clear.  */
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    /* Zero the bytes between the operation size and the register size.  */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
1052 
/* Expand a vector operation with two vectors and an immediate.
 *
 * As tcg_gen_gvec_2, but the operation takes an extra immediate operand C.
 * Falls back to g->fno (immediate folded into simd_data) or g->fnoi (the
 * immediate broadcast through a TCGv_i64) when no inline expansion fits.
 */
void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
{
    /* Install the op's preferred vector opcode list for the duration of
       the expansion; restored before returning.  */
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    /* type == 0 selects the non-vector fallbacks in the switch below.  */
    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            if (g->fno) {
                /* Helper variant that takes C via the simd_data field.  */
                tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
            } else {
                /* Helper variant that takes C as an explicit i64 operand.  */
                TCGv_i64 tcg_c = tcg_const_i64(c);
                tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
                                    maxsz, c, g->fnoi);
                tcg_temp_free_i64(tcg_c);
            }
            /* The helper covers the whole maxsz, so skip the tail clear.  */
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
1122 
/* Expand a vector operation with two vectors and a scalar.
 *
 * The scalar C is broadcast to the element size before the operation is
 * applied.  If G->scalar_first is set, the scalar is the first operand of
 * the per-element operation rather than the second.
 */
void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
{
    TCGType type;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    /* type == 0 selects the non-vector fallbacks below.  */
    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    if (type != 0) {
        /* Install the op's preferred vector opcode list for the duration
           of the expansion; restored at the end of this branch.  */
        const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
        const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
        TCGv_vec t_vec = tcg_temp_new_vec(type);
        uint32_t some;

        /* Broadcast the scalar across a host vector register once.  */
        tcg_gen_dup_i64_vec(g->vece, t_vec, c);

        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16.  The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                          t_vec, g->scalar_first, g->fniv);
            if (some == oprsz) {
                break;
            }
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */

        case TCG_TYPE_V128:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          t_vec, g->scalar_first, g->fniv);
            break;

        case TCG_TYPE_V64:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          t_vec, g->scalar_first, g->fniv);
            break;

        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(t_vec);
        tcg_swap_vecop_list(hold_list);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        TCGv_i64 t64 = tcg_temp_new_i64();

        /* Broadcast the scalar across a 64-bit temporary.  */
        gen_dup_i64(g->vece, t64, c);
        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
        tcg_temp_free_i64(t64);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        TCGv_i32 t32 = tcg_temp_new_i32();

        /* Broadcast the (truncated) scalar across a 32-bit temporary.  */
        tcg_gen_extrl_i64_i32(t32, c);
        gen_dup_i32(g->vece, t32, t32);
        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
        tcg_temp_free_i32(t32);
    } else {
        /* The helper clears the tail itself, hence the early return.  */
        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
        return;
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
1199 
/* Expand a vector three-operand operation.
 *
 * As tcg_gen_gvec_2, with a second vector source operand at BOFS.
 */
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    /* Install the op's preferred vector opcode list for the duration of
       the expansion; restored before returning.  */
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    /* type == 0 selects the non-vector fallbacks in the switch below.  */
    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
                               maxsz, g->data, g->fno);
            /* The helper covers the whole maxsz, so skip the tail clear.  */
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
1265 
/* Expand a vector operation with three vectors and an immediate.
 *
 * As tcg_gen_gvec_3, with an extra immediate operand C; the out-of-line
 * fallback passes C through the simd_data field.
 */
void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                     uint32_t oprsz, uint32_t maxsz, int64_t c,
                     const GVecGen3i *g)
{
    /* Install the op's preferred vector opcode list for the duration of
       the expansion; restored before returning.  */
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    /* type == 0 selects the non-vector fallbacks in the switch below.  */
    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
            /* The helper covers the whole maxsz, so skip the tail clear.  */
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
1332 
/* Expand a vector four-operand operation.
 *
 * As tcg_gen_gvec_3, with a third vector source operand at COFS.  If
 * G->write_aofs is set, the expansion also writes back to AOFS.
 */
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
    /* Install the op's preferred vector opcode list for the duration of
       the expansion; restored before returning.  */
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    /* type == 0 selects the non-vector fallbacks in the switch below.  */
    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->write_aofs, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->write_aofs, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->write_aofs, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                               oprsz, maxsz, g->data, g->fno);
            /* The helper covers the whole maxsz, so skip the tail clear.  */
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
1401 
1402 /*
1403  * Expand specific vector operations.
1404  */
1405 
/* Adapter giving tcg_gen_mov_vec the fniv signature expected by GVecGen2;
   the element-size argument is irrelevant for a plain move.  */
static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}
1410 
1411 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1412                       uint32_t oprsz, uint32_t maxsz)
1413 {
1414     static const GVecGen2 g = {
1415         .fni8 = tcg_gen_mov_i64,
1416         .fniv = vec_mov2,
1417         .fno = gen_helper_gvec_mov,
1418         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1419     };
1420     if (dofs != aofs) {
1421         tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1422     } else {
1423         check_size_align(oprsz, maxsz, dofs);
1424         if (oprsz < maxsz) {
1425             expand_clr(dofs + oprsz, maxsz - oprsz);
1426         }
1427     }
1428 }
1429 
/* Duplicate the 32-bit value IN across elements of size VECE (<= MO_32)
   from DOFS to DOFS + OPRSZ, clearing the tail up to MAXSZ.  */
void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}
1437 
/* Duplicate the 64-bit value IN across elements of size VECE (<= MO_64)
   from DOFS to DOFS + OPRSZ, clearing the tail up to MAXSZ.  */
void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}
1445 
/* Duplicate the element of size VECE found in memory at env+AOFS across
   DOFS..DOFS+OPRSZ, clearing the tail up to MAXSZ.  VECE may exceed MO_64
   for a 128-bit duplicate.  */
void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t maxsz)
{
    check_size_align(oprsz, maxsz, dofs);
    if (vece <= MO_64) {
        TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
        if (type != 0) {
            /* Dup directly into a host vector register and store it.  */
            TCGv_vec t_vec = tcg_temp_new_vec(type);
            tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
            do_dup_store(type, dofs, oprsz, maxsz, t_vec);
            tcg_temp_free_vec(t_vec);
        } else if (vece <= MO_32) {
            /* No host vectors: load as a zero-extended 32-bit scalar.  */
            TCGv_i32 in = tcg_temp_new_i32();
            switch (vece) {
            case MO_8:
                tcg_gen_ld8u_i32(in, cpu_env, aofs);
                break;
            case MO_16:
                tcg_gen_ld16u_i32(in, cpu_env, aofs);
                break;
            default:
                tcg_gen_ld_i32(in, cpu_env, aofs);
                break;
            }
            do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
            tcg_temp_free_i32(in);
        } else {
            /* MO_64 without host vectors: go through a 64-bit scalar.  */
            TCGv_i64 in = tcg_temp_new_i64();
            tcg_gen_ld_i64(in, cpu_env, aofs);
            do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
            tcg_temp_free_i64(in);
        }
    } else {
        /* 128-bit duplicate.  */
        /* ??? Dup to 256-bit vector.  */
        int i;

        /* Only vece == 4 (i.e. a 128-bit element) is supported here.  */
        tcg_debug_assert(vece == 4);
        tcg_debug_assert(oprsz >= 16);
        if (TCG_TARGET_HAS_v128) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in, cpu_env, aofs);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_vec(in, cpu_env, dofs + i);
            }
            tcg_temp_free_vec(in);
        } else {
            /* Replicate the 128 bits as a pair of 64-bit halves.  */
            TCGv_i64 in0 = tcg_temp_new_i64();
            TCGv_i64 in1 = tcg_temp_new_i64();

            tcg_gen_ld_i64(in0, cpu_env, aofs);
            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_i64(in0, cpu_env, dofs + i);
                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
            }
            tcg_temp_free_i64(in0);
            tcg_temp_free_i64(in1);
        }
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}
1511 
/* Duplicate the 64-bit constant X across DOFS..DOFS+OPRSZ,
   clearing the tail up to MAXSZ.  */
void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
}
1518 
/* Duplicate the 32-bit constant X across DOFS..DOFS+OPRSZ,
   clearing the tail up to MAXSZ.  */
void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint32_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
}
1525 
/* Duplicate the 16-bit constant X across DOFS..DOFS+OPRSZ,
   clearing the tail up to MAXSZ.  */
void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint16_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
}
1532 
/* Duplicate the 8-bit constant X across DOFS..DOFS+OPRSZ,
   clearing the tail up to MAXSZ.  */
void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint8_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
}
1539 
/* Bitwise NOT of OPRSZ bytes at AOFS into DOFS; element size is
   irrelevant for a pure bitwise operation.  */
void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_not_i64,
        .fniv = tcg_gen_not_vec,
        .fno = gen_helper_gvec_not,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
}
1551 
1552 /* Perform a vector addition using normal addition and a mask.  The mask
1553    should be the sign bit of each lane.  This 6-operation form is more
1554    efficient than separate additions when there are 4 or more lanes in
1555    the 64-bit operation.  */
1556 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1557 {
1558     TCGv_i64 t1 = tcg_temp_new_i64();
1559     TCGv_i64 t2 = tcg_temp_new_i64();
1560     TCGv_i64 t3 = tcg_temp_new_i64();
1561 
1562     tcg_gen_andc_i64(t1, a, m);
1563     tcg_gen_andc_i64(t2, b, m);
1564     tcg_gen_xor_i64(t3, a, b);
1565     tcg_gen_add_i64(d, t1, t2);
1566     tcg_gen_and_i64(t3, t3, m);
1567     tcg_gen_xor_i64(d, d, t3);
1568 
1569     tcg_temp_free_i64(t1);
1570     tcg_temp_free_i64(t2);
1571     tcg_temp_free_i64(t3);
1572 }
1573 
1574 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1575 {
1576     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1577     gen_addv_mask(d, a, b, m);
1578     tcg_temp_free_i64(m);
1579 }
1580 
1581 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1582 {
1583     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1584     gen_addv_mask(d, a, b, m);
1585     tcg_temp_free_i64(m);
1586 }
1587 
1588 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1589 {
1590     TCGv_i64 t1 = tcg_temp_new_i64();
1591     TCGv_i64 t2 = tcg_temp_new_i64();
1592 
1593     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1594     tcg_gen_add_i64(t2, a, b);
1595     tcg_gen_add_i64(t1, t1, b);
1596     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1597 
1598     tcg_temp_free_i64(t1);
1599     tcg_temp_free_i64(t2);
1600 }
1601 
1602 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1603 
/* Element-wise addition of OPRSZ bytes at AOFS and BOFS into DOFS,
   with element size VECE (MO_8..MO_64).  */
void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    /* One expander description per element size, indexed by VECE.  */
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add8,
          .opt_opc = vecop_list_add,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add16,
          .opt_opc = vecop_list_add,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add32,
          .opt_opc = vecop_list_add,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add64,
          .opt_opc = vecop_list_add,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
1634 
/* Element-wise addition of the scalar C to OPRSZ bytes at AOFS into DOFS,
   with element size VECE (MO_8..MO_64).  */
void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    /* One expander description per element size, indexed by VECE.  */
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds8,
          .opt_opc = vecop_list_add,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds16,
          .opt_opc = vecop_list_add,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds32,
          .opt_opc = vecop_list_add,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds64,
          .opt_opc = vecop_list_add,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}
1665 
/* Element-wise addition of the immediate C, via the scalar expansion.  */
void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(c);
    tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i64(tmp);
}
1673 
1674 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
1675 
/* Element-wise subtraction of the scalar C from OPRSZ bytes at AOFS into
   DOFS, with element size VECE (MO_8..MO_64).  */
void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    /* One expander description per element size, indexed by VECE.  */
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs8,
          .opt_opc = vecop_list_sub,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs16,
          .opt_opc = vecop_list_sub,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs32,
          .opt_opc = vecop_list_sub,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs64,
          .opt_opc = vecop_list_sub,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}
1706 
1707 /* Perform a vector subtraction using normal subtraction and a mask.
1708    Compare gen_addv_mask above.  */
1709 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1710 {
1711     TCGv_i64 t1 = tcg_temp_new_i64();
1712     TCGv_i64 t2 = tcg_temp_new_i64();
1713     TCGv_i64 t3 = tcg_temp_new_i64();
1714 
1715     tcg_gen_or_i64(t1, a, m);
1716     tcg_gen_andc_i64(t2, b, m);
1717     tcg_gen_eqv_i64(t3, a, b);
1718     tcg_gen_sub_i64(d, t1, t2);
1719     tcg_gen_and_i64(t3, t3, m);
1720     tcg_gen_xor_i64(d, d, t3);
1721 
1722     tcg_temp_free_i64(t1);
1723     tcg_temp_free_i64(t2);
1724     tcg_temp_free_i64(t3);
1725 }
1726 
1727 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1728 {
1729     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1730     gen_subv_mask(d, a, b, m);
1731     tcg_temp_free_i64(m);
1732 }
1733 
1734 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1735 {
1736     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1737     gen_subv_mask(d, a, b, m);
1738     tcg_temp_free_i64(m);
1739 }
1740 
1741 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1742 {
1743     TCGv_i64 t1 = tcg_temp_new_i64();
1744     TCGv_i64 t2 = tcg_temp_new_i64();
1745 
1746     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1747     tcg_gen_sub_i64(t2, a, b);
1748     tcg_gen_sub_i64(t1, a, t1);
1749     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1750 
1751     tcg_temp_free_i64(t1);
1752     tcg_temp_free_i64(t2);
1753 }
1754 
/* Element-wise subtraction of OPRSZ bytes at BOFS from those at AOFS into
   DOFS, with element size VECE (MO_8..MO_64).  */
void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    /* One expander description per element size, indexed by VECE.  */
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub8,
          .opt_opc = vecop_list_sub,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub16,
          .opt_opc = vecop_list_sub,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub32,
          .opt_opc = vecop_list_sub,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub64,
          .opt_opc = vecop_list_sub,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
1785 
/* Opcode list shared by the mul/muls/muli expanders below.  */
static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
1787 
/* Expand a vector multiply, d = a * b, with element size 8 << vece.
   Note there is no 64-bit integer fallback for 8/16-bit elements;
   those fall back to the out-of-line helper instead.  */
void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul8,
          .opt_opc = vecop_list_mul,
          .vece = MO_8 },
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul16,
          .opt_opc = vecop_list_mul,
          .vece = MO_16 },
        { .fni4 = tcg_gen_mul_i32,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul32,
          .opt_opc = vecop_list_mul,
          .vece = MO_32 },
        { .fni8 = tcg_gen_mul_i64,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul64,
          .opt_opc = vecop_list_mul,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
1816 
/* Expand a vector multiply by a scalar, d = a * c, with the scalar
   held in a TCGv_i64 and replicated per element by the expander.  */
void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls8,
          .opt_opc = vecop_list_mul,
          .vece = MO_8 },
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls16,
          .opt_opc = vecop_list_mul,
          .vece = MO_16 },
        { .fni4 = tcg_gen_mul_i32,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls32,
          .opt_opc = vecop_list_mul,
          .vece = MO_32 },
        { .fni8 = tcg_gen_mul_i64,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls64,
          .opt_opc = vecop_list_mul,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}
1845 
1846 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
1847                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1848 {
1849     TCGv_i64 tmp = tcg_const_i64(c);
1850     tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
1851     tcg_temp_free_i64(tmp);
1852 }
1853 
/* Expand a signed saturating vector addition.  No integer fallbacks:
   either the host provides ssadd_vec or the helper is called.  */
void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
1879 
/* Expand a signed saturating vector subtraction.  No integer fallbacks:
   either the host provides sssub_vec or the helper is called.  */
void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
1905 
1906 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1907 {
1908     TCGv_i32 max = tcg_const_i32(-1);
1909     tcg_gen_add_i32(d, a, b);
1910     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
1911     tcg_temp_free_i32(max);
1912 }
1913 
1914 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1915 {
1916     TCGv_i64 max = tcg_const_i64(-1);
1917     tcg_gen_add_i64(d, a, b);
1918     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
1919     tcg_temp_free_i64(max);
1920 }
1921 
/* Expand an unsigned saturating vector addition.  32/64-bit elements
   have integer fallbacks (see tcg_gen_usadd_i32/i64 above).  */
void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_usadd_vec,
          .fno = gen_helper_gvec_usadd8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_usadd_vec,
          .fno = gen_helper_gvec_usadd16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_usadd_i32,
          .fniv = tcg_gen_usadd_vec,
          .fno = gen_helper_gvec_usadd32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_usadd_i64,
          .fniv = tcg_gen_usadd_vec,
          .fno = gen_helper_gvec_usadd64,
          .opt_opc = vecop_list,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
1949 
1950 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1951 {
1952     TCGv_i32 min = tcg_const_i32(0);
1953     tcg_gen_sub_i32(d, a, b);
1954     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
1955     tcg_temp_free_i32(min);
1956 }
1957 
1958 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1959 {
1960     TCGv_i64 min = tcg_const_i64(0);
1961     tcg_gen_sub_i64(d, a, b);
1962     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
1963     tcg_temp_free_i64(min);
1964 }
1965 
/* Expand an unsigned saturating vector subtraction.  32/64-bit elements
   have integer fallbacks (see tcg_gen_ussub_i32/i64 above).  */
void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_ussub_vec,
          .fno = gen_helper_gvec_ussub8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_ussub_vec,
          .fno = gen_helper_gvec_ussub16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_ussub_i32,
          .fniv = tcg_gen_ussub_vec,
          .fno = gen_helper_gvec_ussub32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_ussub_i64,
          .fniv = tcg_gen_ussub_vec,
          .fno = gen_helper_gvec_ussub64,
          .opt_opc = vecop_list,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
1993 
/* Expand a signed vector minimum, d = min(a, b) per element.  */
void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_smin_vec,
          .fno = gen_helper_gvec_smin8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_smin_vec,
          .fno = gen_helper_gvec_smin16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_smin_i32,
          .fniv = tcg_gen_smin_vec,
          .fno = gen_helper_gvec_smin32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_smin_i64,
          .fniv = tcg_gen_smin_vec,
          .fno = gen_helper_gvec_smin64,
          .opt_opc = vecop_list,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
2021 
/* Expand an unsigned vector minimum, d = min(a, b) per element.  */
void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_umin_vec,
          .fno = gen_helper_gvec_umin8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_umin_vec,
          .fno = gen_helper_gvec_umin16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_umin_i32,
          .fniv = tcg_gen_umin_vec,
          .fno = gen_helper_gvec_umin32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_umin_i64,
          .fniv = tcg_gen_umin_vec,
          .fno = gen_helper_gvec_umin64,
          .opt_opc = vecop_list,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
2049 
/* Expand a signed vector maximum, d = max(a, b) per element.  */
void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_smax_vec,
          .fno = gen_helper_gvec_smax8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_smax_vec,
          .fno = gen_helper_gvec_smax16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_smax_i32,
          .fniv = tcg_gen_smax_vec,
          .fno = gen_helper_gvec_smax32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_smax_i64,
          .fniv = tcg_gen_smax_vec,
          .fno = gen_helper_gvec_smax64,
          .opt_opc = vecop_list,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
2077 
/* Expand an unsigned vector maximum, d = max(a, b) per element.  */
void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_umax_vec,
          .fno = gen_helper_gvec_umax8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_umax_vec,
          .fno = gen_helper_gvec_umax16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_umax_i32,
          .fniv = tcg_gen_umax_vec,
          .fno = gen_helper_gvec_umax32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_umax_i64,
          .fniv = tcg_gen_umax_vec,
          .fno = gen_helper_gvec_umax64,
          .opt_opc = vecop_list,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
2105 
/* Perform a vector negation using normal negation and a mask.
   Compare gen_subv_mask above.  */
static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    /* t3 marks elements whose msb is clear in b; t2 = b with each
       element's msb cleared, so m - t2 cannot borrow across an
       element boundary (per element: msb >= b-without-msb).  */
    tcg_gen_andc_i64(t3, m, b);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_sub_i64(d, m, t2);
    /* Repair the msb of each element.  */
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}
2121 
2122 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2123 {
2124     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
2125     gen_negv_mask(d, b, m);
2126     tcg_temp_free_i64(m);
2127 }
2128 
2129 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2130 {
2131     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
2132     gen_negv_mask(d, b, m);
2133     tcg_temp_free_i64(m);
2134 }
2135 
2136 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2137 {
2138     TCGv_i64 t1 = tcg_temp_new_i64();
2139     TCGv_i64 t2 = tcg_temp_new_i64();
2140 
2141     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2142     tcg_gen_neg_i64(t2, b);
2143     tcg_gen_neg_i64(t1, t1);
2144     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2145 
2146     tcg_temp_free_i64(t1);
2147     tcg_temp_free_i64(t2);
2148 }
2149 
/* Expand a vector negation, d = -a, with element size 8 << vece.  */
void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
    static const GVecGen2 g[4] = {
        { .fni8 = tcg_gen_vec_neg8_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_neg16_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_neg_i32,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_neg_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg64,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
}
2181 
2182 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2183 {
2184     TCGv_i64 t = tcg_temp_new_i64();
2185     int nbit = 8 << vece;
2186 
2187     /* Create -1 for each negative element.  */
2188     tcg_gen_shri_i64(t, b, nbit - 1);
2189     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2190     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2191 
2192     /*
2193      * Invert (via xor -1) and add one (via sub -1).
2194      * Because of the ordering the msb is cleared,
2195      * so we never have carry into the next element.
2196      */
2197     tcg_gen_xor_i64(d, b, t);
2198     tcg_gen_sub_i64(d, d, t);
2199 
2200     tcg_temp_free_i64(t);
2201 }
2202 
/* Per-byte absolute value within a 64-bit temporary.  */
static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
{
    gen_absv_mask(d, b, MO_8);
}
2207 
/* Per-16-bit-element absolute value within a 64-bit temporary.  */
static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
{
    gen_absv_mask(d, b, MO_16);
}
2212 
/* Expand a vector absolute value, d = abs(a), element size 8 << vece.  */
void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
    static const GVecGen2 g[4] = {
        { .fni8 = tcg_gen_vec_abs8_i64,
          .fniv = tcg_gen_abs_vec,
          .fno = gen_helper_gvec_abs8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_abs16_i64,
          .fniv = tcg_gen_abs_vec,
          .fno = gen_helper_gvec_abs16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_abs_i32,
          .fniv = tcg_gen_abs_vec,
          .fno = gen_helper_gvec_abs32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_abs_i64,
          .fniv = tcg_gen_abs_vec,
          .fno = gen_helper_gvec_abs64,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
}
2244 
2245 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2246                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2247 {
2248     static const GVecGen3 g = {
2249         .fni8 = tcg_gen_and_i64,
2250         .fniv = tcg_gen_and_vec,
2251         .fno = gen_helper_gvec_and,
2252         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2253     };
2254 
2255     if (aofs == bofs) {
2256         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2257     } else {
2258         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2259     }
2260 }
2261 
2262 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2263                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2264 {
2265     static const GVecGen3 g = {
2266         .fni8 = tcg_gen_or_i64,
2267         .fniv = tcg_gen_or_vec,
2268         .fno = gen_helper_gvec_or,
2269         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2270     };
2271 
2272     if (aofs == bofs) {
2273         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2274     } else {
2275         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2276     }
2277 }
2278 
2279 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2280                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2281 {
2282     static const GVecGen3 g = {
2283         .fni8 = tcg_gen_xor_i64,
2284         .fniv = tcg_gen_xor_vec,
2285         .fno = gen_helper_gvec_xor,
2286         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2287     };
2288 
2289     if (aofs == bofs) {
2290         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
2291     } else {
2292         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2293     }
2294 }
2295 
2296 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2297                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2298 {
2299     static const GVecGen3 g = {
2300         .fni8 = tcg_gen_andc_i64,
2301         .fniv = tcg_gen_andc_vec,
2302         .fno = gen_helper_gvec_andc,
2303         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2304     };
2305 
2306     if (aofs == bofs) {
2307         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
2308     } else {
2309         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2310     }
2311 }
2312 
2313 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2314                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2315 {
2316     static const GVecGen3 g = {
2317         .fni8 = tcg_gen_orc_i64,
2318         .fniv = tcg_gen_orc_vec,
2319         .fno = gen_helper_gvec_orc,
2320         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2321     };
2322 
2323     if (aofs == bofs) {
2324         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
2325     } else {
2326         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2327     }
2328 }
2329 
2330 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2331                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2332 {
2333     static const GVecGen3 g = {
2334         .fni8 = tcg_gen_nand_i64,
2335         .fniv = tcg_gen_nand_vec,
2336         .fno = gen_helper_gvec_nand,
2337         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2338     };
2339 
2340     if (aofs == bofs) {
2341         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2342     } else {
2343         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2344     }
2345 }
2346 
2347 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2348                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2349 {
2350     static const GVecGen3 g = {
2351         .fni8 = tcg_gen_nor_i64,
2352         .fniv = tcg_gen_nor_vec,
2353         .fno = gen_helper_gvec_nor,
2354         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2355     };
2356 
2357     if (aofs == bofs) {
2358         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2359     } else {
2360         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2361     }
2362 }
2363 
2364 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2365                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2366 {
2367     static const GVecGen3 g = {
2368         .fni8 = tcg_gen_eqv_i64,
2369         .fniv = tcg_gen_eqv_vec,
2370         .fno = gen_helper_gvec_eqv,
2371         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2372     };
2373 
2374     if (aofs == bofs) {
2375         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
2376     } else {
2377         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2378     }
2379 }
2380 
/* Shared expander for AND with a replicated scalar; the scalar is
   pre-duplicated into a 64-bit lane, hence .vece = MO_64.  */
static const GVecGen2s gop_ands = {
    .fni8 = tcg_gen_and_i64,
    .fniv = tcg_gen_and_vec,
    .fno = gen_helper_gvec_ands,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};
2388 
2389 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2390                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2391 {
2392     TCGv_i64 tmp = tcg_temp_new_i64();
2393     gen_dup_i64(vece, tmp, c);
2394     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2395     tcg_temp_free_i64(tmp);
2396 }
2397 
2398 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2399                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2400 {
2401     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2402     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2403     tcg_temp_free_i64(tmp);
2404 }
2405 
/* Shared expander for XOR with a replicated scalar; the scalar is
   pre-duplicated into a 64-bit lane, hence .vece = MO_64.  */
static const GVecGen2s gop_xors = {
    .fni8 = tcg_gen_xor_i64,
    .fniv = tcg_gen_xor_vec,
    .fno = gen_helper_gvec_xors,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};
2413 
2414 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2415                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2416 {
2417     TCGv_i64 tmp = tcg_temp_new_i64();
2418     gen_dup_i64(vece, tmp, c);
2419     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2420     tcg_temp_free_i64(tmp);
2421 }
2422 
2423 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2424                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2425 {
2426     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2427     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2428     tcg_temp_free_i64(tmp);
2429 }
2430 
/* Shared expander for OR with a replicated scalar; the scalar is
   pre-duplicated into a 64-bit lane, hence .vece = MO_64.  */
static const GVecGen2s gop_ors = {
    .fni8 = tcg_gen_or_i64,
    .fniv = tcg_gen_or_vec,
    .fno = gen_helper_gvec_ors,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};
2438 
2439 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2440                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2441 {
2442     TCGv_i64 tmp = tcg_temp_new_i64();
2443     gen_dup_i64(vece, tmp, c);
2444     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2445     tcg_temp_free_i64(tmp);
2446 }
2447 
2448 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2449                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2450 {
2451     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2452     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2453     tcg_temp_free_i64(tmp);
2454 }
2455 
2456 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2457 {
2458     uint64_t mask = dup_const(MO_8, 0xff << c);
2459     tcg_gen_shli_i64(d, a, c);
2460     tcg_gen_andi_i64(d, d, mask);
2461 }
2462 
2463 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2464 {
2465     uint64_t mask = dup_const(MO_16, 0xffff << c);
2466     tcg_gen_shli_i64(d, a, c);
2467     tcg_gen_andi_i64(d, d, mask);
2468 }
2469 
/* Expand a vector shift-left by immediate.  The shift count must be
   in [0, element bits); a zero shift collapses to a move.  */
void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_shl8i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl8i,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_shl16i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl16i,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shli_i32,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl32i,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shli_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl64i,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}
2506 
2507 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2508 {
2509     uint64_t mask = dup_const(MO_8, 0xff >> c);
2510     tcg_gen_shri_i64(d, a, c);
2511     tcg_gen_andi_i64(d, d, mask);
2512 }
2513 
2514 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2515 {
2516     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2517     tcg_gen_shri_i64(d, a, c);
2518     tcg_gen_andi_i64(d, d, mask);
2519 }
2520 
/* Expand a vector logical shift-right by immediate.  The shift count
   must be in [0, element bits); a zero shift collapses to a move.  */
void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_shr8i_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr8i,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_shr16i_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr16i,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shri_i32,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr32i,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shri_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr64i,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}
2557 
2558 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2559 {
2560     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2561     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2562     TCGv_i64 s = tcg_temp_new_i64();
2563 
2564     tcg_gen_shri_i64(d, a, c);
2565     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2566     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2567     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2568     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2569     tcg_temp_free_i64(s);
2570 }
2571 
2572 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2573 {
2574     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2575     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2576     TCGv_i64 s = tcg_temp_new_i64();
2577 
2578     tcg_gen_shri_i64(d, a, c);
2579     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2580     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2581     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2582     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2583     tcg_temp_free_i64(s);
2584 }
2585 
/* Expand a vector arithmetic shift-right by immediate.  The shift
   count must be in [0, element bits); zero collapses to a move.  */
void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_sar8i_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar8i,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sar16i_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar16i,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sari_i32,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar32i,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sari_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar64i,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}
2622 
/*
 * Specialized generation vector shifts by a non-constant scalar.
 */

typedef struct {
    /* Scalar integer expansions, 32-bit and 64-bit elements.  */
    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
    /* Vector expansion taking the shift count as an i32 scalar.  */
    void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
    /* Vector expansion taking the shift count as a vector.  */
    void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
    /* Out-of-line helpers, indexed by vece.  */
    gen_helper_gvec_2 *fno[4];
    /* Opcode lists for the scalar-count and vector-count forms.  */
    TCGOpcode s_list[2];
    TCGOpcode v_list[2];
} GVecGen2sh;
2636 
2637 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2638                            uint32_t oprsz, uint32_t tysz, TCGType type,
2639                            TCGv_i32 shift,
2640                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
2641 {
2642     TCGv_vec t0 = tcg_temp_new_vec(type);
2643     uint32_t i;
2644 
2645     for (i = 0; i < oprsz; i += tysz) {
2646         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2647         fni(vece, t0, t0, shift);
2648         tcg_gen_st_vec(t0, cpu_env, dofs + i);
2649     }
2650     tcg_temp_free_vec(t0);
2651 }
2652 
/*
 * Expand a shift of OPRSZ bytes of vector data by a single i32 scalar
 * count, trying strategies in decreasing order of preference:
 *   1. a native vector-by-scalar shift op (g->fniv_s / g->s_list),
 *   2. a vector-by-vector shift op with the count duplicated into a
 *      vector (g->fniv_v / g->v_list),
 *   3. an inline integral per-element expansion (g->fni4 / g->fni8),
 *   4. an out-of-line helper with the count folded into the descriptor.
 */
static void
do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
               uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    /* If the backend has a scalar expansion, great.  */
    type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
    if (type) {
        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
        switch (type) {
        case TCG_TYPE_V256:
            /* Use 32-byte chunks first; any 16-byte remainder (e.g. for
               SVE-style sizes that are a multiple of 16 but not 32)
               falls through to the V128 case.  */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2sh_vec(vece, dofs, aofs, some, 32,
                           TCG_TYPE_V256, shift, g->fniv_s);
            if (some == oprsz) {
                break;
            }
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */
        case TCG_TYPE_V128:
            expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
                           TCG_TYPE_V128, shift, g->fniv_s);
            break;
        case TCG_TYPE_V64:
            expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
                           TCG_TYPE_V64, shift, g->fniv_s);
            break;
        default:
            g_assert_not_reached();
        }
        tcg_swap_vecop_list(hold_list);
        goto clear_tail;
    }

    /* If the backend supports variable vector shifts, also cool.  */
    type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
    if (type) {
        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
        TCGv_vec v_shift = tcg_temp_new_vec(type);

        /* Broadcast the scalar count into each element of v_shift.  */
        if (vece == MO_64) {
            TCGv_i64 sh64 = tcg_temp_new_i64();
            tcg_gen_extu_i32_i64(sh64, shift);
            tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
            tcg_temp_free_i64(sh64);
        } else {
            tcg_gen_dup_i32_vec(vece, v_shift, shift);
        }

        switch (type) {
        case TCG_TYPE_V256:
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                          v_shift, false, g->fniv_v);
            if (some == oprsz) {
                break;
            }
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */
        case TCG_TYPE_V128:
            expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          v_shift, false, g->fniv_v);
            break;
        case TCG_TYPE_V64:
            expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          v_shift, false, g->fniv_v);
            break;
        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(v_shift);
        tcg_swap_vecop_list(hold_list);
        goto clear_tail;
    }

    /* Otherwise fall back to integral... */
    if (vece == MO_32 && check_size_impl(oprsz, 4)) {
        expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
    } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
        TCGv_i64 sh64 = tcg_temp_new_i64();
        tcg_gen_extu_i32_i64(sh64, shift);
        expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
        tcg_temp_free_i64(sh64);
    } else {
        TCGv_ptr a0 = tcg_temp_new_ptr();
        TCGv_ptr a1 = tcg_temp_new_ptr();
        TCGv_i32 desc = tcg_temp_new_i32();

        /* Fold the runtime shift count into the simd_data field of the
           descriptor, where the helper expects to find it.  */
        tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
        tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
        tcg_gen_addi_ptr(a0, cpu_env, dofs);
        tcg_gen_addi_ptr(a1, cpu_env, aofs);

        g->fno[vece](a0, a1, desc);

        tcg_temp_free_ptr(a0);
        tcg_temp_free_ptr(a1);
        tcg_temp_free_i32(desc);
        /* The helper clears the tail itself; skip expand_clr.  */
        return;
    }

 clear_tail:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
2770 
2771 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
2772                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2773 {
2774     static const GVecGen2sh g = {
2775         .fni4 = tcg_gen_shl_i32,
2776         .fni8 = tcg_gen_shl_i64,
2777         .fniv_s = tcg_gen_shls_vec,
2778         .fniv_v = tcg_gen_shlv_vec,
2779         .fno = {
2780             gen_helper_gvec_shl8i,
2781             gen_helper_gvec_shl16i,
2782             gen_helper_gvec_shl32i,
2783             gen_helper_gvec_shl64i,
2784         },
2785         .s_list = { INDEX_op_shls_vec, 0 },
2786         .v_list = { INDEX_op_shlv_vec, 0 },
2787     };
2788 
2789     tcg_debug_assert(vece <= MO_64);
2790     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2791 }
2792 
2793 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
2794                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2795 {
2796     static const GVecGen2sh g = {
2797         .fni4 = tcg_gen_shr_i32,
2798         .fni8 = tcg_gen_shr_i64,
2799         .fniv_s = tcg_gen_shrs_vec,
2800         .fniv_v = tcg_gen_shrv_vec,
2801         .fno = {
2802             gen_helper_gvec_shr8i,
2803             gen_helper_gvec_shr16i,
2804             gen_helper_gvec_shr32i,
2805             gen_helper_gvec_shr64i,
2806         },
2807         .s_list = { INDEX_op_shrs_vec, 0 },
2808         .v_list = { INDEX_op_shrv_vec, 0 },
2809     };
2810 
2811     tcg_debug_assert(vece <= MO_64);
2812     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2813 }
2814 
2815 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
2816                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2817 {
2818     static const GVecGen2sh g = {
2819         .fni4 = tcg_gen_sar_i32,
2820         .fni8 = tcg_gen_sar_i64,
2821         .fniv_s = tcg_gen_sars_vec,
2822         .fniv_v = tcg_gen_sarv_vec,
2823         .fno = {
2824             gen_helper_gvec_sar8i,
2825             gen_helper_gvec_sar16i,
2826             gen_helper_gvec_sar32i,
2827             gen_helper_gvec_sar64i,
2828         },
2829         .s_list = { INDEX_op_sars_vec, 0 },
2830         .v_list = { INDEX_op_sarv_vec, 0 },
2831     };
2832 
2833     tcg_debug_assert(vece <= MO_64);
2834     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2835 }
2836 
2837 /*
2838  * Expand D = A << (B % element bits)
2839  *
2840  * Unlike scalar shifts, where it is easy for the target front end
2841  * to include the modulo as part of the expansion.  If the target
2842  * naturally includes the modulo as part of the operation, great!
2843  * If the target has some other behaviour from out-of-range shifts,
2844  * then it could not use this function anyway, and would need to
2845  * do it's own expansion with custom functions.
2846  */
2847 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
2848                                  TCGv_vec a, TCGv_vec b)
2849 {
2850     TCGv_vec t = tcg_temp_new_vec_matching(d);
2851 
2852     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
2853     tcg_gen_and_vec(vece, t, t, b);
2854     tcg_gen_shlv_vec(vece, d, a, t);
2855     tcg_temp_free_vec(t);
2856 }
2857 
2858 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2859 {
2860     TCGv_i32 t = tcg_temp_new_i32();
2861 
2862     tcg_gen_andi_i32(t, b, 31);
2863     tcg_gen_shl_i32(d, a, t);
2864     tcg_temp_free_i32(t);
2865 }
2866 
2867 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2868 {
2869     TCGv_i64 t = tcg_temp_new_i64();
2870 
2871     tcg_gen_andi_i64(t, b, 63);
2872     tcg_gen_shl_i64(d, a, t);
2873     tcg_temp_free_i64(t);
2874 }
2875 
2876 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
2877                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2878 {
2879     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
2880     static const GVecGen3 g[4] = {
2881         { .fniv = tcg_gen_shlv_mod_vec,
2882           .fno = gen_helper_gvec_shl8v,
2883           .opt_opc = vecop_list,
2884           .vece = MO_8 },
2885         { .fniv = tcg_gen_shlv_mod_vec,
2886           .fno = gen_helper_gvec_shl16v,
2887           .opt_opc = vecop_list,
2888           .vece = MO_16 },
2889         { .fni4 = tcg_gen_shl_mod_i32,
2890           .fniv = tcg_gen_shlv_mod_vec,
2891           .fno = gen_helper_gvec_shl32v,
2892           .opt_opc = vecop_list,
2893           .vece = MO_32 },
2894         { .fni8 = tcg_gen_shl_mod_i64,
2895           .fniv = tcg_gen_shlv_mod_vec,
2896           .fno = gen_helper_gvec_shl64v,
2897           .opt_opc = vecop_list,
2898           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2899           .vece = MO_64 },
2900     };
2901 
2902     tcg_debug_assert(vece <= MO_64);
2903     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2904 }
2905 
2906 /*
2907  * Similarly for logical right shifts.
2908  */
2909 
2910 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
2911                                  TCGv_vec a, TCGv_vec b)
2912 {
2913     TCGv_vec t = tcg_temp_new_vec_matching(d);
2914 
2915     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
2916     tcg_gen_and_vec(vece, t, t, b);
2917     tcg_gen_shrv_vec(vece, d, a, t);
2918     tcg_temp_free_vec(t);
2919 }
2920 
2921 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2922 {
2923     TCGv_i32 t = tcg_temp_new_i32();
2924 
2925     tcg_gen_andi_i32(t, b, 31);
2926     tcg_gen_shr_i32(d, a, t);
2927     tcg_temp_free_i32(t);
2928 }
2929 
2930 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2931 {
2932     TCGv_i64 t = tcg_temp_new_i64();
2933 
2934     tcg_gen_andi_i64(t, b, 63);
2935     tcg_gen_shr_i64(d, a, t);
2936     tcg_temp_free_i64(t);
2937 }
2938 
2939 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
2940                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2941 {
2942     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
2943     static const GVecGen3 g[4] = {
2944         { .fniv = tcg_gen_shrv_mod_vec,
2945           .fno = gen_helper_gvec_shr8v,
2946           .opt_opc = vecop_list,
2947           .vece = MO_8 },
2948         { .fniv = tcg_gen_shrv_mod_vec,
2949           .fno = gen_helper_gvec_shr16v,
2950           .opt_opc = vecop_list,
2951           .vece = MO_16 },
2952         { .fni4 = tcg_gen_shr_mod_i32,
2953           .fniv = tcg_gen_shrv_mod_vec,
2954           .fno = gen_helper_gvec_shr32v,
2955           .opt_opc = vecop_list,
2956           .vece = MO_32 },
2957         { .fni8 = tcg_gen_shr_mod_i64,
2958           .fniv = tcg_gen_shrv_mod_vec,
2959           .fno = gen_helper_gvec_shr64v,
2960           .opt_opc = vecop_list,
2961           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2962           .vece = MO_64 },
2963     };
2964 
2965     tcg_debug_assert(vece <= MO_64);
2966     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2967 }
2968 
2969 /*
2970  * Similarly for arithmetic right shifts.
2971  */
2972 
2973 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
2974                                  TCGv_vec a, TCGv_vec b)
2975 {
2976     TCGv_vec t = tcg_temp_new_vec_matching(d);
2977 
2978     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
2979     tcg_gen_and_vec(vece, t, t, b);
2980     tcg_gen_sarv_vec(vece, d, a, t);
2981     tcg_temp_free_vec(t);
2982 }
2983 
2984 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2985 {
2986     TCGv_i32 t = tcg_temp_new_i32();
2987 
2988     tcg_gen_andi_i32(t, b, 31);
2989     tcg_gen_sar_i32(d, a, t);
2990     tcg_temp_free_i32(t);
2991 }
2992 
2993 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2994 {
2995     TCGv_i64 t = tcg_temp_new_i64();
2996 
2997     tcg_gen_andi_i64(t, b, 63);
2998     tcg_gen_sar_i64(d, a, t);
2999     tcg_temp_free_i64(t);
3000 }
3001 
3002 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3003                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3004 {
3005     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3006     static const GVecGen3 g[4] = {
3007         { .fniv = tcg_gen_sarv_mod_vec,
3008           .fno = gen_helper_gvec_sar8v,
3009           .opt_opc = vecop_list,
3010           .vece = MO_8 },
3011         { .fniv = tcg_gen_sarv_mod_vec,
3012           .fno = gen_helper_gvec_sar16v,
3013           .opt_opc = vecop_list,
3014           .vece = MO_16 },
3015         { .fni4 = tcg_gen_sar_mod_i32,
3016           .fniv = tcg_gen_sarv_mod_vec,
3017           .fno = gen_helper_gvec_sar32v,
3018           .opt_opc = vecop_list,
3019           .vece = MO_32 },
3020         { .fni8 = tcg_gen_sar_mod_i64,
3021           .fniv = tcg_gen_sarv_mod_vec,
3022           .fno = gen_helper_gvec_sar64v,
3023           .opt_opc = vecop_list,
3024           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3025           .vece = MO_64 },
3026     };
3027 
3028     tcg_debug_assert(vece <= MO_64);
3029     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3030 }
3031 
3032 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
3033 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3034                            uint32_t oprsz, TCGCond cond)
3035 {
3036     TCGv_i32 t0 = tcg_temp_new_i32();
3037     TCGv_i32 t1 = tcg_temp_new_i32();
3038     uint32_t i;
3039 
3040     for (i = 0; i < oprsz; i += 4) {
3041         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
3042         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
3043         tcg_gen_setcond_i32(cond, t0, t0, t1);
3044         tcg_gen_neg_i32(t0, t0);
3045         tcg_gen_st_i32(t0, cpu_env, dofs + i);
3046     }
3047     tcg_temp_free_i32(t1);
3048     tcg_temp_free_i32(t0);
3049 }
3050 
3051 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3052                            uint32_t oprsz, TCGCond cond)
3053 {
3054     TCGv_i64 t0 = tcg_temp_new_i64();
3055     TCGv_i64 t1 = tcg_temp_new_i64();
3056     uint32_t i;
3057 
3058     for (i = 0; i < oprsz; i += 8) {
3059         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
3060         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
3061         tcg_gen_setcond_i64(cond, t0, t0, t1);
3062         tcg_gen_neg_i64(t0, t0);
3063         tcg_gen_st_i64(t0, cpu_env, dofs + i);
3064     }
3065     tcg_temp_free_i64(t1);
3066     tcg_temp_free_i64(t0);
3067 }
3068 
3069 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3070                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3071                            TCGType type, TCGCond cond)
3072 {
3073     TCGv_vec t0 = tcg_temp_new_vec(type);
3074     TCGv_vec t1 = tcg_temp_new_vec(type);
3075     uint32_t i;
3076 
3077     for (i = 0; i < oprsz; i += tysz) {
3078         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3079         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
3080         tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
3081         tcg_gen_st_vec(t0, cpu_env, dofs + i);
3082     }
3083     tcg_temp_free_vec(t1);
3084     tcg_temp_free_vec(t0);
3085 }
3086 
/*
 * Expand an element-wise comparison D = (A cond B) ? -1 : 0 over
 * OPRSZ bytes, preferring host vector ops, then inline integral
 * expansion, then an out-of-line helper.
 */
void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
                      uint32_t aofs, uint32_t bofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
    static gen_helper_gvec_3 * const eq_fn[4] = {
        gen_helper_gvec_eq8, gen_helper_gvec_eq16,
        gen_helper_gvec_eq32, gen_helper_gvec_eq64
    };
    static gen_helper_gvec_3 * const ne_fn[4] = {
        gen_helper_gvec_ne8, gen_helper_gvec_ne16,
        gen_helper_gvec_ne32, gen_helper_gvec_ne64
    };
    static gen_helper_gvec_3 * const lt_fn[4] = {
        gen_helper_gvec_lt8, gen_helper_gvec_lt16,
        gen_helper_gvec_lt32, gen_helper_gvec_lt64
    };
    static gen_helper_gvec_3 * const le_fn[4] = {
        gen_helper_gvec_le8, gen_helper_gvec_le16,
        gen_helper_gvec_le32, gen_helper_gvec_le64
    };
    static gen_helper_gvec_3 * const ltu_fn[4] = {
        gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
        gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
    };
    static gen_helper_gvec_3 * const leu_fn[4] = {
        gen_helper_gvec_leu8, gen_helper_gvec_leu16,
        gen_helper_gvec_leu32, gen_helper_gvec_leu64
    };
    /* Helper tables indexed by condition; GT/GE/GTU/GEU entries are
       NULL and are handled below by swapping operands.  */
    static gen_helper_gvec_3 * const * const fns[16] = {
        [TCG_COND_EQ] = eq_fn,
        [TCG_COND_NE] = ne_fn,
        [TCG_COND_LT] = lt_fn,
        [TCG_COND_LE] = le_fn,
        [TCG_COND_LTU] = ltu_fn,
        [TCG_COND_LEU] = leu_fn,
    };

    const TCGOpcode *hold_list;
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    /* Degenerate conditions: fill with all-zeros or all-ones.  */
    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
        do_dup(MO_8, dofs, oprsz, maxsz,
               NULL, NULL, -(cond == TCG_COND_ALWAYS));
        return;
    }

    /*
     * Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and 64-bit comparison.
     */
    hold_list = tcg_swap_vecop_list(cmp_list);
    type = choose_vector_type(cmp_list, vece, oprsz,
                              TCG_TARGET_REG_BITS == 64 && vece == MO_64);
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
        break;
    case TCG_TYPE_V64:
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
        break;

    case 0:
        /* No vector support: fall back to integral or out-of-line.  */
        if (vece == MO_64 && check_size_impl(oprsz, 8)) {
            expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
        } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
            expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
        } else {
            gen_helper_gvec_3 * const *fn = fns[cond];

            if (fn == NULL) {
                /* No helper for this condition; swap the operands and
                   use the reversed condition, which does have one.  */
                uint32_t tmp;
                tmp = aofs, aofs = bofs, bofs = tmp;
                cond = tcg_swap_cond(cond);
                fn = fns[cond];
                assert(fn != NULL);
            }
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
            /* The helper clears the tail; skip expand_clr below.  */
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
3198 
3199 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3200 {
3201     TCGv_i64 t = tcg_temp_new_i64();
3202 
3203     tcg_gen_and_i64(t, b, a);
3204     tcg_gen_andc_i64(d, c, a);
3205     tcg_gen_or_i64(d, d, t);
3206     tcg_temp_free_i64(t);
3207 }
3208 
3209 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
3210                          uint32_t bofs, uint32_t cofs,
3211                          uint32_t oprsz, uint32_t maxsz)
3212 {
3213     static const GVecGen4 g = {
3214         .fni8 = tcg_gen_bitsel_i64,
3215         .fniv = tcg_gen_bitsel_vec,
3216         .fno = gen_helper_gvec_bitsel,
3217     };
3218 
3219     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
3220 }
3221