xref: /openbmc/qemu/tcg/tcg-op-gvec.c (revision 1da79ecc)
1 /*
2  * Generic vector operation expansion
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "tcg/tcg.h"
22 #include "tcg/tcg-op.h"
23 #include "tcg/tcg-op-gvec.h"
24 #include "qemu/main-loop.h"
25 #include "tcg/tcg-gvec-desc.h"
26 
/*
 * Upper bound on the operation count that check_size_impl() accepts
 * for an inline expansion.
 */
#define MAX_UNROLL  4

/*
 * Empty vector-opcode list: a one-entry array holding 0 when
 * CONFIG_DEBUG_TCG is defined, plain NULL otherwise.
 * NOTE(review): presumably the 0 entry acts as a list terminator - confirm.
 */
#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif
34 
35 
36 /* Verify vector size and alignment rules.  OFS should be the OR of all
37    of the operand offsets so that we can check them all at once.  */
38 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
39 {
40     uint32_t max_align;
41 
42     switch (oprsz) {
43     case 8:
44     case 16:
45     case 32:
46         tcg_debug_assert(oprsz <= maxsz);
47         break;
48     default:
49         tcg_debug_assert(oprsz == maxsz);
50         break;
51     }
52     tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
53 
54     max_align = maxsz >= 16 ? 15 : 7;
55     tcg_debug_assert((maxsz & max_align) == 0);
56     tcg_debug_assert((ofs & max_align) == 0);
57 }
58 
/*
 * Check the overlap rule for a pair of operands of S bytes each:
 * they must either be the very same region or not intersect at all.
 */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    bool identical = (d == a);
    bool d_ends_before_a = (d + s <= a);
    bool a_ends_before_d = (a + s <= d);

    tcg_debug_assert(identical || d_ends_before_a || a_ends_before_d);
}
64 
/* Check the overlap rule for every pair among three operands.  */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    const uint32_t operand[3] = { d, a, b };
    unsigned first, second;

    /* Visits (d,a), (d,b), (a,b) in that order.  */
    for (first = 0; first < 3; first++) {
        for (second = first + 1; second < 3; second++) {
            check_overlap_2(operand[first], operand[second], s);
        }
    }
}
72 
/* Check the overlap rule for every pair among four operands.  */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    const uint32_t operand[4] = { d, a, b, c };
    unsigned first, second;

    /* Visits (d,a), (d,b), (d,c), (a,b), (a,c), (b,c) in that order.  */
    for (first = 0; first < 4; first++) {
        for (second = first + 1; second < 4; second++) {
            check_overlap_2(operand[first], operand[second], s);
        }
    }
}
84 
85 /* Create a descriptor from components.  */
86 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
87 {
88     uint32_t desc = 0;
89 
90     check_size_align(oprsz, maxsz, 0);
91     tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS));
92 
93     oprsz = (oprsz / 8) - 1;
94     maxsz = (maxsz / 8) - 1;
95 
96     /*
97      * We have just asserted in check_size_align that either
98      * oprsz is {8,16,32} or matches maxsz.  Encode the final
99      * case with '2', as that would otherwise map to 24.
100      */
101     if (oprsz == maxsz) {
102         oprsz = 2;
103     }
104 
105     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
106     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
107     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
108 
109     return desc;
110 }
111 
112 /* Generate a call to a gvec-style helper with two vector operands.  */
113 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
114                         uint32_t oprsz, uint32_t maxsz, int32_t data,
115                         gen_helper_gvec_2 *fn)
116 {
117     TCGv_ptr a0, a1;
118     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
119 
120     a0 = tcg_temp_new_ptr();
121     a1 = tcg_temp_new_ptr();
122 
123     tcg_gen_addi_ptr(a0, cpu_env, dofs);
124     tcg_gen_addi_ptr(a1, cpu_env, aofs);
125 
126     fn(a0, a1, desc);
127 
128     tcg_temp_free_ptr(a0);
129     tcg_temp_free_ptr(a1);
130 }
131 
132 /* Generate a call to a gvec-style helper with two vector operands
133    and one scalar operand.  */
134 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
135                          uint32_t oprsz, uint32_t maxsz, int32_t data,
136                          gen_helper_gvec_2i *fn)
137 {
138     TCGv_ptr a0, a1;
139     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
140 
141     a0 = tcg_temp_new_ptr();
142     a1 = tcg_temp_new_ptr();
143 
144     tcg_gen_addi_ptr(a0, cpu_env, dofs);
145     tcg_gen_addi_ptr(a1, cpu_env, aofs);
146 
147     fn(a0, a1, c, desc);
148 
149     tcg_temp_free_ptr(a0);
150     tcg_temp_free_ptr(a1);
151 }
152 
153 /* Generate a call to a gvec-style helper with three vector operands.  */
154 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
155                         uint32_t oprsz, uint32_t maxsz, int32_t data,
156                         gen_helper_gvec_3 *fn)
157 {
158     TCGv_ptr a0, a1, a2;
159     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
160 
161     a0 = tcg_temp_new_ptr();
162     a1 = tcg_temp_new_ptr();
163     a2 = tcg_temp_new_ptr();
164 
165     tcg_gen_addi_ptr(a0, cpu_env, dofs);
166     tcg_gen_addi_ptr(a1, cpu_env, aofs);
167     tcg_gen_addi_ptr(a2, cpu_env, bofs);
168 
169     fn(a0, a1, a2, desc);
170 
171     tcg_temp_free_ptr(a0);
172     tcg_temp_free_ptr(a1);
173     tcg_temp_free_ptr(a2);
174 }
175 
176 /* Generate a call to a gvec-style helper with four vector operands.  */
177 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
178                         uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
179                         int32_t data, gen_helper_gvec_4 *fn)
180 {
181     TCGv_ptr a0, a1, a2, a3;
182     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
183 
184     a0 = tcg_temp_new_ptr();
185     a1 = tcg_temp_new_ptr();
186     a2 = tcg_temp_new_ptr();
187     a3 = tcg_temp_new_ptr();
188 
189     tcg_gen_addi_ptr(a0, cpu_env, dofs);
190     tcg_gen_addi_ptr(a1, cpu_env, aofs);
191     tcg_gen_addi_ptr(a2, cpu_env, bofs);
192     tcg_gen_addi_ptr(a3, cpu_env, cofs);
193 
194     fn(a0, a1, a2, a3, desc);
195 
196     tcg_temp_free_ptr(a0);
197     tcg_temp_free_ptr(a1);
198     tcg_temp_free_ptr(a2);
199     tcg_temp_free_ptr(a3);
200 }
201 
202 /* Generate a call to a gvec-style helper with five vector operands.  */
203 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
204                         uint32_t cofs, uint32_t xofs, uint32_t oprsz,
205                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
206 {
207     TCGv_ptr a0, a1, a2, a3, a4;
208     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
209 
210     a0 = tcg_temp_new_ptr();
211     a1 = tcg_temp_new_ptr();
212     a2 = tcg_temp_new_ptr();
213     a3 = tcg_temp_new_ptr();
214     a4 = tcg_temp_new_ptr();
215 
216     tcg_gen_addi_ptr(a0, cpu_env, dofs);
217     tcg_gen_addi_ptr(a1, cpu_env, aofs);
218     tcg_gen_addi_ptr(a2, cpu_env, bofs);
219     tcg_gen_addi_ptr(a3, cpu_env, cofs);
220     tcg_gen_addi_ptr(a4, cpu_env, xofs);
221 
222     fn(a0, a1, a2, a3, a4, desc);
223 
224     tcg_temp_free_ptr(a0);
225     tcg_temp_free_ptr(a1);
226     tcg_temp_free_ptr(a2);
227     tcg_temp_free_ptr(a3);
228     tcg_temp_free_ptr(a4);
229 }
230 
231 /* Generate a call to a gvec-style helper with three vector operands
232    and an extra pointer operand.  */
233 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
234                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
235                         int32_t data, gen_helper_gvec_2_ptr *fn)
236 {
237     TCGv_ptr a0, a1;
238     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
239 
240     a0 = tcg_temp_new_ptr();
241     a1 = tcg_temp_new_ptr();
242 
243     tcg_gen_addi_ptr(a0, cpu_env, dofs);
244     tcg_gen_addi_ptr(a1, cpu_env, aofs);
245 
246     fn(a0, a1, ptr, desc);
247 
248     tcg_temp_free_ptr(a0);
249     tcg_temp_free_ptr(a1);
250 }
251 
252 /* Generate a call to a gvec-style helper with three vector operands
253    and an extra pointer operand.  */
254 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
255                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
256                         int32_t data, gen_helper_gvec_3_ptr *fn)
257 {
258     TCGv_ptr a0, a1, a2;
259     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
260 
261     a0 = tcg_temp_new_ptr();
262     a1 = tcg_temp_new_ptr();
263     a2 = tcg_temp_new_ptr();
264 
265     tcg_gen_addi_ptr(a0, cpu_env, dofs);
266     tcg_gen_addi_ptr(a1, cpu_env, aofs);
267     tcg_gen_addi_ptr(a2, cpu_env, bofs);
268 
269     fn(a0, a1, a2, ptr, desc);
270 
271     tcg_temp_free_ptr(a0);
272     tcg_temp_free_ptr(a1);
273     tcg_temp_free_ptr(a2);
274 }
275 
276 /* Generate a call to a gvec-style helper with four vector operands
277    and an extra pointer operand.  */
278 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
279                         uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
280                         uint32_t maxsz, int32_t data,
281                         gen_helper_gvec_4_ptr *fn)
282 {
283     TCGv_ptr a0, a1, a2, a3;
284     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
285 
286     a0 = tcg_temp_new_ptr();
287     a1 = tcg_temp_new_ptr();
288     a2 = tcg_temp_new_ptr();
289     a3 = tcg_temp_new_ptr();
290 
291     tcg_gen_addi_ptr(a0, cpu_env, dofs);
292     tcg_gen_addi_ptr(a1, cpu_env, aofs);
293     tcg_gen_addi_ptr(a2, cpu_env, bofs);
294     tcg_gen_addi_ptr(a3, cpu_env, cofs);
295 
296     fn(a0, a1, a2, a3, ptr, desc);
297 
298     tcg_temp_free_ptr(a0);
299     tcg_temp_free_ptr(a1);
300     tcg_temp_free_ptr(a2);
301     tcg_temp_free_ptr(a3);
302 }
303 
304 /* Generate a call to a gvec-style helper with five vector operands
305    and an extra pointer operand.  */
306 void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
307                         uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
308                         uint32_t oprsz, uint32_t maxsz, int32_t data,
309                         gen_helper_gvec_5_ptr *fn)
310 {
311     TCGv_ptr a0, a1, a2, a3, a4;
312     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
313 
314     a0 = tcg_temp_new_ptr();
315     a1 = tcg_temp_new_ptr();
316     a2 = tcg_temp_new_ptr();
317     a3 = tcg_temp_new_ptr();
318     a4 = tcg_temp_new_ptr();
319 
320     tcg_gen_addi_ptr(a0, cpu_env, dofs);
321     tcg_gen_addi_ptr(a1, cpu_env, aofs);
322     tcg_gen_addi_ptr(a2, cpu_env, bofs);
323     tcg_gen_addi_ptr(a3, cpu_env, cofs);
324     tcg_gen_addi_ptr(a4, cpu_env, eofs);
325 
326     fn(a0, a1, a2, a3, a4, ptr, desc);
327 
328     tcg_temp_free_ptr(a0);
329     tcg_temp_free_ptr(a1);
330     tcg_temp_free_ptr(a2);
331     tcg_temp_free_ptr(a3);
332     tcg_temp_free_ptr(a4);
333 }
334 
335 /* Return true if we want to implement something of OPRSZ bytes
336    in units of LNSZ.  This limits the expansion of inline code.  */
337 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
338 {
339     uint32_t q, r;
340 
341     if (oprsz < lnsz) {
342         return false;
343     }
344 
345     q = oprsz / lnsz;
346     r = oprsz % lnsz;
347     tcg_debug_assert((r & 7) == 0);
348 
349     if (lnsz < 16) {
350         /* For sizes below 16, accept no remainder. */
351         if (r != 0) {
352             return false;
353         }
354     } else {
355         /*
356          * Recall that ARM SVE allows vector sizes that are not a
357          * power of 2, but always a multiple of 16.  The intent is
358          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
359          * In addition, expand_clr needs to handle a multiple of 8.
360          * Thus we can handle the tail with one more operation per
361          * diminishing power of 2.
362          */
363         q += ctpop32(r);
364     }
365 
366     return q <= MAX_UNROLL;
367 }
368 
/* Forward declaration: do_dup_store() and do_dup() call this before its definition.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz);
370 
371 /* Duplicate C as per VECE.  */
372 uint64_t (dup_const)(unsigned vece, uint64_t c)
373 {
374     switch (vece) {
375     case MO_8:
376         return 0x0101010101010101ull * (uint8_t)c;
377     case MO_16:
378         return 0x0001000100010001ull * (uint16_t)c;
379     case MO_32:
380         return 0x0000000100000001ull * (uint32_t)c;
381     case MO_64:
382         return c;
383     default:
384         g_assert_not_reached();
385     }
386 }
387 
388 /* Duplicate IN into OUT as per VECE.  */
389 static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
390 {
391     switch (vece) {
392     case MO_8:
393         tcg_gen_ext8u_i32(out, in);
394         tcg_gen_muli_i32(out, out, 0x01010101);
395         break;
396     case MO_16:
397         tcg_gen_deposit_i32(out, in, in, 16, 16);
398         break;
399     case MO_32:
400         tcg_gen_mov_i32(out, in);
401         break;
402     default:
403         g_assert_not_reached();
404     }
405 }
406 
407 static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
408 {
409     switch (vece) {
410     case MO_8:
411         tcg_gen_ext8u_i64(out, in);
412         tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
413         break;
414     case MO_16:
415         tcg_gen_ext16u_i64(out, in);
416         tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
417         break;
418     case MO_32:
419         tcg_gen_deposit_i64(out, in, in, 32, 32);
420         break;
421     case MO_64:
422         tcg_gen_mov_i64(out, in);
423         break;
424     default:
425         g_assert_not_reached();
426     }
427 }
428 
/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  LIST is handed to tcg_can_emit_vecop_list() together with VECE
 * for each candidate type, to make sure that the listed operations can be
 * performed on elements of size VECE in the selected type.
 * NOTE(review): an earlier wording of this comment spoke of "OP is 0";
 * presumably a NULL LIST (as passed by do_dup) now plays that role and
 * means the real operation is required by all backends -- confirm
 * against tcg_can_emit_vecop_list().
 * Do not select V64 if PREFER_I64 is true.  Return 0 if no vector type
 * is selected.
 *
 * Candidates are tried from widest to narrowest: V256, V128, then V64.
 */
static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
                                  uint32_t size, bool prefer_i64)
{
    /*
     * Recall that ARM SVE allows vector sizes that are not a
     * power of 2, but always a multiple of 16.  The intent is
     * that e.g. size == 80 would be expanded with 2x32 + 1x16.
     * It is hard to imagine a case in which v256 is supported
     * but v128 is not, but check anyway.
     * In addition, expand_clr needs to handle a multiple of 8.
     */
    /*
     * V256 is acceptable only if any 16-byte and 8-byte pieces of SIZE
     * can also be handled by V128 and V64 respectively.
     */
    if (TCG_TARGET_HAS_v256 &&
        check_size_impl(size, 32) &&
        tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
        (!(size & 16) ||
         (TCG_TARGET_HAS_v128 &&
          tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
        (!(size & 8) ||
         (TCG_TARGET_HAS_v64 &&
          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
        return TCG_TYPE_V256;
    }
    /* Likewise V128 needs V64 when SIZE has an 8-byte piece.  */
    if (TCG_TARGET_HAS_v128 &&
        check_size_impl(size, 16) &&
        tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
        (!(size & 8) ||
         (TCG_TARGET_HAS_v64 &&
          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
        return TCG_TYPE_V128;
    }
    /* V64 is skipped entirely when the caller prefers an i64 expansion.  */
    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
        && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
        return TCG_TYPE_V64;
    }
    return 0;
}
471 
/*
 * Store the vector T_VEC repeatedly so as to fill OPRSZ bytes starting
 * at offset DOFS from cpu_env, using stores no wider than TYPE, and
 * then clear the bytes from OPRSZ up to MAXSZ via expand_clr().
 * OPRSZ must be at least 8.
 */
static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, TCGv_vec t_vec)
{
    /* Byte offset, relative to DOFS, of the next store.  */
    uint32_t i = 0;

    tcg_debug_assert(oprsz >= 8);

    /*
     * This may be expand_clr for the tail of an operation, e.g.
     * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
     * are misaligned wrt the maximum vector size, so do that first.
     */
    if (dofs & 8) {
        tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
        i += 8;
    }

    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        for (; i + 32 <= oprsz; i += 32) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
        }
        /* fallthru */
    case TCG_TYPE_V128:
        /* Also finishes whatever the 32-byte loop above left over.  */
        for (; i + 16 <= oprsz; i += 16) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
        }
        break;
    case TCG_TYPE_V64:
        for (; i < oprsz; i += 8) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
        }
        break;
    default:
        g_assert_not_reached();
    }

    /* Zero the remainder of the destination beyond the operation size.  */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
518 
/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 *
 * VECE selects the element size (MO_8 .. MO_64; at most MO_32 when
 * IN_32 is used).  Three strategies are tried in order:
 *   1. a host vector type chosen by choose_vector_type(), stored with
 *      do_dup_store();
 *   2. inline integer stores of 4 or 8 bytes, when check_size_impl()
 *      accepts the size;
 *   3. an out-of-line helper call (memset or one of the gvec_dup
 *      helpers).
 * On the inline integer path the bytes from OPRSZ up to MAXSZ are
 * cleared with expand_clr(); do_dup_store() does the same for the vector
 * path, and the out-of-line gvec_dup helpers are given both sizes in the
 * descriptor.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
            vece = MO_8;
        } else if (in_c == dup_const(MO_8, in_c)) {
            /* Every byte of the constant is the same: treat it as bytes.  */
            vece = MO_8;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(NULL, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            tcg_gen_dupi_vec(vece, t_vec, in_c);
        }
        /* do_dup_store also clears the bytes between oprsz and maxsz.  */
        do_dup_store(type, dofs, oprsz, maxsz, t_vec);
        tcg_temp_free_vec(t_vec);
        return;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        /* Exactly one of t_64 / t_32 is set below; it selects the store width.  */
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_constant_i64(in_c);
            } else {
                t_32 = tcg_constant_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);

    /*
     * This may be expand_clr for the tail of an operation, e.g.
     * oprsz == 8 && maxsz == 64.  The size of the clear is misaligned
     * wrt simd_desc and will assert.  Simply pass all replicated byte
     * stores through to memset.
     */
    if (oprsz == maxsz && vece == MO_8) {
        TCGv_ptr t_size = tcg_const_ptr(oprsz);
        TCGv_i32 t_val;

        if (in_32) {
            t_val = in_32;
        } else if (in_64) {
            /* Take the low 32 bits of the 64-bit input in a new temp.  */
            t_val = tcg_temp_new_i32();
            tcg_gen_extrl_i64_i32(t_val, in_64);
        } else {
            t_val = tcg_constant_i32(in_c);
        }
        gen_helper_memset(t_ptr, t_ptr, t_val, t_size);

        /* Only the temp allocated for the in_64 case is ours to free.  */
        if (in_64) {
            tcg_temp_free_i32(t_val);
        }
        tcg_temp_free_ptr(t_size);
        tcg_temp_free_ptr(t_ptr);
        return;
    }

    t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_constant_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
        }
    } else {
        /* Helpers indexed by vece: MO_8, MO_16, MO_32.  */
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else if (in_64) {
            t_32 = tcg_temp_new_i32();
            tcg_gen_extrl_i64_i32(t_32, in_64);
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        } else {
            /* in_c was replicated above; cut it back to a single element.  */
            if (vece == MO_8) {
                in_c &= 0xff;
            } else if (vece == MO_16) {
                in_c &= 0xffff;
            }
            t_32 = tcg_constant_i32(in_c);
            fns[vece](t_ptr, t_desc, t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    return;

    /* Reached only from the inline integer stores above.  */
 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
697 
698 /* Likewise, but with zero.  */
699 static void expand_clr(uint32_t dofs, uint32_t maxsz)
700 {
701     do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
702 }
703 
704 /* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
705 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
706                          bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
707 {
708     TCGv_i32 t0 = tcg_temp_new_i32();
709     TCGv_i32 t1 = tcg_temp_new_i32();
710     uint32_t i;
711 
712     for (i = 0; i < oprsz; i += 4) {
713         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
714         if (load_dest) {
715             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
716         }
717         fni(t1, t0);
718         tcg_gen_st_i32(t1, cpu_env, dofs + i);
719     }
720     tcg_temp_free_i32(t0);
721     tcg_temp_free_i32(t1);
722 }
723 
724 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
725                           int32_t c, bool load_dest,
726                           void (*fni)(TCGv_i32, TCGv_i32, int32_t))
727 {
728     TCGv_i32 t0 = tcg_temp_new_i32();
729     TCGv_i32 t1 = tcg_temp_new_i32();
730     uint32_t i;
731 
732     for (i = 0; i < oprsz; i += 4) {
733         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
734         if (load_dest) {
735             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
736         }
737         fni(t1, t0, c);
738         tcg_gen_st_i32(t1, cpu_env, dofs + i);
739     }
740     tcg_temp_free_i32(t0);
741     tcg_temp_free_i32(t1);
742 }
743 
744 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
745                           TCGv_i32 c, bool scalar_first,
746                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
747 {
748     TCGv_i32 t0 = tcg_temp_new_i32();
749     TCGv_i32 t1 = tcg_temp_new_i32();
750     uint32_t i;
751 
752     for (i = 0; i < oprsz; i += 4) {
753         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
754         if (scalar_first) {
755             fni(t1, c, t0);
756         } else {
757             fni(t1, t0, c);
758         }
759         tcg_gen_st_i32(t1, cpu_env, dofs + i);
760     }
761     tcg_temp_free_i32(t0);
762     tcg_temp_free_i32(t1);
763 }
764 
765 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
766 static void expand_3_i32(uint32_t dofs, uint32_t aofs,
767                          uint32_t bofs, uint32_t oprsz, bool load_dest,
768                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
769 {
770     TCGv_i32 t0 = tcg_temp_new_i32();
771     TCGv_i32 t1 = tcg_temp_new_i32();
772     TCGv_i32 t2 = tcg_temp_new_i32();
773     uint32_t i;
774 
775     for (i = 0; i < oprsz; i += 4) {
776         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
777         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
778         if (load_dest) {
779             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
780         }
781         fni(t2, t0, t1);
782         tcg_gen_st_i32(t2, cpu_env, dofs + i);
783     }
784     tcg_temp_free_i32(t2);
785     tcg_temp_free_i32(t1);
786     tcg_temp_free_i32(t0);
787 }
788 
789 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
790                           uint32_t oprsz, int32_t c, bool load_dest,
791                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
792 {
793     TCGv_i32 t0 = tcg_temp_new_i32();
794     TCGv_i32 t1 = tcg_temp_new_i32();
795     TCGv_i32 t2 = tcg_temp_new_i32();
796     uint32_t i;
797 
798     for (i = 0; i < oprsz; i += 4) {
799         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
800         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
801         if (load_dest) {
802             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
803         }
804         fni(t2, t0, t1, c);
805         tcg_gen_st_i32(t2, cpu_env, dofs + i);
806     }
807     tcg_temp_free_i32(t0);
808     tcg_temp_free_i32(t1);
809     tcg_temp_free_i32(t2);
810 }
811 
812 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
813 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
814                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
815                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
816 {
817     TCGv_i32 t0 = tcg_temp_new_i32();
818     TCGv_i32 t1 = tcg_temp_new_i32();
819     TCGv_i32 t2 = tcg_temp_new_i32();
820     TCGv_i32 t3 = tcg_temp_new_i32();
821     uint32_t i;
822 
823     for (i = 0; i < oprsz; i += 4) {
824         tcg_gen_ld_i32(t1, cpu_env, aofs + i);
825         tcg_gen_ld_i32(t2, cpu_env, bofs + i);
826         tcg_gen_ld_i32(t3, cpu_env, cofs + i);
827         fni(t0, t1, t2, t3);
828         tcg_gen_st_i32(t0, cpu_env, dofs + i);
829         if (write_aofs) {
830             tcg_gen_st_i32(t1, cpu_env, aofs + i);
831         }
832     }
833     tcg_temp_free_i32(t3);
834     tcg_temp_free_i32(t2);
835     tcg_temp_free_i32(t1);
836     tcg_temp_free_i32(t0);
837 }
838 
839 /* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
840 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
841                          bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
842 {
843     TCGv_i64 t0 = tcg_temp_new_i64();
844     TCGv_i64 t1 = tcg_temp_new_i64();
845     uint32_t i;
846 
847     for (i = 0; i < oprsz; i += 8) {
848         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
849         if (load_dest) {
850             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
851         }
852         fni(t1, t0);
853         tcg_gen_st_i64(t1, cpu_env, dofs + i);
854     }
855     tcg_temp_free_i64(t0);
856     tcg_temp_free_i64(t1);
857 }
858 
859 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
860                           int64_t c, bool load_dest,
861                           void (*fni)(TCGv_i64, TCGv_i64, int64_t))
862 {
863     TCGv_i64 t0 = tcg_temp_new_i64();
864     TCGv_i64 t1 = tcg_temp_new_i64();
865     uint32_t i;
866 
867     for (i = 0; i < oprsz; i += 8) {
868         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
869         if (load_dest) {
870             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
871         }
872         fni(t1, t0, c);
873         tcg_gen_st_i64(t1, cpu_env, dofs + i);
874     }
875     tcg_temp_free_i64(t0);
876     tcg_temp_free_i64(t1);
877 }
878 
879 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
880                           TCGv_i64 c, bool scalar_first,
881                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
882 {
883     TCGv_i64 t0 = tcg_temp_new_i64();
884     TCGv_i64 t1 = tcg_temp_new_i64();
885     uint32_t i;
886 
887     for (i = 0; i < oprsz; i += 8) {
888         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
889         if (scalar_first) {
890             fni(t1, c, t0);
891         } else {
892             fni(t1, t0, c);
893         }
894         tcg_gen_st_i64(t1, cpu_env, dofs + i);
895     }
896     tcg_temp_free_i64(t0);
897     tcg_temp_free_i64(t1);
898 }
899 
900 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
901 static void expand_3_i64(uint32_t dofs, uint32_t aofs,
902                          uint32_t bofs, uint32_t oprsz, bool load_dest,
903                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
904 {
905     TCGv_i64 t0 = tcg_temp_new_i64();
906     TCGv_i64 t1 = tcg_temp_new_i64();
907     TCGv_i64 t2 = tcg_temp_new_i64();
908     uint32_t i;
909 
910     for (i = 0; i < oprsz; i += 8) {
911         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
912         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
913         if (load_dest) {
914             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
915         }
916         fni(t2, t0, t1);
917         tcg_gen_st_i64(t2, cpu_env, dofs + i);
918     }
919     tcg_temp_free_i64(t2);
920     tcg_temp_free_i64(t1);
921     tcg_temp_free_i64(t0);
922 }
923 
924 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
925                           uint32_t oprsz, int64_t c, bool load_dest,
926                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
927 {
928     TCGv_i64 t0 = tcg_temp_new_i64();
929     TCGv_i64 t1 = tcg_temp_new_i64();
930     TCGv_i64 t2 = tcg_temp_new_i64();
931     uint32_t i;
932 
933     for (i = 0; i < oprsz; i += 8) {
934         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
935         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
936         if (load_dest) {
937             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
938         }
939         fni(t2, t0, t1, c);
940         tcg_gen_st_i64(t2, cpu_env, dofs + i);
941     }
942     tcg_temp_free_i64(t0);
943     tcg_temp_free_i64(t1);
944     tcg_temp_free_i64(t2);
945 }
946 
947 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
948 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
949                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
950                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
951 {
952     TCGv_i64 t0 = tcg_temp_new_i64();
953     TCGv_i64 t1 = tcg_temp_new_i64();
954     TCGv_i64 t2 = tcg_temp_new_i64();
955     TCGv_i64 t3 = tcg_temp_new_i64();
956     uint32_t i;
957 
958     for (i = 0; i < oprsz; i += 8) {
959         tcg_gen_ld_i64(t1, cpu_env, aofs + i);
960         tcg_gen_ld_i64(t2, cpu_env, bofs + i);
961         tcg_gen_ld_i64(t3, cpu_env, cofs + i);
962         fni(t0, t1, t2, t3);
963         tcg_gen_st_i64(t0, cpu_env, dofs + i);
964         if (write_aofs) {
965             tcg_gen_st_i64(t1, cpu_env, aofs + i);
966         }
967     }
968     tcg_temp_free_i64(t3);
969     tcg_temp_free_i64(t2);
970     tcg_temp_free_i64(t1);
971     tcg_temp_free_i64(t0);
972 }
973 
974 /* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
975 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
976                          uint32_t oprsz, uint32_t tysz, TCGType type,
977                          bool load_dest,
978                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
979 {
980     TCGv_vec t0 = tcg_temp_new_vec(type);
981     TCGv_vec t1 = tcg_temp_new_vec(type);
982     uint32_t i;
983 
984     for (i = 0; i < oprsz; i += tysz) {
985         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
986         if (load_dest) {
987             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
988         }
989         fni(vece, t1, t0);
990         tcg_gen_st_vec(t1, cpu_env, dofs + i);
991     }
992     tcg_temp_free_vec(t0);
993     tcg_temp_free_vec(t1);
994 }
995 
996 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
997    using host vectors.  */
998 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
999                           uint32_t oprsz, uint32_t tysz, TCGType type,
1000                           int64_t c, bool load_dest,
1001                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
1002 {
1003     TCGv_vec t0 = tcg_temp_new_vec(type);
1004     TCGv_vec t1 = tcg_temp_new_vec(type);
1005     uint32_t i;
1006 
1007     for (i = 0; i < oprsz; i += tysz) {
1008         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1009         if (load_dest) {
1010             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
1011         }
1012         fni(vece, t1, t0, c);
1013         tcg_gen_st_vec(t1, cpu_env, dofs + i);
1014     }
1015     tcg_temp_free_vec(t0);
1016     tcg_temp_free_vec(t1);
1017 }
1018 
1019 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1020                           uint32_t oprsz, uint32_t tysz, TCGType type,
1021                           TCGv_vec c, bool scalar_first,
1022                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1023 {
1024     TCGv_vec t0 = tcg_temp_new_vec(type);
1025     TCGv_vec t1 = tcg_temp_new_vec(type);
1026     uint32_t i;
1027 
1028     for (i = 0; i < oprsz; i += tysz) {
1029         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1030         if (scalar_first) {
1031             fni(vece, t1, c, t0);
1032         } else {
1033             fni(vece, t1, t0, c);
1034         }
1035         tcg_gen_st_vec(t1, cpu_env, dofs + i);
1036     }
1037     tcg_temp_free_vec(t0);
1038     tcg_temp_free_vec(t1);
1039 }
1040 
1041 /* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
1042 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1043                          uint32_t bofs, uint32_t oprsz,
1044                          uint32_t tysz, TCGType type, bool load_dest,
1045                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1046 {
1047     TCGv_vec t0 = tcg_temp_new_vec(type);
1048     TCGv_vec t1 = tcg_temp_new_vec(type);
1049     TCGv_vec t2 = tcg_temp_new_vec(type);
1050     uint32_t i;
1051 
1052     for (i = 0; i < oprsz; i += tysz) {
1053         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1054         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1055         if (load_dest) {
1056             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1057         }
1058         fni(vece, t2, t0, t1);
1059         tcg_gen_st_vec(t2, cpu_env, dofs + i);
1060     }
1061     tcg_temp_free_vec(t2);
1062     tcg_temp_free_vec(t1);
1063     tcg_temp_free_vec(t0);
1064 }
1065 
1066 /*
1067  * Expand OPSZ bytes worth of three-vector operands and an immediate operand
1068  * using host vectors.
1069  */
1070 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1071                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
1072                           TCGType type, int64_t c, bool load_dest,
1073                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
1074                                       int64_t))
1075 {
1076     TCGv_vec t0 = tcg_temp_new_vec(type);
1077     TCGv_vec t1 = tcg_temp_new_vec(type);
1078     TCGv_vec t2 = tcg_temp_new_vec(type);
1079     uint32_t i;
1080 
1081     for (i = 0; i < oprsz; i += tysz) {
1082         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1083         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1084         if (load_dest) {
1085             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1086         }
1087         fni(vece, t2, t0, t1, c);
1088         tcg_gen_st_vec(t2, cpu_env, dofs + i);
1089     }
1090     tcg_temp_free_vec(t0);
1091     tcg_temp_free_vec(t1);
1092     tcg_temp_free_vec(t2);
1093 }
1094 
1095 /* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
1096 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1097                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1098                          uint32_t tysz, TCGType type, bool write_aofs,
1099                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1100                                      TCGv_vec, TCGv_vec))
1101 {
1102     TCGv_vec t0 = tcg_temp_new_vec(type);
1103     TCGv_vec t1 = tcg_temp_new_vec(type);
1104     TCGv_vec t2 = tcg_temp_new_vec(type);
1105     TCGv_vec t3 = tcg_temp_new_vec(type);
1106     uint32_t i;
1107 
1108     for (i = 0; i < oprsz; i += tysz) {
1109         tcg_gen_ld_vec(t1, cpu_env, aofs + i);
1110         tcg_gen_ld_vec(t2, cpu_env, bofs + i);
1111         tcg_gen_ld_vec(t3, cpu_env, cofs + i);
1112         fni(vece, t0, t1, t2, t3);
1113         tcg_gen_st_vec(t0, cpu_env, dofs + i);
1114         if (write_aofs) {
1115             tcg_gen_st_vec(t1, cpu_env, aofs + i);
1116         }
1117     }
1118     tcg_temp_free_vec(t3);
1119     tcg_temp_free_vec(t2);
1120     tcg_temp_free_vec(t1);
1121     tcg_temp_free_vec(t0);
1122 }
1123 
1124 /* Expand a vector two-operand operation.  */
1125 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
1126                     uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
1127 {
1128     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1129     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1130     TCGType type;
1131     uint32_t some;
1132 
1133     check_size_align(oprsz, maxsz, dofs | aofs);
1134     check_overlap_2(dofs, aofs, maxsz);
1135 
1136     type = 0;
1137     if (g->fniv) {
1138         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1139     }
1140     switch (type) {
1141     case TCG_TYPE_V256:
1142         /* Recall that ARM SVE allows vector sizes that are not a
1143          * power of 2, but always a multiple of 16.  The intent is
1144          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1145          */
1146         some = QEMU_ALIGN_DOWN(oprsz, 32);
1147         expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1148                      g->load_dest, g->fniv);
1149         if (some == oprsz) {
1150             break;
1151         }
1152         dofs += some;
1153         aofs += some;
1154         oprsz -= some;
1155         maxsz -= some;
1156         /* fallthru */
1157     case TCG_TYPE_V128:
1158         expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1159                      g->load_dest, g->fniv);
1160         break;
1161     case TCG_TYPE_V64:
1162         expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1163                      g->load_dest, g->fniv);
1164         break;
1165 
1166     case 0:
1167         if (g->fni8 && check_size_impl(oprsz, 8)) {
1168             expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
1169         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1170             expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
1171         } else {
1172             assert(g->fno != NULL);
1173             tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
1174             oprsz = maxsz;
1175         }
1176         break;
1177 
1178     default:
1179         g_assert_not_reached();
1180     }
1181     tcg_swap_vecop_list(hold_list);
1182 
1183     if (oprsz < maxsz) {
1184         expand_clr(dofs + oprsz, maxsz - oprsz);
1185     }
1186 }
1187 
1188 /* Expand a vector operation with two vectors and an immediate.  */
1189 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1190                      uint32_t maxsz, int64_t c, const GVecGen2i *g)
1191 {
1192     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1193     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1194     TCGType type;
1195     uint32_t some;
1196 
1197     check_size_align(oprsz, maxsz, dofs | aofs);
1198     check_overlap_2(dofs, aofs, maxsz);
1199 
1200     type = 0;
1201     if (g->fniv) {
1202         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1203     }
1204     switch (type) {
1205     case TCG_TYPE_V256:
1206         /* Recall that ARM SVE allows vector sizes that are not a
1207          * power of 2, but always a multiple of 16.  The intent is
1208          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1209          */
1210         some = QEMU_ALIGN_DOWN(oprsz, 32);
1211         expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1212                       c, g->load_dest, g->fniv);
1213         if (some == oprsz) {
1214             break;
1215         }
1216         dofs += some;
1217         aofs += some;
1218         oprsz -= some;
1219         maxsz -= some;
1220         /* fallthru */
1221     case TCG_TYPE_V128:
1222         expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1223                       c, g->load_dest, g->fniv);
1224         break;
1225     case TCG_TYPE_V64:
1226         expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1227                       c, g->load_dest, g->fniv);
1228         break;
1229 
1230     case 0:
1231         if (g->fni8 && check_size_impl(oprsz, 8)) {
1232             expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1233         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1234             expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1235         } else {
1236             if (g->fno) {
1237                 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1238             } else {
1239                 TCGv_i64 tcg_c = tcg_constant_i64(c);
1240                 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1241                                     maxsz, c, g->fnoi);
1242             }
1243             oprsz = maxsz;
1244         }
1245         break;
1246 
1247     default:
1248         g_assert_not_reached();
1249     }
1250     tcg_swap_vecop_list(hold_list);
1251 
1252     if (oprsz < maxsz) {
1253         expand_clr(dofs + oprsz, maxsz - oprsz);
1254     }
1255 }
1256 
1257 /* Expand a vector operation with two vectors and a scalar.  */
1258 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1259                      uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1260 {
1261     TCGType type;
1262 
1263     check_size_align(oprsz, maxsz, dofs | aofs);
1264     check_overlap_2(dofs, aofs, maxsz);
1265 
1266     type = 0;
1267     if (g->fniv) {
1268         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1269     }
1270     if (type != 0) {
1271         const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1272         const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1273         TCGv_vec t_vec = tcg_temp_new_vec(type);
1274         uint32_t some;
1275 
1276         tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1277 
1278         switch (type) {
1279         case TCG_TYPE_V256:
1280             /* Recall that ARM SVE allows vector sizes that are not a
1281              * power of 2, but always a multiple of 16.  The intent is
1282              * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1283              */
1284             some = QEMU_ALIGN_DOWN(oprsz, 32);
1285             expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1286                           t_vec, g->scalar_first, g->fniv);
1287             if (some == oprsz) {
1288                 break;
1289             }
1290             dofs += some;
1291             aofs += some;
1292             oprsz -= some;
1293             maxsz -= some;
1294             /* fallthru */
1295 
1296         case TCG_TYPE_V128:
1297             expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1298                           t_vec, g->scalar_first, g->fniv);
1299             break;
1300 
1301         case TCG_TYPE_V64:
1302             expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1303                           t_vec, g->scalar_first, g->fniv);
1304             break;
1305 
1306         default:
1307             g_assert_not_reached();
1308         }
1309         tcg_temp_free_vec(t_vec);
1310         tcg_swap_vecop_list(hold_list);
1311     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1312         TCGv_i64 t64 = tcg_temp_new_i64();
1313 
1314         gen_dup_i64(g->vece, t64, c);
1315         expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1316         tcg_temp_free_i64(t64);
1317     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1318         TCGv_i32 t32 = tcg_temp_new_i32();
1319 
1320         tcg_gen_extrl_i64_i32(t32, c);
1321         gen_dup_i32(g->vece, t32, t32);
1322         expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1323         tcg_temp_free_i32(t32);
1324     } else {
1325         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1326         return;
1327     }
1328 
1329     if (oprsz < maxsz) {
1330         expand_clr(dofs + oprsz, maxsz - oprsz);
1331     }
1332 }
1333 
1334 /* Expand a vector three-operand operation.  */
1335 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1336                     uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1337 {
1338     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1339     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1340     TCGType type;
1341     uint32_t some;
1342 
1343     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1344     check_overlap_3(dofs, aofs, bofs, maxsz);
1345 
1346     type = 0;
1347     if (g->fniv) {
1348         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1349     }
1350     switch (type) {
1351     case TCG_TYPE_V256:
1352         /* Recall that ARM SVE allows vector sizes that are not a
1353          * power of 2, but always a multiple of 16.  The intent is
1354          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1355          */
1356         some = QEMU_ALIGN_DOWN(oprsz, 32);
1357         expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1358                      g->load_dest, g->fniv);
1359         if (some == oprsz) {
1360             break;
1361         }
1362         dofs += some;
1363         aofs += some;
1364         bofs += some;
1365         oprsz -= some;
1366         maxsz -= some;
1367         /* fallthru */
1368     case TCG_TYPE_V128:
1369         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1370                      g->load_dest, g->fniv);
1371         break;
1372     case TCG_TYPE_V64:
1373         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1374                      g->load_dest, g->fniv);
1375         break;
1376 
1377     case 0:
1378         if (g->fni8 && check_size_impl(oprsz, 8)) {
1379             expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1380         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1381             expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1382         } else {
1383             assert(g->fno != NULL);
1384             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
1385                                maxsz, g->data, g->fno);
1386             oprsz = maxsz;
1387         }
1388         break;
1389 
1390     default:
1391         g_assert_not_reached();
1392     }
1393     tcg_swap_vecop_list(hold_list);
1394 
1395     if (oprsz < maxsz) {
1396         expand_clr(dofs + oprsz, maxsz - oprsz);
1397     }
1398 }
1399 
1400 /* Expand a vector operation with three vectors and an immediate.  */
1401 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1402                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1403                      const GVecGen3i *g)
1404 {
1405     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1406     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1407     TCGType type;
1408     uint32_t some;
1409 
1410     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1411     check_overlap_3(dofs, aofs, bofs, maxsz);
1412 
1413     type = 0;
1414     if (g->fniv) {
1415         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1416     }
1417     switch (type) {
1418     case TCG_TYPE_V256:
1419         /*
1420          * Recall that ARM SVE allows vector sizes that are not a
1421          * power of 2, but always a multiple of 16.  The intent is
1422          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1423          */
1424         some = QEMU_ALIGN_DOWN(oprsz, 32);
1425         expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1426                       c, g->load_dest, g->fniv);
1427         if (some == oprsz) {
1428             break;
1429         }
1430         dofs += some;
1431         aofs += some;
1432         bofs += some;
1433         oprsz -= some;
1434         maxsz -= some;
1435         /* fallthru */
1436     case TCG_TYPE_V128:
1437         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1438                       c, g->load_dest, g->fniv);
1439         break;
1440     case TCG_TYPE_V64:
1441         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1442                       c, g->load_dest, g->fniv);
1443         break;
1444 
1445     case 0:
1446         if (g->fni8 && check_size_impl(oprsz, 8)) {
1447             expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
1448         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1449             expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
1450         } else {
1451             assert(g->fno != NULL);
1452             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
1453             oprsz = maxsz;
1454         }
1455         break;
1456 
1457     default:
1458         g_assert_not_reached();
1459     }
1460     tcg_swap_vecop_list(hold_list);
1461 
1462     if (oprsz < maxsz) {
1463         expand_clr(dofs + oprsz, maxsz - oprsz);
1464     }
1465 }
1466 
1467 /* Expand a vector four-operand operation.  */
1468 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1469                     uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1470 {
1471     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1472     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1473     TCGType type;
1474     uint32_t some;
1475 
1476     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1477     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1478 
1479     type = 0;
1480     if (g->fniv) {
1481         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1482     }
1483     switch (type) {
1484     case TCG_TYPE_V256:
1485         /* Recall that ARM SVE allows vector sizes that are not a
1486          * power of 2, but always a multiple of 16.  The intent is
1487          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1488          */
1489         some = QEMU_ALIGN_DOWN(oprsz, 32);
1490         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1491                      32, TCG_TYPE_V256, g->write_aofs, g->fniv);
1492         if (some == oprsz) {
1493             break;
1494         }
1495         dofs += some;
1496         aofs += some;
1497         bofs += some;
1498         cofs += some;
1499         oprsz -= some;
1500         maxsz -= some;
1501         /* fallthru */
1502     case TCG_TYPE_V128:
1503         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1504                      16, TCG_TYPE_V128, g->write_aofs, g->fniv);
1505         break;
1506     case TCG_TYPE_V64:
1507         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1508                      8, TCG_TYPE_V64, g->write_aofs, g->fniv);
1509         break;
1510 
1511     case 0:
1512         if (g->fni8 && check_size_impl(oprsz, 8)) {
1513             expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
1514                          g->write_aofs, g->fni8);
1515         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1516             expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
1517                          g->write_aofs, g->fni4);
1518         } else {
1519             assert(g->fno != NULL);
1520             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1521                                oprsz, maxsz, g->data, g->fno);
1522             oprsz = maxsz;
1523         }
1524         break;
1525 
1526     default:
1527         g_assert_not_reached();
1528     }
1529     tcg_swap_vecop_list(hold_list);
1530 
1531     if (oprsz < maxsz) {
1532         expand_clr(dofs + oprsz, maxsz - oprsz);
1533     }
1534 }
1535 
1536 /*
1537  * Expand specific vector operations.
1538  */
1539 
1540 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
1541 {
1542     tcg_gen_mov_vec(a, b);
1543 }
1544 
1545 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1546                       uint32_t oprsz, uint32_t maxsz)
1547 {
1548     static const GVecGen2 g = {
1549         .fni8 = tcg_gen_mov_i64,
1550         .fniv = vec_mov2,
1551         .fno = gen_helper_gvec_mov,
1552         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1553     };
1554     if (dofs != aofs) {
1555         tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1556     } else {
1557         check_size_align(oprsz, maxsz, dofs);
1558         if (oprsz < maxsz) {
1559             expand_clr(dofs + oprsz, maxsz - oprsz);
1560         }
1561     }
1562 }
1563 
1564 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
1565                           uint32_t maxsz, TCGv_i32 in)
1566 {
1567     check_size_align(oprsz, maxsz, dofs);
1568     tcg_debug_assert(vece <= MO_32);
1569     do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1570 }
1571 
1572 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
1573                           uint32_t maxsz, TCGv_i64 in)
1574 {
1575     check_size_align(oprsz, maxsz, dofs);
1576     tcg_debug_assert(vece <= MO_64);
1577     do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1578 }
1579 
1580 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1581                           uint32_t oprsz, uint32_t maxsz)
1582 {
1583     check_size_align(oprsz, maxsz, dofs);
1584     if (vece <= MO_64) {
1585         TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
1586         if (type != 0) {
1587             TCGv_vec t_vec = tcg_temp_new_vec(type);
1588             tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
1589             do_dup_store(type, dofs, oprsz, maxsz, t_vec);
1590             tcg_temp_free_vec(t_vec);
1591         } else if (vece <= MO_32) {
1592             TCGv_i32 in = tcg_temp_new_i32();
1593             switch (vece) {
1594             case MO_8:
1595                 tcg_gen_ld8u_i32(in, cpu_env, aofs);
1596                 break;
1597             case MO_16:
1598                 tcg_gen_ld16u_i32(in, cpu_env, aofs);
1599                 break;
1600             default:
1601                 tcg_gen_ld_i32(in, cpu_env, aofs);
1602                 break;
1603             }
1604             do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1605             tcg_temp_free_i32(in);
1606         } else {
1607             TCGv_i64 in = tcg_temp_new_i64();
1608             tcg_gen_ld_i64(in, cpu_env, aofs);
1609             do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1610             tcg_temp_free_i64(in);
1611         }
1612     } else if (vece == 4) {
1613         /* 128-bit duplicate.  */
1614         int i;
1615 
1616         tcg_debug_assert(oprsz >= 16);
1617         if (TCG_TARGET_HAS_v128) {
1618             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1619 
1620             tcg_gen_ld_vec(in, cpu_env, aofs);
1621             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1622                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1623             }
1624             tcg_temp_free_vec(in);
1625         } else {
1626             TCGv_i64 in0 = tcg_temp_new_i64();
1627             TCGv_i64 in1 = tcg_temp_new_i64();
1628 
1629             tcg_gen_ld_i64(in0, cpu_env, aofs);
1630             tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
1631             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1632                 tcg_gen_st_i64(in0, cpu_env, dofs + i);
1633                 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
1634             }
1635             tcg_temp_free_i64(in0);
1636             tcg_temp_free_i64(in1);
1637         }
1638         if (oprsz < maxsz) {
1639             expand_clr(dofs + oprsz, maxsz - oprsz);
1640         }
1641     } else if (vece == 5) {
1642         /* 256-bit duplicate.  */
1643         int i;
1644 
1645         tcg_debug_assert(oprsz >= 32);
1646         tcg_debug_assert(oprsz % 32 == 0);
1647         if (TCG_TARGET_HAS_v256) {
1648             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
1649 
1650             tcg_gen_ld_vec(in, cpu_env, aofs);
1651             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1652                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1653             }
1654             tcg_temp_free_vec(in);
1655         } else if (TCG_TARGET_HAS_v128) {
1656             TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
1657             TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
1658 
1659             tcg_gen_ld_vec(in0, cpu_env, aofs);
1660             tcg_gen_ld_vec(in1, cpu_env, aofs + 16);
1661             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1662                 tcg_gen_st_vec(in0, cpu_env, dofs + i);
1663                 tcg_gen_st_vec(in1, cpu_env, dofs + i + 16);
1664             }
1665             tcg_temp_free_vec(in0);
1666             tcg_temp_free_vec(in1);
1667         } else {
1668             TCGv_i64 in[4];
1669             int j;
1670 
1671             for (j = 0; j < 4; ++j) {
1672                 in[j] = tcg_temp_new_i64();
1673                 tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8);
1674             }
1675             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1676                 for (j = 0; j < 4; ++j) {
1677                     tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8);
1678                 }
1679             }
1680             for (j = 0; j < 4; ++j) {
1681                 tcg_temp_free_i64(in[j]);
1682             }
1683         }
1684         if (oprsz < maxsz) {
1685             expand_clr(dofs + oprsz, maxsz - oprsz);
1686         }
1687     } else {
1688         g_assert_not_reached();
1689     }
1690 }
1691 
/*
 * Validate the destination size and alignment, then forward to do_dup()
 * with the immediate X as the only source operand.
 */
void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);

    /* No i32 or i64 temporary is supplied; only the constant is used. */
    do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
}
1698 
1699 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1700                       uint32_t oprsz, uint32_t maxsz)
1701 {
1702     static const GVecGen2 g = {
1703         .fni8 = tcg_gen_not_i64,
1704         .fniv = tcg_gen_not_vec,
1705         .fno = gen_helper_gvec_not,
1706         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1707     };
1708     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1709 }
1710 
1711 /* Perform a vector addition using normal addition and a mask.  The mask
1712    should be the sign bit of each lane.  This 6-operation form is more
1713    efficient than separate additions when there are 4 or more lanes in
1714    the 64-bit operation.  */
1715 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1716 {
1717     TCGv_i64 t1 = tcg_temp_new_i64();
1718     TCGv_i64 t2 = tcg_temp_new_i64();
1719     TCGv_i64 t3 = tcg_temp_new_i64();
1720 
1721     tcg_gen_andc_i64(t1, a, m);
1722     tcg_gen_andc_i64(t2, b, m);
1723     tcg_gen_xor_i64(t3, a, b);
1724     tcg_gen_add_i64(d, t1, t2);
1725     tcg_gen_and_i64(t3, t3, m);
1726     tcg_gen_xor_i64(d, d, t3);
1727 
1728     tcg_temp_free_i64(t1);
1729     tcg_temp_free_i64(t2);
1730     tcg_temp_free_i64(t3);
1731 }
1732 
1733 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1734 {
1735     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
1736     gen_addv_mask(d, a, b, m);
1737 }
1738 
1739 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1740 {
1741     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
1742     gen_addv_mask(d, a, b, m);
1743 }
1744 
1745 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1746 {
1747     TCGv_i64 t1 = tcg_temp_new_i64();
1748     TCGv_i64 t2 = tcg_temp_new_i64();
1749 
1750     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1751     tcg_gen_add_i64(t2, a, b);
1752     tcg_gen_add_i64(t1, t1, b);
1753     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1754 
1755     tcg_temp_free_i64(t1);
1756     tcg_temp_free_i64(t2);
1757 }
1758 
1759 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1760 
1761 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1762                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1763 {
1764     static const GVecGen3 g[4] = {
1765         { .fni8 = tcg_gen_vec_add8_i64,
1766           .fniv = tcg_gen_add_vec,
1767           .fno = gen_helper_gvec_add8,
1768           .opt_opc = vecop_list_add,
1769           .vece = MO_8 },
1770         { .fni8 = tcg_gen_vec_add16_i64,
1771           .fniv = tcg_gen_add_vec,
1772           .fno = gen_helper_gvec_add16,
1773           .opt_opc = vecop_list_add,
1774           .vece = MO_16 },
1775         { .fni4 = tcg_gen_add_i32,
1776           .fniv = tcg_gen_add_vec,
1777           .fno = gen_helper_gvec_add32,
1778           .opt_opc = vecop_list_add,
1779           .vece = MO_32 },
1780         { .fni8 = tcg_gen_add_i64,
1781           .fniv = tcg_gen_add_vec,
1782           .fno = gen_helper_gvec_add64,
1783           .opt_opc = vecop_list_add,
1784           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1785           .vece = MO_64 },
1786     };
1787 
1788     tcg_debug_assert(vece <= MO_64);
1789     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1790 }
1791 
1792 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1793                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1794 {
1795     static const GVecGen2s g[4] = {
1796         { .fni8 = tcg_gen_vec_add8_i64,
1797           .fniv = tcg_gen_add_vec,
1798           .fno = gen_helper_gvec_adds8,
1799           .opt_opc = vecop_list_add,
1800           .vece = MO_8 },
1801         { .fni8 = tcg_gen_vec_add16_i64,
1802           .fniv = tcg_gen_add_vec,
1803           .fno = gen_helper_gvec_adds16,
1804           .opt_opc = vecop_list_add,
1805           .vece = MO_16 },
1806         { .fni4 = tcg_gen_add_i32,
1807           .fniv = tcg_gen_add_vec,
1808           .fno = gen_helper_gvec_adds32,
1809           .opt_opc = vecop_list_add,
1810           .vece = MO_32 },
1811         { .fni8 = tcg_gen_add_i64,
1812           .fniv = tcg_gen_add_vec,
1813           .fno = gen_helper_gvec_adds64,
1814           .opt_opc = vecop_list_add,
1815           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1816           .vece = MO_64 },
1817     };
1818 
1819     tcg_debug_assert(vece <= MO_64);
1820     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1821 }
1822 
/* Vector add of the immediate C: wrap it in a constant and use the scalar form.  */
void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_adds(vece, dofs, aofs, tcg_constant_i64(c), oprsz, maxsz);
}
1829 
1830 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
1831 
1832 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1833                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1834 {
1835     static const GVecGen2s g[4] = {
1836         { .fni8 = tcg_gen_vec_sub8_i64,
1837           .fniv = tcg_gen_sub_vec,
1838           .fno = gen_helper_gvec_subs8,
1839           .opt_opc = vecop_list_sub,
1840           .vece = MO_8 },
1841         { .fni8 = tcg_gen_vec_sub16_i64,
1842           .fniv = tcg_gen_sub_vec,
1843           .fno = gen_helper_gvec_subs16,
1844           .opt_opc = vecop_list_sub,
1845           .vece = MO_16 },
1846         { .fni4 = tcg_gen_sub_i32,
1847           .fniv = tcg_gen_sub_vec,
1848           .fno = gen_helper_gvec_subs32,
1849           .opt_opc = vecop_list_sub,
1850           .vece = MO_32 },
1851         { .fni8 = tcg_gen_sub_i64,
1852           .fniv = tcg_gen_sub_vec,
1853           .fno = gen_helper_gvec_subs64,
1854           .opt_opc = vecop_list_sub,
1855           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1856           .vece = MO_64 },
1857     };
1858 
1859     tcg_debug_assert(vece <= MO_64);
1860     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1861 }
1862 
1863 /* Perform a vector subtraction using normal subtraction and a mask.
1864    Compare gen_addv_mask above.  */
1865 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1866 {
1867     TCGv_i64 t1 = tcg_temp_new_i64();
1868     TCGv_i64 t2 = tcg_temp_new_i64();
1869     TCGv_i64 t3 = tcg_temp_new_i64();
1870 
1871     tcg_gen_or_i64(t1, a, m);
1872     tcg_gen_andc_i64(t2, b, m);
1873     tcg_gen_eqv_i64(t3, a, b);
1874     tcg_gen_sub_i64(d, t1, t2);
1875     tcg_gen_and_i64(t3, t3, m);
1876     tcg_gen_xor_i64(d, d, t3);
1877 
1878     tcg_temp_free_i64(t1);
1879     tcg_temp_free_i64(t2);
1880     tcg_temp_free_i64(t3);
1881 }
1882 
1883 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1884 {
1885     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
1886     gen_subv_mask(d, a, b, m);
1887 }
1888 
1889 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1890 {
1891     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
1892     gen_subv_mask(d, a, b, m);
1893 }
1894 
1895 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1896 {
1897     TCGv_i64 t1 = tcg_temp_new_i64();
1898     TCGv_i64 t2 = tcg_temp_new_i64();
1899 
1900     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1901     tcg_gen_sub_i64(t2, a, b);
1902     tcg_gen_sub_i64(t1, a, t1);
1903     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1904 
1905     tcg_temp_free_i64(t1);
1906     tcg_temp_free_i64(t2);
1907 }
1908 
1909 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1910                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1911 {
1912     static const GVecGen3 g[4] = {
1913         { .fni8 = tcg_gen_vec_sub8_i64,
1914           .fniv = tcg_gen_sub_vec,
1915           .fno = gen_helper_gvec_sub8,
1916           .opt_opc = vecop_list_sub,
1917           .vece = MO_8 },
1918         { .fni8 = tcg_gen_vec_sub16_i64,
1919           .fniv = tcg_gen_sub_vec,
1920           .fno = gen_helper_gvec_sub16,
1921           .opt_opc = vecop_list_sub,
1922           .vece = MO_16 },
1923         { .fni4 = tcg_gen_sub_i32,
1924           .fniv = tcg_gen_sub_vec,
1925           .fno = gen_helper_gvec_sub32,
1926           .opt_opc = vecop_list_sub,
1927           .vece = MO_32 },
1928         { .fni8 = tcg_gen_sub_i64,
1929           .fniv = tcg_gen_sub_vec,
1930           .fno = gen_helper_gvec_sub64,
1931           .opt_opc = vecop_list_sub,
1932           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1933           .vece = MO_64 },
1934     };
1935 
1936     tcg_debug_assert(vece <= MO_64);
1937     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1938 }
1939 
1940 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
1941 
1942 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
1943                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1944 {
1945     static const GVecGen3 g[4] = {
1946         { .fniv = tcg_gen_mul_vec,
1947           .fno = gen_helper_gvec_mul8,
1948           .opt_opc = vecop_list_mul,
1949           .vece = MO_8 },
1950         { .fniv = tcg_gen_mul_vec,
1951           .fno = gen_helper_gvec_mul16,
1952           .opt_opc = vecop_list_mul,
1953           .vece = MO_16 },
1954         { .fni4 = tcg_gen_mul_i32,
1955           .fniv = tcg_gen_mul_vec,
1956           .fno = gen_helper_gvec_mul32,
1957           .opt_opc = vecop_list_mul,
1958           .vece = MO_32 },
1959         { .fni8 = tcg_gen_mul_i64,
1960           .fniv = tcg_gen_mul_vec,
1961           .fno = gen_helper_gvec_mul64,
1962           .opt_opc = vecop_list_mul,
1963           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1964           .vece = MO_64 },
1965     };
1966 
1967     tcg_debug_assert(vece <= MO_64);
1968     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1969 }
1970 
1971 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
1972                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1973 {
1974     static const GVecGen2s g[4] = {
1975         { .fniv = tcg_gen_mul_vec,
1976           .fno = gen_helper_gvec_muls8,
1977           .opt_opc = vecop_list_mul,
1978           .vece = MO_8 },
1979         { .fniv = tcg_gen_mul_vec,
1980           .fno = gen_helper_gvec_muls16,
1981           .opt_opc = vecop_list_mul,
1982           .vece = MO_16 },
1983         { .fni4 = tcg_gen_mul_i32,
1984           .fniv = tcg_gen_mul_vec,
1985           .fno = gen_helper_gvec_muls32,
1986           .opt_opc = vecop_list_mul,
1987           .vece = MO_32 },
1988         { .fni8 = tcg_gen_mul_i64,
1989           .fniv = tcg_gen_mul_vec,
1990           .fno = gen_helper_gvec_muls64,
1991           .opt_opc = vecop_list_mul,
1992           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1993           .vece = MO_64 },
1994     };
1995 
1996     tcg_debug_assert(vece <= MO_64);
1997     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1998 }
1999 
/* Vector multiply by the immediate C: wrap it in a constant and use the scalar form.  */
void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_muls(vece, dofs, aofs, tcg_constant_i64(c), oprsz, maxsz);
}
2006 
2007 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2008                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2009 {
2010     static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
2011     static const GVecGen3 g[4] = {
2012         { .fniv = tcg_gen_ssadd_vec,
2013           .fno = gen_helper_gvec_ssadd8,
2014           .opt_opc = vecop_list,
2015           .vece = MO_8 },
2016         { .fniv = tcg_gen_ssadd_vec,
2017           .fno = gen_helper_gvec_ssadd16,
2018           .opt_opc = vecop_list,
2019           .vece = MO_16 },
2020         { .fniv = tcg_gen_ssadd_vec,
2021           .fno = gen_helper_gvec_ssadd32,
2022           .opt_opc = vecop_list,
2023           .vece = MO_32 },
2024         { .fniv = tcg_gen_ssadd_vec,
2025           .fno = gen_helper_gvec_ssadd64,
2026           .opt_opc = vecop_list,
2027           .vece = MO_64 },
2028     };
2029     tcg_debug_assert(vece <= MO_64);
2030     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2031 }
2032 
2033 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
2034                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2035 {
2036     static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
2037     static const GVecGen3 g[4] = {
2038         { .fniv = tcg_gen_sssub_vec,
2039           .fno = gen_helper_gvec_sssub8,
2040           .opt_opc = vecop_list,
2041           .vece = MO_8 },
2042         { .fniv = tcg_gen_sssub_vec,
2043           .fno = gen_helper_gvec_sssub16,
2044           .opt_opc = vecop_list,
2045           .vece = MO_16 },
2046         { .fniv = tcg_gen_sssub_vec,
2047           .fno = gen_helper_gvec_sssub32,
2048           .opt_opc = vecop_list,
2049           .vece = MO_32 },
2050         { .fniv = tcg_gen_sssub_vec,
2051           .fno = gen_helper_gvec_sssub64,
2052           .opt_opc = vecop_list,
2053           .vece = MO_64 },
2054     };
2055     tcg_debug_assert(vece <= MO_64);
2056     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2057 }
2058 
2059 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2060 {
2061     TCGv_i32 max = tcg_constant_i32(-1);
2062     tcg_gen_add_i32(d, a, b);
2063     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
2064 }
2065 
2066 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2067 {
2068     TCGv_i64 max = tcg_constant_i64(-1);
2069     tcg_gen_add_i64(d, a, b);
2070     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
2071 }
2072 
2073 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2074                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2075 {
2076     static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
2077     static const GVecGen3 g[4] = {
2078         { .fniv = tcg_gen_usadd_vec,
2079           .fno = gen_helper_gvec_usadd8,
2080           .opt_opc = vecop_list,
2081           .vece = MO_8 },
2082         { .fniv = tcg_gen_usadd_vec,
2083           .fno = gen_helper_gvec_usadd16,
2084           .opt_opc = vecop_list,
2085           .vece = MO_16 },
2086         { .fni4 = tcg_gen_usadd_i32,
2087           .fniv = tcg_gen_usadd_vec,
2088           .fno = gen_helper_gvec_usadd32,
2089           .opt_opc = vecop_list,
2090           .vece = MO_32 },
2091         { .fni8 = tcg_gen_usadd_i64,
2092           .fniv = tcg_gen_usadd_vec,
2093           .fno = gen_helper_gvec_usadd64,
2094           .opt_opc = vecop_list,
2095           .vece = MO_64 }
2096     };
2097     tcg_debug_assert(vece <= MO_64);
2098     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2099 }
2100 
2101 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2102 {
2103     TCGv_i32 min = tcg_constant_i32(0);
2104     tcg_gen_sub_i32(d, a, b);
2105     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
2106 }
2107 
2108 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2109 {
2110     TCGv_i64 min = tcg_constant_i64(0);
2111     tcg_gen_sub_i64(d, a, b);
2112     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
2113 }
2114 
2115 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
2116                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2117 {
2118     static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2119     static const GVecGen3 g[4] = {
2120         { .fniv = tcg_gen_ussub_vec,
2121           .fno = gen_helper_gvec_ussub8,
2122           .opt_opc = vecop_list,
2123           .vece = MO_8 },
2124         { .fniv = tcg_gen_ussub_vec,
2125           .fno = gen_helper_gvec_ussub16,
2126           .opt_opc = vecop_list,
2127           .vece = MO_16 },
2128         { .fni4 = tcg_gen_ussub_i32,
2129           .fniv = tcg_gen_ussub_vec,
2130           .fno = gen_helper_gvec_ussub32,
2131           .opt_opc = vecop_list,
2132           .vece = MO_32 },
2133         { .fni8 = tcg_gen_ussub_i64,
2134           .fniv = tcg_gen_ussub_vec,
2135           .fno = gen_helper_gvec_ussub64,
2136           .opt_opc = vecop_list,
2137           .vece = MO_64 }
2138     };
2139     tcg_debug_assert(vece <= MO_64);
2140     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2141 }
2142 
2143 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2144                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2145 {
2146     static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2147     static const GVecGen3 g[4] = {
2148         { .fniv = tcg_gen_smin_vec,
2149           .fno = gen_helper_gvec_smin8,
2150           .opt_opc = vecop_list,
2151           .vece = MO_8 },
2152         { .fniv = tcg_gen_smin_vec,
2153           .fno = gen_helper_gvec_smin16,
2154           .opt_opc = vecop_list,
2155           .vece = MO_16 },
2156         { .fni4 = tcg_gen_smin_i32,
2157           .fniv = tcg_gen_smin_vec,
2158           .fno = gen_helper_gvec_smin32,
2159           .opt_opc = vecop_list,
2160           .vece = MO_32 },
2161         { .fni8 = tcg_gen_smin_i64,
2162           .fniv = tcg_gen_smin_vec,
2163           .fno = gen_helper_gvec_smin64,
2164           .opt_opc = vecop_list,
2165           .vece = MO_64 }
2166     };
2167     tcg_debug_assert(vece <= MO_64);
2168     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2169 }
2170 
2171 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2172                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2173 {
2174     static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2175     static const GVecGen3 g[4] = {
2176         { .fniv = tcg_gen_umin_vec,
2177           .fno = gen_helper_gvec_umin8,
2178           .opt_opc = vecop_list,
2179           .vece = MO_8 },
2180         { .fniv = tcg_gen_umin_vec,
2181           .fno = gen_helper_gvec_umin16,
2182           .opt_opc = vecop_list,
2183           .vece = MO_16 },
2184         { .fni4 = tcg_gen_umin_i32,
2185           .fniv = tcg_gen_umin_vec,
2186           .fno = gen_helper_gvec_umin32,
2187           .opt_opc = vecop_list,
2188           .vece = MO_32 },
2189         { .fni8 = tcg_gen_umin_i64,
2190           .fniv = tcg_gen_umin_vec,
2191           .fno = gen_helper_gvec_umin64,
2192           .opt_opc = vecop_list,
2193           .vece = MO_64 }
2194     };
2195     tcg_debug_assert(vece <= MO_64);
2196     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2197 }
2198 
2199 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2200                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2201 {
2202     static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2203     static const GVecGen3 g[4] = {
2204         { .fniv = tcg_gen_smax_vec,
2205           .fno = gen_helper_gvec_smax8,
2206           .opt_opc = vecop_list,
2207           .vece = MO_8 },
2208         { .fniv = tcg_gen_smax_vec,
2209           .fno = gen_helper_gvec_smax16,
2210           .opt_opc = vecop_list,
2211           .vece = MO_16 },
2212         { .fni4 = tcg_gen_smax_i32,
2213           .fniv = tcg_gen_smax_vec,
2214           .fno = gen_helper_gvec_smax32,
2215           .opt_opc = vecop_list,
2216           .vece = MO_32 },
2217         { .fni8 = tcg_gen_smax_i64,
2218           .fniv = tcg_gen_smax_vec,
2219           .fno = gen_helper_gvec_smax64,
2220           .opt_opc = vecop_list,
2221           .vece = MO_64 }
2222     };
2223     tcg_debug_assert(vece <= MO_64);
2224     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2225 }
2226 
2227 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2228                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2229 {
2230     static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2231     static const GVecGen3 g[4] = {
2232         { .fniv = tcg_gen_umax_vec,
2233           .fno = gen_helper_gvec_umax8,
2234           .opt_opc = vecop_list,
2235           .vece = MO_8 },
2236         { .fniv = tcg_gen_umax_vec,
2237           .fno = gen_helper_gvec_umax16,
2238           .opt_opc = vecop_list,
2239           .vece = MO_16 },
2240         { .fni4 = tcg_gen_umax_i32,
2241           .fniv = tcg_gen_umax_vec,
2242           .fno = gen_helper_gvec_umax32,
2243           .opt_opc = vecop_list,
2244           .vece = MO_32 },
2245         { .fni8 = tcg_gen_umax_i64,
2246           .fniv = tcg_gen_umax_vec,
2247           .fno = gen_helper_gvec_umax64,
2248           .opt_opc = vecop_list,
2249           .vece = MO_64 }
2250     };
2251     tcg_debug_assert(vece <= MO_64);
2252     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2253 }
2254 
2255 /* Perform a vector negation using normal negation and a mask.
2256    Compare gen_subv_mask above.  */
2257 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2258 {
2259     TCGv_i64 t2 = tcg_temp_new_i64();
2260     TCGv_i64 t3 = tcg_temp_new_i64();
2261 
2262     tcg_gen_andc_i64(t3, m, b);
2263     tcg_gen_andc_i64(t2, b, m);
2264     tcg_gen_sub_i64(d, m, t2);
2265     tcg_gen_xor_i64(d, d, t3);
2266 
2267     tcg_temp_free_i64(t2);
2268     tcg_temp_free_i64(t3);
2269 }
2270 
2271 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2272 {
2273     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2274     gen_negv_mask(d, b, m);
2275 }
2276 
2277 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2278 {
2279     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2280     gen_negv_mask(d, b, m);
2281 }
2282 
2283 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2284 {
2285     TCGv_i64 t1 = tcg_temp_new_i64();
2286     TCGv_i64 t2 = tcg_temp_new_i64();
2287 
2288     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2289     tcg_gen_neg_i64(t2, b);
2290     tcg_gen_neg_i64(t1, t1);
2291     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2292 
2293     tcg_temp_free_i64(t1);
2294     tcg_temp_free_i64(t2);
2295 }
2296 
2297 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2298                       uint32_t oprsz, uint32_t maxsz)
2299 {
2300     static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2301     static const GVecGen2 g[4] = {
2302         { .fni8 = tcg_gen_vec_neg8_i64,
2303           .fniv = tcg_gen_neg_vec,
2304           .fno = gen_helper_gvec_neg8,
2305           .opt_opc = vecop_list,
2306           .vece = MO_8 },
2307         { .fni8 = tcg_gen_vec_neg16_i64,
2308           .fniv = tcg_gen_neg_vec,
2309           .fno = gen_helper_gvec_neg16,
2310           .opt_opc = vecop_list,
2311           .vece = MO_16 },
2312         { .fni4 = tcg_gen_neg_i32,
2313           .fniv = tcg_gen_neg_vec,
2314           .fno = gen_helper_gvec_neg32,
2315           .opt_opc = vecop_list,
2316           .vece = MO_32 },
2317         { .fni8 = tcg_gen_neg_i64,
2318           .fniv = tcg_gen_neg_vec,
2319           .fno = gen_helper_gvec_neg64,
2320           .opt_opc = vecop_list,
2321           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2322           .vece = MO_64 },
2323     };
2324 
2325     tcg_debug_assert(vece <= MO_64);
2326     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2327 }
2328 
2329 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2330 {
2331     TCGv_i64 t = tcg_temp_new_i64();
2332     int nbit = 8 << vece;
2333 
2334     /* Create -1 for each negative element.  */
2335     tcg_gen_shri_i64(t, b, nbit - 1);
2336     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2337     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2338 
2339     /*
2340      * Invert (via xor -1) and add one.
2341      * Because of the ordering the msb is cleared,
2342      * so we never have carry into the next element.
2343      */
2344     tcg_gen_xor_i64(d, b, t);
2345     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2346     tcg_gen_add_i64(d, d, t);
2347 
2348     tcg_temp_free_i64(t);
2349 }
2350 
2351 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2352 {
2353     gen_absv_mask(d, b, MO_8);
2354 }
2355 
2356 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2357 {
2358     gen_absv_mask(d, b, MO_16);
2359 }
2360 
2361 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2362                       uint32_t oprsz, uint32_t maxsz)
2363 {
2364     static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2365     static const GVecGen2 g[4] = {
2366         { .fni8 = tcg_gen_vec_abs8_i64,
2367           .fniv = tcg_gen_abs_vec,
2368           .fno = gen_helper_gvec_abs8,
2369           .opt_opc = vecop_list,
2370           .vece = MO_8 },
2371         { .fni8 = tcg_gen_vec_abs16_i64,
2372           .fniv = tcg_gen_abs_vec,
2373           .fno = gen_helper_gvec_abs16,
2374           .opt_opc = vecop_list,
2375           .vece = MO_16 },
2376         { .fni4 = tcg_gen_abs_i32,
2377           .fniv = tcg_gen_abs_vec,
2378           .fno = gen_helper_gvec_abs32,
2379           .opt_opc = vecop_list,
2380           .vece = MO_32 },
2381         { .fni8 = tcg_gen_abs_i64,
2382           .fniv = tcg_gen_abs_vec,
2383           .fno = gen_helper_gvec_abs64,
2384           .opt_opc = vecop_list,
2385           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2386           .vece = MO_64 },
2387     };
2388 
2389     tcg_debug_assert(vece <= MO_64);
2390     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2391 }
2392 
2393 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2394                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2395 {
2396     static const GVecGen3 g = {
2397         .fni8 = tcg_gen_and_i64,
2398         .fniv = tcg_gen_and_vec,
2399         .fno = gen_helper_gvec_and,
2400         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2401     };
2402 
2403     if (aofs == bofs) {
2404         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2405     } else {
2406         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2407     }
2408 }
2409 
2410 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2411                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2412 {
2413     static const GVecGen3 g = {
2414         .fni8 = tcg_gen_or_i64,
2415         .fniv = tcg_gen_or_vec,
2416         .fno = gen_helper_gvec_or,
2417         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2418     };
2419 
2420     if (aofs == bofs) {
2421         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2422     } else {
2423         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2424     }
2425 }
2426 
2427 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2428                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2429 {
2430     static const GVecGen3 g = {
2431         .fni8 = tcg_gen_xor_i64,
2432         .fniv = tcg_gen_xor_vec,
2433         .fno = gen_helper_gvec_xor,
2434         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2435     };
2436 
2437     if (aofs == bofs) {
2438         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2439     } else {
2440         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2441     }
2442 }
2443 
2444 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2445                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2446 {
2447     static const GVecGen3 g = {
2448         .fni8 = tcg_gen_andc_i64,
2449         .fniv = tcg_gen_andc_vec,
2450         .fno = gen_helper_gvec_andc,
2451         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2452     };
2453 
2454     if (aofs == bofs) {
2455         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2456     } else {
2457         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2458     }
2459 }
2460 
2461 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2462                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2463 {
2464     static const GVecGen3 g = {
2465         .fni8 = tcg_gen_orc_i64,
2466         .fniv = tcg_gen_orc_vec,
2467         .fno = gen_helper_gvec_orc,
2468         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2469     };
2470 
2471     if (aofs == bofs) {
2472         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2473     } else {
2474         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2475     }
2476 }
2477 
2478 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2479                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2480 {
2481     static const GVecGen3 g = {
2482         .fni8 = tcg_gen_nand_i64,
2483         .fniv = tcg_gen_nand_vec,
2484         .fno = gen_helper_gvec_nand,
2485         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2486     };
2487 
2488     if (aofs == bofs) {
2489         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2490     } else {
2491         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2492     }
2493 }
2494 
2495 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2496                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2497 {
2498     static const GVecGen3 g = {
2499         .fni8 = tcg_gen_nor_i64,
2500         .fniv = tcg_gen_nor_vec,
2501         .fno = gen_helper_gvec_nor,
2502         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2503     };
2504 
2505     if (aofs == bofs) {
2506         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2507     } else {
2508         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2509     }
2510 }
2511 
2512 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2513                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2514 {
2515     static const GVecGen3 g = {
2516         .fni8 = tcg_gen_eqv_i64,
2517         .fniv = tcg_gen_eqv_vec,
2518         .fno = gen_helper_gvec_eqv,
2519         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2520     };
2521 
2522     if (aofs == bofs) {
2523         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2524     } else {
2525         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2526     }
2527 }
2528 
2529 static const GVecGen2s gop_ands = {
2530     .fni8 = tcg_gen_and_i64,
2531     .fniv = tcg_gen_and_vec,
2532     .fno = gen_helper_gvec_ands,
2533     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2534     .vece = MO_64
2535 };
2536 
2537 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2538                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2539 {
2540     TCGv_i64 tmp = tcg_temp_new_i64();
2541     gen_dup_i64(vece, tmp, c);
2542     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2543     tcg_temp_free_i64(tmp);
2544 }
2545 
2546 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2547                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2548 {
2549     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2550     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2551 }
2552 
2553 static const GVecGen2s gop_xors = {
2554     .fni8 = tcg_gen_xor_i64,
2555     .fniv = tcg_gen_xor_vec,
2556     .fno = gen_helper_gvec_xors,
2557     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2558     .vece = MO_64
2559 };
2560 
2561 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2562                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2563 {
2564     TCGv_i64 tmp = tcg_temp_new_i64();
2565     gen_dup_i64(vece, tmp, c);
2566     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2567     tcg_temp_free_i64(tmp);
2568 }
2569 
2570 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2571                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2572 {
2573     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2574     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2575 }
2576 
2577 static const GVecGen2s gop_ors = {
2578     .fni8 = tcg_gen_or_i64,
2579     .fniv = tcg_gen_or_vec,
2580     .fno = gen_helper_gvec_ors,
2581     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2582     .vece = MO_64
2583 };
2584 
2585 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2586                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2587 {
2588     TCGv_i64 tmp = tcg_temp_new_i64();
2589     gen_dup_i64(vece, tmp, c);
2590     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2591     tcg_temp_free_i64(tmp);
2592 }
2593 
2594 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2595                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2596 {
2597     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2598     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2599 }
2600 
2601 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2602 {
2603     uint64_t mask = dup_const(MO_8, 0xff << c);
2604     tcg_gen_shli_i64(d, a, c);
2605     tcg_gen_andi_i64(d, d, mask);
2606 }
2607 
2608 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2609 {
2610     uint64_t mask = dup_const(MO_16, 0xffff << c);
2611     tcg_gen_shli_i64(d, a, c);
2612     tcg_gen_andi_i64(d, d, mask);
2613 }
2614 
2615 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2616                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2617 {
2618     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2619     static const GVecGen2i g[4] = {
2620         { .fni8 = tcg_gen_vec_shl8i_i64,
2621           .fniv = tcg_gen_shli_vec,
2622           .fno = gen_helper_gvec_shl8i,
2623           .opt_opc = vecop_list,
2624           .vece = MO_8 },
2625         { .fni8 = tcg_gen_vec_shl16i_i64,
2626           .fniv = tcg_gen_shli_vec,
2627           .fno = gen_helper_gvec_shl16i,
2628           .opt_opc = vecop_list,
2629           .vece = MO_16 },
2630         { .fni4 = tcg_gen_shli_i32,
2631           .fniv = tcg_gen_shli_vec,
2632           .fno = gen_helper_gvec_shl32i,
2633           .opt_opc = vecop_list,
2634           .vece = MO_32 },
2635         { .fni8 = tcg_gen_shli_i64,
2636           .fniv = tcg_gen_shli_vec,
2637           .fno = gen_helper_gvec_shl64i,
2638           .opt_opc = vecop_list,
2639           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2640           .vece = MO_64 },
2641     };
2642 
2643     tcg_debug_assert(vece <= MO_64);
2644     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2645     if (shift == 0) {
2646         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2647     } else {
2648         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2649     }
2650 }
2651 
2652 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2653 {
2654     uint64_t mask = dup_const(MO_8, 0xff >> c);
2655     tcg_gen_shri_i64(d, a, c);
2656     tcg_gen_andi_i64(d, d, mask);
2657 }
2658 
2659 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2660 {
2661     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2662     tcg_gen_shri_i64(d, a, c);
2663     tcg_gen_andi_i64(d, d, mask);
2664 }
2665 
2666 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2667                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2668 {
2669     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2670     static const GVecGen2i g[4] = {
2671         { .fni8 = tcg_gen_vec_shr8i_i64,
2672           .fniv = tcg_gen_shri_vec,
2673           .fno = gen_helper_gvec_shr8i,
2674           .opt_opc = vecop_list,
2675           .vece = MO_8 },
2676         { .fni8 = tcg_gen_vec_shr16i_i64,
2677           .fniv = tcg_gen_shri_vec,
2678           .fno = gen_helper_gvec_shr16i,
2679           .opt_opc = vecop_list,
2680           .vece = MO_16 },
2681         { .fni4 = tcg_gen_shri_i32,
2682           .fniv = tcg_gen_shri_vec,
2683           .fno = gen_helper_gvec_shr32i,
2684           .opt_opc = vecop_list,
2685           .vece = MO_32 },
2686         { .fni8 = tcg_gen_shri_i64,
2687           .fniv = tcg_gen_shri_vec,
2688           .fno = gen_helper_gvec_shr64i,
2689           .opt_opc = vecop_list,
2690           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2691           .vece = MO_64 },
2692     };
2693 
2694     tcg_debug_assert(vece <= MO_64);
2695     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2696     if (shift == 0) {
2697         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2698     } else {
2699         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2700     }
2701 }
2702 
2703 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2704 {
2705     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2706     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2707     TCGv_i64 s = tcg_temp_new_i64();
2708 
2709     tcg_gen_shri_i64(d, a, c);
2710     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2711     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2712     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2713     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2714     tcg_temp_free_i64(s);
2715 }
2716 
2717 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2718 {
2719     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2720     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2721     TCGv_i64 s = tcg_temp_new_i64();
2722 
2723     tcg_gen_shri_i64(d, a, c);
2724     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2725     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2726     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2727     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2728     tcg_temp_free_i64(s);
2729 }
2730 
2731 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2732                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2733 {
2734     static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
2735     static const GVecGen2i g[4] = {
2736         { .fni8 = tcg_gen_vec_sar8i_i64,
2737           .fniv = tcg_gen_sari_vec,
2738           .fno = gen_helper_gvec_sar8i,
2739           .opt_opc = vecop_list,
2740           .vece = MO_8 },
2741         { .fni8 = tcg_gen_vec_sar16i_i64,
2742           .fniv = tcg_gen_sari_vec,
2743           .fno = gen_helper_gvec_sar16i,
2744           .opt_opc = vecop_list,
2745           .vece = MO_16 },
2746         { .fni4 = tcg_gen_sari_i32,
2747           .fniv = tcg_gen_sari_vec,
2748           .fno = gen_helper_gvec_sar32i,
2749           .opt_opc = vecop_list,
2750           .vece = MO_32 },
2751         { .fni8 = tcg_gen_sari_i64,
2752           .fniv = tcg_gen_sari_vec,
2753           .fno = gen_helper_gvec_sar64i,
2754           .opt_opc = vecop_list,
2755           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2756           .vece = MO_64 },
2757     };
2758 
2759     tcg_debug_assert(vece <= MO_64);
2760     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2761     if (shift == 0) {
2762         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2763     } else {
2764         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2765     }
2766 }
2767 
2768 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2769 {
2770     uint64_t mask = dup_const(MO_8, 0xff << c);
2771 
2772     tcg_gen_shli_i64(d, a, c);
2773     tcg_gen_shri_i64(a, a, 8 - c);
2774     tcg_gen_andi_i64(d, d, mask);
2775     tcg_gen_andi_i64(a, a, ~mask);
2776     tcg_gen_or_i64(d, d, a);
2777 }
2778 
2779 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2780 {
2781     uint64_t mask = dup_const(MO_16, 0xffff << c);
2782 
2783     tcg_gen_shli_i64(d, a, c);
2784     tcg_gen_shri_i64(a, a, 16 - c);
2785     tcg_gen_andi_i64(d, d, mask);
2786     tcg_gen_andi_i64(a, a, ~mask);
2787     tcg_gen_or_i64(d, d, a);
2788 }
2789 
2790 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
2791                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
2792 {
2793     static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
2794     static const GVecGen2i g[4] = {
2795         { .fni8 = tcg_gen_vec_rotl8i_i64,
2796           .fniv = tcg_gen_rotli_vec,
2797           .fno = gen_helper_gvec_rotl8i,
2798           .opt_opc = vecop_list,
2799           .vece = MO_8 },
2800         { .fni8 = tcg_gen_vec_rotl16i_i64,
2801           .fniv = tcg_gen_rotli_vec,
2802           .fno = gen_helper_gvec_rotl16i,
2803           .opt_opc = vecop_list,
2804           .vece = MO_16 },
2805         { .fni4 = tcg_gen_rotli_i32,
2806           .fniv = tcg_gen_rotli_vec,
2807           .fno = gen_helper_gvec_rotl32i,
2808           .opt_opc = vecop_list,
2809           .vece = MO_32 },
2810         { .fni8 = tcg_gen_rotli_i64,
2811           .fniv = tcg_gen_rotli_vec,
2812           .fno = gen_helper_gvec_rotl64i,
2813           .opt_opc = vecop_list,
2814           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2815           .vece = MO_64 },
2816     };
2817 
2818     tcg_debug_assert(vece <= MO_64);
2819     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2820     if (shift == 0) {
2821         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2822     } else {
2823         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2824     }
2825 }
2826 
2827 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
2828                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
2829 {
2830     tcg_debug_assert(vece <= MO_64);
2831     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2832     tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
2833                        oprsz, maxsz);
2834 }
2835 
2836 /*
2837  * Specialized generation vector shifts by a non-constant scalar.
2838  */
2839 
2840 typedef struct {
2841     void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
2842     void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
2843     void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
2844     void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
2845     gen_helper_gvec_2 *fno[4];
2846     TCGOpcode s_list[2];
2847     TCGOpcode v_list[2];
2848 } GVecGen2sh;
2849 
2850 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2851                            uint32_t oprsz, uint32_t tysz, TCGType type,
2852                            TCGv_i32 shift,
2853                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
2854 {
2855     TCGv_vec t0 = tcg_temp_new_vec(type);
2856     uint32_t i;
2857 
2858     for (i = 0; i < oprsz; i += tysz) {
2859         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2860         fni(vece, t0, t0, shift);
2861         tcg_gen_st_vec(t0, cpu_env, dofs + i);
2862     }
2863     tcg_temp_free_vec(t0);
2864 }
2865 
2866 static void
2867 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
2868                uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
2869 {
2870     TCGType type;
2871     uint32_t some;
2872 
2873     check_size_align(oprsz, maxsz, dofs | aofs);
2874     check_overlap_2(dofs, aofs, maxsz);
2875 
2876     /* If the backend has a scalar expansion, great.  */
2877     type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
2878     if (type) {
2879         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2880         switch (type) {
2881         case TCG_TYPE_V256:
2882             some = QEMU_ALIGN_DOWN(oprsz, 32);
2883             expand_2sh_vec(vece, dofs, aofs, some, 32,
2884                            TCG_TYPE_V256, shift, g->fniv_s);
2885             if (some == oprsz) {
2886                 break;
2887             }
2888             dofs += some;
2889             aofs += some;
2890             oprsz -= some;
2891             maxsz -= some;
2892             /* fallthru */
2893         case TCG_TYPE_V128:
2894             expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
2895                            TCG_TYPE_V128, shift, g->fniv_s);
2896             break;
2897         case TCG_TYPE_V64:
2898             expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
2899                            TCG_TYPE_V64, shift, g->fniv_s);
2900             break;
2901         default:
2902             g_assert_not_reached();
2903         }
2904         tcg_swap_vecop_list(hold_list);
2905         goto clear_tail;
2906     }
2907 
2908     /* If the backend supports variable vector shifts, also cool.  */
2909     type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
2910     if (type) {
2911         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2912         TCGv_vec v_shift = tcg_temp_new_vec(type);
2913 
2914         if (vece == MO_64) {
2915             TCGv_i64 sh64 = tcg_temp_new_i64();
2916             tcg_gen_extu_i32_i64(sh64, shift);
2917             tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
2918             tcg_temp_free_i64(sh64);
2919         } else {
2920             tcg_gen_dup_i32_vec(vece, v_shift, shift);
2921         }
2922 
2923         switch (type) {
2924         case TCG_TYPE_V256:
2925             some = QEMU_ALIGN_DOWN(oprsz, 32);
2926             expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
2927                           v_shift, false, g->fniv_v);
2928             if (some == oprsz) {
2929                 break;
2930             }
2931             dofs += some;
2932             aofs += some;
2933             oprsz -= some;
2934             maxsz -= some;
2935             /* fallthru */
2936         case TCG_TYPE_V128:
2937             expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
2938                           v_shift, false, g->fniv_v);
2939             break;
2940         case TCG_TYPE_V64:
2941             expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
2942                           v_shift, false, g->fniv_v);
2943             break;
2944         default:
2945             g_assert_not_reached();
2946         }
2947         tcg_temp_free_vec(v_shift);
2948         tcg_swap_vecop_list(hold_list);
2949         goto clear_tail;
2950     }
2951 
2952     /* Otherwise fall back to integral... */
2953     if (vece == MO_32 && check_size_impl(oprsz, 4)) {
2954         expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
2955     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
2956         TCGv_i64 sh64 = tcg_temp_new_i64();
2957         tcg_gen_extu_i32_i64(sh64, shift);
2958         expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
2959         tcg_temp_free_i64(sh64);
2960     } else {
2961         TCGv_ptr a0 = tcg_temp_new_ptr();
2962         TCGv_ptr a1 = tcg_temp_new_ptr();
2963         TCGv_i32 desc = tcg_temp_new_i32();
2964 
2965         tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
2966         tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
2967         tcg_gen_addi_ptr(a0, cpu_env, dofs);
2968         tcg_gen_addi_ptr(a1, cpu_env, aofs);
2969 
2970         g->fno[vece](a0, a1, desc);
2971 
2972         tcg_temp_free_ptr(a0);
2973         tcg_temp_free_ptr(a1);
2974         tcg_temp_free_i32(desc);
2975         return;
2976     }
2977 
2978  clear_tail:
2979     if (oprsz < maxsz) {
2980         expand_clr(dofs + oprsz, maxsz - oprsz);
2981     }
2982 }
2983 
2984 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
2985                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2986 {
2987     static const GVecGen2sh g = {
2988         .fni4 = tcg_gen_shl_i32,
2989         .fni8 = tcg_gen_shl_i64,
2990         .fniv_s = tcg_gen_shls_vec,
2991         .fniv_v = tcg_gen_shlv_vec,
2992         .fno = {
2993             gen_helper_gvec_shl8i,
2994             gen_helper_gvec_shl16i,
2995             gen_helper_gvec_shl32i,
2996             gen_helper_gvec_shl64i,
2997         },
2998         .s_list = { INDEX_op_shls_vec, 0 },
2999         .v_list = { INDEX_op_shlv_vec, 0 },
3000     };
3001 
3002     tcg_debug_assert(vece <= MO_64);
3003     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3004 }
3005 
3006 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3007                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3008 {
3009     static const GVecGen2sh g = {
3010         .fni4 = tcg_gen_shr_i32,
3011         .fni8 = tcg_gen_shr_i64,
3012         .fniv_s = tcg_gen_shrs_vec,
3013         .fniv_v = tcg_gen_shrv_vec,
3014         .fno = {
3015             gen_helper_gvec_shr8i,
3016             gen_helper_gvec_shr16i,
3017             gen_helper_gvec_shr32i,
3018             gen_helper_gvec_shr64i,
3019         },
3020         .s_list = { INDEX_op_shrs_vec, 0 },
3021         .v_list = { INDEX_op_shrv_vec, 0 },
3022     };
3023 
3024     tcg_debug_assert(vece <= MO_64);
3025     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3026 }
3027 
3028 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
3029                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3030 {
3031     static const GVecGen2sh g = {
3032         .fni4 = tcg_gen_sar_i32,
3033         .fni8 = tcg_gen_sar_i64,
3034         .fniv_s = tcg_gen_sars_vec,
3035         .fniv_v = tcg_gen_sarv_vec,
3036         .fno = {
3037             gen_helper_gvec_sar8i,
3038             gen_helper_gvec_sar16i,
3039             gen_helper_gvec_sar32i,
3040             gen_helper_gvec_sar64i,
3041         },
3042         .s_list = { INDEX_op_sars_vec, 0 },
3043         .v_list = { INDEX_op_sarv_vec, 0 },
3044     };
3045 
3046     tcg_debug_assert(vece <= MO_64);
3047     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3048 }
3049 
3050 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
3051                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3052 {
3053     static const GVecGen2sh g = {
3054         .fni4 = tcg_gen_rotl_i32,
3055         .fni8 = tcg_gen_rotl_i64,
3056         .fniv_s = tcg_gen_rotls_vec,
3057         .fniv_v = tcg_gen_rotlv_vec,
3058         .fno = {
3059             gen_helper_gvec_rotl8i,
3060             gen_helper_gvec_rotl16i,
3061             gen_helper_gvec_rotl32i,
3062             gen_helper_gvec_rotl64i,
3063         },
3064         .s_list = { INDEX_op_rotls_vec, 0 },
3065         .v_list = { INDEX_op_rotlv_vec, 0 },
3066     };
3067 
3068     tcg_debug_assert(vece <= MO_64);
3069     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3070 }
3071 
3072 /*
3073  * Expand D = A << (B % element bits)
3074  *
3075  * Unlike scalar shifts, where it is easy for the target front end
3076  * to include the modulo as part of the expansion.  If the target
3077  * naturally includes the modulo as part of the operation, great!
3078  * If the target has some other behaviour from out-of-range shifts,
3079  * then it could not use this function anyway, and would need to
3080  * do it's own expansion with custom functions.
3081  */
3082 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
3083                                  TCGv_vec a, TCGv_vec b)
3084 {
3085     TCGv_vec t = tcg_temp_new_vec_matching(d);
3086     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3087 
3088     tcg_gen_and_vec(vece, t, b, m);
3089     tcg_gen_shlv_vec(vece, d, a, t);
3090     tcg_temp_free_vec(t);
3091 }
3092 
3093 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3094 {
3095     TCGv_i32 t = tcg_temp_new_i32();
3096 
3097     tcg_gen_andi_i32(t, b, 31);
3098     tcg_gen_shl_i32(d, a, t);
3099     tcg_temp_free_i32(t);
3100 }
3101 
3102 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3103 {
3104     TCGv_i64 t = tcg_temp_new_i64();
3105 
3106     tcg_gen_andi_i64(t, b, 63);
3107     tcg_gen_shl_i64(d, a, t);
3108     tcg_temp_free_i64(t);
3109 }
3110 
3111 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3112                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3113 {
3114     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
3115     static const GVecGen3 g[4] = {
3116         { .fniv = tcg_gen_shlv_mod_vec,
3117           .fno = gen_helper_gvec_shl8v,
3118           .opt_opc = vecop_list,
3119           .vece = MO_8 },
3120         { .fniv = tcg_gen_shlv_mod_vec,
3121           .fno = gen_helper_gvec_shl16v,
3122           .opt_opc = vecop_list,
3123           .vece = MO_16 },
3124         { .fni4 = tcg_gen_shl_mod_i32,
3125           .fniv = tcg_gen_shlv_mod_vec,
3126           .fno = gen_helper_gvec_shl32v,
3127           .opt_opc = vecop_list,
3128           .vece = MO_32 },
3129         { .fni8 = tcg_gen_shl_mod_i64,
3130           .fniv = tcg_gen_shlv_mod_vec,
3131           .fno = gen_helper_gvec_shl64v,
3132           .opt_opc = vecop_list,
3133           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3134           .vece = MO_64 },
3135     };
3136 
3137     tcg_debug_assert(vece <= MO_64);
3138     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3139 }
3140 
3141 /*
3142  * Similarly for logical right shifts.
3143  */
3144 
3145 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
3146                                  TCGv_vec a, TCGv_vec b)
3147 {
3148     TCGv_vec t = tcg_temp_new_vec_matching(d);
3149     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3150 
3151     tcg_gen_and_vec(vece, t, b, m);
3152     tcg_gen_shrv_vec(vece, d, a, t);
3153     tcg_temp_free_vec(t);
3154 }
3155 
3156 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3157 {
3158     TCGv_i32 t = tcg_temp_new_i32();
3159 
3160     tcg_gen_andi_i32(t, b, 31);
3161     tcg_gen_shr_i32(d, a, t);
3162     tcg_temp_free_i32(t);
3163 }
3164 
3165 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3166 {
3167     TCGv_i64 t = tcg_temp_new_i64();
3168 
3169     tcg_gen_andi_i64(t, b, 63);
3170     tcg_gen_shr_i64(d, a, t);
3171     tcg_temp_free_i64(t);
3172 }
3173 
3174 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3175                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3176 {
3177     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
3178     static const GVecGen3 g[4] = {
3179         { .fniv = tcg_gen_shrv_mod_vec,
3180           .fno = gen_helper_gvec_shr8v,
3181           .opt_opc = vecop_list,
3182           .vece = MO_8 },
3183         { .fniv = tcg_gen_shrv_mod_vec,
3184           .fno = gen_helper_gvec_shr16v,
3185           .opt_opc = vecop_list,
3186           .vece = MO_16 },
3187         { .fni4 = tcg_gen_shr_mod_i32,
3188           .fniv = tcg_gen_shrv_mod_vec,
3189           .fno = gen_helper_gvec_shr32v,
3190           .opt_opc = vecop_list,
3191           .vece = MO_32 },
3192         { .fni8 = tcg_gen_shr_mod_i64,
3193           .fniv = tcg_gen_shrv_mod_vec,
3194           .fno = gen_helper_gvec_shr64v,
3195           .opt_opc = vecop_list,
3196           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3197           .vece = MO_64 },
3198     };
3199 
3200     tcg_debug_assert(vece <= MO_64);
3201     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3202 }
3203 
3204 /*
3205  * Similarly for arithmetic right shifts.
3206  */
3207 
3208 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3209                                  TCGv_vec a, TCGv_vec b)
3210 {
3211     TCGv_vec t = tcg_temp_new_vec_matching(d);
3212     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3213 
3214     tcg_gen_and_vec(vece, t, b, m);
3215     tcg_gen_sarv_vec(vece, d, a, t);
3216     tcg_temp_free_vec(t);
3217 }
3218 
3219 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3220 {
3221     TCGv_i32 t = tcg_temp_new_i32();
3222 
3223     tcg_gen_andi_i32(t, b, 31);
3224     tcg_gen_sar_i32(d, a, t);
3225     tcg_temp_free_i32(t);
3226 }
3227 
3228 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3229 {
3230     TCGv_i64 t = tcg_temp_new_i64();
3231 
3232     tcg_gen_andi_i64(t, b, 63);
3233     tcg_gen_sar_i64(d, a, t);
3234     tcg_temp_free_i64(t);
3235 }
3236 
3237 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3238                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3239 {
3240     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3241     static const GVecGen3 g[4] = {
3242         { .fniv = tcg_gen_sarv_mod_vec,
3243           .fno = gen_helper_gvec_sar8v,
3244           .opt_opc = vecop_list,
3245           .vece = MO_8 },
3246         { .fniv = tcg_gen_sarv_mod_vec,
3247           .fno = gen_helper_gvec_sar16v,
3248           .opt_opc = vecop_list,
3249           .vece = MO_16 },
3250         { .fni4 = tcg_gen_sar_mod_i32,
3251           .fniv = tcg_gen_sarv_mod_vec,
3252           .fno = gen_helper_gvec_sar32v,
3253           .opt_opc = vecop_list,
3254           .vece = MO_32 },
3255         { .fni8 = tcg_gen_sar_mod_i64,
3256           .fniv = tcg_gen_sarv_mod_vec,
3257           .fno = gen_helper_gvec_sar64v,
3258           .opt_opc = vecop_list,
3259           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3260           .vece = MO_64 },
3261     };
3262 
3263     tcg_debug_assert(vece <= MO_64);
3264     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3265 }
3266 
3267 /*
3268  * Similarly for rotates.
3269  */
3270 
3271 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
3272                                   TCGv_vec a, TCGv_vec b)
3273 {
3274     TCGv_vec t = tcg_temp_new_vec_matching(d);
3275     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3276 
3277     tcg_gen_and_vec(vece, t, b, m);
3278     tcg_gen_rotlv_vec(vece, d, a, t);
3279     tcg_temp_free_vec(t);
3280 }
3281 
3282 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3283 {
3284     TCGv_i32 t = tcg_temp_new_i32();
3285 
3286     tcg_gen_andi_i32(t, b, 31);
3287     tcg_gen_rotl_i32(d, a, t);
3288     tcg_temp_free_i32(t);
3289 }
3290 
3291 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3292 {
3293     TCGv_i64 t = tcg_temp_new_i64();
3294 
3295     tcg_gen_andi_i64(t, b, 63);
3296     tcg_gen_rotl_i64(d, a, t);
3297     tcg_temp_free_i64(t);
3298 }
3299 
3300 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3301                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3302 {
3303     static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
3304     static const GVecGen3 g[4] = {
3305         { .fniv = tcg_gen_rotlv_mod_vec,
3306           .fno = gen_helper_gvec_rotl8v,
3307           .opt_opc = vecop_list,
3308           .vece = MO_8 },
3309         { .fniv = tcg_gen_rotlv_mod_vec,
3310           .fno = gen_helper_gvec_rotl16v,
3311           .opt_opc = vecop_list,
3312           .vece = MO_16 },
3313         { .fni4 = tcg_gen_rotl_mod_i32,
3314           .fniv = tcg_gen_rotlv_mod_vec,
3315           .fno = gen_helper_gvec_rotl32v,
3316           .opt_opc = vecop_list,
3317           .vece = MO_32 },
3318         { .fni8 = tcg_gen_rotl_mod_i64,
3319           .fniv = tcg_gen_rotlv_mod_vec,
3320           .fno = gen_helper_gvec_rotl64v,
3321           .opt_opc = vecop_list,
3322           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3323           .vece = MO_64 },
3324     };
3325 
3326     tcg_debug_assert(vece <= MO_64);
3327     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3328 }
3329 
3330 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
3331                                   TCGv_vec a, TCGv_vec b)
3332 {
3333     TCGv_vec t = tcg_temp_new_vec_matching(d);
3334     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3335 
3336     tcg_gen_and_vec(vece, t, b, m);
3337     tcg_gen_rotrv_vec(vece, d, a, t);
3338     tcg_temp_free_vec(t);
3339 }
3340 
3341 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3342 {
3343     TCGv_i32 t = tcg_temp_new_i32();
3344 
3345     tcg_gen_andi_i32(t, b, 31);
3346     tcg_gen_rotr_i32(d, a, t);
3347     tcg_temp_free_i32(t);
3348 }
3349 
3350 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3351 {
3352     TCGv_i64 t = tcg_temp_new_i64();
3353 
3354     tcg_gen_andi_i64(t, b, 63);
3355     tcg_gen_rotr_i64(d, a, t);
3356     tcg_temp_free_i64(t);
3357 }
3358 
3359 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3360                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3361 {
3362     static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
3363     static const GVecGen3 g[4] = {
3364         { .fniv = tcg_gen_rotrv_mod_vec,
3365           .fno = gen_helper_gvec_rotr8v,
3366           .opt_opc = vecop_list,
3367           .vece = MO_8 },
3368         { .fniv = tcg_gen_rotrv_mod_vec,
3369           .fno = gen_helper_gvec_rotr16v,
3370           .opt_opc = vecop_list,
3371           .vece = MO_16 },
3372         { .fni4 = tcg_gen_rotr_mod_i32,
3373           .fniv = tcg_gen_rotrv_mod_vec,
3374           .fno = gen_helper_gvec_rotr32v,
3375           .opt_opc = vecop_list,
3376           .vece = MO_32 },
3377         { .fni8 = tcg_gen_rotr_mod_i64,
3378           .fniv = tcg_gen_rotrv_mod_vec,
3379           .fno = gen_helper_gvec_rotr64v,
3380           .opt_opc = vecop_list,
3381           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3382           .vece = MO_64 },
3383     };
3384 
3385     tcg_debug_assert(vece <= MO_64);
3386     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3387 }
3388 
3389 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
3390 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3391                            uint32_t oprsz, TCGCond cond)
3392 {
3393     TCGv_i32 t0 = tcg_temp_new_i32();
3394     TCGv_i32 t1 = tcg_temp_new_i32();
3395     uint32_t i;
3396 
3397     for (i = 0; i < oprsz; i += 4) {
3398         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
3399         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
3400         tcg_gen_setcond_i32(cond, t0, t0, t1);
3401         tcg_gen_neg_i32(t0, t0);
3402         tcg_gen_st_i32(t0, cpu_env, dofs + i);
3403     }
3404     tcg_temp_free_i32(t1);
3405     tcg_temp_free_i32(t0);
3406 }
3407 
3408 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3409                            uint32_t oprsz, TCGCond cond)
3410 {
3411     TCGv_i64 t0 = tcg_temp_new_i64();
3412     TCGv_i64 t1 = tcg_temp_new_i64();
3413     uint32_t i;
3414 
3415     for (i = 0; i < oprsz; i += 8) {
3416         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
3417         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
3418         tcg_gen_setcond_i64(cond, t0, t0, t1);
3419         tcg_gen_neg_i64(t0, t0);
3420         tcg_gen_st_i64(t0, cpu_env, dofs + i);
3421     }
3422     tcg_temp_free_i64(t1);
3423     tcg_temp_free_i64(t0);
3424 }
3425 
3426 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3427                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3428                            TCGType type, TCGCond cond)
3429 {
3430     TCGv_vec t0 = tcg_temp_new_vec(type);
3431     TCGv_vec t1 = tcg_temp_new_vec(type);
3432     uint32_t i;
3433 
3434     for (i = 0; i < oprsz; i += tysz) {
3435         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3436         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
3437         tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
3438         tcg_gen_st_vec(t0, cpu_env, dofs + i);
3439     }
3440     tcg_temp_free_vec(t1);
3441     tcg_temp_free_vec(t0);
3442 }
3443 
3444 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3445                       uint32_t aofs, uint32_t bofs,
3446                       uint32_t oprsz, uint32_t maxsz)
3447 {
3448     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3449     static gen_helper_gvec_3 * const eq_fn[4] = {
3450         gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3451         gen_helper_gvec_eq32, gen_helper_gvec_eq64
3452     };
3453     static gen_helper_gvec_3 * const ne_fn[4] = {
3454         gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3455         gen_helper_gvec_ne32, gen_helper_gvec_ne64
3456     };
3457     static gen_helper_gvec_3 * const lt_fn[4] = {
3458         gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3459         gen_helper_gvec_lt32, gen_helper_gvec_lt64
3460     };
3461     static gen_helper_gvec_3 * const le_fn[4] = {
3462         gen_helper_gvec_le8, gen_helper_gvec_le16,
3463         gen_helper_gvec_le32, gen_helper_gvec_le64
3464     };
3465     static gen_helper_gvec_3 * const ltu_fn[4] = {
3466         gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3467         gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3468     };
3469     static gen_helper_gvec_3 * const leu_fn[4] = {
3470         gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3471         gen_helper_gvec_leu32, gen_helper_gvec_leu64
3472     };
3473     static gen_helper_gvec_3 * const * const fns[16] = {
3474         [TCG_COND_EQ] = eq_fn,
3475         [TCG_COND_NE] = ne_fn,
3476         [TCG_COND_LT] = lt_fn,
3477         [TCG_COND_LE] = le_fn,
3478         [TCG_COND_LTU] = ltu_fn,
3479         [TCG_COND_LEU] = leu_fn,
3480     };
3481 
3482     const TCGOpcode *hold_list;
3483     TCGType type;
3484     uint32_t some;
3485 
3486     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3487     check_overlap_3(dofs, aofs, bofs, maxsz);
3488 
3489     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3490         do_dup(MO_8, dofs, oprsz, maxsz,
3491                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3492         return;
3493     }
3494 
3495     /*
3496      * Implement inline with a vector type, if possible.
3497      * Prefer integer when 64-bit host and 64-bit comparison.
3498      */
3499     hold_list = tcg_swap_vecop_list(cmp_list);
3500     type = choose_vector_type(cmp_list, vece, oprsz,
3501                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3502     switch (type) {
3503     case TCG_TYPE_V256:
3504         /* Recall that ARM SVE allows vector sizes that are not a
3505          * power of 2, but always a multiple of 16.  The intent is
3506          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3507          */
3508         some = QEMU_ALIGN_DOWN(oprsz, 32);
3509         expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3510         if (some == oprsz) {
3511             break;
3512         }
3513         dofs += some;
3514         aofs += some;
3515         bofs += some;
3516         oprsz -= some;
3517         maxsz -= some;
3518         /* fallthru */
3519     case TCG_TYPE_V128:
3520         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3521         break;
3522     case TCG_TYPE_V64:
3523         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3524         break;
3525 
3526     case 0:
3527         if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3528             expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3529         } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3530             expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3531         } else {
3532             gen_helper_gvec_3 * const *fn = fns[cond];
3533 
3534             if (fn == NULL) {
3535                 uint32_t tmp;
3536                 tmp = aofs, aofs = bofs, bofs = tmp;
3537                 cond = tcg_swap_cond(cond);
3538                 fn = fns[cond];
3539                 assert(fn != NULL);
3540             }
3541             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3542             oprsz = maxsz;
3543         }
3544         break;
3545 
3546     default:
3547         g_assert_not_reached();
3548     }
3549     tcg_swap_vecop_list(hold_list);
3550 
3551     if (oprsz < maxsz) {
3552         expand_clr(dofs + oprsz, maxsz - oprsz);
3553     }
3554 }
3555 
3556 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3557 {
3558     TCGv_i64 t = tcg_temp_new_i64();
3559 
3560     tcg_gen_and_i64(t, b, a);
3561     tcg_gen_andc_i64(d, c, a);
3562     tcg_gen_or_i64(d, d, t);
3563     tcg_temp_free_i64(t);
3564 }
3565 
3566 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
3567                          uint32_t bofs, uint32_t cofs,
3568                          uint32_t oprsz, uint32_t maxsz)
3569 {
3570     static const GVecGen4 g = {
3571         .fni8 = tcg_gen_bitsel_i64,
3572         .fniv = tcg_gen_bitsel_vec,
3573         .fno = gen_helper_gvec_bitsel,
3574     };
3575 
3576     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
3577 }
3578