xref: /openbmc/qemu/tcg/tcg-op-gvec.c (revision ad66b5cb)
1 /*
2  * Generic vector operation expansion
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "tcg/tcg.h"
22 #include "tcg/tcg-op.h"
23 #include "tcg/tcg-op-gvec.h"
24 #include "tcg/tcg-gvec-desc.h"
25 
26 #define MAX_UNROLL  4
27 
28 #ifdef CONFIG_DEBUG_TCG
29 static const TCGOpcode vecop_list_empty[1] = { 0 };
30 #else
31 #define vecop_list_empty NULL
32 #endif
33 
34 
35 /* Verify vector size and alignment rules.  OFS should be the OR of all
36    of the operand offsets so that we can check them all at once.  */
37 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
38 {
39     uint32_t max_align;
40 
41     switch (oprsz) {
42     case 8:
43     case 16:
44     case 32:
45         tcg_debug_assert(oprsz <= maxsz);
46         break;
47     default:
48         tcg_debug_assert(oprsz == maxsz);
49         break;
50     }
51     tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
52 
53     max_align = maxsz >= 16 ? 15 : 7;
54     tcg_debug_assert((maxsz & max_align) == 0);
55     tcg_debug_assert((ofs & max_align) == 0);
56 }
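
/*
 * For example, oprsz == 8 with maxsz == 32 is accepted (the caller clears
 * the remaining 24 bytes), and every operand offset must then be 16-byte
 * aligned because maxsz >= 16; oprsz == maxsz == 24 fails the alignment
 * check above, since sizes of 16 and up must be multiples of 16.
 */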
57 
58 /* Verify vector overlap rules for two operands.  */
59 static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
60 {
61     tcg_debug_assert(d == a || d + s <= a || a + s <= d);
62 }
63 
64 /* Verify vector overlap rules for three operands.  */
65 static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
66 {
67     check_overlap_2(d, a, s);
68     check_overlap_2(d, b, s);
69     check_overlap_2(a, b, s);
70 }
71 
72 /* Verify vector overlap rules for four operands.  */
73 static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
74                             uint32_t c, uint32_t s)
75 {
76     check_overlap_2(d, a, s);
77     check_overlap_2(d, b, s);
78     check_overlap_2(d, c, s);
79     check_overlap_2(a, b, s);
80     check_overlap_2(a, c, s);
81     check_overlap_2(b, c, s);
82 }
83 
84 /* Create a descriptor from components.  */
85 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
86 {
87     uint32_t desc = 0;
88 
89     check_size_align(oprsz, maxsz, 0);
90     tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS));
91 
92     oprsz = (oprsz / 8) - 1;
93     maxsz = (maxsz / 8) - 1;
94 
95     /*
96      * We have just asserted in check_size_align that either
97      * oprsz is {8,16,32} or matches maxsz.  Encode the final
98      * case with '2', as that would otherwise map to 24.
99      */
100     if (oprsz == maxsz) {
101         oprsz = 2;
102     }
103 
104     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
105     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
106     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
107 
108     return desc;
109 }
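
/*
 * Out-of-line helpers unpack this descriptor with the accessors from
 * "tcg/tcg-gvec-desc.h".  A minimal sketch of such a helper body, with a
 * purely illustrative helper name and operation:
 *
 *     void HELPER(gvec_foo)(void *d, void *a, uint32_t desc)
 *     {
 *         intptr_t oprsz = simd_oprsz(desc);   // bytes to operate on
 *         intptr_t maxsz = simd_maxsz(desc);   // total bytes, >= oprsz
 *         int32_t shift = simd_data(desc);     // operation-specific data
 *         intptr_t i;
 *
 *         for (i = 0; i < oprsz; i += 8) {
 *             *(uint64_t *)(d + i) = *(uint64_t *)(a + i) << shift;
 *         }
 *         // The helper is also expected to zero bytes oprsz..maxsz-1 of d.
 *     }
 */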
110 
111 /* Generate a call to a gvec-style helper with two vector operands.  */
112 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
113                         uint32_t oprsz, uint32_t maxsz, int32_t data,
114                         gen_helper_gvec_2 *fn)
115 {
116     TCGv_ptr a0, a1;
117     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
118 
119     a0 = tcg_temp_ebb_new_ptr();
120     a1 = tcg_temp_ebb_new_ptr();
121 
122     tcg_gen_addi_ptr(a0, cpu_env, dofs);
123     tcg_gen_addi_ptr(a1, cpu_env, aofs);
124 
125     fn(a0, a1, desc);
126 
127     tcg_temp_free_ptr(a0);
128     tcg_temp_free_ptr(a1);
129 }
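
/*
 * A front end pairs this with a helper declared through the usual helper
 * machinery and passes cpu_env-relative offsets of its vector registers.
 * A hedged sketch, with hypothetical offsets and helper name:
 *
 *     DEF_HELPER_FLAGS_3(gvec_foo, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 *
 *     tcg_gen_gvec_2_ool(dest_ofs, src_ofs, vec_len, vec_len, 0,
 *                        gen_helper_gvec_foo);
 */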
130 
131 /* Generate a call to a gvec-style helper with two vector operands
132    and one scalar operand.  */
133 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
134                          uint32_t oprsz, uint32_t maxsz, int32_t data,
135                          gen_helper_gvec_2i *fn)
136 {
137     TCGv_ptr a0, a1;
138     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
139 
140     a0 = tcg_temp_ebb_new_ptr();
141     a1 = tcg_temp_ebb_new_ptr();
142 
143     tcg_gen_addi_ptr(a0, cpu_env, dofs);
144     tcg_gen_addi_ptr(a1, cpu_env, aofs);
145 
146     fn(a0, a1, c, desc);
147 
148     tcg_temp_free_ptr(a0);
149     tcg_temp_free_ptr(a1);
150 }
151 
152 /* Generate a call to a gvec-style helper with three vector operands.  */
153 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
154                         uint32_t oprsz, uint32_t maxsz, int32_t data,
155                         gen_helper_gvec_3 *fn)
156 {
157     TCGv_ptr a0, a1, a2;
158     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
159 
160     a0 = tcg_temp_ebb_new_ptr();
161     a1 = tcg_temp_ebb_new_ptr();
162     a2 = tcg_temp_ebb_new_ptr();
163 
164     tcg_gen_addi_ptr(a0, cpu_env, dofs);
165     tcg_gen_addi_ptr(a1, cpu_env, aofs);
166     tcg_gen_addi_ptr(a2, cpu_env, bofs);
167 
168     fn(a0, a1, a2, desc);
169 
170     tcg_temp_free_ptr(a0);
171     tcg_temp_free_ptr(a1);
172     tcg_temp_free_ptr(a2);
173 }
174 
175 /* Generate a call to a gvec-style helper with four vector operands.  */
176 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
177                         uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
178                         int32_t data, gen_helper_gvec_4 *fn)
179 {
180     TCGv_ptr a0, a1, a2, a3;
181     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
182 
183     a0 = tcg_temp_ebb_new_ptr();
184     a1 = tcg_temp_ebb_new_ptr();
185     a2 = tcg_temp_ebb_new_ptr();
186     a3 = tcg_temp_ebb_new_ptr();
187 
188     tcg_gen_addi_ptr(a0, cpu_env, dofs);
189     tcg_gen_addi_ptr(a1, cpu_env, aofs);
190     tcg_gen_addi_ptr(a2, cpu_env, bofs);
191     tcg_gen_addi_ptr(a3, cpu_env, cofs);
192 
193     fn(a0, a1, a2, a3, desc);
194 
195     tcg_temp_free_ptr(a0);
196     tcg_temp_free_ptr(a1);
197     tcg_temp_free_ptr(a2);
198     tcg_temp_free_ptr(a3);
199 }
200 
201 /* Generate a call to a gvec-style helper with five vector operands.  */
202 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
203                         uint32_t cofs, uint32_t xofs, uint32_t oprsz,
204                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
205 {
206     TCGv_ptr a0, a1, a2, a3, a4;
207     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
208 
209     a0 = tcg_temp_ebb_new_ptr();
210     a1 = tcg_temp_ebb_new_ptr();
211     a2 = tcg_temp_ebb_new_ptr();
212     a3 = tcg_temp_ebb_new_ptr();
213     a4 = tcg_temp_ebb_new_ptr();
214 
215     tcg_gen_addi_ptr(a0, cpu_env, dofs);
216     tcg_gen_addi_ptr(a1, cpu_env, aofs);
217     tcg_gen_addi_ptr(a2, cpu_env, bofs);
218     tcg_gen_addi_ptr(a3, cpu_env, cofs);
219     tcg_gen_addi_ptr(a4, cpu_env, xofs);
220 
221     fn(a0, a1, a2, a3, a4, desc);
222 
223     tcg_temp_free_ptr(a0);
224     tcg_temp_free_ptr(a1);
225     tcg_temp_free_ptr(a2);
226     tcg_temp_free_ptr(a3);
227     tcg_temp_free_ptr(a4);
228 }
229 
230 /* Generate a call to a gvec-style helper with two vector operands
231    and an extra pointer operand.  */
232 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
233                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
234                         int32_t data, gen_helper_gvec_2_ptr *fn)
235 {
236     TCGv_ptr a0, a1;
237     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
238 
239     a0 = tcg_temp_ebb_new_ptr();
240     a1 = tcg_temp_ebb_new_ptr();
241 
242     tcg_gen_addi_ptr(a0, cpu_env, dofs);
243     tcg_gen_addi_ptr(a1, cpu_env, aofs);
244 
245     fn(a0, a1, ptr, desc);
246 
247     tcg_temp_free_ptr(a0);
248     tcg_temp_free_ptr(a1);
249 }
250 
251 /* Generate a call to a gvec-style helper with three vector operands
252    and an extra pointer operand.  */
253 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
254                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
255                         int32_t data, gen_helper_gvec_3_ptr *fn)
256 {
257     TCGv_ptr a0, a1, a2;
258     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
259 
260     a0 = tcg_temp_ebb_new_ptr();
261     a1 = tcg_temp_ebb_new_ptr();
262     a2 = tcg_temp_ebb_new_ptr();
263 
264     tcg_gen_addi_ptr(a0, cpu_env, dofs);
265     tcg_gen_addi_ptr(a1, cpu_env, aofs);
266     tcg_gen_addi_ptr(a2, cpu_env, bofs);
267 
268     fn(a0, a1, a2, ptr, desc);
269 
270     tcg_temp_free_ptr(a0);
271     tcg_temp_free_ptr(a1);
272     tcg_temp_free_ptr(a2);
273 }
274 
275 /* Generate a call to a gvec-style helper with four vector operands
276    and an extra pointer operand.  */
277 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
278                         uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
279                         uint32_t maxsz, int32_t data,
280                         gen_helper_gvec_4_ptr *fn)
281 {
282     TCGv_ptr a0, a1, a2, a3;
283     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
284 
285     a0 = tcg_temp_ebb_new_ptr();
286     a1 = tcg_temp_ebb_new_ptr();
287     a2 = tcg_temp_ebb_new_ptr();
288     a3 = tcg_temp_ebb_new_ptr();
289 
290     tcg_gen_addi_ptr(a0, cpu_env, dofs);
291     tcg_gen_addi_ptr(a1, cpu_env, aofs);
292     tcg_gen_addi_ptr(a2, cpu_env, bofs);
293     tcg_gen_addi_ptr(a3, cpu_env, cofs);
294 
295     fn(a0, a1, a2, a3, ptr, desc);
296 
297     tcg_temp_free_ptr(a0);
298     tcg_temp_free_ptr(a1);
299     tcg_temp_free_ptr(a2);
300     tcg_temp_free_ptr(a3);
301 }
302 
303 /* Generate a call to a gvec-style helper with five vector operands
304    and an extra pointer operand.  */
305 void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
306                         uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
307                         uint32_t oprsz, uint32_t maxsz, int32_t data,
308                         gen_helper_gvec_5_ptr *fn)
309 {
310     TCGv_ptr a0, a1, a2, a3, a4;
311     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
312 
313     a0 = tcg_temp_ebb_new_ptr();
314     a1 = tcg_temp_ebb_new_ptr();
315     a2 = tcg_temp_ebb_new_ptr();
316     a3 = tcg_temp_ebb_new_ptr();
317     a4 = tcg_temp_ebb_new_ptr();
318 
319     tcg_gen_addi_ptr(a0, cpu_env, dofs);
320     tcg_gen_addi_ptr(a1, cpu_env, aofs);
321     tcg_gen_addi_ptr(a2, cpu_env, bofs);
322     tcg_gen_addi_ptr(a3, cpu_env, cofs);
323     tcg_gen_addi_ptr(a4, cpu_env, eofs);
324 
325     fn(a0, a1, a2, a3, a4, ptr, desc);
326 
327     tcg_temp_free_ptr(a0);
328     tcg_temp_free_ptr(a1);
329     tcg_temp_free_ptr(a2);
330     tcg_temp_free_ptr(a3);
331     tcg_temp_free_ptr(a4);
332 }
333 
334 /* Return true if we want to implement something of OPRSZ bytes
335    in units of LNSZ.  This limits the expansion of inline code.  */
336 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
337 {
338     uint32_t q, r;
339 
340     if (oprsz < lnsz) {
341         return false;
342     }
343 
344     q = oprsz / lnsz;
345     r = oprsz % lnsz;
346     tcg_debug_assert((r & 7) == 0);
347 
348     if (lnsz < 16) {
349         /* For sizes below 16, accept no remainder. */
350         if (r != 0) {
351             return false;
352         }
353     } else {
354         /*
355          * Recall that ARM SVE allows vector sizes that are not a
356          * power of 2, but always a multiple of 16.  The intent is
357          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
358          * In addition, expand_clr needs to handle a multiple of 8.
359          * Thus we can handle the tail with one more operation per
360          * diminishing power of 2.
361          */
362         q += ctpop32(r);
363     }
364 
365     return q <= MAX_UNROLL;
366 }
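
/*
 * For example, with MAX_UNROLL == 4: oprsz == 80 in units of lnsz == 32
 * gives q == 2, r == 16, hence q + ctpop32(r) == 3 and is accepted, while
 * oprsz == 80 in units of lnsz == 16 gives q == 5 and is rejected.
 */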
367 
368 static void expand_clr(uint32_t dofs, uint32_t maxsz);
369 
370 /* Duplicate C as per VECE.  */
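/* The parenthesized name keeps the constant-folding dup_const() macro from
   "tcg/tcg.h" from expanding here, so this defines the actual function.  */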
371 uint64_t (dup_const)(unsigned vece, uint64_t c)
372 {
373     switch (vece) {
374     case MO_8:
375         return 0x0101010101010101ull * (uint8_t)c;
376     case MO_16:
377         return 0x0001000100010001ull * (uint16_t)c;
378     case MO_32:
379         return 0x0000000100000001ull * (uint32_t)c;
380     case MO_64:
381         return c;
382     default:
383         g_assert_not_reached();
384     }
385 }
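
/*
 * For example, dup_const(MO_8, 0xab) yields 0xababababababababull and
 * dup_const(MO_16, 0x1234) yields 0x1234123412341234ull; only the low
 * 1 << vece bytes of C contribute to the result.
 */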
386 
387 /* Duplicate IN into OUT as per VECE.  */
388 void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
389 {
390     switch (vece) {
391     case MO_8:
392         tcg_gen_ext8u_i32(out, in);
393         tcg_gen_muli_i32(out, out, 0x01010101);
394         break;
395     case MO_16:
396         tcg_gen_deposit_i32(out, in, in, 16, 16);
397         break;
398     case MO_32:
399         tcg_gen_mov_i32(out, in);
400         break;
401     default:
402         g_assert_not_reached();
403     }
404 }
405 
406 void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
407 {
408     switch (vece) {
409     case MO_8:
410         tcg_gen_ext8u_i64(out, in);
411         tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
412         break;
413     case MO_16:
414         tcg_gen_ext16u_i64(out, in);
415         tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
416         break;
417     case MO_32:
418         tcg_gen_deposit_i64(out, in, in, 32, 32);
419         break;
420     case MO_64:
421         tcg_gen_mov_i64(out, in);
422         break;
423     default:
424         g_assert_not_reached();
425     }
426 }
427 
428 /* Select a supported vector type for implementing an operation on SIZE
429  * bytes.  If LIST is NULL, assume that the real operation to be performed
430  * is required by all backends.  Otherwise, make sure that the opcodes in
431  * LIST can be performed on elements of size VECE in the selected type.
432  * Do not select V64 if PREFER_I64 is true.  Return 0 if no type is selected.
433  */
434 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
435                                   uint32_t size, bool prefer_i64)
436 {
437     /*
438      * Recall that ARM SVE allows vector sizes that are not a
439      * power of 2, but always a multiple of 16.  The intent is
440      * that e.g. size == 80 would be expanded with 2x32 + 1x16.
441      * It is hard to imagine a case in which v256 is supported
442      * but v128 is not, but check anyway.
443      * In addition, expand_clr needs to handle a multiple of 8.
444      */
445     if (TCG_TARGET_HAS_v256 &&
446         check_size_impl(size, 32) &&
447         tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
448         (!(size & 16) ||
449          (TCG_TARGET_HAS_v128 &&
450           tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
451         (!(size & 8) ||
452          (TCG_TARGET_HAS_v64 &&
453           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
454         return TCG_TYPE_V256;
455     }
456     if (TCG_TARGET_HAS_v128 &&
457         check_size_impl(size, 16) &&
458         tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
459         (!(size & 8) ||
460          (TCG_TARGET_HAS_v64 &&
461           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
462         return TCG_TYPE_V128;
463     }
464     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
465         && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
466         return TCG_TYPE_V64;
467     }
468     return 0;
469 }
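
/*
 * For example, with oprsz == 80 on a host providing both V256 and V128 for
 * the operation, V256 is chosen and the caller expands 2x32 bytes plus a
 * 16-byte tail.  Without V256, check_size_impl(80, 16) would need 5 pieces,
 * exceeding MAX_UNROLL, so no vector type is chosen and the caller falls
 * back to integer or out-of-line code.
 */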
470 
471 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
472                          uint32_t maxsz, TCGv_vec t_vec)
473 {
474     uint32_t i = 0;
475 
476     tcg_debug_assert(oprsz >= 8);
477 
478     /*
479      * This may be expand_clr for the tail of an operation, e.g.
480      * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
481      * are misaligned wrt the maximum vector size, so do that first.
482      */
483     if (dofs & 8) {
484         tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
485         i += 8;
486     }
487 
488     switch (type) {
489     case TCG_TYPE_V256:
490         /*
491          * Recall that ARM SVE allows vector sizes that are not a
492          * power of 2, but always a multiple of 16.  The intent is
493          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
494          */
495         for (; i + 32 <= oprsz; i += 32) {
496             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
497         }
498         /* fallthru */
499     case TCG_TYPE_V128:
500         for (; i + 16 <= oprsz; i += 16) {
501             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
502         }
503         break;
504     case TCG_TYPE_V64:
505         for (; i < oprsz; i += 8) {
506             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
507         }
508         break;
509     default:
510         g_assert_not_reached();
511     }
512 
513     if (oprsz < maxsz) {
514         expand_clr(dofs + oprsz, maxsz - oprsz);
515     }
516 }
517 
518 /* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
519  * Only one of IN_32 or IN_64 may be set;
520  * IN_C is used if IN_32 and IN_64 are unset.
521  */
522 static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
523                    uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
524                    uint64_t in_c)
525 {
526     TCGType type;
527     TCGv_i64 t_64;
528     TCGv_i32 t_32, t_desc;
529     TCGv_ptr t_ptr;
530     uint32_t i;
531 
532     assert(vece <= (in_32 ? MO_32 : MO_64));
533     assert(in_32 == NULL || in_64 == NULL);
534 
535     /* If we're storing 0, expand oprsz to maxsz.  */
536     if (in_32 == NULL && in_64 == NULL) {
537         in_c = dup_const(vece, in_c);
538         if (in_c == 0) {
539             oprsz = maxsz;
540             vece = MO_8;
541         } else if (in_c == dup_const(MO_8, in_c)) {
542             vece = MO_8;
543         }
544     }
545 
546     /* Implement inline with a vector type, if possible.
547      * Prefer integer when 64-bit host and no variable dup.
548      */
549     type = choose_vector_type(NULL, vece, oprsz,
550                               (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
551                                && (in_64 == NULL || vece == MO_64)));
552     if (type != 0) {
553         TCGv_vec t_vec = tcg_temp_new_vec(type);
554 
555         if (in_32) {
556             tcg_gen_dup_i32_vec(vece, t_vec, in_32);
557         } else if (in_64) {
558             tcg_gen_dup_i64_vec(vece, t_vec, in_64);
559         } else {
560             tcg_gen_dupi_vec(vece, t_vec, in_c);
561         }
562         do_dup_store(type, dofs, oprsz, maxsz, t_vec);
563         tcg_temp_free_vec(t_vec);
564         return;
565     }
566 
567     /* Otherwise, inline with an integer type, unless "large".  */
568     if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
569         t_64 = NULL;
570         t_32 = NULL;
571 
572         if (in_32) {
573             /* We are given a 32-bit variable input.  For a 64-bit host,
574                use a 64-bit operation unless the 32-bit operation would
575                be simple enough.  */
576             if (TCG_TARGET_REG_BITS == 64
577                 && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
578                 t_64 = tcg_temp_ebb_new_i64();
579                 tcg_gen_extu_i32_i64(t_64, in_32);
580                 tcg_gen_dup_i64(vece, t_64, t_64);
581             } else {
582                 t_32 = tcg_temp_ebb_new_i32();
583                 tcg_gen_dup_i32(vece, t_32, in_32);
584             }
585         } else if (in_64) {
586             /* We are given a 64-bit variable input.  */
587             t_64 = tcg_temp_ebb_new_i64();
588             tcg_gen_dup_i64(vece, t_64, in_64);
589         } else {
590             /* We are given a constant input.  */
591             /* For 64-bit hosts, use 64-bit constants for "simple" constants
592                or when we'd need too many 32-bit stores, or when a 64-bit
593                constant is really required.  */
594             if (vece == MO_64
595                 || (TCG_TARGET_REG_BITS == 64
596                     && (in_c == 0 || in_c == -1
597                         || !check_size_impl(oprsz, 4)))) {
598                 t_64 = tcg_constant_i64(in_c);
599             } else {
600                 t_32 = tcg_constant_i32(in_c);
601             }
602         }
603 
604         /* Implement inline if we picked an implementation size above.  */
605         if (t_32) {
606             for (i = 0; i < oprsz; i += 4) {
607                 tcg_gen_st_i32(t_32, cpu_env, dofs + i);
608             }
609             tcg_temp_free_i32(t_32);
610             goto done;
611         }
612         if (t_64) {
613             for (i = 0; i < oprsz; i += 8) {
614                 tcg_gen_st_i64(t_64, cpu_env, dofs + i);
615             }
616             tcg_temp_free_i64(t_64);
617             goto done;
618         }
619     }
620 
621     /* Otherwise implement out of line.  */
622     t_ptr = tcg_temp_ebb_new_ptr();
623     tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
624 
625     /*
626      * This may be expand_clr for the tail of an operation, e.g.
627      * oprsz == 8 && maxsz == 64.  The size of the clear is misaligned
628      * wrt simd_desc and will assert.  Simply pass all replicated byte
629      * stores through to memset.
630      */
631     if (oprsz == maxsz && vece == MO_8) {
632         TCGv_ptr t_size = tcg_constant_ptr(oprsz);
633         TCGv_i32 t_val;
634 
635         if (in_32) {
636             t_val = in_32;
637         } else if (in_64) {
638             t_val = tcg_temp_ebb_new_i32();
639             tcg_gen_extrl_i64_i32(t_val, in_64);
640         } else {
641             t_val = tcg_constant_i32(in_c);
642         }
643         gen_helper_memset(t_ptr, t_ptr, t_val, t_size);
644 
645         if (in_64) {
646             tcg_temp_free_i32(t_val);
647         }
648         tcg_temp_free_ptr(t_ptr);
649         return;
650     }
651 
652     t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0));
653 
654     if (vece == MO_64) {
655         if (in_64) {
656             gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
657         } else {
658             t_64 = tcg_constant_i64(in_c);
659             gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
660         }
661     } else {
662         typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
663         static dup_fn * const fns[3] = {
664             gen_helper_gvec_dup8,
665             gen_helper_gvec_dup16,
666             gen_helper_gvec_dup32
667         };
668 
669         if (in_32) {
670             fns[vece](t_ptr, t_desc, in_32);
671         } else if (in_64) {
672             t_32 = tcg_temp_ebb_new_i32();
673             tcg_gen_extrl_i64_i32(t_32, in_64);
674             fns[vece](t_ptr, t_desc, t_32);
675             tcg_temp_free_i32(t_32);
676         } else {
677             if (vece == MO_8) {
678                 in_c &= 0xff;
679             } else if (vece == MO_16) {
680                 in_c &= 0xffff;
681             }
682             t_32 = tcg_constant_i32(in_c);
683             fns[vece](t_ptr, t_desc, t_32);
684         }
685     }
686 
687     tcg_temp_free_ptr(t_ptr);
688     return;
689 
690  done:
691     if (oprsz < maxsz) {
692         expand_clr(dofs + oprsz, maxsz - oprsz);
693     }
694 }
695 
696 /* As do_dup above, but store zero: clear MAXSZ bytes at DOFS.  */
697 static void expand_clr(uint32_t dofs, uint32_t maxsz)
698 {
699     do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
700 }
701 
702 /* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
703 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
704                          bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
705 {
706     TCGv_i32 t0 = tcg_temp_new_i32();
707     TCGv_i32 t1 = tcg_temp_new_i32();
708     uint32_t i;
709 
710     for (i = 0; i < oprsz; i += 4) {
711         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
712         if (load_dest) {
713             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
714         }
715         fni(t1, t0);
716         tcg_gen_st_i32(t1, cpu_env, dofs + i);
717     }
718     tcg_temp_free_i32(t0);
719     tcg_temp_free_i32(t1);
720 }
721 
722 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
723                           int32_t c, bool load_dest,
724                           void (*fni)(TCGv_i32, TCGv_i32, int32_t))
725 {
726     TCGv_i32 t0 = tcg_temp_new_i32();
727     TCGv_i32 t1 = tcg_temp_new_i32();
728     uint32_t i;
729 
730     for (i = 0; i < oprsz; i += 4) {
731         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
732         if (load_dest) {
733             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
734         }
735         fni(t1, t0, c);
736         tcg_gen_st_i32(t1, cpu_env, dofs + i);
737     }
738     tcg_temp_free_i32(t0);
739     tcg_temp_free_i32(t1);
740 }
741 
742 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
743                           TCGv_i32 c, bool scalar_first,
744                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
745 {
746     TCGv_i32 t0 = tcg_temp_new_i32();
747     TCGv_i32 t1 = tcg_temp_new_i32();
748     uint32_t i;
749 
750     for (i = 0; i < oprsz; i += 4) {
751         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
752         if (scalar_first) {
753             fni(t1, c, t0);
754         } else {
755             fni(t1, t0, c);
756         }
757         tcg_gen_st_i32(t1, cpu_env, dofs + i);
758     }
759     tcg_temp_free_i32(t0);
760     tcg_temp_free_i32(t1);
761 }
762 
763 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
764 static void expand_3_i32(uint32_t dofs, uint32_t aofs,
765                          uint32_t bofs, uint32_t oprsz, bool load_dest,
766                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
767 {
768     TCGv_i32 t0 = tcg_temp_new_i32();
769     TCGv_i32 t1 = tcg_temp_new_i32();
770     TCGv_i32 t2 = tcg_temp_new_i32();
771     uint32_t i;
772 
773     for (i = 0; i < oprsz; i += 4) {
774         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
775         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
776         if (load_dest) {
777             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
778         }
779         fni(t2, t0, t1);
780         tcg_gen_st_i32(t2, cpu_env, dofs + i);
781     }
782     tcg_temp_free_i32(t2);
783     tcg_temp_free_i32(t1);
784     tcg_temp_free_i32(t0);
785 }
786 
787 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
788                           uint32_t oprsz, int32_t c, bool load_dest,
789                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
790 {
791     TCGv_i32 t0 = tcg_temp_new_i32();
792     TCGv_i32 t1 = tcg_temp_new_i32();
793     TCGv_i32 t2 = tcg_temp_new_i32();
794     uint32_t i;
795 
796     for (i = 0; i < oprsz; i += 4) {
797         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
798         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
799         if (load_dest) {
800             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
801         }
802         fni(t2, t0, t1, c);
803         tcg_gen_st_i32(t2, cpu_env, dofs + i);
804     }
805     tcg_temp_free_i32(t0);
806     tcg_temp_free_i32(t1);
807     tcg_temp_free_i32(t2);
808 }
809 
810 /* Expand OPSZ bytes worth of four-operand operations using i32 elements.  */
811 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
812                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
813                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
814 {
815     TCGv_i32 t0 = tcg_temp_new_i32();
816     TCGv_i32 t1 = tcg_temp_new_i32();
817     TCGv_i32 t2 = tcg_temp_new_i32();
818     TCGv_i32 t3 = tcg_temp_new_i32();
819     uint32_t i;
820 
821     for (i = 0; i < oprsz; i += 4) {
822         tcg_gen_ld_i32(t1, cpu_env, aofs + i);
823         tcg_gen_ld_i32(t2, cpu_env, bofs + i);
824         tcg_gen_ld_i32(t3, cpu_env, cofs + i);
825         fni(t0, t1, t2, t3);
826         tcg_gen_st_i32(t0, cpu_env, dofs + i);
827         if (write_aofs) {
828             tcg_gen_st_i32(t1, cpu_env, aofs + i);
829         }
830     }
831     tcg_temp_free_i32(t3);
832     tcg_temp_free_i32(t2);
833     tcg_temp_free_i32(t1);
834     tcg_temp_free_i32(t0);
835 }
836 
837 static void expand_4i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
838                           uint32_t cofs, uint32_t oprsz, int32_t c,
839                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32,
840                                       int32_t))
841 {
842     TCGv_i32 t0 = tcg_temp_new_i32();
843     TCGv_i32 t1 = tcg_temp_new_i32();
844     TCGv_i32 t2 = tcg_temp_new_i32();
845     TCGv_i32 t3 = tcg_temp_new_i32();
846     uint32_t i;
847 
848     for (i = 0; i < oprsz; i += 4) {
849         tcg_gen_ld_i32(t1, cpu_env, aofs + i);
850         tcg_gen_ld_i32(t2, cpu_env, bofs + i);
851         tcg_gen_ld_i32(t3, cpu_env, cofs + i);
852         fni(t0, t1, t2, t3, c);
853         tcg_gen_st_i32(t0, cpu_env, dofs + i);
854     }
855     tcg_temp_free_i32(t3);
856     tcg_temp_free_i32(t2);
857     tcg_temp_free_i32(t1);
858     tcg_temp_free_i32(t0);
859 }
860 
861 /* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
862 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
863                          bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
864 {
865     TCGv_i64 t0 = tcg_temp_new_i64();
866     TCGv_i64 t1 = tcg_temp_new_i64();
867     uint32_t i;
868 
869     for (i = 0; i < oprsz; i += 8) {
870         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
871         if (load_dest) {
872             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
873         }
874         fni(t1, t0);
875         tcg_gen_st_i64(t1, cpu_env, dofs + i);
876     }
877     tcg_temp_free_i64(t0);
878     tcg_temp_free_i64(t1);
879 }
880 
881 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
882                           int64_t c, bool load_dest,
883                           void (*fni)(TCGv_i64, TCGv_i64, int64_t))
884 {
885     TCGv_i64 t0 = tcg_temp_new_i64();
886     TCGv_i64 t1 = tcg_temp_new_i64();
887     uint32_t i;
888 
889     for (i = 0; i < oprsz; i += 8) {
890         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
891         if (load_dest) {
892             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
893         }
894         fni(t1, t0, c);
895         tcg_gen_st_i64(t1, cpu_env, dofs + i);
896     }
897     tcg_temp_free_i64(t0);
898     tcg_temp_free_i64(t1);
899 }
900 
901 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
902                           TCGv_i64 c, bool scalar_first,
903                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
904 {
905     TCGv_i64 t0 = tcg_temp_new_i64();
906     TCGv_i64 t1 = tcg_temp_new_i64();
907     uint32_t i;
908 
909     for (i = 0; i < oprsz; i += 8) {
910         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
911         if (scalar_first) {
912             fni(t1, c, t0);
913         } else {
914             fni(t1, t0, c);
915         }
916         tcg_gen_st_i64(t1, cpu_env, dofs + i);
917     }
918     tcg_temp_free_i64(t0);
919     tcg_temp_free_i64(t1);
920 }
921 
922 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
923 static void expand_3_i64(uint32_t dofs, uint32_t aofs,
924                          uint32_t bofs, uint32_t oprsz, bool load_dest,
925                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
926 {
927     TCGv_i64 t0 = tcg_temp_new_i64();
928     TCGv_i64 t1 = tcg_temp_new_i64();
929     TCGv_i64 t2 = tcg_temp_new_i64();
930     uint32_t i;
931 
932     for (i = 0; i < oprsz; i += 8) {
933         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
934         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
935         if (load_dest) {
936             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
937         }
938         fni(t2, t0, t1);
939         tcg_gen_st_i64(t2, cpu_env, dofs + i);
940     }
941     tcg_temp_free_i64(t2);
942     tcg_temp_free_i64(t1);
943     tcg_temp_free_i64(t0);
944 }
945 
946 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
947                           uint32_t oprsz, int64_t c, bool load_dest,
948                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
949 {
950     TCGv_i64 t0 = tcg_temp_new_i64();
951     TCGv_i64 t1 = tcg_temp_new_i64();
952     TCGv_i64 t2 = tcg_temp_new_i64();
953     uint32_t i;
954 
955     for (i = 0; i < oprsz; i += 8) {
956         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
957         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
958         if (load_dest) {
959             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
960         }
961         fni(t2, t0, t1, c);
962         tcg_gen_st_i64(t2, cpu_env, dofs + i);
963     }
964     tcg_temp_free_i64(t0);
965     tcg_temp_free_i64(t1);
966     tcg_temp_free_i64(t2);
967 }
968 
969 /* Expand OPSZ bytes worth of four-operand operations using i64 elements.  */
970 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
971                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
972                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
973 {
974     TCGv_i64 t0 = tcg_temp_new_i64();
975     TCGv_i64 t1 = tcg_temp_new_i64();
976     TCGv_i64 t2 = tcg_temp_new_i64();
977     TCGv_i64 t3 = tcg_temp_new_i64();
978     uint32_t i;
979 
980     for (i = 0; i < oprsz; i += 8) {
981         tcg_gen_ld_i64(t1, cpu_env, aofs + i);
982         tcg_gen_ld_i64(t2, cpu_env, bofs + i);
983         tcg_gen_ld_i64(t3, cpu_env, cofs + i);
984         fni(t0, t1, t2, t3);
985         tcg_gen_st_i64(t0, cpu_env, dofs + i);
986         if (write_aofs) {
987             tcg_gen_st_i64(t1, cpu_env, aofs + i);
988         }
989     }
990     tcg_temp_free_i64(t3);
991     tcg_temp_free_i64(t2);
992     tcg_temp_free_i64(t1);
993     tcg_temp_free_i64(t0);
994 }
995 
996 static void expand_4i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
997                           uint32_t cofs, uint32_t oprsz, int64_t c,
998                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64,
999                                       int64_t))
1000 {
1001     TCGv_i64 t0 = tcg_temp_new_i64();
1002     TCGv_i64 t1 = tcg_temp_new_i64();
1003     TCGv_i64 t2 = tcg_temp_new_i64();
1004     TCGv_i64 t3 = tcg_temp_new_i64();
1005     uint32_t i;
1006 
1007     for (i = 0; i < oprsz; i += 8) {
1008         tcg_gen_ld_i64(t1, cpu_env, aofs + i);
1009         tcg_gen_ld_i64(t2, cpu_env, bofs + i);
1010         tcg_gen_ld_i64(t3, cpu_env, cofs + i);
1011         fni(t0, t1, t2, t3, c);
1012         tcg_gen_st_i64(t0, cpu_env, dofs + i);
1013     }
1014     tcg_temp_free_i64(t3);
1015     tcg_temp_free_i64(t2);
1016     tcg_temp_free_i64(t1);
1017     tcg_temp_free_i64(t0);
1018 }
1019 
1020 /* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
1021 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1022                          uint32_t oprsz, uint32_t tysz, TCGType type,
1023                          bool load_dest,
1024                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
1025 {
1026     TCGv_vec t0 = tcg_temp_new_vec(type);
1027     TCGv_vec t1 = tcg_temp_new_vec(type);
1028     uint32_t i;
1029 
1030     for (i = 0; i < oprsz; i += tysz) {
1031         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1032         if (load_dest) {
1033             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
1034         }
1035         fni(vece, t1, t0);
1036         tcg_gen_st_vec(t1, cpu_env, dofs + i);
1037     }
1038     tcg_temp_free_vec(t0);
1039     tcg_temp_free_vec(t1);
1040 }
1041 
1042 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
1043    using host vectors.  */
1044 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1045                           uint32_t oprsz, uint32_t tysz, TCGType type,
1046                           int64_t c, bool load_dest,
1047                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
1048 {
1049     TCGv_vec t0 = tcg_temp_new_vec(type);
1050     TCGv_vec t1 = tcg_temp_new_vec(type);
1051     uint32_t i;
1052 
1053     for (i = 0; i < oprsz; i += tysz) {
1054         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1055         if (load_dest) {
1056             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
1057         }
1058         fni(vece, t1, t0, c);
1059         tcg_gen_st_vec(t1, cpu_env, dofs + i);
1060     }
1061     tcg_temp_free_vec(t0);
1062     tcg_temp_free_vec(t1);
1063 }
1064 
1065 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1066                           uint32_t oprsz, uint32_t tysz, TCGType type,
1067                           TCGv_vec c, bool scalar_first,
1068                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1069 {
1070     TCGv_vec t0 = tcg_temp_new_vec(type);
1071     TCGv_vec t1 = tcg_temp_new_vec(type);
1072     uint32_t i;
1073 
1074     for (i = 0; i < oprsz; i += tysz) {
1075         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1076         if (scalar_first) {
1077             fni(vece, t1, c, t0);
1078         } else {
1079             fni(vece, t1, t0, c);
1080         }
1081         tcg_gen_st_vec(t1, cpu_env, dofs + i);
1082     }
1083     tcg_temp_free_vec(t0);
1084     tcg_temp_free_vec(t1);
1085 }
1086 
1087 /* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
1088 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1089                          uint32_t bofs, uint32_t oprsz,
1090                          uint32_t tysz, TCGType type, bool load_dest,
1091                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1092 {
1093     TCGv_vec t0 = tcg_temp_new_vec(type);
1094     TCGv_vec t1 = tcg_temp_new_vec(type);
1095     TCGv_vec t2 = tcg_temp_new_vec(type);
1096     uint32_t i;
1097 
1098     for (i = 0; i < oprsz; i += tysz) {
1099         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1100         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1101         if (load_dest) {
1102             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1103         }
1104         fni(vece, t2, t0, t1);
1105         tcg_gen_st_vec(t2, cpu_env, dofs + i);
1106     }
1107     tcg_temp_free_vec(t2);
1108     tcg_temp_free_vec(t1);
1109     tcg_temp_free_vec(t0);
1110 }
1111 
1112 /*
1113  * Expand OPSZ bytes worth of three-vector operands and an immediate operand
1114  * using host vectors.
1115  */
1116 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1117                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
1118                           TCGType type, int64_t c, bool load_dest,
1119                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
1120                                       int64_t))
1121 {
1122     TCGv_vec t0 = tcg_temp_new_vec(type);
1123     TCGv_vec t1 = tcg_temp_new_vec(type);
1124     TCGv_vec t2 = tcg_temp_new_vec(type);
1125     uint32_t i;
1126 
1127     for (i = 0; i < oprsz; i += tysz) {
1128         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1129         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1130         if (load_dest) {
1131             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1132         }
1133         fni(vece, t2, t0, t1, c);
1134         tcg_gen_st_vec(t2, cpu_env, dofs + i);
1135     }
1136     tcg_temp_free_vec(t0);
1137     tcg_temp_free_vec(t1);
1138     tcg_temp_free_vec(t2);
1139 }
1140 
1141 /* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
1142 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1143                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1144                          uint32_t tysz, TCGType type, bool write_aofs,
1145                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1146                                      TCGv_vec, TCGv_vec))
1147 {
1148     TCGv_vec t0 = tcg_temp_new_vec(type);
1149     TCGv_vec t1 = tcg_temp_new_vec(type);
1150     TCGv_vec t2 = tcg_temp_new_vec(type);
1151     TCGv_vec t3 = tcg_temp_new_vec(type);
1152     uint32_t i;
1153 
1154     for (i = 0; i < oprsz; i += tysz) {
1155         tcg_gen_ld_vec(t1, cpu_env, aofs + i);
1156         tcg_gen_ld_vec(t2, cpu_env, bofs + i);
1157         tcg_gen_ld_vec(t3, cpu_env, cofs + i);
1158         fni(vece, t0, t1, t2, t3);
1159         tcg_gen_st_vec(t0, cpu_env, dofs + i);
1160         if (write_aofs) {
1161             tcg_gen_st_vec(t1, cpu_env, aofs + i);
1162         }
1163     }
1164     tcg_temp_free_vec(t3);
1165     tcg_temp_free_vec(t2);
1166     tcg_temp_free_vec(t1);
1167     tcg_temp_free_vec(t0);
1168 }
1169 
1170 /*
1171  * Expand OPSZ bytes worth of four-vector operands and an immediate operand
1172  * using host vectors.
1173  */
1174 static void expand_4i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1175                           uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1176                           uint32_t tysz, TCGType type, int64_t c,
1177                           void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1178                                      TCGv_vec, TCGv_vec, int64_t))
1179 {
1180     TCGv_vec t0 = tcg_temp_new_vec(type);
1181     TCGv_vec t1 = tcg_temp_new_vec(type);
1182     TCGv_vec t2 = tcg_temp_new_vec(type);
1183     TCGv_vec t3 = tcg_temp_new_vec(type);
1184     uint32_t i;
1185 
1186     for (i = 0; i < oprsz; i += tysz) {
1187         tcg_gen_ld_vec(t1, cpu_env, aofs + i);
1188         tcg_gen_ld_vec(t2, cpu_env, bofs + i);
1189         tcg_gen_ld_vec(t3, cpu_env, cofs + i);
1190         fni(vece, t0, t1, t2, t3, c);
1191         tcg_gen_st_vec(t0, cpu_env, dofs + i);
1192     }
1193     tcg_temp_free_vec(t3);
1194     tcg_temp_free_vec(t2);
1195     tcg_temp_free_vec(t1);
1196     tcg_temp_free_vec(t0);
1197 }
1198 
1199 /* Expand a vector two-operand operation.  */
1200 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
1201                     uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
1202 {
1203     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1204     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1205     TCGType type;
1206     uint32_t some;
1207 
1208     check_size_align(oprsz, maxsz, dofs | aofs);
1209     check_overlap_2(dofs, aofs, maxsz);
1210 
1211     type = 0;
1212     if (g->fniv) {
1213         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1214     }
1215     switch (type) {
1216     case TCG_TYPE_V256:
1217         /* Recall that ARM SVE allows vector sizes that are not a
1218          * power of 2, but always a multiple of 16.  The intent is
1219          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1220          */
1221         some = QEMU_ALIGN_DOWN(oprsz, 32);
1222         expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1223                      g->load_dest, g->fniv);
1224         if (some == oprsz) {
1225             break;
1226         }
1227         dofs += some;
1228         aofs += some;
1229         oprsz -= some;
1230         maxsz -= some;
1231         /* fallthru */
1232     case TCG_TYPE_V128:
1233         expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1234                      g->load_dest, g->fniv);
1235         break;
1236     case TCG_TYPE_V64:
1237         expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1238                      g->load_dest, g->fniv);
1239         break;
1240 
1241     case 0:
1242         if (g->fni8 && check_size_impl(oprsz, 8)) {
1243             expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
1244         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1245             expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
1246         } else {
1247             assert(g->fno != NULL);
1248             tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
1249             oprsz = maxsz;
1250         }
1251         break;
1252 
1253     default:
1254         g_assert_not_reached();
1255     }
1256     tcg_swap_vecop_list(hold_list);
1257 
1258     if (oprsz < maxsz) {
1259         expand_clr(dofs + oprsz, maxsz - oprsz);
1260     }
1261 }
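
/*
 * A GVecGen2 user normally supplies several of the callbacks so the best
 * available expansion is chosen at translation time.  A hedged sketch for a
 * bitwise-not expansion, assuming the usual tcg_gen_not_* expanders and an
 * out-of-line gen_helper_gvec_not are available:
 *
 *     static const GVecGen2 g = {
 *         .fni8 = tcg_gen_not_i64,
 *         .fniv = tcg_gen_not_vec,
 *         .fno = gen_helper_gvec_not,
 *         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
 *     };
 *     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
 */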
1262 
1263 /* Expand a vector operation with two vectors and an immediate.  */
1264 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1265                      uint32_t maxsz, int64_t c, const GVecGen2i *g)
1266 {
1267     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1268     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1269     TCGType type;
1270     uint32_t some;
1271 
1272     check_size_align(oprsz, maxsz, dofs | aofs);
1273     check_overlap_2(dofs, aofs, maxsz);
1274 
1275     type = 0;
1276     if (g->fniv) {
1277         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1278     }
1279     switch (type) {
1280     case TCG_TYPE_V256:
1281         /* Recall that ARM SVE allows vector sizes that are not a
1282          * power of 2, but always a multiple of 16.  The intent is
1283          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1284          */
1285         some = QEMU_ALIGN_DOWN(oprsz, 32);
1286         expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1287                       c, g->load_dest, g->fniv);
1288         if (some == oprsz) {
1289             break;
1290         }
1291         dofs += some;
1292         aofs += some;
1293         oprsz -= some;
1294         maxsz -= some;
1295         /* fallthru */
1296     case TCG_TYPE_V128:
1297         expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1298                       c, g->load_dest, g->fniv);
1299         break;
1300     case TCG_TYPE_V64:
1301         expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1302                       c, g->load_dest, g->fniv);
1303         break;
1304 
1305     case 0:
1306         if (g->fni8 && check_size_impl(oprsz, 8)) {
1307             expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1308         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1309             expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1310         } else {
1311             if (g->fno) {
1312                 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1313             } else {
1314                 TCGv_i64 tcg_c = tcg_constant_i64(c);
1315                 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1316                                     maxsz, c, g->fnoi);
1317             }
1318             oprsz = maxsz;
1319         }
1320         break;
1321 
1322     default:
1323         g_assert_not_reached();
1324     }
1325     tcg_swap_vecop_list(hold_list);
1326 
1327     if (oprsz < maxsz) {
1328         expand_clr(dofs + oprsz, maxsz - oprsz);
1329     }
1330 }
1331 
1332 /* Expand a vector operation with two vectors and a scalar.  */
1333 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1334                      uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1335 {
1336     TCGType type;
1337 
1338     check_size_align(oprsz, maxsz, dofs | aofs);
1339     check_overlap_2(dofs, aofs, maxsz);
1340 
1341     type = 0;
1342     if (g->fniv) {
1343         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1344     }
1345     if (type != 0) {
1346         const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1347         const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1348         TCGv_vec t_vec = tcg_temp_new_vec(type);
1349         uint32_t some;
1350 
1351         tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1352 
1353         switch (type) {
1354         case TCG_TYPE_V256:
1355             /* Recall that ARM SVE allows vector sizes that are not a
1356              * power of 2, but always a multiple of 16.  The intent is
1357              * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1358              */
1359             some = QEMU_ALIGN_DOWN(oprsz, 32);
1360             expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1361                           t_vec, g->scalar_first, g->fniv);
1362             if (some == oprsz) {
1363                 break;
1364             }
1365             dofs += some;
1366             aofs += some;
1367             oprsz -= some;
1368             maxsz -= some;
1369             /* fallthru */
1370 
1371         case TCG_TYPE_V128:
1372             expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1373                           t_vec, g->scalar_first, g->fniv);
1374             break;
1375 
1376         case TCG_TYPE_V64:
1377             expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1378                           t_vec, g->scalar_first, g->fniv);
1379             break;
1380 
1381         default:
1382             g_assert_not_reached();
1383         }
1384         tcg_temp_free_vec(t_vec);
1385         tcg_swap_vecop_list(hold_list);
1386     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1387         TCGv_i64 t64 = tcg_temp_new_i64();
1388 
1389         tcg_gen_dup_i64(g->vece, t64, c);
1390         expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1391         tcg_temp_free_i64(t64);
1392     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1393         TCGv_i32 t32 = tcg_temp_new_i32();
1394 
1395         tcg_gen_extrl_i64_i32(t32, c);
1396         tcg_gen_dup_i32(g->vece, t32, t32);
1397         expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1398         tcg_temp_free_i32(t32);
1399     } else {
1400         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1401         return;
1402     }
1403 
1404     if (oprsz < maxsz) {
1405         expand_clr(dofs + oprsz, maxsz - oprsz);
1406     }
1407 }
1408 
1409 /* Expand a vector three-operand operation.  */
1410 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1411                     uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1412 {
1413     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1414     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1415     TCGType type;
1416     uint32_t some;
1417 
1418     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1419     check_overlap_3(dofs, aofs, bofs, maxsz);
1420 
1421     type = 0;
1422     if (g->fniv) {
1423         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1424     }
1425     switch (type) {
1426     case TCG_TYPE_V256:
1427         /* Recall that ARM SVE allows vector sizes that are not a
1428          * power of 2, but always a multiple of 16.  The intent is
1429          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1430          */
1431         some = QEMU_ALIGN_DOWN(oprsz, 32);
1432         expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1433                      g->load_dest, g->fniv);
1434         if (some == oprsz) {
1435             break;
1436         }
1437         dofs += some;
1438         aofs += some;
1439         bofs += some;
1440         oprsz -= some;
1441         maxsz -= some;
1442         /* fallthru */
1443     case TCG_TYPE_V128:
1444         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1445                      g->load_dest, g->fniv);
1446         break;
1447     case TCG_TYPE_V64:
1448         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1449                      g->load_dest, g->fniv);
1450         break;
1451 
1452     case 0:
1453         if (g->fni8 && check_size_impl(oprsz, 8)) {
1454             expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1455         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1456             expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1457         } else {
1458             assert(g->fno != NULL);
1459             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
1460                                maxsz, g->data, g->fno);
1461             oprsz = maxsz;
1462         }
1463         break;
1464 
1465     default:
1466         g_assert_not_reached();
1467     }
1468     tcg_swap_vecop_list(hold_list);
1469 
1470     if (oprsz < maxsz) {
1471         expand_clr(dofs + oprsz, maxsz - oprsz);
1472     }
1473 }
1474 
1475 /* Expand a vector operation with three vectors and an immediate.  */
1476 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1477                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1478                      const GVecGen3i *g)
1479 {
1480     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1481     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1482     TCGType type;
1483     uint32_t some;
1484 
1485     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1486     check_overlap_3(dofs, aofs, bofs, maxsz);
1487 
1488     type = 0;
1489     if (g->fniv) {
1490         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1491     }
1492     switch (type) {
1493     case TCG_TYPE_V256:
1494         /*
1495          * Recall that ARM SVE allows vector sizes that are not a
1496          * power of 2, but always a multiple of 16.  The intent is
1497          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1498          */
1499         some = QEMU_ALIGN_DOWN(oprsz, 32);
1500         expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1501                       c, g->load_dest, g->fniv);
1502         if (some == oprsz) {
1503             break;
1504         }
1505         dofs += some;
1506         aofs += some;
1507         bofs += some;
1508         oprsz -= some;
1509         maxsz -= some;
1510         /* fallthru */
1511     case TCG_TYPE_V128:
1512         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1513                       c, g->load_dest, g->fniv);
1514         break;
1515     case TCG_TYPE_V64:
1516         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1517                       c, g->load_dest, g->fniv);
1518         break;
1519 
1520     case 0:
1521         if (g->fni8 && check_size_impl(oprsz, 8)) {
1522             expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
1523         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1524             expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
1525         } else {
1526             assert(g->fno != NULL);
1527             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
1528             oprsz = maxsz;
1529         }
1530         break;
1531 
1532     default:
1533         g_assert_not_reached();
1534     }
1535     tcg_swap_vecop_list(hold_list);
1536 
1537     if (oprsz < maxsz) {
1538         expand_clr(dofs + oprsz, maxsz - oprsz);
1539     }
1540 }
1541 
1542 /* Expand a vector four-operand operation.  */
1543 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1544                     uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1545 {
1546     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1547     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1548     TCGType type;
1549     uint32_t some;
1550 
1551     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1552     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1553 
1554     type = 0;
1555     if (g->fniv) {
1556         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1557     }
1558     switch (type) {
1559     case TCG_TYPE_V256:
1560         /* Recall that ARM SVE allows vector sizes that are not a
1561          * power of 2, but always a multiple of 16.  The intent is
1562          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1563          */
1564         some = QEMU_ALIGN_DOWN(oprsz, 32);
1565         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1566                      32, TCG_TYPE_V256, g->write_aofs, g->fniv);
1567         if (some == oprsz) {
1568             break;
1569         }
1570         dofs += some;
1571         aofs += some;
1572         bofs += some;
1573         cofs += some;
1574         oprsz -= some;
1575         maxsz -= some;
1576         /* fallthru */
1577     case TCG_TYPE_V128:
1578         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1579                      16, TCG_TYPE_V128, g->write_aofs, g->fniv);
1580         break;
1581     case TCG_TYPE_V64:
1582         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1583                      8, TCG_TYPE_V64, g->write_aofs, g->fniv);
1584         break;
1585 
1586     case 0:
1587         if (g->fni8 && check_size_impl(oprsz, 8)) {
1588             expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
1589                          g->write_aofs, g->fni8);
1590         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1591             expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
1592                          g->write_aofs, g->fni4);
1593         } else {
1594             assert(g->fno != NULL);
1595             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1596                                oprsz, maxsz, g->data, g->fno);
1597             oprsz = maxsz;
1598         }
1599         break;
1600 
1601     default:
1602         g_assert_not_reached();
1603     }
1604     tcg_swap_vecop_list(hold_list);
1605 
1606     if (oprsz < maxsz) {
1607         expand_clr(dofs + oprsz, maxsz - oprsz);
1608     }
1609 }
1610 
1611 /* Expand a vector four-operand operation with an immediate operand.  */
1612 void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1613                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1614                      const GVecGen4i *g)
1615 {
1616     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1617     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1618     TCGType type;
1619     uint32_t some;
1620 
1621     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1622     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1623 
1624     type = 0;
1625     if (g->fniv) {
1626         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1627     }
1628     switch (type) {
1629     case TCG_TYPE_V256:
1630         /*
1631          * Recall that ARM SVE allows vector sizes that are not a
1632          * power of 2, but always a multiple of 16.  The intent is
1633          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1634          */
1635         some = QEMU_ALIGN_DOWN(oprsz, 32);
1636         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, some,
1637                       32, TCG_TYPE_V256, c, g->fniv);
1638         if (some == oprsz) {
1639             break;
1640         }
1641         dofs += some;
1642         aofs += some;
1643         bofs += some;
1644         cofs += some;
1645         oprsz -= some;
1646         maxsz -= some;
1647         /* fallthru */
1648     case TCG_TYPE_V128:
1649         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1650                       16, TCG_TYPE_V128, c, g->fniv);
1651         break;
1652     case TCG_TYPE_V64:
1653         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1654                       8, TCG_TYPE_V64, c, g->fniv);
1655         break;
1656 
1657     case 0:
1658         if (g->fni8 && check_size_impl(oprsz, 8)) {
1659             expand_4i_i64(dofs, aofs, bofs, cofs, oprsz, c, g->fni8);
1660         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1661             expand_4i_i32(dofs, aofs, bofs, cofs, oprsz, c, g->fni4);
1662         } else {
1663             assert(g->fno != NULL);
1664             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1665                                oprsz, maxsz, c, g->fno);
1666             oprsz = maxsz;
1667         }
1668         break;
1669 
1670     default:
1671         g_assert_not_reached();
1672     }
1673     tcg_swap_vecop_list(hold_list);
1674 
1675     if (oprsz < maxsz) {
1676         expand_clr(dofs + oprsz, maxsz - oprsz);
1677     }
1678 }
1679 
1680 /*
1681  * Expand specific vector operations.
1682  */
1683 
1684 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
1685 {
1686     tcg_gen_mov_vec(a, b);
1687 }
1688 
1689 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1690                       uint32_t oprsz, uint32_t maxsz)
1691 {
1692     static const GVecGen2 g = {
1693         .fni8 = tcg_gen_mov_i64,
1694         .fniv = vec_mov2,
1695         .fno = gen_helper_gvec_mov,
1696         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1697     };
1698     if (dofs != aofs) {
1699         tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1700     } else {
1701         check_size_align(oprsz, maxsz, dofs);
1702         if (oprsz < maxsz) {
1703             expand_clr(dofs + oprsz, maxsz - oprsz);
1704         }
1705     }
1706 }
1707 
1708 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
1709                           uint32_t maxsz, TCGv_i32 in)
1710 {
1711     check_size_align(oprsz, maxsz, dofs);
1712     tcg_debug_assert(vece <= MO_32);
1713     do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1714 }
1715 
1716 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
1717                           uint32_t maxsz, TCGv_i64 in)
1718 {
1719     check_size_align(oprsz, maxsz, dofs);
1720     tcg_debug_assert(vece <= MO_64);
1721     do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1722 }
1723 
1724 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1725                           uint32_t oprsz, uint32_t maxsz)
1726 {
1727     check_size_align(oprsz, maxsz, dofs);
1728     if (vece <= MO_64) {
1729         TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
1730         if (type != 0) {
1731             TCGv_vec t_vec = tcg_temp_new_vec(type);
1732             tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
1733             do_dup_store(type, dofs, oprsz, maxsz, t_vec);
1734             tcg_temp_free_vec(t_vec);
1735         } else if (vece <= MO_32) {
1736             TCGv_i32 in = tcg_temp_ebb_new_i32();
1737             switch (vece) {
1738             case MO_8:
1739                 tcg_gen_ld8u_i32(in, cpu_env, aofs);
1740                 break;
1741             case MO_16:
1742                 tcg_gen_ld16u_i32(in, cpu_env, aofs);
1743                 break;
1744             default:
1745                 tcg_gen_ld_i32(in, cpu_env, aofs);
1746                 break;
1747             }
1748             do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1749             tcg_temp_free_i32(in);
1750         } else {
1751             TCGv_i64 in = tcg_temp_ebb_new_i64();
1752             tcg_gen_ld_i64(in, cpu_env, aofs);
1753             do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1754             tcg_temp_free_i64(in);
1755         }
1756     } else if (vece == 4) {
1757         /* 128-bit duplicate.  */
1758         int i;
1759 
1760         tcg_debug_assert(oprsz >= 16);
1761         if (TCG_TARGET_HAS_v128) {
1762             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1763 
1764             tcg_gen_ld_vec(in, cpu_env, aofs);
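            /* If aofs == dofs, element 0 is already in place; skip it.  */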
1765             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1766                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1767             }
1768             tcg_temp_free_vec(in);
1769         } else {
1770             TCGv_i64 in0 = tcg_temp_ebb_new_i64();
1771             TCGv_i64 in1 = tcg_temp_ebb_new_i64();
1772 
1773             tcg_gen_ld_i64(in0, cpu_env, aofs);
1774             tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
1775             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1776                 tcg_gen_st_i64(in0, cpu_env, dofs + i);
1777                 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
1778             }
1779             tcg_temp_free_i64(in0);
1780             tcg_temp_free_i64(in1);
1781         }
1782         if (oprsz < maxsz) {
1783             expand_clr(dofs + oprsz, maxsz - oprsz);
1784         }
1785     } else if (vece == 5) {
1786         /* 256-bit duplicate.  */
1787         int i;
1788 
1789         tcg_debug_assert(oprsz >= 32);
1790         tcg_debug_assert(oprsz % 32 == 0);
1791         if (TCG_TARGET_HAS_v256) {
1792             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
1793 
1794             tcg_gen_ld_vec(in, cpu_env, aofs);
1795             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1796                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1797             }
1798             tcg_temp_free_vec(in);
1799         } else if (TCG_TARGET_HAS_v128) {
1800             TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
1801             TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
1802 
1803             tcg_gen_ld_vec(in0, cpu_env, aofs);
1804             tcg_gen_ld_vec(in1, cpu_env, aofs + 16);
1805             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1806                 tcg_gen_st_vec(in0, cpu_env, dofs + i);
1807                 tcg_gen_st_vec(in1, cpu_env, dofs + i + 16);
1808             }
1809             tcg_temp_free_vec(in0);
1810             tcg_temp_free_vec(in1);
1811         } else {
1812             TCGv_i64 in[4];
1813             int j;
1814 
1815             for (j = 0; j < 4; ++j) {
1816                 in[j] = tcg_temp_ebb_new_i64();
1817                 tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8);
1818             }
1819             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1820                 for (j = 0; j < 4; ++j) {
1821                     tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8);
1822                 }
1823             }
1824             for (j = 0; j < 4; ++j) {
1825                 tcg_temp_free_i64(in[j]);
1826             }
1827         }
1828         if (oprsz < maxsz) {
1829             expand_clr(dofs + oprsz, maxsz - oprsz);
1830         }
1831     } else {
1832         g_assert_not_reached();
1833     }
1834 }
1835 
1836 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
1837                           uint32_t maxsz, uint64_t x)
1838 {
1839     check_size_align(oprsz, maxsz, dofs);
1840     do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
1841 }
1842 
1843 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1844                       uint32_t oprsz, uint32_t maxsz)
1845 {
1846     static const GVecGen2 g = {
1847         .fni8 = tcg_gen_not_i64,
1848         .fniv = tcg_gen_not_vec,
1849         .fno = gen_helper_gvec_not,
1850         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1851     };
1852     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1853 }
1854 
1855 /* Perform a vector addition using normal addition and a mask.  The mask
1856    should be the sign bit of each lane.  This 6-operation form is more
1857    efficient than separate additions when there are 4 or more lanes in
1858    the 64-bit operation.  */
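/*
 * Worked example for MO_8, where m == dup_const(MO_8, 0x80).  Take one
 * lane with a = 0xff, b = 0x01:
 *   t1 = a & ~m = 0x7f,  t2 = b & ~m = 0x01,  t1 + t2 = 0x80;
 *   t3 = (a ^ b) & m = 0x80,  d = 0x80 ^ 0x80 = 0x00 = (a + b) & 0xff.
 * Clearing the sign bits before the wide add means no lane can carry
 * into its neighbour; the final xor repairs each lane's sign bit.
 */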
1859 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1860 {
1861     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
1862     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
1863     TCGv_i64 t3 = tcg_temp_ebb_new_i64();
1864 
1865     tcg_gen_andc_i64(t1, a, m);
1866     tcg_gen_andc_i64(t2, b, m);
1867     tcg_gen_xor_i64(t3, a, b);
1868     tcg_gen_add_i64(d, t1, t2);
1869     tcg_gen_and_i64(t3, t3, m);
1870     tcg_gen_xor_i64(d, d, t3);
1871 
1872     tcg_temp_free_i64(t1);
1873     tcg_temp_free_i64(t2);
1874     tcg_temp_free_i64(t3);
1875 }
1876 
1877 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1878 {
1879     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
1880     gen_addv_mask(d, a, b, m);
1881 }
1882 
1883 void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1884 {
1885     TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
1886     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
1887     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
1888     TCGv_i32 t3 = tcg_temp_ebb_new_i32();
1889 
1890     tcg_gen_andc_i32(t1, a, m);
1891     tcg_gen_andc_i32(t2, b, m);
1892     tcg_gen_xor_i32(t3, a, b);
1893     tcg_gen_add_i32(d, t1, t2);
1894     tcg_gen_and_i32(t3, t3, m);
1895     tcg_gen_xor_i32(d, d, t3);
1896 
1897     tcg_temp_free_i32(t1);
1898     tcg_temp_free_i32(t2);
1899     tcg_temp_free_i32(t3);
1900 }
1901 
1902 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1903 {
1904     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
1905     gen_addv_mask(d, a, b, m);
1906 }
1907 
1908 void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1909 {
1910     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
1911     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
1912 
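    /*
     * t2 holds the correct low 16 bits of the sum.  t1 adds b to a with
     * a's low half cleared, so no carry out of the low half can corrupt
     * the high half; deposit then recombines the two results.
     */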
1913     tcg_gen_andi_i32(t1, a, ~0xffff);
1914     tcg_gen_add_i32(t2, a, b);
1915     tcg_gen_add_i32(t1, t1, b);
1916     tcg_gen_deposit_i32(d, t1, t2, 0, 16);
1917 
1918     tcg_temp_free_i32(t1);
1919     tcg_temp_free_i32(t2);
1920 }
1921 
1922 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1923 {
1924     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
1925     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
1926 
1927     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1928     tcg_gen_add_i64(t2, a, b);
1929     tcg_gen_add_i64(t1, t1, b);
1930     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1931 
1932     tcg_temp_free_i64(t1);
1933     tcg_temp_free_i64(t2);
1934 }
1935 
1936 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1937 
1938 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1939                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1940 {
1941     static const GVecGen3 g[4] = {
1942         { .fni8 = tcg_gen_vec_add8_i64,
1943           .fniv = tcg_gen_add_vec,
1944           .fno = gen_helper_gvec_add8,
1945           .opt_opc = vecop_list_add,
1946           .vece = MO_8 },
1947         { .fni8 = tcg_gen_vec_add16_i64,
1948           .fniv = tcg_gen_add_vec,
1949           .fno = gen_helper_gvec_add16,
1950           .opt_opc = vecop_list_add,
1951           .vece = MO_16 },
1952         { .fni4 = tcg_gen_add_i32,
1953           .fniv = tcg_gen_add_vec,
1954           .fno = gen_helper_gvec_add32,
1955           .opt_opc = vecop_list_add,
1956           .vece = MO_32 },
1957         { .fni8 = tcg_gen_add_i64,
1958           .fniv = tcg_gen_add_vec,
1959           .fno = gen_helper_gvec_add64,
1960           .opt_opc = vecop_list_add,
1961           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1962           .vece = MO_64 },
1963     };
1964 
1965     tcg_debug_assert(vece <= MO_64);
1966     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1967 }
1968 
1969 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1970                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1971 {
1972     static const GVecGen2s g[4] = {
1973         { .fni8 = tcg_gen_vec_add8_i64,
1974           .fniv = tcg_gen_add_vec,
1975           .fno = gen_helper_gvec_adds8,
1976           .opt_opc = vecop_list_add,
1977           .vece = MO_8 },
1978         { .fni8 = tcg_gen_vec_add16_i64,
1979           .fniv = tcg_gen_add_vec,
1980           .fno = gen_helper_gvec_adds16,
1981           .opt_opc = vecop_list_add,
1982           .vece = MO_16 },
1983         { .fni4 = tcg_gen_add_i32,
1984           .fniv = tcg_gen_add_vec,
1985           .fno = gen_helper_gvec_adds32,
1986           .opt_opc = vecop_list_add,
1987           .vece = MO_32 },
1988         { .fni8 = tcg_gen_add_i64,
1989           .fniv = tcg_gen_add_vec,
1990           .fno = gen_helper_gvec_adds64,
1991           .opt_opc = vecop_list_add,
1992           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1993           .vece = MO_64 },
1994     };
1995 
1996     tcg_debug_assert(vece <= MO_64);
1997     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1998 }
1999 
2000 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
2001                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2002 {
2003     TCGv_i64 tmp = tcg_constant_i64(c);
2004     tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
2005 }
2006 
2007 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
2008 
2009 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
2010                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2011 {
2012     static const GVecGen2s g[4] = {
2013         { .fni8 = tcg_gen_vec_sub8_i64,
2014           .fniv = tcg_gen_sub_vec,
2015           .fno = gen_helper_gvec_subs8,
2016           .opt_opc = vecop_list_sub,
2017           .vece = MO_8 },
2018         { .fni8 = tcg_gen_vec_sub16_i64,
2019           .fniv = tcg_gen_sub_vec,
2020           .fno = gen_helper_gvec_subs16,
2021           .opt_opc = vecop_list_sub,
2022           .vece = MO_16 },
2023         { .fni4 = tcg_gen_sub_i32,
2024           .fniv = tcg_gen_sub_vec,
2025           .fno = gen_helper_gvec_subs32,
2026           .opt_opc = vecop_list_sub,
2027           .vece = MO_32 },
2028         { .fni8 = tcg_gen_sub_i64,
2029           .fniv = tcg_gen_sub_vec,
2030           .fno = gen_helper_gvec_subs64,
2031           .opt_opc = vecop_list_sub,
2032           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2033           .vece = MO_64 },
2034     };
2035 
2036     tcg_debug_assert(vece <= MO_64);
2037     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2038 }
2039 
2040 /* Perform a vector subtraction using normal subtraction and a mask.
2041    Compare gen_addv_mask above.  */
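/*
 * Setting the sign bit in each lane of A (t1 = a | m) and clearing it in
 * each lane of B (t2 = b & ~m) guarantees t1 - t2 never borrows across a
 * lane boundary; t3 = ~(a ^ b) & m then repairs each lane's sign bit.
 */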
2042 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
2043 {
2044     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2045     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2046     TCGv_i64 t3 = tcg_temp_ebb_new_i64();
2047 
2048     tcg_gen_or_i64(t1, a, m);
2049     tcg_gen_andc_i64(t2, b, m);
2050     tcg_gen_eqv_i64(t3, a, b);
2051     tcg_gen_sub_i64(d, t1, t2);
2052     tcg_gen_and_i64(t3, t3, m);
2053     tcg_gen_xor_i64(d, d, t3);
2054 
2055     tcg_temp_free_i64(t1);
2056     tcg_temp_free_i64(t2);
2057     tcg_temp_free_i64(t3);
2058 }
2059 
2060 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2061 {
2062     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2063     gen_subv_mask(d, a, b, m);
2064 }
2065 
2066 void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2067 {
2068     TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
2069     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
2070     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
2071     TCGv_i32 t3 = tcg_temp_ebb_new_i32();
2072 
2073     tcg_gen_or_i32(t1, a, m);
2074     tcg_gen_andc_i32(t2, b, m);
2075     tcg_gen_eqv_i32(t3, a, b);
2076     tcg_gen_sub_i32(d, t1, t2);
2077     tcg_gen_and_i32(t3, t3, m);
2078     tcg_gen_xor_i32(d, d, t3);
2079 
2080     tcg_temp_free_i32(t1);
2081     tcg_temp_free_i32(t2);
2082     tcg_temp_free_i32(t3);
2083 }
2084 
2085 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2086 {
2087     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2088     gen_subv_mask(d, a, b, m);
2089 }
2090 
2091 void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2092 {
2093     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
2094     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
2095 
2096     tcg_gen_andi_i32(t1, b, ~0xffff);
2097     tcg_gen_sub_i32(t2, a, b);
2098     tcg_gen_sub_i32(t1, a, t1);
2099     tcg_gen_deposit_i32(d, t1, t2, 0, 16);
2100 
2101     tcg_temp_free_i32(t1);
2102     tcg_temp_free_i32(t2);
2103 }
2104 
2105 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2106 {
2107     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2108     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2109 
2110     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2111     tcg_gen_sub_i64(t2, a, b);
2112     tcg_gen_sub_i64(t1, a, t1);
2113     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2114 
2115     tcg_temp_free_i64(t1);
2116     tcg_temp_free_i64(t2);
2117 }
2118 
2119 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
2120                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2121 {
2122     static const GVecGen3 g[4] = {
2123         { .fni8 = tcg_gen_vec_sub8_i64,
2124           .fniv = tcg_gen_sub_vec,
2125           .fno = gen_helper_gvec_sub8,
2126           .opt_opc = vecop_list_sub,
2127           .vece = MO_8 },
2128         { .fni8 = tcg_gen_vec_sub16_i64,
2129           .fniv = tcg_gen_sub_vec,
2130           .fno = gen_helper_gvec_sub16,
2131           .opt_opc = vecop_list_sub,
2132           .vece = MO_16 },
2133         { .fni4 = tcg_gen_sub_i32,
2134           .fniv = tcg_gen_sub_vec,
2135           .fno = gen_helper_gvec_sub32,
2136           .opt_opc = vecop_list_sub,
2137           .vece = MO_32 },
2138         { .fni8 = tcg_gen_sub_i64,
2139           .fniv = tcg_gen_sub_vec,
2140           .fno = gen_helper_gvec_sub64,
2141           .opt_opc = vecop_list_sub,
2142           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2143           .vece = MO_64 },
2144     };
2145 
2146     tcg_debug_assert(vece <= MO_64);
2147     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2148 }
2149 
2150 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
2151 
2152 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
2153                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2154 {
2155     static const GVecGen3 g[4] = {
2156         { .fniv = tcg_gen_mul_vec,
2157           .fno = gen_helper_gvec_mul8,
2158           .opt_opc = vecop_list_mul,
2159           .vece = MO_8 },
2160         { .fniv = tcg_gen_mul_vec,
2161           .fno = gen_helper_gvec_mul16,
2162           .opt_opc = vecop_list_mul,
2163           .vece = MO_16 },
2164         { .fni4 = tcg_gen_mul_i32,
2165           .fniv = tcg_gen_mul_vec,
2166           .fno = gen_helper_gvec_mul32,
2167           .opt_opc = vecop_list_mul,
2168           .vece = MO_32 },
2169         { .fni8 = tcg_gen_mul_i64,
2170           .fniv = tcg_gen_mul_vec,
2171           .fno = gen_helper_gvec_mul64,
2172           .opt_opc = vecop_list_mul,
2173           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2174           .vece = MO_64 },
2175     };
2176 
2177     tcg_debug_assert(vece <= MO_64);
2178     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2179 }
2180 
2181 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
2182                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2183 {
2184     static const GVecGen2s g[4] = {
2185         { .fniv = tcg_gen_mul_vec,
2186           .fno = gen_helper_gvec_muls8,
2187           .opt_opc = vecop_list_mul,
2188           .vece = MO_8 },
2189         { .fniv = tcg_gen_mul_vec,
2190           .fno = gen_helper_gvec_muls16,
2191           .opt_opc = vecop_list_mul,
2192           .vece = MO_16 },
2193         { .fni4 = tcg_gen_mul_i32,
2194           .fniv = tcg_gen_mul_vec,
2195           .fno = gen_helper_gvec_muls32,
2196           .opt_opc = vecop_list_mul,
2197           .vece = MO_32 },
2198         { .fni8 = tcg_gen_mul_i64,
2199           .fniv = tcg_gen_mul_vec,
2200           .fno = gen_helper_gvec_muls64,
2201           .opt_opc = vecop_list_mul,
2202           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2203           .vece = MO_64 },
2204     };
2205 
2206     tcg_debug_assert(vece <= MO_64);
2207     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2208 }
2209 
2210 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
2211                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2212 {
2213     TCGv_i64 tmp = tcg_constant_i64(c);
2214     tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
2215 }
2216 
2217 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2218                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2219 {
2220     static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
2221     static const GVecGen3 g[4] = {
2222         { .fniv = tcg_gen_ssadd_vec,
2223           .fno = gen_helper_gvec_ssadd8,
2224           .opt_opc = vecop_list,
2225           .vece = MO_8 },
2226         { .fniv = tcg_gen_ssadd_vec,
2227           .fno = gen_helper_gvec_ssadd16,
2228           .opt_opc = vecop_list,
2229           .vece = MO_16 },
2230         { .fniv = tcg_gen_ssadd_vec,
2231           .fno = gen_helper_gvec_ssadd32,
2232           .opt_opc = vecop_list,
2233           .vece = MO_32 },
2234         { .fniv = tcg_gen_ssadd_vec,
2235           .fno = gen_helper_gvec_ssadd64,
2236           .opt_opc = vecop_list,
2237           .vece = MO_64 },
2238     };
2239     tcg_debug_assert(vece <= MO_64);
2240     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2241 }
2242 
2243 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
2244                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2245 {
2246     static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
2247     static const GVecGen3 g[4] = {
2248         { .fniv = tcg_gen_sssub_vec,
2249           .fno = gen_helper_gvec_sssub8,
2250           .opt_opc = vecop_list,
2251           .vece = MO_8 },
2252         { .fniv = tcg_gen_sssub_vec,
2253           .fno = gen_helper_gvec_sssub16,
2254           .opt_opc = vecop_list,
2255           .vece = MO_16 },
2256         { .fniv = tcg_gen_sssub_vec,
2257           .fno = gen_helper_gvec_sssub32,
2258           .opt_opc = vecop_list,
2259           .vece = MO_32 },
2260         { .fniv = tcg_gen_sssub_vec,
2261           .fno = gen_helper_gvec_sssub64,
2262           .opt_opc = vecop_list,
2263           .vece = MO_64 },
2264     };
2265     tcg_debug_assert(vece <= MO_64);
2266     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2267 }
2268 
2269 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2270 {
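    /* Unsigned saturating add: if the sum wrapped (d < a), clamp to ~0.  */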
2271     TCGv_i32 max = tcg_constant_i32(-1);
2272     tcg_gen_add_i32(d, a, b);
2273     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
2274 }
2275 
2276 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2277 {
2278     TCGv_i64 max = tcg_constant_i64(-1);
2279     tcg_gen_add_i64(d, a, b);
2280     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
2281 }
2282 
2283 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2284                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2285 {
2286     static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
2287     static const GVecGen3 g[4] = {
2288         { .fniv = tcg_gen_usadd_vec,
2289           .fno = gen_helper_gvec_usadd8,
2290           .opt_opc = vecop_list,
2291           .vece = MO_8 },
2292         { .fniv = tcg_gen_usadd_vec,
2293           .fno = gen_helper_gvec_usadd16,
2294           .opt_opc = vecop_list,
2295           .vece = MO_16 },
2296         { .fni4 = tcg_gen_usadd_i32,
2297           .fniv = tcg_gen_usadd_vec,
2298           .fno = gen_helper_gvec_usadd32,
2299           .opt_opc = vecop_list,
2300           .vece = MO_32 },
2301         { .fni8 = tcg_gen_usadd_i64,
2302           .fniv = tcg_gen_usadd_vec,
2303           .fno = gen_helper_gvec_usadd64,
2304           .opt_opc = vecop_list,
2305           .vece = MO_64 }
2306     };
2307     tcg_debug_assert(vece <= MO_64);
2308     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2309 }
2310 
2311 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2312 {
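    /* Saturating sub: if a < b the result underflows, so clamp to 0.  */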
2313     TCGv_i32 min = tcg_constant_i32(0);
2314     tcg_gen_sub_i32(d, a, b);
2315     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
2316 }
2317 
2318 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2319 {
2320     TCGv_i64 min = tcg_constant_i64(0);
2321     tcg_gen_sub_i64(d, a, b);
2322     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
2323 }
2324 
2325 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
2326                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2327 {
2328     static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2329     static const GVecGen3 g[4] = {
2330         { .fniv = tcg_gen_ussub_vec,
2331           .fno = gen_helper_gvec_ussub8,
2332           .opt_opc = vecop_list,
2333           .vece = MO_8 },
2334         { .fniv = tcg_gen_ussub_vec,
2335           .fno = gen_helper_gvec_ussub16,
2336           .opt_opc = vecop_list,
2337           .vece = MO_16 },
2338         { .fni4 = tcg_gen_ussub_i32,
2339           .fniv = tcg_gen_ussub_vec,
2340           .fno = gen_helper_gvec_ussub32,
2341           .opt_opc = vecop_list,
2342           .vece = MO_32 },
2343         { .fni8 = tcg_gen_ussub_i64,
2344           .fniv = tcg_gen_ussub_vec,
2345           .fno = gen_helper_gvec_ussub64,
2346           .opt_opc = vecop_list,
2347           .vece = MO_64 }
2348     };
2349     tcg_debug_assert(vece <= MO_64);
2350     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2351 }
2352 
2353 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2354                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2355 {
2356     static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2357     static const GVecGen3 g[4] = {
2358         { .fniv = tcg_gen_smin_vec,
2359           .fno = gen_helper_gvec_smin8,
2360           .opt_opc = vecop_list,
2361           .vece = MO_8 },
2362         { .fniv = tcg_gen_smin_vec,
2363           .fno = gen_helper_gvec_smin16,
2364           .opt_opc = vecop_list,
2365           .vece = MO_16 },
2366         { .fni4 = tcg_gen_smin_i32,
2367           .fniv = tcg_gen_smin_vec,
2368           .fno = gen_helper_gvec_smin32,
2369           .opt_opc = vecop_list,
2370           .vece = MO_32 },
2371         { .fni8 = tcg_gen_smin_i64,
2372           .fniv = tcg_gen_smin_vec,
2373           .fno = gen_helper_gvec_smin64,
2374           .opt_opc = vecop_list,
2375           .vece = MO_64 }
2376     };
2377     tcg_debug_assert(vece <= MO_64);
2378     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2379 }
2380 
2381 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2382                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2383 {
2384     static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2385     static const GVecGen3 g[4] = {
2386         { .fniv = tcg_gen_umin_vec,
2387           .fno = gen_helper_gvec_umin8,
2388           .opt_opc = vecop_list,
2389           .vece = MO_8 },
2390         { .fniv = tcg_gen_umin_vec,
2391           .fno = gen_helper_gvec_umin16,
2392           .opt_opc = vecop_list,
2393           .vece = MO_16 },
2394         { .fni4 = tcg_gen_umin_i32,
2395           .fniv = tcg_gen_umin_vec,
2396           .fno = gen_helper_gvec_umin32,
2397           .opt_opc = vecop_list,
2398           .vece = MO_32 },
2399         { .fni8 = tcg_gen_umin_i64,
2400           .fniv = tcg_gen_umin_vec,
2401           .fno = gen_helper_gvec_umin64,
2402           .opt_opc = vecop_list,
2403           .vece = MO_64 }
2404     };
2405     tcg_debug_assert(vece <= MO_64);
2406     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2407 }
2408 
2409 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2410                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2411 {
2412     static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2413     static const GVecGen3 g[4] = {
2414         { .fniv = tcg_gen_smax_vec,
2415           .fno = gen_helper_gvec_smax8,
2416           .opt_opc = vecop_list,
2417           .vece = MO_8 },
2418         { .fniv = tcg_gen_smax_vec,
2419           .fno = gen_helper_gvec_smax16,
2420           .opt_opc = vecop_list,
2421           .vece = MO_16 },
2422         { .fni4 = tcg_gen_smax_i32,
2423           .fniv = tcg_gen_smax_vec,
2424           .fno = gen_helper_gvec_smax32,
2425           .opt_opc = vecop_list,
2426           .vece = MO_32 },
2427         { .fni8 = tcg_gen_smax_i64,
2428           .fniv = tcg_gen_smax_vec,
2429           .fno = gen_helper_gvec_smax64,
2430           .opt_opc = vecop_list,
2431           .vece = MO_64 }
2432     };
2433     tcg_debug_assert(vece <= MO_64);
2434     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2435 }
2436 
2437 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2438                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2439 {
2440     static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2441     static const GVecGen3 g[4] = {
2442         { .fniv = tcg_gen_umax_vec,
2443           .fno = gen_helper_gvec_umax8,
2444           .opt_opc = vecop_list,
2445           .vece = MO_8 },
2446         { .fniv = tcg_gen_umax_vec,
2447           .fno = gen_helper_gvec_umax16,
2448           .opt_opc = vecop_list,
2449           .vece = MO_16 },
2450         { .fni4 = tcg_gen_umax_i32,
2451           .fniv = tcg_gen_umax_vec,
2452           .fno = gen_helper_gvec_umax32,
2453           .opt_opc = vecop_list,
2454           .vece = MO_32 },
2455         { .fni8 = tcg_gen_umax_i64,
2456           .fniv = tcg_gen_umax_vec,
2457           .fno = gen_helper_gvec_umax64,
2458           .opt_opc = vecop_list,
2459           .vece = MO_64 }
2460     };
2461     tcg_debug_assert(vece <= MO_64);
2462     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2463 }
2464 
2465 /* Perform a vector negation using normal negation and a mask.
2466    Compare gen_subv_mask above.  */
2467 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2468 {
2469     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2470     TCGv_i64 t3 = tcg_temp_ebb_new_i64();
2471 
2472     tcg_gen_andc_i64(t3, m, b);
2473     tcg_gen_andc_i64(t2, b, m);
2474     tcg_gen_sub_i64(d, m, t2);
2475     tcg_gen_xor_i64(d, d, t3);
2476 
2477     tcg_temp_free_i64(t2);
2478     tcg_temp_free_i64(t3);
2479 }
2480 
2481 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2482 {
2483     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2484     gen_negv_mask(d, b, m);
2485 }
2486 
2487 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2488 {
2489     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2490     gen_negv_mask(d, b, m);
2491 }
2492 
2493 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2494 {
2495     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2496     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2497 
2498     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2499     tcg_gen_neg_i64(t2, b);
2500     tcg_gen_neg_i64(t1, t1);
2501     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2502 
2503     tcg_temp_free_i64(t1);
2504     tcg_temp_free_i64(t2);
2505 }
2506 
2507 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2508                       uint32_t oprsz, uint32_t maxsz)
2509 {
2510     static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2511     static const GVecGen2 g[4] = {
2512         { .fni8 = tcg_gen_vec_neg8_i64,
2513           .fniv = tcg_gen_neg_vec,
2514           .fno = gen_helper_gvec_neg8,
2515           .opt_opc = vecop_list,
2516           .vece = MO_8 },
2517         { .fni8 = tcg_gen_vec_neg16_i64,
2518           .fniv = tcg_gen_neg_vec,
2519           .fno = gen_helper_gvec_neg16,
2520           .opt_opc = vecop_list,
2521           .vece = MO_16 },
2522         { .fni4 = tcg_gen_neg_i32,
2523           .fniv = tcg_gen_neg_vec,
2524           .fno = gen_helper_gvec_neg32,
2525           .opt_opc = vecop_list,
2526           .vece = MO_32 },
2527         { .fni8 = tcg_gen_neg_i64,
2528           .fniv = tcg_gen_neg_vec,
2529           .fno = gen_helper_gvec_neg64,
2530           .opt_opc = vecop_list,
2531           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2532           .vece = MO_64 },
2533     };
2534 
2535     tcg_debug_assert(vece <= MO_64);
2536     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2537 }
2538 
2539 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2540 {
2541     TCGv_i64 t = tcg_temp_ebb_new_i64();
2542     int nbit = 8 << vece;
2543 
2544     /* Create -1 for each negative element.  */
2545     tcg_gen_shri_i64(t, b, nbit - 1);
2546     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2547     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2548 
2549     /*
2550      * Invert (via xor -1) and add one.
2551      * Because of the ordering, the msb is cleared,
2552      * so we never have carry into the next element.
2553      */
2554     tcg_gen_xor_i64(d, b, t);
2555     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2556     tcg_gen_add_i64(d, d, t);
2557 
2558     tcg_temp_free_i64(t);
2559 }
2560 
2561 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2562 {
2563     gen_absv_mask(d, b, MO_8);
2564 }
2565 
2566 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2567 {
2568     gen_absv_mask(d, b, MO_16);
2569 }
2570 
2571 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2572                       uint32_t oprsz, uint32_t maxsz)
2573 {
2574     static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2575     static const GVecGen2 g[4] = {
2576         { .fni8 = tcg_gen_vec_abs8_i64,
2577           .fniv = tcg_gen_abs_vec,
2578           .fno = gen_helper_gvec_abs8,
2579           .opt_opc = vecop_list,
2580           .vece = MO_8 },
2581         { .fni8 = tcg_gen_vec_abs16_i64,
2582           .fniv = tcg_gen_abs_vec,
2583           .fno = gen_helper_gvec_abs16,
2584           .opt_opc = vecop_list,
2585           .vece = MO_16 },
2586         { .fni4 = tcg_gen_abs_i32,
2587           .fniv = tcg_gen_abs_vec,
2588           .fno = gen_helper_gvec_abs32,
2589           .opt_opc = vecop_list,
2590           .vece = MO_32 },
2591         { .fni8 = tcg_gen_abs_i64,
2592           .fniv = tcg_gen_abs_vec,
2593           .fno = gen_helper_gvec_abs64,
2594           .opt_opc = vecop_list,
2595           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2596           .vece = MO_64 },
2597     };
2598 
2599     tcg_debug_assert(vece <= MO_64);
2600     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2601 }
2602 
2603 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2604                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2605 {
2606     static const GVecGen3 g = {
2607         .fni8 = tcg_gen_and_i64,
2608         .fniv = tcg_gen_and_vec,
2609         .fno = gen_helper_gvec_and,
2610         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2611     };
2612 
2613     if (aofs == bofs) {
2614         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2615     } else {
2616         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2617     }
2618 }
2619 
2620 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2621                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2622 {
2623     static const GVecGen3 g = {
2624         .fni8 = tcg_gen_or_i64,
2625         .fniv = tcg_gen_or_vec,
2626         .fno = gen_helper_gvec_or,
2627         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2628     };
2629 
2630     if (aofs == bofs) {
2631         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2632     } else {
2633         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2634     }
2635 }
2636 
2637 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2638                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2639 {
2640     static const GVecGen3 g = {
2641         .fni8 = tcg_gen_xor_i64,
2642         .fniv = tcg_gen_xor_vec,
2643         .fno = gen_helper_gvec_xor,
2644         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2645     };
2646 
2647     if (aofs == bofs) {
2648         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2649     } else {
2650         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2651     }
2652 }
2653 
2654 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2655                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2656 {
2657     static const GVecGen3 g = {
2658         .fni8 = tcg_gen_andc_i64,
2659         .fniv = tcg_gen_andc_vec,
2660         .fno = gen_helper_gvec_andc,
2661         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2662     };
2663 
2664     if (aofs == bofs) {
2665         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2666     } else {
2667         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2668     }
2669 }
2670 
2671 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2672                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2673 {
2674     static const GVecGen3 g = {
2675         .fni8 = tcg_gen_orc_i64,
2676         .fniv = tcg_gen_orc_vec,
2677         .fno = gen_helper_gvec_orc,
2678         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2679     };
2680 
2681     if (aofs == bofs) {
2682         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2683     } else {
2684         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2685     }
2686 }
2687 
2688 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2689                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2690 {
2691     static const GVecGen3 g = {
2692         .fni8 = tcg_gen_nand_i64,
2693         .fniv = tcg_gen_nand_vec,
2694         .fno = gen_helper_gvec_nand,
2695         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2696     };
2697 
2698     if (aofs == bofs) {
2699         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2700     } else {
2701         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2702     }
2703 }
2704 
2705 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2706                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2707 {
2708     static const GVecGen3 g = {
2709         .fni8 = tcg_gen_nor_i64,
2710         .fniv = tcg_gen_nor_vec,
2711         .fno = gen_helper_gvec_nor,
2712         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2713     };
2714 
2715     if (aofs == bofs) {
2716         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2717     } else {
2718         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2719     }
2720 }
2721 
2722 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2723                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2724 {
2725     static const GVecGen3 g = {
2726         .fni8 = tcg_gen_eqv_i64,
2727         .fniv = tcg_gen_eqv_vec,
2728         .fno = gen_helper_gvec_eqv,
2729         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2730     };
2731 
2732     if (aofs == bofs) {
2733         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2734     } else {
2735         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2736     }
2737 }
2738 
2739 static const GVecGen2s gop_ands = {
2740     .fni8 = tcg_gen_and_i64,
2741     .fniv = tcg_gen_and_vec,
2742     .fno = gen_helper_gvec_ands,
2743     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2744     .vece = MO_64
2745 };
2746 
2747 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2748                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2749 {
2750     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2751     tcg_gen_dup_i64(vece, tmp, c);
2752     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2753     tcg_temp_free_i64(tmp);
2754 }
2755 
2756 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2757                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2758 {
2759     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2760     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2761 }
2762 
2763 static const GVecGen2s gop_xors = {
2764     .fni8 = tcg_gen_xor_i64,
2765     .fniv = tcg_gen_xor_vec,
2766     .fno = gen_helper_gvec_xors,
2767     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2768     .vece = MO_64
2769 };
2770 
2771 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2772                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2773 {
2774     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2775     tcg_gen_dup_i64(vece, tmp, c);
2776     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2777     tcg_temp_free_i64(tmp);
2778 }
2779 
2780 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2781                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2782 {
2783     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2784     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2785 }
2786 
2787 static const GVecGen2s gop_ors = {
2788     .fni8 = tcg_gen_or_i64,
2789     .fniv = tcg_gen_or_vec,
2790     .fno = gen_helper_gvec_ors,
2791     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2792     .vece = MO_64
2793 };
2794 
2795 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2796                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2797 {
2798     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2799     tcg_gen_dup_i64(vece, tmp, c);
2800     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2801     tcg_temp_free_i64(tmp);
2802 }
2803 
2804 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2805                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2806 {
2807     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2808     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2809 }
2810 
2811 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2812 {
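    /* Shift the whole 64-bit value, then mask out the bits that were
       shifted across a byte-lane boundary.  */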
2813     uint64_t mask = dup_const(MO_8, 0xff << c);
2814     tcg_gen_shli_i64(d, a, c);
2815     tcg_gen_andi_i64(d, d, mask);
2816 }
2817 
2818 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2819 {
2820     uint64_t mask = dup_const(MO_16, 0xffff << c);
2821     tcg_gen_shli_i64(d, a, c);
2822     tcg_gen_andi_i64(d, d, mask);
2823 }
2824 
2825 void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2826 {
2827     uint32_t mask = dup_const(MO_8, 0xff << c);
2828     tcg_gen_shli_i32(d, a, c);
2829     tcg_gen_andi_i32(d, d, mask);
2830 }
2831 
2832 void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2833 {
2834     uint32_t mask = dup_const(MO_16, 0xffff << c);
2835     tcg_gen_shli_i32(d, a, c);
2836     tcg_gen_andi_i32(d, d, mask);
2837 }
2838 
2839 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2840                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2841 {
2842     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2843     static const GVecGen2i g[4] = {
2844         { .fni8 = tcg_gen_vec_shl8i_i64,
2845           .fniv = tcg_gen_shli_vec,
2846           .fno = gen_helper_gvec_shl8i,
2847           .opt_opc = vecop_list,
2848           .vece = MO_8 },
2849         { .fni8 = tcg_gen_vec_shl16i_i64,
2850           .fniv = tcg_gen_shli_vec,
2851           .fno = gen_helper_gvec_shl16i,
2852           .opt_opc = vecop_list,
2853           .vece = MO_16 },
2854         { .fni4 = tcg_gen_shli_i32,
2855           .fniv = tcg_gen_shli_vec,
2856           .fno = gen_helper_gvec_shl32i,
2857           .opt_opc = vecop_list,
2858           .vece = MO_32 },
2859         { .fni8 = tcg_gen_shli_i64,
2860           .fniv = tcg_gen_shli_vec,
2861           .fno = gen_helper_gvec_shl64i,
2862           .opt_opc = vecop_list,
2863           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2864           .vece = MO_64 },
2865     };
2866 
2867     tcg_debug_assert(vece <= MO_64);
2868     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2869     if (shift == 0) {
2870         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2871     } else {
2872         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2873     }
2874 }
2875 
2876 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2877 {
2878     uint64_t mask = dup_const(MO_8, 0xff >> c);
2879     tcg_gen_shri_i64(d, a, c);
2880     tcg_gen_andi_i64(d, d, mask);
2881 }
2882 
2883 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2884 {
2885     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2886     tcg_gen_shri_i64(d, a, c);
2887     tcg_gen_andi_i64(d, d, mask);
2888 }
2889 
2890 void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2891 {
2892     uint32_t mask = dup_const(MO_8, 0xff >> c);
2893     tcg_gen_shri_i32(d, a, c);
2894     tcg_gen_andi_i32(d, d, mask);
2895 }
2896 
2897 void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2898 {
2899     uint32_t mask = dup_const(MO_16, 0xffff >> c);
2900     tcg_gen_shri_i32(d, a, c);
2901     tcg_gen_andi_i32(d, d, mask);
2902 }
2903 
2904 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2905                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2906 {
2907     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2908     static const GVecGen2i g[4] = {
2909         { .fni8 = tcg_gen_vec_shr8i_i64,
2910           .fniv = tcg_gen_shri_vec,
2911           .fno = gen_helper_gvec_shr8i,
2912           .opt_opc = vecop_list,
2913           .vece = MO_8 },
2914         { .fni8 = tcg_gen_vec_shr16i_i64,
2915           .fniv = tcg_gen_shri_vec,
2916           .fno = gen_helper_gvec_shr16i,
2917           .opt_opc = vecop_list,
2918           .vece = MO_16 },
2919         { .fni4 = tcg_gen_shri_i32,
2920           .fniv = tcg_gen_shri_vec,
2921           .fno = gen_helper_gvec_shr32i,
2922           .opt_opc = vecop_list,
2923           .vece = MO_32 },
2924         { .fni8 = tcg_gen_shri_i64,
2925           .fniv = tcg_gen_shri_vec,
2926           .fno = gen_helper_gvec_shr64i,
2927           .opt_opc = vecop_list,
2928           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2929           .vece = MO_64 },
2930     };
2931 
2932     tcg_debug_assert(vece <= MO_64);
2933     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2934     if (shift == 0) {
2935         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2936     } else {
2937         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2938     }
2939 }
2940 
2941 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2942 {
2943     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2944     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2945     TCGv_i64 s = tcg_temp_ebb_new_i64();
2946 
2947     tcg_gen_shri_i64(d, a, c);
2948     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2949     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2950     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2951     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2952     tcg_temp_free_i64(s);
2953 }
2954 
2955 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2956 {
2957     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2958     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2959     TCGv_i64 s = tcg_temp_ebb_new_i64();
2960 
2961     tcg_gen_shri_i64(d, a, c);
2962     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2963     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2964     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2965     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2966     tcg_temp_free_i64(s);
2967 }
2968 
2969 void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2970 {
2971     uint32_t s_mask = dup_const(MO_8, 0x80 >> c);
2972     uint32_t c_mask = dup_const(MO_8, 0xff >> c);
2973     TCGv_i32 s = tcg_temp_ebb_new_i32();
2974 
2975     tcg_gen_shri_i32(d, a, c);
2976     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
2977     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
2978     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
2979     tcg_gen_or_i32(d, d, s);         /* include sign extension */
2980     tcg_temp_free_i32(s);
2981 }
2982 
2983 void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2984 {
2985     uint32_t s_mask = dup_const(MO_16, 0x8000 >> c);
2986     uint32_t c_mask = dup_const(MO_16, 0xffff >> c);
2987     TCGv_i32 s = tcg_temp_ebb_new_i32();
2988 
2989     tcg_gen_shri_i32(d, a, c);
2990     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
2991     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
2992     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
2993     tcg_gen_or_i32(d, d, s);         /* include sign extension */
2994     tcg_temp_free_i32(s);
2995 }
2996 
2997 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2998                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2999 {
3000     static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
3001     static const GVecGen2i g[4] = {
3002         { .fni8 = tcg_gen_vec_sar8i_i64,
3003           .fniv = tcg_gen_sari_vec,
3004           .fno = gen_helper_gvec_sar8i,
3005           .opt_opc = vecop_list,
3006           .vece = MO_8 },
3007         { .fni8 = tcg_gen_vec_sar16i_i64,
3008           .fniv = tcg_gen_sari_vec,
3009           .fno = gen_helper_gvec_sar16i,
3010           .opt_opc = vecop_list,
3011           .vece = MO_16 },
3012         { .fni4 = tcg_gen_sari_i32,
3013           .fniv = tcg_gen_sari_vec,
3014           .fno = gen_helper_gvec_sar32i,
3015           .opt_opc = vecop_list,
3016           .vece = MO_32 },
3017         { .fni8 = tcg_gen_sari_i64,
3018           .fniv = tcg_gen_sari_vec,
3019           .fno = gen_helper_gvec_sar64i,
3020           .opt_opc = vecop_list,
3021           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3022           .vece = MO_64 },
3023     };
3024 
3025     tcg_debug_assert(vece <= MO_64);
3026     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3027     if (shift == 0) {
3028         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3029     } else {
3030         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3031     }
3032 }
3033 
3034 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3035 {
3036     uint64_t mask = dup_const(MO_8, 0xff << c);
3037 
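    /* Per-lane rotate as (a << c) | (a >> (8 - c)); the mask selects which
       of the two shifted copies supplies each bit.  A is clobbered.  */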
3038     tcg_gen_shli_i64(d, a, c);
3039     tcg_gen_shri_i64(a, a, 8 - c);
3040     tcg_gen_andi_i64(d, d, mask);
3041     tcg_gen_andi_i64(a, a, ~mask);
3042     tcg_gen_or_i64(d, d, a);
3043 }
3044 
3045 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3046 {
3047     uint64_t mask = dup_const(MO_16, 0xffff << c);
3048 
3049     tcg_gen_shli_i64(d, a, c);
3050     tcg_gen_shri_i64(a, a, 16 - c);
3051     tcg_gen_andi_i64(d, d, mask);
3052     tcg_gen_andi_i64(a, a, ~mask);
3053     tcg_gen_or_i64(d, d, a);
3054 }
3055 
3056 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
3057                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
3058 {
3059     static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
3060     static const GVecGen2i g[4] = {
3061         { .fni8 = tcg_gen_vec_rotl8i_i64,
3062           .fniv = tcg_gen_rotli_vec,
3063           .fno = gen_helper_gvec_rotl8i,
3064           .opt_opc = vecop_list,
3065           .vece = MO_8 },
3066         { .fni8 = tcg_gen_vec_rotl16i_i64,
3067           .fniv = tcg_gen_rotli_vec,
3068           .fno = gen_helper_gvec_rotl16i,
3069           .opt_opc = vecop_list,
3070           .vece = MO_16 },
3071         { .fni4 = tcg_gen_rotli_i32,
3072           .fniv = tcg_gen_rotli_vec,
3073           .fno = gen_helper_gvec_rotl32i,
3074           .opt_opc = vecop_list,
3075           .vece = MO_32 },
3076         { .fni8 = tcg_gen_rotli_i64,
3077           .fniv = tcg_gen_rotli_vec,
3078           .fno = gen_helper_gvec_rotl64i,
3079           .opt_opc = vecop_list,
3080           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3081           .vece = MO_64 },
3082     };
3083 
3084     tcg_debug_assert(vece <= MO_64);
3085     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3086     if (shift == 0) {
3087         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3088     } else {
3089         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3090     }
3091 }
3092 
3093 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
3094                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
3095 {
3096     tcg_debug_assert(vece <= MO_64);
3097     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
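    /* Rotate right by SHIFT equals rotate left by (lane width - SHIFT);
       -shift & ((8 << vece) - 1) computes that, mapping 0 to 0.  */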
3098     tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
3099                        oprsz, maxsz);
3100 }
3101 
3102 /*
3103  * Specialized generation of vector shifts by a non-constant scalar.
3104  */
3105 
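/*
 * Descriptor for expanding a shift by a non-constant scalar: fni4/fni8
 * are integer fallbacks, fniv_s uses a vector shift-by-scalar op from
 * s_list, fniv_v duplicates the shift amount and uses a vector-by-vector
 * op from v_list, and fno[] holds out-of-line helpers per element size.
 */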
3106 typedef struct {
3107     void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
3108     void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
3109     void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
3110     void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
3111     gen_helper_gvec_2 *fno[4];
3112     TCGOpcode s_list[2];
3113     TCGOpcode v_list[2];
3114 } GVecGen2sh;
3115 
3116 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3117                            uint32_t oprsz, uint32_t tysz, TCGType type,
3118                            TCGv_i32 shift,
3119                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
3120 {
3121     TCGv_vec t0 = tcg_temp_new_vec(type);
3122     uint32_t i;
3123 
3124     for (i = 0; i < oprsz; i += tysz) {
3125         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3126         fni(vece, t0, t0, shift);
3127         tcg_gen_st_vec(t0, cpu_env, dofs + i);
3128     }
3129     tcg_temp_free_vec(t0);
3130 }
3131 
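/*
 * Expand a gvec shift by a non-constant i32 scalar, trying in order:
 * a native shift-by-scalar vector opcode, a shift-by-vector opcode with
 * SHIFT broadcast to a vector, integral expansion, and finally an
 * out-of-line helper with SHIFT folded into the descriptor data.
 */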
3132 static void
3133 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
3134                uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
3135 {
3136     TCGType type;
3137     uint32_t some;
3138 
3139     check_size_align(oprsz, maxsz, dofs | aofs);
3140     check_overlap_2(dofs, aofs, maxsz);
3141 
3142     /* If the backend has a scalar expansion, great.  */
3143     type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
3144     if (type) {
3145         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3146         switch (type) {
3147         case TCG_TYPE_V256:
3148             some = QEMU_ALIGN_DOWN(oprsz, 32);
3149             expand_2sh_vec(vece, dofs, aofs, some, 32,
3150                            TCG_TYPE_V256, shift, g->fniv_s);
3151             if (some == oprsz) {
3152                 break;
3153             }
3154             dofs += some;
3155             aofs += some;
3156             oprsz -= some;
3157             maxsz -= some;
3158             /* fallthru */
3159         case TCG_TYPE_V128:
3160             expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
3161                            TCG_TYPE_V128, shift, g->fniv_s);
3162             break;
3163         case TCG_TYPE_V64:
3164             expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
3165                            TCG_TYPE_V64, shift, g->fniv_s);
3166             break;
3167         default:
3168             g_assert_not_reached();
3169         }
3170         tcg_swap_vecop_list(hold_list);
3171         goto clear_tail;
3172     }
3173 
3174     /* If the backend supports variable vector shifts, also cool.  */
3175     type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
3176     if (type) {
3177         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3178         TCGv_vec v_shift = tcg_temp_new_vec(type);
3179 
3180         if (vece == MO_64) {
3181             TCGv_i64 sh64 = tcg_temp_ebb_new_i64();
3182             tcg_gen_extu_i32_i64(sh64, shift);
3183             tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
3184             tcg_temp_free_i64(sh64);
3185         } else {
3186             tcg_gen_dup_i32_vec(vece, v_shift, shift);
3187         }
3188 
3189         switch (type) {
3190         case TCG_TYPE_V256:
3191             some = QEMU_ALIGN_DOWN(oprsz, 32);
3192             expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
3193                           v_shift, false, g->fniv_v);
3194             if (some == oprsz) {
3195                 break;
3196             }
3197             dofs += some;
3198             aofs += some;
3199             oprsz -= some;
3200             maxsz -= some;
3201             /* fallthru */
3202         case TCG_TYPE_V128:
3203             expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
3204                           v_shift, false, g->fniv_v);
3205             break;
3206         case TCG_TYPE_V64:
3207             expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
3208                           v_shift, false, g->fniv_v);
3209             break;
3210         default:
3211             g_assert_not_reached();
3212         }
3213         tcg_temp_free_vec(v_shift);
3214         tcg_swap_vecop_list(hold_list);
3215         goto clear_tail;
3216     }
3217 
3218     /* Otherwise fall back to integral... */
3219     if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3220         expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
3221     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3222         TCGv_i64 sh64 = tcg_temp_ebb_new_i64();
3223         tcg_gen_extu_i32_i64(sh64, shift);
3224         expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
3225         tcg_temp_free_i64(sh64);
3226     } else {
3227         TCGv_ptr a0 = tcg_temp_ebb_new_ptr();
3228         TCGv_ptr a1 = tcg_temp_ebb_new_ptr();
3229         TCGv_i32 desc = tcg_temp_ebb_new_i32();
3230 
3231         tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
3232         tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
3233         tcg_gen_addi_ptr(a0, cpu_env, dofs);
3234         tcg_gen_addi_ptr(a1, cpu_env, aofs);
3235 
3236         g->fno[vece](a0, a1, desc);
3237 
3238         tcg_temp_free_ptr(a0);
3239         tcg_temp_free_ptr(a1);
3240         tcg_temp_free_i32(desc);
3241         return;
3242     }
3243 
3244  clear_tail:
3245     if (oprsz < maxsz) {
3246         expand_clr(dofs + oprsz, maxsz - oprsz);
3247     }
3248 }
3249 
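/* Expand a left shift of each element by a single i32 scalar count. */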
3250 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
3251                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3252 {
3253     static const GVecGen2sh g = {
3254         .fni4 = tcg_gen_shl_i32,
3255         .fni8 = tcg_gen_shl_i64,
3256         .fniv_s = tcg_gen_shls_vec,
3257         .fniv_v = tcg_gen_shlv_vec,
3258         .fno = {
3259             gen_helper_gvec_shl8i,
3260             gen_helper_gvec_shl16i,
3261             gen_helper_gvec_shl32i,
3262             gen_helper_gvec_shl64i,
3263         },
3264         .s_list = { INDEX_op_shls_vec, 0 },
3265         .v_list = { INDEX_op_shlv_vec, 0 },
3266     };
3267 
3268     tcg_debug_assert(vece <= MO_64);
3269     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3270 }
3271 
3272 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3273                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3274 {
3275     static const GVecGen2sh g = {
3276         .fni4 = tcg_gen_shr_i32,
3277         .fni8 = tcg_gen_shr_i64,
3278         .fniv_s = tcg_gen_shrs_vec,
3279         .fniv_v = tcg_gen_shrv_vec,
3280         .fno = {
3281             gen_helper_gvec_shr8i,
3282             gen_helper_gvec_shr16i,
3283             gen_helper_gvec_shr32i,
3284             gen_helper_gvec_shr64i,
3285         },
3286         .s_list = { INDEX_op_shrs_vec, 0 },
3287         .v_list = { INDEX_op_shrv_vec, 0 },
3288     };
3289 
3290     tcg_debug_assert(vece <= MO_64);
3291     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3292 }
3293 
3294 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
3295                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3296 {
3297     static const GVecGen2sh g = {
3298         .fni4 = tcg_gen_sar_i32,
3299         .fni8 = tcg_gen_sar_i64,
3300         .fniv_s = tcg_gen_sars_vec,
3301         .fniv_v = tcg_gen_sarv_vec,
3302         .fno = {
3303             gen_helper_gvec_sar8i,
3304             gen_helper_gvec_sar16i,
3305             gen_helper_gvec_sar32i,
3306             gen_helper_gvec_sar64i,
3307         },
3308         .s_list = { INDEX_op_sars_vec, 0 },
3309         .v_list = { INDEX_op_sarv_vec, 0 },
3310     };
3311 
3312     tcg_debug_assert(vece <= MO_64);
3313     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3314 }
3315 
3316 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
3317                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3318 {
3319     static const GVecGen2sh g = {
3320         .fni4 = tcg_gen_rotl_i32,
3321         .fni8 = tcg_gen_rotl_i64,
3322         .fniv_s = tcg_gen_rotls_vec,
3323         .fniv_v = tcg_gen_rotlv_vec,
3324         .fno = {
3325             gen_helper_gvec_rotl8i,
3326             gen_helper_gvec_rotl16i,
3327             gen_helper_gvec_rotl32i,
3328             gen_helper_gvec_rotl64i,
3329         },
3330         .s_list = { INDEX_op_rotls_vec, 0 },
3331         .v_list = { INDEX_op_rotlv_vec, 0 },
3332     };
3333 
3334     tcg_debug_assert(vece <= MO_64);
3335     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3336 }
3337 
3338 /*
3339  * Expand D = A << (B % element bits)
3340  *
3341  * Unlike scalar shifts, where it is easy for the target front end
3342  * to fold the modulo into the expansion, vector shifts cannot.
3343  * If the target naturally includes the modulo as part of the
3344  * operation, great!  If the target has some other behaviour for
3345  * out-of-range shifts, then it could not use this function anyway,
3346  * and would need to do its own expansion with custom functions.
3347  */
3348 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
3349                                  TCGv_vec a, TCGv_vec b)
3350 {
3351     TCGv_vec t = tcg_temp_new_vec_matching(d);
3352     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3353 
3354     tcg_gen_and_vec(vece, t, b, m);
3355     tcg_gen_shlv_vec(vece, d, a, t);
3356     tcg_temp_free_vec(t);
3357 }
3358 
3359 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3360 {
3361     TCGv_i32 t = tcg_temp_ebb_new_i32();
3362 
3363     tcg_gen_andi_i32(t, b, 31);
3364     tcg_gen_shl_i32(d, a, t);
3365     tcg_temp_free_i32(t);
3366 }
3367 
3368 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3369 {
3370     TCGv_i64 t = tcg_temp_ebb_new_i64();
3371 
3372     tcg_gen_andi_i64(t, b, 63);
3373     tcg_gen_shl_i64(d, a, t);
3374     tcg_temp_free_i64(t);
3375 }
3376 
3377 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3378                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3379 {
3380     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
3381     static const GVecGen3 g[4] = {
3382         { .fniv = tcg_gen_shlv_mod_vec,
3383           .fno = gen_helper_gvec_shl8v,
3384           .opt_opc = vecop_list,
3385           .vece = MO_8 },
3386         { .fniv = tcg_gen_shlv_mod_vec,
3387           .fno = gen_helper_gvec_shl16v,
3388           .opt_opc = vecop_list,
3389           .vece = MO_16 },
3390         { .fni4 = tcg_gen_shl_mod_i32,
3391           .fniv = tcg_gen_shlv_mod_vec,
3392           .fno = gen_helper_gvec_shl32v,
3393           .opt_opc = vecop_list,
3394           .vece = MO_32 },
3395         { .fni8 = tcg_gen_shl_mod_i64,
3396           .fniv = tcg_gen_shlv_mod_vec,
3397           .fno = gen_helper_gvec_shl64v,
3398           .opt_opc = vecop_list,
3399           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3400           .vece = MO_64 },
3401     };
3402 
3403     tcg_debug_assert(vece <= MO_64);
3404     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3405 }
3406 
3407 /*
3408  * Similarly for logical right shifts.
3409  */
3410 
3411 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
3412                                  TCGv_vec a, TCGv_vec b)
3413 {
3414     TCGv_vec t = tcg_temp_new_vec_matching(d);
3415     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3416 
3417     tcg_gen_and_vec(vece, t, b, m);
3418     tcg_gen_shrv_vec(vece, d, a, t);
3419     tcg_temp_free_vec(t);
3420 }
3421 
3422 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3423 {
3424     TCGv_i32 t = tcg_temp_ebb_new_i32();
3425 
3426     tcg_gen_andi_i32(t, b, 31);
3427     tcg_gen_shr_i32(d, a, t);
3428     tcg_temp_free_i32(t);
3429 }
3430 
3431 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3432 {
3433     TCGv_i64 t = tcg_temp_ebb_new_i64();
3434 
3435     tcg_gen_andi_i64(t, b, 63);
3436     tcg_gen_shr_i64(d, a, t);
3437     tcg_temp_free_i64(t);
3438 }
3439 
3440 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3441                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3442 {
3443     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
3444     static const GVecGen3 g[4] = {
3445         { .fniv = tcg_gen_shrv_mod_vec,
3446           .fno = gen_helper_gvec_shr8v,
3447           .opt_opc = vecop_list,
3448           .vece = MO_8 },
3449         { .fniv = tcg_gen_shrv_mod_vec,
3450           .fno = gen_helper_gvec_shr16v,
3451           .opt_opc = vecop_list,
3452           .vece = MO_16 },
3453         { .fni4 = tcg_gen_shr_mod_i32,
3454           .fniv = tcg_gen_shrv_mod_vec,
3455           .fno = gen_helper_gvec_shr32v,
3456           .opt_opc = vecop_list,
3457           .vece = MO_32 },
3458         { .fni8 = tcg_gen_shr_mod_i64,
3459           .fniv = tcg_gen_shrv_mod_vec,
3460           .fno = gen_helper_gvec_shr64v,
3461           .opt_opc = vecop_list,
3462           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3463           .vece = MO_64 },
3464     };
3465 
3466     tcg_debug_assert(vece <= MO_64);
3467     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3468 }
3469 
3470 /*
3471  * Similarly for arithmetic right shifts.
3472  */
3473 
3474 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3475                                  TCGv_vec a, TCGv_vec b)
3476 {
3477     TCGv_vec t = tcg_temp_new_vec_matching(d);
3478     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3479 
3480     tcg_gen_and_vec(vece, t, b, m);
3481     tcg_gen_sarv_vec(vece, d, a, t);
3482     tcg_temp_free_vec(t);
3483 }
3484 
3485 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3486 {
3487     TCGv_i32 t = tcg_temp_ebb_new_i32();
3488 
3489     tcg_gen_andi_i32(t, b, 31);
3490     tcg_gen_sar_i32(d, a, t);
3491     tcg_temp_free_i32(t);
3492 }
3493 
3494 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3495 {
3496     TCGv_i64 t = tcg_temp_ebb_new_i64();
3497 
3498     tcg_gen_andi_i64(t, b, 63);
3499     tcg_gen_sar_i64(d, a, t);
3500     tcg_temp_free_i64(t);
3501 }
3502 
3503 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3504                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3505 {
3506     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3507     static const GVecGen3 g[4] = {
3508         { .fniv = tcg_gen_sarv_mod_vec,
3509           .fno = gen_helper_gvec_sar8v,
3510           .opt_opc = vecop_list,
3511           .vece = MO_8 },
3512         { .fniv = tcg_gen_sarv_mod_vec,
3513           .fno = gen_helper_gvec_sar16v,
3514           .opt_opc = vecop_list,
3515           .vece = MO_16 },
3516         { .fni4 = tcg_gen_sar_mod_i32,
3517           .fniv = tcg_gen_sarv_mod_vec,
3518           .fno = gen_helper_gvec_sar32v,
3519           .opt_opc = vecop_list,
3520           .vece = MO_32 },
3521         { .fni8 = tcg_gen_sar_mod_i64,
3522           .fniv = tcg_gen_sarv_mod_vec,
3523           .fno = gen_helper_gvec_sar64v,
3524           .opt_opc = vecop_list,
3525           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3526           .vece = MO_64 },
3527     };
3528 
3529     tcg_debug_assert(vece <= MO_64);
3530     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3531 }
3532 
3533 /*
3534  * Similarly for rotates.
3535  */
3536 
3537 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
3538                                   TCGv_vec a, TCGv_vec b)
3539 {
3540     TCGv_vec t = tcg_temp_new_vec_matching(d);
3541     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3542 
3543     tcg_gen_and_vec(vece, t, b, m);
3544     tcg_gen_rotlv_vec(vece, d, a, t);
3545     tcg_temp_free_vec(t);
3546 }
3547 
3548 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3549 {
3550     TCGv_i32 t = tcg_temp_ebb_new_i32();
3551 
3552     tcg_gen_andi_i32(t, b, 31);
3553     tcg_gen_rotl_i32(d, a, t);
3554     tcg_temp_free_i32(t);
3555 }
3556 
3557 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3558 {
3559     TCGv_i64 t = tcg_temp_ebb_new_i64();
3560 
3561     tcg_gen_andi_i64(t, b, 63);
3562     tcg_gen_rotl_i64(d, a, t);
3563     tcg_temp_free_i64(t);
3564 }
3565 
3566 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3567                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3568 {
3569     static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
3570     static const GVecGen3 g[4] = {
3571         { .fniv = tcg_gen_rotlv_mod_vec,
3572           .fno = gen_helper_gvec_rotl8v,
3573           .opt_opc = vecop_list,
3574           .vece = MO_8 },
3575         { .fniv = tcg_gen_rotlv_mod_vec,
3576           .fno = gen_helper_gvec_rotl16v,
3577           .opt_opc = vecop_list,
3578           .vece = MO_16 },
3579         { .fni4 = tcg_gen_rotl_mod_i32,
3580           .fniv = tcg_gen_rotlv_mod_vec,
3581           .fno = gen_helper_gvec_rotl32v,
3582           .opt_opc = vecop_list,
3583           .vece = MO_32 },
3584         { .fni8 = tcg_gen_rotl_mod_i64,
3585           .fniv = tcg_gen_rotlv_mod_vec,
3586           .fno = gen_helper_gvec_rotl64v,
3587           .opt_opc = vecop_list,
3588           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3589           .vece = MO_64 },
3590     };
3591 
3592     tcg_debug_assert(vece <= MO_64);
3593     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3594 }
3595 
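/* Similarly for right rotates. */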
3596 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
3597                                   TCGv_vec a, TCGv_vec b)
3598 {
3599     TCGv_vec t = tcg_temp_new_vec_matching(d);
3600     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3601 
3602     tcg_gen_and_vec(vece, t, b, m);
3603     tcg_gen_rotrv_vec(vece, d, a, t);
3604     tcg_temp_free_vec(t);
3605 }
3606 
3607 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3608 {
3609     TCGv_i32 t = tcg_temp_ebb_new_i32();
3610 
3611     tcg_gen_andi_i32(t, b, 31);
3612     tcg_gen_rotr_i32(d, a, t);
3613     tcg_temp_free_i32(t);
3614 }
3615 
3616 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3617 {
3618     TCGv_i64 t = tcg_temp_ebb_new_i64();
3619 
3620     tcg_gen_andi_i64(t, b, 63);
3621     tcg_gen_rotr_i64(d, a, t);
3622     tcg_temp_free_i64(t);
3623 }
3624 
3625 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3626                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3627 {
3628     static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
3629     static const GVecGen3 g[4] = {
3630         { .fniv = tcg_gen_rotrv_mod_vec,
3631           .fno = gen_helper_gvec_rotr8v,
3632           .opt_opc = vecop_list,
3633           .vece = MO_8 },
3634         { .fniv = tcg_gen_rotrv_mod_vec,
3635           .fno = gen_helper_gvec_rotr16v,
3636           .opt_opc = vecop_list,
3637           .vece = MO_16 },
3638         { .fni4 = tcg_gen_rotr_mod_i32,
3639           .fniv = tcg_gen_rotrv_mod_vec,
3640           .fno = gen_helper_gvec_rotr32v,
3641           .opt_opc = vecop_list,
3642           .vece = MO_32 },
3643         { .fni8 = tcg_gen_rotr_mod_i64,
3644           .fniv = tcg_gen_rotrv_mod_vec,
3645           .fno = gen_helper_gvec_rotr64v,
3646           .opt_opc = vecop_list,
3647           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3648           .vece = MO_64 },
3649     };
3650 
3651     tcg_debug_assert(vece <= MO_64);
3652     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3653 }
3654 
3655 /* Expand OPRSZ bytes worth of comparison operations using i32 elements.  */
3656 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3657                            uint32_t oprsz, TCGCond cond)
3658 {
3659     TCGv_i32 t0 = tcg_temp_ebb_new_i32();
3660     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
3661     uint32_t i;
3662 
3663     for (i = 0; i < oprsz; i += 4) {
3664         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
3665         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
3666         tcg_gen_setcond_i32(cond, t0, t0, t1);
3667         tcg_gen_neg_i32(t0, t0);
3668         tcg_gen_st_i32(t0, cpu_env, dofs + i);
3669     }
3670     tcg_temp_free_i32(t1);
3671     tcg_temp_free_i32(t0);
3672 }
3673 
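/* Likewise with i64 elements. */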
3674 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3675                            uint32_t oprsz, TCGCond cond)
3676 {
3677     TCGv_i64 t0 = tcg_temp_ebb_new_i64();
3678     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
3679     uint32_t i;
3680 
3681     for (i = 0; i < oprsz; i += 8) {
3682         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
3683         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
3684         tcg_gen_setcond_i64(cond, t0, t0, t1);
3685         tcg_gen_neg_i64(t0, t0);
3686         tcg_gen_st_i64(t0, cpu_env, dofs + i);
3687     }
3688     tcg_temp_free_i64(t1);
3689     tcg_temp_free_i64(t0);
3690 }
3691 
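/* Likewise with vectors of TYSZ bytes, of the given TYPE. */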
3692 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3693                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3694                            TCGType type, TCGCond cond)
3695 {
3696     TCGv_vec t0 = tcg_temp_new_vec(type);
3697     TCGv_vec t1 = tcg_temp_new_vec(type);
3698     uint32_t i;
3699 
3700     for (i = 0; i < oprsz; i += tysz) {
3701         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3702         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
3703         tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
3704         tcg_gen_st_vec(t0, cpu_env, dofs + i);
3705     }
3706     tcg_temp_free_vec(t1);
3707     tcg_temp_free_vec(t0);
3708 }
3709 
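/*
 * Expand an element-wise comparison of two vectors, producing all-ones
 * for true and zero for false in each element of D.
 */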
3710 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3711                       uint32_t aofs, uint32_t bofs,
3712                       uint32_t oprsz, uint32_t maxsz)
3713 {
3714     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3715     static gen_helper_gvec_3 * const eq_fn[4] = {
3716         gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3717         gen_helper_gvec_eq32, gen_helper_gvec_eq64
3718     };
3719     static gen_helper_gvec_3 * const ne_fn[4] = {
3720         gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3721         gen_helper_gvec_ne32, gen_helper_gvec_ne64
3722     };
3723     static gen_helper_gvec_3 * const lt_fn[4] = {
3724         gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3725         gen_helper_gvec_lt32, gen_helper_gvec_lt64
3726     };
3727     static gen_helper_gvec_3 * const le_fn[4] = {
3728         gen_helper_gvec_le8, gen_helper_gvec_le16,
3729         gen_helper_gvec_le32, gen_helper_gvec_le64
3730     };
3731     static gen_helper_gvec_3 * const ltu_fn[4] = {
3732         gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3733         gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3734     };
3735     static gen_helper_gvec_3 * const leu_fn[4] = {
3736         gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3737         gen_helper_gvec_leu32, gen_helper_gvec_leu64
3738     };
3739     static gen_helper_gvec_3 * const * const fns[16] = {
3740         [TCG_COND_EQ] = eq_fn,
3741         [TCG_COND_NE] = ne_fn,
3742         [TCG_COND_LT] = lt_fn,
3743         [TCG_COND_LE] = le_fn,
3744         [TCG_COND_LTU] = ltu_fn,
3745         [TCG_COND_LEU] = leu_fn,
3746     };
3747 
3748     const TCGOpcode *hold_list;
3749     TCGType type;
3750     uint32_t some;
3751 
3752     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3753     check_overlap_3(dofs, aofs, bofs, maxsz);
3754 
3755     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3756         do_dup(MO_8, dofs, oprsz, maxsz,
3757                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3758         return;
3759     }
3760 
3761     /*
3762      * Implement inline with a vector type, if possible.
3763      * Prefer integer when 64-bit host and 64-bit comparison.
3764      */
3765     hold_list = tcg_swap_vecop_list(cmp_list);
3766     type = choose_vector_type(cmp_list, vece, oprsz,
3767                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3768     switch (type) {
3769     case TCG_TYPE_V256:
3770         /* Recall that ARM SVE allows vector sizes that are not a
3771          * power of 2, but always a multiple of 16.  The intent is
3772          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3773          */
3774         some = QEMU_ALIGN_DOWN(oprsz, 32);
3775         expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3776         if (some == oprsz) {
3777             break;
3778         }
3779         dofs += some;
3780         aofs += some;
3781         bofs += some;
3782         oprsz -= some;
3783         maxsz -= some;
3784         /* fallthru */
3785     case TCG_TYPE_V128:
3786         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3787         break;
3788     case TCG_TYPE_V64:
3789         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3790         break;
3791 
3792     case 0:
3793         if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3794             expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3795         } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3796             expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3797         } else {
3798             gen_helper_gvec_3 * const *fn = fns[cond];
3799 
3800             if (fn == NULL) {
3801                 uint32_t tmp;
3802                 tmp = aofs, aofs = bofs, bofs = tmp;
3803                 cond = tcg_swap_cond(cond);
3804                 fn = fns[cond];
3805                 assert(fn != NULL);
3806             }
3807             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3808             oprsz = maxsz;
3809         }
3810         break;
3811 
3812     default:
3813         g_assert_not_reached();
3814     }
3815     tcg_swap_vecop_list(hold_list);
3816 
3817     if (oprsz < maxsz) {
3818         expand_clr(dofs + oprsz, maxsz - oprsz);
3819     }
3820 }
3821 
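/* Bit select: D = (B & A) | (C & ~A). */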
3822 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3823 {
3824     TCGv_i64 t = tcg_temp_ebb_new_i64();
3825 
3826     tcg_gen_and_i64(t, b, a);
3827     tcg_gen_andc_i64(d, c, a);
3828     tcg_gen_or_i64(d, d, t);
3829     tcg_temp_free_i64(t);
3830 }
3831 
3832 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
3833                          uint32_t bofs, uint32_t cofs,
3834                          uint32_t oprsz, uint32_t maxsz)
3835 {
3836     static const GVecGen4 g = {
3837         .fni8 = tcg_gen_bitsel_i64,
3838         .fniv = tcg_gen_bitsel_vec,
3839         .fno = gen_helper_gvec_bitsel,
3840     };
3841 
3842     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
3843 }
3844