xref: /openbmc/qemu/tcg/tcg-op-gvec.c (revision 72baef13b9dce71f20ae840d9951e559e14abf6d)
1 /*
2  * Generic vector operation expansion
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "tcg/tcg.h"
22 #include "tcg/tcg-temp-internal.h"
23 #include "tcg/tcg-op-common.h"
24 #include "tcg/tcg-op-gvec-common.h"
25 #include "tcg/tcg-gvec-desc.h"
26 
/* Largest number of inline operations that check_size_impl will accept
   before it reports an expansion as too large.  */
#define MAX_UNROLL  4

/*
 * Empty vector-opcode list.  With CONFIG_DEBUG_TCG this is a real
 * one-element array holding only 0; otherwise it is simply NULL.
 */
#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif
34 
35 
36 /* Verify vector size and alignment rules.  OFS should be the OR of all
37    of the operand offsets so that we can check them all at once.  */
38 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
39 {
40     uint32_t max_align;
41 
42     switch (oprsz) {
43     case 8:
44     case 16:
45     case 32:
46         tcg_debug_assert(oprsz <= maxsz);
47         break;
48     default:
49         tcg_debug_assert(oprsz == maxsz);
50         break;
51     }
52     tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
53 
54     max_align = maxsz >= 16 ? 15 : 7;
55     tcg_debug_assert((maxsz & max_align) == 0);
56     tcg_debug_assert((ofs & max_align) == 0);
57 }
58 
/*
 * Verify vector overlap rules for two operands: the S-byte regions at
 * offsets D and A must either coincide exactly or not intersect at all.
 */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    bool identical = (d == a);
    bool d_below_a = (d + s <= a);
    bool a_below_d = (a + s <= d);

    tcg_debug_assert(identical || d_below_a || a_below_d);
}
64 
/*
 * Verify vector overlap rules for three operands by applying
 * check_overlap_2 to every pair of the S-byte regions at D, A and B.
 */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    const uint32_t ofs[3] = { d, a, b };
    int i, j;

    /* Visits (d,a), (d,b), (a,b) in that order.  */
    for (i = 0; i < 3; i++) {
        for (j = i + 1; j < 3; j++) {
            check_overlap_2(ofs[i], ofs[j], s);
        }
    }
}
72 
/*
 * Verify vector overlap rules for four operands by applying
 * check_overlap_2 to every pair of the S-byte regions at D, A, B and C.
 */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    const uint32_t ofs[4] = { d, a, b, c };
    int i, j;

    /* Visits (d,a), (d,b), (d,c), (a,b), (a,c), (b,c) in that order.  */
    for (i = 0; i < 4; i++) {
        for (j = i + 1; j < 4; j++) {
            check_overlap_2(ofs[i], ofs[j], s);
        }
    }
}
84 
85 /* Create a descriptor from components.  */
86 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
87 {
88     uint32_t desc = 0;
89 
90     check_size_align(oprsz, maxsz, 0);
91     tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS));
92 
93     oprsz = (oprsz / 8) - 1;
94     maxsz = (maxsz / 8) - 1;
95 
96     /*
97      * We have just asserted in check_size_align that either
98      * oprsz is {8,16,32} or matches maxsz.  Encode the final
99      * case with '2', as that would otherwise map to 24.
100      */
101     if (oprsz == maxsz) {
102         oprsz = 2;
103     }
104 
105     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
106     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
107     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
108 
109     return desc;
110 }
111 
112 /* Generate a call to a gvec-style helper with two vector operands.  */
113 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
114                         uint32_t oprsz, uint32_t maxsz, int32_t data,
115                         gen_helper_gvec_2 *fn)
116 {
117     TCGv_ptr a0, a1;
118     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
119 
120     a0 = tcg_temp_ebb_new_ptr();
121     a1 = tcg_temp_ebb_new_ptr();
122 
123     tcg_gen_addi_ptr(a0, tcg_env, dofs);
124     tcg_gen_addi_ptr(a1, tcg_env, aofs);
125 
126     fn(a0, a1, desc);
127 
128     tcg_temp_free_ptr(a0);
129     tcg_temp_free_ptr(a1);
130 }
131 
132 /* Generate a call to a gvec-style helper with two vector operands
133    and one scalar operand.  */
134 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
135                          uint32_t oprsz, uint32_t maxsz, int32_t data,
136                          gen_helper_gvec_2i *fn)
137 {
138     TCGv_ptr a0, a1;
139     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
140 
141     a0 = tcg_temp_ebb_new_ptr();
142     a1 = tcg_temp_ebb_new_ptr();
143 
144     tcg_gen_addi_ptr(a0, tcg_env, dofs);
145     tcg_gen_addi_ptr(a1, tcg_env, aofs);
146 
147     fn(a0, a1, c, desc);
148 
149     tcg_temp_free_ptr(a0);
150     tcg_temp_free_ptr(a1);
151 }
152 
153 /* Generate a call to a gvec-style helper with three vector operands.  */
154 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
155                         uint32_t oprsz, uint32_t maxsz, int32_t data,
156                         gen_helper_gvec_3 *fn)
157 {
158     TCGv_ptr a0, a1, a2;
159     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
160 
161     a0 = tcg_temp_ebb_new_ptr();
162     a1 = tcg_temp_ebb_new_ptr();
163     a2 = tcg_temp_ebb_new_ptr();
164 
165     tcg_gen_addi_ptr(a0, tcg_env, dofs);
166     tcg_gen_addi_ptr(a1, tcg_env, aofs);
167     tcg_gen_addi_ptr(a2, tcg_env, bofs);
168 
169     fn(a0, a1, a2, desc);
170 
171     tcg_temp_free_ptr(a0);
172     tcg_temp_free_ptr(a1);
173     tcg_temp_free_ptr(a2);
174 }
175 
176 /* Generate a call to a gvec-style helper with four vector operands.  */
177 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
178                         uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
179                         int32_t data, gen_helper_gvec_4 *fn)
180 {
181     TCGv_ptr a0, a1, a2, a3;
182     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
183 
184     a0 = tcg_temp_ebb_new_ptr();
185     a1 = tcg_temp_ebb_new_ptr();
186     a2 = tcg_temp_ebb_new_ptr();
187     a3 = tcg_temp_ebb_new_ptr();
188 
189     tcg_gen_addi_ptr(a0, tcg_env, dofs);
190     tcg_gen_addi_ptr(a1, tcg_env, aofs);
191     tcg_gen_addi_ptr(a2, tcg_env, bofs);
192     tcg_gen_addi_ptr(a3, tcg_env, cofs);
193 
194     fn(a0, a1, a2, a3, desc);
195 
196     tcg_temp_free_ptr(a0);
197     tcg_temp_free_ptr(a1);
198     tcg_temp_free_ptr(a2);
199     tcg_temp_free_ptr(a3);
200 }
201 
202 /* Generate a call to a gvec-style helper with five vector operands.  */
203 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
204                         uint32_t cofs, uint32_t xofs, uint32_t oprsz,
205                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
206 {
207     TCGv_ptr a0, a1, a2, a3, a4;
208     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
209 
210     a0 = tcg_temp_ebb_new_ptr();
211     a1 = tcg_temp_ebb_new_ptr();
212     a2 = tcg_temp_ebb_new_ptr();
213     a3 = tcg_temp_ebb_new_ptr();
214     a4 = tcg_temp_ebb_new_ptr();
215 
216     tcg_gen_addi_ptr(a0, tcg_env, dofs);
217     tcg_gen_addi_ptr(a1, tcg_env, aofs);
218     tcg_gen_addi_ptr(a2, tcg_env, bofs);
219     tcg_gen_addi_ptr(a3, tcg_env, cofs);
220     tcg_gen_addi_ptr(a4, tcg_env, xofs);
221 
222     fn(a0, a1, a2, a3, a4, desc);
223 
224     tcg_temp_free_ptr(a0);
225     tcg_temp_free_ptr(a1);
226     tcg_temp_free_ptr(a2);
227     tcg_temp_free_ptr(a3);
228     tcg_temp_free_ptr(a4);
229 }
230 
231 /* Generate a call to a gvec-style helper with three vector operands
232    and an extra pointer operand.  */
233 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
234                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
235                         int32_t data, gen_helper_gvec_2_ptr *fn)
236 {
237     TCGv_ptr a0, a1;
238     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
239 
240     a0 = tcg_temp_ebb_new_ptr();
241     a1 = tcg_temp_ebb_new_ptr();
242 
243     tcg_gen_addi_ptr(a0, tcg_env, dofs);
244     tcg_gen_addi_ptr(a1, tcg_env, aofs);
245 
246     fn(a0, a1, ptr, desc);
247 
248     tcg_temp_free_ptr(a0);
249     tcg_temp_free_ptr(a1);
250 }
251 
252 /* Generate a call to a gvec-style helper with three vector operands
253    and an extra pointer operand.  */
254 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
255                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
256                         int32_t data, gen_helper_gvec_3_ptr *fn)
257 {
258     TCGv_ptr a0, a1, a2;
259     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
260 
261     a0 = tcg_temp_ebb_new_ptr();
262     a1 = tcg_temp_ebb_new_ptr();
263     a2 = tcg_temp_ebb_new_ptr();
264 
265     tcg_gen_addi_ptr(a0, tcg_env, dofs);
266     tcg_gen_addi_ptr(a1, tcg_env, aofs);
267     tcg_gen_addi_ptr(a2, tcg_env, bofs);
268 
269     fn(a0, a1, a2, ptr, desc);
270 
271     tcg_temp_free_ptr(a0);
272     tcg_temp_free_ptr(a1);
273     tcg_temp_free_ptr(a2);
274 }
275 
276 /* Generate a call to a gvec-style helper with four vector operands
277    and an extra pointer operand.  */
278 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
279                         uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
280                         uint32_t maxsz, int32_t data,
281                         gen_helper_gvec_4_ptr *fn)
282 {
283     TCGv_ptr a0, a1, a2, a3;
284     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
285 
286     a0 = tcg_temp_ebb_new_ptr();
287     a1 = tcg_temp_ebb_new_ptr();
288     a2 = tcg_temp_ebb_new_ptr();
289     a3 = tcg_temp_ebb_new_ptr();
290 
291     tcg_gen_addi_ptr(a0, tcg_env, dofs);
292     tcg_gen_addi_ptr(a1, tcg_env, aofs);
293     tcg_gen_addi_ptr(a2, tcg_env, bofs);
294     tcg_gen_addi_ptr(a3, tcg_env, cofs);
295 
296     fn(a0, a1, a2, a3, ptr, desc);
297 
298     tcg_temp_free_ptr(a0);
299     tcg_temp_free_ptr(a1);
300     tcg_temp_free_ptr(a2);
301     tcg_temp_free_ptr(a3);
302 }
303 
304 /* Generate a call to a gvec-style helper with five vector operands
305    and an extra pointer operand.  */
306 void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
307                         uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
308                         uint32_t oprsz, uint32_t maxsz, int32_t data,
309                         gen_helper_gvec_5_ptr *fn)
310 {
311     TCGv_ptr a0, a1, a2, a3, a4;
312     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
313 
314     a0 = tcg_temp_ebb_new_ptr();
315     a1 = tcg_temp_ebb_new_ptr();
316     a2 = tcg_temp_ebb_new_ptr();
317     a3 = tcg_temp_ebb_new_ptr();
318     a4 = tcg_temp_ebb_new_ptr();
319 
320     tcg_gen_addi_ptr(a0, tcg_env, dofs);
321     tcg_gen_addi_ptr(a1, tcg_env, aofs);
322     tcg_gen_addi_ptr(a2, tcg_env, bofs);
323     tcg_gen_addi_ptr(a3, tcg_env, cofs);
324     tcg_gen_addi_ptr(a4, tcg_env, eofs);
325 
326     fn(a0, a1, a2, a3, a4, ptr, desc);
327 
328     tcg_temp_free_ptr(a0);
329     tcg_temp_free_ptr(a1);
330     tcg_temp_free_ptr(a2);
331     tcg_temp_free_ptr(a3);
332     tcg_temp_free_ptr(a4);
333 }
334 
335 /* Return true if we want to implement something of OPRSZ bytes
336    in units of LNSZ.  This limits the expansion of inline code.  */
337 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
338 {
339     uint32_t q, r;
340 
341     if (oprsz < lnsz) {
342         return false;
343     }
344 
345     q = oprsz / lnsz;
346     r = oprsz % lnsz;
347     tcg_debug_assert((r & 7) == 0);
348 
349     if (lnsz < 16) {
350         /* For sizes below 16, accept no remainder. */
351         if (r != 0) {
352             return false;
353         }
354     } else {
355         /*
356          * Recall that ARM SVE allows vector sizes that are not a
357          * power of 2, but always a multiple of 16.  The intent is
358          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
359          * In addition, expand_clr needs to handle a multiple of 8.
360          * Thus we can handle the tail with one more operation per
361          * diminishing power of 2.
362          */
363         q += ctpop32(r);
364     }
365 
366     return q <= MAX_UNROLL;
367 }
368 
/* Forward declaration: expand_clr is defined after do_dup, but
   do_dup_store and do_dup both call it before that point.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz);
370 
371 /* Duplicate C as per VECE.  */
372 uint64_t (dup_const)(unsigned vece, uint64_t c)
373 {
374     switch (vece) {
375     case MO_8:
376         return 0x0101010101010101ull * (uint8_t)c;
377     case MO_16:
378         return 0x0001000100010001ull * (uint16_t)c;
379     case MO_32:
380         return 0x0000000100000001ull * (uint32_t)c;
381     case MO_64:
382         return c;
383     default:
384         g_assert_not_reached();
385     }
386 }
387 
388 /* Duplicate IN into OUT as per VECE.  */
389 void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
390 {
391     switch (vece) {
392     case MO_8:
393         tcg_gen_ext8u_i32(out, in);
394         tcg_gen_muli_i32(out, out, 0x01010101);
395         break;
396     case MO_16:
397         tcg_gen_deposit_i32(out, in, in, 16, 16);
398         break;
399     case MO_32:
400         tcg_gen_mov_i32(out, in);
401         break;
402     default:
403         g_assert_not_reached();
404     }
405 }
406 
407 void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
408 {
409     switch (vece) {
410     case MO_8:
411         tcg_gen_ext8u_i64(out, in);
412         tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
413         break;
414     case MO_16:
415         tcg_gen_ext16u_i64(out, in);
416         tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
417         break;
418     case MO_32:
419         tcg_gen_deposit_i64(out, in, in, 32, 32);
420         break;
421     case MO_64:
422         tcg_gen_mov_i64(out, in);
423         break;
424     default:
425         g_assert_not_reached();
426     }
427 }
428 
429 /* Select a supported vector type for implementing an operation on SIZE
430  * bytes.  If OP is 0, assume that the real operation to be performed is
431  * required by all backends.  Otherwise, make sure than OP can be performed
432  * on elements of size VECE in the selected type.  Do not select V64 if
433  * PREFER_I64 is true.  Return 0 if no vector type is selected.
434  */
435 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
436                                   uint32_t size, bool prefer_i64)
437 {
438     /*
439      * Recall that ARM SVE allows vector sizes that are not a
440      * power of 2, but always a multiple of 16.  The intent is
441      * that e.g. size == 80 would be expanded with 2x32 + 1x16.
442      * It is hard to imagine a case in which v256 is supported
443      * but v128 is not, but check anyway.
444      * In addition, expand_clr needs to handle a multiple of 8.
445      */
446     if (TCG_TARGET_HAS_v256 &&
447         check_size_impl(size, 32) &&
448         tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
449         (!(size & 16) ||
450          (TCG_TARGET_HAS_v128 &&
451           tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
452         (!(size & 8) ||
453          (TCG_TARGET_HAS_v64 &&
454           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
455         return TCG_TYPE_V256;
456     }
457     if (TCG_TARGET_HAS_v128 &&
458         check_size_impl(size, 16) &&
459         tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
460         (!(size & 8) ||
461          (TCG_TARGET_HAS_v64 &&
462           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
463         return TCG_TYPE_V128;
464     }
465     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
466         && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
467         return TCG_TYPE_V64;
468     }
469     return 0;
470 }
471 
472 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
473                          uint32_t maxsz, TCGv_vec t_vec)
474 {
475     uint32_t i = 0;
476 
477     tcg_debug_assert(oprsz >= 8);
478 
479     /*
480      * This may be expand_clr for the tail of an operation, e.g.
481      * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
482      * are misaligned wrt the maximum vector size, so do that first.
483      */
484     if (dofs & 8) {
485         tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V64);
486         i += 8;
487     }
488 
489     switch (type) {
490     case TCG_TYPE_V256:
491         /*
492          * Recall that ARM SVE allows vector sizes that are not a
493          * power of 2, but always a multiple of 16.  The intent is
494          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
495          */
496         for (; i + 32 <= oprsz; i += 32) {
497             tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V256);
498         }
499         /* fallthru */
500     case TCG_TYPE_V128:
501         for (; i + 16 <= oprsz; i += 16) {
502             tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V128);
503         }
504         break;
505     case TCG_TYPE_V64:
506         for (; i < oprsz; i += 8) {
507             tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V64);
508         }
509         break;
510     default:
511         g_assert_not_reached();
512     }
513 
514     if (oprsz < maxsz) {
515         expand_clr(dofs + oprsz, maxsz - oprsz);
516     }
517 }
518 
/*
 * Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C,
 * each replicated element being of size VECE.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 *
 * Three strategies are tried in order: inline stores of a host vector
 * register, inline stores of a 32-bit or 64-bit integer, and finally an
 * out-of-line helper call.  The inline paths clear the bytes between
 * OPRSZ and MAXSZ with expand_clr; the out-of-line dup helpers are
 * handed both sizes through simd_desc.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    /* A 32-bit input cannot supply a 64-bit element.  */
    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /*
     * Constant input: replicate it to 64 bits up front.
     * If we're storing 0, expand oprsz to maxsz, so that no separate
     * clear of the tail is needed.  If every byte of the constant is
     * the same, treat it as a byte duplication.
     */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
            vece = MO_8;
        } else if (in_c == dup_const(MO_8, in_c)) {
            vece = MO_8;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(NULL, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            tcg_gen_dupi_vec(vece, t_vec, in_c);
        }
        /* do_dup_store also clears the bytes from oprsz up to maxsz.  */
        do_dup_store(type, dofs, oprsz, maxsz, t_vec);
        return;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        /* Exactly one of these is set below, selecting the store width.  */
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_ebb_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                tcg_gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_ebb_new_i32();
                tcg_gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_ebb_new_i64();
            tcg_gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_constant_i64(in_c);
            } else {
                t_32 = tcg_constant_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            /* One 4-byte store per step across the operation size.  */
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, tcg_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            /* One 8-byte store per step across the operation size.  */
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, tcg_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_ebb_new_ptr();
    tcg_gen_addi_ptr(t_ptr, tcg_env, dofs);

    /*
     * This may be expand_clr for the tail of an operation, e.g.
     * oprsz == 8 && maxsz == 64.  The size of the clear is misaligned
     * wrt simd_desc and will assert.  Simply pass all replicated byte
     * stores through to memset.
     */
    if (oprsz == maxsz && vece == MO_8) {
        TCGv_ptr t_size = tcg_constant_ptr(oprsz);
        TCGv_i32 t_val;

        if (in_32) {
            t_val = in_32;
        } else if (in_64) {
            /* The memset helper takes a 32-bit value: use the low half.  */
            t_val = tcg_temp_ebb_new_i32();
            tcg_gen_extrl_i64_i32(t_val, in_64);
        } else {
            t_val = tcg_constant_i32(in_c);
        }
        gen_helper_memset(t_ptr, t_ptr, t_val, t_size);

        /* Only the in_64 case allocated a temporary for the value.  */
        if (in_64) {
            tcg_temp_free_i32(t_val);
        }
        tcg_temp_free_ptr(t_ptr);
        return;
    }

    t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_constant_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
        }
    } else {
        /* The narrower dup helpers take a 32-bit value; index by vece.  */
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else if (in_64) {
            t_32 = tcg_temp_ebb_new_i32();
            tcg_gen_extrl_i64_i32(t_32, in_64);
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        } else {
            /* Reduce the replicated constant back to a single element.  */
            if (vece == MO_8) {
                in_c &= 0xff;
            } else if (vece == MO_16) {
                in_c &= 0xffff;
            }
            t_32 = tcg_constant_i32(in_c);
            fns[vece](t_ptr, t_desc, t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    return;

    /* Reached only from the inline integer stores above.  */
 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
695 
696 /* Likewise, but with zero.  */
697 static void expand_clr(uint32_t dofs, uint32_t maxsz)
698 {
699     do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
700 }
701 
702 /* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
703 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
704                          bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
705 {
706     TCGv_i32 t0 = tcg_temp_new_i32();
707     TCGv_i32 t1 = tcg_temp_new_i32();
708     uint32_t i;
709 
710     for (i = 0; i < oprsz; i += 4) {
711         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
712         if (load_dest) {
713             tcg_gen_ld_i32(t1, tcg_env, dofs + i);
714         }
715         fni(t1, t0);
716         tcg_gen_st_i32(t1, tcg_env, dofs + i);
717     }
718     tcg_temp_free_i32(t0);
719     tcg_temp_free_i32(t1);
720 }
721 
722 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
723                           int32_t c, bool load_dest,
724                           void (*fni)(TCGv_i32, TCGv_i32, int32_t))
725 {
726     TCGv_i32 t0 = tcg_temp_new_i32();
727     TCGv_i32 t1 = tcg_temp_new_i32();
728     uint32_t i;
729 
730     for (i = 0; i < oprsz; i += 4) {
731         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
732         if (load_dest) {
733             tcg_gen_ld_i32(t1, tcg_env, dofs + i);
734         }
735         fni(t1, t0, c);
736         tcg_gen_st_i32(t1, tcg_env, dofs + i);
737     }
738     tcg_temp_free_i32(t0);
739     tcg_temp_free_i32(t1);
740 }
741 
742 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
743                           TCGv_i32 c, bool scalar_first,
744                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
745 {
746     TCGv_i32 t0 = tcg_temp_new_i32();
747     TCGv_i32 t1 = tcg_temp_new_i32();
748     uint32_t i;
749 
750     for (i = 0; i < oprsz; i += 4) {
751         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
752         if (scalar_first) {
753             fni(t1, c, t0);
754         } else {
755             fni(t1, t0, c);
756         }
757         tcg_gen_st_i32(t1, tcg_env, dofs + i);
758     }
759     tcg_temp_free_i32(t0);
760     tcg_temp_free_i32(t1);
761 }
762 
763 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
764 static void expand_3_i32(uint32_t dofs, uint32_t aofs,
765                          uint32_t bofs, uint32_t oprsz, bool load_dest,
766                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
767 {
768     TCGv_i32 t0 = tcg_temp_new_i32();
769     TCGv_i32 t1 = tcg_temp_new_i32();
770     TCGv_i32 t2 = tcg_temp_new_i32();
771     uint32_t i;
772 
773     for (i = 0; i < oprsz; i += 4) {
774         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
775         tcg_gen_ld_i32(t1, tcg_env, bofs + i);
776         if (load_dest) {
777             tcg_gen_ld_i32(t2, tcg_env, dofs + i);
778         }
779         fni(t2, t0, t1);
780         tcg_gen_st_i32(t2, tcg_env, dofs + i);
781     }
782     tcg_temp_free_i32(t2);
783     tcg_temp_free_i32(t1);
784     tcg_temp_free_i32(t0);
785 }
786 
787 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
788                           uint32_t oprsz, int32_t c,
789                           bool load_dest, bool write_aofs,
790                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
791 {
792     TCGv_i32 t0 = tcg_temp_new_i32();
793     TCGv_i32 t1 = tcg_temp_new_i32();
794     TCGv_i32 t2 = tcg_temp_new_i32();
795     uint32_t i;
796 
797     for (i = 0; i < oprsz; i += 4) {
798         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
799         tcg_gen_ld_i32(t1, tcg_env, bofs + i);
800         if (load_dest) {
801             tcg_gen_ld_i32(t2, tcg_env, dofs + i);
802         }
803         fni(t2, t0, t1, c);
804         tcg_gen_st_i32(t2, tcg_env, dofs + i);
805         if (write_aofs) {
806             tcg_gen_st_i32(t0, tcg_env, aofs + i);
807         }
808     }
809     tcg_temp_free_i32(t0);
810     tcg_temp_free_i32(t1);
811     tcg_temp_free_i32(t2);
812 }
813 
814 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
815 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
816                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
817                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
818 {
819     TCGv_i32 t0 = tcg_temp_new_i32();
820     TCGv_i32 t1 = tcg_temp_new_i32();
821     TCGv_i32 t2 = tcg_temp_new_i32();
822     TCGv_i32 t3 = tcg_temp_new_i32();
823     uint32_t i;
824 
825     for (i = 0; i < oprsz; i += 4) {
826         tcg_gen_ld_i32(t1, tcg_env, aofs + i);
827         tcg_gen_ld_i32(t2, tcg_env, bofs + i);
828         tcg_gen_ld_i32(t3, tcg_env, cofs + i);
829         fni(t0, t1, t2, t3);
830         tcg_gen_st_i32(t0, tcg_env, dofs + i);
831         if (write_aofs) {
832             tcg_gen_st_i32(t1, tcg_env, aofs + i);
833         }
834     }
835     tcg_temp_free_i32(t3);
836     tcg_temp_free_i32(t2);
837     tcg_temp_free_i32(t1);
838     tcg_temp_free_i32(t0);
839 }
840 
841 static void expand_4i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
842                           uint32_t cofs, uint32_t oprsz, int32_t c,
843                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32,
844                                       int32_t))
845 {
846     TCGv_i32 t0 = tcg_temp_new_i32();
847     TCGv_i32 t1 = tcg_temp_new_i32();
848     TCGv_i32 t2 = tcg_temp_new_i32();
849     TCGv_i32 t3 = tcg_temp_new_i32();
850     uint32_t i;
851 
852     for (i = 0; i < oprsz; i += 4) {
853         tcg_gen_ld_i32(t1, tcg_env, aofs + i);
854         tcg_gen_ld_i32(t2, tcg_env, bofs + i);
855         tcg_gen_ld_i32(t3, tcg_env, cofs + i);
856         fni(t0, t1, t2, t3, c);
857         tcg_gen_st_i32(t0, tcg_env, dofs + i);
858     }
859     tcg_temp_free_i32(t3);
860     tcg_temp_free_i32(t2);
861     tcg_temp_free_i32(t1);
862     tcg_temp_free_i32(t0);
863 }
864 
865 /* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
866 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
867                          bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
868 {
869     TCGv_i64 t0 = tcg_temp_new_i64();
870     TCGv_i64 t1 = tcg_temp_new_i64();
871     uint32_t i;
872 
873     for (i = 0; i < oprsz; i += 8) {
874         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
875         if (load_dest) {
876             tcg_gen_ld_i64(t1, tcg_env, dofs + i);
877         }
878         fni(t1, t0);
879         tcg_gen_st_i64(t1, tcg_env, dofs + i);
880     }
881     tcg_temp_free_i64(t0);
882     tcg_temp_free_i64(t1);
883 }
884 
885 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
886                           int64_t c, bool load_dest,
887                           void (*fni)(TCGv_i64, TCGv_i64, int64_t))
888 {
889     TCGv_i64 t0 = tcg_temp_new_i64();
890     TCGv_i64 t1 = tcg_temp_new_i64();
891     uint32_t i;
892 
893     for (i = 0; i < oprsz; i += 8) {
894         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
895         if (load_dest) {
896             tcg_gen_ld_i64(t1, tcg_env, dofs + i);
897         }
898         fni(t1, t0, c);
899         tcg_gen_st_i64(t1, tcg_env, dofs + i);
900     }
901     tcg_temp_free_i64(t0);
902     tcg_temp_free_i64(t1);
903 }
904 
905 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
906                           TCGv_i64 c, bool scalar_first,
907                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
908 {
909     TCGv_i64 t0 = tcg_temp_new_i64();
910     TCGv_i64 t1 = tcg_temp_new_i64();
911     uint32_t i;
912 
913     for (i = 0; i < oprsz; i += 8) {
914         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
915         if (scalar_first) {
916             fni(t1, c, t0);
917         } else {
918             fni(t1, t0, c);
919         }
920         tcg_gen_st_i64(t1, tcg_env, dofs + i);
921     }
922     tcg_temp_free_i64(t0);
923     tcg_temp_free_i64(t1);
924 }
925 
926 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
927 static void expand_3_i64(uint32_t dofs, uint32_t aofs,
928                          uint32_t bofs, uint32_t oprsz, bool load_dest,
929                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
930 {
931     TCGv_i64 t0 = tcg_temp_new_i64();
932     TCGv_i64 t1 = tcg_temp_new_i64();
933     TCGv_i64 t2 = tcg_temp_new_i64();
934     uint32_t i;
935 
936     for (i = 0; i < oprsz; i += 8) {
937         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
938         tcg_gen_ld_i64(t1, tcg_env, bofs + i);
939         if (load_dest) {
940             tcg_gen_ld_i64(t2, tcg_env, dofs + i);
941         }
942         fni(t2, t0, t1);
943         tcg_gen_st_i64(t2, tcg_env, dofs + i);
944     }
945     tcg_temp_free_i64(t2);
946     tcg_temp_free_i64(t1);
947     tcg_temp_free_i64(t0);
948 }
949 
950 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
951                           uint32_t oprsz, int64_t c,
952                           bool load_dest, bool write_aofs,
953                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
954 {
955     TCGv_i64 t0 = tcg_temp_new_i64();
956     TCGv_i64 t1 = tcg_temp_new_i64();
957     TCGv_i64 t2 = tcg_temp_new_i64();
958     uint32_t i;
959 
960     for (i = 0; i < oprsz; i += 8) {
961         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
962         tcg_gen_ld_i64(t1, tcg_env, bofs + i);
963         if (load_dest) {
964             tcg_gen_ld_i64(t2, tcg_env, dofs + i);
965         }
966         fni(t2, t0, t1, c);
967         tcg_gen_st_i64(t2, tcg_env, dofs + i);
968         if (write_aofs) {
969             tcg_gen_st_i64(t0, tcg_env, aofs + i);
970         }
971     }
972     tcg_temp_free_i64(t0);
973     tcg_temp_free_i64(t1);
974     tcg_temp_free_i64(t2);
975 }
976 
977 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
978 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
979                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
980                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
981 {
982     TCGv_i64 t0 = tcg_temp_new_i64();
983     TCGv_i64 t1 = tcg_temp_new_i64();
984     TCGv_i64 t2 = tcg_temp_new_i64();
985     TCGv_i64 t3 = tcg_temp_new_i64();
986     uint32_t i;
987 
988     for (i = 0; i < oprsz; i += 8) {
989         tcg_gen_ld_i64(t1, tcg_env, aofs + i);
990         tcg_gen_ld_i64(t2, tcg_env, bofs + i);
991         tcg_gen_ld_i64(t3, tcg_env, cofs + i);
992         fni(t0, t1, t2, t3);
993         tcg_gen_st_i64(t0, tcg_env, dofs + i);
994         if (write_aofs) {
995             tcg_gen_st_i64(t1, tcg_env, aofs + i);
996         }
997     }
998     tcg_temp_free_i64(t3);
999     tcg_temp_free_i64(t2);
1000     tcg_temp_free_i64(t1);
1001     tcg_temp_free_i64(t0);
1002 }
1003 
1004 static void expand_4i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1005                           uint32_t cofs, uint32_t oprsz, int64_t c,
1006                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64,
1007                                       int64_t))
1008 {
1009     TCGv_i64 t0 = tcg_temp_new_i64();
1010     TCGv_i64 t1 = tcg_temp_new_i64();
1011     TCGv_i64 t2 = tcg_temp_new_i64();
1012     TCGv_i64 t3 = tcg_temp_new_i64();
1013     uint32_t i;
1014 
1015     for (i = 0; i < oprsz; i += 8) {
1016         tcg_gen_ld_i64(t1, tcg_env, aofs + i);
1017         tcg_gen_ld_i64(t2, tcg_env, bofs + i);
1018         tcg_gen_ld_i64(t3, tcg_env, cofs + i);
1019         fni(t0, t1, t2, t3, c);
1020         tcg_gen_st_i64(t0, tcg_env, dofs + i);
1021     }
1022     tcg_temp_free_i64(t3);
1023     tcg_temp_free_i64(t2);
1024     tcg_temp_free_i64(t1);
1025     tcg_temp_free_i64(t0);
1026 }
1027 
1028 /* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
1029 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1030                          uint32_t oprsz, uint32_t tysz, TCGType type,
1031                          bool load_dest,
1032                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
1033 {
1034     for (uint32_t i = 0; i < oprsz; i += tysz) {
1035         TCGv_vec t0 = tcg_temp_new_vec(type);
1036         TCGv_vec t1 = tcg_temp_new_vec(type);
1037 
1038         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1039         if (load_dest) {
1040             tcg_gen_ld_vec(t1, tcg_env, dofs + i);
1041         }
1042         fni(vece, t1, t0);
1043         tcg_gen_st_vec(t1, tcg_env, dofs + i);
1044     }
1045 }
1046 
1047 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
1048    using host vectors.  */
1049 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1050                           uint32_t oprsz, uint32_t tysz, TCGType type,
1051                           int64_t c, bool load_dest,
1052                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
1053 {
1054     for (uint32_t i = 0; i < oprsz; i += tysz) {
1055         TCGv_vec t0 = tcg_temp_new_vec(type);
1056         TCGv_vec t1 = tcg_temp_new_vec(type);
1057 
1058         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1059         if (load_dest) {
1060             tcg_gen_ld_vec(t1, tcg_env, dofs + i);
1061         }
1062         fni(vece, t1, t0, c);
1063         tcg_gen_st_vec(t1, tcg_env, dofs + i);
1064     }
1065 }
1066 
1067 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1068                           uint32_t oprsz, uint32_t tysz, TCGType type,
1069                           TCGv_vec c, bool scalar_first,
1070                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1071 {
1072     for (uint32_t i = 0; i < oprsz; i += tysz) {
1073         TCGv_vec t0 = tcg_temp_new_vec(type);
1074         TCGv_vec t1 = tcg_temp_new_vec(type);
1075 
1076         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1077         if (scalar_first) {
1078             fni(vece, t1, c, t0);
1079         } else {
1080             fni(vece, t1, t0, c);
1081         }
1082         tcg_gen_st_vec(t1, tcg_env, dofs + i);
1083     }
1084 }
1085 
1086 /* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
1087 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1088                          uint32_t bofs, uint32_t oprsz,
1089                          uint32_t tysz, TCGType type, bool load_dest,
1090                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1091 {
1092     for (uint32_t i = 0; i < oprsz; i += tysz) {
1093         TCGv_vec t0 = tcg_temp_new_vec(type);
1094         TCGv_vec t1 = tcg_temp_new_vec(type);
1095         TCGv_vec t2 = tcg_temp_new_vec(type);
1096 
1097         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1098         tcg_gen_ld_vec(t1, tcg_env, bofs + i);
1099         if (load_dest) {
1100             tcg_gen_ld_vec(t2, tcg_env, dofs + i);
1101         }
1102         fni(vece, t2, t0, t1);
1103         tcg_gen_st_vec(t2, tcg_env, dofs + i);
1104     }
1105 }
1106 
1107 /*
1108  * Expand OPSZ bytes worth of three-vector operands and an immediate operand
1109  * using host vectors.
1110  */
1111 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1112                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
1113                           TCGType type, int64_t c,
1114                           bool load_dest, bool write_aofs,
1115                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
1116                                       int64_t))
1117 {
1118     for (uint32_t i = 0; i < oprsz; i += tysz) {
1119         TCGv_vec t0 = tcg_temp_new_vec(type);
1120         TCGv_vec t1 = tcg_temp_new_vec(type);
1121         TCGv_vec t2 = tcg_temp_new_vec(type);
1122 
1123         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1124         tcg_gen_ld_vec(t1, tcg_env, bofs + i);
1125         if (load_dest) {
1126             tcg_gen_ld_vec(t2, tcg_env, dofs + i);
1127         }
1128         fni(vece, t2, t0, t1, c);
1129         tcg_gen_st_vec(t2, tcg_env, dofs + i);
1130         if (write_aofs) {
1131             tcg_gen_st_vec(t0, tcg_env, aofs + i);
1132         }
1133     }
1134 }
1135 
1136 /* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
1137 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1138                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1139                          uint32_t tysz, TCGType type, bool write_aofs,
1140                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1141                                      TCGv_vec, TCGv_vec))
1142 {
1143     for (uint32_t i = 0; i < oprsz; i += tysz) {
1144         TCGv_vec t0 = tcg_temp_new_vec(type);
1145         TCGv_vec t1 = tcg_temp_new_vec(type);
1146         TCGv_vec t2 = tcg_temp_new_vec(type);
1147         TCGv_vec t3 = tcg_temp_new_vec(type);
1148 
1149         tcg_gen_ld_vec(t1, tcg_env, aofs + i);
1150         tcg_gen_ld_vec(t2, tcg_env, bofs + i);
1151         tcg_gen_ld_vec(t3, tcg_env, cofs + i);
1152         fni(vece, t0, t1, t2, t3);
1153         tcg_gen_st_vec(t0, tcg_env, dofs + i);
1154         if (write_aofs) {
1155             tcg_gen_st_vec(t1, tcg_env, aofs + i);
1156         }
1157     }
1158 }
1159 
1160 /*
1161  * Expand OPSZ bytes worth of four-vector operands and an immediate operand
1162  * using host vectors.
1163  */
1164 static void expand_4i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1165                           uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1166                           uint32_t tysz, TCGType type, int64_t c,
1167                           void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1168                                      TCGv_vec, TCGv_vec, int64_t))
1169 {
1170     for (uint32_t i = 0; i < oprsz; i += tysz) {
1171         TCGv_vec t0 = tcg_temp_new_vec(type);
1172         TCGv_vec t1 = tcg_temp_new_vec(type);
1173         TCGv_vec t2 = tcg_temp_new_vec(type);
1174         TCGv_vec t3 = tcg_temp_new_vec(type);
1175 
1176         tcg_gen_ld_vec(t1, tcg_env, aofs + i);
1177         tcg_gen_ld_vec(t2, tcg_env, bofs + i);
1178         tcg_gen_ld_vec(t3, tcg_env, cofs + i);
1179         fni(vece, t0, t1, t2, t3, c);
1180         tcg_gen_st_vec(t0, tcg_env, dofs + i);
1181     }
1182 }
1183 
1184 /* Expand a vector two-operand operation.  */
1185 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
1186                     uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
1187 {
1188     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1189     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1190     TCGType type;
1191     uint32_t some;
1192 
1193     check_size_align(oprsz, maxsz, dofs | aofs);
1194     check_overlap_2(dofs, aofs, maxsz);
1195 
1196     type = 0;
1197     if (g->fniv) {
1198         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1199     }
1200     switch (type) {
1201     case TCG_TYPE_V256:
1202         /* Recall that ARM SVE allows vector sizes that are not a
1203          * power of 2, but always a multiple of 16.  The intent is
1204          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1205          */
1206         some = QEMU_ALIGN_DOWN(oprsz, 32);
1207         expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1208                      g->load_dest, g->fniv);
1209         if (some == oprsz) {
1210             break;
1211         }
1212         dofs += some;
1213         aofs += some;
1214         oprsz -= some;
1215         maxsz -= some;
1216         /* fallthru */
1217     case TCG_TYPE_V128:
1218         expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1219                      g->load_dest, g->fniv);
1220         break;
1221     case TCG_TYPE_V64:
1222         expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1223                      g->load_dest, g->fniv);
1224         break;
1225 
1226     case 0:
1227         if (g->fni8 && check_size_impl(oprsz, 8)) {
1228             expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
1229         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1230             expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
1231         } else {
1232             assert(g->fno != NULL);
1233             tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
1234             oprsz = maxsz;
1235         }
1236         break;
1237 
1238     default:
1239         g_assert_not_reached();
1240     }
1241     tcg_swap_vecop_list(hold_list);
1242 
1243     if (oprsz < maxsz) {
1244         expand_clr(dofs + oprsz, maxsz - oprsz);
1245     }
1246 }
1247 
1248 /* Expand a vector operation with two vectors and an immediate.  */
1249 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1250                      uint32_t maxsz, int64_t c, const GVecGen2i *g)
1251 {
1252     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1253     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1254     TCGType type;
1255     uint32_t some;
1256 
1257     check_size_align(oprsz, maxsz, dofs | aofs);
1258     check_overlap_2(dofs, aofs, maxsz);
1259 
1260     type = 0;
1261     if (g->fniv) {
1262         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1263     }
1264     switch (type) {
1265     case TCG_TYPE_V256:
1266         /* Recall that ARM SVE allows vector sizes that are not a
1267          * power of 2, but always a multiple of 16.  The intent is
1268          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1269          */
1270         some = QEMU_ALIGN_DOWN(oprsz, 32);
1271         expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1272                       c, g->load_dest, g->fniv);
1273         if (some == oprsz) {
1274             break;
1275         }
1276         dofs += some;
1277         aofs += some;
1278         oprsz -= some;
1279         maxsz -= some;
1280         /* fallthru */
1281     case TCG_TYPE_V128:
1282         expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1283                       c, g->load_dest, g->fniv);
1284         break;
1285     case TCG_TYPE_V64:
1286         expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1287                       c, g->load_dest, g->fniv);
1288         break;
1289 
1290     case 0:
1291         if (g->fni8 && check_size_impl(oprsz, 8)) {
1292             expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1293         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1294             expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1295         } else {
1296             if (g->fno) {
1297                 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1298             } else {
1299                 TCGv_i64 tcg_c = tcg_constant_i64(c);
1300                 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1301                                     maxsz, c, g->fnoi);
1302             }
1303             oprsz = maxsz;
1304         }
1305         break;
1306 
1307     default:
1308         g_assert_not_reached();
1309     }
1310     tcg_swap_vecop_list(hold_list);
1311 
1312     if (oprsz < maxsz) {
1313         expand_clr(dofs + oprsz, maxsz - oprsz);
1314     }
1315 }
1316 
1317 /* Expand a vector operation with two vectors and a scalar.  */
1318 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1319                      uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1320 {
1321     TCGType type;
1322 
1323     check_size_align(oprsz, maxsz, dofs | aofs);
1324     check_overlap_2(dofs, aofs, maxsz);
1325 
1326     type = 0;
1327     if (g->fniv) {
1328         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1329     }
1330     if (type != 0) {
1331         const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1332         const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1333         TCGv_vec t_vec = tcg_temp_new_vec(type);
1334         uint32_t some;
1335 
1336         tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1337 
1338         switch (type) {
1339         case TCG_TYPE_V256:
1340             /* Recall that ARM SVE allows vector sizes that are not a
1341              * power of 2, but always a multiple of 16.  The intent is
1342              * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1343              */
1344             some = QEMU_ALIGN_DOWN(oprsz, 32);
1345             expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1346                           t_vec, g->scalar_first, g->fniv);
1347             if (some == oprsz) {
1348                 break;
1349             }
1350             dofs += some;
1351             aofs += some;
1352             oprsz -= some;
1353             maxsz -= some;
1354             /* fallthru */
1355 
1356         case TCG_TYPE_V128:
1357             expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1358                           t_vec, g->scalar_first, g->fniv);
1359             break;
1360 
1361         case TCG_TYPE_V64:
1362             expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1363                           t_vec, g->scalar_first, g->fniv);
1364             break;
1365 
1366         default:
1367             g_assert_not_reached();
1368         }
1369         tcg_temp_free_vec(t_vec);
1370         tcg_swap_vecop_list(hold_list);
1371     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1372         TCGv_i64 t64 = tcg_temp_new_i64();
1373 
1374         tcg_gen_dup_i64(g->vece, t64, c);
1375         expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1376         tcg_temp_free_i64(t64);
1377     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1378         TCGv_i32 t32 = tcg_temp_new_i32();
1379 
1380         tcg_gen_extrl_i64_i32(t32, c);
1381         tcg_gen_dup_i32(g->vece, t32, t32);
1382         expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1383         tcg_temp_free_i32(t32);
1384     } else {
1385         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1386         return;
1387     }
1388 
1389     if (oprsz < maxsz) {
1390         expand_clr(dofs + oprsz, maxsz - oprsz);
1391     }
1392 }
1393 
1394 /* Expand a vector three-operand operation.  */
1395 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1396                     uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1397 {
1398     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1399     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1400     TCGType type;
1401     uint32_t some;
1402 
1403     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1404     check_overlap_3(dofs, aofs, bofs, maxsz);
1405 
1406     type = 0;
1407     if (g->fniv) {
1408         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1409     }
1410     switch (type) {
1411     case TCG_TYPE_V256:
1412         /* Recall that ARM SVE allows vector sizes that are not a
1413          * power of 2, but always a multiple of 16.  The intent is
1414          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1415          */
1416         some = QEMU_ALIGN_DOWN(oprsz, 32);
1417         expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1418                      g->load_dest, g->fniv);
1419         if (some == oprsz) {
1420             break;
1421         }
1422         dofs += some;
1423         aofs += some;
1424         bofs += some;
1425         oprsz -= some;
1426         maxsz -= some;
1427         /* fallthru */
1428     case TCG_TYPE_V128:
1429         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1430                      g->load_dest, g->fniv);
1431         break;
1432     case TCG_TYPE_V64:
1433         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1434                      g->load_dest, g->fniv);
1435         break;
1436 
1437     case 0:
1438         if (g->fni8 && check_size_impl(oprsz, 8)) {
1439             expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1440         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1441             expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1442         } else {
1443             assert(g->fno != NULL);
1444             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
1445                                maxsz, g->data, g->fno);
1446             oprsz = maxsz;
1447         }
1448         break;
1449 
1450     default:
1451         g_assert_not_reached();
1452     }
1453     tcg_swap_vecop_list(hold_list);
1454 
1455     if (oprsz < maxsz) {
1456         expand_clr(dofs + oprsz, maxsz - oprsz);
1457     }
1458 }
1459 
1460 /* Expand a vector operation with three vectors and an immediate.  */
1461 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1462                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1463                      const GVecGen3i *g)
1464 {
1465     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1466     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1467     TCGType type;
1468     uint32_t some;
1469 
1470     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1471     check_overlap_3(dofs, aofs, bofs, maxsz);
1472 
1473     type = 0;
1474     if (g->fniv) {
1475         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1476     }
1477     switch (type) {
1478     case TCG_TYPE_V256:
1479         /*
1480          * Recall that ARM SVE allows vector sizes that are not a
1481          * power of 2, but always a multiple of 16.  The intent is
1482          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1483          */
1484         some = QEMU_ALIGN_DOWN(oprsz, 32);
1485         expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1486                       c, g->load_dest, g->write_aofs, g->fniv);
1487         if (some == oprsz) {
1488             break;
1489         }
1490         dofs += some;
1491         aofs += some;
1492         bofs += some;
1493         oprsz -= some;
1494         maxsz -= some;
1495         /* fallthru */
1496     case TCG_TYPE_V128:
1497         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1498                       c, g->load_dest, g->write_aofs, g->fniv);
1499         break;
1500     case TCG_TYPE_V64:
1501         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1502                       c, g->load_dest, g->write_aofs, g->fniv);
1503         break;
1504 
1505     case 0:
1506         if (g->fni8 && check_size_impl(oprsz, 8)) {
1507             expand_3i_i64(dofs, aofs, bofs, oprsz, c,
1508                           g->load_dest, g->write_aofs, g->fni8);
1509         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1510             expand_3i_i32(dofs, aofs, bofs, oprsz, c,
1511                           g->load_dest, g->write_aofs, g->fni4);
1512         } else {
1513             assert(g->fno != NULL);
1514             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
1515             oprsz = maxsz;
1516         }
1517         break;
1518 
1519     default:
1520         g_assert_not_reached();
1521     }
1522     tcg_swap_vecop_list(hold_list);
1523 
1524     if (oprsz < maxsz) {
1525         expand_clr(dofs + oprsz, maxsz - oprsz);
1526     }
1527 }
1528 
1529 /* Expand a vector four-operand operation.  */
1530 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1531                     uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1532 {
1533     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1534     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1535     TCGType type;
1536     uint32_t some;
1537 
1538     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1539     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1540 
1541     type = 0;
1542     if (g->fniv) {
1543         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1544     }
1545     switch (type) {
1546     case TCG_TYPE_V256:
1547         /* Recall that ARM SVE allows vector sizes that are not a
1548          * power of 2, but always a multiple of 16.  The intent is
1549          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1550          */
1551         some = QEMU_ALIGN_DOWN(oprsz, 32);
1552         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1553                      32, TCG_TYPE_V256, g->write_aofs, g->fniv);
1554         if (some == oprsz) {
1555             break;
1556         }
1557         dofs += some;
1558         aofs += some;
1559         bofs += some;
1560         cofs += some;
1561         oprsz -= some;
1562         maxsz -= some;
1563         /* fallthru */
1564     case TCG_TYPE_V128:
1565         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1566                      16, TCG_TYPE_V128, g->write_aofs, g->fniv);
1567         break;
1568     case TCG_TYPE_V64:
1569         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1570                      8, TCG_TYPE_V64, g->write_aofs, g->fniv);
1571         break;
1572 
1573     case 0:
1574         if (g->fni8 && check_size_impl(oprsz, 8)) {
1575             expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
1576                          g->write_aofs, g->fni8);
1577         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1578             expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
1579                          g->write_aofs, g->fni4);
1580         } else {
1581             assert(g->fno != NULL);
1582             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1583                                oprsz, maxsz, g->data, g->fno);
1584             oprsz = maxsz;
1585         }
1586         break;
1587 
1588     default:
1589         g_assert_not_reached();
1590     }
1591     tcg_swap_vecop_list(hold_list);
1592 
1593     if (oprsz < maxsz) {
1594         expand_clr(dofs + oprsz, maxsz - oprsz);
1595     }
1596 }
1597 
1598 /* Expand a vector four-operand operation.  */
1599 void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1600                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1601                      const GVecGen4i *g)
1602 {
1603     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1604     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1605     TCGType type;
1606     uint32_t some;
1607 
1608     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1609     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1610 
1611     type = 0;
1612     if (g->fniv) {
1613         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1614     }
1615     switch (type) {
1616     case TCG_TYPE_V256:
1617         /*
1618          * Recall that ARM SVE allows vector sizes that are not a
1619          * power of 2, but always a multiple of 16.  The intent is
1620          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1621          */
1622         some = QEMU_ALIGN_DOWN(oprsz, 32);
1623         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, some,
1624                       32, TCG_TYPE_V256, c, g->fniv);
1625         if (some == oprsz) {
1626             break;
1627         }
1628         dofs += some;
1629         aofs += some;
1630         bofs += some;
1631         cofs += some;
1632         oprsz -= some;
1633         maxsz -= some;
1634         /* fallthru */
1635     case TCG_TYPE_V128:
1636         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1637                        16, TCG_TYPE_V128, c, g->fniv);
1638         break;
1639     case TCG_TYPE_V64:
1640         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1641                       8, TCG_TYPE_V64, c, g->fniv);
1642         break;
1643 
1644     case 0:
1645         if (g->fni8 && check_size_impl(oprsz, 8)) {
1646             expand_4i_i64(dofs, aofs, bofs, cofs, oprsz, c, g->fni8);
1647         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1648             expand_4i_i32(dofs, aofs, bofs, cofs, oprsz, c, g->fni4);
1649         } else {
1650             assert(g->fno != NULL);
1651             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1652                                oprsz, maxsz, c, g->fno);
1653             oprsz = maxsz;
1654         }
1655         break;
1656 
1657     default:
1658         g_assert_not_reached();
1659     }
1660     tcg_swap_vecop_list(hold_list);
1661 
1662     if (oprsz < maxsz) {
1663         expand_clr(dofs + oprsz, maxsz - oprsz);
1664     }
1665 }
1666 
1667 /*
1668  * Expand specific vector operations.
1669  */
1670 
1671 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
1672 {
1673     tcg_gen_mov_vec(a, b);
1674 }
1675 
1676 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1677                       uint32_t oprsz, uint32_t maxsz)
1678 {
1679     static const GVecGen2 g = {
1680         .fni8 = tcg_gen_mov_i64,
1681         .fniv = vec_mov2,
1682         .fno = gen_helper_gvec_mov,
1683         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1684     };
1685     if (dofs != aofs) {
1686         tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1687     } else {
1688         check_size_align(oprsz, maxsz, dofs);
1689         if (oprsz < maxsz) {
1690             expand_clr(dofs + oprsz, maxsz - oprsz);
1691         }
1692     }
1693 }
1694 
1695 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
1696                           uint32_t maxsz, TCGv_i32 in)
1697 {
1698     check_size_align(oprsz, maxsz, dofs);
1699     tcg_debug_assert(vece <= MO_32);
1700     do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1701 }
1702 
1703 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
1704                           uint32_t maxsz, TCGv_i64 in)
1705 {
1706     check_size_align(oprsz, maxsz, dofs);
1707     tcg_debug_assert(vece <= MO_64);
1708     do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1709 }
1710 
1711 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1712                           uint32_t oprsz, uint32_t maxsz)
1713 {
1714     check_size_align(oprsz, maxsz, dofs);
1715     if (vece <= MO_64) {
1716         TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
1717         if (type != 0) {
1718             TCGv_vec t_vec = tcg_temp_new_vec(type);
1719             tcg_gen_dup_mem_vec(vece, t_vec, tcg_env, aofs);
1720             do_dup_store(type, dofs, oprsz, maxsz, t_vec);
1721         } else if (vece <= MO_32) {
1722             TCGv_i32 in = tcg_temp_ebb_new_i32();
1723             switch (vece) {
1724             case MO_8:
1725                 tcg_gen_ld8u_i32(in, tcg_env, aofs);
1726                 break;
1727             case MO_16:
1728                 tcg_gen_ld16u_i32(in, tcg_env, aofs);
1729                 break;
1730             default:
1731                 tcg_gen_ld_i32(in, tcg_env, aofs);
1732                 break;
1733             }
1734             do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1735             tcg_temp_free_i32(in);
1736         } else {
1737             TCGv_i64 in = tcg_temp_ebb_new_i64();
1738             tcg_gen_ld_i64(in, tcg_env, aofs);
1739             do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1740             tcg_temp_free_i64(in);
1741         }
1742     } else if (vece == 4) {
1743         /* 128-bit duplicate.  */
1744         int i;
1745 
1746         tcg_debug_assert(oprsz >= 16);
1747         if (TCG_TARGET_HAS_v128) {
1748             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1749 
1750             tcg_gen_ld_vec(in, tcg_env, aofs);
1751             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1752                 tcg_gen_st_vec(in, tcg_env, dofs + i);
1753             }
1754         } else {
1755             TCGv_i64 in0 = tcg_temp_ebb_new_i64();
1756             TCGv_i64 in1 = tcg_temp_ebb_new_i64();
1757 
1758             tcg_gen_ld_i64(in0, tcg_env, aofs);
1759             tcg_gen_ld_i64(in1, tcg_env, aofs + 8);
1760             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1761                 tcg_gen_st_i64(in0, tcg_env, dofs + i);
1762                 tcg_gen_st_i64(in1, tcg_env, dofs + i + 8);
1763             }
1764             tcg_temp_free_i64(in0);
1765             tcg_temp_free_i64(in1);
1766         }
1767         if (oprsz < maxsz) {
1768             expand_clr(dofs + oprsz, maxsz - oprsz);
1769         }
1770     } else if (vece == 5) {
1771         /* 256-bit duplicate.  */
1772         int i;
1773 
1774         tcg_debug_assert(oprsz >= 32);
1775         tcg_debug_assert(oprsz % 32 == 0);
1776         if (TCG_TARGET_HAS_v256) {
1777             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
1778 
1779             tcg_gen_ld_vec(in, tcg_env, aofs);
1780             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1781                 tcg_gen_st_vec(in, tcg_env, dofs + i);
1782             }
1783         } else if (TCG_TARGET_HAS_v128) {
1784             TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
1785             TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
1786 
1787             tcg_gen_ld_vec(in0, tcg_env, aofs);
1788             tcg_gen_ld_vec(in1, tcg_env, aofs + 16);
1789             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1790                 tcg_gen_st_vec(in0, tcg_env, dofs + i);
1791                 tcg_gen_st_vec(in1, tcg_env, dofs + i + 16);
1792             }
1793         } else {
1794             TCGv_i64 in[4];
1795             int j;
1796 
1797             for (j = 0; j < 4; ++j) {
1798                 in[j] = tcg_temp_ebb_new_i64();
1799                 tcg_gen_ld_i64(in[j], tcg_env, aofs + j * 8);
1800             }
1801             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1802                 for (j = 0; j < 4; ++j) {
1803                     tcg_gen_st_i64(in[j], tcg_env, dofs + i + j * 8);
1804                 }
1805             }
1806             for (j = 0; j < 4; ++j) {
1807                 tcg_temp_free_i64(in[j]);
1808             }
1809         }
1810         if (oprsz < maxsz) {
1811             expand_clr(dofs + oprsz, maxsz - oprsz);
1812         }
1813     } else {
1814         g_assert_not_reached();
1815     }
1816 }
1817 
1818 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
1819                           uint32_t maxsz, uint64_t x)
1820 {
1821     check_size_align(oprsz, maxsz, dofs);
1822     do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
1823 }
1824 
1825 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1826                       uint32_t oprsz, uint32_t maxsz)
1827 {
1828     static const GVecGen2 g = {
1829         .fni8 = tcg_gen_not_i64,
1830         .fniv = tcg_gen_not_vec,
1831         .fno = gen_helper_gvec_not,
1832         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1833     };
1834     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1835 }
1836 
1837 /* Perform a vector addition using normal addition and a mask.  The mask
1838    should be the sign bit of each lane.  This 6-operation form is more
1839    efficient than separate additions when there are 4 or more lanes in
1840    the 64-bit operation.  */
1841 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1842 {
1843     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
1844     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
1845     TCGv_i64 t3 = tcg_temp_ebb_new_i64();
1846 
1847     tcg_gen_andc_i64(t1, a, m);
1848     tcg_gen_andc_i64(t2, b, m);
1849     tcg_gen_xor_i64(t3, a, b);
1850     tcg_gen_add_i64(d, t1, t2);
1851     tcg_gen_and_i64(t3, t3, m);
1852     tcg_gen_xor_i64(d, d, t3);
1853 
1854     tcg_temp_free_i64(t1);
1855     tcg_temp_free_i64(t2);
1856     tcg_temp_free_i64(t3);
1857 }
1858 
1859 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1860 {
1861     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
1862     gen_addv_mask(d, a, b, m);
1863 }
1864 
1865 void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1866 {
1867     TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
1868     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
1869     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
1870     TCGv_i32 t3 = tcg_temp_ebb_new_i32();
1871 
1872     tcg_gen_andc_i32(t1, a, m);
1873     tcg_gen_andc_i32(t2, b, m);
1874     tcg_gen_xor_i32(t3, a, b);
1875     tcg_gen_add_i32(d, t1, t2);
1876     tcg_gen_and_i32(t3, t3, m);
1877     tcg_gen_xor_i32(d, d, t3);
1878 
1879     tcg_temp_free_i32(t1);
1880     tcg_temp_free_i32(t2);
1881     tcg_temp_free_i32(t3);
1882 }
1883 
1884 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1885 {
1886     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
1887     gen_addv_mask(d, a, b, m);
1888 }
1889 
1890 void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1891 {
1892     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
1893     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
1894 
1895     tcg_gen_andi_i32(t1, a, ~0xffff);
1896     tcg_gen_add_i32(t2, a, b);
1897     tcg_gen_add_i32(t1, t1, b);
1898     tcg_gen_deposit_i32(d, t1, t2, 0, 16);
1899 
1900     tcg_temp_free_i32(t1);
1901     tcg_temp_free_i32(t2);
1902 }
1903 
1904 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1905 {
1906     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
1907     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
1908 
1909     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1910     tcg_gen_add_i64(t2, a, b);
1911     tcg_gen_add_i64(t1, t1, b);
1912     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1913 
1914     tcg_temp_free_i64(t1);
1915     tcg_temp_free_i64(t2);
1916 }
1917 
1918 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1919 
1920 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1921                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1922 {
1923     static const GVecGen3 g[4] = {
1924         { .fni8 = tcg_gen_vec_add8_i64,
1925           .fniv = tcg_gen_add_vec,
1926           .fno = gen_helper_gvec_add8,
1927           .opt_opc = vecop_list_add,
1928           .vece = MO_8 },
1929         { .fni8 = tcg_gen_vec_add16_i64,
1930           .fniv = tcg_gen_add_vec,
1931           .fno = gen_helper_gvec_add16,
1932           .opt_opc = vecop_list_add,
1933           .vece = MO_16 },
1934         { .fni4 = tcg_gen_add_i32,
1935           .fniv = tcg_gen_add_vec,
1936           .fno = gen_helper_gvec_add32,
1937           .opt_opc = vecop_list_add,
1938           .vece = MO_32 },
1939         { .fni8 = tcg_gen_add_i64,
1940           .fniv = tcg_gen_add_vec,
1941           .fno = gen_helper_gvec_add64,
1942           .opt_opc = vecop_list_add,
1943           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1944           .vece = MO_64 },
1945     };
1946 
1947     tcg_debug_assert(vece <= MO_64);
1948     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1949 }
1950 
1951 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1952                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1953 {
1954     static const GVecGen2s g[4] = {
1955         { .fni8 = tcg_gen_vec_add8_i64,
1956           .fniv = tcg_gen_add_vec,
1957           .fno = gen_helper_gvec_adds8,
1958           .opt_opc = vecop_list_add,
1959           .vece = MO_8 },
1960         { .fni8 = tcg_gen_vec_add16_i64,
1961           .fniv = tcg_gen_add_vec,
1962           .fno = gen_helper_gvec_adds16,
1963           .opt_opc = vecop_list_add,
1964           .vece = MO_16 },
1965         { .fni4 = tcg_gen_add_i32,
1966           .fniv = tcg_gen_add_vec,
1967           .fno = gen_helper_gvec_adds32,
1968           .opt_opc = vecop_list_add,
1969           .vece = MO_32 },
1970         { .fni8 = tcg_gen_add_i64,
1971           .fniv = tcg_gen_add_vec,
1972           .fno = gen_helper_gvec_adds64,
1973           .opt_opc = vecop_list_add,
1974           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1975           .vece = MO_64 },
1976     };
1977 
1978     tcg_debug_assert(vece <= MO_64);
1979     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1980 }
1981 
/*
 * Add the immediate C to the vector at AOFS and store the result at
 * DOFS.  The immediate is turned into a TCG constant and handed to
 * tcg_gen_gvec_adds.
 */
void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_adds(vece, dofs, aofs, tcg_constant_i64(c), oprsz, maxsz);
}
1988 
1989 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
1990 
1991 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1992                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1993 {
1994     static const GVecGen2s g[4] = {
1995         { .fni8 = tcg_gen_vec_sub8_i64,
1996           .fniv = tcg_gen_sub_vec,
1997           .fno = gen_helper_gvec_subs8,
1998           .opt_opc = vecop_list_sub,
1999           .vece = MO_8 },
2000         { .fni8 = tcg_gen_vec_sub16_i64,
2001           .fniv = tcg_gen_sub_vec,
2002           .fno = gen_helper_gvec_subs16,
2003           .opt_opc = vecop_list_sub,
2004           .vece = MO_16 },
2005         { .fni4 = tcg_gen_sub_i32,
2006           .fniv = tcg_gen_sub_vec,
2007           .fno = gen_helper_gvec_subs32,
2008           .opt_opc = vecop_list_sub,
2009           .vece = MO_32 },
2010         { .fni8 = tcg_gen_sub_i64,
2011           .fniv = tcg_gen_sub_vec,
2012           .fno = gen_helper_gvec_subs64,
2013           .opt_opc = vecop_list_sub,
2014           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2015           .vece = MO_64 },
2016     };
2017 
2018     tcg_debug_assert(vece <= MO_64);
2019     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2020 }
2021 
2022 /* Perform a vector subtraction using normal subtraction and a mask.
2023    Compare gen_addv_mask above.  */
2024 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
2025 {
2026     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2027     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2028     TCGv_i64 t3 = tcg_temp_ebb_new_i64();
2029 
2030     tcg_gen_or_i64(t1, a, m);
2031     tcg_gen_andc_i64(t2, b, m);
2032     tcg_gen_eqv_i64(t3, a, b);
2033     tcg_gen_sub_i64(d, t1, t2);
2034     tcg_gen_and_i64(t3, t3, m);
2035     tcg_gen_xor_i64(d, d, t3);
2036 
2037     tcg_temp_free_i64(t1);
2038     tcg_temp_free_i64(t2);
2039     tcg_temp_free_i64(t3);
2040 }
2041 
2042 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2043 {
2044     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2045     gen_subv_mask(d, a, b, m);
2046 }
2047 
2048 void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2049 {
2050     TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
2051     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
2052     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
2053     TCGv_i32 t3 = tcg_temp_ebb_new_i32();
2054 
2055     tcg_gen_or_i32(t1, a, m);
2056     tcg_gen_andc_i32(t2, b, m);
2057     tcg_gen_eqv_i32(t3, a, b);
2058     tcg_gen_sub_i32(d, t1, t2);
2059     tcg_gen_and_i32(t3, t3, m);
2060     tcg_gen_xor_i32(d, d, t3);
2061 
2062     tcg_temp_free_i32(t1);
2063     tcg_temp_free_i32(t2);
2064     tcg_temp_free_i32(t3);
2065 }
2066 
2067 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2068 {
2069     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2070     gen_subv_mask(d, a, b, m);
2071 }
2072 
2073 void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2074 {
2075     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
2076     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
2077 
2078     tcg_gen_andi_i32(t1, b, ~0xffff);
2079     tcg_gen_sub_i32(t2, a, b);
2080     tcg_gen_sub_i32(t1, a, t1);
2081     tcg_gen_deposit_i32(d, t1, t2, 0, 16);
2082 
2083     tcg_temp_free_i32(t1);
2084     tcg_temp_free_i32(t2);
2085 }
2086 
2087 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2088 {
2089     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2090     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2091 
2092     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2093     tcg_gen_sub_i64(t2, a, b);
2094     tcg_gen_sub_i64(t1, a, t1);
2095     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2096 
2097     tcg_temp_free_i64(t1);
2098     tcg_temp_free_i64(t2);
2099 }
2100 
2101 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
2102                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2103 {
2104     static const GVecGen3 g[4] = {
2105         { .fni8 = tcg_gen_vec_sub8_i64,
2106           .fniv = tcg_gen_sub_vec,
2107           .fno = gen_helper_gvec_sub8,
2108           .opt_opc = vecop_list_sub,
2109           .vece = MO_8 },
2110         { .fni8 = tcg_gen_vec_sub16_i64,
2111           .fniv = tcg_gen_sub_vec,
2112           .fno = gen_helper_gvec_sub16,
2113           .opt_opc = vecop_list_sub,
2114           .vece = MO_16 },
2115         { .fni4 = tcg_gen_sub_i32,
2116           .fniv = tcg_gen_sub_vec,
2117           .fno = gen_helper_gvec_sub32,
2118           .opt_opc = vecop_list_sub,
2119           .vece = MO_32 },
2120         { .fni8 = tcg_gen_sub_i64,
2121           .fniv = tcg_gen_sub_vec,
2122           .fno = gen_helper_gvec_sub64,
2123           .opt_opc = vecop_list_sub,
2124           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2125           .vece = MO_64 },
2126     };
2127 
2128     tcg_debug_assert(vece <= MO_64);
2129     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2130 }
2131 
2132 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
2133 
2134 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
2135                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2136 {
2137     static const GVecGen3 g[4] = {
2138         { .fniv = tcg_gen_mul_vec,
2139           .fno = gen_helper_gvec_mul8,
2140           .opt_opc = vecop_list_mul,
2141           .vece = MO_8 },
2142         { .fniv = tcg_gen_mul_vec,
2143           .fno = gen_helper_gvec_mul16,
2144           .opt_opc = vecop_list_mul,
2145           .vece = MO_16 },
2146         { .fni4 = tcg_gen_mul_i32,
2147           .fniv = tcg_gen_mul_vec,
2148           .fno = gen_helper_gvec_mul32,
2149           .opt_opc = vecop_list_mul,
2150           .vece = MO_32 },
2151         { .fni8 = tcg_gen_mul_i64,
2152           .fniv = tcg_gen_mul_vec,
2153           .fno = gen_helper_gvec_mul64,
2154           .opt_opc = vecop_list_mul,
2155           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2156           .vece = MO_64 },
2157     };
2158 
2159     tcg_debug_assert(vece <= MO_64);
2160     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2161 }
2162 
2163 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
2164                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2165 {
2166     static const GVecGen2s g[4] = {
2167         { .fniv = tcg_gen_mul_vec,
2168           .fno = gen_helper_gvec_muls8,
2169           .opt_opc = vecop_list_mul,
2170           .vece = MO_8 },
2171         { .fniv = tcg_gen_mul_vec,
2172           .fno = gen_helper_gvec_muls16,
2173           .opt_opc = vecop_list_mul,
2174           .vece = MO_16 },
2175         { .fni4 = tcg_gen_mul_i32,
2176           .fniv = tcg_gen_mul_vec,
2177           .fno = gen_helper_gvec_muls32,
2178           .opt_opc = vecop_list_mul,
2179           .vece = MO_32 },
2180         { .fni8 = tcg_gen_mul_i64,
2181           .fniv = tcg_gen_mul_vec,
2182           .fno = gen_helper_gvec_muls64,
2183           .opt_opc = vecop_list_mul,
2184           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2185           .vece = MO_64 },
2186     };
2187 
2188     tcg_debug_assert(vece <= MO_64);
2189     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2190 }
2191 
/*
 * Multiply the vector at AOFS by the immediate C and store the result
 * at DOFS.  The immediate is turned into a TCG constant and handed to
 * tcg_gen_gvec_muls.
 */
void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_muls(vece, dofs, aofs, tcg_constant_i64(c), oprsz, maxsz);
}
2198 
2199 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2200                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2201 {
2202     static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
2203     static const GVecGen3 g[4] = {
2204         { .fniv = tcg_gen_ssadd_vec,
2205           .fno = gen_helper_gvec_ssadd8,
2206           .opt_opc = vecop_list,
2207           .vece = MO_8 },
2208         { .fniv = tcg_gen_ssadd_vec,
2209           .fno = gen_helper_gvec_ssadd16,
2210           .opt_opc = vecop_list,
2211           .vece = MO_16 },
2212         { .fniv = tcg_gen_ssadd_vec,
2213           .fno = gen_helper_gvec_ssadd32,
2214           .opt_opc = vecop_list,
2215           .vece = MO_32 },
2216         { .fniv = tcg_gen_ssadd_vec,
2217           .fno = gen_helper_gvec_ssadd64,
2218           .opt_opc = vecop_list,
2219           .vece = MO_64 },
2220     };
2221     tcg_debug_assert(vece <= MO_64);
2222     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2223 }
2224 
2225 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
2226                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2227 {
2228     static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
2229     static const GVecGen3 g[4] = {
2230         { .fniv = tcg_gen_sssub_vec,
2231           .fno = gen_helper_gvec_sssub8,
2232           .opt_opc = vecop_list,
2233           .vece = MO_8 },
2234         { .fniv = tcg_gen_sssub_vec,
2235           .fno = gen_helper_gvec_sssub16,
2236           .opt_opc = vecop_list,
2237           .vece = MO_16 },
2238         { .fniv = tcg_gen_sssub_vec,
2239           .fno = gen_helper_gvec_sssub32,
2240           .opt_opc = vecop_list,
2241           .vece = MO_32 },
2242         { .fniv = tcg_gen_sssub_vec,
2243           .fno = gen_helper_gvec_sssub64,
2244           .opt_opc = vecop_list,
2245           .vece = MO_64 },
2246     };
2247     tcg_debug_assert(vece <= MO_64);
2248     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2249 }
2250 
2251 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2252 {
2253     TCGv_i32 max = tcg_constant_i32(-1);
2254     tcg_gen_add_i32(d, a, b);
2255     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
2256 }
2257 
2258 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2259 {
2260     TCGv_i64 max = tcg_constant_i64(-1);
2261     tcg_gen_add_i64(d, a, b);
2262     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
2263 }
2264 
2265 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2266                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2267 {
2268     static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
2269     static const GVecGen3 g[4] = {
2270         { .fniv = tcg_gen_usadd_vec,
2271           .fno = gen_helper_gvec_usadd8,
2272           .opt_opc = vecop_list,
2273           .vece = MO_8 },
2274         { .fniv = tcg_gen_usadd_vec,
2275           .fno = gen_helper_gvec_usadd16,
2276           .opt_opc = vecop_list,
2277           .vece = MO_16 },
2278         { .fni4 = tcg_gen_usadd_i32,
2279           .fniv = tcg_gen_usadd_vec,
2280           .fno = gen_helper_gvec_usadd32,
2281           .opt_opc = vecop_list,
2282           .vece = MO_32 },
2283         { .fni8 = tcg_gen_usadd_i64,
2284           .fniv = tcg_gen_usadd_vec,
2285           .fno = gen_helper_gvec_usadd64,
2286           .opt_opc = vecop_list,
2287           .vece = MO_64 }
2288     };
2289     tcg_debug_assert(vece <= MO_64);
2290     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2291 }
2292 
2293 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2294 {
2295     TCGv_i32 min = tcg_constant_i32(0);
2296     tcg_gen_sub_i32(d, a, b);
2297     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
2298 }
2299 
2300 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2301 {
2302     TCGv_i64 min = tcg_constant_i64(0);
2303     tcg_gen_sub_i64(d, a, b);
2304     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
2305 }
2306 
2307 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
2308                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2309 {
2310     static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2311     static const GVecGen3 g[4] = {
2312         { .fniv = tcg_gen_ussub_vec,
2313           .fno = gen_helper_gvec_ussub8,
2314           .opt_opc = vecop_list,
2315           .vece = MO_8 },
2316         { .fniv = tcg_gen_ussub_vec,
2317           .fno = gen_helper_gvec_ussub16,
2318           .opt_opc = vecop_list,
2319           .vece = MO_16 },
2320         { .fni4 = tcg_gen_ussub_i32,
2321           .fniv = tcg_gen_ussub_vec,
2322           .fno = gen_helper_gvec_ussub32,
2323           .opt_opc = vecop_list,
2324           .vece = MO_32 },
2325         { .fni8 = tcg_gen_ussub_i64,
2326           .fniv = tcg_gen_ussub_vec,
2327           .fno = gen_helper_gvec_ussub64,
2328           .opt_opc = vecop_list,
2329           .vece = MO_64 }
2330     };
2331     tcg_debug_assert(vece <= MO_64);
2332     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2333 }
2334 
2335 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2336                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2337 {
2338     static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2339     static const GVecGen3 g[4] = {
2340         { .fniv = tcg_gen_smin_vec,
2341           .fno = gen_helper_gvec_smin8,
2342           .opt_opc = vecop_list,
2343           .vece = MO_8 },
2344         { .fniv = tcg_gen_smin_vec,
2345           .fno = gen_helper_gvec_smin16,
2346           .opt_opc = vecop_list,
2347           .vece = MO_16 },
2348         { .fni4 = tcg_gen_smin_i32,
2349           .fniv = tcg_gen_smin_vec,
2350           .fno = gen_helper_gvec_smin32,
2351           .opt_opc = vecop_list,
2352           .vece = MO_32 },
2353         { .fni8 = tcg_gen_smin_i64,
2354           .fniv = tcg_gen_smin_vec,
2355           .fno = gen_helper_gvec_smin64,
2356           .opt_opc = vecop_list,
2357           .vece = MO_64 }
2358     };
2359     tcg_debug_assert(vece <= MO_64);
2360     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2361 }
2362 
2363 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2364                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2365 {
2366     static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2367     static const GVecGen3 g[4] = {
2368         { .fniv = tcg_gen_umin_vec,
2369           .fno = gen_helper_gvec_umin8,
2370           .opt_opc = vecop_list,
2371           .vece = MO_8 },
2372         { .fniv = tcg_gen_umin_vec,
2373           .fno = gen_helper_gvec_umin16,
2374           .opt_opc = vecop_list,
2375           .vece = MO_16 },
2376         { .fni4 = tcg_gen_umin_i32,
2377           .fniv = tcg_gen_umin_vec,
2378           .fno = gen_helper_gvec_umin32,
2379           .opt_opc = vecop_list,
2380           .vece = MO_32 },
2381         { .fni8 = tcg_gen_umin_i64,
2382           .fniv = tcg_gen_umin_vec,
2383           .fno = gen_helper_gvec_umin64,
2384           .opt_opc = vecop_list,
2385           .vece = MO_64 }
2386     };
2387     tcg_debug_assert(vece <= MO_64);
2388     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2389 }
2390 
2391 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2392                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2393 {
2394     static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2395     static const GVecGen3 g[4] = {
2396         { .fniv = tcg_gen_smax_vec,
2397           .fno = gen_helper_gvec_smax8,
2398           .opt_opc = vecop_list,
2399           .vece = MO_8 },
2400         { .fniv = tcg_gen_smax_vec,
2401           .fno = gen_helper_gvec_smax16,
2402           .opt_opc = vecop_list,
2403           .vece = MO_16 },
2404         { .fni4 = tcg_gen_smax_i32,
2405           .fniv = tcg_gen_smax_vec,
2406           .fno = gen_helper_gvec_smax32,
2407           .opt_opc = vecop_list,
2408           .vece = MO_32 },
2409         { .fni8 = tcg_gen_smax_i64,
2410           .fniv = tcg_gen_smax_vec,
2411           .fno = gen_helper_gvec_smax64,
2412           .opt_opc = vecop_list,
2413           .vece = MO_64 }
2414     };
2415     tcg_debug_assert(vece <= MO_64);
2416     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2417 }
2418 
2419 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2420                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2421 {
2422     static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2423     static const GVecGen3 g[4] = {
2424         { .fniv = tcg_gen_umax_vec,
2425           .fno = gen_helper_gvec_umax8,
2426           .opt_opc = vecop_list,
2427           .vece = MO_8 },
2428         { .fniv = tcg_gen_umax_vec,
2429           .fno = gen_helper_gvec_umax16,
2430           .opt_opc = vecop_list,
2431           .vece = MO_16 },
2432         { .fni4 = tcg_gen_umax_i32,
2433           .fniv = tcg_gen_umax_vec,
2434           .fno = gen_helper_gvec_umax32,
2435           .opt_opc = vecop_list,
2436           .vece = MO_32 },
2437         { .fni8 = tcg_gen_umax_i64,
2438           .fniv = tcg_gen_umax_vec,
2439           .fno = gen_helper_gvec_umax64,
2440           .opt_opc = vecop_list,
2441           .vece = MO_64 }
2442     };
2443     tcg_debug_assert(vece <= MO_64);
2444     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2445 }
2446 
2447 /* Perform a vector negation using normal negation and a mask.
2448    Compare gen_subv_mask above.  */
2449 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2450 {
2451     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2452     TCGv_i64 t3 = tcg_temp_ebb_new_i64();
2453 
2454     tcg_gen_andc_i64(t3, m, b);
2455     tcg_gen_andc_i64(t2, b, m);
2456     tcg_gen_sub_i64(d, m, t2);
2457     tcg_gen_xor_i64(d, d, t3);
2458 
2459     tcg_temp_free_i64(t2);
2460     tcg_temp_free_i64(t3);
2461 }
2462 
2463 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2464 {
2465     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2466     gen_negv_mask(d, b, m);
2467 }
2468 
2469 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2470 {
2471     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2472     gen_negv_mask(d, b, m);
2473 }
2474 
2475 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2476 {
2477     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2478     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2479 
2480     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2481     tcg_gen_neg_i64(t2, b);
2482     tcg_gen_neg_i64(t1, t1);
2483     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2484 
2485     tcg_temp_free_i64(t1);
2486     tcg_temp_free_i64(t2);
2487 }
2488 
2489 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2490                       uint32_t oprsz, uint32_t maxsz)
2491 {
2492     static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2493     static const GVecGen2 g[4] = {
2494         { .fni8 = tcg_gen_vec_neg8_i64,
2495           .fniv = tcg_gen_neg_vec,
2496           .fno = gen_helper_gvec_neg8,
2497           .opt_opc = vecop_list,
2498           .vece = MO_8 },
2499         { .fni8 = tcg_gen_vec_neg16_i64,
2500           .fniv = tcg_gen_neg_vec,
2501           .fno = gen_helper_gvec_neg16,
2502           .opt_opc = vecop_list,
2503           .vece = MO_16 },
2504         { .fni4 = tcg_gen_neg_i32,
2505           .fniv = tcg_gen_neg_vec,
2506           .fno = gen_helper_gvec_neg32,
2507           .opt_opc = vecop_list,
2508           .vece = MO_32 },
2509         { .fni8 = tcg_gen_neg_i64,
2510           .fniv = tcg_gen_neg_vec,
2511           .fno = gen_helper_gvec_neg64,
2512           .opt_opc = vecop_list,
2513           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2514           .vece = MO_64 },
2515     };
2516 
2517     tcg_debug_assert(vece <= MO_64);
2518     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2519 }
2520 
2521 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2522 {
2523     TCGv_i64 t = tcg_temp_ebb_new_i64();
2524     int nbit = 8 << vece;
2525 
2526     /* Create -1 for each negative element.  */
2527     tcg_gen_shri_i64(t, b, nbit - 1);
2528     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2529     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2530 
2531     /*
2532      * Invert (via xor -1) and add one.
2533      * Because of the ordering the msb is cleared,
2534      * so we never have carry into the next element.
2535      */
2536     tcg_gen_xor_i64(d, b, t);
2537     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2538     tcg_gen_add_i64(d, d, t);
2539 
2540     tcg_temp_free_i64(t);
2541 }
2542 
2543 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2544 {
2545     gen_absv_mask(d, b, MO_8);
2546 }
2547 
2548 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2549 {
2550     gen_absv_mask(d, b, MO_16);
2551 }
2552 
2553 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2554                       uint32_t oprsz, uint32_t maxsz)
2555 {
2556     static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2557     static const GVecGen2 g[4] = {
2558         { .fni8 = tcg_gen_vec_abs8_i64,
2559           .fniv = tcg_gen_abs_vec,
2560           .fno = gen_helper_gvec_abs8,
2561           .opt_opc = vecop_list,
2562           .vece = MO_8 },
2563         { .fni8 = tcg_gen_vec_abs16_i64,
2564           .fniv = tcg_gen_abs_vec,
2565           .fno = gen_helper_gvec_abs16,
2566           .opt_opc = vecop_list,
2567           .vece = MO_16 },
2568         { .fni4 = tcg_gen_abs_i32,
2569           .fniv = tcg_gen_abs_vec,
2570           .fno = gen_helper_gvec_abs32,
2571           .opt_opc = vecop_list,
2572           .vece = MO_32 },
2573         { .fni8 = tcg_gen_abs_i64,
2574           .fniv = tcg_gen_abs_vec,
2575           .fno = gen_helper_gvec_abs64,
2576           .opt_opc = vecop_list,
2577           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2578           .vece = MO_64 },
2579     };
2580 
2581     tcg_debug_assert(vece <= MO_64);
2582     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2583 }
2584 
2585 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2586                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2587 {
2588     static const GVecGen3 g = {
2589         .fni8 = tcg_gen_and_i64,
2590         .fniv = tcg_gen_and_vec,
2591         .fno = gen_helper_gvec_and,
2592         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2593     };
2594 
2595     if (aofs == bofs) {
2596         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2597     } else {
2598         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2599     }
2600 }
2601 
2602 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2603                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2604 {
2605     static const GVecGen3 g = {
2606         .fni8 = tcg_gen_or_i64,
2607         .fniv = tcg_gen_or_vec,
2608         .fno = gen_helper_gvec_or,
2609         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2610     };
2611 
2612     if (aofs == bofs) {
2613         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2614     } else {
2615         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2616     }
2617 }
2618 
2619 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2620                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2621 {
2622     static const GVecGen3 g = {
2623         .fni8 = tcg_gen_xor_i64,
2624         .fniv = tcg_gen_xor_vec,
2625         .fno = gen_helper_gvec_xor,
2626         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2627     };
2628 
2629     if (aofs == bofs) {
2630         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2631     } else {
2632         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2633     }
2634 }
2635 
2636 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2637                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2638 {
2639     static const GVecGen3 g = {
2640         .fni8 = tcg_gen_andc_i64,
2641         .fniv = tcg_gen_andc_vec,
2642         .fno = gen_helper_gvec_andc,
2643         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2644     };
2645 
2646     if (aofs == bofs) {
2647         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2648     } else {
2649         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2650     }
2651 }
2652 
2653 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2654                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2655 {
2656     static const GVecGen3 g = {
2657         .fni8 = tcg_gen_orc_i64,
2658         .fniv = tcg_gen_orc_vec,
2659         .fno = gen_helper_gvec_orc,
2660         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2661     };
2662 
2663     if (aofs == bofs) {
2664         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2665     } else {
2666         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2667     }
2668 }
2669 
2670 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2671                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2672 {
2673     static const GVecGen3 g = {
2674         .fni8 = tcg_gen_nand_i64,
2675         .fniv = tcg_gen_nand_vec,
2676         .fno = gen_helper_gvec_nand,
2677         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2678     };
2679 
2680     if (aofs == bofs) {
2681         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2682     } else {
2683         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2684     }
2685 }
2686 
2687 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2688                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2689 {
2690     static const GVecGen3 g = {
2691         .fni8 = tcg_gen_nor_i64,
2692         .fniv = tcg_gen_nor_vec,
2693         .fno = gen_helper_gvec_nor,
2694         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2695     };
2696 
2697     if (aofs == bofs) {
2698         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2699     } else {
2700         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2701     }
2702 }
2703 
2704 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2705                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2706 {
2707     static const GVecGen3 g = {
2708         .fni8 = tcg_gen_eqv_i64,
2709         .fniv = tcg_gen_eqv_vec,
2710         .fno = gen_helper_gvec_eqv,
2711         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2712     };
2713 
2714     if (aofs == bofs) {
2715         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2716     } else {
2717         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2718     }
2719 }
2720 
2721 static const GVecGen2s gop_ands = {
2722     .fni8 = tcg_gen_and_i64,
2723     .fniv = tcg_gen_and_vec,
2724     .fno = gen_helper_gvec_ands,
2725     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2726     .vece = MO_64
2727 };
2728 
2729 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2730                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2731 {
2732     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2733     tcg_gen_dup_i64(vece, tmp, c);
2734     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2735     tcg_temp_free_i64(tmp);
2736 }
2737 
2738 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2739                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2740 {
2741     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2742     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2743 }
2744 
2745 void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
2746                         TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2747 {
2748     static GVecGen2s g = {
2749         .fni8 = tcg_gen_andc_i64,
2750         .fniv = tcg_gen_andc_vec,
2751         .fno = gen_helper_gvec_andcs,
2752         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2753         .vece = MO_64
2754     };
2755 
2756     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2757     tcg_gen_dup_i64(vece, tmp, c);
2758     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &g);
2759     tcg_temp_free_i64(tmp);
2760 }
2761 
2762 static const GVecGen2s gop_xors = {
2763     .fni8 = tcg_gen_xor_i64,
2764     .fniv = tcg_gen_xor_vec,
2765     .fno = gen_helper_gvec_xors,
2766     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2767     .vece = MO_64
2768 };
2769 
2770 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2771                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2772 {
2773     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2774     tcg_gen_dup_i64(vece, tmp, c);
2775     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2776     tcg_temp_free_i64(tmp);
2777 }
2778 
2779 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2780                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2781 {
2782     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2783     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2784 }
2785 
2786 static const GVecGen2s gop_ors = {
2787     .fni8 = tcg_gen_or_i64,
2788     .fniv = tcg_gen_or_vec,
2789     .fno = gen_helper_gvec_ors,
2790     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2791     .vece = MO_64
2792 };
2793 
2794 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2795                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2796 {
2797     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2798     tcg_gen_dup_i64(vece, tmp, c);
2799     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2800     tcg_temp_free_i64(tmp);
2801 }
2802 
2803 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2804                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2805 {
2806     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2807     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2808 }
2809 
2810 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2811 {
2812     uint64_t mask = dup_const(MO_8, 0xff << c);
2813     tcg_gen_shli_i64(d, a, c);
2814     tcg_gen_andi_i64(d, d, mask);
2815 }
2816 
2817 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2818 {
2819     uint64_t mask = dup_const(MO_16, 0xffff << c);
2820     tcg_gen_shli_i64(d, a, c);
2821     tcg_gen_andi_i64(d, d, mask);
2822 }
2823 
2824 void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2825 {
2826     uint32_t mask = dup_const(MO_8, 0xff << c);
2827     tcg_gen_shli_i32(d, a, c);
2828     tcg_gen_andi_i32(d, d, mask);
2829 }
2830 
2831 void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2832 {
2833     uint32_t mask = dup_const(MO_16, 0xffff << c);
2834     tcg_gen_shli_i32(d, a, c);
2835     tcg_gen_andi_i32(d, d, mask);
2836 }
2837 
2838 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2839                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2840 {
2841     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2842     static const GVecGen2i g[4] = {
2843         { .fni8 = tcg_gen_vec_shl8i_i64,
2844           .fniv = tcg_gen_shli_vec,
2845           .fno = gen_helper_gvec_shl8i,
2846           .opt_opc = vecop_list,
2847           .vece = MO_8 },
2848         { .fni8 = tcg_gen_vec_shl16i_i64,
2849           .fniv = tcg_gen_shli_vec,
2850           .fno = gen_helper_gvec_shl16i,
2851           .opt_opc = vecop_list,
2852           .vece = MO_16 },
2853         { .fni4 = tcg_gen_shli_i32,
2854           .fniv = tcg_gen_shli_vec,
2855           .fno = gen_helper_gvec_shl32i,
2856           .opt_opc = vecop_list,
2857           .vece = MO_32 },
2858         { .fni8 = tcg_gen_shli_i64,
2859           .fniv = tcg_gen_shli_vec,
2860           .fno = gen_helper_gvec_shl64i,
2861           .opt_opc = vecop_list,
2862           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2863           .vece = MO_64 },
2864     };
2865 
2866     tcg_debug_assert(vece <= MO_64);
2867     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2868     if (shift == 0) {
2869         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2870     } else {
2871         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2872     }
2873 }
2874 
2875 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2876 {
2877     uint64_t mask = dup_const(MO_8, 0xff >> c);
2878     tcg_gen_shri_i64(d, a, c);
2879     tcg_gen_andi_i64(d, d, mask);
2880 }
2881 
2882 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2883 {
2884     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2885     tcg_gen_shri_i64(d, a, c);
2886     tcg_gen_andi_i64(d, d, mask);
2887 }
2888 
2889 void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2890 {
2891     uint32_t mask = dup_const(MO_8, 0xff >> c);
2892     tcg_gen_shri_i32(d, a, c);
2893     tcg_gen_andi_i32(d, d, mask);
2894 }
2895 
2896 void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2897 {
2898     uint32_t mask = dup_const(MO_16, 0xffff >> c);
2899     tcg_gen_shri_i32(d, a, c);
2900     tcg_gen_andi_i32(d, d, mask);
2901 }
2902 
2903 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2904                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2905 {
2906     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2907     static const GVecGen2i g[4] = {
2908         { .fni8 = tcg_gen_vec_shr8i_i64,
2909           .fniv = tcg_gen_shri_vec,
2910           .fno = gen_helper_gvec_shr8i,
2911           .opt_opc = vecop_list,
2912           .vece = MO_8 },
2913         { .fni8 = tcg_gen_vec_shr16i_i64,
2914           .fniv = tcg_gen_shri_vec,
2915           .fno = gen_helper_gvec_shr16i,
2916           .opt_opc = vecop_list,
2917           .vece = MO_16 },
2918         { .fni4 = tcg_gen_shri_i32,
2919           .fniv = tcg_gen_shri_vec,
2920           .fno = gen_helper_gvec_shr32i,
2921           .opt_opc = vecop_list,
2922           .vece = MO_32 },
2923         { .fni8 = tcg_gen_shri_i64,
2924           .fniv = tcg_gen_shri_vec,
2925           .fno = gen_helper_gvec_shr64i,
2926           .opt_opc = vecop_list,
2927           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2928           .vece = MO_64 },
2929     };
2930 
2931     tcg_debug_assert(vece <= MO_64);
2932     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2933     if (shift == 0) {
2934         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2935     } else {
2936         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2937     }
2938 }
2939 
2940 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2941 {
2942     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2943     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2944     TCGv_i64 s = tcg_temp_ebb_new_i64();
2945 
2946     tcg_gen_shri_i64(d, a, c);
2947     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2948     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2949     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2950     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2951     tcg_temp_free_i64(s);
2952 }
2953 
2954 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2955 {
2956     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2957     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2958     TCGv_i64 s = tcg_temp_ebb_new_i64();
2959 
2960     tcg_gen_shri_i64(d, a, c);
2961     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2962     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2963     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2964     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2965     tcg_temp_free_i64(s);
2966 }
2967 
2968 void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2969 {
2970     uint32_t s_mask = dup_const(MO_8, 0x80 >> c);
2971     uint32_t c_mask = dup_const(MO_8, 0xff >> c);
2972     TCGv_i32 s = tcg_temp_ebb_new_i32();
2973 
2974     tcg_gen_shri_i32(d, a, c);
2975     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
2976     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
2977     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
2978     tcg_gen_or_i32(d, d, s);         /* include sign extension */
2979     tcg_temp_free_i32(s);
2980 }
2981 
2982 void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2983 {
2984     uint32_t s_mask = dup_const(MO_16, 0x8000 >> c);
2985     uint32_t c_mask = dup_const(MO_16, 0xffff >> c);
2986     TCGv_i32 s = tcg_temp_ebb_new_i32();
2987 
2988     tcg_gen_shri_i32(d, a, c);
2989     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
2990     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
2991     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
2992     tcg_gen_or_i32(d, d, s);         /* include sign extension */
2993     tcg_temp_free_i32(s);
2994 }
2995 
2996 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2997                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2998 {
2999     static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
3000     static const GVecGen2i g[4] = {
3001         { .fni8 = tcg_gen_vec_sar8i_i64,
3002           .fniv = tcg_gen_sari_vec,
3003           .fno = gen_helper_gvec_sar8i,
3004           .opt_opc = vecop_list,
3005           .vece = MO_8 },
3006         { .fni8 = tcg_gen_vec_sar16i_i64,
3007           .fniv = tcg_gen_sari_vec,
3008           .fno = gen_helper_gvec_sar16i,
3009           .opt_opc = vecop_list,
3010           .vece = MO_16 },
3011         { .fni4 = tcg_gen_sari_i32,
3012           .fniv = tcg_gen_sari_vec,
3013           .fno = gen_helper_gvec_sar32i,
3014           .opt_opc = vecop_list,
3015           .vece = MO_32 },
3016         { .fni8 = tcg_gen_sari_i64,
3017           .fniv = tcg_gen_sari_vec,
3018           .fno = gen_helper_gvec_sar64i,
3019           .opt_opc = vecop_list,
3020           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3021           .vece = MO_64 },
3022     };
3023 
3024     tcg_debug_assert(vece <= MO_64);
3025     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3026     if (shift == 0) {
3027         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3028     } else {
3029         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3030     }
3031 }
3032 
3033 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3034 {
3035     uint64_t mask = dup_const(MO_8, 0xff << c);
3036 
3037     tcg_gen_shli_i64(d, a, c);
3038     tcg_gen_shri_i64(a, a, 8 - c);
3039     tcg_gen_andi_i64(d, d, mask);
3040     tcg_gen_andi_i64(a, a, ~mask);
3041     tcg_gen_or_i64(d, d, a);
3042 }
3043 
3044 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3045 {
3046     uint64_t mask = dup_const(MO_16, 0xffff << c);
3047 
3048     tcg_gen_shli_i64(d, a, c);
3049     tcg_gen_shri_i64(a, a, 16 - c);
3050     tcg_gen_andi_i64(d, d, mask);
3051     tcg_gen_andi_i64(a, a, ~mask);
3052     tcg_gen_or_i64(d, d, a);
3053 }
3054 
3055 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
3056                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
3057 {
3058     static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
3059     static const GVecGen2i g[4] = {
3060         { .fni8 = tcg_gen_vec_rotl8i_i64,
3061           .fniv = tcg_gen_rotli_vec,
3062           .fno = gen_helper_gvec_rotl8i,
3063           .opt_opc = vecop_list,
3064           .vece = MO_8 },
3065         { .fni8 = tcg_gen_vec_rotl16i_i64,
3066           .fniv = tcg_gen_rotli_vec,
3067           .fno = gen_helper_gvec_rotl16i,
3068           .opt_opc = vecop_list,
3069           .vece = MO_16 },
3070         { .fni4 = tcg_gen_rotli_i32,
3071           .fniv = tcg_gen_rotli_vec,
3072           .fno = gen_helper_gvec_rotl32i,
3073           .opt_opc = vecop_list,
3074           .vece = MO_32 },
3075         { .fni8 = tcg_gen_rotli_i64,
3076           .fniv = tcg_gen_rotli_vec,
3077           .fno = gen_helper_gvec_rotl64i,
3078           .opt_opc = vecop_list,
3079           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3080           .vece = MO_64 },
3081     };
3082 
3083     tcg_debug_assert(vece <= MO_64);
3084     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3085     if (shift == 0) {
3086         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3087     } else {
3088         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3089     }
3090 }
3091 
3092 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
3093                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
3094 {
3095     tcg_debug_assert(vece <= MO_64);
3096     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3097     tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
3098                        oprsz, maxsz);
3099 }
3100 
/*
 * Specialized generation of vector shifts by a non-constant scalar.
 */
3104 
/*
 * Descriptor for a vector shift/rotate whose count is a runtime scalar;
 * consumed by do_gvec_shifts().
 */
typedef struct {
    /* Integral expansion used for 32-bit elements. */
    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
    /* Integral expansion used for 64-bit elements. */
    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
    /* Vector expansion taking the count as an i32 scalar. */
    void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
    /* Vector expansion taking the count as a vector. */
    void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
    /* Out-of-line helpers, indexed by element size (vece). */
    gen_helper_gvec_2 *fno[4];
    /* Opcode list passed to choose_vector_type() for the fniv_s path. */
    TCGOpcode s_list[2];
    /* Opcode list passed to choose_vector_type() for the fniv_v path. */
    TCGOpcode v_list[2];
} GVecGen2sh;
3114 
3115 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3116                            uint32_t oprsz, uint32_t tysz, TCGType type,
3117                            TCGv_i32 shift,
3118                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
3119 {
3120     for (uint32_t i = 0; i < oprsz; i += tysz) {
3121         TCGv_vec t0 = tcg_temp_new_vec(type);
3122         TCGv_vec t1 = tcg_temp_new_vec(type);
3123 
3124         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
3125         fni(vece, t1, t0, shift);
3126         tcg_gen_st_vec(t1, tcg_env, dofs + i);
3127     }
3128 }
3129 
3130 static void
3131 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
3132                uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
3133 {
3134     TCGType type;
3135     uint32_t some;
3136 
3137     check_size_align(oprsz, maxsz, dofs | aofs);
3138     check_overlap_2(dofs, aofs, maxsz);
3139 
3140     /* If the backend has a scalar expansion, great.  */
3141     type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
3142     if (type) {
3143         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3144         switch (type) {
3145         case TCG_TYPE_V256:
3146             some = QEMU_ALIGN_DOWN(oprsz, 32);
3147             expand_2sh_vec(vece, dofs, aofs, some, 32,
3148                            TCG_TYPE_V256, shift, g->fniv_s);
3149             if (some == oprsz) {
3150                 break;
3151             }
3152             dofs += some;
3153             aofs += some;
3154             oprsz -= some;
3155             maxsz -= some;
3156             /* fallthru */
3157         case TCG_TYPE_V128:
3158             expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
3159                            TCG_TYPE_V128, shift, g->fniv_s);
3160             break;
3161         case TCG_TYPE_V64:
3162             expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
3163                            TCG_TYPE_V64, shift, g->fniv_s);
3164             break;
3165         default:
3166             g_assert_not_reached();
3167         }
3168         tcg_swap_vecop_list(hold_list);
3169         goto clear_tail;
3170     }
3171 
3172     /* If the backend supports variable vector shifts, also cool.  */
3173     type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
3174     if (type) {
3175         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3176         TCGv_vec v_shift = tcg_temp_new_vec(type);
3177 
3178         if (vece == MO_64) {
3179             TCGv_i64 sh64 = tcg_temp_ebb_new_i64();
3180             tcg_gen_extu_i32_i64(sh64, shift);
3181             tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
3182             tcg_temp_free_i64(sh64);
3183         } else {
3184             tcg_gen_dup_i32_vec(vece, v_shift, shift);
3185         }
3186 
3187         switch (type) {
3188         case TCG_TYPE_V256:
3189             some = QEMU_ALIGN_DOWN(oprsz, 32);
3190             expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
3191                           v_shift, false, g->fniv_v);
3192             if (some == oprsz) {
3193                 break;
3194             }
3195             dofs += some;
3196             aofs += some;
3197             oprsz -= some;
3198             maxsz -= some;
3199             /* fallthru */
3200         case TCG_TYPE_V128:
3201             expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
3202                           v_shift, false, g->fniv_v);
3203             break;
3204         case TCG_TYPE_V64:
3205             expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
3206                           v_shift, false, g->fniv_v);
3207             break;
3208         default:
3209             g_assert_not_reached();
3210         }
3211         tcg_temp_free_vec(v_shift);
3212         tcg_swap_vecop_list(hold_list);
3213         goto clear_tail;
3214     }
3215 
3216     /* Otherwise fall back to integral... */
3217     if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3218         expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
3219     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3220         TCGv_i64 sh64 = tcg_temp_ebb_new_i64();
3221         tcg_gen_extu_i32_i64(sh64, shift);
3222         expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
3223         tcg_temp_free_i64(sh64);
3224     } else {
3225         TCGv_ptr a0 = tcg_temp_ebb_new_ptr();
3226         TCGv_ptr a1 = tcg_temp_ebb_new_ptr();
3227         TCGv_i32 desc = tcg_temp_ebb_new_i32();
3228 
3229         tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
3230         tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
3231         tcg_gen_addi_ptr(a0, tcg_env, dofs);
3232         tcg_gen_addi_ptr(a1, tcg_env, aofs);
3233 
3234         g->fno[vece](a0, a1, desc);
3235 
3236         tcg_temp_free_ptr(a0);
3237         tcg_temp_free_ptr(a1);
3238         tcg_temp_free_i32(desc);
3239         return;
3240     }
3241 
3242  clear_tail:
3243     if (oprsz < maxsz) {
3244         expand_clr(dofs + oprsz, maxsz - oprsz);
3245     }
3246 }
3247 
3248 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
3249                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3250 {
3251     static const GVecGen2sh g = {
3252         .fni4 = tcg_gen_shl_i32,
3253         .fni8 = tcg_gen_shl_i64,
3254         .fniv_s = tcg_gen_shls_vec,
3255         .fniv_v = tcg_gen_shlv_vec,
3256         .fno = {
3257             gen_helper_gvec_shl8i,
3258             gen_helper_gvec_shl16i,
3259             gen_helper_gvec_shl32i,
3260             gen_helper_gvec_shl64i,
3261         },
3262         .s_list = { INDEX_op_shls_vec, 0 },
3263         .v_list = { INDEX_op_shlv_vec, 0 },
3264     };
3265 
3266     tcg_debug_assert(vece <= MO_64);
3267     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3268 }
3269 
3270 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3271                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3272 {
3273     static const GVecGen2sh g = {
3274         .fni4 = tcg_gen_shr_i32,
3275         .fni8 = tcg_gen_shr_i64,
3276         .fniv_s = tcg_gen_shrs_vec,
3277         .fniv_v = tcg_gen_shrv_vec,
3278         .fno = {
3279             gen_helper_gvec_shr8i,
3280             gen_helper_gvec_shr16i,
3281             gen_helper_gvec_shr32i,
3282             gen_helper_gvec_shr64i,
3283         },
3284         .s_list = { INDEX_op_shrs_vec, 0 },
3285         .v_list = { INDEX_op_shrv_vec, 0 },
3286     };
3287 
3288     tcg_debug_assert(vece <= MO_64);
3289     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3290 }
3291 
3292 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
3293                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3294 {
3295     static const GVecGen2sh g = {
3296         .fni4 = tcg_gen_sar_i32,
3297         .fni8 = tcg_gen_sar_i64,
3298         .fniv_s = tcg_gen_sars_vec,
3299         .fniv_v = tcg_gen_sarv_vec,
3300         .fno = {
3301             gen_helper_gvec_sar8i,
3302             gen_helper_gvec_sar16i,
3303             gen_helper_gvec_sar32i,
3304             gen_helper_gvec_sar64i,
3305         },
3306         .s_list = { INDEX_op_sars_vec, 0 },
3307         .v_list = { INDEX_op_sarv_vec, 0 },
3308     };
3309 
3310     tcg_debug_assert(vece <= MO_64);
3311     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3312 }
3313 
3314 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
3315                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3316 {
3317     static const GVecGen2sh g = {
3318         .fni4 = tcg_gen_rotl_i32,
3319         .fni8 = tcg_gen_rotl_i64,
3320         .fniv_s = tcg_gen_rotls_vec,
3321         .fniv_v = tcg_gen_rotlv_vec,
3322         .fno = {
3323             gen_helper_gvec_rotl8i,
3324             gen_helper_gvec_rotl16i,
3325             gen_helper_gvec_rotl32i,
3326             gen_helper_gvec_rotl64i,
3327         },
3328         .s_list = { INDEX_op_rotls_vec, 0 },
3329         .v_list = { INDEX_op_rotlv_vec, 0 },
3330     };
3331 
3332     tcg_debug_assert(vece <= MO_64);
3333     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3334 }
3335 
3336 void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3337                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3338 {
3339     TCGv_i32 tmp = tcg_temp_ebb_new_i32();
3340 
3341     tcg_gen_neg_i32(tmp, shift);
3342     tcg_gen_andi_i32(tmp, tmp, (8 << vece) - 1);
3343     tcg_gen_gvec_rotls(vece, dofs, aofs, tmp, oprsz, maxsz);
3344     tcg_temp_free_i32(tmp);
3345 }
3346 
/*
 * Expand D = A << (B % element bits)
 *
 * Unlike scalar shifts, where it is easy for the target front end
 * to include the modulo as part of the expansion, vector shifts
 * apply the modulo here.  If the target naturally includes the
 * modulo as part of the operation, great!  If the target has some
 * other behaviour for out-of-range shifts, then it could not use
 * this function anyway, and would need to do its own expansion
 * with custom functions.
 */
3357 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
3358                                  TCGv_vec a, TCGv_vec b)
3359 {
3360     TCGv_vec t = tcg_temp_new_vec_matching(d);
3361     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3362 
3363     tcg_gen_and_vec(vece, t, b, m);
3364     tcg_gen_shlv_vec(vece, d, a, t);
3365     tcg_temp_free_vec(t);
3366 }
3367 
3368 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3369 {
3370     TCGv_i32 t = tcg_temp_ebb_new_i32();
3371 
3372     tcg_gen_andi_i32(t, b, 31);
3373     tcg_gen_shl_i32(d, a, t);
3374     tcg_temp_free_i32(t);
3375 }
3376 
3377 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3378 {
3379     TCGv_i64 t = tcg_temp_ebb_new_i64();
3380 
3381     tcg_gen_andi_i64(t, b, 63);
3382     tcg_gen_shl_i64(d, a, t);
3383     tcg_temp_free_i64(t);
3384 }
3385 
3386 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3387                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3388 {
3389     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
3390     static const GVecGen3 g[4] = {
3391         { .fniv = tcg_gen_shlv_mod_vec,
3392           .fno = gen_helper_gvec_shl8v,
3393           .opt_opc = vecop_list,
3394           .vece = MO_8 },
3395         { .fniv = tcg_gen_shlv_mod_vec,
3396           .fno = gen_helper_gvec_shl16v,
3397           .opt_opc = vecop_list,
3398           .vece = MO_16 },
3399         { .fni4 = tcg_gen_shl_mod_i32,
3400           .fniv = tcg_gen_shlv_mod_vec,
3401           .fno = gen_helper_gvec_shl32v,
3402           .opt_opc = vecop_list,
3403           .vece = MO_32 },
3404         { .fni8 = tcg_gen_shl_mod_i64,
3405           .fniv = tcg_gen_shlv_mod_vec,
3406           .fno = gen_helper_gvec_shl64v,
3407           .opt_opc = vecop_list,
3408           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3409           .vece = MO_64 },
3410     };
3411 
3412     tcg_debug_assert(vece <= MO_64);
3413     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3414 }
3415 
3416 /*
3417  * Similarly for logical right shifts.
3418  */
3419 
3420 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
3421                                  TCGv_vec a, TCGv_vec b)
3422 {
3423     TCGv_vec t = tcg_temp_new_vec_matching(d);
3424     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3425 
3426     tcg_gen_and_vec(vece, t, b, m);
3427     tcg_gen_shrv_vec(vece, d, a, t);
3428     tcg_temp_free_vec(t);
3429 }
3430 
3431 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3432 {
3433     TCGv_i32 t = tcg_temp_ebb_new_i32();
3434 
3435     tcg_gen_andi_i32(t, b, 31);
3436     tcg_gen_shr_i32(d, a, t);
3437     tcg_temp_free_i32(t);
3438 }
3439 
3440 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3441 {
3442     TCGv_i64 t = tcg_temp_ebb_new_i64();
3443 
3444     tcg_gen_andi_i64(t, b, 63);
3445     tcg_gen_shr_i64(d, a, t);
3446     tcg_temp_free_i64(t);
3447 }
3448 
3449 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3450                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3451 {
3452     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
3453     static const GVecGen3 g[4] = {
3454         { .fniv = tcg_gen_shrv_mod_vec,
3455           .fno = gen_helper_gvec_shr8v,
3456           .opt_opc = vecop_list,
3457           .vece = MO_8 },
3458         { .fniv = tcg_gen_shrv_mod_vec,
3459           .fno = gen_helper_gvec_shr16v,
3460           .opt_opc = vecop_list,
3461           .vece = MO_16 },
3462         { .fni4 = tcg_gen_shr_mod_i32,
3463           .fniv = tcg_gen_shrv_mod_vec,
3464           .fno = gen_helper_gvec_shr32v,
3465           .opt_opc = vecop_list,
3466           .vece = MO_32 },
3467         { .fni8 = tcg_gen_shr_mod_i64,
3468           .fniv = tcg_gen_shrv_mod_vec,
3469           .fno = gen_helper_gvec_shr64v,
3470           .opt_opc = vecop_list,
3471           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3472           .vece = MO_64 },
3473     };
3474 
3475     tcg_debug_assert(vece <= MO_64);
3476     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3477 }
3478 
3479 /*
3480  * Similarly for arithmetic right shifts.
3481  */
3482 
3483 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3484                                  TCGv_vec a, TCGv_vec b)
3485 {
3486     TCGv_vec t = tcg_temp_new_vec_matching(d);
3487     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3488 
3489     tcg_gen_and_vec(vece, t, b, m);
3490     tcg_gen_sarv_vec(vece, d, a, t);
3491     tcg_temp_free_vec(t);
3492 }
3493 
3494 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3495 {
3496     TCGv_i32 t = tcg_temp_ebb_new_i32();
3497 
3498     tcg_gen_andi_i32(t, b, 31);
3499     tcg_gen_sar_i32(d, a, t);
3500     tcg_temp_free_i32(t);
3501 }
3502 
3503 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3504 {
3505     TCGv_i64 t = tcg_temp_ebb_new_i64();
3506 
3507     tcg_gen_andi_i64(t, b, 63);
3508     tcg_gen_sar_i64(d, a, t);
3509     tcg_temp_free_i64(t);
3510 }
3511 
3512 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3513                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3514 {
3515     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3516     static const GVecGen3 g[4] = {
3517         { .fniv = tcg_gen_sarv_mod_vec,
3518           .fno = gen_helper_gvec_sar8v,
3519           .opt_opc = vecop_list,
3520           .vece = MO_8 },
3521         { .fniv = tcg_gen_sarv_mod_vec,
3522           .fno = gen_helper_gvec_sar16v,
3523           .opt_opc = vecop_list,
3524           .vece = MO_16 },
3525         { .fni4 = tcg_gen_sar_mod_i32,
3526           .fniv = tcg_gen_sarv_mod_vec,
3527           .fno = gen_helper_gvec_sar32v,
3528           .opt_opc = vecop_list,
3529           .vece = MO_32 },
3530         { .fni8 = tcg_gen_sar_mod_i64,
3531           .fniv = tcg_gen_sarv_mod_vec,
3532           .fno = gen_helper_gvec_sar64v,
3533           .opt_opc = vecop_list,
3534           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3535           .vece = MO_64 },
3536     };
3537 
3538     tcg_debug_assert(vece <= MO_64);
3539     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3540 }
3541 
3542 /*
3543  * Similarly for rotates.
3544  */
3545 
3546 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
3547                                   TCGv_vec a, TCGv_vec b)
3548 {
3549     TCGv_vec t = tcg_temp_new_vec_matching(d);
3550     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3551 
3552     tcg_gen_and_vec(vece, t, b, m);
3553     tcg_gen_rotlv_vec(vece, d, a, t);
3554     tcg_temp_free_vec(t);
3555 }
3556 
3557 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3558 {
3559     TCGv_i32 t = tcg_temp_ebb_new_i32();
3560 
3561     tcg_gen_andi_i32(t, b, 31);
3562     tcg_gen_rotl_i32(d, a, t);
3563     tcg_temp_free_i32(t);
3564 }
3565 
3566 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3567 {
3568     TCGv_i64 t = tcg_temp_ebb_new_i64();
3569 
3570     tcg_gen_andi_i64(t, b, 63);
3571     tcg_gen_rotl_i64(d, a, t);
3572     tcg_temp_free_i64(t);
3573 }
3574 
3575 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3576                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3577 {
3578     static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
3579     static const GVecGen3 g[4] = {
3580         { .fniv = tcg_gen_rotlv_mod_vec,
3581           .fno = gen_helper_gvec_rotl8v,
3582           .opt_opc = vecop_list,
3583           .vece = MO_8 },
3584         { .fniv = tcg_gen_rotlv_mod_vec,
3585           .fno = gen_helper_gvec_rotl16v,
3586           .opt_opc = vecop_list,
3587           .vece = MO_16 },
3588         { .fni4 = tcg_gen_rotl_mod_i32,
3589           .fniv = tcg_gen_rotlv_mod_vec,
3590           .fno = gen_helper_gvec_rotl32v,
3591           .opt_opc = vecop_list,
3592           .vece = MO_32 },
3593         { .fni8 = tcg_gen_rotl_mod_i64,
3594           .fniv = tcg_gen_rotlv_mod_vec,
3595           .fno = gen_helper_gvec_rotl64v,
3596           .opt_opc = vecop_list,
3597           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3598           .vece = MO_64 },
3599     };
3600 
3601     tcg_debug_assert(vece <= MO_64);
3602     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3603 }
3604 
3605 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
3606                                   TCGv_vec a, TCGv_vec b)
3607 {
3608     TCGv_vec t = tcg_temp_new_vec_matching(d);
3609     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3610 
3611     tcg_gen_and_vec(vece, t, b, m);
3612     tcg_gen_rotrv_vec(vece, d, a, t);
3613     tcg_temp_free_vec(t);
3614 }
3615 
3616 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3617 {
3618     TCGv_i32 t = tcg_temp_ebb_new_i32();
3619 
3620     tcg_gen_andi_i32(t, b, 31);
3621     tcg_gen_rotr_i32(d, a, t);
3622     tcg_temp_free_i32(t);
3623 }
3624 
3625 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3626 {
3627     TCGv_i64 t = tcg_temp_ebb_new_i64();
3628 
3629     tcg_gen_andi_i64(t, b, 63);
3630     tcg_gen_rotr_i64(d, a, t);
3631     tcg_temp_free_i64(t);
3632 }
3633 
3634 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3635                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3636 {
3637     static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
3638     static const GVecGen3 g[4] = {
3639         { .fniv = tcg_gen_rotrv_mod_vec,
3640           .fno = gen_helper_gvec_rotr8v,
3641           .opt_opc = vecop_list,
3642           .vece = MO_8 },
3643         { .fniv = tcg_gen_rotrv_mod_vec,
3644           .fno = gen_helper_gvec_rotr16v,
3645           .opt_opc = vecop_list,
3646           .vece = MO_16 },
3647         { .fni4 = tcg_gen_rotr_mod_i32,
3648           .fniv = tcg_gen_rotrv_mod_vec,
3649           .fno = gen_helper_gvec_rotr32v,
3650           .opt_opc = vecop_list,
3651           .vece = MO_32 },
3652         { .fni8 = tcg_gen_rotr_mod_i64,
3653           .fniv = tcg_gen_rotrv_mod_vec,
3654           .fno = gen_helper_gvec_rotr64v,
3655           .opt_opc = vecop_list,
3656           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3657           .vece = MO_64 },
3658     };
3659 
3660     tcg_debug_assert(vece <= MO_64);
3661     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3662 }
3663 
3664 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
3665 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3666                            uint32_t oprsz, TCGCond cond)
3667 {
3668     TCGv_i32 t0 = tcg_temp_ebb_new_i32();
3669     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
3670     uint32_t i;
3671 
3672     for (i = 0; i < oprsz; i += 4) {
3673         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
3674         tcg_gen_ld_i32(t1, tcg_env, bofs + i);
3675         tcg_gen_negsetcond_i32(cond, t0, t0, t1);
3676         tcg_gen_st_i32(t0, tcg_env, dofs + i);
3677     }
3678     tcg_temp_free_i32(t1);
3679     tcg_temp_free_i32(t0);
3680 }
3681 
3682 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3683                            uint32_t oprsz, TCGCond cond)
3684 {
3685     TCGv_i64 t0 = tcg_temp_ebb_new_i64();
3686     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
3687     uint32_t i;
3688 
3689     for (i = 0; i < oprsz; i += 8) {
3690         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
3691         tcg_gen_ld_i64(t1, tcg_env, bofs + i);
3692         tcg_gen_negsetcond_i64(cond, t0, t0, t1);
3693         tcg_gen_st_i64(t0, tcg_env, dofs + i);
3694     }
3695     tcg_temp_free_i64(t1);
3696     tcg_temp_free_i64(t0);
3697 }
3698 
3699 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3700                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3701                            TCGType type, TCGCond cond)
3702 {
3703     for (uint32_t i = 0; i < oprsz; i += tysz) {
3704         TCGv_vec t0 = tcg_temp_new_vec(type);
3705         TCGv_vec t1 = tcg_temp_new_vec(type);
3706         TCGv_vec t2 = tcg_temp_new_vec(type);
3707 
3708         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
3709         tcg_gen_ld_vec(t1, tcg_env, bofs + i);
3710         tcg_gen_cmp_vec(cond, vece, t2, t0, t1);
3711         tcg_gen_st_vec(t2, tcg_env, dofs + i);
3712     }
3713 }
3714 
3715 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3716                       uint32_t aofs, uint32_t bofs,
3717                       uint32_t oprsz, uint32_t maxsz)
3718 {
3719     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3720     static gen_helper_gvec_3 * const eq_fn[4] = {
3721         gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3722         gen_helper_gvec_eq32, gen_helper_gvec_eq64
3723     };
3724     static gen_helper_gvec_3 * const ne_fn[4] = {
3725         gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3726         gen_helper_gvec_ne32, gen_helper_gvec_ne64
3727     };
3728     static gen_helper_gvec_3 * const lt_fn[4] = {
3729         gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3730         gen_helper_gvec_lt32, gen_helper_gvec_lt64
3731     };
3732     static gen_helper_gvec_3 * const le_fn[4] = {
3733         gen_helper_gvec_le8, gen_helper_gvec_le16,
3734         gen_helper_gvec_le32, gen_helper_gvec_le64
3735     };
3736     static gen_helper_gvec_3 * const ltu_fn[4] = {
3737         gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3738         gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3739     };
3740     static gen_helper_gvec_3 * const leu_fn[4] = {
3741         gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3742         gen_helper_gvec_leu32, gen_helper_gvec_leu64
3743     };
3744     static gen_helper_gvec_3 * const * const fns[16] = {
3745         [TCG_COND_EQ] = eq_fn,
3746         [TCG_COND_NE] = ne_fn,
3747         [TCG_COND_LT] = lt_fn,
3748         [TCG_COND_LE] = le_fn,
3749         [TCG_COND_LTU] = ltu_fn,
3750         [TCG_COND_LEU] = leu_fn,
3751     };
3752 
3753     const TCGOpcode *hold_list;
3754     TCGType type;
3755     uint32_t some;
3756 
3757     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3758     check_overlap_3(dofs, aofs, bofs, maxsz);
3759 
3760     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3761         do_dup(MO_8, dofs, oprsz, maxsz,
3762                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3763         return;
3764     }
3765 
3766     /*
3767      * Implement inline with a vector type, if possible.
3768      * Prefer integer when 64-bit host and 64-bit comparison.
3769      */
3770     hold_list = tcg_swap_vecop_list(cmp_list);
3771     type = choose_vector_type(cmp_list, vece, oprsz,
3772                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3773     switch (type) {
3774     case TCG_TYPE_V256:
3775         /* Recall that ARM SVE allows vector sizes that are not a
3776          * power of 2, but always a multiple of 16.  The intent is
3777          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3778          */
3779         some = QEMU_ALIGN_DOWN(oprsz, 32);
3780         expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3781         if (some == oprsz) {
3782             break;
3783         }
3784         dofs += some;
3785         aofs += some;
3786         bofs += some;
3787         oprsz -= some;
3788         maxsz -= some;
3789         /* fallthru */
3790     case TCG_TYPE_V128:
3791         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3792         break;
3793     case TCG_TYPE_V64:
3794         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3795         break;
3796 
3797     case 0:
3798         if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3799             expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3800         } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3801             expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3802         } else {
3803             gen_helper_gvec_3 * const *fn = fns[cond];
3804 
3805             if (fn == NULL) {
3806                 uint32_t tmp;
3807                 tmp = aofs, aofs = bofs, bofs = tmp;
3808                 cond = tcg_swap_cond(cond);
3809                 fn = fns[cond];
3810                 assert(fn != NULL);
3811             }
3812             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3813             oprsz = maxsz;
3814         }
3815         break;
3816 
3817     default:
3818         g_assert_not_reached();
3819     }
3820     tcg_swap_vecop_list(hold_list);
3821 
3822     if (oprsz < maxsz) {
3823         expand_clr(dofs + oprsz, maxsz - oprsz);
3824     }
3825 }
3826 
3827 static void expand_cmps_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3828                             uint32_t oprsz, uint32_t tysz, TCGType type,
3829                             TCGCond cond, TCGv_vec c)
3830 {
3831     TCGv_vec t0 = tcg_temp_new_vec(type);
3832     TCGv_vec t1 = tcg_temp_new_vec(type);
3833     uint32_t i;
3834 
3835     for (i = 0; i < oprsz; i += tysz) {
3836         tcg_gen_ld_vec(t1, tcg_env, aofs + i);
3837         tcg_gen_cmp_vec(cond, vece, t0, t1, c);
3838         tcg_gen_st_vec(t0, tcg_env, dofs + i);
3839     }
3840 }
3841 
3842 void tcg_gen_gvec_cmps(TCGCond cond, unsigned vece, uint32_t dofs,
3843                        uint32_t aofs, TCGv_i64 c,
3844                        uint32_t oprsz, uint32_t maxsz)
3845 {
3846     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3847     static gen_helper_gvec_2i * const eq_fn[4] = {
3848         gen_helper_gvec_eqs8, gen_helper_gvec_eqs16,
3849         gen_helper_gvec_eqs32, gen_helper_gvec_eqs64
3850     };
3851     static gen_helper_gvec_2i * const lt_fn[4] = {
3852         gen_helper_gvec_lts8, gen_helper_gvec_lts16,
3853         gen_helper_gvec_lts32, gen_helper_gvec_lts64
3854     };
3855     static gen_helper_gvec_2i * const le_fn[4] = {
3856         gen_helper_gvec_les8, gen_helper_gvec_les16,
3857         gen_helper_gvec_les32, gen_helper_gvec_les64
3858     };
3859     static gen_helper_gvec_2i * const ltu_fn[4] = {
3860         gen_helper_gvec_ltus8, gen_helper_gvec_ltus16,
3861         gen_helper_gvec_ltus32, gen_helper_gvec_ltus64
3862     };
3863     static gen_helper_gvec_2i * const leu_fn[4] = {
3864         gen_helper_gvec_leus8, gen_helper_gvec_leus16,
3865         gen_helper_gvec_leus32, gen_helper_gvec_leus64
3866     };
3867     static gen_helper_gvec_2i * const * const fns[16] = {
3868         [TCG_COND_EQ] = eq_fn,
3869         [TCG_COND_LT] = lt_fn,
3870         [TCG_COND_LE] = le_fn,
3871         [TCG_COND_LTU] = ltu_fn,
3872         [TCG_COND_LEU] = leu_fn,
3873     };
3874 
3875     TCGType type;
3876 
3877     check_size_align(oprsz, maxsz, dofs | aofs);
3878     check_overlap_2(dofs, aofs, maxsz);
3879 
3880     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3881         do_dup(MO_8, dofs, oprsz, maxsz,
3882                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3883         return;
3884     }
3885 
3886     /*
3887      * Implement inline with a vector type, if possible.
3888      * Prefer integer when 64-bit host and 64-bit comparison.
3889      */
3890     type = choose_vector_type(cmp_list, vece, oprsz,
3891                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3892     if (type != 0) {
3893         const TCGOpcode *hold_list = tcg_swap_vecop_list(cmp_list);
3894         TCGv_vec t_vec = tcg_temp_new_vec(type);
3895         uint32_t some;
3896 
3897         tcg_gen_dup_i64_vec(vece, t_vec, c);
3898         switch (type) {
3899         case TCG_TYPE_V256:
3900             some = QEMU_ALIGN_DOWN(oprsz, 32);
3901             expand_cmps_vec(vece, dofs, aofs, some, 32,
3902                             TCG_TYPE_V256, cond, t_vec);
3903             aofs += some;
3904             dofs += some;
3905             oprsz -= some;
3906             maxsz -= some;
3907             /* fallthru */
3908 
3909         case TCG_TYPE_V128:
3910             some = QEMU_ALIGN_DOWN(oprsz, 16);
3911             expand_cmps_vec(vece, dofs, aofs, some, 16,
3912                             TCG_TYPE_V128, cond, t_vec);
3913             break;
3914 
3915         case TCG_TYPE_V64:
3916             some = QEMU_ALIGN_DOWN(oprsz, 8);
3917             expand_cmps_vec(vece, dofs, aofs, some, 8,
3918                             TCG_TYPE_V64, cond, t_vec);
3919             break;
3920 
3921         default:
3922             g_assert_not_reached();
3923         }
3924         tcg_temp_free_vec(t_vec);
3925         tcg_swap_vecop_list(hold_list);
3926     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3927         TCGv_i64 t0 = tcg_temp_ebb_new_i64();
3928         uint32_t i;
3929 
3930         for (i = 0; i < oprsz; i += 8) {
3931             tcg_gen_ld_i64(t0, tcg_env, aofs + i);
3932             tcg_gen_negsetcond_i64(cond, t0, t0, c);
3933             tcg_gen_st_i64(t0, tcg_env, dofs + i);
3934         }
3935         tcg_temp_free_i64(t0);
3936     } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3937         TCGv_i32 t0 = tcg_temp_ebb_new_i32();
3938         TCGv_i32 t1 = tcg_temp_ebb_new_i32();
3939         uint32_t i;
3940 
3941         tcg_gen_extrl_i64_i32(t1, c);
3942         for (i = 0; i < oprsz; i += 8) {
3943             tcg_gen_ld_i32(t0, tcg_env, aofs + i);
3944             tcg_gen_negsetcond_i32(cond, t0, t0, t1);
3945             tcg_gen_st_i32(t0, tcg_env, dofs + i);
3946         }
3947         tcg_temp_free_i32(t0);
3948         tcg_temp_free_i32(t1);
3949     } else {
3950         gen_helper_gvec_2i * const *fn = fns[cond];
3951         bool inv = false;
3952 
3953         if (fn == NULL) {
3954             cond = tcg_invert_cond(cond);
3955             fn = fns[cond];
3956             assert(fn != NULL);
3957             inv = true;
3958         }
3959         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, inv, fn[vece]);
3960         return;
3961     }
3962 
3963     if (oprsz < maxsz) {
3964         expand_clr(dofs + oprsz, maxsz - oprsz);
3965     }
3966 }
3967 
3968 void tcg_gen_gvec_cmpi(TCGCond cond, unsigned vece, uint32_t dofs,
3969                        uint32_t aofs, int64_t c,
3970                        uint32_t oprsz, uint32_t maxsz)
3971 {
3972     TCGv_i64 tmp = tcg_constant_i64(c);
3973     tcg_gen_gvec_cmps(cond, vece, dofs, aofs, tmp, oprsz, maxsz);
3974 }
3975 
3976 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3977 {
3978     TCGv_i64 t = tcg_temp_ebb_new_i64();
3979 
3980     tcg_gen_and_i64(t, b, a);
3981     tcg_gen_andc_i64(d, c, a);
3982     tcg_gen_or_i64(d, d, t);
3983     tcg_temp_free_i64(t);
3984 }
3985 
3986 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
3987                          uint32_t bofs, uint32_t cofs,
3988                          uint32_t oprsz, uint32_t maxsz)
3989 {
3990     static const GVecGen4 g = {
3991         .fni8 = tcg_gen_bitsel_i64,
3992         .fniv = tcg_gen_bitsel_vec,
3993         .fno = gen_helper_gvec_bitsel,
3994     };
3995 
3996     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
3997 }
3998