xref: /openbmc/qemu/tcg/tcg-op-gvec.c (revision b4b9a0e32f93c0700f46617524317b0580126592)
1 /*
2  * Generic vector operation expansion
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "tcg/tcg.h"
22 #include "tcg/tcg-op.h"
23 #include "tcg/tcg-op-gvec.h"
24 #include "qemu/main-loop.h"
25 #include "tcg/tcg-gvec-desc.h"
26 
/*
 * Upper bound on the number of operations check_size_impl() will accept
 * for an inline expansion.
 */
#define MAX_UNROLL  4

/*
 * Empty vector-opcode list: a one-element array holding 0 when
 * CONFIG_DEBUG_TCG is defined, plain NULL otherwise.
 */
#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif
34 
35 
36 /* Verify vector size and alignment rules.  OFS should be the OR of all
37    of the operand offsets so that we can check them all at once.  */
38 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
39 {
40     uint32_t max_align;
41 
42     switch (oprsz) {
43     case 8:
44     case 16:
45     case 32:
46         tcg_debug_assert(oprsz <= maxsz);
47         break;
48     default:
49         tcg_debug_assert(oprsz == maxsz);
50         break;
51     }
52     tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
53 
54     max_align = maxsz >= 16 ? 15 : 7;
55     tcg_debug_assert((maxsz & max_align) == 0);
56     tcg_debug_assert((ofs & max_align) == 0);
57 }
58 
/*
 * Two operands of S bytes at offsets D and A must either coincide
 * exactly or not overlap at all.
 */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    const bool identical = (d == a);
    const bool d_ends_before_a = (d + s <= a);
    const bool a_ends_before_d = (a + s <= d);

    tcg_debug_assert(identical || d_ends_before_a || a_ends_before_d);
}
64 
/*
 * Apply the two-operand overlap rule to every pair drawn from the
 * three operands D, A and B, each S bytes long.
 */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    const uint32_t ofs[3] = { d, a, b };
    unsigned i, j;

    /* Visits (d,a), (d,b), (a,b) in that order. */
    for (i = 0; i < 3; i++) {
        for (j = i + 1; j < 3; j++) {
            check_overlap_2(ofs[i], ofs[j], s);
        }
    }
}
72 
/*
 * Apply the two-operand overlap rule to every pair drawn from the
 * four operands D, A, B and C, each S bytes long.
 */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    const uint32_t ofs[4] = { d, a, b, c };
    unsigned i, j;

    /* Visits (d,a), (d,b), (d,c), (a,b), (a,c), (b,c) in that order. */
    for (i = 0; i < 4; i++) {
        for (j = i + 1; j < 4; j++) {
            check_overlap_2(ofs[i], ofs[j], s);
        }
    }
}
84 
85 /* Create a descriptor from components.  */
86 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
87 {
88     uint32_t desc = 0;
89 
90     check_size_align(oprsz, maxsz, 0);
91     tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS));
92 
93     oprsz = (oprsz / 8) - 1;
94     maxsz = (maxsz / 8) - 1;
95 
96     /*
97      * We have just asserted in check_size_align that either
98      * oprsz is {8,16,32} or matches maxsz.  Encode the final
99      * case with '2', as that would otherwise map to 24.
100      */
101     if (oprsz == maxsz) {
102         oprsz = 2;
103     }
104 
105     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
106     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
107     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
108 
109     return desc;
110 }
111 
112 /* Generate a call to a gvec-style helper with two vector operands.  */
113 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
114                         uint32_t oprsz, uint32_t maxsz, int32_t data,
115                         gen_helper_gvec_2 *fn)
116 {
117     TCGv_ptr a0, a1;
118     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
119 
120     a0 = tcg_temp_new_ptr();
121     a1 = tcg_temp_new_ptr();
122 
123     tcg_gen_addi_ptr(a0, cpu_env, dofs);
124     tcg_gen_addi_ptr(a1, cpu_env, aofs);
125 
126     fn(a0, a1, desc);
127 
128     tcg_temp_free_ptr(a0);
129     tcg_temp_free_ptr(a1);
130 }
131 
132 /* Generate a call to a gvec-style helper with two vector operands
133    and one scalar operand.  */
134 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
135                          uint32_t oprsz, uint32_t maxsz, int32_t data,
136                          gen_helper_gvec_2i *fn)
137 {
138     TCGv_ptr a0, a1;
139     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
140 
141     a0 = tcg_temp_new_ptr();
142     a1 = tcg_temp_new_ptr();
143 
144     tcg_gen_addi_ptr(a0, cpu_env, dofs);
145     tcg_gen_addi_ptr(a1, cpu_env, aofs);
146 
147     fn(a0, a1, c, desc);
148 
149     tcg_temp_free_ptr(a0);
150     tcg_temp_free_ptr(a1);
151 }
152 
153 /* Generate a call to a gvec-style helper with three vector operands.  */
154 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
155                         uint32_t oprsz, uint32_t maxsz, int32_t data,
156                         gen_helper_gvec_3 *fn)
157 {
158     TCGv_ptr a0, a1, a2;
159     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
160 
161     a0 = tcg_temp_new_ptr();
162     a1 = tcg_temp_new_ptr();
163     a2 = tcg_temp_new_ptr();
164 
165     tcg_gen_addi_ptr(a0, cpu_env, dofs);
166     tcg_gen_addi_ptr(a1, cpu_env, aofs);
167     tcg_gen_addi_ptr(a2, cpu_env, bofs);
168 
169     fn(a0, a1, a2, desc);
170 
171     tcg_temp_free_ptr(a0);
172     tcg_temp_free_ptr(a1);
173     tcg_temp_free_ptr(a2);
174 }
175 
176 /* Generate a call to a gvec-style helper with four vector operands.  */
177 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
178                         uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
179                         int32_t data, gen_helper_gvec_4 *fn)
180 {
181     TCGv_ptr a0, a1, a2, a3;
182     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
183 
184     a0 = tcg_temp_new_ptr();
185     a1 = tcg_temp_new_ptr();
186     a2 = tcg_temp_new_ptr();
187     a3 = tcg_temp_new_ptr();
188 
189     tcg_gen_addi_ptr(a0, cpu_env, dofs);
190     tcg_gen_addi_ptr(a1, cpu_env, aofs);
191     tcg_gen_addi_ptr(a2, cpu_env, bofs);
192     tcg_gen_addi_ptr(a3, cpu_env, cofs);
193 
194     fn(a0, a1, a2, a3, desc);
195 
196     tcg_temp_free_ptr(a0);
197     tcg_temp_free_ptr(a1);
198     tcg_temp_free_ptr(a2);
199     tcg_temp_free_ptr(a3);
200 }
201 
202 /* Generate a call to a gvec-style helper with five vector operands.  */
203 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
204                         uint32_t cofs, uint32_t xofs, uint32_t oprsz,
205                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
206 {
207     TCGv_ptr a0, a1, a2, a3, a4;
208     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
209 
210     a0 = tcg_temp_new_ptr();
211     a1 = tcg_temp_new_ptr();
212     a2 = tcg_temp_new_ptr();
213     a3 = tcg_temp_new_ptr();
214     a4 = tcg_temp_new_ptr();
215 
216     tcg_gen_addi_ptr(a0, cpu_env, dofs);
217     tcg_gen_addi_ptr(a1, cpu_env, aofs);
218     tcg_gen_addi_ptr(a2, cpu_env, bofs);
219     tcg_gen_addi_ptr(a3, cpu_env, cofs);
220     tcg_gen_addi_ptr(a4, cpu_env, xofs);
221 
222     fn(a0, a1, a2, a3, a4, desc);
223 
224     tcg_temp_free_ptr(a0);
225     tcg_temp_free_ptr(a1);
226     tcg_temp_free_ptr(a2);
227     tcg_temp_free_ptr(a3);
228     tcg_temp_free_ptr(a4);
229 }
230 
231 /* Generate a call to a gvec-style helper with three vector operands
232    and an extra pointer operand.  */
233 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
234                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
235                         int32_t data, gen_helper_gvec_2_ptr *fn)
236 {
237     TCGv_ptr a0, a1;
238     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
239 
240     a0 = tcg_temp_new_ptr();
241     a1 = tcg_temp_new_ptr();
242 
243     tcg_gen_addi_ptr(a0, cpu_env, dofs);
244     tcg_gen_addi_ptr(a1, cpu_env, aofs);
245 
246     fn(a0, a1, ptr, desc);
247 
248     tcg_temp_free_ptr(a0);
249     tcg_temp_free_ptr(a1);
250 }
251 
252 /* Generate a call to a gvec-style helper with three vector operands
253    and an extra pointer operand.  */
254 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
255                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
256                         int32_t data, gen_helper_gvec_3_ptr *fn)
257 {
258     TCGv_ptr a0, a1, a2;
259     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
260 
261     a0 = tcg_temp_new_ptr();
262     a1 = tcg_temp_new_ptr();
263     a2 = tcg_temp_new_ptr();
264 
265     tcg_gen_addi_ptr(a0, cpu_env, dofs);
266     tcg_gen_addi_ptr(a1, cpu_env, aofs);
267     tcg_gen_addi_ptr(a2, cpu_env, bofs);
268 
269     fn(a0, a1, a2, ptr, desc);
270 
271     tcg_temp_free_ptr(a0);
272     tcg_temp_free_ptr(a1);
273     tcg_temp_free_ptr(a2);
274 }
275 
276 /* Generate a call to a gvec-style helper with four vector operands
277    and an extra pointer operand.  */
278 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
279                         uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
280                         uint32_t maxsz, int32_t data,
281                         gen_helper_gvec_4_ptr *fn)
282 {
283     TCGv_ptr a0, a1, a2, a3;
284     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
285 
286     a0 = tcg_temp_new_ptr();
287     a1 = tcg_temp_new_ptr();
288     a2 = tcg_temp_new_ptr();
289     a3 = tcg_temp_new_ptr();
290 
291     tcg_gen_addi_ptr(a0, cpu_env, dofs);
292     tcg_gen_addi_ptr(a1, cpu_env, aofs);
293     tcg_gen_addi_ptr(a2, cpu_env, bofs);
294     tcg_gen_addi_ptr(a3, cpu_env, cofs);
295 
296     fn(a0, a1, a2, a3, ptr, desc);
297 
298     tcg_temp_free_ptr(a0);
299     tcg_temp_free_ptr(a1);
300     tcg_temp_free_ptr(a2);
301     tcg_temp_free_ptr(a3);
302 }
303 
304 /* Generate a call to a gvec-style helper with five vector operands
305    and an extra pointer operand.  */
306 void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
307                         uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
308                         uint32_t oprsz, uint32_t maxsz, int32_t data,
309                         gen_helper_gvec_5_ptr *fn)
310 {
311     TCGv_ptr a0, a1, a2, a3, a4;
312     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
313 
314     a0 = tcg_temp_new_ptr();
315     a1 = tcg_temp_new_ptr();
316     a2 = tcg_temp_new_ptr();
317     a3 = tcg_temp_new_ptr();
318     a4 = tcg_temp_new_ptr();
319 
320     tcg_gen_addi_ptr(a0, cpu_env, dofs);
321     tcg_gen_addi_ptr(a1, cpu_env, aofs);
322     tcg_gen_addi_ptr(a2, cpu_env, bofs);
323     tcg_gen_addi_ptr(a3, cpu_env, cofs);
324     tcg_gen_addi_ptr(a4, cpu_env, eofs);
325 
326     fn(a0, a1, a2, a3, a4, ptr, desc);
327 
328     tcg_temp_free_ptr(a0);
329     tcg_temp_free_ptr(a1);
330     tcg_temp_free_ptr(a2);
331     tcg_temp_free_ptr(a3);
332     tcg_temp_free_ptr(a4);
333 }
334 
335 /* Return true if we want to implement something of OPRSZ bytes
336    in units of LNSZ.  This limits the expansion of inline code.  */
337 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
338 {
339     uint32_t q, r;
340 
341     if (oprsz < lnsz) {
342         return false;
343     }
344 
345     q = oprsz / lnsz;
346     r = oprsz % lnsz;
347     tcg_debug_assert((r & 7) == 0);
348 
349     if (lnsz < 16) {
350         /* For sizes below 16, accept no remainder. */
351         if (r != 0) {
352             return false;
353         }
354     } else {
355         /*
356          * Recall that ARM SVE allows vector sizes that are not a
357          * power of 2, but always a multiple of 16.  The intent is
358          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
359          * In addition, expand_clr needs to handle a multiple of 8.
360          * Thus we can handle the tail with one more operation per
361          * diminishing power of 2.
362          */
363         q += ctpop32(r);
364     }
365 
366     return q <= MAX_UNROLL;
367 }
368 
/*
 * Forward declaration: expand_clr() is defined after do_dup(), which it
 * calls, but do_dup_store() and do_dup() both need to call it first.
 */
static void expand_clr(uint32_t dofs, uint32_t maxsz);
370 
371 /* Duplicate C as per VECE.  */
372 uint64_t (dup_const)(unsigned vece, uint64_t c)
373 {
374     switch (vece) {
375     case MO_8:
376         return 0x0101010101010101ull * (uint8_t)c;
377     case MO_16:
378         return 0x0001000100010001ull * (uint16_t)c;
379     case MO_32:
380         return 0x0000000100000001ull * (uint32_t)c;
381     case MO_64:
382         return c;
383     default:
384         g_assert_not_reached();
385     }
386 }
387 
388 /* Duplicate IN into OUT as per VECE.  */
389 void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
390 {
391     switch (vece) {
392     case MO_8:
393         tcg_gen_ext8u_i32(out, in);
394         tcg_gen_muli_i32(out, out, 0x01010101);
395         break;
396     case MO_16:
397         tcg_gen_deposit_i32(out, in, in, 16, 16);
398         break;
399     case MO_32:
400         tcg_gen_mov_i32(out, in);
401         break;
402     default:
403         g_assert_not_reached();
404     }
405 }
406 
407 void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
408 {
409     switch (vece) {
410     case MO_8:
411         tcg_gen_ext8u_i64(out, in);
412         tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
413         break;
414     case MO_16:
415         tcg_gen_ext16u_i64(out, in);
416         tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
417         break;
418     case MO_32:
419         tcg_gen_deposit_i64(out, in, in, 32, 32);
420         break;
421     case MO_64:
422         tcg_gen_mov_i64(out, in);
423         break;
424     default:
425         g_assert_not_reached();
426     }
427 }
428 
429 /* Select a supported vector type for implementing an operation on SIZE
430  * bytes.  If OP is 0, assume that the real operation to be performed is
431  * required by all backends.  Otherwise, make sure than OP can be performed
432  * on elements of size VECE in the selected type.  Do not select V64 if
433  * PREFER_I64 is true.  Return 0 if no vector type is selected.
434  */
435 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
436                                   uint32_t size, bool prefer_i64)
437 {
438     /*
439      * Recall that ARM SVE allows vector sizes that are not a
440      * power of 2, but always a multiple of 16.  The intent is
441      * that e.g. size == 80 would be expanded with 2x32 + 1x16.
442      * It is hard to imagine a case in which v256 is supported
443      * but v128 is not, but check anyway.
444      * In addition, expand_clr needs to handle a multiple of 8.
445      */
446     if (TCG_TARGET_HAS_v256 &&
447         check_size_impl(size, 32) &&
448         tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
449         (!(size & 16) ||
450          (TCG_TARGET_HAS_v128 &&
451           tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
452         (!(size & 8) ||
453          (TCG_TARGET_HAS_v64 &&
454           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
455         return TCG_TYPE_V256;
456     }
457     if (TCG_TARGET_HAS_v128 &&
458         check_size_impl(size, 16) &&
459         tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
460         (!(size & 8) ||
461          (TCG_TARGET_HAS_v64 &&
462           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
463         return TCG_TYPE_V128;
464     }
465     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
466         && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
467         return TCG_TYPE_V64;
468     }
469     return 0;
470 }
471 
472 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
473                          uint32_t maxsz, TCGv_vec t_vec)
474 {
475     uint32_t i = 0;
476 
477     tcg_debug_assert(oprsz >= 8);
478 
479     /*
480      * This may be expand_clr for the tail of an operation, e.g.
481      * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
482      * are misaligned wrt the maximum vector size, so do that first.
483      */
484     if (dofs & 8) {
485         tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
486         i += 8;
487     }
488 
489     switch (type) {
490     case TCG_TYPE_V256:
491         /*
492          * Recall that ARM SVE allows vector sizes that are not a
493          * power of 2, but always a multiple of 16.  The intent is
494          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
495          */
496         for (; i + 32 <= oprsz; i += 32) {
497             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
498         }
499         /* fallthru */
500     case TCG_TYPE_V128:
501         for (; i + 16 <= oprsz; i += 16) {
502             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
503         }
504         break;
505     case TCG_TYPE_V64:
506         for (; i < oprsz; i += 8) {
507             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
508         }
509         break;
510     default:
511         g_assert_not_reached();
512     }
513 
514     if (oprsz < maxsz) {
515         expand_clr(dofs + oprsz, maxsz - oprsz);
516     }
517 }
518 
/*
 * Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C,
 * with elements of size VECE.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 *
 * Three strategies are tried in order: stores of a host vector type,
 * unrolled host-integer stores, and finally an out-of-line helper.
 * On the two inline paths the bytes from OPRSZ up to MAXSZ are cleared
 * with expand_clr(); the gvec_dup* helpers are given both sizes via the
 * descriptor.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    /* A 32-bit variable input cannot supply a 64-bit element.  */
    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
            vece = MO_8;
        } else if (in_c == dup_const(MO_8, in_c)) {
            /* Every byte of the constant is the same: treat as bytes.  */
            vece = MO_8;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(NULL, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            tcg_gen_dupi_vec(vece, t_vec, in_c);
        }
        /* do_dup_store also clears the tail up to maxsz.  */
        do_dup_store(type, dofs, oprsz, maxsz, t_vec);
        tcg_temp_free_vec(t_vec);
        return;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                tcg_gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                tcg_gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            tcg_gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_constant_i64(in_c);
            } else {
                t_32 = tcg_constant_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);

    /*
     * This may be expand_clr for the tail of an operation, e.g.
     * oprsz == 8 && maxsz == 64.  The size of the clear is misaligned
     * wrt simd_desc and will assert.  Simply pass all replicated byte
     * stores through to memset.
     */
    if (oprsz == maxsz && vece == MO_8) {
        TCGv_ptr t_size = tcg_const_ptr(oprsz);
        TCGv_i32 t_val;

        if (in_32) {
            t_val = in_32;
        } else if (in_64) {
            /* Narrow the 64-bit input into a fresh 32-bit temporary.  */
            t_val = tcg_temp_new_i32();
            tcg_gen_extrl_i64_i32(t_val, in_64);
        } else {
            t_val = tcg_constant_i32(in_c);
        }
        gen_helper_memset(t_ptr, t_ptr, t_val, t_size);

        /* Only the in_64 case allocated t_val above.  */
        if (in_64) {
            tcg_temp_free_i32(t_val);
        }
        tcg_temp_free_ptr(t_size);
        tcg_temp_free_ptr(t_ptr);
        return;
    }

    t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_constant_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
        }
    } else {
        /* Helpers for MO_8, MO_16 and MO_32, indexed by vece.  */
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else if (in_64) {
            t_32 = tcg_temp_new_i32();
            tcg_gen_extrl_i64_i32(t_32, in_64);
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        } else {
            /* Reduce the replicated constant back to a single element.  */
            if (vece == MO_8) {
                in_c &= 0xff;
            } else if (vece == MO_16) {
                in_c &= 0xffff;
            }
            t_32 = tcg_constant_i32(in_c);
            fns[vece](t_ptr, t_desc, t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    return;

 done:
    /* Reached from the inline integer path only: clear the tail.  */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
697 
698 /* Likewise, but with zero.  */
699 static void expand_clr(uint32_t dofs, uint32_t maxsz)
700 {
701     do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
702 }
703 
704 /* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
705 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
706                          bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
707 {
708     TCGv_i32 t0 = tcg_temp_new_i32();
709     TCGv_i32 t1 = tcg_temp_new_i32();
710     uint32_t i;
711 
712     for (i = 0; i < oprsz; i += 4) {
713         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
714         if (load_dest) {
715             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
716         }
717         fni(t1, t0);
718         tcg_gen_st_i32(t1, cpu_env, dofs + i);
719     }
720     tcg_temp_free_i32(t0);
721     tcg_temp_free_i32(t1);
722 }
723 
724 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
725                           int32_t c, bool load_dest,
726                           void (*fni)(TCGv_i32, TCGv_i32, int32_t))
727 {
728     TCGv_i32 t0 = tcg_temp_new_i32();
729     TCGv_i32 t1 = tcg_temp_new_i32();
730     uint32_t i;
731 
732     for (i = 0; i < oprsz; i += 4) {
733         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
734         if (load_dest) {
735             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
736         }
737         fni(t1, t0, c);
738         tcg_gen_st_i32(t1, cpu_env, dofs + i);
739     }
740     tcg_temp_free_i32(t0);
741     tcg_temp_free_i32(t1);
742 }
743 
744 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
745                           TCGv_i32 c, bool scalar_first,
746                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
747 {
748     TCGv_i32 t0 = tcg_temp_new_i32();
749     TCGv_i32 t1 = tcg_temp_new_i32();
750     uint32_t i;
751 
752     for (i = 0; i < oprsz; i += 4) {
753         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
754         if (scalar_first) {
755             fni(t1, c, t0);
756         } else {
757             fni(t1, t0, c);
758         }
759         tcg_gen_st_i32(t1, cpu_env, dofs + i);
760     }
761     tcg_temp_free_i32(t0);
762     tcg_temp_free_i32(t1);
763 }
764 
765 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
766 static void expand_3_i32(uint32_t dofs, uint32_t aofs,
767                          uint32_t bofs, uint32_t oprsz, bool load_dest,
768                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
769 {
770     TCGv_i32 t0 = tcg_temp_new_i32();
771     TCGv_i32 t1 = tcg_temp_new_i32();
772     TCGv_i32 t2 = tcg_temp_new_i32();
773     uint32_t i;
774 
775     for (i = 0; i < oprsz; i += 4) {
776         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
777         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
778         if (load_dest) {
779             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
780         }
781         fni(t2, t0, t1);
782         tcg_gen_st_i32(t2, cpu_env, dofs + i);
783     }
784     tcg_temp_free_i32(t2);
785     tcg_temp_free_i32(t1);
786     tcg_temp_free_i32(t0);
787 }
788 
789 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
790                           uint32_t oprsz, int32_t c, bool load_dest,
791                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
792 {
793     TCGv_i32 t0 = tcg_temp_new_i32();
794     TCGv_i32 t1 = tcg_temp_new_i32();
795     TCGv_i32 t2 = tcg_temp_new_i32();
796     uint32_t i;
797 
798     for (i = 0; i < oprsz; i += 4) {
799         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
800         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
801         if (load_dest) {
802             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
803         }
804         fni(t2, t0, t1, c);
805         tcg_gen_st_i32(t2, cpu_env, dofs + i);
806     }
807     tcg_temp_free_i32(t0);
808     tcg_temp_free_i32(t1);
809     tcg_temp_free_i32(t2);
810 }
811 
812 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
813 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
814                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
815                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
816 {
817     TCGv_i32 t0 = tcg_temp_new_i32();
818     TCGv_i32 t1 = tcg_temp_new_i32();
819     TCGv_i32 t2 = tcg_temp_new_i32();
820     TCGv_i32 t3 = tcg_temp_new_i32();
821     uint32_t i;
822 
823     for (i = 0; i < oprsz; i += 4) {
824         tcg_gen_ld_i32(t1, cpu_env, aofs + i);
825         tcg_gen_ld_i32(t2, cpu_env, bofs + i);
826         tcg_gen_ld_i32(t3, cpu_env, cofs + i);
827         fni(t0, t1, t2, t3);
828         tcg_gen_st_i32(t0, cpu_env, dofs + i);
829         if (write_aofs) {
830             tcg_gen_st_i32(t1, cpu_env, aofs + i);
831         }
832     }
833     tcg_temp_free_i32(t3);
834     tcg_temp_free_i32(t2);
835     tcg_temp_free_i32(t1);
836     tcg_temp_free_i32(t0);
837 }
838 
839 /* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
840 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
841                          bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
842 {
843     TCGv_i64 t0 = tcg_temp_new_i64();
844     TCGv_i64 t1 = tcg_temp_new_i64();
845     uint32_t i;
846 
847     for (i = 0; i < oprsz; i += 8) {
848         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
849         if (load_dest) {
850             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
851         }
852         fni(t1, t0);
853         tcg_gen_st_i64(t1, cpu_env, dofs + i);
854     }
855     tcg_temp_free_i64(t0);
856     tcg_temp_free_i64(t1);
857 }
858 
859 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
860                           int64_t c, bool load_dest,
861                           void (*fni)(TCGv_i64, TCGv_i64, int64_t))
862 {
863     TCGv_i64 t0 = tcg_temp_new_i64();
864     TCGv_i64 t1 = tcg_temp_new_i64();
865     uint32_t i;
866 
867     for (i = 0; i < oprsz; i += 8) {
868         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
869         if (load_dest) {
870             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
871         }
872         fni(t1, t0, c);
873         tcg_gen_st_i64(t1, cpu_env, dofs + i);
874     }
875     tcg_temp_free_i64(t0);
876     tcg_temp_free_i64(t1);
877 }
878 
879 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
880                           TCGv_i64 c, bool scalar_first,
881                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
882 {
883     TCGv_i64 t0 = tcg_temp_new_i64();
884     TCGv_i64 t1 = tcg_temp_new_i64();
885     uint32_t i;
886 
887     for (i = 0; i < oprsz; i += 8) {
888         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
889         if (scalar_first) {
890             fni(t1, c, t0);
891         } else {
892             fni(t1, t0, c);
893         }
894         tcg_gen_st_i64(t1, cpu_env, dofs + i);
895     }
896     tcg_temp_free_i64(t0);
897     tcg_temp_free_i64(t1);
898 }
899 
900 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
901 static void expand_3_i64(uint32_t dofs, uint32_t aofs,
902                          uint32_t bofs, uint32_t oprsz, bool load_dest,
903                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
904 {
905     TCGv_i64 t0 = tcg_temp_new_i64();
906     TCGv_i64 t1 = tcg_temp_new_i64();
907     TCGv_i64 t2 = tcg_temp_new_i64();
908     uint32_t i;
909 
910     for (i = 0; i < oprsz; i += 8) {
911         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
912         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
913         if (load_dest) {
914             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
915         }
916         fni(t2, t0, t1);
917         tcg_gen_st_i64(t2, cpu_env, dofs + i);
918     }
919     tcg_temp_free_i64(t2);
920     tcg_temp_free_i64(t1);
921     tcg_temp_free_i64(t0);
922 }
923 
924 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
925                           uint32_t oprsz, int64_t c, bool load_dest,
926                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
927 {
928     TCGv_i64 t0 = tcg_temp_new_i64();
929     TCGv_i64 t1 = tcg_temp_new_i64();
930     TCGv_i64 t2 = tcg_temp_new_i64();
931     uint32_t i;
932 
933     for (i = 0; i < oprsz; i += 8) {
934         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
935         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
936         if (load_dest) {
937             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
938         }
939         fni(t2, t0, t1, c);
940         tcg_gen_st_i64(t2, cpu_env, dofs + i);
941     }
942     tcg_temp_free_i64(t0);
943     tcg_temp_free_i64(t1);
944     tcg_temp_free_i64(t2);
945 }
946 
947 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
948 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
949                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
950                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
951 {
952     TCGv_i64 t0 = tcg_temp_new_i64();
953     TCGv_i64 t1 = tcg_temp_new_i64();
954     TCGv_i64 t2 = tcg_temp_new_i64();
955     TCGv_i64 t3 = tcg_temp_new_i64();
956     uint32_t i;
957 
958     for (i = 0; i < oprsz; i += 8) {
959         tcg_gen_ld_i64(t1, cpu_env, aofs + i);
960         tcg_gen_ld_i64(t2, cpu_env, bofs + i);
961         tcg_gen_ld_i64(t3, cpu_env, cofs + i);
962         fni(t0, t1, t2, t3);
963         tcg_gen_st_i64(t0, cpu_env, dofs + i);
964         if (write_aofs) {
965             tcg_gen_st_i64(t1, cpu_env, aofs + i);
966         }
967     }
968     tcg_temp_free_i64(t3);
969     tcg_temp_free_i64(t2);
970     tcg_temp_free_i64(t1);
971     tcg_temp_free_i64(t0);
972 }
973 
974 /* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
975 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
976                          uint32_t oprsz, uint32_t tysz, TCGType type,
977                          bool load_dest,
978                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
979 {
980     TCGv_vec t0 = tcg_temp_new_vec(type);
981     TCGv_vec t1 = tcg_temp_new_vec(type);
982     uint32_t i;
983 
984     for (i = 0; i < oprsz; i += tysz) {
985         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
986         if (load_dest) {
987             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
988         }
989         fni(vece, t1, t0);
990         tcg_gen_st_vec(t1, cpu_env, dofs + i);
991     }
992     tcg_temp_free_vec(t0);
993     tcg_temp_free_vec(t1);
994 }
995 
996 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
997    using host vectors.  */
998 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
999                           uint32_t oprsz, uint32_t tysz, TCGType type,
1000                           int64_t c, bool load_dest,
1001                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
1002 {
1003     TCGv_vec t0 = tcg_temp_new_vec(type);
1004     TCGv_vec t1 = tcg_temp_new_vec(type);
1005     uint32_t i;
1006 
1007     for (i = 0; i < oprsz; i += tysz) {
1008         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1009         if (load_dest) {
1010             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
1011         }
1012         fni(vece, t1, t0, c);
1013         tcg_gen_st_vec(t1, cpu_env, dofs + i);
1014     }
1015     tcg_temp_free_vec(t0);
1016     tcg_temp_free_vec(t1);
1017 }
1018 
1019 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1020                           uint32_t oprsz, uint32_t tysz, TCGType type,
1021                           TCGv_vec c, bool scalar_first,
1022                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1023 {
1024     TCGv_vec t0 = tcg_temp_new_vec(type);
1025     TCGv_vec t1 = tcg_temp_new_vec(type);
1026     uint32_t i;
1027 
1028     for (i = 0; i < oprsz; i += tysz) {
1029         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1030         if (scalar_first) {
1031             fni(vece, t1, c, t0);
1032         } else {
1033             fni(vece, t1, t0, c);
1034         }
1035         tcg_gen_st_vec(t1, cpu_env, dofs + i);
1036     }
1037     tcg_temp_free_vec(t0);
1038     tcg_temp_free_vec(t1);
1039 }
1040 
1041 /* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
1042 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1043                          uint32_t bofs, uint32_t oprsz,
1044                          uint32_t tysz, TCGType type, bool load_dest,
1045                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1046 {
1047     TCGv_vec t0 = tcg_temp_new_vec(type);
1048     TCGv_vec t1 = tcg_temp_new_vec(type);
1049     TCGv_vec t2 = tcg_temp_new_vec(type);
1050     uint32_t i;
1051 
1052     for (i = 0; i < oprsz; i += tysz) {
1053         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1054         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1055         if (load_dest) {
1056             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1057         }
1058         fni(vece, t2, t0, t1);
1059         tcg_gen_st_vec(t2, cpu_env, dofs + i);
1060     }
1061     tcg_temp_free_vec(t2);
1062     tcg_temp_free_vec(t1);
1063     tcg_temp_free_vec(t0);
1064 }
1065 
1066 /*
1067  * Expand OPSZ bytes worth of three-vector operands and an immediate operand
1068  * using host vectors.
1069  */
1070 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1071                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
1072                           TCGType type, int64_t c, bool load_dest,
1073                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
1074                                       int64_t))
1075 {
1076     TCGv_vec t0 = tcg_temp_new_vec(type);
1077     TCGv_vec t1 = tcg_temp_new_vec(type);
1078     TCGv_vec t2 = tcg_temp_new_vec(type);
1079     uint32_t i;
1080 
1081     for (i = 0; i < oprsz; i += tysz) {
1082         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1083         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1084         if (load_dest) {
1085             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1086         }
1087         fni(vece, t2, t0, t1, c);
1088         tcg_gen_st_vec(t2, cpu_env, dofs + i);
1089     }
1090     tcg_temp_free_vec(t0);
1091     tcg_temp_free_vec(t1);
1092     tcg_temp_free_vec(t2);
1093 }
1094 
1095 /* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
1096 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1097                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1098                          uint32_t tysz, TCGType type, bool write_aofs,
1099                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1100                                      TCGv_vec, TCGv_vec))
1101 {
1102     TCGv_vec t0 = tcg_temp_new_vec(type);
1103     TCGv_vec t1 = tcg_temp_new_vec(type);
1104     TCGv_vec t2 = tcg_temp_new_vec(type);
1105     TCGv_vec t3 = tcg_temp_new_vec(type);
1106     uint32_t i;
1107 
1108     for (i = 0; i < oprsz; i += tysz) {
1109         tcg_gen_ld_vec(t1, cpu_env, aofs + i);
1110         tcg_gen_ld_vec(t2, cpu_env, bofs + i);
1111         tcg_gen_ld_vec(t3, cpu_env, cofs + i);
1112         fni(vece, t0, t1, t2, t3);
1113         tcg_gen_st_vec(t0, cpu_env, dofs + i);
1114         if (write_aofs) {
1115             tcg_gen_st_vec(t1, cpu_env, aofs + i);
1116         }
1117     }
1118     tcg_temp_free_vec(t3);
1119     tcg_temp_free_vec(t2);
1120     tcg_temp_free_vec(t1);
1121     tcg_temp_free_vec(t0);
1122 }
1123 
1124 /* Expand a vector two-operand operation.  */
1125 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
1126                     uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
1127 {
1128     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1129     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1130     TCGType type;
1131     uint32_t some;
1132 
1133     check_size_align(oprsz, maxsz, dofs | aofs);
1134     check_overlap_2(dofs, aofs, maxsz);
1135 
1136     type = 0;
1137     if (g->fniv) {
1138         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1139     }
1140     switch (type) {
1141     case TCG_TYPE_V256:
1142         /* Recall that ARM SVE allows vector sizes that are not a
1143          * power of 2, but always a multiple of 16.  The intent is
1144          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1145          */
1146         some = QEMU_ALIGN_DOWN(oprsz, 32);
1147         expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1148                      g->load_dest, g->fniv);
1149         if (some == oprsz) {
1150             break;
1151         }
1152         dofs += some;
1153         aofs += some;
1154         oprsz -= some;
1155         maxsz -= some;
1156         /* fallthru */
1157     case TCG_TYPE_V128:
1158         expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1159                      g->load_dest, g->fniv);
1160         break;
1161     case TCG_TYPE_V64:
1162         expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1163                      g->load_dest, g->fniv);
1164         break;
1165 
1166     case 0:
1167         if (g->fni8 && check_size_impl(oprsz, 8)) {
1168             expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
1169         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1170             expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
1171         } else {
1172             assert(g->fno != NULL);
1173             tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
1174             oprsz = maxsz;
1175         }
1176         break;
1177 
1178     default:
1179         g_assert_not_reached();
1180     }
1181     tcg_swap_vecop_list(hold_list);
1182 
1183     if (oprsz < maxsz) {
1184         expand_clr(dofs + oprsz, maxsz - oprsz);
1185     }
1186 }
1187 
1188 /* Expand a vector operation with two vectors and an immediate.  */
1189 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1190                      uint32_t maxsz, int64_t c, const GVecGen2i *g)
1191 {
1192     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1193     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1194     TCGType type;
1195     uint32_t some;
1196 
1197     check_size_align(oprsz, maxsz, dofs | aofs);
1198     check_overlap_2(dofs, aofs, maxsz);
1199 
1200     type = 0;
1201     if (g->fniv) {
1202         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1203     }
1204     switch (type) {
1205     case TCG_TYPE_V256:
1206         /* Recall that ARM SVE allows vector sizes that are not a
1207          * power of 2, but always a multiple of 16.  The intent is
1208          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1209          */
1210         some = QEMU_ALIGN_DOWN(oprsz, 32);
1211         expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1212                       c, g->load_dest, g->fniv);
1213         if (some == oprsz) {
1214             break;
1215         }
1216         dofs += some;
1217         aofs += some;
1218         oprsz -= some;
1219         maxsz -= some;
1220         /* fallthru */
1221     case TCG_TYPE_V128:
1222         expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1223                       c, g->load_dest, g->fniv);
1224         break;
1225     case TCG_TYPE_V64:
1226         expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1227                       c, g->load_dest, g->fniv);
1228         break;
1229 
1230     case 0:
1231         if (g->fni8 && check_size_impl(oprsz, 8)) {
1232             expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1233         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1234             expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1235         } else {
1236             if (g->fno) {
1237                 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1238             } else {
1239                 TCGv_i64 tcg_c = tcg_constant_i64(c);
1240                 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1241                                     maxsz, c, g->fnoi);
1242             }
1243             oprsz = maxsz;
1244         }
1245         break;
1246 
1247     default:
1248         g_assert_not_reached();
1249     }
1250     tcg_swap_vecop_list(hold_list);
1251 
1252     if (oprsz < maxsz) {
1253         expand_clr(dofs + oprsz, maxsz - oprsz);
1254     }
1255 }
1256 
1257 /* Expand a vector operation with two vectors and a scalar.  */
1258 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1259                      uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1260 {
1261     TCGType type;
1262 
1263     check_size_align(oprsz, maxsz, dofs | aofs);
1264     check_overlap_2(dofs, aofs, maxsz);
1265 
1266     type = 0;
1267     if (g->fniv) {
1268         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1269     }
1270     if (type != 0) {
1271         const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1272         const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1273         TCGv_vec t_vec = tcg_temp_new_vec(type);
1274         uint32_t some;
1275 
1276         tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1277 
1278         switch (type) {
1279         case TCG_TYPE_V256:
1280             /* Recall that ARM SVE allows vector sizes that are not a
1281              * power of 2, but always a multiple of 16.  The intent is
1282              * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1283              */
1284             some = QEMU_ALIGN_DOWN(oprsz, 32);
1285             expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1286                           t_vec, g->scalar_first, g->fniv);
1287             if (some == oprsz) {
1288                 break;
1289             }
1290             dofs += some;
1291             aofs += some;
1292             oprsz -= some;
1293             maxsz -= some;
1294             /* fallthru */
1295 
1296         case TCG_TYPE_V128:
1297             expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1298                           t_vec, g->scalar_first, g->fniv);
1299             break;
1300 
1301         case TCG_TYPE_V64:
1302             expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1303                           t_vec, g->scalar_first, g->fniv);
1304             break;
1305 
1306         default:
1307             g_assert_not_reached();
1308         }
1309         tcg_temp_free_vec(t_vec);
1310         tcg_swap_vecop_list(hold_list);
1311     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1312         TCGv_i64 t64 = tcg_temp_new_i64();
1313 
1314         tcg_gen_dup_i64(g->vece, t64, c);
1315         expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1316         tcg_temp_free_i64(t64);
1317     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1318         TCGv_i32 t32 = tcg_temp_new_i32();
1319 
1320         tcg_gen_extrl_i64_i32(t32, c);
1321         tcg_gen_dup_i32(g->vece, t32, t32);
1322         expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1323         tcg_temp_free_i32(t32);
1324     } else {
1325         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1326         return;
1327     }
1328 
1329     if (oprsz < maxsz) {
1330         expand_clr(dofs + oprsz, maxsz - oprsz);
1331     }
1332 }
1333 
1334 /* Expand a vector three-operand operation.  */
1335 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1336                     uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1337 {
1338     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1339     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1340     TCGType type;
1341     uint32_t some;
1342 
1343     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1344     check_overlap_3(dofs, aofs, bofs, maxsz);
1345 
1346     type = 0;
1347     if (g->fniv) {
1348         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1349     }
1350     switch (type) {
1351     case TCG_TYPE_V256:
1352         /* Recall that ARM SVE allows vector sizes that are not a
1353          * power of 2, but always a multiple of 16.  The intent is
1354          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1355          */
1356         some = QEMU_ALIGN_DOWN(oprsz, 32);
1357         expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1358                      g->load_dest, g->fniv);
1359         if (some == oprsz) {
1360             break;
1361         }
1362         dofs += some;
1363         aofs += some;
1364         bofs += some;
1365         oprsz -= some;
1366         maxsz -= some;
1367         /* fallthru */
1368     case TCG_TYPE_V128:
1369         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1370                      g->load_dest, g->fniv);
1371         break;
1372     case TCG_TYPE_V64:
1373         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1374                      g->load_dest, g->fniv);
1375         break;
1376 
1377     case 0:
1378         if (g->fni8 && check_size_impl(oprsz, 8)) {
1379             expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1380         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1381             expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1382         } else {
1383             assert(g->fno != NULL);
1384             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
1385                                maxsz, g->data, g->fno);
1386             oprsz = maxsz;
1387         }
1388         break;
1389 
1390     default:
1391         g_assert_not_reached();
1392     }
1393     tcg_swap_vecop_list(hold_list);
1394 
1395     if (oprsz < maxsz) {
1396         expand_clr(dofs + oprsz, maxsz - oprsz);
1397     }
1398 }
1399 
1400 /* Expand a vector operation with three vectors and an immediate.  */
1401 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1402                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1403                      const GVecGen3i *g)
1404 {
1405     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1406     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1407     TCGType type;
1408     uint32_t some;
1409 
1410     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1411     check_overlap_3(dofs, aofs, bofs, maxsz);
1412 
1413     type = 0;
1414     if (g->fniv) {
1415         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1416     }
1417     switch (type) {
1418     case TCG_TYPE_V256:
1419         /*
1420          * Recall that ARM SVE allows vector sizes that are not a
1421          * power of 2, but always a multiple of 16.  The intent is
1422          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1423          */
1424         some = QEMU_ALIGN_DOWN(oprsz, 32);
1425         expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1426                       c, g->load_dest, g->fniv);
1427         if (some == oprsz) {
1428             break;
1429         }
1430         dofs += some;
1431         aofs += some;
1432         bofs += some;
1433         oprsz -= some;
1434         maxsz -= some;
1435         /* fallthru */
1436     case TCG_TYPE_V128:
1437         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1438                       c, g->load_dest, g->fniv);
1439         break;
1440     case TCG_TYPE_V64:
1441         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1442                       c, g->load_dest, g->fniv);
1443         break;
1444 
1445     case 0:
1446         if (g->fni8 && check_size_impl(oprsz, 8)) {
1447             expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
1448         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1449             expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
1450         } else {
1451             assert(g->fno != NULL);
1452             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
1453             oprsz = maxsz;
1454         }
1455         break;
1456 
1457     default:
1458         g_assert_not_reached();
1459     }
1460     tcg_swap_vecop_list(hold_list);
1461 
1462     if (oprsz < maxsz) {
1463         expand_clr(dofs + oprsz, maxsz - oprsz);
1464     }
1465 }
1466 
1467 /* Expand a vector four-operand operation.  */
1468 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1469                     uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1470 {
1471     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1472     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1473     TCGType type;
1474     uint32_t some;
1475 
1476     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1477     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1478 
1479     type = 0;
1480     if (g->fniv) {
1481         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1482     }
1483     switch (type) {
1484     case TCG_TYPE_V256:
1485         /* Recall that ARM SVE allows vector sizes that are not a
1486          * power of 2, but always a multiple of 16.  The intent is
1487          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1488          */
1489         some = QEMU_ALIGN_DOWN(oprsz, 32);
1490         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1491                      32, TCG_TYPE_V256, g->write_aofs, g->fniv);
1492         if (some == oprsz) {
1493             break;
1494         }
1495         dofs += some;
1496         aofs += some;
1497         bofs += some;
1498         cofs += some;
1499         oprsz -= some;
1500         maxsz -= some;
1501         /* fallthru */
1502     case TCG_TYPE_V128:
1503         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1504                      16, TCG_TYPE_V128, g->write_aofs, g->fniv);
1505         break;
1506     case TCG_TYPE_V64:
1507         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1508                      8, TCG_TYPE_V64, g->write_aofs, g->fniv);
1509         break;
1510 
1511     case 0:
1512         if (g->fni8 && check_size_impl(oprsz, 8)) {
1513             expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
1514                          g->write_aofs, g->fni8);
1515         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1516             expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
1517                          g->write_aofs, g->fni4);
1518         } else {
1519             assert(g->fno != NULL);
1520             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1521                                oprsz, maxsz, g->data, g->fno);
1522             oprsz = maxsz;
1523         }
1524         break;
1525 
1526     default:
1527         g_assert_not_reached();
1528     }
1529     tcg_swap_vecop_list(hold_list);
1530 
1531     if (oprsz < maxsz) {
1532         expand_clr(dofs + oprsz, maxsz - oprsz);
1533     }
1534 }
1535 
1536 /*
1537  * Expand specific vector operations.
1538  */
1539 
1540 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
1541 {
1542     tcg_gen_mov_vec(a, b);
1543 }
1544 
1545 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1546                       uint32_t oprsz, uint32_t maxsz)
1547 {
1548     static const GVecGen2 g = {
1549         .fni8 = tcg_gen_mov_i64,
1550         .fniv = vec_mov2,
1551         .fno = gen_helper_gvec_mov,
1552         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1553     };
1554     if (dofs != aofs) {
1555         tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1556     } else {
1557         check_size_align(oprsz, maxsz, dofs);
1558         if (oprsz < maxsz) {
1559             expand_clr(dofs + oprsz, maxsz - oprsz);
1560         }
1561     }
1562 }
1563 
1564 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
1565                           uint32_t maxsz, TCGv_i32 in)
1566 {
1567     check_size_align(oprsz, maxsz, dofs);
1568     tcg_debug_assert(vece <= MO_32);
1569     do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1570 }
1571 
1572 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
1573                           uint32_t maxsz, TCGv_i64 in)
1574 {
1575     check_size_align(oprsz, maxsz, dofs);
1576     tcg_debug_assert(vece <= MO_64);
1577     do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1578 }
1579 
1580 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1581                           uint32_t oprsz, uint32_t maxsz)
1582 {
1583     check_size_align(oprsz, maxsz, dofs);
1584     if (vece <= MO_64) {
1585         TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
1586         if (type != 0) {
1587             TCGv_vec t_vec = tcg_temp_new_vec(type);
1588             tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
1589             do_dup_store(type, dofs, oprsz, maxsz, t_vec);
1590             tcg_temp_free_vec(t_vec);
1591         } else if (vece <= MO_32) {
1592             TCGv_i32 in = tcg_temp_new_i32();
1593             switch (vece) {
1594             case MO_8:
1595                 tcg_gen_ld8u_i32(in, cpu_env, aofs);
1596                 break;
1597             case MO_16:
1598                 tcg_gen_ld16u_i32(in, cpu_env, aofs);
1599                 break;
1600             default:
1601                 tcg_gen_ld_i32(in, cpu_env, aofs);
1602                 break;
1603             }
1604             do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1605             tcg_temp_free_i32(in);
1606         } else {
1607             TCGv_i64 in = tcg_temp_new_i64();
1608             tcg_gen_ld_i64(in, cpu_env, aofs);
1609             do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1610             tcg_temp_free_i64(in);
1611         }
1612     } else if (vece == 4) {
1613         /* 128-bit duplicate.  */
1614         int i;
1615 
1616         tcg_debug_assert(oprsz >= 16);
1617         if (TCG_TARGET_HAS_v128) {
1618             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1619 
1620             tcg_gen_ld_vec(in, cpu_env, aofs);
1621             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1622                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1623             }
1624             tcg_temp_free_vec(in);
1625         } else {
1626             TCGv_i64 in0 = tcg_temp_new_i64();
1627             TCGv_i64 in1 = tcg_temp_new_i64();
1628 
1629             tcg_gen_ld_i64(in0, cpu_env, aofs);
1630             tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
1631             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1632                 tcg_gen_st_i64(in0, cpu_env, dofs + i);
1633                 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
1634             }
1635             tcg_temp_free_i64(in0);
1636             tcg_temp_free_i64(in1);
1637         }
1638         if (oprsz < maxsz) {
1639             expand_clr(dofs + oprsz, maxsz - oprsz);
1640         }
1641     } else if (vece == 5) {
1642         /* 256-bit duplicate.  */
1643         int i;
1644 
1645         tcg_debug_assert(oprsz >= 32);
1646         tcg_debug_assert(oprsz % 32 == 0);
1647         if (TCG_TARGET_HAS_v256) {
1648             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
1649 
1650             tcg_gen_ld_vec(in, cpu_env, aofs);
1651             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1652                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1653             }
1654             tcg_temp_free_vec(in);
1655         } else if (TCG_TARGET_HAS_v128) {
1656             TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
1657             TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
1658 
1659             tcg_gen_ld_vec(in0, cpu_env, aofs);
1660             tcg_gen_ld_vec(in1, cpu_env, aofs + 16);
1661             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1662                 tcg_gen_st_vec(in0, cpu_env, dofs + i);
1663                 tcg_gen_st_vec(in1, cpu_env, dofs + i + 16);
1664             }
1665             tcg_temp_free_vec(in0);
1666             tcg_temp_free_vec(in1);
1667         } else {
1668             TCGv_i64 in[4];
1669             int j;
1670 
1671             for (j = 0; j < 4; ++j) {
1672                 in[j] = tcg_temp_new_i64();
1673                 tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8);
1674             }
1675             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1676                 for (j = 0; j < 4; ++j) {
1677                     tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8);
1678                 }
1679             }
1680             for (j = 0; j < 4; ++j) {
1681                 tcg_temp_free_i64(in[j]);
1682             }
1683         }
1684         if (oprsz < maxsz) {
1685             expand_clr(dofs + oprsz, maxsz - oprsz);
1686         }
1687     } else {
1688         g_assert_not_reached();
1689     }
1690 }
1691 
/*
 * Duplicate the constant X across the vector at DOFS via do_dup(),
 * using elements of size VECE.
 */
void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);

    /* Neither a 32-bit nor a 64-bit input is supplied, only X.  */
    do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
}
1698 
1699 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1700                       uint32_t oprsz, uint32_t maxsz)
1701 {
1702     static const GVecGen2 g = {
1703         .fni8 = tcg_gen_not_i64,
1704         .fniv = tcg_gen_not_vec,
1705         .fno = gen_helper_gvec_not,
1706         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1707     };
1708     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1709 }
1710 
1711 /* Perform a vector addition using normal addition and a mask.  The mask
1712    should be the sign bit of each lane.  This 6-operation form is more
1713    efficient than separate additions when there are 4 or more lanes in
1714    the 64-bit operation.  */
1715 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1716 {
1717     TCGv_i64 t1 = tcg_temp_new_i64();
1718     TCGv_i64 t2 = tcg_temp_new_i64();
1719     TCGv_i64 t3 = tcg_temp_new_i64();
1720 
1721     tcg_gen_andc_i64(t1, a, m);
1722     tcg_gen_andc_i64(t2, b, m);
1723     tcg_gen_xor_i64(t3, a, b);
1724     tcg_gen_add_i64(d, t1, t2);
1725     tcg_gen_and_i64(t3, t3, m);
1726     tcg_gen_xor_i64(d, d, t3);
1727 
1728     tcg_temp_free_i64(t1);
1729     tcg_temp_free_i64(t2);
1730     tcg_temp_free_i64(t3);
1731 }
1732 
1733 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1734 {
1735     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
1736     gen_addv_mask(d, a, b, m);
1737 }
1738 
1739 void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1740 {
1741     TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
1742     TCGv_i32 t1 = tcg_temp_new_i32();
1743     TCGv_i32 t2 = tcg_temp_new_i32();
1744     TCGv_i32 t3 = tcg_temp_new_i32();
1745 
1746     tcg_gen_andc_i32(t1, a, m);
1747     tcg_gen_andc_i32(t2, b, m);
1748     tcg_gen_xor_i32(t3, a, b);
1749     tcg_gen_add_i32(d, t1, t2);
1750     tcg_gen_and_i32(t3, t3, m);
1751     tcg_gen_xor_i32(d, d, t3);
1752 
1753     tcg_temp_free_i32(t1);
1754     tcg_temp_free_i32(t2);
1755     tcg_temp_free_i32(t3);
1756 }
1757 
1758 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1759 {
1760     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
1761     gen_addv_mask(d, a, b, m);
1762 }
1763 
1764 void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1765 {
1766     TCGv_i32 t1 = tcg_temp_new_i32();
1767     TCGv_i32 t2 = tcg_temp_new_i32();
1768 
1769     tcg_gen_andi_i32(t1, a, ~0xffff);
1770     tcg_gen_add_i32(t2, a, b);
1771     tcg_gen_add_i32(t1, t1, b);
1772     tcg_gen_deposit_i32(d, t1, t2, 0, 16);
1773 
1774     tcg_temp_free_i32(t1);
1775     tcg_temp_free_i32(t2);
1776 }
1777 
1778 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1779 {
1780     TCGv_i64 t1 = tcg_temp_new_i64();
1781     TCGv_i64 t2 = tcg_temp_new_i64();
1782 
1783     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1784     tcg_gen_add_i64(t2, a, b);
1785     tcg_gen_add_i64(t1, t1, b);
1786     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1787 
1788     tcg_temp_free_i64(t1);
1789     tcg_temp_free_i64(t2);
1790 }
1791 
1792 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1793 
1794 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1795                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1796 {
1797     static const GVecGen3 g[4] = {
1798         { .fni8 = tcg_gen_vec_add8_i64,
1799           .fniv = tcg_gen_add_vec,
1800           .fno = gen_helper_gvec_add8,
1801           .opt_opc = vecop_list_add,
1802           .vece = MO_8 },
1803         { .fni8 = tcg_gen_vec_add16_i64,
1804           .fniv = tcg_gen_add_vec,
1805           .fno = gen_helper_gvec_add16,
1806           .opt_opc = vecop_list_add,
1807           .vece = MO_16 },
1808         { .fni4 = tcg_gen_add_i32,
1809           .fniv = tcg_gen_add_vec,
1810           .fno = gen_helper_gvec_add32,
1811           .opt_opc = vecop_list_add,
1812           .vece = MO_32 },
1813         { .fni8 = tcg_gen_add_i64,
1814           .fniv = tcg_gen_add_vec,
1815           .fno = gen_helper_gvec_add64,
1816           .opt_opc = vecop_list_add,
1817           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1818           .vece = MO_64 },
1819     };
1820 
1821     tcg_debug_assert(vece <= MO_64);
1822     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1823 }
1824 
1825 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1826                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1827 {
1828     static const GVecGen2s g[4] = {
1829         { .fni8 = tcg_gen_vec_add8_i64,
1830           .fniv = tcg_gen_add_vec,
1831           .fno = gen_helper_gvec_adds8,
1832           .opt_opc = vecop_list_add,
1833           .vece = MO_8 },
1834         { .fni8 = tcg_gen_vec_add16_i64,
1835           .fniv = tcg_gen_add_vec,
1836           .fno = gen_helper_gvec_adds16,
1837           .opt_opc = vecop_list_add,
1838           .vece = MO_16 },
1839         { .fni4 = tcg_gen_add_i32,
1840           .fniv = tcg_gen_add_vec,
1841           .fno = gen_helper_gvec_adds32,
1842           .opt_opc = vecop_list_add,
1843           .vece = MO_32 },
1844         { .fni8 = tcg_gen_add_i64,
1845           .fniv = tcg_gen_add_vec,
1846           .fno = gen_helper_gvec_adds64,
1847           .opt_opc = vecop_list_add,
1848           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1849           .vece = MO_64 },
1850     };
1851 
1852     tcg_debug_assert(vece <= MO_64);
1853     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1854 }
1855 
1856 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
1857                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1858 {
1859     TCGv_i64 tmp = tcg_constant_i64(c);
1860     tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
1861 }
1862 
1863 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
1864 
1865 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1866                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1867 {
1868     static const GVecGen2s g[4] = {
1869         { .fni8 = tcg_gen_vec_sub8_i64,
1870           .fniv = tcg_gen_sub_vec,
1871           .fno = gen_helper_gvec_subs8,
1872           .opt_opc = vecop_list_sub,
1873           .vece = MO_8 },
1874         { .fni8 = tcg_gen_vec_sub16_i64,
1875           .fniv = tcg_gen_sub_vec,
1876           .fno = gen_helper_gvec_subs16,
1877           .opt_opc = vecop_list_sub,
1878           .vece = MO_16 },
1879         { .fni4 = tcg_gen_sub_i32,
1880           .fniv = tcg_gen_sub_vec,
1881           .fno = gen_helper_gvec_subs32,
1882           .opt_opc = vecop_list_sub,
1883           .vece = MO_32 },
1884         { .fni8 = tcg_gen_sub_i64,
1885           .fniv = tcg_gen_sub_vec,
1886           .fno = gen_helper_gvec_subs64,
1887           .opt_opc = vecop_list_sub,
1888           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1889           .vece = MO_64 },
1890     };
1891 
1892     tcg_debug_assert(vece <= MO_64);
1893     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1894 }
1895 
1896 /* Perform a vector subtraction using normal subtraction and a mask.
1897    Compare gen_addv_mask above.  */
1898 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1899 {
1900     TCGv_i64 t1 = tcg_temp_new_i64();
1901     TCGv_i64 t2 = tcg_temp_new_i64();
1902     TCGv_i64 t3 = tcg_temp_new_i64();
1903 
1904     tcg_gen_or_i64(t1, a, m);
1905     tcg_gen_andc_i64(t2, b, m);
1906     tcg_gen_eqv_i64(t3, a, b);
1907     tcg_gen_sub_i64(d, t1, t2);
1908     tcg_gen_and_i64(t3, t3, m);
1909     tcg_gen_xor_i64(d, d, t3);
1910 
1911     tcg_temp_free_i64(t1);
1912     tcg_temp_free_i64(t2);
1913     tcg_temp_free_i64(t3);
1914 }
1915 
1916 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1917 {
1918     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
1919     gen_subv_mask(d, a, b, m);
1920 }
1921 
1922 void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1923 {
1924     TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
1925     TCGv_i32 t1 = tcg_temp_new_i32();
1926     TCGv_i32 t2 = tcg_temp_new_i32();
1927     TCGv_i32 t3 = tcg_temp_new_i32();
1928 
1929     tcg_gen_or_i32(t1, a, m);
1930     tcg_gen_andc_i32(t2, b, m);
1931     tcg_gen_eqv_i32(t3, a, b);
1932     tcg_gen_sub_i32(d, t1, t2);
1933     tcg_gen_and_i32(t3, t3, m);
1934     tcg_gen_xor_i32(d, d, t3);
1935 
1936     tcg_temp_free_i32(t1);
1937     tcg_temp_free_i32(t2);
1938     tcg_temp_free_i32(t3);
1939 }
1940 
1941 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1942 {
1943     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
1944     gen_subv_mask(d, a, b, m);
1945 }
1946 
1947 void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1948 {
1949     TCGv_i32 t1 = tcg_temp_new_i32();
1950     TCGv_i32 t2 = tcg_temp_new_i32();
1951 
1952     tcg_gen_andi_i32(t1, b, ~0xffff);
1953     tcg_gen_sub_i32(t2, a, b);
1954     tcg_gen_sub_i32(t1, a, t1);
1955     tcg_gen_deposit_i32(d, t1, t2, 0, 16);
1956 
1957     tcg_temp_free_i32(t1);
1958     tcg_temp_free_i32(t2);
1959 }
1960 
1961 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1962 {
1963     TCGv_i64 t1 = tcg_temp_new_i64();
1964     TCGv_i64 t2 = tcg_temp_new_i64();
1965 
1966     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1967     tcg_gen_sub_i64(t2, a, b);
1968     tcg_gen_sub_i64(t1, a, t1);
1969     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1970 
1971     tcg_temp_free_i64(t1);
1972     tcg_temp_free_i64(t2);
1973 }
1974 
1975 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1976                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1977 {
1978     static const GVecGen3 g[4] = {
1979         { .fni8 = tcg_gen_vec_sub8_i64,
1980           .fniv = tcg_gen_sub_vec,
1981           .fno = gen_helper_gvec_sub8,
1982           .opt_opc = vecop_list_sub,
1983           .vece = MO_8 },
1984         { .fni8 = tcg_gen_vec_sub16_i64,
1985           .fniv = tcg_gen_sub_vec,
1986           .fno = gen_helper_gvec_sub16,
1987           .opt_opc = vecop_list_sub,
1988           .vece = MO_16 },
1989         { .fni4 = tcg_gen_sub_i32,
1990           .fniv = tcg_gen_sub_vec,
1991           .fno = gen_helper_gvec_sub32,
1992           .opt_opc = vecop_list_sub,
1993           .vece = MO_32 },
1994         { .fni8 = tcg_gen_sub_i64,
1995           .fniv = tcg_gen_sub_vec,
1996           .fno = gen_helper_gvec_sub64,
1997           .opt_opc = vecop_list_sub,
1998           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1999           .vece = MO_64 },
2000     };
2001 
2002     tcg_debug_assert(vece <= MO_64);
2003     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2004 }
2005 
2006 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
2007 
2008 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
2009                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2010 {
2011     static const GVecGen3 g[4] = {
2012         { .fniv = tcg_gen_mul_vec,
2013           .fno = gen_helper_gvec_mul8,
2014           .opt_opc = vecop_list_mul,
2015           .vece = MO_8 },
2016         { .fniv = tcg_gen_mul_vec,
2017           .fno = gen_helper_gvec_mul16,
2018           .opt_opc = vecop_list_mul,
2019           .vece = MO_16 },
2020         { .fni4 = tcg_gen_mul_i32,
2021           .fniv = tcg_gen_mul_vec,
2022           .fno = gen_helper_gvec_mul32,
2023           .opt_opc = vecop_list_mul,
2024           .vece = MO_32 },
2025         { .fni8 = tcg_gen_mul_i64,
2026           .fniv = tcg_gen_mul_vec,
2027           .fno = gen_helper_gvec_mul64,
2028           .opt_opc = vecop_list_mul,
2029           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2030           .vece = MO_64 },
2031     };
2032 
2033     tcg_debug_assert(vece <= MO_64);
2034     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2035 }
2036 
2037 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
2038                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2039 {
2040     static const GVecGen2s g[4] = {
2041         { .fniv = tcg_gen_mul_vec,
2042           .fno = gen_helper_gvec_muls8,
2043           .opt_opc = vecop_list_mul,
2044           .vece = MO_8 },
2045         { .fniv = tcg_gen_mul_vec,
2046           .fno = gen_helper_gvec_muls16,
2047           .opt_opc = vecop_list_mul,
2048           .vece = MO_16 },
2049         { .fni4 = tcg_gen_mul_i32,
2050           .fniv = tcg_gen_mul_vec,
2051           .fno = gen_helper_gvec_muls32,
2052           .opt_opc = vecop_list_mul,
2053           .vece = MO_32 },
2054         { .fni8 = tcg_gen_mul_i64,
2055           .fniv = tcg_gen_mul_vec,
2056           .fno = gen_helper_gvec_muls64,
2057           .opt_opc = vecop_list_mul,
2058           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2059           .vece = MO_64 },
2060     };
2061 
2062     tcg_debug_assert(vece <= MO_64);
2063     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2064 }
2065 
2066 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
2067                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2068 {
2069     TCGv_i64 tmp = tcg_constant_i64(c);
2070     tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
2071 }
2072 
2073 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2074                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2075 {
2076     static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
2077     static const GVecGen3 g[4] = {
2078         { .fniv = tcg_gen_ssadd_vec,
2079           .fno = gen_helper_gvec_ssadd8,
2080           .opt_opc = vecop_list,
2081           .vece = MO_8 },
2082         { .fniv = tcg_gen_ssadd_vec,
2083           .fno = gen_helper_gvec_ssadd16,
2084           .opt_opc = vecop_list,
2085           .vece = MO_16 },
2086         { .fniv = tcg_gen_ssadd_vec,
2087           .fno = gen_helper_gvec_ssadd32,
2088           .opt_opc = vecop_list,
2089           .vece = MO_32 },
2090         { .fniv = tcg_gen_ssadd_vec,
2091           .fno = gen_helper_gvec_ssadd64,
2092           .opt_opc = vecop_list,
2093           .vece = MO_64 },
2094     };
2095     tcg_debug_assert(vece <= MO_64);
2096     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2097 }
2098 
2099 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
2100                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2101 {
2102     static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
2103     static const GVecGen3 g[4] = {
2104         { .fniv = tcg_gen_sssub_vec,
2105           .fno = gen_helper_gvec_sssub8,
2106           .opt_opc = vecop_list,
2107           .vece = MO_8 },
2108         { .fniv = tcg_gen_sssub_vec,
2109           .fno = gen_helper_gvec_sssub16,
2110           .opt_opc = vecop_list,
2111           .vece = MO_16 },
2112         { .fniv = tcg_gen_sssub_vec,
2113           .fno = gen_helper_gvec_sssub32,
2114           .opt_opc = vecop_list,
2115           .vece = MO_32 },
2116         { .fniv = tcg_gen_sssub_vec,
2117           .fno = gen_helper_gvec_sssub64,
2118           .opt_opc = vecop_list,
2119           .vece = MO_64 },
2120     };
2121     tcg_debug_assert(vece <= MO_64);
2122     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2123 }
2124 
2125 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2126 {
2127     TCGv_i32 max = tcg_constant_i32(-1);
2128     tcg_gen_add_i32(d, a, b);
2129     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
2130 }
2131 
2132 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2133 {
2134     TCGv_i64 max = tcg_constant_i64(-1);
2135     tcg_gen_add_i64(d, a, b);
2136     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
2137 }
2138 
2139 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2140                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2141 {
2142     static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
2143     static const GVecGen3 g[4] = {
2144         { .fniv = tcg_gen_usadd_vec,
2145           .fno = gen_helper_gvec_usadd8,
2146           .opt_opc = vecop_list,
2147           .vece = MO_8 },
2148         { .fniv = tcg_gen_usadd_vec,
2149           .fno = gen_helper_gvec_usadd16,
2150           .opt_opc = vecop_list,
2151           .vece = MO_16 },
2152         { .fni4 = tcg_gen_usadd_i32,
2153           .fniv = tcg_gen_usadd_vec,
2154           .fno = gen_helper_gvec_usadd32,
2155           .opt_opc = vecop_list,
2156           .vece = MO_32 },
2157         { .fni8 = tcg_gen_usadd_i64,
2158           .fniv = tcg_gen_usadd_vec,
2159           .fno = gen_helper_gvec_usadd64,
2160           .opt_opc = vecop_list,
2161           .vece = MO_64 }
2162     };
2163     tcg_debug_assert(vece <= MO_64);
2164     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2165 }
2166 
2167 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2168 {
2169     TCGv_i32 min = tcg_constant_i32(0);
2170     tcg_gen_sub_i32(d, a, b);
2171     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
2172 }
2173 
2174 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2175 {
2176     TCGv_i64 min = tcg_constant_i64(0);
2177     tcg_gen_sub_i64(d, a, b);
2178     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
2179 }
2180 
2181 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
2182                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2183 {
2184     static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2185     static const GVecGen3 g[4] = {
2186         { .fniv = tcg_gen_ussub_vec,
2187           .fno = gen_helper_gvec_ussub8,
2188           .opt_opc = vecop_list,
2189           .vece = MO_8 },
2190         { .fniv = tcg_gen_ussub_vec,
2191           .fno = gen_helper_gvec_ussub16,
2192           .opt_opc = vecop_list,
2193           .vece = MO_16 },
2194         { .fni4 = tcg_gen_ussub_i32,
2195           .fniv = tcg_gen_ussub_vec,
2196           .fno = gen_helper_gvec_ussub32,
2197           .opt_opc = vecop_list,
2198           .vece = MO_32 },
2199         { .fni8 = tcg_gen_ussub_i64,
2200           .fniv = tcg_gen_ussub_vec,
2201           .fno = gen_helper_gvec_ussub64,
2202           .opt_opc = vecop_list,
2203           .vece = MO_64 }
2204     };
2205     tcg_debug_assert(vece <= MO_64);
2206     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2207 }
2208 
2209 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2210                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2211 {
2212     static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2213     static const GVecGen3 g[4] = {
2214         { .fniv = tcg_gen_smin_vec,
2215           .fno = gen_helper_gvec_smin8,
2216           .opt_opc = vecop_list,
2217           .vece = MO_8 },
2218         { .fniv = tcg_gen_smin_vec,
2219           .fno = gen_helper_gvec_smin16,
2220           .opt_opc = vecop_list,
2221           .vece = MO_16 },
2222         { .fni4 = tcg_gen_smin_i32,
2223           .fniv = tcg_gen_smin_vec,
2224           .fno = gen_helper_gvec_smin32,
2225           .opt_opc = vecop_list,
2226           .vece = MO_32 },
2227         { .fni8 = tcg_gen_smin_i64,
2228           .fniv = tcg_gen_smin_vec,
2229           .fno = gen_helper_gvec_smin64,
2230           .opt_opc = vecop_list,
2231           .vece = MO_64 }
2232     };
2233     tcg_debug_assert(vece <= MO_64);
2234     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2235 }
2236 
2237 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2238                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2239 {
2240     static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2241     static const GVecGen3 g[4] = {
2242         { .fniv = tcg_gen_umin_vec,
2243           .fno = gen_helper_gvec_umin8,
2244           .opt_opc = vecop_list,
2245           .vece = MO_8 },
2246         { .fniv = tcg_gen_umin_vec,
2247           .fno = gen_helper_gvec_umin16,
2248           .opt_opc = vecop_list,
2249           .vece = MO_16 },
2250         { .fni4 = tcg_gen_umin_i32,
2251           .fniv = tcg_gen_umin_vec,
2252           .fno = gen_helper_gvec_umin32,
2253           .opt_opc = vecop_list,
2254           .vece = MO_32 },
2255         { .fni8 = tcg_gen_umin_i64,
2256           .fniv = tcg_gen_umin_vec,
2257           .fno = gen_helper_gvec_umin64,
2258           .opt_opc = vecop_list,
2259           .vece = MO_64 }
2260     };
2261     tcg_debug_assert(vece <= MO_64);
2262     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2263 }
2264 
2265 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2266                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2267 {
2268     static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2269     static const GVecGen3 g[4] = {
2270         { .fniv = tcg_gen_smax_vec,
2271           .fno = gen_helper_gvec_smax8,
2272           .opt_opc = vecop_list,
2273           .vece = MO_8 },
2274         { .fniv = tcg_gen_smax_vec,
2275           .fno = gen_helper_gvec_smax16,
2276           .opt_opc = vecop_list,
2277           .vece = MO_16 },
2278         { .fni4 = tcg_gen_smax_i32,
2279           .fniv = tcg_gen_smax_vec,
2280           .fno = gen_helper_gvec_smax32,
2281           .opt_opc = vecop_list,
2282           .vece = MO_32 },
2283         { .fni8 = tcg_gen_smax_i64,
2284           .fniv = tcg_gen_smax_vec,
2285           .fno = gen_helper_gvec_smax64,
2286           .opt_opc = vecop_list,
2287           .vece = MO_64 }
2288     };
2289     tcg_debug_assert(vece <= MO_64);
2290     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2291 }
2292 
2293 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2294                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2295 {
2296     static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2297     static const GVecGen3 g[4] = {
2298         { .fniv = tcg_gen_umax_vec,
2299           .fno = gen_helper_gvec_umax8,
2300           .opt_opc = vecop_list,
2301           .vece = MO_8 },
2302         { .fniv = tcg_gen_umax_vec,
2303           .fno = gen_helper_gvec_umax16,
2304           .opt_opc = vecop_list,
2305           .vece = MO_16 },
2306         { .fni4 = tcg_gen_umax_i32,
2307           .fniv = tcg_gen_umax_vec,
2308           .fno = gen_helper_gvec_umax32,
2309           .opt_opc = vecop_list,
2310           .vece = MO_32 },
2311         { .fni8 = tcg_gen_umax_i64,
2312           .fniv = tcg_gen_umax_vec,
2313           .fno = gen_helper_gvec_umax64,
2314           .opt_opc = vecop_list,
2315           .vece = MO_64 }
2316     };
2317     tcg_debug_assert(vece <= MO_64);
2318     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2319 }
2320 
2321 /* Perform a vector negation using normal negation and a mask.
2322    Compare gen_subv_mask above.  */
2323 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2324 {
2325     TCGv_i64 t2 = tcg_temp_new_i64();
2326     TCGv_i64 t3 = tcg_temp_new_i64();
2327 
2328     tcg_gen_andc_i64(t3, m, b);
2329     tcg_gen_andc_i64(t2, b, m);
2330     tcg_gen_sub_i64(d, m, t2);
2331     tcg_gen_xor_i64(d, d, t3);
2332 
2333     tcg_temp_free_i64(t2);
2334     tcg_temp_free_i64(t3);
2335 }
2336 
2337 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2338 {
2339     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2340     gen_negv_mask(d, b, m);
2341 }
2342 
2343 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2344 {
2345     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2346     gen_negv_mask(d, b, m);
2347 }
2348 
2349 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2350 {
2351     TCGv_i64 t1 = tcg_temp_new_i64();
2352     TCGv_i64 t2 = tcg_temp_new_i64();
2353 
2354     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2355     tcg_gen_neg_i64(t2, b);
2356     tcg_gen_neg_i64(t1, t1);
2357     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2358 
2359     tcg_temp_free_i64(t1);
2360     tcg_temp_free_i64(t2);
2361 }
2362 
2363 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2364                       uint32_t oprsz, uint32_t maxsz)
2365 {
2366     static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2367     static const GVecGen2 g[4] = {
2368         { .fni8 = tcg_gen_vec_neg8_i64,
2369           .fniv = tcg_gen_neg_vec,
2370           .fno = gen_helper_gvec_neg8,
2371           .opt_opc = vecop_list,
2372           .vece = MO_8 },
2373         { .fni8 = tcg_gen_vec_neg16_i64,
2374           .fniv = tcg_gen_neg_vec,
2375           .fno = gen_helper_gvec_neg16,
2376           .opt_opc = vecop_list,
2377           .vece = MO_16 },
2378         { .fni4 = tcg_gen_neg_i32,
2379           .fniv = tcg_gen_neg_vec,
2380           .fno = gen_helper_gvec_neg32,
2381           .opt_opc = vecop_list,
2382           .vece = MO_32 },
2383         { .fni8 = tcg_gen_neg_i64,
2384           .fniv = tcg_gen_neg_vec,
2385           .fno = gen_helper_gvec_neg64,
2386           .opt_opc = vecop_list,
2387           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2388           .vece = MO_64 },
2389     };
2390 
2391     tcg_debug_assert(vece <= MO_64);
2392     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2393 }
2394 
2395 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2396 {
2397     TCGv_i64 t = tcg_temp_new_i64();
2398     int nbit = 8 << vece;
2399 
2400     /* Create -1 for each negative element.  */
2401     tcg_gen_shri_i64(t, b, nbit - 1);
2402     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2403     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2404 
2405     /*
2406      * Invert (via xor -1) and add one.
2407      * Because of the ordering the msb is cleared,
2408      * so we never have carry into the next element.
2409      */
2410     tcg_gen_xor_i64(d, b, t);
2411     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2412     tcg_gen_add_i64(d, d, t);
2413 
2414     tcg_temp_free_i64(t);
2415 }
2416 
2417 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2418 {
2419     gen_absv_mask(d, b, MO_8);
2420 }
2421 
2422 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2423 {
2424     gen_absv_mask(d, b, MO_16);
2425 }
2426 
2427 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2428                       uint32_t oprsz, uint32_t maxsz)
2429 {
2430     static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2431     static const GVecGen2 g[4] = {
2432         { .fni8 = tcg_gen_vec_abs8_i64,
2433           .fniv = tcg_gen_abs_vec,
2434           .fno = gen_helper_gvec_abs8,
2435           .opt_opc = vecop_list,
2436           .vece = MO_8 },
2437         { .fni8 = tcg_gen_vec_abs16_i64,
2438           .fniv = tcg_gen_abs_vec,
2439           .fno = gen_helper_gvec_abs16,
2440           .opt_opc = vecop_list,
2441           .vece = MO_16 },
2442         { .fni4 = tcg_gen_abs_i32,
2443           .fniv = tcg_gen_abs_vec,
2444           .fno = gen_helper_gvec_abs32,
2445           .opt_opc = vecop_list,
2446           .vece = MO_32 },
2447         { .fni8 = tcg_gen_abs_i64,
2448           .fniv = tcg_gen_abs_vec,
2449           .fno = gen_helper_gvec_abs64,
2450           .opt_opc = vecop_list,
2451           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2452           .vece = MO_64 },
2453     };
2454 
2455     tcg_debug_assert(vece <= MO_64);
2456     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2457 }
2458 
2459 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2460                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2461 {
2462     static const GVecGen3 g = {
2463         .fni8 = tcg_gen_and_i64,
2464         .fniv = tcg_gen_and_vec,
2465         .fno = gen_helper_gvec_and,
2466         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2467     };
2468 
2469     if (aofs == bofs) {
2470         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2471     } else {
2472         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2473     }
2474 }
2475 
2476 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2477                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2478 {
2479     static const GVecGen3 g = {
2480         .fni8 = tcg_gen_or_i64,
2481         .fniv = tcg_gen_or_vec,
2482         .fno = gen_helper_gvec_or,
2483         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2484     };
2485 
2486     if (aofs == bofs) {
2487         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2488     } else {
2489         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2490     }
2491 }
2492 
2493 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2494                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2495 {
2496     static const GVecGen3 g = {
2497         .fni8 = tcg_gen_xor_i64,
2498         .fniv = tcg_gen_xor_vec,
2499         .fno = gen_helper_gvec_xor,
2500         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2501     };
2502 
2503     if (aofs == bofs) {
2504         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2505     } else {
2506         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2507     }
2508 }
2509 
2510 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2511                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2512 {
2513     static const GVecGen3 g = {
2514         .fni8 = tcg_gen_andc_i64,
2515         .fniv = tcg_gen_andc_vec,
2516         .fno = gen_helper_gvec_andc,
2517         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2518     };
2519 
2520     if (aofs == bofs) {
2521         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2522     } else {
2523         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2524     }
2525 }
2526 
2527 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2528                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2529 {
2530     static const GVecGen3 g = {
2531         .fni8 = tcg_gen_orc_i64,
2532         .fniv = tcg_gen_orc_vec,
2533         .fno = gen_helper_gvec_orc,
2534         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2535     };
2536 
2537     if (aofs == bofs) {
2538         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2539     } else {
2540         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2541     }
2542 }
2543 
2544 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2545                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2546 {
2547     static const GVecGen3 g = {
2548         .fni8 = tcg_gen_nand_i64,
2549         .fniv = tcg_gen_nand_vec,
2550         .fno = gen_helper_gvec_nand,
2551         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2552     };
2553 
2554     if (aofs == bofs) {
2555         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2556     } else {
2557         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2558     }
2559 }
2560 
2561 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2562                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2563 {
2564     static const GVecGen3 g = {
2565         .fni8 = tcg_gen_nor_i64,
2566         .fniv = tcg_gen_nor_vec,
2567         .fno = gen_helper_gvec_nor,
2568         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2569     };
2570 
2571     if (aofs == bofs) {
2572         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2573     } else {
2574         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2575     }
2576 }
2577 
2578 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2579                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2580 {
2581     static const GVecGen3 g = {
2582         .fni8 = tcg_gen_eqv_i64,
2583         .fniv = tcg_gen_eqv_vec,
2584         .fno = gen_helper_gvec_eqv,
2585         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2586     };
2587 
2588     if (aofs == bofs) {
2589         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2590     } else {
2591         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2592     }
2593 }
2594 
2595 static const GVecGen2s gop_ands = {
2596     .fni8 = tcg_gen_and_i64,
2597     .fniv = tcg_gen_and_vec,
2598     .fno = gen_helper_gvec_ands,
2599     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2600     .vece = MO_64
2601 };
2602 
2603 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2604                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2605 {
2606     TCGv_i64 tmp = tcg_temp_new_i64();
2607     tcg_gen_dup_i64(vece, tmp, c);
2608     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2609     tcg_temp_free_i64(tmp);
2610 }
2611 
2612 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2613                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2614 {
2615     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2616     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2617 }
2618 
2619 static const GVecGen2s gop_xors = {
2620     .fni8 = tcg_gen_xor_i64,
2621     .fniv = tcg_gen_xor_vec,
2622     .fno = gen_helper_gvec_xors,
2623     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2624     .vece = MO_64
2625 };
2626 
2627 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2628                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2629 {
2630     TCGv_i64 tmp = tcg_temp_new_i64();
2631     tcg_gen_dup_i64(vece, tmp, c);
2632     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2633     tcg_temp_free_i64(tmp);
2634 }
2635 
2636 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2637                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2638 {
2639     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2640     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2641 }
2642 
2643 static const GVecGen2s gop_ors = {
2644     .fni8 = tcg_gen_or_i64,
2645     .fniv = tcg_gen_or_vec,
2646     .fno = gen_helper_gvec_ors,
2647     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2648     .vece = MO_64
2649 };
2650 
2651 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2652                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2653 {
2654     TCGv_i64 tmp = tcg_temp_new_i64();
2655     tcg_gen_dup_i64(vece, tmp, c);
2656     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2657     tcg_temp_free_i64(tmp);
2658 }
2659 
2660 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2661                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2662 {
2663     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2664     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2665 }
2666 
2667 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2668 {
2669     uint64_t mask = dup_const(MO_8, 0xff << c);
2670     tcg_gen_shli_i64(d, a, c);
2671     tcg_gen_andi_i64(d, d, mask);
2672 }
2673 
2674 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2675 {
2676     uint64_t mask = dup_const(MO_16, 0xffff << c);
2677     tcg_gen_shli_i64(d, a, c);
2678     tcg_gen_andi_i64(d, d, mask);
2679 }
2680 
2681 void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2682 {
2683     uint32_t mask = dup_const(MO_8, 0xff << c);
2684     tcg_gen_shli_i32(d, a, c);
2685     tcg_gen_andi_i32(d, d, mask);
2686 }
2687 
2688 void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2689 {
2690     uint32_t mask = dup_const(MO_16, 0xffff << c);
2691     tcg_gen_shli_i32(d, a, c);
2692     tcg_gen_andi_i32(d, d, mask);
2693 }
2694 
2695 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2696                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2697 {
2698     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2699     static const GVecGen2i g[4] = {
2700         { .fni8 = tcg_gen_vec_shl8i_i64,
2701           .fniv = tcg_gen_shli_vec,
2702           .fno = gen_helper_gvec_shl8i,
2703           .opt_opc = vecop_list,
2704           .vece = MO_8 },
2705         { .fni8 = tcg_gen_vec_shl16i_i64,
2706           .fniv = tcg_gen_shli_vec,
2707           .fno = gen_helper_gvec_shl16i,
2708           .opt_opc = vecop_list,
2709           .vece = MO_16 },
2710         { .fni4 = tcg_gen_shli_i32,
2711           .fniv = tcg_gen_shli_vec,
2712           .fno = gen_helper_gvec_shl32i,
2713           .opt_opc = vecop_list,
2714           .vece = MO_32 },
2715         { .fni8 = tcg_gen_shli_i64,
2716           .fniv = tcg_gen_shli_vec,
2717           .fno = gen_helper_gvec_shl64i,
2718           .opt_opc = vecop_list,
2719           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2720           .vece = MO_64 },
2721     };
2722 
2723     tcg_debug_assert(vece <= MO_64);
2724     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2725     if (shift == 0) {
2726         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2727     } else {
2728         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2729     }
2730 }
2731 
2732 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2733 {
2734     uint64_t mask = dup_const(MO_8, 0xff >> c);
2735     tcg_gen_shri_i64(d, a, c);
2736     tcg_gen_andi_i64(d, d, mask);
2737 }
2738 
2739 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2740 {
2741     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2742     tcg_gen_shri_i64(d, a, c);
2743     tcg_gen_andi_i64(d, d, mask);
2744 }
2745 
2746 void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2747 {
2748     uint32_t mask = dup_const(MO_8, 0xff >> c);
2749     tcg_gen_shri_i32(d, a, c);
2750     tcg_gen_andi_i32(d, d, mask);
2751 }
2752 
2753 void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2754 {
2755     uint32_t mask = dup_const(MO_16, 0xffff >> c);
2756     tcg_gen_shri_i32(d, a, c);
2757     tcg_gen_andi_i32(d, d, mask);
2758 }
2759 
2760 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2761                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2762 {
2763     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2764     static const GVecGen2i g[4] = {
2765         { .fni8 = tcg_gen_vec_shr8i_i64,
2766           .fniv = tcg_gen_shri_vec,
2767           .fno = gen_helper_gvec_shr8i,
2768           .opt_opc = vecop_list,
2769           .vece = MO_8 },
2770         { .fni8 = tcg_gen_vec_shr16i_i64,
2771           .fniv = tcg_gen_shri_vec,
2772           .fno = gen_helper_gvec_shr16i,
2773           .opt_opc = vecop_list,
2774           .vece = MO_16 },
2775         { .fni4 = tcg_gen_shri_i32,
2776           .fniv = tcg_gen_shri_vec,
2777           .fno = gen_helper_gvec_shr32i,
2778           .opt_opc = vecop_list,
2779           .vece = MO_32 },
2780         { .fni8 = tcg_gen_shri_i64,
2781           .fniv = tcg_gen_shri_vec,
2782           .fno = gen_helper_gvec_shr64i,
2783           .opt_opc = vecop_list,
2784           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2785           .vece = MO_64 },
2786     };
2787 
2788     tcg_debug_assert(vece <= MO_64);
2789     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2790     if (shift == 0) {
2791         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2792     } else {
2793         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2794     }
2795 }
2796 
2797 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2798 {
2799     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2800     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2801     TCGv_i64 s = tcg_temp_new_i64();
2802 
2803     tcg_gen_shri_i64(d, a, c);
2804     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2805     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2806     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2807     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2808     tcg_temp_free_i64(s);
2809 }
2810 
2811 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2812 {
2813     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2814     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2815     TCGv_i64 s = tcg_temp_new_i64();
2816 
2817     tcg_gen_shri_i64(d, a, c);
2818     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2819     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2820     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2821     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2822     tcg_temp_free_i64(s);
2823 }
2824 
2825 void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2826 {
2827     uint32_t s_mask = dup_const(MO_8, 0x80 >> c);
2828     uint32_t c_mask = dup_const(MO_8, 0xff >> c);
2829     TCGv_i32 s = tcg_temp_new_i32();
2830 
2831     tcg_gen_shri_i32(d, a, c);
2832     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
2833     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
2834     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
2835     tcg_gen_or_i32(d, d, s);         /* include sign extension */
2836     tcg_temp_free_i32(s);
2837 }
2838 
2839 void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2840 {
2841     uint32_t s_mask = dup_const(MO_16, 0x8000 >> c);
2842     uint32_t c_mask = dup_const(MO_16, 0xffff >> c);
2843     TCGv_i32 s = tcg_temp_new_i32();
2844 
2845     tcg_gen_shri_i32(d, a, c);
2846     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
2847     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
2848     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
2849     tcg_gen_or_i32(d, d, s);         /* include sign extension */
2850     tcg_temp_free_i32(s);
2851 }
2852 
2853 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2854                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2855 {
2856     static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
2857     static const GVecGen2i g[4] = {
2858         { .fni8 = tcg_gen_vec_sar8i_i64,
2859           .fniv = tcg_gen_sari_vec,
2860           .fno = gen_helper_gvec_sar8i,
2861           .opt_opc = vecop_list,
2862           .vece = MO_8 },
2863         { .fni8 = tcg_gen_vec_sar16i_i64,
2864           .fniv = tcg_gen_sari_vec,
2865           .fno = gen_helper_gvec_sar16i,
2866           .opt_opc = vecop_list,
2867           .vece = MO_16 },
2868         { .fni4 = tcg_gen_sari_i32,
2869           .fniv = tcg_gen_sari_vec,
2870           .fno = gen_helper_gvec_sar32i,
2871           .opt_opc = vecop_list,
2872           .vece = MO_32 },
2873         { .fni8 = tcg_gen_sari_i64,
2874           .fniv = tcg_gen_sari_vec,
2875           .fno = gen_helper_gvec_sar64i,
2876           .opt_opc = vecop_list,
2877           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2878           .vece = MO_64 },
2879     };
2880 
2881     tcg_debug_assert(vece <= MO_64);
2882     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2883     if (shift == 0) {
2884         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2885     } else {
2886         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2887     }
2888 }
2889 
2890 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2891 {
2892     uint64_t mask = dup_const(MO_8, 0xff << c);
2893 
2894     tcg_gen_shli_i64(d, a, c);
2895     tcg_gen_shri_i64(a, a, 8 - c);
2896     tcg_gen_andi_i64(d, d, mask);
2897     tcg_gen_andi_i64(a, a, ~mask);
2898     tcg_gen_or_i64(d, d, a);
2899 }
2900 
2901 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2902 {
2903     uint64_t mask = dup_const(MO_16, 0xffff << c);
2904 
2905     tcg_gen_shli_i64(d, a, c);
2906     tcg_gen_shri_i64(a, a, 16 - c);
2907     tcg_gen_andi_i64(d, d, mask);
2908     tcg_gen_andi_i64(a, a, ~mask);
2909     tcg_gen_or_i64(d, d, a);
2910 }
2911 
2912 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
2913                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
2914 {
2915     static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
2916     static const GVecGen2i g[4] = {
2917         { .fni8 = tcg_gen_vec_rotl8i_i64,
2918           .fniv = tcg_gen_rotli_vec,
2919           .fno = gen_helper_gvec_rotl8i,
2920           .opt_opc = vecop_list,
2921           .vece = MO_8 },
2922         { .fni8 = tcg_gen_vec_rotl16i_i64,
2923           .fniv = tcg_gen_rotli_vec,
2924           .fno = gen_helper_gvec_rotl16i,
2925           .opt_opc = vecop_list,
2926           .vece = MO_16 },
2927         { .fni4 = tcg_gen_rotli_i32,
2928           .fniv = tcg_gen_rotli_vec,
2929           .fno = gen_helper_gvec_rotl32i,
2930           .opt_opc = vecop_list,
2931           .vece = MO_32 },
2932         { .fni8 = tcg_gen_rotli_i64,
2933           .fniv = tcg_gen_rotli_vec,
2934           .fno = gen_helper_gvec_rotl64i,
2935           .opt_opc = vecop_list,
2936           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2937           .vece = MO_64 },
2938     };
2939 
2940     tcg_debug_assert(vece <= MO_64);
2941     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2942     if (shift == 0) {
2943         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2944     } else {
2945         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2946     }
2947 }
2948 
2949 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
2950                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
2951 {
2952     tcg_debug_assert(vece <= MO_64);
2953     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2954     tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
2955                        oprsz, maxsz);
2956 }
2957 
/*
 * Specialized generation of vector shifts by a non-constant scalar.
 */
2961 
2962 typedef struct {
2963     void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
2964     void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
2965     void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
2966     void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
2967     gen_helper_gvec_2 *fno[4];
2968     TCGOpcode s_list[2];
2969     TCGOpcode v_list[2];
2970 } GVecGen2sh;
2971 
2972 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2973                            uint32_t oprsz, uint32_t tysz, TCGType type,
2974                            TCGv_i32 shift,
2975                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
2976 {
2977     TCGv_vec t0 = tcg_temp_new_vec(type);
2978     uint32_t i;
2979 
2980     for (i = 0; i < oprsz; i += tysz) {
2981         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2982         fni(vece, t0, t0, shift);
2983         tcg_gen_st_vec(t0, cpu_env, dofs + i);
2984     }
2985     tcg_temp_free_vec(t0);
2986 }
2987 
2988 static void
2989 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
2990                uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
2991 {
2992     TCGType type;
2993     uint32_t some;
2994 
2995     check_size_align(oprsz, maxsz, dofs | aofs);
2996     check_overlap_2(dofs, aofs, maxsz);
2997 
2998     /* If the backend has a scalar expansion, great.  */
2999     type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
3000     if (type) {
3001         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3002         switch (type) {
3003         case TCG_TYPE_V256:
3004             some = QEMU_ALIGN_DOWN(oprsz, 32);
3005             expand_2sh_vec(vece, dofs, aofs, some, 32,
3006                            TCG_TYPE_V256, shift, g->fniv_s);
3007             if (some == oprsz) {
3008                 break;
3009             }
3010             dofs += some;
3011             aofs += some;
3012             oprsz -= some;
3013             maxsz -= some;
3014             /* fallthru */
3015         case TCG_TYPE_V128:
3016             expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
3017                            TCG_TYPE_V128, shift, g->fniv_s);
3018             break;
3019         case TCG_TYPE_V64:
3020             expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
3021                            TCG_TYPE_V64, shift, g->fniv_s);
3022             break;
3023         default:
3024             g_assert_not_reached();
3025         }
3026         tcg_swap_vecop_list(hold_list);
3027         goto clear_tail;
3028     }
3029 
3030     /* If the backend supports variable vector shifts, also cool.  */
3031     type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
3032     if (type) {
3033         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3034         TCGv_vec v_shift = tcg_temp_new_vec(type);
3035 
3036         if (vece == MO_64) {
3037             TCGv_i64 sh64 = tcg_temp_new_i64();
3038             tcg_gen_extu_i32_i64(sh64, shift);
3039             tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
3040             tcg_temp_free_i64(sh64);
3041         } else {
3042             tcg_gen_dup_i32_vec(vece, v_shift, shift);
3043         }
3044 
3045         switch (type) {
3046         case TCG_TYPE_V256:
3047             some = QEMU_ALIGN_DOWN(oprsz, 32);
3048             expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
3049                           v_shift, false, g->fniv_v);
3050             if (some == oprsz) {
3051                 break;
3052             }
3053             dofs += some;
3054             aofs += some;
3055             oprsz -= some;
3056             maxsz -= some;
3057             /* fallthru */
3058         case TCG_TYPE_V128:
3059             expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
3060                           v_shift, false, g->fniv_v);
3061             break;
3062         case TCG_TYPE_V64:
3063             expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
3064                           v_shift, false, g->fniv_v);
3065             break;
3066         default:
3067             g_assert_not_reached();
3068         }
3069         tcg_temp_free_vec(v_shift);
3070         tcg_swap_vecop_list(hold_list);
3071         goto clear_tail;
3072     }
3073 
3074     /* Otherwise fall back to integral... */
3075     if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3076         expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
3077     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3078         TCGv_i64 sh64 = tcg_temp_new_i64();
3079         tcg_gen_extu_i32_i64(sh64, shift);
3080         expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
3081         tcg_temp_free_i64(sh64);
3082     } else {
3083         TCGv_ptr a0 = tcg_temp_new_ptr();
3084         TCGv_ptr a1 = tcg_temp_new_ptr();
3085         TCGv_i32 desc = tcg_temp_new_i32();
3086 
3087         tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
3088         tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
3089         tcg_gen_addi_ptr(a0, cpu_env, dofs);
3090         tcg_gen_addi_ptr(a1, cpu_env, aofs);
3091 
3092         g->fno[vece](a0, a1, desc);
3093 
3094         tcg_temp_free_ptr(a0);
3095         tcg_temp_free_ptr(a1);
3096         tcg_temp_free_i32(desc);
3097         return;
3098     }
3099 
3100  clear_tail:
3101     if (oprsz < maxsz) {
3102         expand_clr(dofs + oprsz, maxsz - oprsz);
3103     }
3104 }
3105 
3106 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
3107                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3108 {
3109     static const GVecGen2sh g = {
3110         .fni4 = tcg_gen_shl_i32,
3111         .fni8 = tcg_gen_shl_i64,
3112         .fniv_s = tcg_gen_shls_vec,
3113         .fniv_v = tcg_gen_shlv_vec,
3114         .fno = {
3115             gen_helper_gvec_shl8i,
3116             gen_helper_gvec_shl16i,
3117             gen_helper_gvec_shl32i,
3118             gen_helper_gvec_shl64i,
3119         },
3120         .s_list = { INDEX_op_shls_vec, 0 },
3121         .v_list = { INDEX_op_shlv_vec, 0 },
3122     };
3123 
3124     tcg_debug_assert(vece <= MO_64);
3125     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3126 }
3127 
3128 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3129                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3130 {
3131     static const GVecGen2sh g = {
3132         .fni4 = tcg_gen_shr_i32,
3133         .fni8 = tcg_gen_shr_i64,
3134         .fniv_s = tcg_gen_shrs_vec,
3135         .fniv_v = tcg_gen_shrv_vec,
3136         .fno = {
3137             gen_helper_gvec_shr8i,
3138             gen_helper_gvec_shr16i,
3139             gen_helper_gvec_shr32i,
3140             gen_helper_gvec_shr64i,
3141         },
3142         .s_list = { INDEX_op_shrs_vec, 0 },
3143         .v_list = { INDEX_op_shrv_vec, 0 },
3144     };
3145 
3146     tcg_debug_assert(vece <= MO_64);
3147     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3148 }
3149 
3150 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
3151                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3152 {
3153     static const GVecGen2sh g = {
3154         .fni4 = tcg_gen_sar_i32,
3155         .fni8 = tcg_gen_sar_i64,
3156         .fniv_s = tcg_gen_sars_vec,
3157         .fniv_v = tcg_gen_sarv_vec,
3158         .fno = {
3159             gen_helper_gvec_sar8i,
3160             gen_helper_gvec_sar16i,
3161             gen_helper_gvec_sar32i,
3162             gen_helper_gvec_sar64i,
3163         },
3164         .s_list = { INDEX_op_sars_vec, 0 },
3165         .v_list = { INDEX_op_sarv_vec, 0 },
3166     };
3167 
3168     tcg_debug_assert(vece <= MO_64);
3169     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3170 }
3171 
3172 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
3173                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3174 {
3175     static const GVecGen2sh g = {
3176         .fni4 = tcg_gen_rotl_i32,
3177         .fni8 = tcg_gen_rotl_i64,
3178         .fniv_s = tcg_gen_rotls_vec,
3179         .fniv_v = tcg_gen_rotlv_vec,
3180         .fno = {
3181             gen_helper_gvec_rotl8i,
3182             gen_helper_gvec_rotl16i,
3183             gen_helper_gvec_rotl32i,
3184             gen_helper_gvec_rotl64i,
3185         },
3186         .s_list = { INDEX_op_rotls_vec, 0 },
3187         .v_list = { INDEX_op_rotlv_vec, 0 },
3188     };
3189 
3190     tcg_debug_assert(vece <= MO_64);
3191     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3192 }
3193 
/*
 * Expand D = A << (B % element bits)
 *
 * Unlike scalar shifts, where it is easy for the target front end
 * to include the modulo as part of the expansion, for vector shifts
 * the modulo is applied here.  If the target naturally includes the
 * modulo as part of the operation, great!  If the target has some
 * other behaviour for out-of-range shifts, then it could not use
 * this function anyway, and would need to do its own expansion
 * with custom functions.
 */
3204 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
3205                                  TCGv_vec a, TCGv_vec b)
3206 {
3207     TCGv_vec t = tcg_temp_new_vec_matching(d);
3208     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3209 
3210     tcg_gen_and_vec(vece, t, b, m);
3211     tcg_gen_shlv_vec(vece, d, a, t);
3212     tcg_temp_free_vec(t);
3213 }
3214 
3215 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3216 {
3217     TCGv_i32 t = tcg_temp_new_i32();
3218 
3219     tcg_gen_andi_i32(t, b, 31);
3220     tcg_gen_shl_i32(d, a, t);
3221     tcg_temp_free_i32(t);
3222 }
3223 
3224 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3225 {
3226     TCGv_i64 t = tcg_temp_new_i64();
3227 
3228     tcg_gen_andi_i64(t, b, 63);
3229     tcg_gen_shl_i64(d, a, t);
3230     tcg_temp_free_i64(t);
3231 }
3232 
3233 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3234                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3235 {
3236     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
3237     static const GVecGen3 g[4] = {
3238         { .fniv = tcg_gen_shlv_mod_vec,
3239           .fno = gen_helper_gvec_shl8v,
3240           .opt_opc = vecop_list,
3241           .vece = MO_8 },
3242         { .fniv = tcg_gen_shlv_mod_vec,
3243           .fno = gen_helper_gvec_shl16v,
3244           .opt_opc = vecop_list,
3245           .vece = MO_16 },
3246         { .fni4 = tcg_gen_shl_mod_i32,
3247           .fniv = tcg_gen_shlv_mod_vec,
3248           .fno = gen_helper_gvec_shl32v,
3249           .opt_opc = vecop_list,
3250           .vece = MO_32 },
3251         { .fni8 = tcg_gen_shl_mod_i64,
3252           .fniv = tcg_gen_shlv_mod_vec,
3253           .fno = gen_helper_gvec_shl64v,
3254           .opt_opc = vecop_list,
3255           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3256           .vece = MO_64 },
3257     };
3258 
3259     tcg_debug_assert(vece <= MO_64);
3260     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3261 }
3262 
3263 /*
3264  * Similarly for logical right shifts.
3265  */
3266 
3267 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
3268                                  TCGv_vec a, TCGv_vec b)
3269 {
3270     TCGv_vec t = tcg_temp_new_vec_matching(d);
3271     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3272 
3273     tcg_gen_and_vec(vece, t, b, m);
3274     tcg_gen_shrv_vec(vece, d, a, t);
3275     tcg_temp_free_vec(t);
3276 }
3277 
3278 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3279 {
3280     TCGv_i32 t = tcg_temp_new_i32();
3281 
3282     tcg_gen_andi_i32(t, b, 31);
3283     tcg_gen_shr_i32(d, a, t);
3284     tcg_temp_free_i32(t);
3285 }
3286 
3287 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3288 {
3289     TCGv_i64 t = tcg_temp_new_i64();
3290 
3291     tcg_gen_andi_i64(t, b, 63);
3292     tcg_gen_shr_i64(d, a, t);
3293     tcg_temp_free_i64(t);
3294 }
3295 
3296 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3297                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3298 {
3299     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
3300     static const GVecGen3 g[4] = {
3301         { .fniv = tcg_gen_shrv_mod_vec,
3302           .fno = gen_helper_gvec_shr8v,
3303           .opt_opc = vecop_list,
3304           .vece = MO_8 },
3305         { .fniv = tcg_gen_shrv_mod_vec,
3306           .fno = gen_helper_gvec_shr16v,
3307           .opt_opc = vecop_list,
3308           .vece = MO_16 },
3309         { .fni4 = tcg_gen_shr_mod_i32,
3310           .fniv = tcg_gen_shrv_mod_vec,
3311           .fno = gen_helper_gvec_shr32v,
3312           .opt_opc = vecop_list,
3313           .vece = MO_32 },
3314         { .fni8 = tcg_gen_shr_mod_i64,
3315           .fniv = tcg_gen_shrv_mod_vec,
3316           .fno = gen_helper_gvec_shr64v,
3317           .opt_opc = vecop_list,
3318           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3319           .vece = MO_64 },
3320     };
3321 
3322     tcg_debug_assert(vece <= MO_64);
3323     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3324 }
3325 
3326 /*
3327  * Similarly for arithmetic right shifts.
3328  */
3329 
3330 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3331                                  TCGv_vec a, TCGv_vec b)
3332 {
3333     TCGv_vec t = tcg_temp_new_vec_matching(d);
3334     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3335 
3336     tcg_gen_and_vec(vece, t, b, m);
3337     tcg_gen_sarv_vec(vece, d, a, t);
3338     tcg_temp_free_vec(t);
3339 }
3340 
3341 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3342 {
3343     TCGv_i32 t = tcg_temp_new_i32();
3344 
3345     tcg_gen_andi_i32(t, b, 31);
3346     tcg_gen_sar_i32(d, a, t);
3347     tcg_temp_free_i32(t);
3348 }
3349 
3350 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3351 {
3352     TCGv_i64 t = tcg_temp_new_i64();
3353 
3354     tcg_gen_andi_i64(t, b, 63);
3355     tcg_gen_sar_i64(d, a, t);
3356     tcg_temp_free_i64(t);
3357 }
3358 
3359 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3360                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3361 {
3362     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3363     static const GVecGen3 g[4] = {
3364         { .fniv = tcg_gen_sarv_mod_vec,
3365           .fno = gen_helper_gvec_sar8v,
3366           .opt_opc = vecop_list,
3367           .vece = MO_8 },
3368         { .fniv = tcg_gen_sarv_mod_vec,
3369           .fno = gen_helper_gvec_sar16v,
3370           .opt_opc = vecop_list,
3371           .vece = MO_16 },
3372         { .fni4 = tcg_gen_sar_mod_i32,
3373           .fniv = tcg_gen_sarv_mod_vec,
3374           .fno = gen_helper_gvec_sar32v,
3375           .opt_opc = vecop_list,
3376           .vece = MO_32 },
3377         { .fni8 = tcg_gen_sar_mod_i64,
3378           .fniv = tcg_gen_sarv_mod_vec,
3379           .fno = gen_helper_gvec_sar64v,
3380           .opt_opc = vecop_list,
3381           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3382           .vece = MO_64 },
3383     };
3384 
3385     tcg_debug_assert(vece <= MO_64);
3386     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3387 }
3388 
3389 /*
3390  * Similarly for rotates.
3391  */
3392 
3393 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
3394                                   TCGv_vec a, TCGv_vec b)
3395 {
3396     TCGv_vec t = tcg_temp_new_vec_matching(d);
3397     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3398 
3399     tcg_gen_and_vec(vece, t, b, m);
3400     tcg_gen_rotlv_vec(vece, d, a, t);
3401     tcg_temp_free_vec(t);
3402 }
3403 
3404 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3405 {
3406     TCGv_i32 t = tcg_temp_new_i32();
3407 
3408     tcg_gen_andi_i32(t, b, 31);
3409     tcg_gen_rotl_i32(d, a, t);
3410     tcg_temp_free_i32(t);
3411 }
3412 
3413 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3414 {
3415     TCGv_i64 t = tcg_temp_new_i64();
3416 
3417     tcg_gen_andi_i64(t, b, 63);
3418     tcg_gen_rotl_i64(d, a, t);
3419     tcg_temp_free_i64(t);
3420 }
3421 
3422 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3423                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3424 {
3425     static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
3426     static const GVecGen3 g[4] = {
3427         { .fniv = tcg_gen_rotlv_mod_vec,
3428           .fno = gen_helper_gvec_rotl8v,
3429           .opt_opc = vecop_list,
3430           .vece = MO_8 },
3431         { .fniv = tcg_gen_rotlv_mod_vec,
3432           .fno = gen_helper_gvec_rotl16v,
3433           .opt_opc = vecop_list,
3434           .vece = MO_16 },
3435         { .fni4 = tcg_gen_rotl_mod_i32,
3436           .fniv = tcg_gen_rotlv_mod_vec,
3437           .fno = gen_helper_gvec_rotl32v,
3438           .opt_opc = vecop_list,
3439           .vece = MO_32 },
3440         { .fni8 = tcg_gen_rotl_mod_i64,
3441           .fniv = tcg_gen_rotlv_mod_vec,
3442           .fno = gen_helper_gvec_rotl64v,
3443           .opt_opc = vecop_list,
3444           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3445           .vece = MO_64 },
3446     };
3447 
3448     tcg_debug_assert(vece <= MO_64);
3449     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3450 }
3451 
3452 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
3453                                   TCGv_vec a, TCGv_vec b)
3454 {
3455     TCGv_vec t = tcg_temp_new_vec_matching(d);
3456     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3457 
3458     tcg_gen_and_vec(vece, t, b, m);
3459     tcg_gen_rotrv_vec(vece, d, a, t);
3460     tcg_temp_free_vec(t);
3461 }
3462 
3463 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3464 {
3465     TCGv_i32 t = tcg_temp_new_i32();
3466 
3467     tcg_gen_andi_i32(t, b, 31);
3468     tcg_gen_rotr_i32(d, a, t);
3469     tcg_temp_free_i32(t);
3470 }
3471 
3472 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3473 {
3474     TCGv_i64 t = tcg_temp_new_i64();
3475 
3476     tcg_gen_andi_i64(t, b, 63);
3477     tcg_gen_rotr_i64(d, a, t);
3478     tcg_temp_free_i64(t);
3479 }
3480 
3481 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3482                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3483 {
3484     static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
3485     static const GVecGen3 g[4] = {
3486         { .fniv = tcg_gen_rotrv_mod_vec,
3487           .fno = gen_helper_gvec_rotr8v,
3488           .opt_opc = vecop_list,
3489           .vece = MO_8 },
3490         { .fniv = tcg_gen_rotrv_mod_vec,
3491           .fno = gen_helper_gvec_rotr16v,
3492           .opt_opc = vecop_list,
3493           .vece = MO_16 },
3494         { .fni4 = tcg_gen_rotr_mod_i32,
3495           .fniv = tcg_gen_rotrv_mod_vec,
3496           .fno = gen_helper_gvec_rotr32v,
3497           .opt_opc = vecop_list,
3498           .vece = MO_32 },
3499         { .fni8 = tcg_gen_rotr_mod_i64,
3500           .fniv = tcg_gen_rotrv_mod_vec,
3501           .fno = gen_helper_gvec_rotr64v,
3502           .opt_opc = vecop_list,
3503           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3504           .vece = MO_64 },
3505     };
3506 
3507     tcg_debug_assert(vece <= MO_64);
3508     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3509 }
3510 
3511 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
3512 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3513                            uint32_t oprsz, TCGCond cond)
3514 {
3515     TCGv_i32 t0 = tcg_temp_new_i32();
3516     TCGv_i32 t1 = tcg_temp_new_i32();
3517     uint32_t i;
3518 
3519     for (i = 0; i < oprsz; i += 4) {
3520         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
3521         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
3522         tcg_gen_setcond_i32(cond, t0, t0, t1);
3523         tcg_gen_neg_i32(t0, t0);
3524         tcg_gen_st_i32(t0, cpu_env, dofs + i);
3525     }
3526     tcg_temp_free_i32(t1);
3527     tcg_temp_free_i32(t0);
3528 }
3529 
3530 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3531                            uint32_t oprsz, TCGCond cond)
3532 {
3533     TCGv_i64 t0 = tcg_temp_new_i64();
3534     TCGv_i64 t1 = tcg_temp_new_i64();
3535     uint32_t i;
3536 
3537     for (i = 0; i < oprsz; i += 8) {
3538         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
3539         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
3540         tcg_gen_setcond_i64(cond, t0, t0, t1);
3541         tcg_gen_neg_i64(t0, t0);
3542         tcg_gen_st_i64(t0, cpu_env, dofs + i);
3543     }
3544     tcg_temp_free_i64(t1);
3545     tcg_temp_free_i64(t0);
3546 }
3547 
3548 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3549                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3550                            TCGType type, TCGCond cond)
3551 {
3552     TCGv_vec t0 = tcg_temp_new_vec(type);
3553     TCGv_vec t1 = tcg_temp_new_vec(type);
3554     uint32_t i;
3555 
3556     for (i = 0; i < oprsz; i += tysz) {
3557         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3558         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
3559         tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
3560         tcg_gen_st_vec(t0, cpu_env, dofs + i);
3561     }
3562     tcg_temp_free_vec(t1);
3563     tcg_temp_free_vec(t0);
3564 }
3565 
3566 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3567                       uint32_t aofs, uint32_t bofs,
3568                       uint32_t oprsz, uint32_t maxsz)
3569 {
3570     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3571     static gen_helper_gvec_3 * const eq_fn[4] = {
3572         gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3573         gen_helper_gvec_eq32, gen_helper_gvec_eq64
3574     };
3575     static gen_helper_gvec_3 * const ne_fn[4] = {
3576         gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3577         gen_helper_gvec_ne32, gen_helper_gvec_ne64
3578     };
3579     static gen_helper_gvec_3 * const lt_fn[4] = {
3580         gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3581         gen_helper_gvec_lt32, gen_helper_gvec_lt64
3582     };
3583     static gen_helper_gvec_3 * const le_fn[4] = {
3584         gen_helper_gvec_le8, gen_helper_gvec_le16,
3585         gen_helper_gvec_le32, gen_helper_gvec_le64
3586     };
3587     static gen_helper_gvec_3 * const ltu_fn[4] = {
3588         gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3589         gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3590     };
3591     static gen_helper_gvec_3 * const leu_fn[4] = {
3592         gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3593         gen_helper_gvec_leu32, gen_helper_gvec_leu64
3594     };
3595     static gen_helper_gvec_3 * const * const fns[16] = {
3596         [TCG_COND_EQ] = eq_fn,
3597         [TCG_COND_NE] = ne_fn,
3598         [TCG_COND_LT] = lt_fn,
3599         [TCG_COND_LE] = le_fn,
3600         [TCG_COND_LTU] = ltu_fn,
3601         [TCG_COND_LEU] = leu_fn,
3602     };
3603 
3604     const TCGOpcode *hold_list;
3605     TCGType type;
3606     uint32_t some;
3607 
3608     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3609     check_overlap_3(dofs, aofs, bofs, maxsz);
3610 
3611     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3612         do_dup(MO_8, dofs, oprsz, maxsz,
3613                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3614         return;
3615     }
3616 
3617     /*
3618      * Implement inline with a vector type, if possible.
3619      * Prefer integer when 64-bit host and 64-bit comparison.
3620      */
3621     hold_list = tcg_swap_vecop_list(cmp_list);
3622     type = choose_vector_type(cmp_list, vece, oprsz,
3623                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3624     switch (type) {
3625     case TCG_TYPE_V256:
3626         /* Recall that ARM SVE allows vector sizes that are not a
3627          * power of 2, but always a multiple of 16.  The intent is
3628          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3629          */
3630         some = QEMU_ALIGN_DOWN(oprsz, 32);
3631         expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3632         if (some == oprsz) {
3633             break;
3634         }
3635         dofs += some;
3636         aofs += some;
3637         bofs += some;
3638         oprsz -= some;
3639         maxsz -= some;
3640         /* fallthru */
3641     case TCG_TYPE_V128:
3642         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3643         break;
3644     case TCG_TYPE_V64:
3645         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3646         break;
3647 
3648     case 0:
3649         if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3650             expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3651         } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3652             expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3653         } else {
3654             gen_helper_gvec_3 * const *fn = fns[cond];
3655 
3656             if (fn == NULL) {
3657                 uint32_t tmp;
3658                 tmp = aofs, aofs = bofs, bofs = tmp;
3659                 cond = tcg_swap_cond(cond);
3660                 fn = fns[cond];
3661                 assert(fn != NULL);
3662             }
3663             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3664             oprsz = maxsz;
3665         }
3666         break;
3667 
3668     default:
3669         g_assert_not_reached();
3670     }
3671     tcg_swap_vecop_list(hold_list);
3672 
3673     if (oprsz < maxsz) {
3674         expand_clr(dofs + oprsz, maxsz - oprsz);
3675     }
3676 }
3677 
3678 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3679 {
3680     TCGv_i64 t = tcg_temp_new_i64();
3681 
3682     tcg_gen_and_i64(t, b, a);
3683     tcg_gen_andc_i64(d, c, a);
3684     tcg_gen_or_i64(d, d, t);
3685     tcg_temp_free_i64(t);
3686 }
3687 
3688 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
3689                          uint32_t bofs, uint32_t cofs,
3690                          uint32_t oprsz, uint32_t maxsz)
3691 {
3692     static const GVecGen4 g = {
3693         .fni8 = tcg_gen_bitsel_i64,
3694         .fniv = tcg_gen_bitsel_vec,
3695         .fno = gen_helper_gvec_bitsel,
3696     };
3697 
3698     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
3699 }
3700