/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "tcg/tcg.h"
#include "tcg/tcg-op.h"
#include "tcg/tcg-op-gvec.h"
#include "qemu/main-loop.h"
#include "tcg/tcg-gvec-desc.h"

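/*
 * Bound on the number of inline operations emitted per expansion;
 * see check_size_impl.  Expansions that would exceed this bound fall
 * back to a wider vector type or to an out-of-line helper.
 */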
#define MAX_UNROLL  4

#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif


/* Verify vector size and alignment rules.  OFS should be the OR of all
   of the operand offsets so that we can check them all at once.  */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t max_align;

    switch (oprsz) {
    case 8:
    case 16:
    case 32:
        tcg_debug_assert(oprsz <= maxsz);
        break;
    default:
        tcg_debug_assert(oprsz == maxsz);
        break;
    }
    tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));

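    /* Offsets and sizes must be 16-byte aligned once maxsz >= 16,
       and 8-byte aligned otherwise.  */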
    max_align = maxsz >= 16 ? 15 : 7;
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}

/* Verify vector overlap rules for two operands.  */
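/* Operands must either coincide exactly or not overlap at all.  */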
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}

/* Verify vector overlap rules for three operands.  */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}

/* Verify vector overlap rules for four operands.  */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}

/* Create a descriptor from components.  */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    check_size_align(oprsz, maxsz, 0);
    tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;

    /*
     * We have just asserted in check_size_align that either
     * oprsz is {8,16,32} or matches maxsz.  Encode the final
     * case with '2', as that would otherwise map to 24.
     */
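    /*
     * For example, oprsz == 16 with maxsz == 64 encodes the two size
     * fields as 1 and 7; oprsz == maxsz == 80 encodes maxsz as 9 and
     * oprsz as the special value 2, from which the full operation
     * size can be recovered via maxsz.
     */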
    if (oprsz == maxsz) {
        oprsz = 2;
    }

    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}

/* Generate a call to a gvec-style helper with two vector operands.  */
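/* DOFS and AOFS are byte offsets from cpu_env; the helper receives
   pointers into the CPU state plus a descriptor built by simd_desc.  */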
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and one scalar operand.  */
void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                         uint32_t oprsz, uint32_t maxsz, int32_t data,
                         gen_helper_gvec_2i *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, c, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands.  */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands.  */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with five vector operands.  */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with five vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_5_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, eofs);

    fn(a0, a1, a2, a3, a4, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code.  */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    uint32_t q, r;

    if (oprsz < lnsz) {
        return false;
    }

    q = oprsz / lnsz;
    r = oprsz % lnsz;
    tcg_debug_assert((r & 7) == 0);

    if (lnsz < 16) {
        /* For sizes below 16, accept no remainder. */
        if (r != 0) {
            return false;
        }
    } else {
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * In addition, expand_clr needs to handle a multiple of 8.
         * Thus we can handle the tail with one more operation per
         * diminishing power of 2.
         */
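        /*
         * For example, oprsz == 80 in units of 32 gives q == 2 and
         * r == 16; ctpop32(r) adds one more operation for the 16-byte
         * tail, and the total of 3 still fits within MAX_UNROLL.
         */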
        q += ctpop32(r);
    }

    return q <= MAX_UNROLL;
}

static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE.  */
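/* For example, dup_const(MO_16, 0x1234) returns 0x1234123412341234ull.  */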
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}

/* Duplicate IN into OUT as per VECE.  */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
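        /*
         * Zero-extend the low byte, then multiply by 0x01010101 to
         * replicate it into all four byte lanes.
         */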
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  If OP is 0, assume that the real operation to be performed is
 * required by all backends.  Otherwise, make sure that OP can be performed
 * on elements of size VECE in the selected type.  Do not select V64 if
 * PREFER_I64 is true.  Return 0 if no vector type is selected.
 */
static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
                                  uint32_t size, bool prefer_i64)
{
    /*
     * Recall that ARM SVE allows vector sizes that are not a
     * power of 2, but always a multiple of 16.  The intent is
     * that e.g. size == 80 would be expanded with 2x32 + 1x16.
     * It is hard to imagine a case in which v256 is supported
     * but v128 is not, but check anyway.
     * In addition, expand_clr needs to handle a multiple of 8.
     */
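    /*
     * For example, size == 80 is done as 2x32 plus a 16-byte tail, so
     * V256 is chosen only if the operation is also supported on V128;
     * an 8-byte tail (possible via expand_clr) likewise requires V64.
     */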
    if (TCG_TARGET_HAS_v256 &&
        check_size_impl(size, 32) &&
        tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
        (!(size & 16) ||
         (TCG_TARGET_HAS_v128 &&
          tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
        (!(size & 8) ||
         (TCG_TARGET_HAS_v64 &&
          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
        return TCG_TYPE_V256;
    }
    if (TCG_TARGET_HAS_v128 &&
        check_size_impl(size, 16) &&
        tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
        (!(size & 8) ||
         (TCG_TARGET_HAS_v64 &&
          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
        return TCG_TYPE_V128;
    }
    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
        && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
        return TCG_TYPE_V64;
    }
    return 0;
}

static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, TCGv_vec t_vec)
{
    uint32_t i = 0;

    tcg_debug_assert(oprsz >= 8);

    /*
     * This may be expand_clr for the tail of an operation, e.g.
     * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
     * are misaligned wrt the maximum vector size, so do that first.
     */
    if (dofs & 8) {
        tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
        i += 8;
    }

    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        for (; i + 32 <= oprsz; i += 32) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
        }
        /* fallthru */
    case TCG_TYPE_V128:
        for (; i + 16 <= oprsz; i += 16) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
        }
        break;
    case TCG_TYPE_V64:
        for (; i < oprsz; i += 8) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
        }
        break;
    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
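    /* Likewise reduce VECE to MO_8 when the constant is a replicated
       byte, so the cheapest form of duplication (including the memset
       path below when oprsz == maxsz) can be used.  */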
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
            vece = MO_8;
        } else if (in_c == dup_const(MO_8, in_c)) {
            vece = MO_8;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(NULL, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            tcg_gen_dupi_vec(vece, t_vec, in_c);
        }
        do_dup_store(type, dofs, oprsz, maxsz, t_vec);
        tcg_temp_free_vec(t_vec);
        return;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_const_i64(in_c);
            } else {
                t_32 = tcg_const_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);

    /*
     * This may be expand_clr for the tail of an operation, e.g.
     * oprsz == 8 && maxsz == 64.  The size of the clear is misaligned
     * wrt simd_desc and will assert.  Simply pass all replicated byte
     * stores through to memset.
     */
    if (oprsz == maxsz && vece == MO_8) {
        TCGv_ptr t_size = tcg_const_ptr(oprsz);
        TCGv_i32 t_val;

        if (in_32) {
            t_val = in_32;
        } else if (in_64) {
            t_val = tcg_temp_new_i32();
            tcg_gen_extrl_i64_i32(t_val, in_64);
        } else {
            t_val = tcg_const_i32(in_c);
        }
        gen_helper_memset(t_ptr, t_ptr, t_val, t_size);

        if (!in_32) {
            tcg_temp_free_i32(t_val);
        }
        tcg_temp_free_ptr(t_size);
        tcg_temp_free_ptr(t_ptr);
        return;
    }

    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_const_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
            tcg_temp_free_i64(t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else {
            t_32 = tcg_temp_new_i32();
            if (in_64) {
                tcg_gen_extrl_i64_i32(t_32, in_64);
            } else if (vece == MO_8) {
                tcg_gen_movi_i32(t_32, in_c & 0xff);
            } else if (vece == MO_16) {
                tcg_gen_movi_i32(t_32, in_c & 0xffff);
            } else {
                tcg_gen_movi_i32(t_32, in_c);
            }
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    tcg_temp_free_i32(t_desc);
    return;

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Likewise, but with zero.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}

/* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
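/* If LOAD_DEST is set, the current value of the destination element is
   loaded first, so FNI may use it as an additional input.  */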
static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i32 c, bool scalar_first,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

/* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t2);
}

/* Expand OPSZ bytes worth of four-operand operations using i32 elements.  */
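/* If WRITE_AOFS is set, FNI may also update its second (A) argument,
   and that value is stored back to AOFS for each element.  */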
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i32(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

/* Expand OPSZ bytes worth of four-operand operations using i64 elements.  */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i64(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

/* Expand OPSZ bytes worth of two-vector operands and an immediate operand
   using host vectors.  */
static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0, c);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          TCGv_vec c, bool scalar_first,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(vece, t1, c, t0);
        } else {
            fni(vece, t1, t0, c);
        }
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

/* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/*
 * Expand OPSZ bytes worth of three-vector operands and an immediate operand
 * using host vectors.
 */
static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t bofs, uint32_t oprsz, uint32_t tysz,
                          TCGType type, int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
                                      int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1, c);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t2);
}

/* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool write_aofs,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
                                     TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    TCGv_vec t3 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
        fni(vece, t0, t1, t2, t3);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_vec(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_vec(t3);
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand a vector two-operand operation.  */
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and an immediate.  */
void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            if (g->fno) {
                tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
            } else {
                TCGv_i64 tcg_c = tcg_const_i64(c);
                tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
                                    maxsz, c, g->fnoi);
                tcg_temp_free_i64(tcg_c);
            }
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and a scalar.  */
void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
{
    TCGType type;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    if (type != 0) {
        const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
        const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
        TCGv_vec t_vec = tcg_temp_new_vec(type);
        uint32_t some;

        tcg_gen_dup_i64_vec(g->vece, t_vec, c);

        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16.  The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                          t_vec, g->scalar_first, g->fniv);
            if (some == oprsz) {
                break;
            }
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */

        case TCG_TYPE_V128:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          t_vec, g->scalar_first, g->fniv);
            break;

        case TCG_TYPE_V64:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          t_vec, g->scalar_first, g->fniv);
            break;

        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(t_vec);
        tcg_swap_vecop_list(hold_list);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        TCGv_i64 t64 = tcg_temp_new_i64();

        gen_dup_i64(g->vece, t64, c);
        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
        tcg_temp_free_i64(t64);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        TCGv_i32 t32 = tcg_temp_new_i32();

        tcg_gen_extrl_i64_i32(t32, c);
        gen_dup_i32(g->vece, t32, t32);
        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
        tcg_temp_free_i32(t32);
    } else {
        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
        return;
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector three-operand operation.  */
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
                               maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with three vectors and an immediate.  */
void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                     uint32_t oprsz, uint32_t maxsz, int64_t c,
                     const GVecGen3i *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector four-operand operation.  */
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->write_aofs, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->write_aofs, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->write_aofs, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                               oprsz, maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/*
 * Expand specific vector operations.
 */

static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}

void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_mov_i64,
        .fniv = vec_mov2,
        .fno = gen_helper_gvec_mov,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
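    /* Copying a vector onto itself is a no-op; only the tail beyond
       oprsz may still need to be cleared.  */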
    if (dofs != aofs) {
        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
    } else {
        check_size_align(oprsz, maxsz, dofs);
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}

void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}

void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}

void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t maxsz)
{
    check_size_align(oprsz, maxsz, dofs);
    if (vece <= MO_64) {
        TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
        if (type != 0) {
            TCGv_vec t_vec = tcg_temp_new_vec(type);
            tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
            do_dup_store(type, dofs, oprsz, maxsz, t_vec);
            tcg_temp_free_vec(t_vec);
        } else if (vece <= MO_32) {
            TCGv_i32 in = tcg_temp_new_i32();
            switch (vece) {
            case MO_8:
                tcg_gen_ld8u_i32(in, cpu_env, aofs);
                break;
            case MO_16:
                tcg_gen_ld16u_i32(in, cpu_env, aofs);
                break;
            default:
                tcg_gen_ld_i32(in, cpu_env, aofs);
                break;
            }
            do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
            tcg_temp_free_i32(in);
        } else {
            TCGv_i64 in = tcg_temp_new_i64();
            tcg_gen_ld_i64(in, cpu_env, aofs);
            do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
            tcg_temp_free_i64(in);
        }
    } else if (vece == 4) {
        /* 128-bit duplicate.  */
        int i;

        tcg_debug_assert(oprsz >= 16);
        if (TCG_TARGET_HAS_v128) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in, cpu_env, aofs);
            for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
                tcg_gen_st_vec(in, cpu_env, dofs + i);
            }
            tcg_temp_free_vec(in);
        } else {
            TCGv_i64 in0 = tcg_temp_new_i64();
            TCGv_i64 in1 = tcg_temp_new_i64();

            tcg_gen_ld_i64(in0, cpu_env, aofs);
            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
            for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
                tcg_gen_st_i64(in0, cpu_env, dofs + i);
                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
            }
            tcg_temp_free_i64(in0);
            tcg_temp_free_i64(in1);
        }
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    } else if (vece == 5) {
        /* 256-bit duplicate.  */
        int i;

        tcg_debug_assert(oprsz >= 32);
        tcg_debug_assert(oprsz % 32 == 0);
        if (TCG_TARGET_HAS_v256) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);

            tcg_gen_ld_vec(in, cpu_env, aofs);
            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
                tcg_gen_st_vec(in, cpu_env, dofs + i);
            }
            tcg_temp_free_vec(in);
        } else if (TCG_TARGET_HAS_v128) {
            TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
            TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in0, cpu_env, aofs);
            tcg_gen_ld_vec(in1, cpu_env, aofs + 16);
            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1674                 tcg_gen_st_vec(in0, cpu_env, dofs + i);
1675                 tcg_gen_st_vec(in1, cpu_env, dofs + i + 16);
1676             }
1677             tcg_temp_free_vec(in0);
1678             tcg_temp_free_vec(in1);
1679         } else {
1680             TCGv_i64 in[4];
1681             int j;
1682 
1683             for (j = 0; j < 4; ++j) {
1684                 in[j] = tcg_temp_new_i64();
1685                 tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8);
1686             }
1687             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1688                 for (j = 0; j < 4; ++j) {
1689                     tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8);
1690                 }
1691             }
1692             for (j = 0; j < 4; ++j) {
1693                 tcg_temp_free_i64(in[j]);
1694             }
1695         }
1696         if (oprsz < maxsz) {
1697             expand_clr(dofs + oprsz, maxsz - oprsz);
1698         }
1699     } else {
1700         g_assert_not_reached();
1701     }
1702 }
1703 
1704 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
1705                           uint32_t maxsz, uint64_t x)
1706 {
1707     check_size_align(oprsz, maxsz, dofs);
1708     do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
1709 }
1710 
1711 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1712                       uint32_t oprsz, uint32_t maxsz)
1713 {
1714     static const GVecGen2 g = {
1715         .fni8 = tcg_gen_not_i64,
1716         .fniv = tcg_gen_not_vec,
1717         .fno = gen_helper_gvec_not,
1718         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1719     };
1720     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1721 }
1722 
1723 /* Perform a vector addition using normal addition and a mask.  The mask
1724    should be the sign bit of each lane.  This 6-operation form is more
1725    efficient than separate additions when there are 4 or more lanes in
1726    the 64-bit operation.  */
1727 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1728 {
1729     TCGv_i64 t1 = tcg_temp_new_i64();
1730     TCGv_i64 t2 = tcg_temp_new_i64();
1731     TCGv_i64 t3 = tcg_temp_new_i64();
1732 
1733     tcg_gen_andc_i64(t1, a, m);
1734     tcg_gen_andc_i64(t2, b, m);
1735     tcg_gen_xor_i64(t3, a, b);
1736     tcg_gen_add_i64(d, t1, t2);
1737     tcg_gen_and_i64(t3, t3, m);
1738     tcg_gen_xor_i64(d, d, t3);
1739 
1740     tcg_temp_free_i64(t1);
1741     tcg_temp_free_i64(t2);
1742     tcg_temp_free_i64(t3);
1743 }
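
/*
 * Worked example: with MO_8 lanes, m is 0x80 in every byte.  For one lane
 * with a = 0xff and b = 0x01: t1 = 0x7f, t2 = 0x01, t1 + t2 = 0x80 with no
 * carry out of the byte; (a ^ b) & m = 0x80, and the final xor gives
 * 0x00 = (0xff + 0x01) mod 0x100.  Clearing the sign bits before the add
 * is what keeps carries from leaking into the neighbouring lane.
 */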
1744 
1745 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1746 {
1747     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1748     gen_addv_mask(d, a, b, m);
1749     tcg_temp_free_i64(m);
1750 }
1751 
1752 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1753 {
1754     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1755     gen_addv_mask(d, a, b, m);
1756     tcg_temp_free_i64(m);
1757 }
1758 
1759 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1760 {
1761     TCGv_i64 t1 = tcg_temp_new_i64();
1762     TCGv_i64 t2 = tcg_temp_new_i64();
1763 
1764     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1765     tcg_gen_add_i64(t2, a, b);
1766     tcg_gen_add_i64(t1, t1, b);
1767     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1768 
1769     tcg_temp_free_i64(t1);
1770     tcg_temp_free_i64(t2);
1771 }
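
/*
 * In the 32-bit case above, t1 = (a & ~0xffffffff) + b computes the high
 * lane without interference (its low half starts at zero, so adding b
 * cannot carry into bit 32), while t2 = a + b has the correct low lane;
 * the deposit merges the two halves.
 */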
1772 
1773 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1774 
1775 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1776                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1777 {
1778     static const GVecGen3 g[4] = {
1779         { .fni8 = tcg_gen_vec_add8_i64,
1780           .fniv = tcg_gen_add_vec,
1781           .fno = gen_helper_gvec_add8,
1782           .opt_opc = vecop_list_add,
1783           .vece = MO_8 },
1784         { .fni8 = tcg_gen_vec_add16_i64,
1785           .fniv = tcg_gen_add_vec,
1786           .fno = gen_helper_gvec_add16,
1787           .opt_opc = vecop_list_add,
1788           .vece = MO_16 },
1789         { .fni4 = tcg_gen_add_i32,
1790           .fniv = tcg_gen_add_vec,
1791           .fno = gen_helper_gvec_add32,
1792           .opt_opc = vecop_list_add,
1793           .vece = MO_32 },
1794         { .fni8 = tcg_gen_add_i64,
1795           .fniv = tcg_gen_add_vec,
1796           .fno = gen_helper_gvec_add64,
1797           .opt_opc = vecop_list_add,
1798           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1799           .vece = MO_64 },
1800     };
1801 
1802     tcg_debug_assert(vece <= MO_64);
1803     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1804 }
1805 
1806 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1807                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1808 {
1809     static const GVecGen2s g[4] = {
1810         { .fni8 = tcg_gen_vec_add8_i64,
1811           .fniv = tcg_gen_add_vec,
1812           .fno = gen_helper_gvec_adds8,
1813           .opt_opc = vecop_list_add,
1814           .vece = MO_8 },
1815         { .fni8 = tcg_gen_vec_add16_i64,
1816           .fniv = tcg_gen_add_vec,
1817           .fno = gen_helper_gvec_adds16,
1818           .opt_opc = vecop_list_add,
1819           .vece = MO_16 },
1820         { .fni4 = tcg_gen_add_i32,
1821           .fniv = tcg_gen_add_vec,
1822           .fno = gen_helper_gvec_adds32,
1823           .opt_opc = vecop_list_add,
1824           .vece = MO_32 },
1825         { .fni8 = tcg_gen_add_i64,
1826           .fniv = tcg_gen_add_vec,
1827           .fno = gen_helper_gvec_adds64,
1828           .opt_opc = vecop_list_add,
1829           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1830           .vece = MO_64 },
1831     };
1832 
1833     tcg_debug_assert(vece <= MO_64);
1834     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1835 }
1836 
1837 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
1838                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1839 {
1840     TCGv_i64 tmp = tcg_const_i64(c);
1841     tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
1842     tcg_temp_free_i64(tmp);
1843 }
1844 
1845 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
1846 
1847 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1848                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1849 {
1850     static const GVecGen2s g[4] = {
1851         { .fni8 = tcg_gen_vec_sub8_i64,
1852           .fniv = tcg_gen_sub_vec,
1853           .fno = gen_helper_gvec_subs8,
1854           .opt_opc = vecop_list_sub,
1855           .vece = MO_8 },
1856         { .fni8 = tcg_gen_vec_sub16_i64,
1857           .fniv = tcg_gen_sub_vec,
1858           .fno = gen_helper_gvec_subs16,
1859           .opt_opc = vecop_list_sub,
1860           .vece = MO_16 },
1861         { .fni4 = tcg_gen_sub_i32,
1862           .fniv = tcg_gen_sub_vec,
1863           .fno = gen_helper_gvec_subs32,
1864           .opt_opc = vecop_list_sub,
1865           .vece = MO_32 },
1866         { .fni8 = tcg_gen_sub_i64,
1867           .fniv = tcg_gen_sub_vec,
1868           .fno = gen_helper_gvec_subs64,
1869           .opt_opc = vecop_list_sub,
1870           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1871           .vece = MO_64 },
1872     };
1873 
1874     tcg_debug_assert(vece <= MO_64);
1875     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1876 }
1877 
1878 /* Perform a vector subtraction using normal subtraction and a mask.
1879    Compare gen_addv_mask above.  */
1880 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1881 {
1882     TCGv_i64 t1 = tcg_temp_new_i64();
1883     TCGv_i64 t2 = tcg_temp_new_i64();
1884     TCGv_i64 t3 = tcg_temp_new_i64();
1885 
1886     tcg_gen_or_i64(t1, a, m);
1887     tcg_gen_andc_i64(t2, b, m);
1888     tcg_gen_eqv_i64(t3, a, b);
1889     tcg_gen_sub_i64(d, t1, t2);
1890     tcg_gen_and_i64(t3, t3, m);
1891     tcg_gen_xor_i64(d, d, t3);
1892 
1893     tcg_temp_free_i64(t1);
1894     tcg_temp_free_i64(t2);
1895     tcg_temp_free_i64(t3);
1896 }
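
/*
 * Worked example: with MO_8 lanes and m = 0x80 per byte, take a = 0x00,
 * b = 0x01 in one lane: t1 = a | m = 0x80, t2 = b & ~m = 0x01, and
 * t1 - t2 = 0x7f borrows nothing from the next lane; ~(a ^ b) & m = 0x80,
 * so the final xor gives 0xff = (0x00 - 0x01) mod 0x100.  Forcing the
 * minuend's sign bit on is what prevents cross-lane borrows.
 */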
1897 
1898 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1899 {
1900     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1901     gen_subv_mask(d, a, b, m);
1902     tcg_temp_free_i64(m);
1903 }
1904 
1905 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1906 {
1907     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1908     gen_subv_mask(d, a, b, m);
1909     tcg_temp_free_i64(m);
1910 }
1911 
1912 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1913 {
1914     TCGv_i64 t1 = tcg_temp_new_i64();
1915     TCGv_i64 t2 = tcg_temp_new_i64();
1916 
1917     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1918     tcg_gen_sub_i64(t2, a, b);
1919     tcg_gen_sub_i64(t1, a, t1);
1920     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1921 
1922     tcg_temp_free_i64(t1);
1923     tcg_temp_free_i64(t2);
1924 }
1925 
1926 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1927                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1928 {
1929     static const GVecGen3 g[4] = {
1930         { .fni8 = tcg_gen_vec_sub8_i64,
1931           .fniv = tcg_gen_sub_vec,
1932           .fno = gen_helper_gvec_sub8,
1933           .opt_opc = vecop_list_sub,
1934           .vece = MO_8 },
1935         { .fni8 = tcg_gen_vec_sub16_i64,
1936           .fniv = tcg_gen_sub_vec,
1937           .fno = gen_helper_gvec_sub16,
1938           .opt_opc = vecop_list_sub,
1939           .vece = MO_16 },
1940         { .fni4 = tcg_gen_sub_i32,
1941           .fniv = tcg_gen_sub_vec,
1942           .fno = gen_helper_gvec_sub32,
1943           .opt_opc = vecop_list_sub,
1944           .vece = MO_32 },
1945         { .fni8 = tcg_gen_sub_i64,
1946           .fniv = tcg_gen_sub_vec,
1947           .fno = gen_helper_gvec_sub64,
1948           .opt_opc = vecop_list_sub,
1949           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1950           .vece = MO_64 },
1951     };
1952 
1953     tcg_debug_assert(vece <= MO_64);
1954     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1955 }
1956 
1957 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
1958 
1959 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
1960                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1961 {
1962     static const GVecGen3 g[4] = {
1963         { .fniv = tcg_gen_mul_vec,
1964           .fno = gen_helper_gvec_mul8,
1965           .opt_opc = vecop_list_mul,
1966           .vece = MO_8 },
1967         { .fniv = tcg_gen_mul_vec,
1968           .fno = gen_helper_gvec_mul16,
1969           .opt_opc = vecop_list_mul,
1970           .vece = MO_16 },
1971         { .fni4 = tcg_gen_mul_i32,
1972           .fniv = tcg_gen_mul_vec,
1973           .fno = gen_helper_gvec_mul32,
1974           .opt_opc = vecop_list_mul,
1975           .vece = MO_32 },
1976         { .fni8 = tcg_gen_mul_i64,
1977           .fniv = tcg_gen_mul_vec,
1978           .fno = gen_helper_gvec_mul64,
1979           .opt_opc = vecop_list_mul,
1980           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1981           .vece = MO_64 },
1982     };
1983 
1984     tcg_debug_assert(vece <= MO_64);
1985     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1986 }
1987 
1988 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
1989                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1990 {
1991     static const GVecGen2s g[4] = {
1992         { .fniv = tcg_gen_mul_vec,
1993           .fno = gen_helper_gvec_muls8,
1994           .opt_opc = vecop_list_mul,
1995           .vece = MO_8 },
1996         { .fniv = tcg_gen_mul_vec,
1997           .fno = gen_helper_gvec_muls16,
1998           .opt_opc = vecop_list_mul,
1999           .vece = MO_16 },
2000         { .fni4 = tcg_gen_mul_i32,
2001           .fniv = tcg_gen_mul_vec,
2002           .fno = gen_helper_gvec_muls32,
2003           .opt_opc = vecop_list_mul,
2004           .vece = MO_32 },
2005         { .fni8 = tcg_gen_mul_i64,
2006           .fniv = tcg_gen_mul_vec,
2007           .fno = gen_helper_gvec_muls64,
2008           .opt_opc = vecop_list_mul,
2009           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2010           .vece = MO_64 },
2011     };
2012 
2013     tcg_debug_assert(vece <= MO_64);
2014     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2015 }
2016 
2017 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
2018                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2019 {
2020     TCGv_i64 tmp = tcg_const_i64(c);
2021     tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
2022     tcg_temp_free_i64(tmp);
2023 }
2024 
2025 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2026                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2027 {
2028     static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
2029     static const GVecGen3 g[4] = {
2030         { .fniv = tcg_gen_ssadd_vec,
2031           .fno = gen_helper_gvec_ssadd8,
2032           .opt_opc = vecop_list,
2033           .vece = MO_8 },
2034         { .fniv = tcg_gen_ssadd_vec,
2035           .fno = gen_helper_gvec_ssadd16,
2036           .opt_opc = vecop_list,
2037           .vece = MO_16 },
2038         { .fniv = tcg_gen_ssadd_vec,
2039           .fno = gen_helper_gvec_ssadd32,
2040           .opt_opc = vecop_list,
2041           .vece = MO_32 },
2042         { .fniv = tcg_gen_ssadd_vec,
2043           .fno = gen_helper_gvec_ssadd64,
2044           .opt_opc = vecop_list,
2045           .vece = MO_64 },
2046     };
2047     tcg_debug_assert(vece <= MO_64);
2048     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2049 }
2050 
2051 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
2052                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2053 {
2054     static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
2055     static const GVecGen3 g[4] = {
2056         { .fniv = tcg_gen_sssub_vec,
2057           .fno = gen_helper_gvec_sssub8,
2058           .opt_opc = vecop_list,
2059           .vece = MO_8 },
2060         { .fniv = tcg_gen_sssub_vec,
2061           .fno = gen_helper_gvec_sssub16,
2062           .opt_opc = vecop_list,
2063           .vece = MO_16 },
2064         { .fniv = tcg_gen_sssub_vec,
2065           .fno = gen_helper_gvec_sssub32,
2066           .opt_opc = vecop_list,
2067           .vece = MO_32 },
2068         { .fniv = tcg_gen_sssub_vec,
2069           .fno = gen_helper_gvec_sssub64,
2070           .opt_opc = vecop_list,
2071           .vece = MO_64 },
2072     };
2073     tcg_debug_assert(vece <= MO_64);
2074     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2075 }
2076 
2077 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2078 {
2079     TCGv_i32 max = tcg_const_i32(-1);
2080     tcg_gen_add_i32(d, a, b);
2081     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
2082     tcg_temp_free_i32(max);
2083 }
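
/*
 * After the add, d < a (unsigned) exactly when the sum wrapped, so the
 * movcond replaces the wrapped result with the saturated maximum (all ones).
 */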
2084 
2085 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2086 {
2087     TCGv_i64 max = tcg_const_i64(-1);
2088     tcg_gen_add_i64(d, a, b);
2089     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
2090     tcg_temp_free_i64(max);
2091 }
2092 
2093 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2094                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2095 {
2096     static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
2097     static const GVecGen3 g[4] = {
2098         { .fniv = tcg_gen_usadd_vec,
2099           .fno = gen_helper_gvec_usadd8,
2100           .opt_opc = vecop_list,
2101           .vece = MO_8 },
2102         { .fniv = tcg_gen_usadd_vec,
2103           .fno = gen_helper_gvec_usadd16,
2104           .opt_opc = vecop_list,
2105           .vece = MO_16 },
2106         { .fni4 = tcg_gen_usadd_i32,
2107           .fniv = tcg_gen_usadd_vec,
2108           .fno = gen_helper_gvec_usadd32,
2109           .opt_opc = vecop_list,
2110           .vece = MO_32 },
2111         { .fni8 = tcg_gen_usadd_i64,
2112           .fniv = tcg_gen_usadd_vec,
2113           .fno = gen_helper_gvec_usadd64,
2114           .opt_opc = vecop_list,
2115           .vece = MO_64 }
2116     };
2117     tcg_debug_assert(vece <= MO_64);
2118     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2119 }
2120 
2121 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2122 {
2123     TCGv_i32 min = tcg_const_i32(0);
2124     tcg_gen_sub_i32(d, a, b);
2125     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
2126     tcg_temp_free_i32(min);
2127 }
2128 
2129 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2130 {
2131     TCGv_i64 min = tcg_const_i64(0);
2132     tcg_gen_sub_i64(d, a, b);
2133     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
2134     tcg_temp_free_i64(min);
2135 }
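
/*
 * Here the test is on the operands: a < b (unsigned) means the true result
 * would be negative, so the difference is replaced with the saturated
 * minimum, zero.
 */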
2136 
2137 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
2138                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2139 {
2140     static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2141     static const GVecGen3 g[4] = {
2142         { .fniv = tcg_gen_ussub_vec,
2143           .fno = gen_helper_gvec_ussub8,
2144           .opt_opc = vecop_list,
2145           .vece = MO_8 },
2146         { .fniv = tcg_gen_ussub_vec,
2147           .fno = gen_helper_gvec_ussub16,
2148           .opt_opc = vecop_list,
2149           .vece = MO_16 },
2150         { .fni4 = tcg_gen_ussub_i32,
2151           .fniv = tcg_gen_ussub_vec,
2152           .fno = gen_helper_gvec_ussub32,
2153           .opt_opc = vecop_list,
2154           .vece = MO_32 },
2155         { .fni8 = tcg_gen_ussub_i64,
2156           .fniv = tcg_gen_ussub_vec,
2157           .fno = gen_helper_gvec_ussub64,
2158           .opt_opc = vecop_list,
2159           .vece = MO_64 }
2160     };
2161     tcg_debug_assert(vece <= MO_64);
2162     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2163 }
2164 
2165 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2166                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2167 {
2168     static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2169     static const GVecGen3 g[4] = {
2170         { .fniv = tcg_gen_smin_vec,
2171           .fno = gen_helper_gvec_smin8,
2172           .opt_opc = vecop_list,
2173           .vece = MO_8 },
2174         { .fniv = tcg_gen_smin_vec,
2175           .fno = gen_helper_gvec_smin16,
2176           .opt_opc = vecop_list,
2177           .vece = MO_16 },
2178         { .fni4 = tcg_gen_smin_i32,
2179           .fniv = tcg_gen_smin_vec,
2180           .fno = gen_helper_gvec_smin32,
2181           .opt_opc = vecop_list,
2182           .vece = MO_32 },
2183         { .fni8 = tcg_gen_smin_i64,
2184           .fniv = tcg_gen_smin_vec,
2185           .fno = gen_helper_gvec_smin64,
2186           .opt_opc = vecop_list,
2187           .vece = MO_64 }
2188     };
2189     tcg_debug_assert(vece <= MO_64);
2190     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2191 }
2192 
2193 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2194                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2195 {
2196     static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2197     static const GVecGen3 g[4] = {
2198         { .fniv = tcg_gen_umin_vec,
2199           .fno = gen_helper_gvec_umin8,
2200           .opt_opc = vecop_list,
2201           .vece = MO_8 },
2202         { .fniv = tcg_gen_umin_vec,
2203           .fno = gen_helper_gvec_umin16,
2204           .opt_opc = vecop_list,
2205           .vece = MO_16 },
2206         { .fni4 = tcg_gen_umin_i32,
2207           .fniv = tcg_gen_umin_vec,
2208           .fno = gen_helper_gvec_umin32,
2209           .opt_opc = vecop_list,
2210           .vece = MO_32 },
2211         { .fni8 = tcg_gen_umin_i64,
2212           .fniv = tcg_gen_umin_vec,
2213           .fno = gen_helper_gvec_umin64,
2214           .opt_opc = vecop_list,
2215           .vece = MO_64 }
2216     };
2217     tcg_debug_assert(vece <= MO_64);
2218     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2219 }
2220 
2221 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2222                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2223 {
2224     static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2225     static const GVecGen3 g[4] = {
2226         { .fniv = tcg_gen_smax_vec,
2227           .fno = gen_helper_gvec_smax8,
2228           .opt_opc = vecop_list,
2229           .vece = MO_8 },
2230         { .fniv = tcg_gen_smax_vec,
2231           .fno = gen_helper_gvec_smax16,
2232           .opt_opc = vecop_list,
2233           .vece = MO_16 },
2234         { .fni4 = tcg_gen_smax_i32,
2235           .fniv = tcg_gen_smax_vec,
2236           .fno = gen_helper_gvec_smax32,
2237           .opt_opc = vecop_list,
2238           .vece = MO_32 },
2239         { .fni8 = tcg_gen_smax_i64,
2240           .fniv = tcg_gen_smax_vec,
2241           .fno = gen_helper_gvec_smax64,
2242           .opt_opc = vecop_list,
2243           .vece = MO_64 }
2244     };
2245     tcg_debug_assert(vece <= MO_64);
2246     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2247 }
2248 
2249 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2250                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2251 {
2252     static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2253     static const GVecGen3 g[4] = {
2254         { .fniv = tcg_gen_umax_vec,
2255           .fno = gen_helper_gvec_umax8,
2256           .opt_opc = vecop_list,
2257           .vece = MO_8 },
2258         { .fniv = tcg_gen_umax_vec,
2259           .fno = gen_helper_gvec_umax16,
2260           .opt_opc = vecop_list,
2261           .vece = MO_16 },
2262         { .fni4 = tcg_gen_umax_i32,
2263           .fniv = tcg_gen_umax_vec,
2264           .fno = gen_helper_gvec_umax32,
2265           .opt_opc = vecop_list,
2266           .vece = MO_32 },
2267         { .fni8 = tcg_gen_umax_i64,
2268           .fniv = tcg_gen_umax_vec,
2269           .fno = gen_helper_gvec_umax64,
2270           .opt_opc = vecop_list,
2271           .vece = MO_64 }
2272     };
2273     tcg_debug_assert(vece <= MO_64);
2274     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2275 }
2276 
2277 /* Perform a vector negation using normal negation and a mask.
2278    Compare gen_subv_mask above.  */
2279 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2280 {
2281     TCGv_i64 t2 = tcg_temp_new_i64();
2282     TCGv_i64 t3 = tcg_temp_new_i64();
2283 
2284     tcg_gen_andc_i64(t3, m, b);
2285     tcg_gen_andc_i64(t2, b, m);
2286     tcg_gen_sub_i64(d, m, t2);
2287     tcg_gen_xor_i64(d, d, t3);
2288 
2289     tcg_temp_free_i64(t2);
2290     tcg_temp_free_i64(t3);
2291 }
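
/*
 * Worked example: with MO_8 lanes and m = 0x80 per byte, a lane holding
 * b = 0x01 gives t3 = m & ~b = 0x80, t2 = b & ~m = 0x01, d = m - t2 = 0x7f,
 * and d ^ t3 = 0xff = -1.  Each lane of t2 is at most 0x7f, so the
 * subtraction from m never borrows across lanes.
 */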
2292 
2293 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2294 {
2295     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
2296     gen_negv_mask(d, b, m);
2297     tcg_temp_free_i64(m);
2298 }
2299 
2300 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2301 {
2302     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
2303     gen_negv_mask(d, b, m);
2304     tcg_temp_free_i64(m);
2305 }
2306 
2307 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2308 {
2309     TCGv_i64 t1 = tcg_temp_new_i64();
2310     TCGv_i64 t2 = tcg_temp_new_i64();
2311 
2312     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2313     tcg_gen_neg_i64(t2, b);
2314     tcg_gen_neg_i64(t1, t1);
2315     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2316 
2317     tcg_temp_free_i64(t1);
2318     tcg_temp_free_i64(t2);
2319 }
2320 
2321 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2322                       uint32_t oprsz, uint32_t maxsz)
2323 {
2324     static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2325     static const GVecGen2 g[4] = {
2326         { .fni8 = tcg_gen_vec_neg8_i64,
2327           .fniv = tcg_gen_neg_vec,
2328           .fno = gen_helper_gvec_neg8,
2329           .opt_opc = vecop_list,
2330           .vece = MO_8 },
2331         { .fni8 = tcg_gen_vec_neg16_i64,
2332           .fniv = tcg_gen_neg_vec,
2333           .fno = gen_helper_gvec_neg16,
2334           .opt_opc = vecop_list,
2335           .vece = MO_16 },
2336         { .fni4 = tcg_gen_neg_i32,
2337           .fniv = tcg_gen_neg_vec,
2338           .fno = gen_helper_gvec_neg32,
2339           .opt_opc = vecop_list,
2340           .vece = MO_32 },
2341         { .fni8 = tcg_gen_neg_i64,
2342           .fniv = tcg_gen_neg_vec,
2343           .fno = gen_helper_gvec_neg64,
2344           .opt_opc = vecop_list,
2345           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2346           .vece = MO_64 },
2347     };
2348 
2349     tcg_debug_assert(vece <= MO_64);
2350     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2351 }
2352 
2353 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2354 {
2355     TCGv_i64 t = tcg_temp_new_i64();
2356     int nbit = 8 << vece;
2357 
2358     /* Create -1 for each negative element.  */
2359     tcg_gen_shri_i64(t, b, nbit - 1);
2360     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2361     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2362 
2363     /*
2364      * Invert (via xor -1) and add one.
2365      * Because the xor is applied first, every element's msb is
2366      * already clear, so the increment never carries into the next element.
2367      */
2368     tcg_gen_xor_i64(d, b, t);
2369     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2370     tcg_gen_add_i64(d, d, t);
2371 
2372     tcg_temp_free_i64(t);
2373 }
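
/*
 * Worked example: for MO_8 and a lane holding 0x85 (-123), the mask is
 * t = ((0x85 >> 7) & 1) * 0xff = 0xff, so d = 0x85 ^ 0xff = 0x7a and
 * d + (t & 1) = 0x7b = 123.  Non-negative lanes see t = 0 and pass
 * through unchanged.
 */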
2374 
2375 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2376 {
2377     gen_absv_mask(d, b, MO_8);
2378 }
2379 
2380 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2381 {
2382     gen_absv_mask(d, b, MO_16);
2383 }
2384 
2385 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2386                       uint32_t oprsz, uint32_t maxsz)
2387 {
2388     static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2389     static const GVecGen2 g[4] = {
2390         { .fni8 = tcg_gen_vec_abs8_i64,
2391           .fniv = tcg_gen_abs_vec,
2392           .fno = gen_helper_gvec_abs8,
2393           .opt_opc = vecop_list,
2394           .vece = MO_8 },
2395         { .fni8 = tcg_gen_vec_abs16_i64,
2396           .fniv = tcg_gen_abs_vec,
2397           .fno = gen_helper_gvec_abs16,
2398           .opt_opc = vecop_list,
2399           .vece = MO_16 },
2400         { .fni4 = tcg_gen_abs_i32,
2401           .fniv = tcg_gen_abs_vec,
2402           .fno = gen_helper_gvec_abs32,
2403           .opt_opc = vecop_list,
2404           .vece = MO_32 },
2405         { .fni8 = tcg_gen_abs_i64,
2406           .fniv = tcg_gen_abs_vec,
2407           .fno = gen_helper_gvec_abs64,
2408           .opt_opc = vecop_list,
2409           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2410           .vece = MO_64 },
2411     };
2412 
2413     tcg_debug_assert(vece <= MO_64);
2414     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2415 }
2416 
2417 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2418                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2419 {
2420     static const GVecGen3 g = {
2421         .fni8 = tcg_gen_and_i64,
2422         .fniv = tcg_gen_and_vec,
2423         .fno = gen_helper_gvec_and,
2424         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2425     };
2426 
2427     if (aofs == bofs) {
2428         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2429     } else {
2430         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2431     }
2432 }
2433 
2434 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2435                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2436 {
2437     static const GVecGen3 g = {
2438         .fni8 = tcg_gen_or_i64,
2439         .fniv = tcg_gen_or_vec,
2440         .fno = gen_helper_gvec_or,
2441         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2442     };
2443 
2444     if (aofs == bofs) {
2445         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2446     } else {
2447         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2448     }
2449 }
2450 
2451 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2452                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2453 {
2454     static const GVecGen3 g = {
2455         .fni8 = tcg_gen_xor_i64,
2456         .fniv = tcg_gen_xor_vec,
2457         .fno = gen_helper_gvec_xor,
2458         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2459     };
2460 
2461     if (aofs == bofs) {
2462         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2463     } else {
2464         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2465     }
2466 }
2467 
2468 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2469                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2470 {
2471     static const GVecGen3 g = {
2472         .fni8 = tcg_gen_andc_i64,
2473         .fniv = tcg_gen_andc_vec,
2474         .fno = gen_helper_gvec_andc,
2475         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2476     };
2477 
2478     if (aofs == bofs) {
2479         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2480     } else {
2481         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2482     }
2483 }
2484 
2485 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2486                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2487 {
2488     static const GVecGen3 g = {
2489         .fni8 = tcg_gen_orc_i64,
2490         .fniv = tcg_gen_orc_vec,
2491         .fno = gen_helper_gvec_orc,
2492         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2493     };
2494 
2495     if (aofs == bofs) {
2496         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2497     } else {
2498         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2499     }
2500 }
2501 
2502 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2503                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2504 {
2505     static const GVecGen3 g = {
2506         .fni8 = tcg_gen_nand_i64,
2507         .fniv = tcg_gen_nand_vec,
2508         .fno = gen_helper_gvec_nand,
2509         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2510     };
2511 
2512     if (aofs == bofs) {
2513         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2514     } else {
2515         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2516     }
2517 }
2518 
2519 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2520                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2521 {
2522     static const GVecGen3 g = {
2523         .fni8 = tcg_gen_nor_i64,
2524         .fniv = tcg_gen_nor_vec,
2525         .fno = gen_helper_gvec_nor,
2526         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2527     };
2528 
2529     if (aofs == bofs) {
2530         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2531     } else {
2532         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2533     }
2534 }
2535 
2536 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2537                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2538 {
2539     static const GVecGen3 g = {
2540         .fni8 = tcg_gen_eqv_i64,
2541         .fniv = tcg_gen_eqv_vec,
2542         .fno = gen_helper_gvec_eqv,
2543         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2544     };
2545 
2546     if (aofs == bofs) {
2547         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2548     } else {
2549         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2550     }
2551 }
2552 
2553 static const GVecGen2s gop_ands = {
2554     .fni8 = tcg_gen_and_i64,
2555     .fniv = tcg_gen_and_vec,
2556     .fno = gen_helper_gvec_ands,
2557     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2558     .vece = MO_64
2559 };
2560 
2561 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2562                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2563 {
2564     TCGv_i64 tmp = tcg_temp_new_i64();
2565     gen_dup_i64(vece, tmp, c);
2566     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2567     tcg_temp_free_i64(tmp);
2568 }
2569 
2570 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2571                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2572 {
2573     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2574     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2575     tcg_temp_free_i64(tmp);
2576 }
2577 
2578 static const GVecGen2s gop_xors = {
2579     .fni8 = tcg_gen_xor_i64,
2580     .fniv = tcg_gen_xor_vec,
2581     .fno = gen_helper_gvec_xors,
2582     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2583     .vece = MO_64
2584 };
2585 
2586 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2587                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2588 {
2589     TCGv_i64 tmp = tcg_temp_new_i64();
2590     gen_dup_i64(vece, tmp, c);
2591     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2592     tcg_temp_free_i64(tmp);
2593 }
2594 
2595 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2596                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2597 {
2598     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2599     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2600     tcg_temp_free_i64(tmp);
2601 }
2602 
2603 static const GVecGen2s gop_ors = {
2604     .fni8 = tcg_gen_or_i64,
2605     .fniv = tcg_gen_or_vec,
2606     .fno = gen_helper_gvec_ors,
2607     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2608     .vece = MO_64
2609 };
2610 
2611 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2612                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2613 {
2614     TCGv_i64 tmp = tcg_temp_new_i64();
2615     gen_dup_i64(vece, tmp, c);
2616     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2617     tcg_temp_free_i64(tmp);
2618 }
2619 
2620 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2621                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2622 {
2623     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2624     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2625     tcg_temp_free_i64(tmp);
2626 }
2627 
2628 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2629 {
2630     uint64_t mask = dup_const(MO_8, 0xff << c);
2631     tcg_gen_shli_i64(d, a, c);
2632     tcg_gen_andi_i64(d, d, mask);
2633 }
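
/*
 * For example, c == 3 keeps only the 0xf8 bits of each shifted byte, so
 * whatever the 64-bit shift pushed across a byte boundary is discarded and
 * every byte lane is shifted independently.
 */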
2634 
2635 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2636 {
2637     uint64_t mask = dup_const(MO_16, 0xffff << c);
2638     tcg_gen_shli_i64(d, a, c);
2639     tcg_gen_andi_i64(d, d, mask);
2640 }
2641 
2642 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2643                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2644 {
2645     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2646     static const GVecGen2i g[4] = {
2647         { .fni8 = tcg_gen_vec_shl8i_i64,
2648           .fniv = tcg_gen_shli_vec,
2649           .fno = gen_helper_gvec_shl8i,
2650           .opt_opc = vecop_list,
2651           .vece = MO_8 },
2652         { .fni8 = tcg_gen_vec_shl16i_i64,
2653           .fniv = tcg_gen_shli_vec,
2654           .fno = gen_helper_gvec_shl16i,
2655           .opt_opc = vecop_list,
2656           .vece = MO_16 },
2657         { .fni4 = tcg_gen_shli_i32,
2658           .fniv = tcg_gen_shli_vec,
2659           .fno = gen_helper_gvec_shl32i,
2660           .opt_opc = vecop_list,
2661           .vece = MO_32 },
2662         { .fni8 = tcg_gen_shli_i64,
2663           .fniv = tcg_gen_shli_vec,
2664           .fno = gen_helper_gvec_shl64i,
2665           .opt_opc = vecop_list,
2666           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2667           .vece = MO_64 },
2668     };
2669 
2670     tcg_debug_assert(vece <= MO_64);
2671     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2672     if (shift == 0) {
2673         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2674     } else {
2675         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2676     }
2677 }
2678 
2679 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2680 {
2681     uint64_t mask = dup_const(MO_8, 0xff >> c);
2682     tcg_gen_shri_i64(d, a, c);
2683     tcg_gen_andi_i64(d, d, mask);
2684 }
2685 
2686 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2687 {
2688     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2689     tcg_gen_shri_i64(d, a, c);
2690     tcg_gen_andi_i64(d, d, mask);
2691 }
2692 
2693 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2694                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2695 {
2696     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2697     static const GVecGen2i g[4] = {
2698         { .fni8 = tcg_gen_vec_shr8i_i64,
2699           .fniv = tcg_gen_shri_vec,
2700           .fno = gen_helper_gvec_shr8i,
2701           .opt_opc = vecop_list,
2702           .vece = MO_8 },
2703         { .fni8 = tcg_gen_vec_shr16i_i64,
2704           .fniv = tcg_gen_shri_vec,
2705           .fno = gen_helper_gvec_shr16i,
2706           .opt_opc = vecop_list,
2707           .vece = MO_16 },
2708         { .fni4 = tcg_gen_shri_i32,
2709           .fniv = tcg_gen_shri_vec,
2710           .fno = gen_helper_gvec_shr32i,
2711           .opt_opc = vecop_list,
2712           .vece = MO_32 },
2713         { .fni8 = tcg_gen_shri_i64,
2714           .fniv = tcg_gen_shri_vec,
2715           .fno = gen_helper_gvec_shr64i,
2716           .opt_opc = vecop_list,
2717           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2718           .vece = MO_64 },
2719     };
2720 
2721     tcg_debug_assert(vece <= MO_64);
2722     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2723     if (shift == 0) {
2724         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2725     } else {
2726         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2727     }
2728 }
2729 
2730 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2731 {
2732     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2733     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2734     TCGv_i64 s = tcg_temp_new_i64();
2735 
2736     tcg_gen_shri_i64(d, a, c);
2737     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2738     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2739     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2740     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2741     tcg_temp_free_i64(s);
2742 }
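
/*
 * Worked example: c == 3 and a lane holding 0x80 (-128): the shift gives
 * d = 0x10, the isolated sign bit is s = 0x10, and s * ((2 << 3) - 2) =
 * 0x10 * 14 = 0xe0 fills the three vacated high bits; (d & 0x1f) | 0xe0 =
 * 0xf0 = -16, i.e. -128 >> 3 with sign extension.
 */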
2743 
2744 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2745 {
2746     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2747     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2748     TCGv_i64 s = tcg_temp_new_i64();
2749 
2750     tcg_gen_shri_i64(d, a, c);
2751     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2752     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2753     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2754     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2755     tcg_temp_free_i64(s);
2756 }
2757 
2758 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2759                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2760 {
2761     static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
2762     static const GVecGen2i g[4] = {
2763         { .fni8 = tcg_gen_vec_sar8i_i64,
2764           .fniv = tcg_gen_sari_vec,
2765           .fno = gen_helper_gvec_sar8i,
2766           .opt_opc = vecop_list,
2767           .vece = MO_8 },
2768         { .fni8 = tcg_gen_vec_sar16i_i64,
2769           .fniv = tcg_gen_sari_vec,
2770           .fno = gen_helper_gvec_sar16i,
2771           .opt_opc = vecop_list,
2772           .vece = MO_16 },
2773         { .fni4 = tcg_gen_sari_i32,
2774           .fniv = tcg_gen_sari_vec,
2775           .fno = gen_helper_gvec_sar32i,
2776           .opt_opc = vecop_list,
2777           .vece = MO_32 },
2778         { .fni8 = tcg_gen_sari_i64,
2779           .fniv = tcg_gen_sari_vec,
2780           .fno = gen_helper_gvec_sar64i,
2781           .opt_opc = vecop_list,
2782           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2783           .vece = MO_64 },
2784     };
2785 
2786     tcg_debug_assert(vece <= MO_64);
2787     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2788     if (shift == 0) {
2789         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2790     } else {
2791         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2792     }
2793 }
2794 
2795 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2796 {
2797     uint64_t mask = dup_const(MO_8, 0xff << c);
2798 
2799     tcg_gen_shli_i64(d, a, c);
2800     tcg_gen_shri_i64(a, a, 8 - c);
2801     tcg_gen_andi_i64(d, d, mask);
2802     tcg_gen_andi_i64(a, a, ~mask);
2803     tcg_gen_or_i64(d, d, a);
2804 }
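
/*
 * The rotate is assembled from the left shift (kept below the lane mask)
 * and the right shift of the bits that wrapped around; note that the 'a'
 * operand is reused as a temporary for the wrapped bits and does not
 * survive the call.
 */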
2805 
2806 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2807 {
2808     uint64_t mask = dup_const(MO_16, 0xffff << c);
2809 
2810     tcg_gen_shli_i64(d, a, c);
2811     tcg_gen_shri_i64(a, a, 16 - c);
2812     tcg_gen_andi_i64(d, d, mask);
2813     tcg_gen_andi_i64(a, a, ~mask);
2814     tcg_gen_or_i64(d, d, a);
2815 }
2816 
2817 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
2818                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
2819 {
2820     static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
2821     static const GVecGen2i g[4] = {
2822         { .fni8 = tcg_gen_vec_rotl8i_i64,
2823           .fniv = tcg_gen_rotli_vec,
2824           .fno = gen_helper_gvec_rotl8i,
2825           .opt_opc = vecop_list,
2826           .vece = MO_8 },
2827         { .fni8 = tcg_gen_vec_rotl16i_i64,
2828           .fniv = tcg_gen_rotli_vec,
2829           .fno = gen_helper_gvec_rotl16i,
2830           .opt_opc = vecop_list,
2831           .vece = MO_16 },
2832         { .fni4 = tcg_gen_rotli_i32,
2833           .fniv = tcg_gen_rotli_vec,
2834           .fno = gen_helper_gvec_rotl32i,
2835           .opt_opc = vecop_list,
2836           .vece = MO_32 },
2837         { .fni8 = tcg_gen_rotli_i64,
2838           .fniv = tcg_gen_rotli_vec,
2839           .fno = gen_helper_gvec_rotl64i,
2840           .opt_opc = vecop_list,
2841           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2842           .vece = MO_64 },
2843     };
2844 
2845     tcg_debug_assert(vece <= MO_64);
2846     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2847     if (shift == 0) {
2848         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2849     } else {
2850         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2851     }
2852 }
2853 
2854 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
2855                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
2856 {
2857     tcg_debug_assert(vece <= MO_64);
2858     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2859     tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
2860                        oprsz, maxsz);
2861 }
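
/*
 * A right rotate by 'shift' is a left rotate by the element width minus
 * 'shift'; e.g. for MO_8 and shift == 3, -3 & 7 == 5, and rotating left
 * by 5 is the same as rotating right by 3.
 */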
2862 
2863 /*
2864  * Specialized generation of vector shifts by a non-constant scalar.
2865  */
2866 
2867 typedef struct {
2868     void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
2869     void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
2870     void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
2871     void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
2872     gen_helper_gvec_2 *fno[4];
2873     TCGOpcode s_list[2];
2874     TCGOpcode v_list[2];
2875 } GVecGen2sh;
2876 
2877 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2878                            uint32_t oprsz, uint32_t tysz, TCGType type,
2879                            TCGv_i32 shift,
2880                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
2881 {
2882     TCGv_vec t0 = tcg_temp_new_vec(type);
2883     uint32_t i;
2884 
2885     for (i = 0; i < oprsz; i += tysz) {
2886         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2887         fni(vece, t0, t0, shift);
2888         tcg_gen_st_vec(t0, cpu_env, dofs + i);
2889     }
2890     tcg_temp_free_vec(t0);
2891 }
2892 
2893 static void
2894 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
2895                uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
2896 {
2897     TCGType type;
2898     uint32_t some;
2899 
2900     check_size_align(oprsz, maxsz, dofs | aofs);
2901     check_overlap_2(dofs, aofs, maxsz);
2902 
2903     /* If the backend has a scalar expansion, great.  */
2904     type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
2905     if (type) {
2906         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2907         switch (type) {
2908         case TCG_TYPE_V256:
2909             some = QEMU_ALIGN_DOWN(oprsz, 32);
2910             expand_2sh_vec(vece, dofs, aofs, some, 32,
2911                            TCG_TYPE_V256, shift, g->fniv_s);
2912             if (some == oprsz) {
2913                 break;
2914             }
2915             dofs += some;
2916             aofs += some;
2917             oprsz -= some;
2918             maxsz -= some;
2919             /* fallthru */
2920         case TCG_TYPE_V128:
2921             expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
2922                            TCG_TYPE_V128, shift, g->fniv_s);
2923             break;
2924         case TCG_TYPE_V64:
2925             expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
2926                            TCG_TYPE_V64, shift, g->fniv_s);
2927             break;
2928         default:
2929             g_assert_not_reached();
2930         }
2931         tcg_swap_vecop_list(hold_list);
2932         goto clear_tail;
2933     }
2934 
2935     /* If the backend supports variable vector shifts, also cool.  */
2936     type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
2937     if (type) {
2938         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2939         TCGv_vec v_shift = tcg_temp_new_vec(type);
2940 
2941         if (vece == MO_64) {
2942             TCGv_i64 sh64 = tcg_temp_new_i64();
2943             tcg_gen_extu_i32_i64(sh64, shift);
2944             tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
2945             tcg_temp_free_i64(sh64);
2946         } else {
2947             tcg_gen_dup_i32_vec(vece, v_shift, shift);
2948         }
2949 
2950         switch (type) {
2951         case TCG_TYPE_V256:
2952             some = QEMU_ALIGN_DOWN(oprsz, 32);
2953             expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
2954                           v_shift, false, g->fniv_v);
2955             if (some == oprsz) {
2956                 break;
2957             }
2958             dofs += some;
2959             aofs += some;
2960             oprsz -= some;
2961             maxsz -= some;
2962             /* fallthru */
2963         case TCG_TYPE_V128:
2964             expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
2965                           v_shift, false, g->fniv_v);
2966             break;
2967         case TCG_TYPE_V64:
2968             expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
2969                           v_shift, false, g->fniv_v);
2970             break;
2971         default:
2972             g_assert_not_reached();
2973         }
2974         tcg_temp_free_vec(v_shift);
2975         tcg_swap_vecop_list(hold_list);
2976         goto clear_tail;
2977     }
2978 
2979     /* Otherwise fall back to integral... */
2980     if (vece == MO_32 && check_size_impl(oprsz, 4)) {
2981         expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
2982     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
2983         TCGv_i64 sh64 = tcg_temp_new_i64();
2984         tcg_gen_extu_i32_i64(sh64, shift);
2985         expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
2986         tcg_temp_free_i64(sh64);
2987     } else {
2988         TCGv_ptr a0 = tcg_temp_new_ptr();
2989         TCGv_ptr a1 = tcg_temp_new_ptr();
2990         TCGv_i32 desc = tcg_temp_new_i32();
2991 
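        /*
         * The run-time shift count is folded into the SIMD_DATA field of
         * the descriptor (hence data == 0 in simd_desc), where the
         * out-of-line helper can recover it.
         */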
2992         tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
2993         tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
2994         tcg_gen_addi_ptr(a0, cpu_env, dofs);
2995         tcg_gen_addi_ptr(a1, cpu_env, aofs);
2996 
2997         g->fno[vece](a0, a1, desc);
2998 
2999         tcg_temp_free_ptr(a0);
3000         tcg_temp_free_ptr(a1);
3001         tcg_temp_free_i32(desc);
3002         return;
3003     }
3004 
3005  clear_tail:
3006     if (oprsz < maxsz) {
3007         expand_clr(dofs + oprsz, maxsz - oprsz);
3008     }
3009 }
3010 
3011 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
3012                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3013 {
3014     static const GVecGen2sh g = {
3015         .fni4 = tcg_gen_shl_i32,
3016         .fni8 = tcg_gen_shl_i64,
3017         .fniv_s = tcg_gen_shls_vec,
3018         .fniv_v = tcg_gen_shlv_vec,
3019         .fno = {
3020             gen_helper_gvec_shl8i,
3021             gen_helper_gvec_shl16i,
3022             gen_helper_gvec_shl32i,
3023             gen_helper_gvec_shl64i,
3024         },
3025         .s_list = { INDEX_op_shls_vec, 0 },
3026         .v_list = { INDEX_op_shlv_vec, 0 },
3027     };
3028 
3029     tcg_debug_assert(vece <= MO_64);
3030     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3031 }
3032 
3033 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3034                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3035 {
3036     static const GVecGen2sh g = {
3037         .fni4 = tcg_gen_shr_i32,
3038         .fni8 = tcg_gen_shr_i64,
3039         .fniv_s = tcg_gen_shrs_vec,
3040         .fniv_v = tcg_gen_shrv_vec,
3041         .fno = {
3042             gen_helper_gvec_shr8i,
3043             gen_helper_gvec_shr16i,
3044             gen_helper_gvec_shr32i,
3045             gen_helper_gvec_shr64i,
3046         },
3047         .s_list = { INDEX_op_shrs_vec, 0 },
3048         .v_list = { INDEX_op_shrv_vec, 0 },
3049     };
3050 
3051     tcg_debug_assert(vece <= MO_64);
3052     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3053 }
3054 
3055 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
3056                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3057 {
3058     static const GVecGen2sh g = {
3059         .fni4 = tcg_gen_sar_i32,
3060         .fni8 = tcg_gen_sar_i64,
3061         .fniv_s = tcg_gen_sars_vec,
3062         .fniv_v = tcg_gen_sarv_vec,
3063         .fno = {
3064             gen_helper_gvec_sar8i,
3065             gen_helper_gvec_sar16i,
3066             gen_helper_gvec_sar32i,
3067             gen_helper_gvec_sar64i,
3068         },
3069         .s_list = { INDEX_op_sars_vec, 0 },
3070         .v_list = { INDEX_op_sarv_vec, 0 },
3071     };
3072 
3073     tcg_debug_assert(vece <= MO_64);
3074     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3075 }
3076 
3077 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
3078                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3079 {
3080     static const GVecGen2sh g = {
3081         .fni4 = tcg_gen_rotl_i32,
3082         .fni8 = tcg_gen_rotl_i64,
3083         .fniv_s = tcg_gen_rotls_vec,
3084         .fniv_v = tcg_gen_rotlv_vec,
3085         .fno = {
3086             gen_helper_gvec_rotl8i,
3087             gen_helper_gvec_rotl16i,
3088             gen_helper_gvec_rotl32i,
3089             gen_helper_gvec_rotl64i,
3090         },
3091         .s_list = { INDEX_op_rotls_vec, 0 },
3092         .v_list = { INDEX_op_rotlv_vec, 0 },
3093     };
3094 
3095     tcg_debug_assert(vece <= MO_64);
3096     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3097 }
3098 
3099 /*
3100  * Expand D = A << (B % element bits)
3101  *
3102  * Unlike scalar shifts, it is not easy for the target front end
3103  * to include the modulo as part of the expansion, so we do it here.
3104  * If the target naturally includes the modulo as part of the
3105  * operation, great!  If the target has some other behaviour for
3106  * out-of-range shifts, then it could not use this function anyway,
3107  * and would need to do its own expansion with custom functions.
3108  */
3109 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
3110                                  TCGv_vec a, TCGv_vec b)
3111 {
3112     TCGv_vec t = tcg_temp_new_vec_matching(d);
3113 
3114     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3115     tcg_gen_and_vec(vece, t, t, b);
3116     tcg_gen_shlv_vec(vece, d, a, t);
3117     tcg_temp_free_vec(t);
3118 }
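
/*
 * For example, with vece == MO_16 a per-lane shift count of 18 is masked
 * with (8 << MO_16) - 1 == 15 down to 2, matching the "B % element bits"
 * semantics described above.
 */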
3119 
3120 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3121 {
3122     TCGv_i32 t = tcg_temp_new_i32();
3123 
3124     tcg_gen_andi_i32(t, b, 31);
3125     tcg_gen_shl_i32(d, a, t);
3126     tcg_temp_free_i32(t);
3127 }
3128 
3129 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3130 {
3131     TCGv_i64 t = tcg_temp_new_i64();
3132 
3133     tcg_gen_andi_i64(t, b, 63);
3134     tcg_gen_shl_i64(d, a, t);
3135     tcg_temp_free_i64(t);
3136 }
3137 
3138 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3139                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3140 {
3141     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
3142     static const GVecGen3 g[4] = {
3143         { .fniv = tcg_gen_shlv_mod_vec,
3144           .fno = gen_helper_gvec_shl8v,
3145           .opt_opc = vecop_list,
3146           .vece = MO_8 },
3147         { .fniv = tcg_gen_shlv_mod_vec,
3148           .fno = gen_helper_gvec_shl16v,
3149           .opt_opc = vecop_list,
3150           .vece = MO_16 },
3151         { .fni4 = tcg_gen_shl_mod_i32,
3152           .fniv = tcg_gen_shlv_mod_vec,
3153           .fno = gen_helper_gvec_shl32v,
3154           .opt_opc = vecop_list,
3155           .vece = MO_32 },
3156         { .fni8 = tcg_gen_shl_mod_i64,
3157           .fniv = tcg_gen_shlv_mod_vec,
3158           .fno = gen_helper_gvec_shl64v,
3159           .opt_opc = vecop_list,
3160           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3161           .vece = MO_64 },
3162     };
3163 
3164     tcg_debug_assert(vece <= MO_64);
3165     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3166 }
3167 
3168 /*
3169  * Similarly for logical right shifts.
3170  */
3171 
3172 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
3173                                  TCGv_vec a, TCGv_vec b)
3174 {
3175     TCGv_vec t = tcg_temp_new_vec_matching(d);
3176 
3177     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3178     tcg_gen_and_vec(vece, t, t, b);
3179     tcg_gen_shrv_vec(vece, d, a, t);
3180     tcg_temp_free_vec(t);
3181 }
3182 
3183 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3184 {
3185     TCGv_i32 t = tcg_temp_new_i32();
3186 
3187     tcg_gen_andi_i32(t, b, 31);
3188     tcg_gen_shr_i32(d, a, t);
3189     tcg_temp_free_i32(t);
3190 }
3191 
3192 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3193 {
3194     TCGv_i64 t = tcg_temp_new_i64();
3195 
3196     tcg_gen_andi_i64(t, b, 63);
3197     tcg_gen_shr_i64(d, a, t);
3198     tcg_temp_free_i64(t);
3199 }
3200 
3201 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3202                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3203 {
3204     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
3205     static const GVecGen3 g[4] = {
3206         { .fniv = tcg_gen_shrv_mod_vec,
3207           .fno = gen_helper_gvec_shr8v,
3208           .opt_opc = vecop_list,
3209           .vece = MO_8 },
3210         { .fniv = tcg_gen_shrv_mod_vec,
3211           .fno = gen_helper_gvec_shr16v,
3212           .opt_opc = vecop_list,
3213           .vece = MO_16 },
3214         { .fni4 = tcg_gen_shr_mod_i32,
3215           .fniv = tcg_gen_shrv_mod_vec,
3216           .fno = gen_helper_gvec_shr32v,
3217           .opt_opc = vecop_list,
3218           .vece = MO_32 },
3219         { .fni8 = tcg_gen_shr_mod_i64,
3220           .fniv = tcg_gen_shrv_mod_vec,
3221           .fno = gen_helper_gvec_shr64v,
3222           .opt_opc = vecop_list,
3223           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3224           .vece = MO_64 },
3225     };
3226 
3227     tcg_debug_assert(vece <= MO_64);
3228     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3229 }
3230 
3231 /*
3232  * Similarly for arithmetic right shifts.
3233  */
3234 
3235 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3236                                  TCGv_vec a, TCGv_vec b)
3237 {
3238     TCGv_vec t = tcg_temp_new_vec_matching(d);
3239 
3240     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3241     tcg_gen_and_vec(vece, t, t, b);
3242     tcg_gen_sarv_vec(vece, d, a, t);
3243     tcg_temp_free_vec(t);
3244 }
3245 
3246 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3247 {
3248     TCGv_i32 t = tcg_temp_new_i32();
3249 
3250     tcg_gen_andi_i32(t, b, 31);
3251     tcg_gen_sar_i32(d, a, t);
3252     tcg_temp_free_i32(t);
3253 }
3254 
3255 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3256 {
3257     TCGv_i64 t = tcg_temp_new_i64();
3258 
3259     tcg_gen_andi_i64(t, b, 63);
3260     tcg_gen_sar_i64(d, a, t);
3261     tcg_temp_free_i64(t);
3262 }
3263 
3264 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3265                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3266 {
3267     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3268     static const GVecGen3 g[4] = {
3269         { .fniv = tcg_gen_sarv_mod_vec,
3270           .fno = gen_helper_gvec_sar8v,
3271           .opt_opc = vecop_list,
3272           .vece = MO_8 },
3273         { .fniv = tcg_gen_sarv_mod_vec,
3274           .fno = gen_helper_gvec_sar16v,
3275           .opt_opc = vecop_list,
3276           .vece = MO_16 },
3277         { .fni4 = tcg_gen_sar_mod_i32,
3278           .fniv = tcg_gen_sarv_mod_vec,
3279           .fno = gen_helper_gvec_sar32v,
3280           .opt_opc = vecop_list,
3281           .vece = MO_32 },
3282         { .fni8 = tcg_gen_sar_mod_i64,
3283           .fniv = tcg_gen_sarv_mod_vec,
3284           .fno = gen_helper_gvec_sar64v,
3285           .opt_opc = vecop_list,
3286           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3287           .vece = MO_64 },
3288     };
3289 
3290     tcg_debug_assert(vece <= MO_64);
3291     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3292 }
3293 
3294 /*
3295  * Similarly for left rotates.
3296  */
3297 
3298 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
3299                                   TCGv_vec a, TCGv_vec b)
3300 {
3301     TCGv_vec t = tcg_temp_new_vec_matching(d);
3302 
3303     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3304     tcg_gen_and_vec(vece, t, t, b);
3305     tcg_gen_rotlv_vec(vece, d, a, t);
3306     tcg_temp_free_vec(t);
3307 }
3308 
3309 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3310 {
3311     TCGv_i32 t = tcg_temp_new_i32();
3312 
3313     tcg_gen_andi_i32(t, b, 31);
3314     tcg_gen_rotl_i32(d, a, t);
3315     tcg_temp_free_i32(t);
3316 }
3317 
3318 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3319 {
3320     TCGv_i64 t = tcg_temp_new_i64();
3321 
3322     tcg_gen_andi_i64(t, b, 63);
3323     tcg_gen_rotl_i64(d, a, t);
3324     tcg_temp_free_i64(t);
3325 }
3326 
3327 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3328                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3329 {
3330     static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
3331     static const GVecGen3 g[4] = {
3332         { .fniv = tcg_gen_rotlv_mod_vec,
3333           .fno = gen_helper_gvec_rotl8v,
3334           .opt_opc = vecop_list,
3335           .vece = MO_8 },
3336         { .fniv = tcg_gen_rotlv_mod_vec,
3337           .fno = gen_helper_gvec_rotl16v,
3338           .opt_opc = vecop_list,
3339           .vece = MO_16 },
3340         { .fni4 = tcg_gen_rotl_mod_i32,
3341           .fniv = tcg_gen_rotlv_mod_vec,
3342           .fno = gen_helper_gvec_rotl32v,
3343           .opt_opc = vecop_list,
3344           .vece = MO_32 },
3345         { .fni8 = tcg_gen_rotl_mod_i64,
3346           .fniv = tcg_gen_rotlv_mod_vec,
3347           .fno = gen_helper_gvec_rotl64v,
3348           .opt_opc = vecop_list,
3349           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3350           .vece = MO_64 },
3351     };
3352 
3353     tcg_debug_assert(vece <= MO_64);
3354     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3355 }
3356 
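     /*
      * Similarly for right rotates.
      */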
3357 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
3358                                   TCGv_vec a, TCGv_vec b)
3359 {
3360     TCGv_vec t = tcg_temp_new_vec_matching(d);
3361 
3362     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3363     tcg_gen_and_vec(vece, t, t, b);
3364     tcg_gen_rotrv_vec(vece, d, a, t);
3365     tcg_temp_free_vec(t);
3366 }
3367 
3368 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3369 {
3370     TCGv_i32 t = tcg_temp_new_i32();
3371 
3372     tcg_gen_andi_i32(t, b, 31);
3373     tcg_gen_rotr_i32(d, a, t);
3374     tcg_temp_free_i32(t);
3375 }
3376 
3377 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3378 {
3379     TCGv_i64 t = tcg_temp_new_i64();
3380 
3381     tcg_gen_andi_i64(t, b, 63);
3382     tcg_gen_rotr_i64(d, a, t);
3383     tcg_temp_free_i64(t);
3384 }
3385 
3386 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3387                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3388 {
3389     static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
3390     static const GVecGen3 g[4] = {
3391         { .fniv = tcg_gen_rotrv_mod_vec,
3392           .fno = gen_helper_gvec_rotr8v,
3393           .opt_opc = vecop_list,
3394           .vece = MO_8 },
3395         { .fniv = tcg_gen_rotrv_mod_vec,
3396           .fno = gen_helper_gvec_rotr16v,
3397           .opt_opc = vecop_list,
3398           .vece = MO_16 },
3399         { .fni4 = tcg_gen_rotr_mod_i32,
3400           .fniv = tcg_gen_rotrv_mod_vec,
3401           .fno = gen_helper_gvec_rotr32v,
3402           .opt_opc = vecop_list,
3403           .vece = MO_32 },
3404         { .fni8 = tcg_gen_rotr_mod_i64,
3405           .fniv = tcg_gen_rotrv_mod_vec,
3406           .fno = gen_helper_gvec_rotr64v,
3407           .opt_opc = vecop_list,
3408           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3409           .vece = MO_64 },
3410     };
3411 
3412     tcg_debug_assert(vece <= MO_64);
3413     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3414 }
3415 
3416 /* Expand OPRSZ bytes worth of comparison operations using i32 elements.  */
3417 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3418                            uint32_t oprsz, TCGCond cond)
3419 {
3420     TCGv_i32 t0 = tcg_temp_new_i32();
3421     TCGv_i32 t1 = tcg_temp_new_i32();
3422     uint32_t i;
3423 
3424     for (i = 0; i < oprsz; i += 4) {
3425         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
3426         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
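             /* setcond produces 0/1; negate it to form the 0/-1 element mask.  */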
3427         tcg_gen_setcond_i32(cond, t0, t0, t1);
3428         tcg_gen_neg_i32(t0, t0);
3429         tcg_gen_st_i32(t0, cpu_env, dofs + i);
3430     }
3431     tcg_temp_free_i32(t1);
3432     tcg_temp_free_i32(t0);
3433 }
3434 
3435 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3436                            uint32_t oprsz, TCGCond cond)
3437 {
3438     TCGv_i64 t0 = tcg_temp_new_i64();
3439     TCGv_i64 t1 = tcg_temp_new_i64();
3440     uint32_t i;
3441 
3442     for (i = 0; i < oprsz; i += 8) {
3443         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
3444         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
3445         tcg_gen_setcond_i64(cond, t0, t0, t1);
3446         tcg_gen_neg_i64(t0, t0);
3447         tcg_gen_st_i64(t0, cpu_env, dofs + i);
3448     }
3449     tcg_temp_free_i64(t1);
3450     tcg_temp_free_i64(t0);
3451 }
3452 
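     /*
      * As for the integer versions above, but tcg_gen_cmp_vec already
      * produces the 0/-1 element mask, so no negation is required.
      */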
3453 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3454                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3455                            TCGType type, TCGCond cond)
3456 {
3457     TCGv_vec t0 = tcg_temp_new_vec(type);
3458     TCGv_vec t1 = tcg_temp_new_vec(type);
3459     uint32_t i;
3460 
3461     for (i = 0; i < oprsz; i += tysz) {
3462         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3463         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
3464         tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
3465         tcg_gen_st_vec(t0, cpu_env, dofs + i);
3466     }
3467     tcg_temp_free_vec(t1);
3468     tcg_temp_free_vec(t0);
3469 }
3470 
3471 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3472                       uint32_t aofs, uint32_t bofs,
3473                       uint32_t oprsz, uint32_t maxsz)
3474 {
3475     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3476     static gen_helper_gvec_3 * const eq_fn[4] = {
3477         gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3478         gen_helper_gvec_eq32, gen_helper_gvec_eq64
3479     };
3480     static gen_helper_gvec_3 * const ne_fn[4] = {
3481         gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3482         gen_helper_gvec_ne32, gen_helper_gvec_ne64
3483     };
3484     static gen_helper_gvec_3 * const lt_fn[4] = {
3485         gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3486         gen_helper_gvec_lt32, gen_helper_gvec_lt64
3487     };
3488     static gen_helper_gvec_3 * const le_fn[4] = {
3489         gen_helper_gvec_le8, gen_helper_gvec_le16,
3490         gen_helper_gvec_le32, gen_helper_gvec_le64
3491     };
3492     static gen_helper_gvec_3 * const ltu_fn[4] = {
3493         gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3494         gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3495     };
3496     static gen_helper_gvec_3 * const leu_fn[4] = {
3497         gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3498         gen_helper_gvec_leu32, gen_helper_gvec_leu64
3499     };
3500     static gen_helper_gvec_3 * const * const fns[16] = {
3501         [TCG_COND_EQ] = eq_fn,
3502         [TCG_COND_NE] = ne_fn,
3503         [TCG_COND_LT] = lt_fn,
3504         [TCG_COND_LE] = le_fn,
3505         [TCG_COND_LTU] = ltu_fn,
3506         [TCG_COND_LEU] = leu_fn,
3507     };
3508 
3509     const TCGOpcode *hold_list;
3510     TCGType type;
3511     uint32_t some;
3512 
3513     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3514     check_overlap_3(dofs, aofs, bofs, maxsz);
3515 
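         /* Trivial conditions: fill the destination with 0 (NEVER) or -1 (ALWAYS).  */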
3516     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3517         do_dup(MO_8, dofs, oprsz, maxsz,
3518                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3519         return;
3520     }
3521 
3522     /*
3523      * Implement inline with a vector type, if possible.
3524      * Prefer integer when 64-bit host and 64-bit comparison.
3525      */
3526     hold_list = tcg_swap_vecop_list(cmp_list);
3527     type = choose_vector_type(cmp_list, vece, oprsz,
3528                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3529     switch (type) {
3530     case TCG_TYPE_V256:
3531         /* Recall that ARM SVE allows vector sizes that are not a
3532          * power of 2, but always a multiple of 16.  The intent is
3533          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3534          */
3535         some = QEMU_ALIGN_DOWN(oprsz, 32);
3536         expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3537         if (some == oprsz) {
3538             break;
3539         }
3540         dofs += some;
3541         aofs += some;
3542         bofs += some;
3543         oprsz -= some;
3544         maxsz -= some;
3545         /* fallthru */
3546     case TCG_TYPE_V128:
3547         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3548         break;
3549     case TCG_TYPE_V64:
3550         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3551         break;
3552 
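         /* No usable vector type: fall back to integer or out-of-line expansion.  */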
3553     case 0:
3554         if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3555             expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3556         } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3557             expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3558         } else {
3559             gen_helper_gvec_3 * const *fn = fns[cond];
3560 
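                 /*
                  * Out-of-line helpers exist only for EQ, NE, LT, LE, LTU and LEU;
                  * for the remaining conditions, swap the operands and reverse
                  * the condition to map onto one of those entries.
                  */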
3561             if (fn == NULL) {
3562                 uint32_t tmp;
3563                 tmp = aofs, aofs = bofs, bofs = tmp;
3564                 cond = tcg_swap_cond(cond);
3565                 fn = fns[cond];
3566                 assert(fn != NULL);
3567             }
3568             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
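                 /*
                  * The out-of-line helper clears the excess up to maxsz itself,
                  * so the expand_clr at the end of this function is skipped.
                  */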
3569             oprsz = maxsz;
3570         }
3571         break;
3572 
3573     default:
3574         g_assert_not_reached();
3575     }
3576     tcg_swap_vecop_list(hold_list);
3577 
3578     if (oprsz < maxsz) {
3579         expand_clr(dofs + oprsz, maxsz - oprsz);
3580     }
3581 }
3582 
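     /* Compute d = (b & a) | (c & ~a): each bit of a selects b (1) or c (0).  */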
3583 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3584 {
3585     TCGv_i64 t = tcg_temp_new_i64();
3586 
3587     tcg_gen_and_i64(t, b, a);
3588     tcg_gen_andc_i64(d, c, a);
3589     tcg_gen_or_i64(d, d, t);
3590     tcg_temp_free_i64(t);
3591 }
3592 
3593 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
3594                          uint32_t bofs, uint32_t cofs,
3595                          uint32_t oprsz, uint32_t maxsz)
3596 {
3597     static const GVecGen4 g = {
3598         .fni8 = tcg_gen_bitsel_i64,
3599         .fniv = tcg_gen_bitsel_vec,
3600         .fno = gen_helper_gvec_bitsel,
3601     };
3602 
3603     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
3604 }
3605