/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "tcg/tcg.h"
#include "tcg/tcg-op.h"
#include "tcg/tcg-op-gvec.h"
#include "qemu/main-loop.h"
#include "tcg/tcg-gvec-desc.h"

#define MAX_UNROLL  4

#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif


/* Verify vector size and alignment rules.  OFS should be the OR of all
   of the operand offsets so that we can check them all at once.  */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t opr_align = oprsz >= 16 ? 15 : 7;
    uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
    tcg_debug_assert(oprsz > 0);
    tcg_debug_assert(oprsz <= maxsz);
    tcg_debug_assert((oprsz & opr_align) == 0);
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}

/* Verify vector overlap rules for two operands.  */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}

/* Verify vector overlap rules for three operands.  */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}

/* Verify vector overlap rules for four operands.  */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}

/* Create a descriptor from components.  */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
    assert(data == sextract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;
    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}

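/*
 * For reference, an out-of-line helper recovers these fields with the
 * simd_oprsz(), simd_maxsz() and simd_data() accessors from
 * tcg-gvec-desc.h.  A minimal sketch of such a helper (the helper name
 * and the unary operation are illustrative only):
 *
 *     void HELPER(gvec_foo)(void *d, void *a, uint32_t desc)
 *     {
 *         intptr_t oprsz = simd_oprsz(desc);
 *         intptr_t i;
 *
 *         for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
 *             *(uint64_t *)(d + i) = ~*(uint64_t *)(a + i);
 *         }
 *     }
 *
 * E.g. simd_desc(16, 16, 0) packs oprsz == maxsz == 16 into the two
 * size fields and leaves the data field zero; a real helper would also
 * clear any bytes between oprsz and maxsz.
 */
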
/* Generate a call to a gvec-style helper with two vector operands.  */
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

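/*
 * Typical use, taking the gen_helper_gvec_not helper referenced later
 * in this file as an example:
 *
 *     tcg_gen_gvec_2_ool(dofs, aofs, 16, 16, 0, gen_helper_gvec_not);
 *
 * emits one call inverting 16 bytes of cpu_env state at aofs into dofs.
 */
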
/* Generate a call to a gvec-style helper with two vector operands
   and one scalar operand.  */
void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                         uint32_t oprsz, uint32_t maxsz, int32_t data,
                         gen_helper_gvec_2i *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, c, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands.  */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands.  */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with five vector operands.  */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with five vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_5_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, eofs);

    fn(a0, a1, a2, a3, a4, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code.  */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    if (oprsz % lnsz == 0) {
        uint32_t lnct = oprsz / lnsz;
        return lnct >= 1 && lnct <= MAX_UNROLL;
    }
    return false;
}

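/*
 * With MAX_UNROLL == 4, e.g. check_size_impl(32, 8) is true (four
 * 8-byte lines), while check_size_impl(40, 8) is false (five lines
 * exceed the unroll limit), pushing the expansion out of line.
 */
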
static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE.  */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}

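/*
 * E.g. dup_const(MO_8, 0x3f) == 0x3f3f3f3f3f3f3f3full and
 * dup_const(MO_16, 0x1234) == 0x1234123412341234ull.
 */
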
/* Duplicate IN into OUT as per VECE.  */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

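/*
 * The multiplications above replicate the zero-extended input:
 * e.g. 0xab * 0x0101010101010101ull == 0xababababababababull, since
 * each set byte of the multiplier shifts in one non-overlapping copy.
 */
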
/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  If LIST is NULL, assume that the real operation to be performed
 * is required by all backends.  Otherwise, make sure that the opcodes in
 * LIST can be performed on elements of size VECE in the selected type.
 * Do not select V64 if PREFER_I64 is true.  Return 0 if no vector type
 * is selected.
 */
static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
                                  uint32_t size, bool prefer_i64)
{
    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * It is hard to imagine a case in which v256 is supported
         * but v128 is not, but check anyway.
         */
        if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece)
            && (size % 32 == 0
                || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) {
            return TCG_TYPE_V256;
        }
    }
    if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
        && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) {
        return TCG_TYPE_V128;
    }
    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
        && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
        return TCG_TYPE_V64;
    }
    return 0;
}

static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, TCGv_vec t_vec)
{
    uint32_t i = 0;

    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        for (; i + 32 <= oprsz; i += 32) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
        }
        /* fallthru */
    case TCG_TYPE_V128:
        for (; i + 16 <= oprsz; i += 16) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
        }
        break;
    case TCG_TYPE_V64:
        for (; i < oprsz; i += 8) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
        }
        break;
    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(NULL, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            tcg_gen_dupi_vec(vece, t_vec, in_c);
        }
        do_dup_store(type, dofs, oprsz, maxsz, t_vec);
        tcg_temp_free_vec(t_vec);
        return;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_const_i64(in_c);
            } else {
                t_32 = tcg_const_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_const_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
            tcg_temp_free_i64(t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else {
            t_32 = tcg_temp_new_i32();
            if (in_64) {
                tcg_gen_extrl_i64_i32(t_32, in_64);
            } else if (vece == MO_8) {
                tcg_gen_movi_i32(t_32, in_c & 0xff);
            } else if (vece == MO_16) {
                tcg_gen_movi_i32(t_32, in_c & 0xffff);
            } else {
                tcg_gen_movi_i32(t_32, in_c);
            }
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    tcg_temp_free_i32(t_desc);
    return;

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Likewise, but with zero.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i32 elements.  */
static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i32 c, bool scalar_first,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i32 elements.  */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t2);
}

/* Expand OPRSZ bytes worth of four-operand operations using i32 elements.  */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i32(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i64 elements.  */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i64 elements.  */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

/* Expand OPRSZ bytes worth of four-operand operations using i64 elements.  */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i64(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using host vectors.  */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        fni(vece, t0, t0);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
}

/* Expand OPRSZ bytes worth of two-vector operands and an immediate operand
   using host vectors.  */
static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0, c);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          TCGv_vec c, bool scalar_first,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(vece, t1, c, t0);
        } else {
            fni(vece, t1, t0, c);
        }
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using host vectors.  */
static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/*
 * Expand OPRSZ bytes worth of three-vector operands and an immediate operand
 * using host vectors.
 */
static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t bofs, uint32_t oprsz, uint32_t tysz,
                          TCGType type, int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
                                      int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1, c);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t2);
}

/* Expand OPRSZ bytes worth of four-operand operations using host vectors.  */
996 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
997                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
998                          uint32_t tysz, TCGType type, bool write_aofs,
999                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1000                                      TCGv_vec, TCGv_vec))
1001 {
1002     TCGv_vec t0 = tcg_temp_new_vec(type);
1003     TCGv_vec t1 = tcg_temp_new_vec(type);
1004     TCGv_vec t2 = tcg_temp_new_vec(type);
1005     TCGv_vec t3 = tcg_temp_new_vec(type);
1006     uint32_t i;
1007 
1008     for (i = 0; i < oprsz; i += tysz) {
1009         tcg_gen_ld_vec(t1, cpu_env, aofs + i);
1010         tcg_gen_ld_vec(t2, cpu_env, bofs + i);
1011         tcg_gen_ld_vec(t3, cpu_env, cofs + i);
1012         fni(vece, t0, t1, t2, t3);
1013         tcg_gen_st_vec(t0, cpu_env, dofs + i);
1014         if (write_aofs) {
1015             tcg_gen_st_vec(t1, cpu_env, aofs + i);
1016         }
1017     }
1018     tcg_temp_free_vec(t3);
1019     tcg_temp_free_vec(t2);
1020     tcg_temp_free_vec(t1);
1021     tcg_temp_free_vec(t0);
1022 }
1023 
1024 /* Expand a vector two-operand operation.  */
1025 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
1026                     uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
1027 {
1028     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1029     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1030     TCGType type;
1031     uint32_t some;
1032 
1033     check_size_align(oprsz, maxsz, dofs | aofs);
1034     check_overlap_2(dofs, aofs, maxsz);
1035 
1036     type = 0;
1037     if (g->fniv) {
1038         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1039     }
1040     switch (type) {
1041     case TCG_TYPE_V256:
1042         /* Recall that ARM SVE allows vector sizes that are not a
1043          * power of 2, but always a multiple of 16.  The intent is
1044          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1045          */
1046         some = QEMU_ALIGN_DOWN(oprsz, 32);
1047         expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
1048         if (some == oprsz) {
1049             break;
1050         }
1051         dofs += some;
1052         aofs += some;
1053         oprsz -= some;
1054         maxsz -= some;
1055         /* fallthru */
1056     case TCG_TYPE_V128:
1057         expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
1058         break;
1059     case TCG_TYPE_V64:
1060         expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
1061         break;
1062 
1063     case 0:
1064         if (g->fni8 && check_size_impl(oprsz, 8)) {
1065             expand_2_i64(dofs, aofs, oprsz, g->fni8);
1066         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1067             expand_2_i32(dofs, aofs, oprsz, g->fni4);
1068         } else {
1069             assert(g->fno != NULL);
1070             tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
1071             oprsz = maxsz;
1072         }
1073         break;
1074 
1075     default:
1076         g_assert_not_reached();
1077     }
1078     tcg_swap_vecop_list(hold_list);
1079 
1080     if (oprsz < maxsz) {
1081         expand_clr(dofs + oprsz, maxsz - oprsz);
1082     }
1083 }
1084 
1085 /* Expand a vector operation with two vectors and an immediate.  */
1086 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1087                      uint32_t maxsz, int64_t c, const GVecGen2i *g)
1088 {
1089     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1090     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1091     TCGType type;
1092     uint32_t some;
1093 
1094     check_size_align(oprsz, maxsz, dofs | aofs);
1095     check_overlap_2(dofs, aofs, maxsz);
1096 
1097     type = 0;
1098     if (g->fniv) {
1099         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1100     }
1101     switch (type) {
1102     case TCG_TYPE_V256:
1103         /* Recall that ARM SVE allows vector sizes that are not a
1104          * power of 2, but always a multiple of 16.  The intent is
1105          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1106          */
1107         some = QEMU_ALIGN_DOWN(oprsz, 32);
1108         expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1109                       c, g->load_dest, g->fniv);
1110         if (some == oprsz) {
1111             break;
1112         }
1113         dofs += some;
1114         aofs += some;
1115         oprsz -= some;
1116         maxsz -= some;
1117         /* fallthru */
1118     case TCG_TYPE_V128:
1119         expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1120                       c, g->load_dest, g->fniv);
1121         break;
1122     case TCG_TYPE_V64:
1123         expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1124                       c, g->load_dest, g->fniv);
1125         break;
1126 
1127     case 0:
1128         if (g->fni8 && check_size_impl(oprsz, 8)) {
1129             expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1130         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1131             expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1132         } else {
1133             if (g->fno) {
1134                 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1135             } else {
1136                 TCGv_i64 tcg_c = tcg_const_i64(c);
1137                 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1138                                     maxsz, c, g->fnoi);
1139                 tcg_temp_free_i64(tcg_c);
1140             }
1141             oprsz = maxsz;
1142         }
1143         break;
1144 
1145     default:
1146         g_assert_not_reached();
1147     }
1148     tcg_swap_vecop_list(hold_list);
1149 
1150     if (oprsz < maxsz) {
1151         expand_clr(dofs + oprsz, maxsz - oprsz);
1152     }
1153 }
1154 
1155 /* Expand a vector operation with two vectors and a scalar.  */
1156 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1157                      uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1158 {
1159     TCGType type;
1160 
1161     check_size_align(oprsz, maxsz, dofs | aofs);
1162     check_overlap_2(dofs, aofs, maxsz);
1163 
1164     type = 0;
1165     if (g->fniv) {
1166         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1167     }
1168     if (type != 0) {
1169         const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1170         const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1171         TCGv_vec t_vec = tcg_temp_new_vec(type);
1172         uint32_t some;
1173 
1174         tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1175 
1176         switch (type) {
1177         case TCG_TYPE_V256:
1178             /* Recall that ARM SVE allows vector sizes that are not a
1179              * power of 2, but always a multiple of 16.  The intent is
1180              * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1181              */
1182             some = QEMU_ALIGN_DOWN(oprsz, 32);
1183             expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1184                           t_vec, g->scalar_first, g->fniv);
1185             if (some == oprsz) {
1186                 break;
1187             }
1188             dofs += some;
1189             aofs += some;
1190             oprsz -= some;
1191             maxsz -= some;
1192             /* fallthru */
1193 
1194         case TCG_TYPE_V128:
1195             expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1196                           t_vec, g->scalar_first, g->fniv);
1197             break;
1198 
1199         case TCG_TYPE_V64:
1200             expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1201                           t_vec, g->scalar_first, g->fniv);
1202             break;
1203 
1204         default:
1205             g_assert_not_reached();
1206         }
1207         tcg_temp_free_vec(t_vec);
1208         tcg_swap_vecop_list(hold_list);
1209     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1210         TCGv_i64 t64 = tcg_temp_new_i64();
1211 
1212         gen_dup_i64(g->vece, t64, c);
1213         expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1214         tcg_temp_free_i64(t64);
1215     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1216         TCGv_i32 t32 = tcg_temp_new_i32();
1217 
1218         tcg_gen_extrl_i64_i32(t32, c);
1219         gen_dup_i32(g->vece, t32, t32);
1220         expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1221         tcg_temp_free_i32(t32);
1222     } else {
1223         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1224         return;
1225     }
1226 
1227     if (oprsz < maxsz) {
1228         expand_clr(dofs + oprsz, maxsz - oprsz);
1229     }
1230 }
1231 
1232 /* Expand a vector three-operand operation.  */
1233 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1234                     uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1235 {
1236     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1237     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1238     TCGType type;
1239     uint32_t some;
1240 
1241     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1242     check_overlap_3(dofs, aofs, bofs, maxsz);
1243 
1244     type = 0;
1245     if (g->fniv) {
1246         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1247     }
1248     switch (type) {
1249     case TCG_TYPE_V256:
1250         /* Recall that ARM SVE allows vector sizes that are not a
1251          * power of 2, but always a multiple of 16.  The intent is
1252          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1253          */
1254         some = QEMU_ALIGN_DOWN(oprsz, 32);
1255         expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1256                      g->load_dest, g->fniv);
1257         if (some == oprsz) {
1258             break;
1259         }
1260         dofs += some;
1261         aofs += some;
1262         bofs += some;
1263         oprsz -= some;
1264         maxsz -= some;
1265         /* fallthru */
1266     case TCG_TYPE_V128:
1267         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1268                      g->load_dest, g->fniv);
1269         break;
1270     case TCG_TYPE_V64:
1271         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1272                      g->load_dest, g->fniv);
1273         break;
1274 
1275     case 0:
1276         if (g->fni8 && check_size_impl(oprsz, 8)) {
1277             expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1278         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1279             expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1280         } else {
1281             assert(g->fno != NULL);
1282             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
1283                                maxsz, g->data, g->fno);
1284             oprsz = maxsz;
1285         }
1286         break;
1287 
1288     default:
1289         g_assert_not_reached();
1290     }
1291     tcg_swap_vecop_list(hold_list);
1292 
1293     if (oprsz < maxsz) {
1294         expand_clr(dofs + oprsz, maxsz - oprsz);
1295     }
1296 }
1297 
1298 /* Expand a vector operation with three vectors and an immediate.  */
1299 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1300                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1301                      const GVecGen3i *g)
1302 {
1303     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1304     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1305     TCGType type;
1306     uint32_t some;
1307 
1308     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1309     check_overlap_3(dofs, aofs, bofs, maxsz);
1310 
1311     type = 0;
1312     if (g->fniv) {
1313         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1314     }
1315     switch (type) {
1316     case TCG_TYPE_V256:
1317         /*
1318          * Recall that ARM SVE allows vector sizes that are not a
1319          * power of 2, but always a multiple of 16.  The intent is
1320          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1321          */
1322         some = QEMU_ALIGN_DOWN(oprsz, 32);
1323         expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1324                       c, g->load_dest, g->fniv);
1325         if (some == oprsz) {
1326             break;
1327         }
1328         dofs += some;
1329         aofs += some;
1330         bofs += some;
1331         oprsz -= some;
1332         maxsz -= some;
1333         /* fallthru */
1334     case TCG_TYPE_V128:
1335         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1336                       c, g->load_dest, g->fniv);
1337         break;
1338     case TCG_TYPE_V64:
1339         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1340                       c, g->load_dest, g->fniv);
1341         break;
1342 
1343     case 0:
1344         if (g->fni8 && check_size_impl(oprsz, 8)) {
1345             expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
1346         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1347             expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
1348         } else {
1349             assert(g->fno != NULL);
1350             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
1351             oprsz = maxsz;
1352         }
1353         break;
1354 
1355     default:
1356         g_assert_not_reached();
1357     }
1358     tcg_swap_vecop_list(hold_list);
1359 
1360     if (oprsz < maxsz) {
1361         expand_clr(dofs + oprsz, maxsz - oprsz);
1362     }
1363 }
1364 
1365 /* Expand a vector four-operand operation.  */
1366 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1367                     uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1368 {
1369     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1370     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1371     TCGType type;
1372     uint32_t some;
1373 
1374     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1375     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1376 
1377     type = 0;
1378     if (g->fniv) {
1379         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1380     }
1381     switch (type) {
1382     case TCG_TYPE_V256:
1383         /* Recall that ARM SVE allows vector sizes that are not a
1384          * power of 2, but always a multiple of 16.  The intent is
1385          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1386          */
1387         some = QEMU_ALIGN_DOWN(oprsz, 32);
1388         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1389                      32, TCG_TYPE_V256, g->write_aofs, g->fniv);
1390         if (some == oprsz) {
1391             break;
1392         }
1393         dofs += some;
1394         aofs += some;
1395         bofs += some;
1396         cofs += some;
1397         oprsz -= some;
1398         maxsz -= some;
1399         /* fallthru */
1400     case TCG_TYPE_V128:
1401         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1402                      16, TCG_TYPE_V128, g->write_aofs, g->fniv);
1403         break;
1404     case TCG_TYPE_V64:
1405         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1406                      8, TCG_TYPE_V64, g->write_aofs, g->fniv);
1407         break;
1408 
1409     case 0:
1410         if (g->fni8 && check_size_impl(oprsz, 8)) {
1411             expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
1412                          g->write_aofs, g->fni8);
1413         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1414             expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
1415                          g->write_aofs, g->fni4);
1416         } else {
1417             assert(g->fno != NULL);
1418             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1419                                oprsz, maxsz, g->data, g->fno);
1420             oprsz = maxsz;
1421         }
1422         break;
1423 
1424     default:
1425         g_assert_not_reached();
1426     }
1427     tcg_swap_vecop_list(hold_list);
1428 
1429     if (oprsz < maxsz) {
1430         expand_clr(dofs + oprsz, maxsz - oprsz);
1431     }
1432 }
1433 
1434 /*
1435  * Expand specific vector operations.
1436  */
1437 
1438 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
1439 {
1440     tcg_gen_mov_vec(a, b);
1441 }
1442 
1443 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1444                       uint32_t oprsz, uint32_t maxsz)
1445 {
1446     static const GVecGen2 g = {
1447         .fni8 = tcg_gen_mov_i64,
1448         .fniv = vec_mov2,
1449         .fno = gen_helper_gvec_mov,
1450         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1451     };
1452     if (dofs != aofs) {
1453         tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1454     } else {
1455         check_size_align(oprsz, maxsz, dofs);
1456         if (oprsz < maxsz) {
1457             expand_clr(dofs + oprsz, maxsz - oprsz);
1458         }
1459     }
1460 }
1461 
1462 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
1463                           uint32_t maxsz, TCGv_i32 in)
1464 {
1465     check_size_align(oprsz, maxsz, dofs);
1466     tcg_debug_assert(vece <= MO_32);
1467     do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1468 }
1469 
1470 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
1471                           uint32_t maxsz, TCGv_i64 in)
1472 {
1473     check_size_align(oprsz, maxsz, dofs);
1474     tcg_debug_assert(vece <= MO_64);
1475     do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1476 }
1477 
1478 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1479                           uint32_t oprsz, uint32_t maxsz)
1480 {
1481     check_size_align(oprsz, maxsz, dofs);
1482     if (vece <= MO_64) {
1483         TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
1484         if (type != 0) {
1485             TCGv_vec t_vec = tcg_temp_new_vec(type);
1486             tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
1487             do_dup_store(type, dofs, oprsz, maxsz, t_vec);
1488             tcg_temp_free_vec(t_vec);
1489         } else if (vece <= MO_32) {
1490             TCGv_i32 in = tcg_temp_new_i32();
1491             switch (vece) {
1492             case MO_8:
1493                 tcg_gen_ld8u_i32(in, cpu_env, aofs);
1494                 break;
1495             case MO_16:
1496                 tcg_gen_ld16u_i32(in, cpu_env, aofs);
1497                 break;
1498             default:
1499                 tcg_gen_ld_i32(in, cpu_env, aofs);
1500                 break;
1501             }
1502             do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1503             tcg_temp_free_i32(in);
1504         } else {
1505             TCGv_i64 in = tcg_temp_new_i64();
1506             tcg_gen_ld_i64(in, cpu_env, aofs);
1507             do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1508             tcg_temp_free_i64(in);
1509         }
1510     } else {
1511         /* 128-bit duplicate.  */
1512         /* ??? Dup to 256-bit vector.  */
1513         int i;
1514 
1515         tcg_debug_assert(vece == 4);
1516         tcg_debug_assert(oprsz >= 16);
1517         if (TCG_TARGET_HAS_v128) {
1518             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1519 
1520             tcg_gen_ld_vec(in, cpu_env, aofs);
1521             for (i = 0; i < oprsz; i += 16) {
1522                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1523             }
1524             tcg_temp_free_vec(in);
1525         } else {
1526             TCGv_i64 in0 = tcg_temp_new_i64();
1527             TCGv_i64 in1 = tcg_temp_new_i64();
1528 
1529             tcg_gen_ld_i64(in0, cpu_env, aofs);
1530             tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
1531             for (i = 0; i < oprsz; i += 16) {
1532                 tcg_gen_st_i64(in0, cpu_env, dofs + i);
1533                 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
1534             }
1535             tcg_temp_free_i64(in0);
1536             tcg_temp_free_i64(in1);
1537         }
1538         if (oprsz < maxsz) {
1539             expand_clr(dofs + oprsz, maxsz - oprsz);
1540         }
1541     }
1542 }
1543 
1544 void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
1545                          uint32_t maxsz, uint64_t x)
1546 {
1547     check_size_align(oprsz, maxsz, dofs);
1548     do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
1549 }
1550 
1551 void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
1552                          uint32_t maxsz, uint32_t x)
1553 {
1554     check_size_align(oprsz, maxsz, dofs);
1555     do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
1556 }
1557 
1558 void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
1559                          uint32_t maxsz, uint16_t x)
1560 {
1561     check_size_align(oprsz, maxsz, dofs);
1562     do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
1563 }
1564 
1565 void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
1566                          uint32_t maxsz, uint8_t x)
1567 {
1568     check_size_align(oprsz, maxsz, dofs);
1569     do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
1570 }
1571 
1572 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1573                       uint32_t oprsz, uint32_t maxsz)
1574 {
1575     static const GVecGen2 g = {
1576         .fni8 = tcg_gen_not_i64,
1577         .fniv = tcg_gen_not_vec,
1578         .fno = gen_helper_gvec_not,
1579         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1580     };
1581     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1582 }
1583 
1584 /* Perform a vector addition using normal addition and a mask.  The mask
1585    should have only the sign bit set in each lane.  This 6-operation form
1586    is more efficient than separate additions when there are 4 or more
1587    lanes in the 64-bit operation.  */
1588 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1589 {
1590     TCGv_i64 t1 = tcg_temp_new_i64();
1591     TCGv_i64 t2 = tcg_temp_new_i64();
1592     TCGv_i64 t3 = tcg_temp_new_i64();
1593 
1594     tcg_gen_andc_i64(t1, a, m);
1595     tcg_gen_andc_i64(t2, b, m);
1596     tcg_gen_xor_i64(t3, a, b);
1597     tcg_gen_add_i64(d, t1, t2);
1598     tcg_gen_and_i64(t3, t3, m);
1599     tcg_gen_xor_i64(d, d, t3);
1600 
1601     tcg_temp_free_i64(t1);
1602     tcg_temp_free_i64(t2);
1603     tcg_temp_free_i64(t3);
1604 }
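
/*
 * Worked example (one MO_8 lane shown): a = 0x7f, b = 0x01, m = 0x80.
 * Then t1 = 0x7f and t2 = 0x01, so t1 + t2 = 0x80 with the carry
 * confined to the lane; t3 = (a ^ b) & m = 0 leaves that msb as
 * computed, matching an independent 8-bit addition.
 */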
1605 
1606 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1607 {
1608     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1609     gen_addv_mask(d, a, b, m);
1610     tcg_temp_free_i64(m);
1611 }
1612 
1613 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1614 {
1615     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1616     gen_addv_mask(d, a, b, m);
1617     tcg_temp_free_i64(m);
1618 }
1619 
1620 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1621 {
1622     TCGv_i64 t1 = tcg_temp_new_i64();
1623     TCGv_i64 t2 = tcg_temp_new_i64();
1624 
1625     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1626     tcg_gen_add_i64(t2, a, b);
1627     tcg_gen_add_i64(t1, t1, b);
1628     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1629 
1630     tcg_temp_free_i64(t1);
1631     tcg_temp_free_i64(t2);
1632 }
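
/*
 * The deposit merges two halves computed separately: t2 holds the
 * correct low 32 bits (its high half may be polluted by the carry out
 * of bit 31), while t1 = (a & ~0xffffffff) + b holds the correct high
 * 32 bits, because adding the low bits of b to zeros cannot carry
 * into bit 32.
 */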
1633 
1634 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1635 
1636 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1637                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1638 {
1639     static const GVecGen3 g[4] = {
1640         { .fni8 = tcg_gen_vec_add8_i64,
1641           .fniv = tcg_gen_add_vec,
1642           .fno = gen_helper_gvec_add8,
1643           .opt_opc = vecop_list_add,
1644           .vece = MO_8 },
1645         { .fni8 = tcg_gen_vec_add16_i64,
1646           .fniv = tcg_gen_add_vec,
1647           .fno = gen_helper_gvec_add16,
1648           .opt_opc = vecop_list_add,
1649           .vece = MO_16 },
1650         { .fni4 = tcg_gen_add_i32,
1651           .fniv = tcg_gen_add_vec,
1652           .fno = gen_helper_gvec_add32,
1653           .opt_opc = vecop_list_add,
1654           .vece = MO_32 },
1655         { .fni8 = tcg_gen_add_i64,
1656           .fniv = tcg_gen_add_vec,
1657           .fno = gen_helper_gvec_add64,
1658           .opt_opc = vecop_list_add,
1659           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1660           .vece = MO_64 },
1661     };
1662 
1663     tcg_debug_assert(vece <= MO_64);
1664     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1665 }
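
/*
 * Example (hypothetical offsets): adding four 32-bit lanes at aofs
 * and bofs element-wise into dofs would be
 *     tcg_gen_gvec_add(MO_32, dofs, aofs, bofs, 16, 16);
 */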
1666 
1667 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1668                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1669 {
1670     static const GVecGen2s g[4] = {
1671         { .fni8 = tcg_gen_vec_add8_i64,
1672           .fniv = tcg_gen_add_vec,
1673           .fno = gen_helper_gvec_adds8,
1674           .opt_opc = vecop_list_add,
1675           .vece = MO_8 },
1676         { .fni8 = tcg_gen_vec_add16_i64,
1677           .fniv = tcg_gen_add_vec,
1678           .fno = gen_helper_gvec_adds16,
1679           .opt_opc = vecop_list_add,
1680           .vece = MO_16 },
1681         { .fni4 = tcg_gen_add_i32,
1682           .fniv = tcg_gen_add_vec,
1683           .fno = gen_helper_gvec_adds32,
1684           .opt_opc = vecop_list_add,
1685           .vece = MO_32 },
1686         { .fni8 = tcg_gen_add_i64,
1687           .fniv = tcg_gen_add_vec,
1688           .fno = gen_helper_gvec_adds64,
1689           .opt_opc = vecop_list_add,
1690           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1691           .vece = MO_64 },
1692     };
1693 
1694     tcg_debug_assert(vece <= MO_64);
1695     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1696 }
1697 
1698 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
1699                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1700 {
1701     TCGv_i64 tmp = tcg_const_i64(c);
1702     tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
1703     tcg_temp_free_i64(tmp);
1704 }
1705 
1706 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
1707 
1708 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1709                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1710 {
1711     static const GVecGen2s g[4] = {
1712         { .fni8 = tcg_gen_vec_sub8_i64,
1713           .fniv = tcg_gen_sub_vec,
1714           .fno = gen_helper_gvec_subs8,
1715           .opt_opc = vecop_list_sub,
1716           .vece = MO_8 },
1717         { .fni8 = tcg_gen_vec_sub16_i64,
1718           .fniv = tcg_gen_sub_vec,
1719           .fno = gen_helper_gvec_subs16,
1720           .opt_opc = vecop_list_sub,
1721           .vece = MO_16 },
1722         { .fni4 = tcg_gen_sub_i32,
1723           .fniv = tcg_gen_sub_vec,
1724           .fno = gen_helper_gvec_subs32,
1725           .opt_opc = vecop_list_sub,
1726           .vece = MO_32 },
1727         { .fni8 = tcg_gen_sub_i64,
1728           .fniv = tcg_gen_sub_vec,
1729           .fno = gen_helper_gvec_subs64,
1730           .opt_opc = vecop_list_sub,
1731           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1732           .vece = MO_64 },
1733     };
1734 
1735     tcg_debug_assert(vece <= MO_64);
1736     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1737 }
1738 
1739 /* Perform a vector subtraction using normal subtraction and a mask.
1740    Compare gen_addv_mask above.  */
1741 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1742 {
1743     TCGv_i64 t1 = tcg_temp_new_i64();
1744     TCGv_i64 t2 = tcg_temp_new_i64();
1745     TCGv_i64 t3 = tcg_temp_new_i64();
1746 
1747     tcg_gen_or_i64(t1, a, m);
1748     tcg_gen_andc_i64(t2, b, m);
1749     tcg_gen_eqv_i64(t3, a, b);
1750     tcg_gen_sub_i64(d, t1, t2);
1751     tcg_gen_and_i64(t3, t3, m);
1752     tcg_gen_xor_i64(d, d, t3);
1753 
1754     tcg_temp_free_i64(t1);
1755     tcg_temp_free_i64(t2);
1756     tcg_temp_free_i64(t3);
1757 }
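
/*
 * Worked example (one MO_8 lane shown): a = 0x00, b = 0x01, m = 0x80.
 * Then t1 = 0x80 and t2 = 0x01, so t1 - t2 = 0x7f with the borrow
 * confined to the lane; t3 & m = ~(a ^ b) & m = 0x80 flips the msb
 * back, giving 0xff, matching an independent 8-bit subtraction.
 */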
1758 
1759 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1760 {
1761     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1762     gen_subv_mask(d, a, b, m);
1763     tcg_temp_free_i64(m);
1764 }
1765 
1766 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1767 {
1768     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1769     gen_subv_mask(d, a, b, m);
1770     tcg_temp_free_i64(m);
1771 }
1772 
1773 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1774 {
1775     TCGv_i64 t1 = tcg_temp_new_i64();
1776     TCGv_i64 t2 = tcg_temp_new_i64();
1777 
1778     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1779     tcg_gen_sub_i64(t2, a, b);
1780     tcg_gen_sub_i64(t1, a, t1);
1781     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1782 
1783     tcg_temp_free_i64(t1);
1784     tcg_temp_free_i64(t2);
1785 }
1786 
1787 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1788                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1789 {
1790     static const GVecGen3 g[4] = {
1791         { .fni8 = tcg_gen_vec_sub8_i64,
1792           .fniv = tcg_gen_sub_vec,
1793           .fno = gen_helper_gvec_sub8,
1794           .opt_opc = vecop_list_sub,
1795           .vece = MO_8 },
1796         { .fni8 = tcg_gen_vec_sub16_i64,
1797           .fniv = tcg_gen_sub_vec,
1798           .fno = gen_helper_gvec_sub16,
1799           .opt_opc = vecop_list_sub,
1800           .vece = MO_16 },
1801         { .fni4 = tcg_gen_sub_i32,
1802           .fniv = tcg_gen_sub_vec,
1803           .fno = gen_helper_gvec_sub32,
1804           .opt_opc = vecop_list_sub,
1805           .vece = MO_32 },
1806         { .fni8 = tcg_gen_sub_i64,
1807           .fniv = tcg_gen_sub_vec,
1808           .fno = gen_helper_gvec_sub64,
1809           .opt_opc = vecop_list_sub,
1810           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1811           .vece = MO_64 },
1812     };
1813 
1814     tcg_debug_assert(vece <= MO_64);
1815     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1816 }
1817 
1818 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
1819 
1820 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
1821                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1822 {
1823     static const GVecGen3 g[4] = {
1824         { .fniv = tcg_gen_mul_vec,
1825           .fno = gen_helper_gvec_mul8,
1826           .opt_opc = vecop_list_mul,
1827           .vece = MO_8 },
1828         { .fniv = tcg_gen_mul_vec,
1829           .fno = gen_helper_gvec_mul16,
1830           .opt_opc = vecop_list_mul,
1831           .vece = MO_16 },
1832         { .fni4 = tcg_gen_mul_i32,
1833           .fniv = tcg_gen_mul_vec,
1834           .fno = gen_helper_gvec_mul32,
1835           .opt_opc = vecop_list_mul,
1836           .vece = MO_32 },
1837         { .fni8 = tcg_gen_mul_i64,
1838           .fniv = tcg_gen_mul_vec,
1839           .fno = gen_helper_gvec_mul64,
1840           .opt_opc = vecop_list_mul,
1841           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1842           .vece = MO_64 },
1843     };
1844 
1845     tcg_debug_assert(vece <= MO_64);
1846     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1847 }
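
/*
 * Note that the MO_8 and MO_16 entries above provide no .fni8
 * fallback: there is no cheap 64-bit SWAR multiply, so without host
 * vector support these element sizes expand to the out-of-line
 * helpers.
 */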
1848 
1849 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
1850                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1851 {
1852     static const GVecGen2s g[4] = {
1853         { .fniv = tcg_gen_mul_vec,
1854           .fno = gen_helper_gvec_muls8,
1855           .opt_opc = vecop_list_mul,
1856           .vece = MO_8 },
1857         { .fniv = tcg_gen_mul_vec,
1858           .fno = gen_helper_gvec_muls16,
1859           .opt_opc = vecop_list_mul,
1860           .vece = MO_16 },
1861         { .fni4 = tcg_gen_mul_i32,
1862           .fniv = tcg_gen_mul_vec,
1863           .fno = gen_helper_gvec_muls32,
1864           .opt_opc = vecop_list_mul,
1865           .vece = MO_32 },
1866         { .fni8 = tcg_gen_mul_i64,
1867           .fniv = tcg_gen_mul_vec,
1868           .fno = gen_helper_gvec_muls64,
1869           .opt_opc = vecop_list_mul,
1870           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1871           .vece = MO_64 },
1872     };
1873 
1874     tcg_debug_assert(vece <= MO_64);
1875     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1876 }
1877 
1878 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
1879                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1880 {
1881     TCGv_i64 tmp = tcg_const_i64(c);
1882     tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
1883     tcg_temp_free_i64(tmp);
1884 }
1885 
1886 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1887                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1888 {
1889     static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
1890     static const GVecGen3 g[4] = {
1891         { .fniv = tcg_gen_ssadd_vec,
1892           .fno = gen_helper_gvec_ssadd8,
1893           .opt_opc = vecop_list,
1894           .vece = MO_8 },
1895         { .fniv = tcg_gen_ssadd_vec,
1896           .fno = gen_helper_gvec_ssadd16,
1897           .opt_opc = vecop_list,
1898           .vece = MO_16 },
1899         { .fniv = tcg_gen_ssadd_vec,
1900           .fno = gen_helper_gvec_ssadd32,
1901           .opt_opc = vecop_list,
1902           .vece = MO_32 },
1903         { .fniv = tcg_gen_ssadd_vec,
1904           .fno = gen_helper_gvec_ssadd64,
1905           .opt_opc = vecop_list,
1906           .vece = MO_64 },
1907     };
1908     tcg_debug_assert(vece <= MO_64);
1909     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1910 }
1911 
1912 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
1913                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1914 {
1915     static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
1916     static const GVecGen3 g[4] = {
1917         { .fniv = tcg_gen_sssub_vec,
1918           .fno = gen_helper_gvec_sssub8,
1919           .opt_opc = vecop_list,
1920           .vece = MO_8 },
1921         { .fniv = tcg_gen_sssub_vec,
1922           .fno = gen_helper_gvec_sssub16,
1923           .opt_opc = vecop_list,
1924           .vece = MO_16 },
1925         { .fniv = tcg_gen_sssub_vec,
1926           .fno = gen_helper_gvec_sssub32,
1927           .opt_opc = vecop_list,
1928           .vece = MO_32 },
1929         { .fniv = tcg_gen_sssub_vec,
1930           .fno = gen_helper_gvec_sssub64,
1931           .opt_opc = vecop_list,
1932           .vece = MO_64 },
1933     };
1934     tcg_debug_assert(vece <= MO_64);
1935     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1936 }
1937 
1938 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1939 {
1940     TCGv_i32 max = tcg_const_i32(-1);
1941     tcg_gen_add_i32(d, a, b);
1942     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
1943     tcg_temp_free_i32(max);
1944 }
1945 
1946 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1947 {
1948     TCGv_i64 max = tcg_const_i64(-1);
1949     tcg_gen_add_i64(d, a, b);
1950     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
1951     tcg_temp_free_i64(max);
1952 }
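
/*
 * The clamp relies on unsigned wraparound: after d = a + b, d < a iff
 * the addition overflowed, in which case the movcond replaces the sum
 * with the all-ones maximum.
 */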
1953 
1954 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1955                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1956 {
1957     static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
1958     static const GVecGen3 g[4] = {
1959         { .fniv = tcg_gen_usadd_vec,
1960           .fno = gen_helper_gvec_usadd8,
1961           .opt_opc = vecop_list,
1962           .vece = MO_8 },
1963         { .fniv = tcg_gen_usadd_vec,
1964           .fno = gen_helper_gvec_usadd16,
1965           .opt_opc = vecop_list,
1966           .vece = MO_16 },
1967         { .fni4 = tcg_gen_usadd_i32,
1968           .fniv = tcg_gen_usadd_vec,
1969           .fno = gen_helper_gvec_usadd32,
1970           .opt_opc = vecop_list,
1971           .vece = MO_32 },
1972         { .fni8 = tcg_gen_usadd_i64,
1973           .fniv = tcg_gen_usadd_vec,
1974           .fno = gen_helper_gvec_usadd64,
1975           .opt_opc = vecop_list,
1976           .vece = MO_64 }
1977     };
1978     tcg_debug_assert(vece <= MO_64);
1979     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1980 }
1981 
1982 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1983 {
1984     TCGv_i32 min = tcg_const_i32(0);
1985     tcg_gen_sub_i32(d, a, b);
1986     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
1987     tcg_temp_free_i32(min);
1988 }
1989 
1990 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1991 {
1992     TCGv_i64 min = tcg_const_i64(0);
1993     tcg_gen_sub_i64(d, a, b);
1994     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
1995     tcg_temp_free_i64(min);
1996 }
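
/*
 * Likewise for saturating subtraction: a < b iff a - b would
 * underflow, in which case the movcond clamps the result to zero.
 */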
1997 
1998 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
1999                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2000 {
2001     static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2002     static const GVecGen3 g[4] = {
2003         { .fniv = tcg_gen_ussub_vec,
2004           .fno = gen_helper_gvec_ussub8,
2005           .opt_opc = vecop_list,
2006           .vece = MO_8 },
2007         { .fniv = tcg_gen_ussub_vec,
2008           .fno = gen_helper_gvec_ussub16,
2009           .opt_opc = vecop_list,
2010           .vece = MO_16 },
2011         { .fni4 = tcg_gen_ussub_i32,
2012           .fniv = tcg_gen_ussub_vec,
2013           .fno = gen_helper_gvec_ussub32,
2014           .opt_opc = vecop_list,
2015           .vece = MO_32 },
2016         { .fni8 = tcg_gen_ussub_i64,
2017           .fniv = tcg_gen_ussub_vec,
2018           .fno = gen_helper_gvec_ussub64,
2019           .opt_opc = vecop_list,
2020           .vece = MO_64 }
2021     };
2022     tcg_debug_assert(vece <= MO_64);
2023     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2024 }
2025 
2026 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2027                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2028 {
2029     static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2030     static const GVecGen3 g[4] = {
2031         { .fniv = tcg_gen_smin_vec,
2032           .fno = gen_helper_gvec_smin8,
2033           .opt_opc = vecop_list,
2034           .vece = MO_8 },
2035         { .fniv = tcg_gen_smin_vec,
2036           .fno = gen_helper_gvec_smin16,
2037           .opt_opc = vecop_list,
2038           .vece = MO_16 },
2039         { .fni4 = tcg_gen_smin_i32,
2040           .fniv = tcg_gen_smin_vec,
2041           .fno = gen_helper_gvec_smin32,
2042           .opt_opc = vecop_list,
2043           .vece = MO_32 },
2044         { .fni8 = tcg_gen_smin_i64,
2045           .fniv = tcg_gen_smin_vec,
2046           .fno = gen_helper_gvec_smin64,
2047           .opt_opc = vecop_list,
2048           .vece = MO_64 }
2049     };
2050     tcg_debug_assert(vece <= MO_64);
2051     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2052 }
2053 
2054 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2055                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2056 {
2057     static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2058     static const GVecGen3 g[4] = {
2059         { .fniv = tcg_gen_umin_vec,
2060           .fno = gen_helper_gvec_umin8,
2061           .opt_opc = vecop_list,
2062           .vece = MO_8 },
2063         { .fniv = tcg_gen_umin_vec,
2064           .fno = gen_helper_gvec_umin16,
2065           .opt_opc = vecop_list,
2066           .vece = MO_16 },
2067         { .fni4 = tcg_gen_umin_i32,
2068           .fniv = tcg_gen_umin_vec,
2069           .fno = gen_helper_gvec_umin32,
2070           .opt_opc = vecop_list,
2071           .vece = MO_32 },
2072         { .fni8 = tcg_gen_umin_i64,
2073           .fniv = tcg_gen_umin_vec,
2074           .fno = gen_helper_gvec_umin64,
2075           .opt_opc = vecop_list,
2076           .vece = MO_64 }
2077     };
2078     tcg_debug_assert(vece <= MO_64);
2079     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2080 }
2081 
2082 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2083                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2084 {
2085     static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2086     static const GVecGen3 g[4] = {
2087         { .fniv = tcg_gen_smax_vec,
2088           .fno = gen_helper_gvec_smax8,
2089           .opt_opc = vecop_list,
2090           .vece = MO_8 },
2091         { .fniv = tcg_gen_smax_vec,
2092           .fno = gen_helper_gvec_smax16,
2093           .opt_opc = vecop_list,
2094           .vece = MO_16 },
2095         { .fni4 = tcg_gen_smax_i32,
2096           .fniv = tcg_gen_smax_vec,
2097           .fno = gen_helper_gvec_smax32,
2098           .opt_opc = vecop_list,
2099           .vece = MO_32 },
2100         { .fni8 = tcg_gen_smax_i64,
2101           .fniv = tcg_gen_smax_vec,
2102           .fno = gen_helper_gvec_smax64,
2103           .opt_opc = vecop_list,
2104           .vece = MO_64 }
2105     };
2106     tcg_debug_assert(vece <= MO_64);
2107     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2108 }
2109 
2110 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2111                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2112 {
2113     static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2114     static const GVecGen3 g[4] = {
2115         { .fniv = tcg_gen_umax_vec,
2116           .fno = gen_helper_gvec_umax8,
2117           .opt_opc = vecop_list,
2118           .vece = MO_8 },
2119         { .fniv = tcg_gen_umax_vec,
2120           .fno = gen_helper_gvec_umax16,
2121           .opt_opc = vecop_list,
2122           .vece = MO_16 },
2123         { .fni4 = tcg_gen_umax_i32,
2124           .fniv = tcg_gen_umax_vec,
2125           .fno = gen_helper_gvec_umax32,
2126           .opt_opc = vecop_list,
2127           .vece = MO_32 },
2128         { .fni8 = tcg_gen_umax_i64,
2129           .fniv = tcg_gen_umax_vec,
2130           .fno = gen_helper_gvec_umax64,
2131           .opt_opc = vecop_list,
2132           .vece = MO_64 }
2133     };
2134     tcg_debug_assert(vece <= MO_64);
2135     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2136 }
2137 
2138 /* Perform a vector negation using normal negation and a mask.
2139    Compare gen_subv_mask above.  */
2140 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2141 {
2142     TCGv_i64 t2 = tcg_temp_new_i64();
2143     TCGv_i64 t3 = tcg_temp_new_i64();
2144 
2145     tcg_gen_andc_i64(t3, m, b);
2146     tcg_gen_andc_i64(t2, b, m);
2147     tcg_gen_sub_i64(d, m, t2);
2148     tcg_gen_xor_i64(d, d, t3);
2149 
2150     tcg_temp_free_i64(t2);
2151     tcg_temp_free_i64(t3);
2152 }
2153 
2154 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2155 {
2156     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
2157     gen_negv_mask(d, b, m);
2158     tcg_temp_free_i64(m);
2159 }
2160 
2161 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2162 {
2163     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
2164     gen_negv_mask(d, b, m);
2165     tcg_temp_free_i64(m);
2166 }
2167 
2168 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2169 {
2170     TCGv_i64 t1 = tcg_temp_new_i64();
2171     TCGv_i64 t2 = tcg_temp_new_i64();
2172 
2173     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2174     tcg_gen_neg_i64(t2, b);
2175     tcg_gen_neg_i64(t1, t1);
2176     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2177 
2178     tcg_temp_free_i64(t1);
2179     tcg_temp_free_i64(t2);
2180 }
2181 
2182 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2183                       uint32_t oprsz, uint32_t maxsz)
2184 {
2185     static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2186     static const GVecGen2 g[4] = {
2187         { .fni8 = tcg_gen_vec_neg8_i64,
2188           .fniv = tcg_gen_neg_vec,
2189           .fno = gen_helper_gvec_neg8,
2190           .opt_opc = vecop_list,
2191           .vece = MO_8 },
2192         { .fni8 = tcg_gen_vec_neg16_i64,
2193           .fniv = tcg_gen_neg_vec,
2194           .fno = gen_helper_gvec_neg16,
2195           .opt_opc = vecop_list,
2196           .vece = MO_16 },
2197         { .fni4 = tcg_gen_neg_i32,
2198           .fniv = tcg_gen_neg_vec,
2199           .fno = gen_helper_gvec_neg32,
2200           .opt_opc = vecop_list,
2201           .vece = MO_32 },
2202         { .fni8 = tcg_gen_neg_i64,
2203           .fniv = tcg_gen_neg_vec,
2204           .fno = gen_helper_gvec_neg64,
2205           .opt_opc = vecop_list,
2206           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2207           .vece = MO_64 },
2208     };
2209 
2210     tcg_debug_assert(vece <= MO_64);
2211     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2212 }
2213 
2214 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2215 {
2216     TCGv_i64 t = tcg_temp_new_i64();
2217     int nbit = 8 << vece;
2218 
2219     /* Create -1 for each negative element.  */
2220     tcg_gen_shri_i64(t, b, nbit - 1);
2221     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2222     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2223 
2224     /*
2225      * Invert (via xor with the -1 mask) and add one (via the isolated
2226      * lane lsb): the xor clears the msb of each element first, so the
2227      * increment can never carry into the next element.
2228      */
2229     tcg_gen_xor_i64(d, b, t);
2230     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2231     tcg_gen_add_i64(d, d, t);
2231 
2232     tcg_temp_free_i64(t);
2233 }
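
/*
 * This is the usual branchless absolute value, applied per lane:
 * e.g. for the byte 0xfe (-2), the mask is 0xff, the xor yields 0x01,
 * and adding the isolated lsb gives 0x02.
 */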
2234 
2235 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2236 {
2237     gen_absv_mask(d, b, MO_8);
2238 }
2239 
2240 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2241 {
2242     gen_absv_mask(d, b, MO_16);
2243 }
2244 
2245 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2246                       uint32_t oprsz, uint32_t maxsz)
2247 {
2248     static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2249     static const GVecGen2 g[4] = {
2250         { .fni8 = tcg_gen_vec_abs8_i64,
2251           .fniv = tcg_gen_abs_vec,
2252           .fno = gen_helper_gvec_abs8,
2253           .opt_opc = vecop_list,
2254           .vece = MO_8 },
2255         { .fni8 = tcg_gen_vec_abs16_i64,
2256           .fniv = tcg_gen_abs_vec,
2257           .fno = gen_helper_gvec_abs16,
2258           .opt_opc = vecop_list,
2259           .vece = MO_16 },
2260         { .fni4 = tcg_gen_abs_i32,
2261           .fniv = tcg_gen_abs_vec,
2262           .fno = gen_helper_gvec_abs32,
2263           .opt_opc = vecop_list,
2264           .vece = MO_32 },
2265         { .fni8 = tcg_gen_abs_i64,
2266           .fniv = tcg_gen_abs_vec,
2267           .fno = gen_helper_gvec_abs64,
2268           .opt_opc = vecop_list,
2269           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2270           .vece = MO_64 },
2271     };
2272 
2273     tcg_debug_assert(vece <= MO_64);
2274     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2275 }
2276 
2277 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2278                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2279 {
2280     static const GVecGen3 g = {
2281         .fni8 = tcg_gen_and_i64,
2282         .fniv = tcg_gen_and_vec,
2283         .fno = gen_helper_gvec_and,
2284         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2285     };
2286 
2287     if (aofs == bofs) {
2288         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2289     } else {
2290         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2291     }
2292 }
2293 
2294 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2295                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2296 {
2297     static const GVecGen3 g = {
2298         .fni8 = tcg_gen_or_i64,
2299         .fniv = tcg_gen_or_vec,
2300         .fno = gen_helper_gvec_or,
2301         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2302     };
2303 
2304     if (aofs == bofs) {
2305         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2306     } else {
2307         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2308     }
2309 }
2310 
2311 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2312                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2313 {
2314     static const GVecGen3 g = {
2315         .fni8 = tcg_gen_xor_i64,
2316         .fniv = tcg_gen_xor_vec,
2317         .fno = gen_helper_gvec_xor,
2318         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2319     };
2320 
2321     if (aofs == bofs) {
2322         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
2323     } else {
2324         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2325     }
2326 }
2327 
2328 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2329                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2330 {
2331     static const GVecGen3 g = {
2332         .fni8 = tcg_gen_andc_i64,
2333         .fniv = tcg_gen_andc_vec,
2334         .fno = gen_helper_gvec_andc,
2335         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2336     };
2337 
2338     if (aofs == bofs) {
2339         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
2340     } else {
2341         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2342     }
2343 }
2344 
2345 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2346                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2347 {
2348     static const GVecGen3 g = {
2349         .fni8 = tcg_gen_orc_i64,
2350         .fniv = tcg_gen_orc_vec,
2351         .fno = gen_helper_gvec_orc,
2352         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2353     };
2354 
2355     if (aofs == bofs) {
2356         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
2357     } else {
2358         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2359     }
2360 }
2361 
2362 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2363                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2364 {
2365     static const GVecGen3 g = {
2366         .fni8 = tcg_gen_nand_i64,
2367         .fniv = tcg_gen_nand_vec,
2368         .fno = gen_helper_gvec_nand,
2369         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2370     };
2371 
2372     if (aofs == bofs) {
2373         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2374     } else {
2375         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2376     }
2377 }
2378 
2379 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2380                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2381 {
2382     static const GVecGen3 g = {
2383         .fni8 = tcg_gen_nor_i64,
2384         .fniv = tcg_gen_nor_vec,
2385         .fno = gen_helper_gvec_nor,
2386         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2387     };
2388 
2389     if (aofs == bofs) {
2390         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2391     } else {
2392         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2393     }
2394 }
2395 
2396 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2397                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2398 {
2399     static const GVecGen3 g = {
2400         .fni8 = tcg_gen_eqv_i64,
2401         .fniv = tcg_gen_eqv_vec,
2402         .fno = gen_helper_gvec_eqv,
2403         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2404     };
2405 
2406     if (aofs == bofs) {
2407         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
2408     } else {
2409         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2410     }
2411 }
2412 
2413 static const GVecGen2s gop_ands = {
2414     .fni8 = tcg_gen_and_i64,
2415     .fniv = tcg_gen_and_vec,
2416     .fno = gen_helper_gvec_ands,
2417     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2418     .vece = MO_64
2419 };
2420 
2421 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2422                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2423 {
2424     TCGv_i64 tmp = tcg_temp_new_i64();
2425     gen_dup_i64(vece, tmp, c);
2426     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2427     tcg_temp_free_i64(tmp);
2428 }
2429 
2430 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2431                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2432 {
2433     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2434     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2435     tcg_temp_free_i64(tmp);
2436 }
2437 
2438 static const GVecGen2s gop_xors = {
2439     .fni8 = tcg_gen_xor_i64,
2440     .fniv = tcg_gen_xor_vec,
2441     .fno = gen_helper_gvec_xors,
2442     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2443     .vece = MO_64
2444 };
2445 
2446 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2447                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2448 {
2449     TCGv_i64 tmp = tcg_temp_new_i64();
2450     gen_dup_i64(vece, tmp, c);
2451     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2452     tcg_temp_free_i64(tmp);
2453 }
2454 
2455 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2456                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2457 {
2458     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2459     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2460     tcg_temp_free_i64(tmp);
2461 }
2462 
2463 static const GVecGen2s gop_ors = {
2464     .fni8 = tcg_gen_or_i64,
2465     .fniv = tcg_gen_or_vec,
2466     .fno = gen_helper_gvec_ors,
2467     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2468     .vece = MO_64
2469 };
2470 
2471 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2472                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2473 {
2474     TCGv_i64 tmp = tcg_temp_new_i64();
2475     gen_dup_i64(vece, tmp, c);
2476     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2477     tcg_temp_free_i64(tmp);
2478 }
2479 
2480 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2481                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2482 {
2483     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2484     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2485     tcg_temp_free_i64(tmp);
2486 }
2487 
2488 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2489 {
2490     uint64_t mask = dup_const(MO_8, 0xff << c);
2491     tcg_gen_shli_i64(d, a, c);
2492     tcg_gen_andi_i64(d, d, mask);
2493 }
2494 
2495 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2496 {
2497     uint64_t mask = dup_const(MO_16, 0xffff << c);
2498     tcg_gen_shli_i64(d, a, c);
2499     tcg_gen_andi_i64(d, d, mask);
2500 }
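
/*
 * E.g. for a left shift by 3 at MO_8, mask = dup_const(MO_8, 0xff << 3)
 * = 0xf8f8f8f8f8f8f8f8: the shift is performed on the full 64-bit
 * value, and the bits that crossed into a neighbouring lane are then
 * masked away.
 */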
2501 
2502 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2503                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2504 {
2505     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2506     static const GVecGen2i g[4] = {
2507         { .fni8 = tcg_gen_vec_shl8i_i64,
2508           .fniv = tcg_gen_shli_vec,
2509           .fno = gen_helper_gvec_shl8i,
2510           .opt_opc = vecop_list,
2511           .vece = MO_8 },
2512         { .fni8 = tcg_gen_vec_shl16i_i64,
2513           .fniv = tcg_gen_shli_vec,
2514           .fno = gen_helper_gvec_shl16i,
2515           .opt_opc = vecop_list,
2516           .vece = MO_16 },
2517         { .fni4 = tcg_gen_shli_i32,
2518           .fniv = tcg_gen_shli_vec,
2519           .fno = gen_helper_gvec_shl32i,
2520           .opt_opc = vecop_list,
2521           .vece = MO_32 },
2522         { .fni8 = tcg_gen_shli_i64,
2523           .fniv = tcg_gen_shli_vec,
2524           .fno = gen_helper_gvec_shl64i,
2525           .opt_opc = vecop_list,
2526           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2527           .vece = MO_64 },
2528     };
2529 
2530     tcg_debug_assert(vece <= MO_64);
2531     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2532     if (shift == 0) {
2533         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2534     } else {
2535         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2536     }
2537 }
2538 
2539 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2540 {
2541     uint64_t mask = dup_const(MO_8, 0xff >> c);
2542     tcg_gen_shri_i64(d, a, c);
2543     tcg_gen_andi_i64(d, d, mask);
2544 }
2545 
2546 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2547 {
2548     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2549     tcg_gen_shri_i64(d, a, c);
2550     tcg_gen_andi_i64(d, d, mask);
2551 }
2552 
2553 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2554                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2555 {
2556     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2557     static const GVecGen2i g[4] = {
2558         { .fni8 = tcg_gen_vec_shr8i_i64,
2559           .fniv = tcg_gen_shri_vec,
2560           .fno = gen_helper_gvec_shr8i,
2561           .opt_opc = vecop_list,
2562           .vece = MO_8 },
2563         { .fni8 = tcg_gen_vec_shr16i_i64,
2564           .fniv = tcg_gen_shri_vec,
2565           .fno = gen_helper_gvec_shr16i,
2566           .opt_opc = vecop_list,
2567           .vece = MO_16 },
2568         { .fni4 = tcg_gen_shri_i32,
2569           .fniv = tcg_gen_shri_vec,
2570           .fno = gen_helper_gvec_shr32i,
2571           .opt_opc = vecop_list,
2572           .vece = MO_32 },
2573         { .fni8 = tcg_gen_shri_i64,
2574           .fniv = tcg_gen_shri_vec,
2575           .fno = gen_helper_gvec_shr64i,
2576           .opt_opc = vecop_list,
2577           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2578           .vece = MO_64 },
2579     };
2580 
2581     tcg_debug_assert(vece <= MO_64);
2582     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2583     if (shift == 0) {
2584         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2585     } else {
2586         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2587     }
2588 }
2589 
2590 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2591 {
2592     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2593     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2594     TCGv_i64 s = tcg_temp_new_i64();
2595 
2596     tcg_gen_shri_i64(d, a, c);
2597     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2598     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2599     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2600     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2601     tcg_temp_free_i64(s);
2602 }
2603 
2604 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2605 {
2606     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2607     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2608     TCGv_i64 s = tcg_temp_new_i64();
2609 
2610     tcg_gen_shri_i64(d, a, c);
2611     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2612     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2613     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2614     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2615     tcg_temp_free_i64(s);
2616 }
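
/*
 * Worked example at MO_8 with c = 2: after the logical shift, each
 * lane's sign bit sits at bit 5, so s_mask = dup_const(MO_8, 0x20).
 * Multiplying the isolated bit by (2 << 2) - 2 = 6 copies it into
 * bits 6 and 7, exactly the positions vacated by the shift.
 */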
2617 
2618 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2619                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2620 {
2621     static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
2622     static const GVecGen2i g[4] = {
2623         { .fni8 = tcg_gen_vec_sar8i_i64,
2624           .fniv = tcg_gen_sari_vec,
2625           .fno = gen_helper_gvec_sar8i,
2626           .opt_opc = vecop_list,
2627           .vece = MO_8 },
2628         { .fni8 = tcg_gen_vec_sar16i_i64,
2629           .fniv = tcg_gen_sari_vec,
2630           .fno = gen_helper_gvec_sar16i,
2631           .opt_opc = vecop_list,
2632           .vece = MO_16 },
2633         { .fni4 = tcg_gen_sari_i32,
2634           .fniv = tcg_gen_sari_vec,
2635           .fno = gen_helper_gvec_sar32i,
2636           .opt_opc = vecop_list,
2637           .vece = MO_32 },
2638         { .fni8 = tcg_gen_sari_i64,
2639           .fniv = tcg_gen_sari_vec,
2640           .fno = gen_helper_gvec_sar64i,
2641           .opt_opc = vecop_list,
2642           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2643           .vece = MO_64 },
2644     };
2645 
2646     tcg_debug_assert(vece <= MO_64);
2647     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2648     if (shift == 0) {
2649         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2650     } else {
2651         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2652     }
2653 }
2654 
2655 /*
2656  * Specialized generation of vector shifts by a non-constant scalar.
2657  */
2658 
2659 typedef struct {
2660     void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
2661     void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
2662     void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
2663     void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
2664     gen_helper_gvec_2 *fno[4];
2665     TCGOpcode s_list[2];
2666     TCGOpcode v_list[2];
2667 } GVecGen2sh;
2668 
2669 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2670                            uint32_t oprsz, uint32_t tysz, TCGType type,
2671                            TCGv_i32 shift,
2672                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
2673 {
2674     TCGv_vec t0 = tcg_temp_new_vec(type);
2675     uint32_t i;
2676 
2677     for (i = 0; i < oprsz; i += tysz) {
2678         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2679         fni(vece, t0, t0, shift);
2680         tcg_gen_st_vec(t0, cpu_env, dofs + i);
2681     }
2682     tcg_temp_free_vec(t0);
2683 }
2684 
2685 static void
2686 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
2687                uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
2688 {
2689     TCGType type;
2690     uint32_t some;
2691 
2692     check_size_align(oprsz, maxsz, dofs | aofs);
2693     check_overlap_2(dofs, aofs, maxsz);
2694 
2695     /* If the backend has a scalar expansion, great.  */
2696     type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
2697     if (type) {
2698         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2699         switch (type) {
2700         case TCG_TYPE_V256:
2701             some = QEMU_ALIGN_DOWN(oprsz, 32);
2702             expand_2sh_vec(vece, dofs, aofs, some, 32,
2703                            TCG_TYPE_V256, shift, g->fniv_s);
2704             if (some == oprsz) {
2705                 break;
2706             }
2707             dofs += some;
2708             aofs += some;
2709             oprsz -= some;
2710             maxsz -= some;
2711             /* fallthru */
2712         case TCG_TYPE_V128:
2713             expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
2714                            TCG_TYPE_V128, shift, g->fniv_s);
2715             break;
2716         case TCG_TYPE_V64:
2717             expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
2718                            TCG_TYPE_V64, shift, g->fniv_s);
2719             break;
2720         default:
2721             g_assert_not_reached();
2722         }
2723         tcg_swap_vecop_list(hold_list);
2724         goto clear_tail;
2725     }
2726 
2727     /* If the backend supports variable vector shifts, also cool.  */
2728     type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
2729     if (type) {
2730         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2731         TCGv_vec v_shift = tcg_temp_new_vec(type);
2732 
2733         if (vece == MO_64) {
2734             TCGv_i64 sh64 = tcg_temp_new_i64();
2735             tcg_gen_extu_i32_i64(sh64, shift);
2736             tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
2737             tcg_temp_free_i64(sh64);
2738         } else {
2739             tcg_gen_dup_i32_vec(vece, v_shift, shift);
2740         }
2741 
2742         switch (type) {
2743         case TCG_TYPE_V256:
2744             some = QEMU_ALIGN_DOWN(oprsz, 32);
2745             expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
2746                           v_shift, false, g->fniv_v);
2747             if (some == oprsz) {
2748                 break;
2749             }
2750             dofs += some;
2751             aofs += some;
2752             oprsz -= some;
2753             maxsz -= some;
2754             /* fallthru */
2755         case TCG_TYPE_V128:
2756             expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
2757                           v_shift, false, g->fniv_v);
2758             break;
2759         case TCG_TYPE_V64:
2760             expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
2761                           v_shift, false, g->fniv_v);
2762             break;
2763         default:
2764             g_assert_not_reached();
2765         }
2766         tcg_temp_free_vec(v_shift);
2767         tcg_swap_vecop_list(hold_list);
2768         goto clear_tail;
2769     }
2770 
2771     /* Otherwise fall back to integral... */
2772     if (vece == MO_32 && check_size_impl(oprsz, 4)) {
2773         expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
2774     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
2775         TCGv_i64 sh64 = tcg_temp_new_i64();
2776         tcg_gen_extu_i32_i64(sh64, shift);
2777         expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
2778         tcg_temp_free_i64(sh64);
2779     } else {
2780         TCGv_ptr a0 = tcg_temp_new_ptr();
2781         TCGv_ptr a1 = tcg_temp_new_ptr();
2782         TCGv_i32 desc = tcg_temp_new_i32();
2783 
2784         tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
2785         tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
2786         tcg_gen_addi_ptr(a0, cpu_env, dofs);
2787         tcg_gen_addi_ptr(a1, cpu_env, aofs);
2788 
2789         g->fno[vece](a0, a1, desc);
2790 
2791         tcg_temp_free_ptr(a0);
2792         tcg_temp_free_ptr(a1);
2793         tcg_temp_free_i32(desc);
2794         return;
2795     }
2796 
2797  clear_tail:
2798     if (oprsz < maxsz) {
2799         expand_clr(dofs + oprsz, maxsz - oprsz);
2800     }
2801 }
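
/*
 * In the out-of-line fallback above, the runtime shift count is
 * folded into the SIMD_DATA field of the descriptor (shli + ori), so
 * the immediate-shift helpers can also serve the variable case.
 */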
2802 
2803 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
2804                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2805 {
2806     static const GVecGen2sh g = {
2807         .fni4 = tcg_gen_shl_i32,
2808         .fni8 = tcg_gen_shl_i64,
2809         .fniv_s = tcg_gen_shls_vec,
2810         .fniv_v = tcg_gen_shlv_vec,
2811         .fno = {
2812             gen_helper_gvec_shl8i,
2813             gen_helper_gvec_shl16i,
2814             gen_helper_gvec_shl32i,
2815             gen_helper_gvec_shl64i,
2816         },
2817         .s_list = { INDEX_op_shls_vec, 0 },
2818         .v_list = { INDEX_op_shlv_vec, 0 },
2819     };
2820 
2821     tcg_debug_assert(vece <= MO_64);
2822     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2823 }
2824 
2825 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
2826                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2827 {
2828     static const GVecGen2sh g = {
2829         .fni4 = tcg_gen_shr_i32,
2830         .fni8 = tcg_gen_shr_i64,
2831         .fniv_s = tcg_gen_shrs_vec,
2832         .fniv_v = tcg_gen_shrv_vec,
2833         .fno = {
2834             gen_helper_gvec_shr8i,
2835             gen_helper_gvec_shr16i,
2836             gen_helper_gvec_shr32i,
2837             gen_helper_gvec_shr64i,
2838         },
2839         .s_list = { INDEX_op_shrs_vec, 0 },
2840         .v_list = { INDEX_op_shrv_vec, 0 },
2841     };
2842 
2843     tcg_debug_assert(vece <= MO_64);
2844     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2845 }
2846 
2847 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
2848                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2849 {
2850     static const GVecGen2sh g = {
2851         .fni4 = tcg_gen_sar_i32,
2852         .fni8 = tcg_gen_sar_i64,
2853         .fniv_s = tcg_gen_sars_vec,
2854         .fniv_v = tcg_gen_sarv_vec,
2855         .fno = {
2856             gen_helper_gvec_sar8i,
2857             gen_helper_gvec_sar16i,
2858             gen_helper_gvec_sar32i,
2859             gen_helper_gvec_sar64i,
2860         },
2861         .s_list = { INDEX_op_sars_vec, 0 },
2862         .v_list = { INDEX_op_sarv_vec, 0 },
2863     };
2864 
2865     tcg_debug_assert(vece <= MO_64);
2866     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2867 }
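
/*
 * Example (hypothetical offsets): shifting four 32-bit lanes right by
 * a runtime amount held in a TCGv_i32 would be
 *     tcg_gen_gvec_shrs(MO_32, dofs, aofs, shift, 16, 16);
 * As with TCG's scalar shifts, behaviour is only defined for counts
 * in [0, element bits).
 */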
2868 
2869 /*
2870  * Expand D = A << (B % element bits)
2871  *
2872  * Unlike scalar shifts, where the target front end can easily fold
2873  * the modulo into the expansion, vector shifts must have it folded
2874  * here.  If the target naturally includes the modulo as part of the
2875  * operation, great!  If the target has some other behaviour for
2876  * out-of-range shifts, then it could not use this function anyway,
2877  * and would need to do its own expansion with custom functions.
2878  */
2879 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
2880                                  TCGv_vec a, TCGv_vec b)
2881 {
2882     TCGv_vec t = tcg_temp_new_vec_matching(d);
2883 
2884     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
2885     tcg_gen_and_vec(vece, t, t, b);
2886     tcg_gen_shlv_vec(vece, d, a, t);
2887     tcg_temp_free_vec(t);
2888 }
2889 
2890 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2891 {
2892     TCGv_i32 t = tcg_temp_new_i32();
2893 
2894     tcg_gen_andi_i32(t, b, 31);
2895     tcg_gen_shl_i32(d, a, t);
2896     tcg_temp_free_i32(t);
2897 }
2898 
2899 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2900 {
2901     TCGv_i64 t = tcg_temp_new_i64();
2902 
2903     tcg_gen_andi_i64(t, b, 63);
2904     tcg_gen_shl_i64(d, a, t);
2905     tcg_temp_free_i64(t);
2906 }
2907 
2908 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
2909                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2910 {
2911     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
2912     static const GVecGen3 g[4] = {
2913         { .fniv = tcg_gen_shlv_mod_vec,
2914           .fno = gen_helper_gvec_shl8v,
2915           .opt_opc = vecop_list,
2916           .vece = MO_8 },
2917         { .fniv = tcg_gen_shlv_mod_vec,
2918           .fno = gen_helper_gvec_shl16v,
2919           .opt_opc = vecop_list,
2920           .vece = MO_16 },
2921         { .fni4 = tcg_gen_shl_mod_i32,
2922           .fniv = tcg_gen_shlv_mod_vec,
2923           .fno = gen_helper_gvec_shl32v,
2924           .opt_opc = vecop_list,
2925           .vece = MO_32 },
2926         { .fni8 = tcg_gen_shl_mod_i64,
2927           .fniv = tcg_gen_shlv_mod_vec,
2928           .fno = gen_helper_gvec_shl64v,
2929           .opt_opc = vecop_list,
2930           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2931           .vece = MO_64 },
2932     };
2933 
2934     tcg_debug_assert(vece <= MO_64);
2935     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2936 }
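
/*
 * E.g. at MO_32, a per-lane count of 33 shifts by 33 % 32 = 1,
 * matching the tcg_gen_shl_mod_* fallbacks above.
 */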
2937 
2938 /*
2939  * Similarly for logical right shifts.
2940  */
2941 
2942 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
2943                                  TCGv_vec a, TCGv_vec b)
2944 {
2945     TCGv_vec t = tcg_temp_new_vec_matching(d);
2946 
2947     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
2948     tcg_gen_and_vec(vece, t, t, b);
2949     tcg_gen_shrv_vec(vece, d, a, t);
2950     tcg_temp_free_vec(t);
2951 }
2952 
2953 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2954 {
2955     TCGv_i32 t = tcg_temp_new_i32();
2956 
2957     tcg_gen_andi_i32(t, b, 31);
2958     tcg_gen_shr_i32(d, a, t);
2959     tcg_temp_free_i32(t);
2960 }
2961 
2962 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2963 {
2964     TCGv_i64 t = tcg_temp_new_i64();
2965 
2966     tcg_gen_andi_i64(t, b, 63);
2967     tcg_gen_shr_i64(d, a, t);
2968     tcg_temp_free_i64(t);
2969 }
2970 
2971 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
2972                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2973 {
2974     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
2975     static const GVecGen3 g[4] = {
2976         { .fniv = tcg_gen_shrv_mod_vec,
2977           .fno = gen_helper_gvec_shr8v,
2978           .opt_opc = vecop_list,
2979           .vece = MO_8 },
2980         { .fniv = tcg_gen_shrv_mod_vec,
2981           .fno = gen_helper_gvec_shr16v,
2982           .opt_opc = vecop_list,
2983           .vece = MO_16 },
2984         { .fni4 = tcg_gen_shr_mod_i32,
2985           .fniv = tcg_gen_shrv_mod_vec,
2986           .fno = gen_helper_gvec_shr32v,
2987           .opt_opc = vecop_list,
2988           .vece = MO_32 },
2989         { .fni8 = tcg_gen_shr_mod_i64,
2990           .fniv = tcg_gen_shrv_mod_vec,
2991           .fno = gen_helper_gvec_shr64v,
2992           .opt_opc = vecop_list,
2993           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2994           .vece = MO_64 },
2995     };
2996 
2997     tcg_debug_assert(vece <= MO_64);
2998     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2999 }
3000 
3001 /*
3002  * Similarly for arithmetic right shifts.
3003  */
3004 
3005 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3006                                  TCGv_vec a, TCGv_vec b)
3007 {
3008     TCGv_vec t = tcg_temp_new_vec_matching(d);
3009 
3010     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3011     tcg_gen_and_vec(vece, t, t, b);
3012     tcg_gen_sarv_vec(vece, d, a, t);
3013     tcg_temp_free_vec(t);
3014 }
3015 
3016 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3017 {
3018     TCGv_i32 t = tcg_temp_new_i32();
3019 
3020     tcg_gen_andi_i32(t, b, 31);
3021     tcg_gen_sar_i32(d, a, t);
3022     tcg_temp_free_i32(t);
3023 }
3024 
3025 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3026 {
3027     TCGv_i64 t = tcg_temp_new_i64();
3028 
3029     tcg_gen_andi_i64(t, b, 63);
3030     tcg_gen_sar_i64(d, a, t);
3031     tcg_temp_free_i64(t);
3032 }
3033 
3034 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3035                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3036 {
3037     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3038     static const GVecGen3 g[4] = {
3039         { .fniv = tcg_gen_sarv_mod_vec,
3040           .fno = gen_helper_gvec_sar8v,
3041           .opt_opc = vecop_list,
3042           .vece = MO_8 },
3043         { .fniv = tcg_gen_sarv_mod_vec,
3044           .fno = gen_helper_gvec_sar16v,
3045           .opt_opc = vecop_list,
3046           .vece = MO_16 },
3047         { .fni4 = tcg_gen_sar_mod_i32,
3048           .fniv = tcg_gen_sarv_mod_vec,
3049           .fno = gen_helper_gvec_sar32v,
3050           .opt_opc = vecop_list,
3051           .vece = MO_32 },
3052         { .fni8 = tcg_gen_sar_mod_i64,
3053           .fniv = tcg_gen_sarv_mod_vec,
3054           .fno = gen_helper_gvec_sar64v,
3055           .opt_opc = vecop_list,
3056           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3057           .vece = MO_64 },
3058     };
3059 
3060     tcg_debug_assert(vece <= MO_64);
3061     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3062 }
3063 
3064 /* Expand OPSZ bytes worth of three-operand comparisons using i32 elements.  */
3065 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3066                            uint32_t oprsz, TCGCond cond)
3067 {
3068     TCGv_i32 t0 = tcg_temp_new_i32();
3069     TCGv_i32 t1 = tcg_temp_new_i32();
3070     uint32_t i;
3071 
3072     for (i = 0; i < oprsz; i += 4) {
3073         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
3074         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
3075         tcg_gen_setcond_i32(cond, t0, t0, t1);
3076         tcg_gen_neg_i32(t0, t0);
3077         tcg_gen_st_i32(t0, cpu_env, dofs + i);
3078     }
3079     tcg_temp_free_i32(t1);
3080     tcg_temp_free_i32(t0);
3081 }
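
/*
 * Note that setcond produces 0 or 1; the negation converts this into
 * the 0 or all-ones lane that vector comparisons are defined to
 * produce.  The i64 variant below does the same.
 */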
3082 
3083 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3084                            uint32_t oprsz, TCGCond cond)
3085 {
3086     TCGv_i64 t0 = tcg_temp_new_i64();
3087     TCGv_i64 t1 = tcg_temp_new_i64();
3088     uint32_t i;
3089 
3090     for (i = 0; i < oprsz; i += 8) {
3091         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
3092         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
3093         tcg_gen_setcond_i64(cond, t0, t0, t1);
3094         tcg_gen_neg_i64(t0, t0);
3095         tcg_gen_st_i64(t0, cpu_env, dofs + i);
3096     }
3097     tcg_temp_free_i64(t1);
3098     tcg_temp_free_i64(t0);
3099 }
3100 
3101 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3102                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3103                            TCGType type, TCGCond cond)
3104 {
3105     TCGv_vec t0 = tcg_temp_new_vec(type);
3106     TCGv_vec t1 = tcg_temp_new_vec(type);
3107     uint32_t i;
3108 
3109     for (i = 0; i < oprsz; i += tysz) {
3110         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3111         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
3112         tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
3113         tcg_gen_st_vec(t0, cpu_env, dofs + i);
3114     }
3115     tcg_temp_free_vec(t1);
3116     tcg_temp_free_vec(t0);
3117 }
3118 
void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
                      uint32_t aofs, uint32_t bofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
    static gen_helper_gvec_3 * const eq_fn[4] = {
        gen_helper_gvec_eq8, gen_helper_gvec_eq16,
        gen_helper_gvec_eq32, gen_helper_gvec_eq64
    };
    static gen_helper_gvec_3 * const ne_fn[4] = {
        gen_helper_gvec_ne8, gen_helper_gvec_ne16,
        gen_helper_gvec_ne32, gen_helper_gvec_ne64
    };
    static gen_helper_gvec_3 * const lt_fn[4] = {
        gen_helper_gvec_lt8, gen_helper_gvec_lt16,
        gen_helper_gvec_lt32, gen_helper_gvec_lt64
    };
    static gen_helper_gvec_3 * const le_fn[4] = {
        gen_helper_gvec_le8, gen_helper_gvec_le16,
        gen_helper_gvec_le32, gen_helper_gvec_le64
    };
    static gen_helper_gvec_3 * const ltu_fn[4] = {
        gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
        gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
    };
    static gen_helper_gvec_3 * const leu_fn[4] = {
        gen_helper_gvec_leu8, gen_helper_gvec_leu16,
        gen_helper_gvec_leu32, gen_helper_gvec_leu64
    };
    static gen_helper_gvec_3 * const * const fns[16] = {
        [TCG_COND_EQ] = eq_fn,
        [TCG_COND_NE] = ne_fn,
        [TCG_COND_LT] = lt_fn,
        [TCG_COND_LE] = le_fn,
        [TCG_COND_LTU] = ltu_fn,
        [TCG_COND_LEU] = leu_fn,
    };

    const TCGOpcode *hold_list;
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
        do_dup(MO_8, dofs, oprsz, maxsz,
               NULL, NULL, -(cond == TCG_COND_ALWAYS));
        return;
    }

    /*
     * Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and 64-bit comparison.
     */
    hold_list = tcg_swap_vecop_list(cmp_list);
    type = choose_vector_type(cmp_list, vece, oprsz,
                              TCG_TARGET_REG_BITS == 64 && vece == MO_64);
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
        break;
    case TCG_TYPE_V64:
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
        break;

    case 0:
        if (vece == MO_64 && check_size_impl(oprsz, 8)) {
            expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
        } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
            expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
        } else {
            gen_helper_gvec_3 * const *fn = fns[cond];

            if (fn == NULL) {
                /* Only EQ/NE/LT/LE/LTU/LEU have helpers; swap the
                   operands to convert GT/GE/GTU/GEU to those forms.  */
                uint32_t tmp;
                tmp = aofs, aofs = bofs, bofs = tmp;
                cond = tcg_swap_cond(cond);
                fn = fns[cond];
                assert(fn != NULL);
            }
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
            /* The out-of-line helper has already cleared the tail.  */
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

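/* Compute D = (B & A) | (C & ~A), i.e. bitwise select B where A is set.  */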
static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, b, a);
    tcg_gen_andc_i64(d, c, a);
    tcg_gen_or_i64(d, d, t);
    tcg_temp_free_i64(t);
}

void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs,
                         uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen4 g = {
        .fni8 = tcg_gen_bitsel_i64,
        .fniv = tcg_gen_bitsel_vec,
        .fno = gen_helper_gvec_bitsel,
    };

    tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
}

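/*
 * Usage sketch (illustrative, not from this file): bitsel maps naturally
 * onto bitwise-select instructions such as AArch64's BSL.  A translator
 * holding the selector mask at a hypothetical offset mofs could emit,
 * for a 16-byte vector:
 *
 *     tcg_gen_gvec_bitsel(MO_8, dofs, mofs, bofs, cofs, 16, 16);
 *
 * Since the operation is purely bitwise, the choice of VECE does not
 * affect the result.
 */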