xref: /openbmc/qemu/tcg/tcg-op-gvec.c (revision 5ee5c14c)
1 /*
2  * Generic vector operation expansion
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "qemu-common.h"
22 #include "tcg.h"
23 #include "tcg-op.h"
24 #include "tcg-op-gvec.h"
25 #include "tcg-gvec-desc.h"
26 
27 #define MAX_UNROLL  4
28 
29 #ifdef CONFIG_DEBUG_TCG
30 static const TCGOpcode vecop_list_empty[1] = { 0 };
31 #else
32 #define vecop_list_empty NULL
33 #endif
34 
35 
/* Verify vector size and alignment rules.  OFS should be the OR of all
   of the operand offsets so that we can check them all at once.  */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    /* Sizes of 16+ bytes must be 16-aligned; smaller must be 8-aligned.  */
    uint32_t align = (oprsz >= 16 ? 15 : 7);

    tcg_debug_assert(oprsz > 0);
    tcg_debug_assert(oprsz <= maxsz);
    tcg_debug_assert((oprsz & align) == 0);
    /* A large maxsz tightens the requirement on maxsz and the offsets.  */
    if (maxsz >= 16) {
        align = 15;
    }
    tcg_debug_assert((maxsz & align) == 0);
    tcg_debug_assert((ofs & align) == 0);
}
48 
/* Verify vector overlap rules for two operands.  Operands of size S
   starting at D and A must either coincide exactly or not overlap.  */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(a == d || a + s <= d || d + s <= a);
}
54 
/* Verify vector overlap rules for three operands: check each pair.  */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}
62 
/* Verify vector overlap rules for four operands: check all six pairs.  */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}
74 
75 /* Create a descriptor from components.  */
76 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
77 {
78     uint32_t desc = 0;
79 
80     assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
81     assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
82     assert(data == sextract32(data, 0, SIMD_DATA_BITS));
83 
84     oprsz = (oprsz / 8) - 1;
85     maxsz = (maxsz / 8) - 1;
86     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
87     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
88     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
89 
90     return desc;
91 }
92 
93 /* Generate a call to a gvec-style helper with two vector operands.  */
94 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
95                         uint32_t oprsz, uint32_t maxsz, int32_t data,
96                         gen_helper_gvec_2 *fn)
97 {
98     TCGv_ptr a0, a1;
99     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
100 
101     a0 = tcg_temp_new_ptr();
102     a1 = tcg_temp_new_ptr();
103 
104     tcg_gen_addi_ptr(a0, cpu_env, dofs);
105     tcg_gen_addi_ptr(a1, cpu_env, aofs);
106 
107     fn(a0, a1, desc);
108 
109     tcg_temp_free_ptr(a0);
110     tcg_temp_free_ptr(a1);
111     tcg_temp_free_i32(desc);
112 }
113 
114 /* Generate a call to a gvec-style helper with two vector operands
115    and one scalar operand.  */
116 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
117                          uint32_t oprsz, uint32_t maxsz, int32_t data,
118                          gen_helper_gvec_2i *fn)
119 {
120     TCGv_ptr a0, a1;
121     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
122 
123     a0 = tcg_temp_new_ptr();
124     a1 = tcg_temp_new_ptr();
125 
126     tcg_gen_addi_ptr(a0, cpu_env, dofs);
127     tcg_gen_addi_ptr(a1, cpu_env, aofs);
128 
129     fn(a0, a1, c, desc);
130 
131     tcg_temp_free_ptr(a0);
132     tcg_temp_free_ptr(a1);
133     tcg_temp_free_i32(desc);
134 }
135 
136 /* Generate a call to a gvec-style helper with three vector operands.  */
137 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
138                         uint32_t oprsz, uint32_t maxsz, int32_t data,
139                         gen_helper_gvec_3 *fn)
140 {
141     TCGv_ptr a0, a1, a2;
142     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
143 
144     a0 = tcg_temp_new_ptr();
145     a1 = tcg_temp_new_ptr();
146     a2 = tcg_temp_new_ptr();
147 
148     tcg_gen_addi_ptr(a0, cpu_env, dofs);
149     tcg_gen_addi_ptr(a1, cpu_env, aofs);
150     tcg_gen_addi_ptr(a2, cpu_env, bofs);
151 
152     fn(a0, a1, a2, desc);
153 
154     tcg_temp_free_ptr(a0);
155     tcg_temp_free_ptr(a1);
156     tcg_temp_free_ptr(a2);
157     tcg_temp_free_i32(desc);
158 }
159 
160 /* Generate a call to a gvec-style helper with four vector operands.  */
161 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
162                         uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
163                         int32_t data, gen_helper_gvec_4 *fn)
164 {
165     TCGv_ptr a0, a1, a2, a3;
166     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
167 
168     a0 = tcg_temp_new_ptr();
169     a1 = tcg_temp_new_ptr();
170     a2 = tcg_temp_new_ptr();
171     a3 = tcg_temp_new_ptr();
172 
173     tcg_gen_addi_ptr(a0, cpu_env, dofs);
174     tcg_gen_addi_ptr(a1, cpu_env, aofs);
175     tcg_gen_addi_ptr(a2, cpu_env, bofs);
176     tcg_gen_addi_ptr(a3, cpu_env, cofs);
177 
178     fn(a0, a1, a2, a3, desc);
179 
180     tcg_temp_free_ptr(a0);
181     tcg_temp_free_ptr(a1);
182     tcg_temp_free_ptr(a2);
183     tcg_temp_free_ptr(a3);
184     tcg_temp_free_i32(desc);
185 }
186 
187 /* Generate a call to a gvec-style helper with five vector operands.  */
188 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
189                         uint32_t cofs, uint32_t xofs, uint32_t oprsz,
190                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
191 {
192     TCGv_ptr a0, a1, a2, a3, a4;
193     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
194 
195     a0 = tcg_temp_new_ptr();
196     a1 = tcg_temp_new_ptr();
197     a2 = tcg_temp_new_ptr();
198     a3 = tcg_temp_new_ptr();
199     a4 = tcg_temp_new_ptr();
200 
201     tcg_gen_addi_ptr(a0, cpu_env, dofs);
202     tcg_gen_addi_ptr(a1, cpu_env, aofs);
203     tcg_gen_addi_ptr(a2, cpu_env, bofs);
204     tcg_gen_addi_ptr(a3, cpu_env, cofs);
205     tcg_gen_addi_ptr(a4, cpu_env, xofs);
206 
207     fn(a0, a1, a2, a3, a4, desc);
208 
209     tcg_temp_free_ptr(a0);
210     tcg_temp_free_ptr(a1);
211     tcg_temp_free_ptr(a2);
212     tcg_temp_free_ptr(a3);
213     tcg_temp_free_ptr(a4);
214     tcg_temp_free_i32(desc);
215 }
216 
/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    /* Form env-relative addresses of the destination and source.  */
    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}
238 
239 /* Generate a call to a gvec-style helper with three vector operands
240    and an extra pointer operand.  */
241 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
242                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
243                         int32_t data, gen_helper_gvec_3_ptr *fn)
244 {
245     TCGv_ptr a0, a1, a2;
246     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
247 
248     a0 = tcg_temp_new_ptr();
249     a1 = tcg_temp_new_ptr();
250     a2 = tcg_temp_new_ptr();
251 
252     tcg_gen_addi_ptr(a0, cpu_env, dofs);
253     tcg_gen_addi_ptr(a1, cpu_env, aofs);
254     tcg_gen_addi_ptr(a2, cpu_env, bofs);
255 
256     fn(a0, a1, a2, ptr, desc);
257 
258     tcg_temp_free_ptr(a0);
259     tcg_temp_free_ptr(a1);
260     tcg_temp_free_ptr(a2);
261     tcg_temp_free_i32(desc);
262 }
263 
264 /* Generate a call to a gvec-style helper with four vector operands
265    and an extra pointer operand.  */
266 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
267                         uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
268                         uint32_t maxsz, int32_t data,
269                         gen_helper_gvec_4_ptr *fn)
270 {
271     TCGv_ptr a0, a1, a2, a3;
272     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
273 
274     a0 = tcg_temp_new_ptr();
275     a1 = tcg_temp_new_ptr();
276     a2 = tcg_temp_new_ptr();
277     a3 = tcg_temp_new_ptr();
278 
279     tcg_gen_addi_ptr(a0, cpu_env, dofs);
280     tcg_gen_addi_ptr(a1, cpu_env, aofs);
281     tcg_gen_addi_ptr(a2, cpu_env, bofs);
282     tcg_gen_addi_ptr(a3, cpu_env, cofs);
283 
284     fn(a0, a1, a2, a3, ptr, desc);
285 
286     tcg_temp_free_ptr(a0);
287     tcg_temp_free_ptr(a1);
288     tcg_temp_free_ptr(a2);
289     tcg_temp_free_ptr(a3);
290     tcg_temp_free_i32(desc);
291 }
292 
293 /* Return true if we want to implement something of OPRSZ bytes
294    in units of LNSZ.  This limits the expansion of inline code.  */
295 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
296 {
297     if (oprsz % lnsz == 0) {
298         uint32_t lnct = oprsz / lnsz;
299         return lnct >= 1 && lnct <= MAX_UNROLL;
300     }
301     return false;
302 }
303 
304 static void expand_clr(uint32_t dofs, uint32_t maxsz);
305 
306 /* Duplicate C as per VECE.  */
307 uint64_t (dup_const)(unsigned vece, uint64_t c)
308 {
309     switch (vece) {
310     case MO_8:
311         return 0x0101010101010101ull * (uint8_t)c;
312     case MO_16:
313         return 0x0001000100010001ull * (uint16_t)c;
314     case MO_32:
315         return 0x0000000100000001ull * (uint32_t)c;
316     case MO_64:
317         return c;
318     default:
319         g_assert_not_reached();
320     }
321 }
322 
323 /* Duplicate IN into OUT as per VECE.  */
324 static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
325 {
326     switch (vece) {
327     case MO_8:
328         tcg_gen_ext8u_i32(out, in);
329         tcg_gen_muli_i32(out, out, 0x01010101);
330         break;
331     case MO_16:
332         tcg_gen_deposit_i32(out, in, in, 16, 16);
333         break;
334     case MO_32:
335         tcg_gen_mov_i32(out, in);
336         break;
337     default:
338         g_assert_not_reached();
339     }
340 }
341 
342 static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
343 {
344     switch (vece) {
345     case MO_8:
346         tcg_gen_ext8u_i64(out, in);
347         tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
348         break;
349     case MO_16:
350         tcg_gen_ext16u_i64(out, in);
351         tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
352         break;
353     case MO_32:
354         tcg_gen_deposit_i64(out, in, in, 32, 32);
355         break;
356     case MO_64:
357         tcg_gen_mov_i64(out, in);
358         break;
359     default:
360         g_assert_not_reached();
361     }
362 }
363 
364 /* Select a supported vector type for implementing an operation on SIZE
365  * bytes.  If OP is 0, assume that the real operation to be performed is
366  * required by all backends.  Otherwise, make sure than OP can be performed
367  * on elements of size VECE in the selected type.  Do not select V64 if
368  * PREFER_I64 is true.  Return 0 if no vector type is selected.
369  */
370 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
371                                   uint32_t size, bool prefer_i64)
372 {
373     if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
374         /*
375          * Recall that ARM SVE allows vector sizes that are not a
376          * power of 2, but always a multiple of 16.  The intent is
377          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
378          * It is hard to imagine a case in which v256 is supported
379          * but v128 is not, but check anyway.
380          */
381         if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece)
382             && (size % 32 == 0
383                 || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) {
384             return TCG_TYPE_V256;
385         }
386     }
387     if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
388         && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) {
389         return TCG_TYPE_V128;
390     }
391     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
392         && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
393         return TCG_TYPE_V64;
394     }
395     return 0;
396 }
397 
/* Store the replicated value T_VEC (of vector TYPE) to OPRSZ bytes at
   DOFS, then zero any remaining bytes up to MAXSZ.  */
static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, TCGv_vec t_vec)
{
    uint32_t i = 0;

    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        for (; i + 32 <= oprsz; i += 32) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
        }
        /* fallthru */
    case TCG_TYPE_V128:
        /* Store any remaining 16-byte tail left by the V256 loop.  */
        for (; i + 16 <= oprsz; i += 16) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
        }
        break;
    case TCG_TYPE_V64:
        for (; i < oprsz; i += 8) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
        }
        break;
    default:
        g_assert_not_reached();
    }

    /* Clear the tail between the operation size and the maximum size.  */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
432 
/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 *
 * Three strategies are tried in order: host vector stores, inline
 * integer stores, and finally an out-of-line helper call.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(NULL, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        /* Broadcast the value into a vector temp, then store it out.  */
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            tcg_gen_dupi_vec(vece, t_vec, in_c);
        }
        do_dup_store(type, dofs, oprsz, maxsz, t_vec);
        tcg_temp_free_vec(t_vec);
        return;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_const_i64(in_c);
            } else {
                t_32 = tcg_const_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_const_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
            tcg_temp_free_i64(t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else {
            /* Narrow the input to 32 bits for the 8/16/32-bit helpers.  */
            t_32 = tcg_temp_new_i32();
            if (in_64) {
                tcg_gen_extrl_i64_i32(t_32, in_64);
            } else if (vece == MO_8) {
                tcg_gen_movi_i32(t_32, in_c & 0xff);
            } else if (vece == MO_16) {
                tcg_gen_movi_i32(t_32, in_c & 0xffff);
            } else {
                tcg_gen_movi_i32(t_32, in_c);
            }
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    tcg_temp_free_i32(t_desc);
    return;

 done:
    /* Clear the tail between the operation size and the maximum size.  */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
581 
/* Likewise, but with zero.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    /* do_dup expands oprsz to maxsz when the constant is zero.  */
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}
587 
588 /* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
589 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
590                          void (*fni)(TCGv_i32, TCGv_i32))
591 {
592     TCGv_i32 t0 = tcg_temp_new_i32();
593     uint32_t i;
594 
595     for (i = 0; i < oprsz; i += 4) {
596         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
597         fni(t0, t0);
598         tcg_gen_st_i32(t0, cpu_env, dofs + i);
599     }
600     tcg_temp_free_i32(t0);
601 }
602 
603 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
604                           int32_t c, bool load_dest,
605                           void (*fni)(TCGv_i32, TCGv_i32, int32_t))
606 {
607     TCGv_i32 t0 = tcg_temp_new_i32();
608     TCGv_i32 t1 = tcg_temp_new_i32();
609     uint32_t i;
610 
611     for (i = 0; i < oprsz; i += 4) {
612         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
613         if (load_dest) {
614             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
615         }
616         fni(t1, t0, c);
617         tcg_gen_st_i32(t1, cpu_env, dofs + i);
618     }
619     tcg_temp_free_i32(t0);
620     tcg_temp_free_i32(t1);
621 }
622 
623 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
624                           TCGv_i32 c, bool scalar_first,
625                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
626 {
627     TCGv_i32 t0 = tcg_temp_new_i32();
628     TCGv_i32 t1 = tcg_temp_new_i32();
629     uint32_t i;
630 
631     for (i = 0; i < oprsz; i += 4) {
632         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
633         if (scalar_first) {
634             fni(t1, c, t0);
635         } else {
636             fni(t1, t0, c);
637         }
638         tcg_gen_st_i32(t1, cpu_env, dofs + i);
639     }
640     tcg_temp_free_i32(t0);
641     tcg_temp_free_i32(t1);
642 }
643 
644 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
645 static void expand_3_i32(uint32_t dofs, uint32_t aofs,
646                          uint32_t bofs, uint32_t oprsz, bool load_dest,
647                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
648 {
649     TCGv_i32 t0 = tcg_temp_new_i32();
650     TCGv_i32 t1 = tcg_temp_new_i32();
651     TCGv_i32 t2 = tcg_temp_new_i32();
652     uint32_t i;
653 
654     for (i = 0; i < oprsz; i += 4) {
655         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
656         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
657         if (load_dest) {
658             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
659         }
660         fni(t2, t0, t1);
661         tcg_gen_st_i32(t2, cpu_env, dofs + i);
662     }
663     tcg_temp_free_i32(t2);
664     tcg_temp_free_i32(t1);
665     tcg_temp_free_i32(t0);
666 }
667 
668 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
669                           uint32_t oprsz, int32_t c, bool load_dest,
670                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
671 {
672     TCGv_i32 t0 = tcg_temp_new_i32();
673     TCGv_i32 t1 = tcg_temp_new_i32();
674     TCGv_i32 t2 = tcg_temp_new_i32();
675     uint32_t i;
676 
677     for (i = 0; i < oprsz; i += 4) {
678         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
679         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
680         if (load_dest) {
681             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
682         }
683         fni(t2, t0, t1, c);
684         tcg_gen_st_i32(t2, cpu_env, dofs + i);
685     }
686     tcg_temp_free_i32(t0);
687     tcg_temp_free_i32(t1);
688     tcg_temp_free_i32(t2);
689 }
690 
691 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
692 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
693                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
694                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
695 {
696     TCGv_i32 t0 = tcg_temp_new_i32();
697     TCGv_i32 t1 = tcg_temp_new_i32();
698     TCGv_i32 t2 = tcg_temp_new_i32();
699     TCGv_i32 t3 = tcg_temp_new_i32();
700     uint32_t i;
701 
702     for (i = 0; i < oprsz; i += 4) {
703         tcg_gen_ld_i32(t1, cpu_env, aofs + i);
704         tcg_gen_ld_i32(t2, cpu_env, bofs + i);
705         tcg_gen_ld_i32(t3, cpu_env, cofs + i);
706         fni(t0, t1, t2, t3);
707         tcg_gen_st_i32(t0, cpu_env, dofs + i);
708         if (write_aofs) {
709             tcg_gen_st_i32(t1, cpu_env, aofs + i);
710         }
711     }
712     tcg_temp_free_i32(t3);
713     tcg_temp_free_i32(t2);
714     tcg_temp_free_i32(t1);
715     tcg_temp_free_i32(t0);
716 }
717 
718 /* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
719 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
720                          void (*fni)(TCGv_i64, TCGv_i64))
721 {
722     TCGv_i64 t0 = tcg_temp_new_i64();
723     uint32_t i;
724 
725     for (i = 0; i < oprsz; i += 8) {
726         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
727         fni(t0, t0);
728         tcg_gen_st_i64(t0, cpu_env, dofs + i);
729     }
730     tcg_temp_free_i64(t0);
731 }
732 
733 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
734                           int64_t c, bool load_dest,
735                           void (*fni)(TCGv_i64, TCGv_i64, int64_t))
736 {
737     TCGv_i64 t0 = tcg_temp_new_i64();
738     TCGv_i64 t1 = tcg_temp_new_i64();
739     uint32_t i;
740 
741     for (i = 0; i < oprsz; i += 8) {
742         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
743         if (load_dest) {
744             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
745         }
746         fni(t1, t0, c);
747         tcg_gen_st_i64(t1, cpu_env, dofs + i);
748     }
749     tcg_temp_free_i64(t0);
750     tcg_temp_free_i64(t1);
751 }
752 
753 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
754                           TCGv_i64 c, bool scalar_first,
755                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
756 {
757     TCGv_i64 t0 = tcg_temp_new_i64();
758     TCGv_i64 t1 = tcg_temp_new_i64();
759     uint32_t i;
760 
761     for (i = 0; i < oprsz; i += 8) {
762         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
763         if (scalar_first) {
764             fni(t1, c, t0);
765         } else {
766             fni(t1, t0, c);
767         }
768         tcg_gen_st_i64(t1, cpu_env, dofs + i);
769     }
770     tcg_temp_free_i64(t0);
771     tcg_temp_free_i64(t1);
772 }
773 
774 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
775 static void expand_3_i64(uint32_t dofs, uint32_t aofs,
776                          uint32_t bofs, uint32_t oprsz, bool load_dest,
777                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
778 {
779     TCGv_i64 t0 = tcg_temp_new_i64();
780     TCGv_i64 t1 = tcg_temp_new_i64();
781     TCGv_i64 t2 = tcg_temp_new_i64();
782     uint32_t i;
783 
784     for (i = 0; i < oprsz; i += 8) {
785         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
786         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
787         if (load_dest) {
788             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
789         }
790         fni(t2, t0, t1);
791         tcg_gen_st_i64(t2, cpu_env, dofs + i);
792     }
793     tcg_temp_free_i64(t2);
794     tcg_temp_free_i64(t1);
795     tcg_temp_free_i64(t0);
796 }
797 
798 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
799                           uint32_t oprsz, int64_t c, bool load_dest,
800                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
801 {
802     TCGv_i64 t0 = tcg_temp_new_i64();
803     TCGv_i64 t1 = tcg_temp_new_i64();
804     TCGv_i64 t2 = tcg_temp_new_i64();
805     uint32_t i;
806 
807     for (i = 0; i < oprsz; i += 8) {
808         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
809         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
810         if (load_dest) {
811             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
812         }
813         fni(t2, t0, t1, c);
814         tcg_gen_st_i64(t2, cpu_env, dofs + i);
815     }
816     tcg_temp_free_i64(t0);
817     tcg_temp_free_i64(t1);
818     tcg_temp_free_i64(t2);
819 }
820 
821 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
822 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
823                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
824                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
825 {
826     TCGv_i64 t0 = tcg_temp_new_i64();
827     TCGv_i64 t1 = tcg_temp_new_i64();
828     TCGv_i64 t2 = tcg_temp_new_i64();
829     TCGv_i64 t3 = tcg_temp_new_i64();
830     uint32_t i;
831 
832     for (i = 0; i < oprsz; i += 8) {
833         tcg_gen_ld_i64(t1, cpu_env, aofs + i);
834         tcg_gen_ld_i64(t2, cpu_env, bofs + i);
835         tcg_gen_ld_i64(t3, cpu_env, cofs + i);
836         fni(t0, t1, t2, t3);
837         tcg_gen_st_i64(t0, cpu_env, dofs + i);
838         if (write_aofs) {
839             tcg_gen_st_i64(t1, cpu_env, aofs + i);
840         }
841     }
842     tcg_temp_free_i64(t3);
843     tcg_temp_free_i64(t2);
844     tcg_temp_free_i64(t1);
845     tcg_temp_free_i64(t0);
846 }
847 
848 /* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
849 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
850                          uint32_t oprsz, uint32_t tysz, TCGType type,
851                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
852 {
853     TCGv_vec t0 = tcg_temp_new_vec(type);
854     uint32_t i;
855 
856     for (i = 0; i < oprsz; i += tysz) {
857         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
858         fni(vece, t0, t0);
859         tcg_gen_st_vec(t0, cpu_env, dofs + i);
860     }
861     tcg_temp_free_vec(t0);
862 }
863 
864 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
865    using host vectors.  */
866 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
867                           uint32_t oprsz, uint32_t tysz, TCGType type,
868                           int64_t c, bool load_dest,
869                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
870 {
871     TCGv_vec t0 = tcg_temp_new_vec(type);
872     TCGv_vec t1 = tcg_temp_new_vec(type);
873     uint32_t i;
874 
875     for (i = 0; i < oprsz; i += tysz) {
876         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
877         if (load_dest) {
878             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
879         }
880         fni(vece, t1, t0, c);
881         tcg_gen_st_vec(t1, cpu_env, dofs + i);
882     }
883     tcg_temp_free_vec(t0);
884     tcg_temp_free_vec(t1);
885 }
886 
887 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
888                           uint32_t oprsz, uint32_t tysz, TCGType type,
889                           TCGv_vec c, bool scalar_first,
890                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
891 {
892     TCGv_vec t0 = tcg_temp_new_vec(type);
893     TCGv_vec t1 = tcg_temp_new_vec(type);
894     uint32_t i;
895 
896     for (i = 0; i < oprsz; i += tysz) {
897         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
898         if (scalar_first) {
899             fni(vece, t1, c, t0);
900         } else {
901             fni(vece, t1, t0, c);
902         }
903         tcg_gen_st_vec(t1, cpu_env, dofs + i);
904     }
905     tcg_temp_free_vec(t0);
906     tcg_temp_free_vec(t1);
907 }
908 
909 /* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
910 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
911                          uint32_t bofs, uint32_t oprsz,
912                          uint32_t tysz, TCGType type, bool load_dest,
913                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
914 {
915     TCGv_vec t0 = tcg_temp_new_vec(type);
916     TCGv_vec t1 = tcg_temp_new_vec(type);
917     TCGv_vec t2 = tcg_temp_new_vec(type);
918     uint32_t i;
919 
920     for (i = 0; i < oprsz; i += tysz) {
921         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
922         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
923         if (load_dest) {
924             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
925         }
926         fni(vece, t2, t0, t1);
927         tcg_gen_st_vec(t2, cpu_env, dofs + i);
928     }
929     tcg_temp_free_vec(t2);
930     tcg_temp_free_vec(t1);
931     tcg_temp_free_vec(t0);
932 }
933 
934 /*
935  * Expand OPSZ bytes worth of three-vector operands and an immediate operand
936  * using host vectors.
937  */
938 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
939                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
940                           TCGType type, int64_t c, bool load_dest,
941                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
942                                       int64_t))
943 {
944     TCGv_vec t0 = tcg_temp_new_vec(type);
945     TCGv_vec t1 = tcg_temp_new_vec(type);
946     TCGv_vec t2 = tcg_temp_new_vec(type);
947     uint32_t i;
948 
949     for (i = 0; i < oprsz; i += tysz) {
950         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
951         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
952         if (load_dest) {
953             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
954         }
955         fni(vece, t2, t0, t1, c);
956         tcg_gen_st_vec(t2, cpu_env, dofs + i);
957     }
958     tcg_temp_free_vec(t0);
959     tcg_temp_free_vec(t1);
960     tcg_temp_free_vec(t2);
961 }
962 
963 /* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
964 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
965                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
966                          uint32_t tysz, TCGType type, bool write_aofs,
967                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
968                                      TCGv_vec, TCGv_vec))
969 {
970     TCGv_vec t0 = tcg_temp_new_vec(type);
971     TCGv_vec t1 = tcg_temp_new_vec(type);
972     TCGv_vec t2 = tcg_temp_new_vec(type);
973     TCGv_vec t3 = tcg_temp_new_vec(type);
974     uint32_t i;
975 
976     for (i = 0; i < oprsz; i += tysz) {
977         tcg_gen_ld_vec(t1, cpu_env, aofs + i);
978         tcg_gen_ld_vec(t2, cpu_env, bofs + i);
979         tcg_gen_ld_vec(t3, cpu_env, cofs + i);
980         fni(vece, t0, t1, t2, t3);
981         tcg_gen_st_vec(t0, cpu_env, dofs + i);
982         if (write_aofs) {
983             tcg_gen_st_vec(t1, cpu_env, aofs + i);
984         }
985     }
986     tcg_temp_free_vec(t3);
987     tcg_temp_free_vec(t2);
988     tcg_temp_free_vec(t1);
989     tcg_temp_free_vec(t0);
990 }
991 
/* Expand a vector two-operand operation.  */
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    /* Publish the opcode list this expander may use so that debug builds
       can verify each emitted vector op; restore the caller's on exit.  */
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    /* type == 0 means no usable host vector type; fall back to scalars.  */
    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            break;
        }
        /* Advance past the 32-byte chunks and finish with V128 below.  */
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2_i64(dofs, aofs, oprsz, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2_i32(dofs, aofs, oprsz, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
            /* Suppress the trailing expand_clr; the out-of-line helper
               presumably handles the tail itself.  */
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    /* Zero the tail between the operation size and the register size.  */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
1052 
/* Expand a vector operation with two vectors and an immediate.  */
void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
{
    /* Publish the opcode list this expander may use for debug
       verification; restore the caller's list on exit.  */
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    /* type == 0 means no usable host vector type; fall back to scalars.  */
    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        /* Advance past the 32-byte chunks and finish with V128 below.  */
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            if (g->fno) {
                /* Immediate passed through the descriptor's data slot.  */
                tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
            } else {
                /* Helper wants the immediate as a runtime operand.  */
                TCGv_i64 tcg_c = tcg_const_i64(c);
                tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
                                    maxsz, c, g->fnoi);
                tcg_temp_free_i64(tcg_c);
            }
            /* Suppress the trailing expand_clr; the out-of-line helper
               presumably handles the tail itself.  */
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    /* Zero the tail between the operation size and the register size.  */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
1122 
/* Expand a vector operation with two vectors and a scalar.  */
void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
{
    TCGType type;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    /* type == 0 means no usable host vector type; fall back to scalars.  */
    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    if (type != 0) {
        /* Publish the opcode list this expander may use for debug
           verification; restore the caller's list when done.  */
        const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
        const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
        TCGv_vec t_vec = tcg_temp_new_vec(type);
        uint32_t some;

        /* Replicate the scalar across all lanes of a vector temp.  */
        tcg_gen_dup_i64_vec(g->vece, t_vec, c);

        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16.  The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                          t_vec, g->scalar_first, g->fniv);
            if (some == oprsz) {
                break;
            }
            /* Advance past the 32-byte chunks; finish with V128 below.  */
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */

        case TCG_TYPE_V128:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          t_vec, g->scalar_first, g->fniv);
            break;

        case TCG_TYPE_V64:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          t_vec, g->scalar_first, g->fniv);
            break;

        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(t_vec);
        tcg_swap_vecop_list(hold_list);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        TCGv_i64 t64 = tcg_temp_new_i64();

        /* Broadcast the scalar within a 64-bit temp for the SWAR path.  */
        gen_dup_i64(g->vece, t64, c);
        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
        tcg_temp_free_i64(t64);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        TCGv_i32 t32 = tcg_temp_new_i32();

        tcg_gen_extrl_i64_i32(t32, c);
        gen_dup_i32(g->vece, t32, t32);
        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
        tcg_temp_free_i32(t32);
    } else {
        /* Out-of-line helper; the early return skips the expand_clr
           below, leaving tail handling to the helper.  */
        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
        return;
    }

    /* Zero the tail between the operation size and the register size.  */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
1199 
/* Expand a vector three-operand operation.  */
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    /* Publish the opcode list this expander may use for debug
       verification; restore the caller's list on exit.  */
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    /* type == 0 means no usable host vector type; fall back to scalars.  */
    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        /* Advance past the 32-byte chunks and finish with V128 below.  */
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
                               maxsz, g->data, g->fno);
            /* Suppress the trailing expand_clr; the out-of-line helper
               presumably handles the tail itself.  */
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    /* Zero the tail between the operation size and the register size.  */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
1265 
/* Expand a vector operation with three vectors and an immediate.  */
void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                     uint32_t oprsz, uint32_t maxsz, int64_t c,
                     const GVecGen3i *g)
{
    /* Publish the opcode list this expander may use for debug
       verification; restore the caller's list on exit.  */
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    /* type == 0 means no usable host vector type; fall back to scalars.  */
    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        /* Advance past the 32-byte chunks and finish with V128 below.  */
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            /* Immediate passed through the descriptor's data slot.  */
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
            /* Suppress the trailing expand_clr; the out-of-line helper
               presumably handles the tail itself.  */
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    /* Zero the tail between the operation size and the register size.  */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
1332 
/* Expand a vector four-operand operation.  */
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
    /* Publish the opcode list this expander may use for debug
       verification; restore the caller's list on exit.  */
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    /* type == 0 means no usable host vector type; fall back to scalars.  */
    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->write_aofs, g->fniv);
        if (some == oprsz) {
            break;
        }
        /* Advance past the 32-byte chunks and finish with V128 below.  */
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->write_aofs, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->write_aofs, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                               oprsz, maxsz, g->data, g->fno);
            /* Suppress the trailing expand_clr; the out-of-line helper
               presumably handles the tail itself.  */
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    /* Zero the tail between the operation size and the register size.  */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
1401 
1402 /*
1403  * Expand specific vector operations.
1404  */
1405 
/* GVecGen2.fniv callback for a plain move; the element size VECE is
   irrelevant to a bitwise copy but required by the callback signature.  */
static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}
1410 
/* Copy OPRSZ bytes from AOFS to DOFS, zeroing the tail up to MAXSZ.  */
void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_mov_i64,
        .fniv = vec_mov2,
        .fno = gen_helper_gvec_mov,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    if (dofs != aofs) {
        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
    } else {
        /* Moving a register onto itself is a no-op; only the tail
           between oprsz and maxsz still needs to be cleared.  */
        check_size_align(oprsz, maxsz, dofs);
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}
1429 
/* Duplicate the 32-bit variable IN across the vector at DOFS.  */
void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}
1437 
/* Duplicate the 64-bit variable IN across the vector at DOFS.  */
void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}
1445 
1446 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1447                           uint32_t oprsz, uint32_t maxsz)
1448 {
1449     if (vece <= MO_64) {
1450         TCGType type = choose_vector_type(0, vece, oprsz, 0);
1451         if (type != 0) {
1452             TCGv_vec t_vec = tcg_temp_new_vec(type);
1453             tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
1454             do_dup_store(type, dofs, oprsz, maxsz, t_vec);
1455             tcg_temp_free_vec(t_vec);
1456             return;
1457         }
1458     }
1459     if (vece <= MO_32) {
1460         TCGv_i32 in = tcg_temp_new_i32();
1461         switch (vece) {
1462         case MO_8:
1463             tcg_gen_ld8u_i32(in, cpu_env, aofs);
1464             break;
1465         case MO_16:
1466             tcg_gen_ld16u_i32(in, cpu_env, aofs);
1467             break;
1468         case MO_32:
1469             tcg_gen_ld_i32(in, cpu_env, aofs);
1470             break;
1471         }
1472         tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in);
1473         tcg_temp_free_i32(in);
1474     } else if (vece == MO_64) {
1475         TCGv_i64 in = tcg_temp_new_i64();
1476         tcg_gen_ld_i64(in, cpu_env, aofs);
1477         tcg_gen_gvec_dup_i64(MO_64, dofs, oprsz, maxsz, in);
1478         tcg_temp_free_i64(in);
1479     } else {
1480         /* 128-bit duplicate.  */
1481         /* ??? Dup to 256-bit vector.  */
1482         int i;
1483 
1484         tcg_debug_assert(vece == 4);
1485         tcg_debug_assert(oprsz >= 16);
1486         if (TCG_TARGET_HAS_v128) {
1487             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1488 
1489             tcg_gen_ld_vec(in, cpu_env, aofs);
1490             for (i = 0; i < oprsz; i += 16) {
1491                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1492             }
1493             tcg_temp_free_vec(in);
1494         } else {
1495             TCGv_i64 in0 = tcg_temp_new_i64();
1496             TCGv_i64 in1 = tcg_temp_new_i64();
1497 
1498             tcg_gen_ld_i64(in0, cpu_env, aofs);
1499             tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
1500             for (i = 0; i < oprsz; i += 16) {
1501                 tcg_gen_st_i64(in0, cpu_env, dofs + i);
1502                 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
1503             }
1504             tcg_temp_free_i64(in0);
1505             tcg_temp_free_i64(in1);
1506         }
1507     }
1508 }
1509 
/* Duplicate the 64-bit constant X across the vector at DOFS.  */
void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
}
1516 
/* Duplicate the 32-bit constant X across the vector at DOFS.  */
void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint32_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
}
1523 
/* Duplicate the 16-bit constant X across the vector at DOFS.  */
void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint16_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
}
1530 
/* Duplicate the 8-bit constant X across the vector at DOFS.  */
void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint8_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
}
1537 
/* Bitwise NOT of the vector at AOFS into DOFS; element size is
   irrelevant for a bitwise operation, so no per-vece table is needed.  */
void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_not_i64,
        .fniv = tcg_gen_not_vec,
        .fno = gen_helper_gvec_not,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
}
1549 
/* Perform a vector addition using normal addition and a mask.  The mask
   should be the sign bit of each lane.  This 6-operation form is more
   efficient than separate additions when there are 4 or more lanes in
   the 64-bit operation.  */
static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    /* Clear each lane's sign bit so the add below cannot carry across
       a lane boundary; the true sign bit of each lane is recovered
       afterwards from (a ^ b) & m.  */
    tcg_gen_andc_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    /* t3 must be computed before d is written: d may alias a or b.  */
    tcg_gen_xor_i64(t3, a, b);
    tcg_gen_add_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}
1571 
/* Eight lanes of 8-bit addition within a 64-bit register.  */
void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}
1578 
/* Four lanes of 16-bit addition within a 64-bit register.  */
void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}
1585 
/* Two lanes of 32-bit addition within a 64-bit register, without
   letting a carry cross the lane boundary.  */
void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    /* t1 = a with its low lane zeroed: adding b then cannot carry into
       the high lane, so t1's high half is the correct high result.  */
    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
    /* t2 = full 64-bit sum: its low half is the correct low result.  */
    tcg_gen_add_i64(t2, a, b);
    tcg_gen_add_i64(t1, t1, b);
    /* Combine: high half from t1, low half from t2.  */
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}
1599 
/* Vector opcodes required by the add expanders below (debug-checked).  */
static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1601 
/* Element-wise vector addition: D = A + B, lane size 1 << vece bytes.  */
void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    /* Expansion table indexed by element size; the 8/16-bit entries use
       the masked 64-bit SWAR fallback when host vectors are absent.  */
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add8,
          .opt_opc = vecop_list_add,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add16,
          .opt_opc = vecop_list_add,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add32,
          .opt_opc = vecop_list_add,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add64,
          .opt_opc = vecop_list_add,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
1632 
/* Vector-plus-scalar addition: D = A + dup(C), lane size 1 << vece.  */
void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    /* Expansion table indexed by element size; the 8/16-bit entries use
       the masked 64-bit SWAR fallback when host vectors are absent.  */
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds8,
          .opt_opc = vecop_list_add,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds16,
          .opt_opc = vecop_list_add,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds32,
          .opt_opc = vecop_list_add,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds64,
          .opt_opc = vecop_list_add,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}
1663 
/* Vector-plus-immediate addition: wrap the constant in a temp and
   defer to tcg_gen_gvec_adds.  */
void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(c);
    tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i64(tmp);
}
1671 
/* Vector opcodes required by the sub expanders below (debug-checked).  */
static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
1673 
/* Vector-minus-scalar subtraction: D = A - dup(C), lane size 1 << vece.  */
void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    /* Expansion table indexed by element size; the 8/16-bit entries use
       the masked 64-bit SWAR fallback when host vectors are absent.  */
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs8,
          .opt_opc = vecop_list_sub,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs16,
          .opt_opc = vecop_list_sub,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs32,
          .opt_opc = vecop_list_sub,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs64,
          .opt_opc = vecop_list_sub,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}
1704 
/* Perform a vector subtraction using normal subtraction and a mask.
   Compare gen_addv_mask above.  */
static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    /* Force each lane's sign bit set in A and clear in B so the sub
       below cannot borrow across a lane boundary; the true sign bit is
       recovered afterwards from ~(a ^ b) & m.  */
    tcg_gen_or_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    /* t3 must be computed before d is written: d may alias a or b.  */
    tcg_gen_eqv_i64(t3, a, b);
    tcg_gen_sub_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}
1724 
/* Eight lanes of 8-bit subtraction within a 64-bit register.  */
void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}
1731 
/* Four lanes of 16-bit subtraction within a 64-bit register.  */
void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}
1738 
/* Two lanes of 32-bit subtraction within a 64-bit register, without
   letting a borrow cross the lane boundary.  */
void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    /* t1 = a - (b with low lane zeroed): subtracting a zero low lane
       cannot borrow, so t1's high half is the correct high result.  */
    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    /* t2 = full 64-bit difference: its low half is the correct low result.  */
    tcg_gen_sub_i64(t2, a, b);
    tcg_gen_sub_i64(t1, a, t1);
    /* Combine: high half from t1, low half from t2.  */
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}
1752 
/* Element-wise vector subtraction: D = A - B, lane size 1 << vece bytes.  */
void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    /* Expansion table indexed by element size; the 8/16-bit entries use
       the masked 64-bit SWAR fallback when host vectors are absent.  */
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub8,
          .opt_opc = vecop_list_sub,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub16,
          .opt_opc = vecop_list_sub,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub32,
          .opt_opc = vecop_list_sub,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub64,
          .opt_opc = vecop_list_sub,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
1783 
1784 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
1785 
/* Expand an element-wise multiply, d = a * b.  Note the MO_8 and
   MO_16 rows provide no inline scalar expansion (.fni8/.fni4): when
   the host lacks mul_vec they fall back to the helper.  */
void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul8,
          .opt_opc = vecop_list_mul,
          .vece = MO_8 },
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul16,
          .opt_opc = vecop_list_mul,
          .vece = MO_16 },
        { .fni4 = tcg_gen_mul_i32,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul32,
          .opt_opc = vecop_list_mul,
          .vece = MO_32 },
        { .fni8 = tcg_gen_mul_i64,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul64,
          .opt_opc = vecop_list_mul,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
1814 
/* Expand a vector-by-scalar multiply, d = a * c, with c broadcast to
   every element by tcg_gen_gvec_2s.  */
void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls8,
          .opt_opc = vecop_list_mul,
          .vece = MO_8 },
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls16,
          .opt_opc = vecop_list_mul,
          .vece = MO_16 },
        { .fni4 = tcg_gen_mul_i32,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls32,
          .opt_opc = vecop_list_mul,
          .vece = MO_32 },
        { .fni8 = tcg_gen_mul_i64,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls64,
          .opt_opc = vecop_list_mul,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}
1843 
1844 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
1845                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1846 {
1847     TCGv_i64 tmp = tcg_const_i64(c);
1848     tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
1849     tcg_temp_free_i64(tmp);
1850 }
1851 
/* Expand signed saturating addition.  There is no inline integer
   expansion; each element size uses ssadd_vec or the helper.  */
void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
1877 
/* Expand signed saturating subtraction.  As with ssadd, only the
   vector op or the out-of-line helper is available.  */
void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
1903 
1904 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1905 {
1906     TCGv_i32 max = tcg_const_i32(-1);
1907     tcg_gen_add_i32(d, a, b);
1908     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
1909     tcg_temp_free_i32(max);
1910 }
1911 
1912 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1913 {
1914     TCGv_i64 max = tcg_const_i64(-1);
1915     tcg_gen_add_i64(d, a, b);
1916     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
1917     tcg_temp_free_i64(max);
1918 }
1919 
/* Expand unsigned saturating addition.  The 32/64-bit rows also have
   inline integer fallbacks (add + movcond clamp, above).  */
void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_usadd_vec,
          .fno = gen_helper_gvec_usadd8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_usadd_vec,
          .fno = gen_helper_gvec_usadd16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_usadd_i32,
          .fniv = tcg_gen_usadd_vec,
          .fno = gen_helper_gvec_usadd32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_usadd_i64,
          .fniv = tcg_gen_usadd_vec,
          .fno = gen_helper_gvec_usadd64,
          .opt_opc = vecop_list,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
1947 
1948 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1949 {
1950     TCGv_i32 min = tcg_const_i32(0);
1951     tcg_gen_sub_i32(d, a, b);
1952     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
1953     tcg_temp_free_i32(min);
1954 }
1955 
1956 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1957 {
1958     TCGv_i64 min = tcg_const_i64(0);
1959     tcg_gen_sub_i64(d, a, b);
1960     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
1961     tcg_temp_free_i64(min);
1962 }
1963 
/* Expand unsigned saturating subtraction.  The 32/64-bit rows also
   have inline integer fallbacks (sub + movcond clamp, above).  */
void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_ussub_vec,
          .fno = gen_helper_gvec_ussub8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_ussub_vec,
          .fno = gen_helper_gvec_ussub16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_ussub_i32,
          .fniv = tcg_gen_ussub_vec,
          .fno = gen_helper_gvec_ussub32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_ussub_i64,
          .fniv = tcg_gen_ussub_vec,
          .fno = gen_helper_gvec_ussub64,
          .opt_opc = vecop_list,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
1991 
/* Expand element-wise signed minimum.  */
void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_smin_vec,
          .fno = gen_helper_gvec_smin8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_smin_vec,
          .fno = gen_helper_gvec_smin16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_smin_i32,
          .fniv = tcg_gen_smin_vec,
          .fno = gen_helper_gvec_smin32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_smin_i64,
          .fniv = tcg_gen_smin_vec,
          .fno = gen_helper_gvec_smin64,
          .opt_opc = vecop_list,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
2019 
/* Expand element-wise unsigned minimum.  */
void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_umin_vec,
          .fno = gen_helper_gvec_umin8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_umin_vec,
          .fno = gen_helper_gvec_umin16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_umin_i32,
          .fniv = tcg_gen_umin_vec,
          .fno = gen_helper_gvec_umin32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_umin_i64,
          .fniv = tcg_gen_umin_vec,
          .fno = gen_helper_gvec_umin64,
          .opt_opc = vecop_list,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
2047 
/* Expand element-wise signed maximum.  */
void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_smax_vec,
          .fno = gen_helper_gvec_smax8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_smax_vec,
          .fno = gen_helper_gvec_smax16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_smax_i32,
          .fniv = tcg_gen_smax_vec,
          .fno = gen_helper_gvec_smax32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_smax_i64,
          .fniv = tcg_gen_smax_vec,
          .fno = gen_helper_gvec_smax64,
          .opt_opc = vecop_list,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
2075 
/* Expand element-wise unsigned maximum.  */
void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_umax_vec,
          .fno = gen_helper_gvec_umax8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_umax_vec,
          .fno = gen_helper_gvec_umax16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_umax_i32,
          .fniv = tcg_gen_umax_vec,
          .fno = gen_helper_gvec_umax32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_umax_i64,
          .fniv = tcg_gen_umax_vec,
          .fno = gen_helper_gvec_umax64,
          .opt_opc = vecop_list,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
2103 
/* Perform a vector negation using normal negation and a mask.
   Compare gen_subv_mask above.  M holds the msb of each element
   (see the callers below); subtracting the msb-stripped B from M
   cannot borrow across element boundaries, and the element sign
   bits are then corrected with xor.  */
static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t3, m, b);        /* msb positions where b's msb is clear */
    tcg_gen_andc_i64(t2, b, m);        /* b with per-element msbs stripped */
    tcg_gen_sub_i64(d, m, t2);         /* borrow-free per-element subtract */
    tcg_gen_xor_i64(d, d, t3);         /* restore the correct sign bits */

    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}
2119 
2120 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2121 {
2122     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
2123     gen_negv_mask(d, b, m);
2124     tcg_temp_free_i64(m);
2125 }
2126 
2127 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2128 {
2129     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
2130     gen_negv_mask(d, b, m);
2131     tcg_temp_free_i64(m);
2132 }
2133 
/* Per-word negation within a 64-bit scalar.  */
void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    /* Negate the two 32-bit lanes independently: clearing the low half
       first means negating t1 cannot borrow into the high lane; t2
       holds the correct low 32 bits; deposit merges them.  */
    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    tcg_gen_neg_i64(t2, b);
    tcg_gen_neg_i64(t1, t1);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}
2147 
/* Expand element-wise negation, d = -a.  */
void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
    static const GVecGen2 g[4] = {
        { .fni8 = tcg_gen_vec_neg8_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_neg16_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_neg_i32,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_neg_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg64,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
}
2179 
2180 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2181                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2182 {
2183     static const GVecGen3 g = {
2184         .fni8 = tcg_gen_and_i64,
2185         .fniv = tcg_gen_and_vec,
2186         .fno = gen_helper_gvec_and,
2187         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2188     };
2189 
2190     if (aofs == bofs) {
2191         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2192     } else {
2193         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2194     }
2195 }
2196 
2197 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2198                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2199 {
2200     static const GVecGen3 g = {
2201         .fni8 = tcg_gen_or_i64,
2202         .fniv = tcg_gen_or_vec,
2203         .fno = gen_helper_gvec_or,
2204         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2205     };
2206 
2207     if (aofs == bofs) {
2208         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2209     } else {
2210         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2211     }
2212 }
2213 
2214 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2215                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2216 {
2217     static const GVecGen3 g = {
2218         .fni8 = tcg_gen_xor_i64,
2219         .fniv = tcg_gen_xor_vec,
2220         .fno = gen_helper_gvec_xor,
2221         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2222     };
2223 
2224     if (aofs == bofs) {
2225         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
2226     } else {
2227         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2228     }
2229 }
2230 
2231 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2232                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2233 {
2234     static const GVecGen3 g = {
2235         .fni8 = tcg_gen_andc_i64,
2236         .fniv = tcg_gen_andc_vec,
2237         .fno = gen_helper_gvec_andc,
2238         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2239     };
2240 
2241     if (aofs == bofs) {
2242         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
2243     } else {
2244         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2245     }
2246 }
2247 
2248 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2249                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2250 {
2251     static const GVecGen3 g = {
2252         .fni8 = tcg_gen_orc_i64,
2253         .fniv = tcg_gen_orc_vec,
2254         .fno = gen_helper_gvec_orc,
2255         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2256     };
2257 
2258     if (aofs == bofs) {
2259         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
2260     } else {
2261         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2262     }
2263 }
2264 
2265 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2266                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2267 {
2268     static const GVecGen3 g = {
2269         .fni8 = tcg_gen_nand_i64,
2270         .fniv = tcg_gen_nand_vec,
2271         .fno = gen_helper_gvec_nand,
2272         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2273     };
2274 
2275     if (aofs == bofs) {
2276         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2277     } else {
2278         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2279     }
2280 }
2281 
2282 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2283                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2284 {
2285     static const GVecGen3 g = {
2286         .fni8 = tcg_gen_nor_i64,
2287         .fniv = tcg_gen_nor_vec,
2288         .fno = gen_helper_gvec_nor,
2289         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2290     };
2291 
2292     if (aofs == bofs) {
2293         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2294     } else {
2295         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2296     }
2297 }
2298 
2299 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2300                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2301 {
2302     static const GVecGen3 g = {
2303         .fni8 = tcg_gen_eqv_i64,
2304         .fniv = tcg_gen_eqv_vec,
2305         .fno = gen_helper_gvec_eqv,
2306         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2307     };
2308 
2309     if (aofs == bofs) {
2310         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
2311     } else {
2312         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2313     }
2314 }
2315 
/* Recipe for vector-AND-scalar; the scalar is replicated to 64 bits
   by the callers before expansion, hence the fixed .vece = MO_64.  */
static const GVecGen2s gop_ands = {
    .fni8 = tcg_gen_and_i64,
    .fniv = tcg_gen_and_vec,
    .fno = gen_helper_gvec_ands,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};
2323 
2324 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2325                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2326 {
2327     TCGv_i64 tmp = tcg_temp_new_i64();
2328     gen_dup_i64(vece, tmp, c);
2329     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2330     tcg_temp_free_i64(tmp);
2331 }
2332 
2333 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2334                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2335 {
2336     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2337     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2338     tcg_temp_free_i64(tmp);
2339 }
2340 
/* Recipe for vector-XOR-scalar; scalar pre-replicated to 64 bits.  */
static const GVecGen2s gop_xors = {
    .fni8 = tcg_gen_xor_i64,
    .fniv = tcg_gen_xor_vec,
    .fno = gen_helper_gvec_xors,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};
2348 
2349 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2350                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2351 {
2352     TCGv_i64 tmp = tcg_temp_new_i64();
2353     gen_dup_i64(vece, tmp, c);
2354     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2355     tcg_temp_free_i64(tmp);
2356 }
2357 
2358 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2359                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2360 {
2361     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2362     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2363     tcg_temp_free_i64(tmp);
2364 }
2365 
/* Recipe for vector-OR-scalar; scalar pre-replicated to 64 bits.  */
static const GVecGen2s gop_ors = {
    .fni8 = tcg_gen_or_i64,
    .fniv = tcg_gen_or_vec,
    .fno = gen_helper_gvec_ors,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};
2373 
2374 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2375                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2376 {
2377     TCGv_i64 tmp = tcg_temp_new_i64();
2378     gen_dup_i64(vece, tmp, c);
2379     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2380     tcg_temp_free_i64(tmp);
2381 }
2382 
2383 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2384                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2385 {
2386     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2387     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2388     tcg_temp_free_i64(tmp);
2389 }
2390 
2391 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2392 {
2393     uint64_t mask = dup_const(MO_8, 0xff << c);
2394     tcg_gen_shli_i64(d, a, c);
2395     tcg_gen_andi_i64(d, d, mask);
2396 }
2397 
2398 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2399 {
2400     uint64_t mask = dup_const(MO_16, 0xffff << c);
2401     tcg_gen_shli_i64(d, a, c);
2402     tcg_gen_andi_i64(d, d, mask);
2403 }
2404 
/* Expand per-element left shift by immediate.  SHIFT must lie in
   [0, element bits); a zero shift reduces to a move.  */
void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_shl8i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl8i,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_shl16i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl16i,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shli_i32,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl32i,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shli_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl64i,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}
2441 
2442 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2443 {
2444     uint64_t mask = dup_const(MO_8, 0xff >> c);
2445     tcg_gen_shri_i64(d, a, c);
2446     tcg_gen_andi_i64(d, d, mask);
2447 }
2448 
2449 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2450 {
2451     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2452     tcg_gen_shri_i64(d, a, c);
2453     tcg_gen_andi_i64(d, d, mask);
2454 }
2455 
/* Expand per-element logical right shift by immediate.  SHIFT must
   lie in [0, element bits); a zero shift reduces to a move.  */
void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_shr8i_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr8i,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_shr16i_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr16i,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shri_i32,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr32i,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shri_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr64i,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}
2492 
/* Per-byte arithmetic right shift by immediate within a 64-bit
   scalar: do a logical shift, then manufacture each lane's sign
   extension from its (shifted) sign bit.  */
void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
    uint64_t c_mask = dup_const(MO_8, 0xff >> c);
    TCGv_i64 s = tcg_temp_new_i64();

    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
    tcg_gen_or_i64(d, d, s);         /* include sign extension */
    tcg_temp_free_i64(s);
}
2506 
/* Per-halfword arithmetic right shift by immediate within a 64-bit
   scalar; same scheme as tcg_gen_vec_sar8i_i64.  */
void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
    uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
    TCGv_i64 s = tcg_temp_new_i64();

    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
    tcg_gen_or_i64(d, d, s);         /* include sign extension */
    tcg_temp_free_i64(s);
}
2520 
/* Expand per-element arithmetic right shift by immediate.  SHIFT
   must lie in [0, element bits); a zero shift reduces to a move.  */
void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_sar8i_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar8i,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sar16i_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar16i,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sari_i32,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar32i,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sari_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar64i,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}
2557 
2558 /*
2559  * Expand D = A << (B % element bits)
2560  *
2561  * Unlike scalar shifts, where it is easy for the target front end
2562  * to include the modulo as part of the expansion.  If the target
2563  * naturally includes the modulo as part of the operation, great!
2564  * If the target has some other behaviour from out-of-range shifts,
2565  * then it could not use this function anyway, and would need to
2566  * do it's own expansion with custom functions.
2567  */
2568 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
2569                                  TCGv_vec a, TCGv_vec b)
2570 {
2571     TCGv_vec t = tcg_temp_new_vec_matching(d);
2572 
2573     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
2574     tcg_gen_and_vec(vece, t, t, b);
2575     tcg_gen_shlv_vec(vece, d, a, t);
2576     tcg_temp_free_vec(t);
2577 }
2578 
2579 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2580 {
2581     TCGv_i32 t = tcg_temp_new_i32();
2582 
2583     tcg_gen_andi_i32(t, b, 31);
2584     tcg_gen_shl_i32(d, a, t);
2585     tcg_temp_free_i32(t);
2586 }
2587 
2588 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2589 {
2590     TCGv_i64 t = tcg_temp_new_i64();
2591 
2592     tcg_gen_andi_i64(t, b, 63);
2593     tcg_gen_shl_i64(d, a, t);
2594     tcg_temp_free_i64(t);
2595 }
2596 
/* Expand per-element left shift by a per-element variable count,
   taken modulo the element width (see comment above).  */
void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl8v,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl16v,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shl_mod_i32,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl32v,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shl_mod_i64,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl64v,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
2626 
2627 /*
2628  * Similarly for logical right shifts.
2629  */
2630 
2631 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
2632                                  TCGv_vec a, TCGv_vec b)
2633 {
2634     TCGv_vec t = tcg_temp_new_vec_matching(d);
2635 
2636     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
2637     tcg_gen_and_vec(vece, t, t, b);
2638     tcg_gen_shrv_vec(vece, d, a, t);
2639     tcg_temp_free_vec(t);
2640 }
2641 
2642 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2643 {
2644     TCGv_i32 t = tcg_temp_new_i32();
2645 
2646     tcg_gen_andi_i32(t, b, 31);
2647     tcg_gen_shr_i32(d, a, t);
2648     tcg_temp_free_i32(t);
2649 }
2650 
2651 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2652 {
2653     TCGv_i64 t = tcg_temp_new_i64();
2654 
2655     tcg_gen_andi_i64(t, b, 63);
2656     tcg_gen_shr_i64(d, a, t);
2657     tcg_temp_free_i64(t);
2658 }
2659 
/*
 * Expand a logical shift-right-by-vector operation: each element of D
 * receives the matching element of A shifted right (zero-filling) by
 * the matching element of B, with the shift count reduced modulo the
 * element width in bits.
 */
void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
    /* Expansion table indexed by element size: vector implementation,
       out-of-line helper, and (for 32/64-bit elements) an inline
       integer fallback.  */
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_shrv_mod_vec,
          .fno = gen_helper_gvec_shr8v,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_shrv_mod_vec,
          .fno = gen_helper_gvec_shr16v,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shr_mod_i32,
          .fniv = tcg_gen_shrv_mod_vec,
          .fno = gen_helper_gvec_shr32v,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shr_mod_i64,
          .fniv = tcg_gen_shrv_mod_vec,
          .fno = gen_helper_gvec_shr64v,
          .opt_opc = vecop_list,
          /* 64-bit integer ops are preferable on a 64-bit host. */
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
2689 
2690 /*
2691  * Similarly for arithmetic right shifts.
2692  */
2693 
2694 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
2695                                  TCGv_vec a, TCGv_vec b)
2696 {
2697     TCGv_vec t = tcg_temp_new_vec_matching(d);
2698 
2699     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
2700     tcg_gen_and_vec(vece, t, t, b);
2701     tcg_gen_sarv_vec(vece, d, a, t);
2702     tcg_temp_free_vec(t);
2703 }
2704 
2705 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2706 {
2707     TCGv_i32 t = tcg_temp_new_i32();
2708 
2709     tcg_gen_andi_i32(t, b, 31);
2710     tcg_gen_sar_i32(d, a, t);
2711     tcg_temp_free_i32(t);
2712 }
2713 
2714 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2715 {
2716     TCGv_i64 t = tcg_temp_new_i64();
2717 
2718     tcg_gen_andi_i64(t, b, 63);
2719     tcg_gen_sar_i64(d, a, t);
2720     tcg_temp_free_i64(t);
2721 }
2722 
/*
 * Expand an arithmetic shift-right-by-vector operation: each element
 * of D receives the matching element of A shifted right with sign
 * propagation by the matching element of B, with the shift count
 * reduced modulo the element width in bits.
 */
void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
    /* Expansion table indexed by element size: vector implementation,
       out-of-line helper, and (for 32/64-bit elements) an inline
       integer fallback.  */
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_sarv_mod_vec,
          .fno = gen_helper_gvec_sar8v,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_sarv_mod_vec,
          .fno = gen_helper_gvec_sar16v,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sar_mod_i32,
          .fniv = tcg_gen_sarv_mod_vec,
          .fno = gen_helper_gvec_sar32v,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sar_mod_i64,
          .fniv = tcg_gen_sarv_mod_vec,
          .fno = gen_helper_gvec_sar64v,
          .opt_opc = vecop_list,
          /* 64-bit integer ops are preferable on a 64-bit host. */
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
2752 
2753 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
2754 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
2755                            uint32_t oprsz, TCGCond cond)
2756 {
2757     TCGv_i32 t0 = tcg_temp_new_i32();
2758     TCGv_i32 t1 = tcg_temp_new_i32();
2759     uint32_t i;
2760 
2761     for (i = 0; i < oprsz; i += 4) {
2762         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
2763         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
2764         tcg_gen_setcond_i32(cond, t0, t0, t1);
2765         tcg_gen_neg_i32(t0, t0);
2766         tcg_gen_st_i32(t0, cpu_env, dofs + i);
2767     }
2768     tcg_temp_free_i32(t1);
2769     tcg_temp_free_i32(t0);
2770 }
2771 
2772 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
2773                            uint32_t oprsz, TCGCond cond)
2774 {
2775     TCGv_i64 t0 = tcg_temp_new_i64();
2776     TCGv_i64 t1 = tcg_temp_new_i64();
2777     uint32_t i;
2778 
2779     for (i = 0; i < oprsz; i += 8) {
2780         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
2781         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
2782         tcg_gen_setcond_i64(cond, t0, t0, t1);
2783         tcg_gen_neg_i64(t0, t0);
2784         tcg_gen_st_i64(t0, cpu_env, dofs + i);
2785     }
2786     tcg_temp_free_i64(t1);
2787     tcg_temp_free_i64(t0);
2788 }
2789 
2790 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2791                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
2792                            TCGType type, TCGCond cond)
2793 {
2794     TCGv_vec t0 = tcg_temp_new_vec(type);
2795     TCGv_vec t1 = tcg_temp_new_vec(type);
2796     uint32_t i;
2797 
2798     for (i = 0; i < oprsz; i += tysz) {
2799         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2800         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
2801         tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
2802         tcg_gen_st_vec(t0, cpu_env, dofs + i);
2803     }
2804     tcg_temp_free_vec(t1);
2805     tcg_temp_free_vec(t0);
2806 }
2807 
/*
 * Expand an element-wise comparison of vectors A and B: each element
 * of D becomes all-ones (-1) when "A cond B" holds and all-zeros
 * otherwise.  Any tail between oprsz and maxsz is cleared.
 */
void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
                      uint32_t aofs, uint32_t bofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
    static gen_helper_gvec_3 * const eq_fn[4] = {
        gen_helper_gvec_eq8, gen_helper_gvec_eq16,
        gen_helper_gvec_eq32, gen_helper_gvec_eq64
    };
    static gen_helper_gvec_3 * const ne_fn[4] = {
        gen_helper_gvec_ne8, gen_helper_gvec_ne16,
        gen_helper_gvec_ne32, gen_helper_gvec_ne64
    };
    static gen_helper_gvec_3 * const lt_fn[4] = {
        gen_helper_gvec_lt8, gen_helper_gvec_lt16,
        gen_helper_gvec_lt32, gen_helper_gvec_lt64
    };
    static gen_helper_gvec_3 * const le_fn[4] = {
        gen_helper_gvec_le8, gen_helper_gvec_le16,
        gen_helper_gvec_le32, gen_helper_gvec_le64
    };
    static gen_helper_gvec_3 * const ltu_fn[4] = {
        gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
        gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
    };
    static gen_helper_gvec_3 * const leu_fn[4] = {
        gen_helper_gvec_leu8, gen_helper_gvec_leu16,
        gen_helper_gvec_leu32, gen_helper_gvec_leu64
    };
    /* Out-of-line helpers, indexed by condition then element size.
       Conditions without an entry (GT/GE/GTU/GEU) are reached by
       swapping the operands below.  */
    static gen_helper_gvec_3 * const * const fns[16] = {
        [TCG_COND_EQ] = eq_fn,
        [TCG_COND_NE] = ne_fn,
        [TCG_COND_LT] = lt_fn,
        [TCG_COND_LE] = le_fn,
        [TCG_COND_LTU] = ltu_fn,
        [TCG_COND_LEU] = leu_fn,
    };

    const TCGOpcode *hold_list;
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    /* Degenerate conditions: fill D with all-zeros or all-ones.  */
    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
        do_dup(MO_8, dofs, oprsz, maxsz,
               NULL, NULL, -(cond == TCG_COND_ALWAYS));
        return;
    }

    /*
     * Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and 64-bit comparison.
     */
    hold_list = tcg_swap_vecop_list(cmp_list);
    type = choose_vector_type(cmp_list, vece, oprsz,
                              TCG_TARGET_REG_BITS == 64 && vece == MO_64);
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
        if (some == oprsz) {
            break;
        }
        /* Expand the remainder with the next-smaller vector type.  */
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
        break;
    case TCG_TYPE_V64:
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
        break;

    case 0:
        /* No vector support: expand inline with host integers, or
           fall back to an out-of-line helper.  */
        if (vece == MO_64 && check_size_impl(oprsz, 8)) {
            expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
        } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
            expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
        } else {
            gen_helper_gvec_3 * const *fn = fns[cond];

            if (fn == NULL) {
                /* No helper for this condition; swap the operands and
                   use the helper for the swapped condition instead.  */
                uint32_t tmp;
                tmp = aofs, aofs = bofs, bofs = tmp;
                cond = tcg_swap_cond(cond);
                fn = fns[cond];
                assert(fn != NULL);
            }
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
            /* The out-of-line helper accounts for the full maxsz, so
               suppress the expand_clr below.  */
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    /* Zero any tail between oprsz and maxsz.  */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
2919