xref: /openbmc/qemu/tcg/tcg-op-gvec.c (revision b15c0f7d)
1 /*
2  * Generic vector operation expansion
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "qemu-common.h"
22 #include "tcg.h"
23 #include "tcg-op.h"
24 #include "tcg-op-gvec.h"
25 #include "tcg-gvec-desc.h"
26 
/* Largest number of LNSZ-sized units check_size_impl accepts for inline
   expansion.  */
#define MAX_UNROLL  4

/*
 * Placeholder opcode list.  A real one-element array is only built when
 * CONFIG_DEBUG_TCG is defined; otherwise NULL is used in its place.
 */
#ifndef CONFIG_DEBUG_TCG
#define vecop_list_empty NULL
#else
static const TCGOpcode vecop_list_empty[1] = { 0 };
#endif
34 
35 
/*
 * Debug-check the size and alignment rules of a vector operation.
 *
 * OPRSZ must be non-zero and no larger than MAXSZ.  OFS is expected to be
 * the bitwise OR of every operand offset, so that one mask test covers
 * all operands at once.
 */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    /* A size of 16 bytes or more must be a multiple of 16, otherwise of 8.  */
    const uint32_t opr_align = (oprsz < 16) ? 7 : 15;
    /* MAXSZ and the offsets use the stricter mask if either size is >= 16.  */
    const uint32_t max_align = (maxsz < 16 && oprsz < 16) ? 7 : 15;

    tcg_debug_assert(oprsz > 0);
    tcg_debug_assert(oprsz <= maxsz);
    tcg_debug_assert((oprsz & opr_align) == 0);
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}
48 
/*
 * Debug-check the overlap rule for a pair of operands at offsets D and A,
 * each S bytes long.
 */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    /* Accept exact aliasing, or two ranges that do not intersect.  */
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}
54 
/*
 * Debug-check the overlap rule for every pair drawn from the three
 * operands at offsets D, A and B, each S bytes long.
 */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    const uint32_t ofs[3] = { d, a, b };
    unsigned lo, hi;

    /* Visits (d,a), (d,b), (a,b) in that order.  */
    for (lo = 0; lo < 3; lo++) {
        for (hi = lo + 1; hi < 3; hi++) {
            check_overlap_2(ofs[lo], ofs[hi], s);
        }
    }
}
62 
/*
 * Debug-check the overlap rule for every pair drawn from the four
 * operands at offsets D, A, B and C, each S bytes long.
 */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    const uint32_t ofs[4] = { d, a, b, c };
    unsigned lo, hi;

    /* Visits (d,a), (d,b), (d,c), (a,b), (a,c), (b,c) in that order.  */
    for (lo = 0; lo < 4; lo++) {
        for (hi = lo + 1; hi < 4; hi++) {
            check_overlap_2(ofs[lo], ofs[hi], s);
        }
    }
}
74 
75 /* Create a descriptor from components.  */
76 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
77 {
78     uint32_t desc = 0;
79 
80     assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
81     assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
82     assert(data == sextract32(data, 0, SIMD_DATA_BITS));
83 
84     oprsz = (oprsz / 8) - 1;
85     maxsz = (maxsz / 8) - 1;
86     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
87     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
88     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
89 
90     return desc;
91 }
92 
93 /* Generate a call to a gvec-style helper with two vector operands.  */
94 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
95                         uint32_t oprsz, uint32_t maxsz, int32_t data,
96                         gen_helper_gvec_2 *fn)
97 {
98     TCGv_ptr a0, a1;
99     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
100 
101     a0 = tcg_temp_new_ptr();
102     a1 = tcg_temp_new_ptr();
103 
104     tcg_gen_addi_ptr(a0, cpu_env, dofs);
105     tcg_gen_addi_ptr(a1, cpu_env, aofs);
106 
107     fn(a0, a1, desc);
108 
109     tcg_temp_free_ptr(a0);
110     tcg_temp_free_ptr(a1);
111     tcg_temp_free_i32(desc);
112 }
113 
114 /* Generate a call to a gvec-style helper with two vector operands
115    and one scalar operand.  */
116 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
117                          uint32_t oprsz, uint32_t maxsz, int32_t data,
118                          gen_helper_gvec_2i *fn)
119 {
120     TCGv_ptr a0, a1;
121     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
122 
123     a0 = tcg_temp_new_ptr();
124     a1 = tcg_temp_new_ptr();
125 
126     tcg_gen_addi_ptr(a0, cpu_env, dofs);
127     tcg_gen_addi_ptr(a1, cpu_env, aofs);
128 
129     fn(a0, a1, c, desc);
130 
131     tcg_temp_free_ptr(a0);
132     tcg_temp_free_ptr(a1);
133     tcg_temp_free_i32(desc);
134 }
135 
136 /* Generate a call to a gvec-style helper with three vector operands.  */
137 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
138                         uint32_t oprsz, uint32_t maxsz, int32_t data,
139                         gen_helper_gvec_3 *fn)
140 {
141     TCGv_ptr a0, a1, a2;
142     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
143 
144     a0 = tcg_temp_new_ptr();
145     a1 = tcg_temp_new_ptr();
146     a2 = tcg_temp_new_ptr();
147 
148     tcg_gen_addi_ptr(a0, cpu_env, dofs);
149     tcg_gen_addi_ptr(a1, cpu_env, aofs);
150     tcg_gen_addi_ptr(a2, cpu_env, bofs);
151 
152     fn(a0, a1, a2, desc);
153 
154     tcg_temp_free_ptr(a0);
155     tcg_temp_free_ptr(a1);
156     tcg_temp_free_ptr(a2);
157     tcg_temp_free_i32(desc);
158 }
159 
160 /* Generate a call to a gvec-style helper with four vector operands.  */
161 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
162                         uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
163                         int32_t data, gen_helper_gvec_4 *fn)
164 {
165     TCGv_ptr a0, a1, a2, a3;
166     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
167 
168     a0 = tcg_temp_new_ptr();
169     a1 = tcg_temp_new_ptr();
170     a2 = tcg_temp_new_ptr();
171     a3 = tcg_temp_new_ptr();
172 
173     tcg_gen_addi_ptr(a0, cpu_env, dofs);
174     tcg_gen_addi_ptr(a1, cpu_env, aofs);
175     tcg_gen_addi_ptr(a2, cpu_env, bofs);
176     tcg_gen_addi_ptr(a3, cpu_env, cofs);
177 
178     fn(a0, a1, a2, a3, desc);
179 
180     tcg_temp_free_ptr(a0);
181     tcg_temp_free_ptr(a1);
182     tcg_temp_free_ptr(a2);
183     tcg_temp_free_ptr(a3);
184     tcg_temp_free_i32(desc);
185 }
186 
187 /* Generate a call to a gvec-style helper with five vector operands.  */
188 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
189                         uint32_t cofs, uint32_t xofs, uint32_t oprsz,
190                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
191 {
192     TCGv_ptr a0, a1, a2, a3, a4;
193     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
194 
195     a0 = tcg_temp_new_ptr();
196     a1 = tcg_temp_new_ptr();
197     a2 = tcg_temp_new_ptr();
198     a3 = tcg_temp_new_ptr();
199     a4 = tcg_temp_new_ptr();
200 
201     tcg_gen_addi_ptr(a0, cpu_env, dofs);
202     tcg_gen_addi_ptr(a1, cpu_env, aofs);
203     tcg_gen_addi_ptr(a2, cpu_env, bofs);
204     tcg_gen_addi_ptr(a3, cpu_env, cofs);
205     tcg_gen_addi_ptr(a4, cpu_env, xofs);
206 
207     fn(a0, a1, a2, a3, a4, desc);
208 
209     tcg_temp_free_ptr(a0);
210     tcg_temp_free_ptr(a1);
211     tcg_temp_free_ptr(a2);
212     tcg_temp_free_ptr(a3);
213     tcg_temp_free_ptr(a4);
214     tcg_temp_free_i32(desc);
215 }
216 
217 /* Generate a call to a gvec-style helper with three vector operands
218    and an extra pointer operand.  */
219 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
220                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
221                         int32_t data, gen_helper_gvec_2_ptr *fn)
222 {
223     TCGv_ptr a0, a1;
224     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
225 
226     a0 = tcg_temp_new_ptr();
227     a1 = tcg_temp_new_ptr();
228 
229     tcg_gen_addi_ptr(a0, cpu_env, dofs);
230     tcg_gen_addi_ptr(a1, cpu_env, aofs);
231 
232     fn(a0, a1, ptr, desc);
233 
234     tcg_temp_free_ptr(a0);
235     tcg_temp_free_ptr(a1);
236     tcg_temp_free_i32(desc);
237 }
238 
239 /* Generate a call to a gvec-style helper with three vector operands
240    and an extra pointer operand.  */
241 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
242                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
243                         int32_t data, gen_helper_gvec_3_ptr *fn)
244 {
245     TCGv_ptr a0, a1, a2;
246     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
247 
248     a0 = tcg_temp_new_ptr();
249     a1 = tcg_temp_new_ptr();
250     a2 = tcg_temp_new_ptr();
251 
252     tcg_gen_addi_ptr(a0, cpu_env, dofs);
253     tcg_gen_addi_ptr(a1, cpu_env, aofs);
254     tcg_gen_addi_ptr(a2, cpu_env, bofs);
255 
256     fn(a0, a1, a2, ptr, desc);
257 
258     tcg_temp_free_ptr(a0);
259     tcg_temp_free_ptr(a1);
260     tcg_temp_free_ptr(a2);
261     tcg_temp_free_i32(desc);
262 }
263 
264 /* Generate a call to a gvec-style helper with four vector operands
265    and an extra pointer operand.  */
266 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
267                         uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
268                         uint32_t maxsz, int32_t data,
269                         gen_helper_gvec_4_ptr *fn)
270 {
271     TCGv_ptr a0, a1, a2, a3;
272     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
273 
274     a0 = tcg_temp_new_ptr();
275     a1 = tcg_temp_new_ptr();
276     a2 = tcg_temp_new_ptr();
277     a3 = tcg_temp_new_ptr();
278 
279     tcg_gen_addi_ptr(a0, cpu_env, dofs);
280     tcg_gen_addi_ptr(a1, cpu_env, aofs);
281     tcg_gen_addi_ptr(a2, cpu_env, bofs);
282     tcg_gen_addi_ptr(a3, cpu_env, cofs);
283 
284     fn(a0, a1, a2, a3, ptr, desc);
285 
286     tcg_temp_free_ptr(a0);
287     tcg_temp_free_ptr(a1);
288     tcg_temp_free_ptr(a2);
289     tcg_temp_free_ptr(a3);
290     tcg_temp_free_i32(desc);
291 }
292 
293 /* Return true if we want to implement something of OPRSZ bytes
294    in units of LNSZ.  This limits the expansion of inline code.  */
295 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
296 {
297     if (oprsz % lnsz == 0) {
298         uint32_t lnct = oprsz / lnsz;
299         return lnct >= 1 && lnct <= MAX_UNROLL;
300     }
301     return false;
302 }
303 
/* Forward declaration: do_dup_store and do_dup call expand_clr before
   its definition.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz);
305 
306 /* Duplicate C as per VECE.  */
307 uint64_t (dup_const)(unsigned vece, uint64_t c)
308 {
309     switch (vece) {
310     case MO_8:
311         return 0x0101010101010101ull * (uint8_t)c;
312     case MO_16:
313         return 0x0001000100010001ull * (uint16_t)c;
314     case MO_32:
315         return 0x0000000100000001ull * (uint32_t)c;
316     case MO_64:
317         return c;
318     default:
319         g_assert_not_reached();
320     }
321 }
322 
323 /* Duplicate IN into OUT as per VECE.  */
324 static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
325 {
326     switch (vece) {
327     case MO_8:
328         tcg_gen_ext8u_i32(out, in);
329         tcg_gen_muli_i32(out, out, 0x01010101);
330         break;
331     case MO_16:
332         tcg_gen_deposit_i32(out, in, in, 16, 16);
333         break;
334     case MO_32:
335         tcg_gen_mov_i32(out, in);
336         break;
337     default:
338         g_assert_not_reached();
339     }
340 }
341 
342 static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
343 {
344     switch (vece) {
345     case MO_8:
346         tcg_gen_ext8u_i64(out, in);
347         tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
348         break;
349     case MO_16:
350         tcg_gen_ext16u_i64(out, in);
351         tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
352         break;
353     case MO_32:
354         tcg_gen_deposit_i64(out, in, in, 32, 32);
355         break;
356     case MO_64:
357         tcg_gen_mov_i64(out, in);
358         break;
359     default:
360         g_assert_not_reached();
361     }
362 }
363 
364 /* Select a supported vector type for implementing an operation on SIZE
365  * bytes.  If OP is 0, assume that the real operation to be performed is
366  * required by all backends.  Otherwise, make sure than OP can be performed
367  * on elements of size VECE in the selected type.  Do not select V64 if
368  * PREFER_I64 is true.  Return 0 if no vector type is selected.
369  */
370 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
371                                   uint32_t size, bool prefer_i64)
372 {
373     if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
374         /*
375          * Recall that ARM SVE allows vector sizes that are not a
376          * power of 2, but always a multiple of 16.  The intent is
377          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
378          * It is hard to imagine a case in which v256 is supported
379          * but v128 is not, but check anyway.
380          */
381         if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece)
382             && (size % 32 == 0
383                 || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) {
384             return TCG_TYPE_V256;
385         }
386     }
387     if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
388         && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) {
389         return TCG_TYPE_V128;
390     }
391     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
392         && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
393         return TCG_TYPE_V64;
394     }
395     return 0;
396 }
397 
398 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
399                          uint32_t maxsz, TCGv_vec t_vec)
400 {
401     uint32_t i = 0;
402 
403     switch (type) {
404     case TCG_TYPE_V256:
405         /*
406          * Recall that ARM SVE allows vector sizes that are not a
407          * power of 2, but always a multiple of 16.  The intent is
408          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
409          */
410         for (; i + 32 <= oprsz; i += 32) {
411             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
412         }
413         /* fallthru */
414     case TCG_TYPE_V128:
415         for (; i + 16 <= oprsz; i += 16) {
416             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
417         }
418         break;
419     case TCG_TYPE_V64:
420         for (; i < oprsz; i += 8) {
421             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
422         }
423         break;
424     default:
425         g_assert_not_reached();
426     }
427 
428     if (oprsz < maxsz) {
429         expand_clr(dofs + oprsz, maxsz - oprsz);
430     }
431 }
432 
433 /* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
434  * Only one of IN_32 or IN_64 may be set;
435  * IN_C is used if IN_32 and IN_64 are unset.
436  */
437 static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
438                    uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
439                    uint64_t in_c)
440 {
441     TCGType type;
442     TCGv_i64 t_64;
443     TCGv_i32 t_32, t_desc;
444     TCGv_ptr t_ptr;
445     uint32_t i;
446 
447     assert(vece <= (in_32 ? MO_32 : MO_64));
448     assert(in_32 == NULL || in_64 == NULL);
449 
450     /* If we're storing 0, expand oprsz to maxsz.  */
451     if (in_32 == NULL && in_64 == NULL) {
452         in_c = dup_const(vece, in_c);
453         if (in_c == 0) {
454             oprsz = maxsz;
455         }
456     }
457 
458     /* Implement inline with a vector type, if possible.
459      * Prefer integer when 64-bit host and no variable dup.
460      */
461     type = choose_vector_type(NULL, vece, oprsz,
462                               (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
463                                && (in_64 == NULL || vece == MO_64)));
464     if (type != 0) {
465         TCGv_vec t_vec = tcg_temp_new_vec(type);
466 
467         if (in_32) {
468             tcg_gen_dup_i32_vec(vece, t_vec, in_32);
469         } else if (in_64) {
470             tcg_gen_dup_i64_vec(vece, t_vec, in_64);
471         } else {
472             tcg_gen_dupi_vec(vece, t_vec, in_c);
473         }
474         do_dup_store(type, dofs, oprsz, maxsz, t_vec);
475         tcg_temp_free_vec(t_vec);
476         return;
477     }
478 
479     /* Otherwise, inline with an integer type, unless "large".  */
480     if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
481         t_64 = NULL;
482         t_32 = NULL;
483 
484         if (in_32) {
485             /* We are given a 32-bit variable input.  For a 64-bit host,
486                use a 64-bit operation unless the 32-bit operation would
487                be simple enough.  */
488             if (TCG_TARGET_REG_BITS == 64
489                 && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
490                 t_64 = tcg_temp_new_i64();
491                 tcg_gen_extu_i32_i64(t_64, in_32);
492                 gen_dup_i64(vece, t_64, t_64);
493             } else {
494                 t_32 = tcg_temp_new_i32();
495                 gen_dup_i32(vece, t_32, in_32);
496             }
497         } else if (in_64) {
498             /* We are given a 64-bit variable input.  */
499             t_64 = tcg_temp_new_i64();
500             gen_dup_i64(vece, t_64, in_64);
501         } else {
502             /* We are given a constant input.  */
503             /* For 64-bit hosts, use 64-bit constants for "simple" constants
504                or when we'd need too many 32-bit stores, or when a 64-bit
505                constant is really required.  */
506             if (vece == MO_64
507                 || (TCG_TARGET_REG_BITS == 64
508                     && (in_c == 0 || in_c == -1
509                         || !check_size_impl(oprsz, 4)))) {
510                 t_64 = tcg_const_i64(in_c);
511             } else {
512                 t_32 = tcg_const_i32(in_c);
513             }
514         }
515 
516         /* Implement inline if we picked an implementation size above.  */
517         if (t_32) {
518             for (i = 0; i < oprsz; i += 4) {
519                 tcg_gen_st_i32(t_32, cpu_env, dofs + i);
520             }
521             tcg_temp_free_i32(t_32);
522             goto done;
523         }
524         if (t_64) {
525             for (i = 0; i < oprsz; i += 8) {
526                 tcg_gen_st_i64(t_64, cpu_env, dofs + i);
527             }
528             tcg_temp_free_i64(t_64);
529             goto done;
530         }
531     }
532 
533     /* Otherwise implement out of line.  */
534     t_ptr = tcg_temp_new_ptr();
535     tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
536     t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));
537 
538     if (vece == MO_64) {
539         if (in_64) {
540             gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
541         } else {
542             t_64 = tcg_const_i64(in_c);
543             gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
544             tcg_temp_free_i64(t_64);
545         }
546     } else {
547         typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
548         static dup_fn * const fns[3] = {
549             gen_helper_gvec_dup8,
550             gen_helper_gvec_dup16,
551             gen_helper_gvec_dup32
552         };
553 
554         if (in_32) {
555             fns[vece](t_ptr, t_desc, in_32);
556         } else {
557             t_32 = tcg_temp_new_i32();
558             if (in_64) {
559                 tcg_gen_extrl_i64_i32(t_32, in_64);
560             } else if (vece == MO_8) {
561                 tcg_gen_movi_i32(t_32, in_c & 0xff);
562             } else if (vece == MO_16) {
563                 tcg_gen_movi_i32(t_32, in_c & 0xffff);
564             } else {
565                 tcg_gen_movi_i32(t_32, in_c);
566             }
567             fns[vece](t_ptr, t_desc, t_32);
568             tcg_temp_free_i32(t_32);
569         }
570     }
571 
572     tcg_temp_free_ptr(t_ptr);
573     tcg_temp_free_i32(t_desc);
574     return;
575 
576  done:
577     if (oprsz < maxsz) {
578         expand_clr(dofs + oprsz, maxsz - oprsz);
579     }
580 }
581 
582 /* Likewise, but with zero.  */
583 static void expand_clr(uint32_t dofs, uint32_t maxsz)
584 {
585     do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
586 }
587 
588 /* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
589 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
590                          void (*fni)(TCGv_i32, TCGv_i32))
591 {
592     TCGv_i32 t0 = tcg_temp_new_i32();
593     uint32_t i;
594 
595     for (i = 0; i < oprsz; i += 4) {
596         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
597         fni(t0, t0);
598         tcg_gen_st_i32(t0, cpu_env, dofs + i);
599     }
600     tcg_temp_free_i32(t0);
601 }
602 
603 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
604                           int32_t c, bool load_dest,
605                           void (*fni)(TCGv_i32, TCGv_i32, int32_t))
606 {
607     TCGv_i32 t0 = tcg_temp_new_i32();
608     TCGv_i32 t1 = tcg_temp_new_i32();
609     uint32_t i;
610 
611     for (i = 0; i < oprsz; i += 4) {
612         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
613         if (load_dest) {
614             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
615         }
616         fni(t1, t0, c);
617         tcg_gen_st_i32(t1, cpu_env, dofs + i);
618     }
619     tcg_temp_free_i32(t0);
620     tcg_temp_free_i32(t1);
621 }
622 
623 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
624                           TCGv_i32 c, bool scalar_first,
625                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
626 {
627     TCGv_i32 t0 = tcg_temp_new_i32();
628     TCGv_i32 t1 = tcg_temp_new_i32();
629     uint32_t i;
630 
631     for (i = 0; i < oprsz; i += 4) {
632         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
633         if (scalar_first) {
634             fni(t1, c, t0);
635         } else {
636             fni(t1, t0, c);
637         }
638         tcg_gen_st_i32(t1, cpu_env, dofs + i);
639     }
640     tcg_temp_free_i32(t0);
641     tcg_temp_free_i32(t1);
642 }
643 
644 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
645 static void expand_3_i32(uint32_t dofs, uint32_t aofs,
646                          uint32_t bofs, uint32_t oprsz, bool load_dest,
647                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
648 {
649     TCGv_i32 t0 = tcg_temp_new_i32();
650     TCGv_i32 t1 = tcg_temp_new_i32();
651     TCGv_i32 t2 = tcg_temp_new_i32();
652     uint32_t i;
653 
654     for (i = 0; i < oprsz; i += 4) {
655         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
656         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
657         if (load_dest) {
658             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
659         }
660         fni(t2, t0, t1);
661         tcg_gen_st_i32(t2, cpu_env, dofs + i);
662     }
663     tcg_temp_free_i32(t2);
664     tcg_temp_free_i32(t1);
665     tcg_temp_free_i32(t0);
666 }
667 
668 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
669                           uint32_t oprsz, int32_t c, bool load_dest,
670                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
671 {
672     TCGv_i32 t0 = tcg_temp_new_i32();
673     TCGv_i32 t1 = tcg_temp_new_i32();
674     TCGv_i32 t2 = tcg_temp_new_i32();
675     uint32_t i;
676 
677     for (i = 0; i < oprsz; i += 4) {
678         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
679         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
680         if (load_dest) {
681             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
682         }
683         fni(t2, t0, t1, c);
684         tcg_gen_st_i32(t2, cpu_env, dofs + i);
685     }
686     tcg_temp_free_i32(t0);
687     tcg_temp_free_i32(t1);
688     tcg_temp_free_i32(t2);
689 }
690 
691 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
692 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
693                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
694                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
695 {
696     TCGv_i32 t0 = tcg_temp_new_i32();
697     TCGv_i32 t1 = tcg_temp_new_i32();
698     TCGv_i32 t2 = tcg_temp_new_i32();
699     TCGv_i32 t3 = tcg_temp_new_i32();
700     uint32_t i;
701 
702     for (i = 0; i < oprsz; i += 4) {
703         tcg_gen_ld_i32(t1, cpu_env, aofs + i);
704         tcg_gen_ld_i32(t2, cpu_env, bofs + i);
705         tcg_gen_ld_i32(t3, cpu_env, cofs + i);
706         fni(t0, t1, t2, t3);
707         tcg_gen_st_i32(t0, cpu_env, dofs + i);
708         if (write_aofs) {
709             tcg_gen_st_i32(t1, cpu_env, aofs + i);
710         }
711     }
712     tcg_temp_free_i32(t3);
713     tcg_temp_free_i32(t2);
714     tcg_temp_free_i32(t1);
715     tcg_temp_free_i32(t0);
716 }
717 
718 /* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
719 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
720                          void (*fni)(TCGv_i64, TCGv_i64))
721 {
722     TCGv_i64 t0 = tcg_temp_new_i64();
723     uint32_t i;
724 
725     for (i = 0; i < oprsz; i += 8) {
726         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
727         fni(t0, t0);
728         tcg_gen_st_i64(t0, cpu_env, dofs + i);
729     }
730     tcg_temp_free_i64(t0);
731 }
732 
733 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
734                           int64_t c, bool load_dest,
735                           void (*fni)(TCGv_i64, TCGv_i64, int64_t))
736 {
737     TCGv_i64 t0 = tcg_temp_new_i64();
738     TCGv_i64 t1 = tcg_temp_new_i64();
739     uint32_t i;
740 
741     for (i = 0; i < oprsz; i += 8) {
742         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
743         if (load_dest) {
744             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
745         }
746         fni(t1, t0, c);
747         tcg_gen_st_i64(t1, cpu_env, dofs + i);
748     }
749     tcg_temp_free_i64(t0);
750     tcg_temp_free_i64(t1);
751 }
752 
753 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
754                           TCGv_i64 c, bool scalar_first,
755                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
756 {
757     TCGv_i64 t0 = tcg_temp_new_i64();
758     TCGv_i64 t1 = tcg_temp_new_i64();
759     uint32_t i;
760 
761     for (i = 0; i < oprsz; i += 8) {
762         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
763         if (scalar_first) {
764             fni(t1, c, t0);
765         } else {
766             fni(t1, t0, c);
767         }
768         tcg_gen_st_i64(t1, cpu_env, dofs + i);
769     }
770     tcg_temp_free_i64(t0);
771     tcg_temp_free_i64(t1);
772 }
773 
774 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
775 static void expand_3_i64(uint32_t dofs, uint32_t aofs,
776                          uint32_t bofs, uint32_t oprsz, bool load_dest,
777                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
778 {
779     TCGv_i64 t0 = tcg_temp_new_i64();
780     TCGv_i64 t1 = tcg_temp_new_i64();
781     TCGv_i64 t2 = tcg_temp_new_i64();
782     uint32_t i;
783 
784     for (i = 0; i < oprsz; i += 8) {
785         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
786         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
787         if (load_dest) {
788             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
789         }
790         fni(t2, t0, t1);
791         tcg_gen_st_i64(t2, cpu_env, dofs + i);
792     }
793     tcg_temp_free_i64(t2);
794     tcg_temp_free_i64(t1);
795     tcg_temp_free_i64(t0);
796 }
797 
798 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
799                           uint32_t oprsz, int64_t c, bool load_dest,
800                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
801 {
802     TCGv_i64 t0 = tcg_temp_new_i64();
803     TCGv_i64 t1 = tcg_temp_new_i64();
804     TCGv_i64 t2 = tcg_temp_new_i64();
805     uint32_t i;
806 
807     for (i = 0; i < oprsz; i += 8) {
808         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
809         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
810         if (load_dest) {
811             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
812         }
813         fni(t2, t0, t1, c);
814         tcg_gen_st_i64(t2, cpu_env, dofs + i);
815     }
816     tcg_temp_free_i64(t0);
817     tcg_temp_free_i64(t1);
818     tcg_temp_free_i64(t2);
819 }
820 
821 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
822 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
823                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
824                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
825 {
826     TCGv_i64 t0 = tcg_temp_new_i64();
827     TCGv_i64 t1 = tcg_temp_new_i64();
828     TCGv_i64 t2 = tcg_temp_new_i64();
829     TCGv_i64 t3 = tcg_temp_new_i64();
830     uint32_t i;
831 
832     for (i = 0; i < oprsz; i += 8) {
833         tcg_gen_ld_i64(t1, cpu_env, aofs + i);
834         tcg_gen_ld_i64(t2, cpu_env, bofs + i);
835         tcg_gen_ld_i64(t3, cpu_env, cofs + i);
836         fni(t0, t1, t2, t3);
837         tcg_gen_st_i64(t0, cpu_env, dofs + i);
838         if (write_aofs) {
839             tcg_gen_st_i64(t1, cpu_env, aofs + i);
840         }
841     }
842     tcg_temp_free_i64(t3);
843     tcg_temp_free_i64(t2);
844     tcg_temp_free_i64(t1);
845     tcg_temp_free_i64(t0);
846 }
847 
848 /* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
849 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
850                          uint32_t oprsz, uint32_t tysz, TCGType type,
851                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
852 {
853     TCGv_vec t0 = tcg_temp_new_vec(type);
854     uint32_t i;
855 
856     for (i = 0; i < oprsz; i += tysz) {
857         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
858         fni(vece, t0, t0);
859         tcg_gen_st_vec(t0, cpu_env, dofs + i);
860     }
861     tcg_temp_free_vec(t0);
862 }
863 
864 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
865    using host vectors.  */
866 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
867                           uint32_t oprsz, uint32_t tysz, TCGType type,
868                           int64_t c, bool load_dest,
869                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
870 {
871     TCGv_vec t0 = tcg_temp_new_vec(type);
872     TCGv_vec t1 = tcg_temp_new_vec(type);
873     uint32_t i;
874 
875     for (i = 0; i < oprsz; i += tysz) {
876         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
877         if (load_dest) {
878             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
879         }
880         fni(vece, t1, t0, c);
881         tcg_gen_st_vec(t1, cpu_env, dofs + i);
882     }
883     tcg_temp_free_vec(t0);
884     tcg_temp_free_vec(t1);
885 }
886 
887 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
888                           uint32_t oprsz, uint32_t tysz, TCGType type,
889                           TCGv_vec c, bool scalar_first,
890                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
891 {
892     TCGv_vec t0 = tcg_temp_new_vec(type);
893     TCGv_vec t1 = tcg_temp_new_vec(type);
894     uint32_t i;
895 
896     for (i = 0; i < oprsz; i += tysz) {
897         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
898         if (scalar_first) {
899             fni(vece, t1, c, t0);
900         } else {
901             fni(vece, t1, t0, c);
902         }
903         tcg_gen_st_vec(t1, cpu_env, dofs + i);
904     }
905     tcg_temp_free_vec(t0);
906     tcg_temp_free_vec(t1);
907 }
908 
909 /* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
910 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
911                          uint32_t bofs, uint32_t oprsz,
912                          uint32_t tysz, TCGType type, bool load_dest,
913                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
914 {
915     TCGv_vec t0 = tcg_temp_new_vec(type);
916     TCGv_vec t1 = tcg_temp_new_vec(type);
917     TCGv_vec t2 = tcg_temp_new_vec(type);
918     uint32_t i;
919 
920     for (i = 0; i < oprsz; i += tysz) {
921         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
922         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
923         if (load_dest) {
924             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
925         }
926         fni(vece, t2, t0, t1);
927         tcg_gen_st_vec(t2, cpu_env, dofs + i);
928     }
929     tcg_temp_free_vec(t2);
930     tcg_temp_free_vec(t1);
931     tcg_temp_free_vec(t0);
932 }
933 
934 /*
935  * Expand OPSZ bytes worth of three-vector operands and an immediate operand
936  * using host vectors.
937  */
938 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
939                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
940                           TCGType type, int64_t c, bool load_dest,
941                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
942                                       int64_t))
943 {
944     TCGv_vec t0 = tcg_temp_new_vec(type);
945     TCGv_vec t1 = tcg_temp_new_vec(type);
946     TCGv_vec t2 = tcg_temp_new_vec(type);
947     uint32_t i;
948 
949     for (i = 0; i < oprsz; i += tysz) {
950         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
951         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
952         if (load_dest) {
953             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
954         }
955         fni(vece, t2, t0, t1, c);
956         tcg_gen_st_vec(t2, cpu_env, dofs + i);
957     }
958     tcg_temp_free_vec(t0);
959     tcg_temp_free_vec(t1);
960     tcg_temp_free_vec(t2);
961 }
962 
963 /* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
964 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
965                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
966                          uint32_t tysz, TCGType type, bool write_aofs,
967                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
968                                      TCGv_vec, TCGv_vec))
969 {
970     TCGv_vec t0 = tcg_temp_new_vec(type);
971     TCGv_vec t1 = tcg_temp_new_vec(type);
972     TCGv_vec t2 = tcg_temp_new_vec(type);
973     TCGv_vec t3 = tcg_temp_new_vec(type);
974     uint32_t i;
975 
976     for (i = 0; i < oprsz; i += tysz) {
977         tcg_gen_ld_vec(t1, cpu_env, aofs + i);
978         tcg_gen_ld_vec(t2, cpu_env, bofs + i);
979         tcg_gen_ld_vec(t3, cpu_env, cofs + i);
980         fni(vece, t0, t1, t2, t3);
981         tcg_gen_st_vec(t0, cpu_env, dofs + i);
982         if (write_aofs) {
983             tcg_gen_st_vec(t1, cpu_env, aofs + i);
984         }
985     }
986     tcg_temp_free_vec(t3);
987     tcg_temp_free_vec(t2);
988     tcg_temp_free_vec(t1);
989     tcg_temp_free_vec(t0);
990 }
991 
992 /* Expand a vector two-operand operation.  */
993 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
994                     uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
995 {
996     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
997     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
998     TCGType type;
999     uint32_t some;
1000 
1001     check_size_align(oprsz, maxsz, dofs | aofs);
1002     check_overlap_2(dofs, aofs, maxsz);
1003 
1004     type = 0;
1005     if (g->fniv) {
1006         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1007     }
1008     switch (type) {
1009     case TCG_TYPE_V256:
1010         /* Recall that ARM SVE allows vector sizes that are not a
1011          * power of 2, but always a multiple of 16.  The intent is
1012          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1013          */
1014         some = QEMU_ALIGN_DOWN(oprsz, 32);
1015         expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
1016         if (some == oprsz) {
1017             break;
1018         }
1019         dofs += some;
1020         aofs += some;
1021         oprsz -= some;
1022         maxsz -= some;
1023         /* fallthru */
1024     case TCG_TYPE_V128:
1025         expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
1026         break;
1027     case TCG_TYPE_V64:
1028         expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
1029         break;
1030 
1031     case 0:
1032         if (g->fni8 && check_size_impl(oprsz, 8)) {
1033             expand_2_i64(dofs, aofs, oprsz, g->fni8);
1034         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1035             expand_2_i32(dofs, aofs, oprsz, g->fni4);
1036         } else {
1037             assert(g->fno != NULL);
1038             tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
1039             oprsz = maxsz;
1040         }
1041         break;
1042 
1043     default:
1044         g_assert_not_reached();
1045     }
1046     tcg_swap_vecop_list(hold_list);
1047 
1048     if (oprsz < maxsz) {
1049         expand_clr(dofs + oprsz, maxsz - oprsz);
1050     }
1051 }
1052 
1053 /* Expand a vector operation with two vectors and an immediate.  */
1054 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1055                      uint32_t maxsz, int64_t c, const GVecGen2i *g)
1056 {
1057     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1058     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1059     TCGType type;
1060     uint32_t some;
1061 
1062     check_size_align(oprsz, maxsz, dofs | aofs);
1063     check_overlap_2(dofs, aofs, maxsz);
1064 
1065     type = 0;
1066     if (g->fniv) {
1067         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1068     }
1069     switch (type) {
1070     case TCG_TYPE_V256:
1071         /* Recall that ARM SVE allows vector sizes that are not a
1072          * power of 2, but always a multiple of 16.  The intent is
1073          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1074          */
1075         some = QEMU_ALIGN_DOWN(oprsz, 32);
1076         expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1077                       c, g->load_dest, g->fniv);
1078         if (some == oprsz) {
1079             break;
1080         }
1081         dofs += some;
1082         aofs += some;
1083         oprsz -= some;
1084         maxsz -= some;
1085         /* fallthru */
1086     case TCG_TYPE_V128:
1087         expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1088                       c, g->load_dest, g->fniv);
1089         break;
1090     case TCG_TYPE_V64:
1091         expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1092                       c, g->load_dest, g->fniv);
1093         break;
1094 
1095     case 0:
1096         if (g->fni8 && check_size_impl(oprsz, 8)) {
1097             expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1098         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1099             expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1100         } else {
1101             if (g->fno) {
1102                 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1103             } else {
1104                 TCGv_i64 tcg_c = tcg_const_i64(c);
1105                 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1106                                     maxsz, c, g->fnoi);
1107                 tcg_temp_free_i64(tcg_c);
1108             }
1109             oprsz = maxsz;
1110         }
1111         break;
1112 
1113     default:
1114         g_assert_not_reached();
1115     }
1116     tcg_swap_vecop_list(hold_list);
1117 
1118     if (oprsz < maxsz) {
1119         expand_clr(dofs + oprsz, maxsz - oprsz);
1120     }
1121 }
1122 
1123 /* Expand a vector operation with two vectors and a scalar.  */
1124 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1125                      uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1126 {
1127     TCGType type;
1128 
1129     check_size_align(oprsz, maxsz, dofs | aofs);
1130     check_overlap_2(dofs, aofs, maxsz);
1131 
1132     type = 0;
1133     if (g->fniv) {
1134         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1135     }
1136     if (type != 0) {
1137         const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1138         const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1139         TCGv_vec t_vec = tcg_temp_new_vec(type);
1140         uint32_t some;
1141 
1142         tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1143 
1144         switch (type) {
1145         case TCG_TYPE_V256:
1146             /* Recall that ARM SVE allows vector sizes that are not a
1147              * power of 2, but always a multiple of 16.  The intent is
1148              * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1149              */
1150             some = QEMU_ALIGN_DOWN(oprsz, 32);
1151             expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1152                           t_vec, g->scalar_first, g->fniv);
1153             if (some == oprsz) {
1154                 break;
1155             }
1156             dofs += some;
1157             aofs += some;
1158             oprsz -= some;
1159             maxsz -= some;
1160             /* fallthru */
1161 
1162         case TCG_TYPE_V128:
1163             expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1164                           t_vec, g->scalar_first, g->fniv);
1165             break;
1166 
1167         case TCG_TYPE_V64:
1168             expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1169                           t_vec, g->scalar_first, g->fniv);
1170             break;
1171 
1172         default:
1173             g_assert_not_reached();
1174         }
1175         tcg_temp_free_vec(t_vec);
1176         tcg_swap_vecop_list(hold_list);
1177     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1178         TCGv_i64 t64 = tcg_temp_new_i64();
1179 
1180         gen_dup_i64(g->vece, t64, c);
1181         expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1182         tcg_temp_free_i64(t64);
1183     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1184         TCGv_i32 t32 = tcg_temp_new_i32();
1185 
1186         tcg_gen_extrl_i64_i32(t32, c);
1187         gen_dup_i32(g->vece, t32, t32);
1188         expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1189         tcg_temp_free_i32(t32);
1190     } else {
1191         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1192         return;
1193     }
1194 
1195     if (oprsz < maxsz) {
1196         expand_clr(dofs + oprsz, maxsz - oprsz);
1197     }
1198 }
1199 
1200 /* Expand a vector three-operand operation.  */
1201 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1202                     uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1203 {
1204     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1205     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1206     TCGType type;
1207     uint32_t some;
1208 
1209     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1210     check_overlap_3(dofs, aofs, bofs, maxsz);
1211 
1212     type = 0;
1213     if (g->fniv) {
1214         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1215     }
1216     switch (type) {
1217     case TCG_TYPE_V256:
1218         /* Recall that ARM SVE allows vector sizes that are not a
1219          * power of 2, but always a multiple of 16.  The intent is
1220          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1221          */
1222         some = QEMU_ALIGN_DOWN(oprsz, 32);
1223         expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1224                      g->load_dest, g->fniv);
1225         if (some == oprsz) {
1226             break;
1227         }
1228         dofs += some;
1229         aofs += some;
1230         bofs += some;
1231         oprsz -= some;
1232         maxsz -= some;
1233         /* fallthru */
1234     case TCG_TYPE_V128:
1235         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1236                      g->load_dest, g->fniv);
1237         break;
1238     case TCG_TYPE_V64:
1239         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1240                      g->load_dest, g->fniv);
1241         break;
1242 
1243     case 0:
1244         if (g->fni8 && check_size_impl(oprsz, 8)) {
1245             expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1246         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1247             expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1248         } else {
1249             assert(g->fno != NULL);
1250             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
1251                                maxsz, g->data, g->fno);
1252             oprsz = maxsz;
1253         }
1254         break;
1255 
1256     default:
1257         g_assert_not_reached();
1258     }
1259     tcg_swap_vecop_list(hold_list);
1260 
1261     if (oprsz < maxsz) {
1262         expand_clr(dofs + oprsz, maxsz - oprsz);
1263     }
1264 }
1265 
1266 /* Expand a vector operation with three vectors and an immediate.  */
1267 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1268                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1269                      const GVecGen3i *g)
1270 {
1271     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1272     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1273     TCGType type;
1274     uint32_t some;
1275 
1276     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1277     check_overlap_3(dofs, aofs, bofs, maxsz);
1278 
1279     type = 0;
1280     if (g->fniv) {
1281         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1282     }
1283     switch (type) {
1284     case TCG_TYPE_V256:
1285         /*
1286          * Recall that ARM SVE allows vector sizes that are not a
1287          * power of 2, but always a multiple of 16.  The intent is
1288          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1289          */
1290         some = QEMU_ALIGN_DOWN(oprsz, 32);
1291         expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1292                       c, g->load_dest, g->fniv);
1293         if (some == oprsz) {
1294             break;
1295         }
1296         dofs += some;
1297         aofs += some;
1298         bofs += some;
1299         oprsz -= some;
1300         maxsz -= some;
1301         /* fallthru */
1302     case TCG_TYPE_V128:
1303         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1304                       c, g->load_dest, g->fniv);
1305         break;
1306     case TCG_TYPE_V64:
1307         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1308                       c, g->load_dest, g->fniv);
1309         break;
1310 
1311     case 0:
1312         if (g->fni8 && check_size_impl(oprsz, 8)) {
1313             expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
1314         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1315             expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
1316         } else {
1317             assert(g->fno != NULL);
1318             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
1319             oprsz = maxsz;
1320         }
1321         break;
1322 
1323     default:
1324         g_assert_not_reached();
1325     }
1326     tcg_swap_vecop_list(hold_list);
1327 
1328     if (oprsz < maxsz) {
1329         expand_clr(dofs + oprsz, maxsz - oprsz);
1330     }
1331 }
1332 
1333 /* Expand a vector four-operand operation.  */
1334 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1335                     uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1336 {
1337     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1338     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1339     TCGType type;
1340     uint32_t some;
1341 
1342     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1343     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1344 
1345     type = 0;
1346     if (g->fniv) {
1347         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1348     }
1349     switch (type) {
1350     case TCG_TYPE_V256:
1351         /* Recall that ARM SVE allows vector sizes that are not a
1352          * power of 2, but always a multiple of 16.  The intent is
1353          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1354          */
1355         some = QEMU_ALIGN_DOWN(oprsz, 32);
1356         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1357                      32, TCG_TYPE_V256, g->write_aofs, g->fniv);
1358         if (some == oprsz) {
1359             break;
1360         }
1361         dofs += some;
1362         aofs += some;
1363         bofs += some;
1364         cofs += some;
1365         oprsz -= some;
1366         maxsz -= some;
1367         /* fallthru */
1368     case TCG_TYPE_V128:
1369         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1370                      16, TCG_TYPE_V128, g->write_aofs, g->fniv);
1371         break;
1372     case TCG_TYPE_V64:
1373         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1374                      8, TCG_TYPE_V64, g->write_aofs, g->fniv);
1375         break;
1376 
1377     case 0:
1378         if (g->fni8 && check_size_impl(oprsz, 8)) {
1379             expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
1380                          g->write_aofs, g->fni8);
1381         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1382             expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
1383                          g->write_aofs, g->fni4);
1384         } else {
1385             assert(g->fno != NULL);
1386             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1387                                oprsz, maxsz, g->data, g->fno);
1388             oprsz = maxsz;
1389         }
1390         break;
1391 
1392     default:
1393         g_assert_not_reached();
1394     }
1395     tcg_swap_vecop_list(hold_list);
1396 
1397     if (oprsz < maxsz) {
1398         expand_clr(dofs + oprsz, maxsz - oprsz);
1399     }
1400 }
1401 
1402 /*
1403  * Expand specific vector operations.
1404  */
1405 
1406 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
1407 {
1408     tcg_gen_mov_vec(a, b);
1409 }
1410 
1411 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1412                       uint32_t oprsz, uint32_t maxsz)
1413 {
1414     static const GVecGen2 g = {
1415         .fni8 = tcg_gen_mov_i64,
1416         .fniv = vec_mov2,
1417         .fno = gen_helper_gvec_mov,
1418         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1419     };
1420     if (dofs != aofs) {
1421         tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1422     } else {
1423         check_size_align(oprsz, maxsz, dofs);
1424         if (oprsz < maxsz) {
1425             expand_clr(dofs + oprsz, maxsz - oprsz);
1426         }
1427     }
1428 }
1429 
1430 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
1431                           uint32_t maxsz, TCGv_i32 in)
1432 {
1433     check_size_align(oprsz, maxsz, dofs);
1434     tcg_debug_assert(vece <= MO_32);
1435     do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1436 }
1437 
1438 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
1439                           uint32_t maxsz, TCGv_i64 in)
1440 {
1441     check_size_align(oprsz, maxsz, dofs);
1442     tcg_debug_assert(vece <= MO_64);
1443     do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1444 }
1445 
1446 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1447                           uint32_t oprsz, uint32_t maxsz)
1448 {
1449     if (vece <= MO_64) {
1450         TCGType type = choose_vector_type(0, vece, oprsz, 0);
1451         if (type != 0) {
1452             TCGv_vec t_vec = tcg_temp_new_vec(type);
1453             tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
1454             do_dup_store(type, dofs, oprsz, maxsz, t_vec);
1455             tcg_temp_free_vec(t_vec);
1456             return;
1457         }
1458     }
1459     if (vece <= MO_32) {
1460         TCGv_i32 in = tcg_temp_new_i32();
1461         switch (vece) {
1462         case MO_8:
1463             tcg_gen_ld8u_i32(in, cpu_env, aofs);
1464             break;
1465         case MO_16:
1466             tcg_gen_ld16u_i32(in, cpu_env, aofs);
1467             break;
1468         case MO_32:
1469             tcg_gen_ld_i32(in, cpu_env, aofs);
1470             break;
1471         }
1472         tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in);
1473         tcg_temp_free_i32(in);
1474     } else if (vece == MO_64) {
1475         TCGv_i64 in = tcg_temp_new_i64();
1476         tcg_gen_ld_i64(in, cpu_env, aofs);
1477         tcg_gen_gvec_dup_i64(MO_64, dofs, oprsz, maxsz, in);
1478         tcg_temp_free_i64(in);
1479     } else {
1480         /* 128-bit duplicate.  */
1481         /* ??? Dup to 256-bit vector.  */
1482         int i;
1483 
1484         tcg_debug_assert(vece == 4);
1485         tcg_debug_assert(oprsz >= 16);
1486         if (TCG_TARGET_HAS_v128) {
1487             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1488 
1489             tcg_gen_ld_vec(in, cpu_env, aofs);
1490             for (i = 0; i < oprsz; i += 16) {
1491                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1492             }
1493             tcg_temp_free_vec(in);
1494         } else {
1495             TCGv_i64 in0 = tcg_temp_new_i64();
1496             TCGv_i64 in1 = tcg_temp_new_i64();
1497 
1498             tcg_gen_ld_i64(in0, cpu_env, aofs);
1499             tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
1500             for (i = 0; i < oprsz; i += 16) {
1501                 tcg_gen_st_i64(in0, cpu_env, dofs + i);
1502                 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
1503             }
1504             tcg_temp_free_i64(in0);
1505             tcg_temp_free_i64(in1);
1506         }
1507     }
1508 }
1509 
1510 void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
1511                          uint32_t maxsz, uint64_t x)
1512 {
1513     check_size_align(oprsz, maxsz, dofs);
1514     do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
1515 }
1516 
1517 void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
1518                          uint32_t maxsz, uint32_t x)
1519 {
1520     check_size_align(oprsz, maxsz, dofs);
1521     do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
1522 }
1523 
1524 void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
1525                          uint32_t maxsz, uint16_t x)
1526 {
1527     check_size_align(oprsz, maxsz, dofs);
1528     do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
1529 }
1530 
1531 void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
1532                          uint32_t maxsz, uint8_t x)
1533 {
1534     check_size_align(oprsz, maxsz, dofs);
1535     do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
1536 }
1537 
1538 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1539                       uint32_t oprsz, uint32_t maxsz)
1540 {
1541     static const GVecGen2 g = {
1542         .fni8 = tcg_gen_not_i64,
1543         .fniv = tcg_gen_not_vec,
1544         .fno = gen_helper_gvec_not,
1545         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1546     };
1547     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1548 }
1549 
1550 /* Perform a vector addition using normal addition and a mask.  The mask
1551    should be the sign bit of each lane.  This 6-operation form is more
1552    efficient than separate additions when there are 4 or more lanes in
1553    the 64-bit operation.  */
1554 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1555 {
1556     TCGv_i64 t1 = tcg_temp_new_i64();
1557     TCGv_i64 t2 = tcg_temp_new_i64();
1558     TCGv_i64 t3 = tcg_temp_new_i64();
1559 
1560     tcg_gen_andc_i64(t1, a, m);
1561     tcg_gen_andc_i64(t2, b, m);
1562     tcg_gen_xor_i64(t3, a, b);
1563     tcg_gen_add_i64(d, t1, t2);
1564     tcg_gen_and_i64(t3, t3, m);
1565     tcg_gen_xor_i64(d, d, t3);
1566 
1567     tcg_temp_free_i64(t1);
1568     tcg_temp_free_i64(t2);
1569     tcg_temp_free_i64(t3);
1570 }
1571 
1572 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1573 {
1574     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1575     gen_addv_mask(d, a, b, m);
1576     tcg_temp_free_i64(m);
1577 }
1578 
1579 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1580 {
1581     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1582     gen_addv_mask(d, a, b, m);
1583     tcg_temp_free_i64(m);
1584 }
1585 
1586 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1587 {
1588     TCGv_i64 t1 = tcg_temp_new_i64();
1589     TCGv_i64 t2 = tcg_temp_new_i64();
1590 
1591     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1592     tcg_gen_add_i64(t2, a, b);
1593     tcg_gen_add_i64(t1, t1, b);
1594     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1595 
1596     tcg_temp_free_i64(t1);
1597     tcg_temp_free_i64(t2);
1598 }
1599 
1600 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1601 
1602 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1603                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1604 {
1605     static const GVecGen3 g[4] = {
1606         { .fni8 = tcg_gen_vec_add8_i64,
1607           .fniv = tcg_gen_add_vec,
1608           .fno = gen_helper_gvec_add8,
1609           .opt_opc = vecop_list_add,
1610           .vece = MO_8 },
1611         { .fni8 = tcg_gen_vec_add16_i64,
1612           .fniv = tcg_gen_add_vec,
1613           .fno = gen_helper_gvec_add16,
1614           .opt_opc = vecop_list_add,
1615           .vece = MO_16 },
1616         { .fni4 = tcg_gen_add_i32,
1617           .fniv = tcg_gen_add_vec,
1618           .fno = gen_helper_gvec_add32,
1619           .opt_opc = vecop_list_add,
1620           .vece = MO_32 },
1621         { .fni8 = tcg_gen_add_i64,
1622           .fniv = tcg_gen_add_vec,
1623           .fno = gen_helper_gvec_add64,
1624           .opt_opc = vecop_list_add,
1625           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1626           .vece = MO_64 },
1627     };
1628 
1629     tcg_debug_assert(vece <= MO_64);
1630     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1631 }
1632 
1633 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1634                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1635 {
1636     static const GVecGen2s g[4] = {
1637         { .fni8 = tcg_gen_vec_add8_i64,
1638           .fniv = tcg_gen_add_vec,
1639           .fno = gen_helper_gvec_adds8,
1640           .opt_opc = vecop_list_add,
1641           .vece = MO_8 },
1642         { .fni8 = tcg_gen_vec_add16_i64,
1643           .fniv = tcg_gen_add_vec,
1644           .fno = gen_helper_gvec_adds16,
1645           .opt_opc = vecop_list_add,
1646           .vece = MO_16 },
1647         { .fni4 = tcg_gen_add_i32,
1648           .fniv = tcg_gen_add_vec,
1649           .fno = gen_helper_gvec_adds32,
1650           .opt_opc = vecop_list_add,
1651           .vece = MO_32 },
1652         { .fni8 = tcg_gen_add_i64,
1653           .fniv = tcg_gen_add_vec,
1654           .fno = gen_helper_gvec_adds64,
1655           .opt_opc = vecop_list_add,
1656           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1657           .vece = MO_64 },
1658     };
1659 
1660     tcg_debug_assert(vece <= MO_64);
1661     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1662 }
1663 
1664 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
1665                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1666 {
1667     TCGv_i64 tmp = tcg_const_i64(c);
1668     tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
1669     tcg_temp_free_i64(tmp);
1670 }
1671 
1672 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
1673 
1674 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1675                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1676 {
1677     static const GVecGen2s g[4] = {
1678         { .fni8 = tcg_gen_vec_sub8_i64,
1679           .fniv = tcg_gen_sub_vec,
1680           .fno = gen_helper_gvec_subs8,
1681           .opt_opc = vecop_list_sub,
1682           .vece = MO_8 },
1683         { .fni8 = tcg_gen_vec_sub16_i64,
1684           .fniv = tcg_gen_sub_vec,
1685           .fno = gen_helper_gvec_subs16,
1686           .opt_opc = vecop_list_sub,
1687           .vece = MO_16 },
1688         { .fni4 = tcg_gen_sub_i32,
1689           .fniv = tcg_gen_sub_vec,
1690           .fno = gen_helper_gvec_subs32,
1691           .opt_opc = vecop_list_sub,
1692           .vece = MO_32 },
1693         { .fni8 = tcg_gen_sub_i64,
1694           .fniv = tcg_gen_sub_vec,
1695           .fno = gen_helper_gvec_subs64,
1696           .opt_opc = vecop_list_sub,
1697           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1698           .vece = MO_64 },
1699     };
1700 
1701     tcg_debug_assert(vece <= MO_64);
1702     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1703 }
1704 
1705 /* Perform a vector subtraction using normal subtraction and a mask.
1706    Compare gen_addv_mask above.  */
1707 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1708 {
1709     TCGv_i64 t1 = tcg_temp_new_i64();
1710     TCGv_i64 t2 = tcg_temp_new_i64();
1711     TCGv_i64 t3 = tcg_temp_new_i64();
1712 
1713     tcg_gen_or_i64(t1, a, m);
1714     tcg_gen_andc_i64(t2, b, m);
1715     tcg_gen_eqv_i64(t3, a, b);
1716     tcg_gen_sub_i64(d, t1, t2);
1717     tcg_gen_and_i64(t3, t3, m);
1718     tcg_gen_xor_i64(d, d, t3);
1719 
1720     tcg_temp_free_i64(t1);
1721     tcg_temp_free_i64(t2);
1722     tcg_temp_free_i64(t3);
1723 }
1724 
1725 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1726 {
1727     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1728     gen_subv_mask(d, a, b, m);
1729     tcg_temp_free_i64(m);
1730 }
1731 
1732 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1733 {
1734     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1735     gen_subv_mask(d, a, b, m);
1736     tcg_temp_free_i64(m);
1737 }
1738 
1739 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1740 {
1741     TCGv_i64 t1 = tcg_temp_new_i64();
1742     TCGv_i64 t2 = tcg_temp_new_i64();
1743 
1744     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1745     tcg_gen_sub_i64(t2, a, b);
1746     tcg_gen_sub_i64(t1, a, t1);
1747     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1748 
1749     tcg_temp_free_i64(t1);
1750     tcg_temp_free_i64(t2);
1751 }
1752 
1753 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1754                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1755 {
1756     static const GVecGen3 g[4] = {
1757         { .fni8 = tcg_gen_vec_sub8_i64,
1758           .fniv = tcg_gen_sub_vec,
1759           .fno = gen_helper_gvec_sub8,
1760           .opt_opc = vecop_list_sub,
1761           .vece = MO_8 },
1762         { .fni8 = tcg_gen_vec_sub16_i64,
1763           .fniv = tcg_gen_sub_vec,
1764           .fno = gen_helper_gvec_sub16,
1765           .opt_opc = vecop_list_sub,
1766           .vece = MO_16 },
1767         { .fni4 = tcg_gen_sub_i32,
1768           .fniv = tcg_gen_sub_vec,
1769           .fno = gen_helper_gvec_sub32,
1770           .opt_opc = vecop_list_sub,
1771           .vece = MO_32 },
1772         { .fni8 = tcg_gen_sub_i64,
1773           .fniv = tcg_gen_sub_vec,
1774           .fno = gen_helper_gvec_sub64,
1775           .opt_opc = vecop_list_sub,
1776           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1777           .vece = MO_64 },
1778     };
1779 
1780     tcg_debug_assert(vece <= MO_64);
1781     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1782 }
1783 
1784 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
1785 
1786 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
1787                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1788 {
1789     static const GVecGen3 g[4] = {
1790         { .fniv = tcg_gen_mul_vec,
1791           .fno = gen_helper_gvec_mul8,
1792           .opt_opc = vecop_list_mul,
1793           .vece = MO_8 },
1794         { .fniv = tcg_gen_mul_vec,
1795           .fno = gen_helper_gvec_mul16,
1796           .opt_opc = vecop_list_mul,
1797           .vece = MO_16 },
1798         { .fni4 = tcg_gen_mul_i32,
1799           .fniv = tcg_gen_mul_vec,
1800           .fno = gen_helper_gvec_mul32,
1801           .opt_opc = vecop_list_mul,
1802           .vece = MO_32 },
1803         { .fni8 = tcg_gen_mul_i64,
1804           .fniv = tcg_gen_mul_vec,
1805           .fno = gen_helper_gvec_mul64,
1806           .opt_opc = vecop_list_mul,
1807           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1808           .vece = MO_64 },
1809     };
1810 
1811     tcg_debug_assert(vece <= MO_64);
1812     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1813 }
1814 
1815 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
1816                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1817 {
1818     static const GVecGen2s g[4] = {
1819         { .fniv = tcg_gen_mul_vec,
1820           .fno = gen_helper_gvec_muls8,
1821           .opt_opc = vecop_list_mul,
1822           .vece = MO_8 },
1823         { .fniv = tcg_gen_mul_vec,
1824           .fno = gen_helper_gvec_muls16,
1825           .opt_opc = vecop_list_mul,
1826           .vece = MO_16 },
1827         { .fni4 = tcg_gen_mul_i32,
1828           .fniv = tcg_gen_mul_vec,
1829           .fno = gen_helper_gvec_muls32,
1830           .opt_opc = vecop_list_mul,
1831           .vece = MO_32 },
1832         { .fni8 = tcg_gen_mul_i64,
1833           .fniv = tcg_gen_mul_vec,
1834           .fno = gen_helper_gvec_muls64,
1835           .opt_opc = vecop_list_mul,
1836           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1837           .vece = MO_64 },
1838     };
1839 
1840     tcg_debug_assert(vece <= MO_64);
1841     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1842 }
1843 
1844 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
1845                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1846 {
1847     TCGv_i64 tmp = tcg_const_i64(c);
1848     tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
1849     tcg_temp_free_i64(tmp);
1850 }
1851 
1852 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1853                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1854 {
1855     static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
1856     static const GVecGen3 g[4] = {
1857         { .fniv = tcg_gen_ssadd_vec,
1858           .fno = gen_helper_gvec_ssadd8,
1859           .opt_opc = vecop_list,
1860           .vece = MO_8 },
1861         { .fniv = tcg_gen_ssadd_vec,
1862           .fno = gen_helper_gvec_ssadd16,
1863           .opt_opc = vecop_list,
1864           .vece = MO_16 },
1865         { .fniv = tcg_gen_ssadd_vec,
1866           .fno = gen_helper_gvec_ssadd32,
1867           .opt_opc = vecop_list,
1868           .vece = MO_32 },
1869         { .fniv = tcg_gen_ssadd_vec,
1870           .fno = gen_helper_gvec_ssadd64,
1871           .opt_opc = vecop_list,
1872           .vece = MO_64 },
1873     };
1874     tcg_debug_assert(vece <= MO_64);
1875     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1876 }
1877 
1878 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
1879                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1880 {
1881     static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
1882     static const GVecGen3 g[4] = {
1883         { .fniv = tcg_gen_sssub_vec,
1884           .fno = gen_helper_gvec_sssub8,
1885           .opt_opc = vecop_list,
1886           .vece = MO_8 },
1887         { .fniv = tcg_gen_sssub_vec,
1888           .fno = gen_helper_gvec_sssub16,
1889           .opt_opc = vecop_list,
1890           .vece = MO_16 },
1891         { .fniv = tcg_gen_sssub_vec,
1892           .fno = gen_helper_gvec_sssub32,
1893           .opt_opc = vecop_list,
1894           .vece = MO_32 },
1895         { .fniv = tcg_gen_sssub_vec,
1896           .fno = gen_helper_gvec_sssub64,
1897           .opt_opc = vecop_list,
1898           .vece = MO_64 },
1899     };
1900     tcg_debug_assert(vece <= MO_64);
1901     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1902 }
1903 
1904 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1905 {
1906     TCGv_i32 max = tcg_const_i32(-1);
1907     tcg_gen_add_i32(d, a, b);
1908     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
1909     tcg_temp_free_i32(max);
1910 }
1911 
1912 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1913 {
1914     TCGv_i64 max = tcg_const_i64(-1);
1915     tcg_gen_add_i64(d, a, b);
1916     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
1917     tcg_temp_free_i64(max);
1918 }
1919 
1920 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1921                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1922 {
1923     static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
1924     static const GVecGen3 g[4] = {
1925         { .fniv = tcg_gen_usadd_vec,
1926           .fno = gen_helper_gvec_usadd8,
1927           .opt_opc = vecop_list,
1928           .vece = MO_8 },
1929         { .fniv = tcg_gen_usadd_vec,
1930           .fno = gen_helper_gvec_usadd16,
1931           .opt_opc = vecop_list,
1932           .vece = MO_16 },
1933         { .fni4 = tcg_gen_usadd_i32,
1934           .fniv = tcg_gen_usadd_vec,
1935           .fno = gen_helper_gvec_usadd32,
1936           .opt_opc = vecop_list,
1937           .vece = MO_32 },
1938         { .fni8 = tcg_gen_usadd_i64,
1939           .fniv = tcg_gen_usadd_vec,
1940           .fno = gen_helper_gvec_usadd64,
1941           .opt_opc = vecop_list,
1942           .vece = MO_64 }
1943     };
1944     tcg_debug_assert(vece <= MO_64);
1945     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1946 }
1947 
1948 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1949 {
1950     TCGv_i32 min = tcg_const_i32(0);
1951     tcg_gen_sub_i32(d, a, b);
1952     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
1953     tcg_temp_free_i32(min);
1954 }
1955 
1956 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1957 {
1958     TCGv_i64 min = tcg_const_i64(0);
1959     tcg_gen_sub_i64(d, a, b);
1960     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
1961     tcg_temp_free_i64(min);
1962 }
1963 
1964 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
1965                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1966 {
1967     static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
1968     static const GVecGen3 g[4] = {
1969         { .fniv = tcg_gen_ussub_vec,
1970           .fno = gen_helper_gvec_ussub8,
1971           .opt_opc = vecop_list,
1972           .vece = MO_8 },
1973         { .fniv = tcg_gen_ussub_vec,
1974           .fno = gen_helper_gvec_ussub16,
1975           .opt_opc = vecop_list,
1976           .vece = MO_16 },
1977         { .fni4 = tcg_gen_ussub_i32,
1978           .fniv = tcg_gen_ussub_vec,
1979           .fno = gen_helper_gvec_ussub32,
1980           .opt_opc = vecop_list,
1981           .vece = MO_32 },
1982         { .fni8 = tcg_gen_ussub_i64,
1983           .fniv = tcg_gen_ussub_vec,
1984           .fno = gen_helper_gvec_ussub64,
1985           .opt_opc = vecop_list,
1986           .vece = MO_64 }
1987     };
1988     tcg_debug_assert(vece <= MO_64);
1989     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1990 }
1991 
1992 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
1993                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1994 {
1995     static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
1996     static const GVecGen3 g[4] = {
1997         { .fniv = tcg_gen_smin_vec,
1998           .fno = gen_helper_gvec_smin8,
1999           .opt_opc = vecop_list,
2000           .vece = MO_8 },
2001         { .fniv = tcg_gen_smin_vec,
2002           .fno = gen_helper_gvec_smin16,
2003           .opt_opc = vecop_list,
2004           .vece = MO_16 },
2005         { .fni4 = tcg_gen_smin_i32,
2006           .fniv = tcg_gen_smin_vec,
2007           .fno = gen_helper_gvec_smin32,
2008           .opt_opc = vecop_list,
2009           .vece = MO_32 },
2010         { .fni8 = tcg_gen_smin_i64,
2011           .fniv = tcg_gen_smin_vec,
2012           .fno = gen_helper_gvec_smin64,
2013           .opt_opc = vecop_list,
2014           .vece = MO_64 }
2015     };
2016     tcg_debug_assert(vece <= MO_64);
2017     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2018 }
2019 
2020 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2021                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2022 {
2023     static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2024     static const GVecGen3 g[4] = {
2025         { .fniv = tcg_gen_umin_vec,
2026           .fno = gen_helper_gvec_umin8,
2027           .opt_opc = vecop_list,
2028           .vece = MO_8 },
2029         { .fniv = tcg_gen_umin_vec,
2030           .fno = gen_helper_gvec_umin16,
2031           .opt_opc = vecop_list,
2032           .vece = MO_16 },
2033         { .fni4 = tcg_gen_umin_i32,
2034           .fniv = tcg_gen_umin_vec,
2035           .fno = gen_helper_gvec_umin32,
2036           .opt_opc = vecop_list,
2037           .vece = MO_32 },
2038         { .fni8 = tcg_gen_umin_i64,
2039           .fniv = tcg_gen_umin_vec,
2040           .fno = gen_helper_gvec_umin64,
2041           .opt_opc = vecop_list,
2042           .vece = MO_64 }
2043     };
2044     tcg_debug_assert(vece <= MO_64);
2045     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2046 }
2047 
2048 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2049                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2050 {
2051     static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2052     static const GVecGen3 g[4] = {
2053         { .fniv = tcg_gen_smax_vec,
2054           .fno = gen_helper_gvec_smax8,
2055           .opt_opc = vecop_list,
2056           .vece = MO_8 },
2057         { .fniv = tcg_gen_smax_vec,
2058           .fno = gen_helper_gvec_smax16,
2059           .opt_opc = vecop_list,
2060           .vece = MO_16 },
2061         { .fni4 = tcg_gen_smax_i32,
2062           .fniv = tcg_gen_smax_vec,
2063           .fno = gen_helper_gvec_smax32,
2064           .opt_opc = vecop_list,
2065           .vece = MO_32 },
2066         { .fni8 = tcg_gen_smax_i64,
2067           .fniv = tcg_gen_smax_vec,
2068           .fno = gen_helper_gvec_smax64,
2069           .opt_opc = vecop_list,
2070           .vece = MO_64 }
2071     };
2072     tcg_debug_assert(vece <= MO_64);
2073     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2074 }
2075 
2076 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2077                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2078 {
2079     static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2080     static const GVecGen3 g[4] = {
2081         { .fniv = tcg_gen_umax_vec,
2082           .fno = gen_helper_gvec_umax8,
2083           .opt_opc = vecop_list,
2084           .vece = MO_8 },
2085         { .fniv = tcg_gen_umax_vec,
2086           .fno = gen_helper_gvec_umax16,
2087           .opt_opc = vecop_list,
2088           .vece = MO_16 },
2089         { .fni4 = tcg_gen_umax_i32,
2090           .fniv = tcg_gen_umax_vec,
2091           .fno = gen_helper_gvec_umax32,
2092           .opt_opc = vecop_list,
2093           .vece = MO_32 },
2094         { .fni8 = tcg_gen_umax_i64,
2095           .fniv = tcg_gen_umax_vec,
2096           .fno = gen_helper_gvec_umax64,
2097           .opt_opc = vecop_list,
2098           .vece = MO_64 }
2099     };
2100     tcg_debug_assert(vece <= MO_64);
2101     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2102 }
2103 
2104 /* Perform a vector negation using normal negation and a mask.
2105    Compare gen_subv_mask above.  */
2106 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2107 {
2108     TCGv_i64 t2 = tcg_temp_new_i64();
2109     TCGv_i64 t3 = tcg_temp_new_i64();
2110 
2111     tcg_gen_andc_i64(t3, m, b);
2112     tcg_gen_andc_i64(t2, b, m);
2113     tcg_gen_sub_i64(d, m, t2);
2114     tcg_gen_xor_i64(d, d, t3);
2115 
2116     tcg_temp_free_i64(t2);
2117     tcg_temp_free_i64(t3);
2118 }
2119 
2120 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2121 {
2122     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
2123     gen_negv_mask(d, b, m);
2124     tcg_temp_free_i64(m);
2125 }
2126 
2127 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2128 {
2129     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
2130     gen_negv_mask(d, b, m);
2131     tcg_temp_free_i64(m);
2132 }
2133 
2134 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2135 {
2136     TCGv_i64 t1 = tcg_temp_new_i64();
2137     TCGv_i64 t2 = tcg_temp_new_i64();
2138 
2139     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2140     tcg_gen_neg_i64(t2, b);
2141     tcg_gen_neg_i64(t1, t1);
2142     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2143 
2144     tcg_temp_free_i64(t1);
2145     tcg_temp_free_i64(t2);
2146 }
2147 
2148 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2149                       uint32_t oprsz, uint32_t maxsz)
2150 {
2151     static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2152     static const GVecGen2 g[4] = {
2153         { .fni8 = tcg_gen_vec_neg8_i64,
2154           .fniv = tcg_gen_neg_vec,
2155           .fno = gen_helper_gvec_neg8,
2156           .opt_opc = vecop_list,
2157           .vece = MO_8 },
2158         { .fni8 = tcg_gen_vec_neg16_i64,
2159           .fniv = tcg_gen_neg_vec,
2160           .fno = gen_helper_gvec_neg16,
2161           .opt_opc = vecop_list,
2162           .vece = MO_16 },
2163         { .fni4 = tcg_gen_neg_i32,
2164           .fniv = tcg_gen_neg_vec,
2165           .fno = gen_helper_gvec_neg32,
2166           .opt_opc = vecop_list,
2167           .vece = MO_32 },
2168         { .fni8 = tcg_gen_neg_i64,
2169           .fniv = tcg_gen_neg_vec,
2170           .fno = gen_helper_gvec_neg64,
2171           .opt_opc = vecop_list,
2172           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2173           .vece = MO_64 },
2174     };
2175 
2176     tcg_debug_assert(vece <= MO_64);
2177     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2178 }
2179 
2180 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2181 {
2182     TCGv_i64 t = tcg_temp_new_i64();
2183     int nbit = 8 << vece;
2184 
2185     /* Create -1 for each negative element.  */
2186     tcg_gen_shri_i64(t, b, nbit - 1);
2187     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2188     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2189 
2190     /*
2191      * Invert (via xor -1) and add one (via sub -1).
2192      * Because of the ordering the msb is cleared,
2193      * so we never have carry into the next element.
2194      */
2195     tcg_gen_xor_i64(d, b, t);
2196     tcg_gen_sub_i64(d, d, t);
2197 
2198     tcg_temp_free_i64(t);
2199 }
2200 
2201 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2202 {
2203     gen_absv_mask(d, b, MO_8);
2204 }
2205 
2206 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2207 {
2208     gen_absv_mask(d, b, MO_16);
2209 }
2210 
2211 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2212                       uint32_t oprsz, uint32_t maxsz)
2213 {
2214     static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2215     static const GVecGen2 g[4] = {
2216         { .fni8 = tcg_gen_vec_abs8_i64,
2217           .fniv = tcg_gen_abs_vec,
2218           .fno = gen_helper_gvec_abs8,
2219           .opt_opc = vecop_list,
2220           .vece = MO_8 },
2221         { .fni8 = tcg_gen_vec_abs16_i64,
2222           .fniv = tcg_gen_abs_vec,
2223           .fno = gen_helper_gvec_abs16,
2224           .opt_opc = vecop_list,
2225           .vece = MO_16 },
2226         { .fni4 = tcg_gen_abs_i32,
2227           .fniv = tcg_gen_abs_vec,
2228           .fno = gen_helper_gvec_abs32,
2229           .opt_opc = vecop_list,
2230           .vece = MO_32 },
2231         { .fni8 = tcg_gen_abs_i64,
2232           .fniv = tcg_gen_abs_vec,
2233           .fno = gen_helper_gvec_abs64,
2234           .opt_opc = vecop_list,
2235           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2236           .vece = MO_64 },
2237     };
2238 
2239     tcg_debug_assert(vece <= MO_64);
2240     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2241 }
2242 
2243 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2244                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2245 {
2246     static const GVecGen3 g = {
2247         .fni8 = tcg_gen_and_i64,
2248         .fniv = tcg_gen_and_vec,
2249         .fno = gen_helper_gvec_and,
2250         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2251     };
2252 
2253     if (aofs == bofs) {
2254         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2255     } else {
2256         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2257     }
2258 }
2259 
2260 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2261                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2262 {
2263     static const GVecGen3 g = {
2264         .fni8 = tcg_gen_or_i64,
2265         .fniv = tcg_gen_or_vec,
2266         .fno = gen_helper_gvec_or,
2267         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2268     };
2269 
2270     if (aofs == bofs) {
2271         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2272     } else {
2273         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2274     }
2275 }
2276 
2277 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2278                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2279 {
2280     static const GVecGen3 g = {
2281         .fni8 = tcg_gen_xor_i64,
2282         .fniv = tcg_gen_xor_vec,
2283         .fno = gen_helper_gvec_xor,
2284         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2285     };
2286 
2287     if (aofs == bofs) {
2288         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
2289     } else {
2290         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2291     }
2292 }
2293 
2294 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2295                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2296 {
2297     static const GVecGen3 g = {
2298         .fni8 = tcg_gen_andc_i64,
2299         .fniv = tcg_gen_andc_vec,
2300         .fno = gen_helper_gvec_andc,
2301         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2302     };
2303 
2304     if (aofs == bofs) {
2305         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
2306     } else {
2307         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2308     }
2309 }
2310 
2311 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2312                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2313 {
2314     static const GVecGen3 g = {
2315         .fni8 = tcg_gen_orc_i64,
2316         .fniv = tcg_gen_orc_vec,
2317         .fno = gen_helper_gvec_orc,
2318         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2319     };
2320 
2321     if (aofs == bofs) {
2322         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
2323     } else {
2324         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2325     }
2326 }
2327 
2328 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2329                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2330 {
2331     static const GVecGen3 g = {
2332         .fni8 = tcg_gen_nand_i64,
2333         .fniv = tcg_gen_nand_vec,
2334         .fno = gen_helper_gvec_nand,
2335         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2336     };
2337 
2338     if (aofs == bofs) {
2339         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2340     } else {
2341         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2342     }
2343 }
2344 
2345 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2346                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2347 {
2348     static const GVecGen3 g = {
2349         .fni8 = tcg_gen_nor_i64,
2350         .fniv = tcg_gen_nor_vec,
2351         .fno = gen_helper_gvec_nor,
2352         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2353     };
2354 
2355     if (aofs == bofs) {
2356         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2357     } else {
2358         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2359     }
2360 }
2361 
2362 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2363                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2364 {
2365     static const GVecGen3 g = {
2366         .fni8 = tcg_gen_eqv_i64,
2367         .fniv = tcg_gen_eqv_vec,
2368         .fno = gen_helper_gvec_eqv,
2369         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2370     };
2371 
2372     if (aofs == bofs) {
2373         tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
2374     } else {
2375         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2376     }
2377 }
2378 
2379 static const GVecGen2s gop_ands = {
2380     .fni8 = tcg_gen_and_i64,
2381     .fniv = tcg_gen_and_vec,
2382     .fno = gen_helper_gvec_ands,
2383     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2384     .vece = MO_64
2385 };
2386 
2387 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2388                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2389 {
2390     TCGv_i64 tmp = tcg_temp_new_i64();
2391     gen_dup_i64(vece, tmp, c);
2392     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2393     tcg_temp_free_i64(tmp);
2394 }
2395 
2396 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2397                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2398 {
2399     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2400     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2401     tcg_temp_free_i64(tmp);
2402 }
2403 
2404 static const GVecGen2s gop_xors = {
2405     .fni8 = tcg_gen_xor_i64,
2406     .fniv = tcg_gen_xor_vec,
2407     .fno = gen_helper_gvec_xors,
2408     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2409     .vece = MO_64
2410 };
2411 
2412 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2413                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2414 {
2415     TCGv_i64 tmp = tcg_temp_new_i64();
2416     gen_dup_i64(vece, tmp, c);
2417     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2418     tcg_temp_free_i64(tmp);
2419 }
2420 
2421 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2422                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2423 {
2424     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2425     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2426     tcg_temp_free_i64(tmp);
2427 }
2428 
2429 static const GVecGen2s gop_ors = {
2430     .fni8 = tcg_gen_or_i64,
2431     .fniv = tcg_gen_or_vec,
2432     .fno = gen_helper_gvec_ors,
2433     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2434     .vece = MO_64
2435 };
2436 
2437 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2438                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2439 {
2440     TCGv_i64 tmp = tcg_temp_new_i64();
2441     gen_dup_i64(vece, tmp, c);
2442     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2443     tcg_temp_free_i64(tmp);
2444 }
2445 
2446 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2447                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2448 {
2449     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2450     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2451     tcg_temp_free_i64(tmp);
2452 }
2453 
2454 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2455 {
2456     uint64_t mask = dup_const(MO_8, 0xff << c);
2457     tcg_gen_shli_i64(d, a, c);
2458     tcg_gen_andi_i64(d, d, mask);
2459 }
2460 
2461 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2462 {
2463     uint64_t mask = dup_const(MO_16, 0xffff << c);
2464     tcg_gen_shli_i64(d, a, c);
2465     tcg_gen_andi_i64(d, d, mask);
2466 }
2467 
2468 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2469                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2470 {
2471     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2472     static const GVecGen2i g[4] = {
2473         { .fni8 = tcg_gen_vec_shl8i_i64,
2474           .fniv = tcg_gen_shli_vec,
2475           .fno = gen_helper_gvec_shl8i,
2476           .opt_opc = vecop_list,
2477           .vece = MO_8 },
2478         { .fni8 = tcg_gen_vec_shl16i_i64,
2479           .fniv = tcg_gen_shli_vec,
2480           .fno = gen_helper_gvec_shl16i,
2481           .opt_opc = vecop_list,
2482           .vece = MO_16 },
2483         { .fni4 = tcg_gen_shli_i32,
2484           .fniv = tcg_gen_shli_vec,
2485           .fno = gen_helper_gvec_shl32i,
2486           .opt_opc = vecop_list,
2487           .vece = MO_32 },
2488         { .fni8 = tcg_gen_shli_i64,
2489           .fniv = tcg_gen_shli_vec,
2490           .fno = gen_helper_gvec_shl64i,
2491           .opt_opc = vecop_list,
2492           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2493           .vece = MO_64 },
2494     };
2495 
2496     tcg_debug_assert(vece <= MO_64);
2497     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2498     if (shift == 0) {
2499         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2500     } else {
2501         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2502     }
2503 }
2504 
2505 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2506 {
2507     uint64_t mask = dup_const(MO_8, 0xff >> c);
2508     tcg_gen_shri_i64(d, a, c);
2509     tcg_gen_andi_i64(d, d, mask);
2510 }
2511 
2512 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2513 {
2514     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2515     tcg_gen_shri_i64(d, a, c);
2516     tcg_gen_andi_i64(d, d, mask);
2517 }
2518 
2519 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2520                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2521 {
2522     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2523     static const GVecGen2i g[4] = {
2524         { .fni8 = tcg_gen_vec_shr8i_i64,
2525           .fniv = tcg_gen_shri_vec,
2526           .fno = gen_helper_gvec_shr8i,
2527           .opt_opc = vecop_list,
2528           .vece = MO_8 },
2529         { .fni8 = tcg_gen_vec_shr16i_i64,
2530           .fniv = tcg_gen_shri_vec,
2531           .fno = gen_helper_gvec_shr16i,
2532           .opt_opc = vecop_list,
2533           .vece = MO_16 },
2534         { .fni4 = tcg_gen_shri_i32,
2535           .fniv = tcg_gen_shri_vec,
2536           .fno = gen_helper_gvec_shr32i,
2537           .opt_opc = vecop_list,
2538           .vece = MO_32 },
2539         { .fni8 = tcg_gen_shri_i64,
2540           .fniv = tcg_gen_shri_vec,
2541           .fno = gen_helper_gvec_shr64i,
2542           .opt_opc = vecop_list,
2543           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2544           .vece = MO_64 },
2545     };
2546 
2547     tcg_debug_assert(vece <= MO_64);
2548     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2549     if (shift == 0) {
2550         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2551     } else {
2552         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2553     }
2554 }
2555 
2556 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2557 {
2558     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2559     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2560     TCGv_i64 s = tcg_temp_new_i64();
2561 
2562     tcg_gen_shri_i64(d, a, c);
2563     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2564     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2565     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2566     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2567     tcg_temp_free_i64(s);
2568 }
2569 
2570 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2571 {
2572     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2573     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2574     TCGv_i64 s = tcg_temp_new_i64();
2575 
2576     tcg_gen_shri_i64(d, a, c);
2577     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2578     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2579     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2580     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2581     tcg_temp_free_i64(s);
2582 }
2583 
2584 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2585                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2586 {
2587     static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
2588     static const GVecGen2i g[4] = {
2589         { .fni8 = tcg_gen_vec_sar8i_i64,
2590           .fniv = tcg_gen_sari_vec,
2591           .fno = gen_helper_gvec_sar8i,
2592           .opt_opc = vecop_list,
2593           .vece = MO_8 },
2594         { .fni8 = tcg_gen_vec_sar16i_i64,
2595           .fniv = tcg_gen_sari_vec,
2596           .fno = gen_helper_gvec_sar16i,
2597           .opt_opc = vecop_list,
2598           .vece = MO_16 },
2599         { .fni4 = tcg_gen_sari_i32,
2600           .fniv = tcg_gen_sari_vec,
2601           .fno = gen_helper_gvec_sar32i,
2602           .opt_opc = vecop_list,
2603           .vece = MO_32 },
2604         { .fni8 = tcg_gen_sari_i64,
2605           .fniv = tcg_gen_sari_vec,
2606           .fno = gen_helper_gvec_sar64i,
2607           .opt_opc = vecop_list,
2608           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2609           .vece = MO_64 },
2610     };
2611 
2612     tcg_debug_assert(vece <= MO_64);
2613     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2614     if (shift == 0) {
2615         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2616     } else {
2617         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2618     }
2619 }
2620 
/*
 * Specialized generation of vector shifts by a non-constant scalar.
 */

/*
 * Descriptor for a vector shift whose count is a run-time scalar,
 * consumed by do_gvec_shifts.
 */
typedef struct {
    /* Expanders on 32-bit / 64-bit integer temporaries.  */
    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
    /* Host-vector expander taking the shift count as a TCGv_i32.  */
    void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
    /* Host-vector expander taking the shift count as a vector.  */
    void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
    /* Out-of-line helpers; presumably one per element size -- confirm.  */
    gen_helper_gvec_2 *fno[4];
    /* Opcode lists passed to choose_vector_type for the two forms.  */
    TCGOpcode s_list[2];
    TCGOpcode v_list[2];
} GVecGen2sh;
2634 
2635 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2636                            uint32_t oprsz, uint32_t tysz, TCGType type,
2637                            TCGv_i32 shift,
2638                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
2639 {
2640     TCGv_vec t0 = tcg_temp_new_vec(type);
2641     uint32_t i;
2642 
2643     for (i = 0; i < oprsz; i += tysz) {
2644         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2645         fni(vece, t0, t0, shift);
2646         tcg_gen_st_vec(t0, cpu_env, dofs + i);
2647     }
2648     tcg_temp_free_vec(t0);
2649 }
2650 
2651 static void
2652 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
2653                uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
2654 {
2655     TCGType type;
2656     uint32_t some;
2657 
2658     check_size_align(oprsz, maxsz, dofs | aofs);
2659     check_overlap_2(dofs, aofs, maxsz);
2660 
2661     /* If the backend has a scalar expansion, great.  */
2662     type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
2663     if (type) {
2664         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2665         switch (type) {
2666         case TCG_TYPE_V256:
2667             some = QEMU_ALIGN_DOWN(oprsz, 32);
2668             expand_2sh_vec(vece, dofs, aofs, some, 32,
2669                            TCG_TYPE_V256, shift, g->fniv_s);
2670             if (some == oprsz) {
2671                 break;
2672             }
2673             dofs += some;
2674             aofs += some;
2675             oprsz -= some;
2676             maxsz -= some;
2677             /* fallthru */
2678         case TCG_TYPE_V128:
2679             expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
2680                            TCG_TYPE_V128, shift, g->fniv_s);
2681             break;
2682         case TCG_TYPE_V64:
2683             expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
2684                            TCG_TYPE_V64, shift, g->fniv_s);
2685             break;
2686         default:
2687             g_assert_not_reached();
2688         }
2689         tcg_swap_vecop_list(hold_list);
2690         goto clear_tail;
2691     }
2692 
2693     /* If the backend supports variable vector shifts, also cool.  */
2694     type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
2695     if (type) {
2696         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2697         TCGv_vec v_shift = tcg_temp_new_vec(type);
2698 
2699         if (vece == MO_64) {
2700             TCGv_i64 sh64 = tcg_temp_new_i64();
2701             tcg_gen_extu_i32_i64(sh64, shift);
2702             tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
2703             tcg_temp_free_i64(sh64);
2704         } else {
2705             tcg_gen_dup_i32_vec(vece, v_shift, shift);
2706         }
2707 
2708         switch (type) {
2709         case TCG_TYPE_V256:
2710             some = QEMU_ALIGN_DOWN(oprsz, 32);
2711             expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
2712                           v_shift, false, g->fniv_v);
2713             if (some == oprsz) {
2714                 break;
2715             }
2716             dofs += some;
2717             aofs += some;
2718             oprsz -= some;
2719             maxsz -= some;
2720             /* fallthru */
2721         case TCG_TYPE_V128:
2722             expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
2723                           v_shift, false, g->fniv_v);
2724             break;
2725         case TCG_TYPE_V64:
2726             expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
2727                           v_shift, false, g->fniv_v);
2728             break;
2729         default:
2730             g_assert_not_reached();
2731         }
2732         tcg_temp_free_vec(v_shift);
2733         tcg_swap_vecop_list(hold_list);
2734         goto clear_tail;
2735     }
2736 
2737     /* Otherwise fall back to integral... */
2738     if (vece == MO_32 && check_size_impl(oprsz, 4)) {
2739         expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
2740     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
2741         TCGv_i64 sh64 = tcg_temp_new_i64();
2742         tcg_gen_extu_i32_i64(sh64, shift);
2743         expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
2744         tcg_temp_free_i64(sh64);
2745     } else {
2746         TCGv_ptr a0 = tcg_temp_new_ptr();
2747         TCGv_ptr a1 = tcg_temp_new_ptr();
2748         TCGv_i32 desc = tcg_temp_new_i32();
2749 
2750         tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
2751         tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
2752         tcg_gen_addi_ptr(a0, cpu_env, dofs);
2753         tcg_gen_addi_ptr(a1, cpu_env, aofs);
2754 
2755         g->fno[vece](a0, a1, desc);
2756 
2757         tcg_temp_free_ptr(a0);
2758         tcg_temp_free_ptr(a1);
2759         tcg_temp_free_i32(desc);
2760         return;
2761     }
2762 
2763  clear_tail:
2764     if (oprsz < maxsz) {
2765         expand_clr(dofs + oprsz, maxsz - oprsz);
2766     }
2767 }
2768 
2769 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
2770                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2771 {
2772     static const GVecGen2sh g = {
2773         .fni4 = tcg_gen_shl_i32,
2774         .fni8 = tcg_gen_shl_i64,
2775         .fniv_s = tcg_gen_shls_vec,
2776         .fniv_v = tcg_gen_shlv_vec,
2777         .fno = {
2778             gen_helper_gvec_shl8i,
2779             gen_helper_gvec_shl16i,
2780             gen_helper_gvec_shl32i,
2781             gen_helper_gvec_shl64i,
2782         },
2783         .s_list = { INDEX_op_shls_vec, 0 },
2784         .v_list = { INDEX_op_shlv_vec, 0 },
2785     };
2786 
2787     tcg_debug_assert(vece <= MO_64);
2788     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2789 }
2790 
2791 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
2792                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2793 {
2794     static const GVecGen2sh g = {
2795         .fni4 = tcg_gen_shr_i32,
2796         .fni8 = tcg_gen_shr_i64,
2797         .fniv_s = tcg_gen_shrs_vec,
2798         .fniv_v = tcg_gen_shrv_vec,
2799         .fno = {
2800             gen_helper_gvec_shr8i,
2801             gen_helper_gvec_shr16i,
2802             gen_helper_gvec_shr32i,
2803             gen_helper_gvec_shr64i,
2804         },
2805         .s_list = { INDEX_op_shrs_vec, 0 },
2806         .v_list = { INDEX_op_shrv_vec, 0 },
2807     };
2808 
2809     tcg_debug_assert(vece <= MO_64);
2810     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2811 }
2812 
2813 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
2814                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2815 {
2816     static const GVecGen2sh g = {
2817         .fni4 = tcg_gen_sar_i32,
2818         .fni8 = tcg_gen_sar_i64,
2819         .fniv_s = tcg_gen_sars_vec,
2820         .fniv_v = tcg_gen_sarv_vec,
2821         .fno = {
2822             gen_helper_gvec_sar8i,
2823             gen_helper_gvec_sar16i,
2824             gen_helper_gvec_sar32i,
2825             gen_helper_gvec_sar64i,
2826         },
2827         .s_list = { INDEX_op_sars_vec, 0 },
2828         .v_list = { INDEX_op_sarv_vec, 0 },
2829     };
2830 
2831     tcg_debug_assert(vece <= MO_64);
2832     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2833 }
2834 
/*
 * Expand D = A << (B % element bits)
 *
 * Unlike scalar shifts, where it is easy for the target front end
 * to include the modulo as part of the expansion, for vector shifts
 * the modulo is applied here.  If the target naturally includes the
 * modulo as part of the operation, great!  If the target has some
 * other behaviour from out-of-range shifts, then it could not use
 * this function anyway, and would need to do its own expansion with
 * custom functions.
 */
2845 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
2846                                  TCGv_vec a, TCGv_vec b)
2847 {
2848     TCGv_vec t = tcg_temp_new_vec_matching(d);
2849 
2850     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
2851     tcg_gen_and_vec(vece, t, t, b);
2852     tcg_gen_shlv_vec(vece, d, a, t);
2853     tcg_temp_free_vec(t);
2854 }
2855 
2856 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2857 {
2858     TCGv_i32 t = tcg_temp_new_i32();
2859 
2860     tcg_gen_andi_i32(t, b, 31);
2861     tcg_gen_shl_i32(d, a, t);
2862     tcg_temp_free_i32(t);
2863 }
2864 
2865 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2866 {
2867     TCGv_i64 t = tcg_temp_new_i64();
2868 
2869     tcg_gen_andi_i64(t, b, 63);
2870     tcg_gen_shl_i64(d, a, t);
2871     tcg_temp_free_i64(t);
2872 }
2873 
2874 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
2875                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2876 {
2877     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
2878     static const GVecGen3 g[4] = {
2879         { .fniv = tcg_gen_shlv_mod_vec,
2880           .fno = gen_helper_gvec_shl8v,
2881           .opt_opc = vecop_list,
2882           .vece = MO_8 },
2883         { .fniv = tcg_gen_shlv_mod_vec,
2884           .fno = gen_helper_gvec_shl16v,
2885           .opt_opc = vecop_list,
2886           .vece = MO_16 },
2887         { .fni4 = tcg_gen_shl_mod_i32,
2888           .fniv = tcg_gen_shlv_mod_vec,
2889           .fno = gen_helper_gvec_shl32v,
2890           .opt_opc = vecop_list,
2891           .vece = MO_32 },
2892         { .fni8 = tcg_gen_shl_mod_i64,
2893           .fniv = tcg_gen_shlv_mod_vec,
2894           .fno = gen_helper_gvec_shl64v,
2895           .opt_opc = vecop_list,
2896           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2897           .vece = MO_64 },
2898     };
2899 
2900     tcg_debug_assert(vece <= MO_64);
2901     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2902 }
2903 
2904 /*
2905  * Similarly for logical right shifts.
2906  */
2907 
2908 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
2909                                  TCGv_vec a, TCGv_vec b)
2910 {
2911     TCGv_vec t = tcg_temp_new_vec_matching(d);
2912 
2913     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
2914     tcg_gen_and_vec(vece, t, t, b);
2915     tcg_gen_shrv_vec(vece, d, a, t);
2916     tcg_temp_free_vec(t);
2917 }
2918 
2919 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2920 {
2921     TCGv_i32 t = tcg_temp_new_i32();
2922 
2923     tcg_gen_andi_i32(t, b, 31);
2924     tcg_gen_shr_i32(d, a, t);
2925     tcg_temp_free_i32(t);
2926 }
2927 
2928 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2929 {
2930     TCGv_i64 t = tcg_temp_new_i64();
2931 
2932     tcg_gen_andi_i64(t, b, 63);
2933     tcg_gen_shr_i64(d, a, t);
2934     tcg_temp_free_i64(t);
2935 }
2936 
2937 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
2938                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2939 {
2940     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
2941     static const GVecGen3 g[4] = {
2942         { .fniv = tcg_gen_shrv_mod_vec,
2943           .fno = gen_helper_gvec_shr8v,
2944           .opt_opc = vecop_list,
2945           .vece = MO_8 },
2946         { .fniv = tcg_gen_shrv_mod_vec,
2947           .fno = gen_helper_gvec_shr16v,
2948           .opt_opc = vecop_list,
2949           .vece = MO_16 },
2950         { .fni4 = tcg_gen_shr_mod_i32,
2951           .fniv = tcg_gen_shrv_mod_vec,
2952           .fno = gen_helper_gvec_shr32v,
2953           .opt_opc = vecop_list,
2954           .vece = MO_32 },
2955         { .fni8 = tcg_gen_shr_mod_i64,
2956           .fniv = tcg_gen_shrv_mod_vec,
2957           .fno = gen_helper_gvec_shr64v,
2958           .opt_opc = vecop_list,
2959           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2960           .vece = MO_64 },
2961     };
2962 
2963     tcg_debug_assert(vece <= MO_64);
2964     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2965 }
2966 
2967 /*
2968  * Similarly for arithmetic right shifts.
2969  */
2970 
2971 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
2972                                  TCGv_vec a, TCGv_vec b)
2973 {
2974     TCGv_vec t = tcg_temp_new_vec_matching(d);
2975 
2976     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
2977     tcg_gen_and_vec(vece, t, t, b);
2978     tcg_gen_sarv_vec(vece, d, a, t);
2979     tcg_temp_free_vec(t);
2980 }
2981 
2982 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2983 {
2984     TCGv_i32 t = tcg_temp_new_i32();
2985 
2986     tcg_gen_andi_i32(t, b, 31);
2987     tcg_gen_sar_i32(d, a, t);
2988     tcg_temp_free_i32(t);
2989 }
2990 
2991 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2992 {
2993     TCGv_i64 t = tcg_temp_new_i64();
2994 
2995     tcg_gen_andi_i64(t, b, 63);
2996     tcg_gen_sar_i64(d, a, t);
2997     tcg_temp_free_i64(t);
2998 }
2999 
3000 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3001                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3002 {
3003     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3004     static const GVecGen3 g[4] = {
3005         { .fniv = tcg_gen_sarv_mod_vec,
3006           .fno = gen_helper_gvec_sar8v,
3007           .opt_opc = vecop_list,
3008           .vece = MO_8 },
3009         { .fniv = tcg_gen_sarv_mod_vec,
3010           .fno = gen_helper_gvec_sar16v,
3011           .opt_opc = vecop_list,
3012           .vece = MO_16 },
3013         { .fni4 = tcg_gen_sar_mod_i32,
3014           .fniv = tcg_gen_sarv_mod_vec,
3015           .fno = gen_helper_gvec_sar32v,
3016           .opt_opc = vecop_list,
3017           .vece = MO_32 },
3018         { .fni8 = tcg_gen_sar_mod_i64,
3019           .fniv = tcg_gen_sarv_mod_vec,
3020           .fno = gen_helper_gvec_sar64v,
3021           .opt_opc = vecop_list,
3022           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3023           .vece = MO_64 },
3024     };
3025 
3026     tcg_debug_assert(vece <= MO_64);
3027     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3028 }
3029 
3030 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
3031 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3032                            uint32_t oprsz, TCGCond cond)
3033 {
3034     TCGv_i32 t0 = tcg_temp_new_i32();
3035     TCGv_i32 t1 = tcg_temp_new_i32();
3036     uint32_t i;
3037 
3038     for (i = 0; i < oprsz; i += 4) {
3039         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
3040         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
3041         tcg_gen_setcond_i32(cond, t0, t0, t1);
3042         tcg_gen_neg_i32(t0, t0);
3043         tcg_gen_st_i32(t0, cpu_env, dofs + i);
3044     }
3045     tcg_temp_free_i32(t1);
3046     tcg_temp_free_i32(t0);
3047 }
3048 
3049 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3050                            uint32_t oprsz, TCGCond cond)
3051 {
3052     TCGv_i64 t0 = tcg_temp_new_i64();
3053     TCGv_i64 t1 = tcg_temp_new_i64();
3054     uint32_t i;
3055 
3056     for (i = 0; i < oprsz; i += 8) {
3057         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
3058         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
3059         tcg_gen_setcond_i64(cond, t0, t0, t1);
3060         tcg_gen_neg_i64(t0, t0);
3061         tcg_gen_st_i64(t0, cpu_env, dofs + i);
3062     }
3063     tcg_temp_free_i64(t1);
3064     tcg_temp_free_i64(t0);
3065 }
3066 
3067 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3068                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3069                            TCGType type, TCGCond cond)
3070 {
3071     TCGv_vec t0 = tcg_temp_new_vec(type);
3072     TCGv_vec t1 = tcg_temp_new_vec(type);
3073     uint32_t i;
3074 
3075     for (i = 0; i < oprsz; i += tysz) {
3076         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3077         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
3078         tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
3079         tcg_gen_st_vec(t0, cpu_env, dofs + i);
3080     }
3081     tcg_temp_free_vec(t1);
3082     tcg_temp_free_vec(t0);
3083 }
3084 
3085 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3086                       uint32_t aofs, uint32_t bofs,
3087                       uint32_t oprsz, uint32_t maxsz)
3088 {
3089     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3090     static gen_helper_gvec_3 * const eq_fn[4] = {
3091         gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3092         gen_helper_gvec_eq32, gen_helper_gvec_eq64
3093     };
3094     static gen_helper_gvec_3 * const ne_fn[4] = {
3095         gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3096         gen_helper_gvec_ne32, gen_helper_gvec_ne64
3097     };
3098     static gen_helper_gvec_3 * const lt_fn[4] = {
3099         gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3100         gen_helper_gvec_lt32, gen_helper_gvec_lt64
3101     };
3102     static gen_helper_gvec_3 * const le_fn[4] = {
3103         gen_helper_gvec_le8, gen_helper_gvec_le16,
3104         gen_helper_gvec_le32, gen_helper_gvec_le64
3105     };
3106     static gen_helper_gvec_3 * const ltu_fn[4] = {
3107         gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3108         gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3109     };
3110     static gen_helper_gvec_3 * const leu_fn[4] = {
3111         gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3112         gen_helper_gvec_leu32, gen_helper_gvec_leu64
3113     };
3114     static gen_helper_gvec_3 * const * const fns[16] = {
3115         [TCG_COND_EQ] = eq_fn,
3116         [TCG_COND_NE] = ne_fn,
3117         [TCG_COND_LT] = lt_fn,
3118         [TCG_COND_LE] = le_fn,
3119         [TCG_COND_LTU] = ltu_fn,
3120         [TCG_COND_LEU] = leu_fn,
3121     };
3122 
3123     const TCGOpcode *hold_list;
3124     TCGType type;
3125     uint32_t some;
3126 
3127     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3128     check_overlap_3(dofs, aofs, bofs, maxsz);
3129 
3130     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3131         do_dup(MO_8, dofs, oprsz, maxsz,
3132                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3133         return;
3134     }
3135 
3136     /*
3137      * Implement inline with a vector type, if possible.
3138      * Prefer integer when 64-bit host and 64-bit comparison.
3139      */
3140     hold_list = tcg_swap_vecop_list(cmp_list);
3141     type = choose_vector_type(cmp_list, vece, oprsz,
3142                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3143     switch (type) {
3144     case TCG_TYPE_V256:
3145         /* Recall that ARM SVE allows vector sizes that are not a
3146          * power of 2, but always a multiple of 16.  The intent is
3147          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3148          */
3149         some = QEMU_ALIGN_DOWN(oprsz, 32);
3150         expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3151         if (some == oprsz) {
3152             break;
3153         }
3154         dofs += some;
3155         aofs += some;
3156         bofs += some;
3157         oprsz -= some;
3158         maxsz -= some;
3159         /* fallthru */
3160     case TCG_TYPE_V128:
3161         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3162         break;
3163     case TCG_TYPE_V64:
3164         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3165         break;
3166 
3167     case 0:
3168         if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3169             expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3170         } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3171             expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3172         } else {
3173             gen_helper_gvec_3 * const *fn = fns[cond];
3174 
3175             if (fn == NULL) {
3176                 uint32_t tmp;
3177                 tmp = aofs, aofs = bofs, bofs = tmp;
3178                 cond = tcg_swap_cond(cond);
3179                 fn = fns[cond];
3180                 assert(fn != NULL);
3181             }
3182             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3183             oprsz = maxsz;
3184         }
3185         break;
3186 
3187     default:
3188         g_assert_not_reached();
3189     }
3190     tcg_swap_vecop_list(hold_list);
3191 
3192     if (oprsz < maxsz) {
3193         expand_clr(dofs + oprsz, maxsz - oprsz);
3194     }
3195 }
3196