xref: /openbmc/qemu/tcg/tcg-op-gvec.c (revision e06cd791381383c6fa6041ad0758a86c5b1509e6)
1 /*
2  * Generic vector operation expansion
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "tcg/tcg.h"
22 #include "tcg/tcg-temp-internal.h"
23 #include "tcg/tcg-op-common.h"
24 #include "tcg/tcg-op-gvec-common.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "tcg-has.h"
27 
/* Upper bound on the number of inline operations that
   check_size_impl() accepts for one expansion.  */
#define MAX_UNROLL  4

/*
 * vecop_list_empty is a one-entry, zero-filled TCGOpcode array when
 * CONFIG_DEBUG_TCG is defined and plain NULL otherwise.
 * NOTE(review): its users are outside this part of the file; presumably
 * it stands for "no vector opcodes required" -- confirm against callers.
 */
#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif
35 
36 
37 /* Verify vector size and alignment rules.  OFS should be the OR of all
38    of the operand offsets so that we can check them all at once.  */
39 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
40 {
41     uint32_t max_align;
42 
43     switch (oprsz) {
44     case 8:
45     case 16:
46     case 32:
47         tcg_debug_assert(oprsz <= maxsz);
48         break;
49     default:
50         tcg_debug_assert(oprsz == maxsz);
51         break;
52     }
53     tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
54 
55     max_align = maxsz >= 16 ? 15 : 7;
56     tcg_debug_assert((maxsz & max_align) == 0);
57     tcg_debug_assert((ofs & max_align) == 0);
58 }
59 
60 /*
61  * Verify vector overlap rules for two operands.
62  * When dbase and abase are not the same pointer, we cannot check for
63  * overlap at compile-time, but the runtime restrictions remain.
64  */
65 static void check_overlap_2(TCGv_ptr dbase, uint32_t d,
66                             TCGv_ptr abase, uint32_t a, uint32_t s)
67 {
68     tcg_debug_assert(dbase != abase || d == a || d + s <= a || a + s <= d);
69 }
70 
71 /* Verify vector overlap rules for three operands.  */
72 static void check_overlap_3(TCGv_ptr dbase, uint32_t d,
73                             TCGv_ptr abase, uint32_t a,
74                             TCGv_ptr bbase, uint32_t b, uint32_t s)
75 {
76     check_overlap_2(dbase, d, abase, a, s);
77     check_overlap_2(dbase, d, bbase, b, s);
78     check_overlap_2(abase, a, bbase, b, s);
79 }
80 
81 /* Verify vector overlap rules for four operands.  */
82 static void check_overlap_4(TCGv_ptr dbase, uint32_t d,
83                             TCGv_ptr abase, uint32_t a,
84                             TCGv_ptr bbase, uint32_t b,
85                             TCGv_ptr cbase, uint32_t c, uint32_t s)
86 {
87     check_overlap_2(dbase, d, abase, a, s);
88     check_overlap_2(dbase, d, bbase, b, s);
89     check_overlap_2(dbase, d, cbase, c, s);
90     check_overlap_2(abase, a, bbase, b, s);
91     check_overlap_2(abase, a, cbase, c, s);
92     check_overlap_2(bbase, b, cbase, c, s);
93 }
94 
95 /* Create a descriptor from components.  */
96 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
97 {
98     uint32_t desc = 0;
99 
100     check_size_align(oprsz, maxsz, 0);
101 
102     /*
103      * We want to check that 'data' will fit into SIMD_DATA_BITS.
104      * However, some callers want to treat the data as a signed
105      * value (which they can later get back with simd_data())
106      * and some want to treat it as an unsigned value.
107      * So here we assert only that the data will fit into the
108      * field in at least one way. This means that some invalid
109      * values from the caller will not be detected, e.g. if the
110      * caller wants to handle the value as a signed integer but
111      * incorrectly passes us 1 << (SIMD_DATA_BITS - 1).
112      */
113     tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS) ||
114                      data == extract32(data, 0, SIMD_DATA_BITS));
115 
116     oprsz = (oprsz / 8) - 1;
117     maxsz = (maxsz / 8) - 1;
118 
119     /*
120      * We have just asserted in check_size_align that either
121      * oprsz is {8,16,32} or matches maxsz.  Encode the final
122      * case with '2', as that would otherwise map to 24.
123      */
124     if (oprsz == maxsz) {
125         oprsz = 2;
126     }
127 
128     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
129     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
130     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
131 
132     return desc;
133 }
134 
135 /* Generate a call to a gvec-style helper with two vector operands.  */
136 static void expand_2_ool(TCGv_ptr dbase, uint32_t dofs,
137                          TCGv_ptr abase, uint32_t aofs,
138                          uint32_t oprsz, uint32_t maxsz,
139                          int32_t data, gen_helper_gvec_2 *fn)
140 {
141     TCGv_ptr a0, a1;
142     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
143 
144     a0 = tcg_temp_ebb_new_ptr();
145     a1 = tcg_temp_ebb_new_ptr();
146 
147     tcg_gen_addi_ptr(a0, dbase, dofs);
148     tcg_gen_addi_ptr(a1, abase, aofs);
149 
150     fn(a0, a1, desc);
151 
152     tcg_temp_free_ptr(a0);
153     tcg_temp_free_ptr(a1);
154 }
155 
156 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
157                         uint32_t oprsz, uint32_t maxsz, int32_t data,
158                         gen_helper_gvec_2 *fn)
159 {
160     expand_2_ool(tcg_env, dofs, tcg_env, aofs, oprsz, maxsz, data, fn);
161 }
162 
163 /* Generate a call to a gvec-style helper with two vector operands
164    and one scalar operand.  */
165 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
166                          uint32_t oprsz, uint32_t maxsz, int32_t data,
167                          gen_helper_gvec_2i *fn)
168 {
169     TCGv_ptr a0, a1;
170     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
171 
172     a0 = tcg_temp_ebb_new_ptr();
173     a1 = tcg_temp_ebb_new_ptr();
174 
175     tcg_gen_addi_ptr(a0, tcg_env, dofs);
176     tcg_gen_addi_ptr(a1, tcg_env, aofs);
177 
178     fn(a0, a1, c, desc);
179 
180     tcg_temp_free_ptr(a0);
181     tcg_temp_free_ptr(a1);
182 }
183 
184 /* Generate a call to a gvec-style helper with three vector operands.  */
185 static void expand_3_ool(TCGv_ptr dbase, uint32_t dofs,
186                          TCGv_ptr abase, uint32_t aofs,
187                          TCGv_ptr bbase, uint32_t bofs,
188                          uint32_t oprsz, uint32_t maxsz,
189                          int32_t data, gen_helper_gvec_3 *fn)
190 {
191     TCGv_ptr a0, a1, a2;
192     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
193 
194     a0 = tcg_temp_ebb_new_ptr();
195     a1 = tcg_temp_ebb_new_ptr();
196     a2 = tcg_temp_ebb_new_ptr();
197 
198     tcg_gen_addi_ptr(a0, dbase, dofs);
199     tcg_gen_addi_ptr(a1, abase, aofs);
200     tcg_gen_addi_ptr(a2, bbase, bofs);
201 
202     fn(a0, a1, a2, desc);
203 
204     tcg_temp_free_ptr(a0);
205     tcg_temp_free_ptr(a1);
206     tcg_temp_free_ptr(a2);
207 }
208 
209 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
210                         uint32_t oprsz, uint32_t maxsz, int32_t data,
211                         gen_helper_gvec_3 *fn)
212 {
213     expand_3_ool(tcg_env, dofs, tcg_env, aofs, tcg_env, bofs,
214                  oprsz, maxsz, data, fn);
215 }
216 
217 /* Generate a call to a gvec-style helper with four vector operands.  */
218 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
219                         uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
220                         int32_t data, gen_helper_gvec_4 *fn)
221 {
222     TCGv_ptr a0, a1, a2, a3;
223     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
224 
225     a0 = tcg_temp_ebb_new_ptr();
226     a1 = tcg_temp_ebb_new_ptr();
227     a2 = tcg_temp_ebb_new_ptr();
228     a3 = tcg_temp_ebb_new_ptr();
229 
230     tcg_gen_addi_ptr(a0, tcg_env, dofs);
231     tcg_gen_addi_ptr(a1, tcg_env, aofs);
232     tcg_gen_addi_ptr(a2, tcg_env, bofs);
233     tcg_gen_addi_ptr(a3, tcg_env, cofs);
234 
235     fn(a0, a1, a2, a3, desc);
236 
237     tcg_temp_free_ptr(a0);
238     tcg_temp_free_ptr(a1);
239     tcg_temp_free_ptr(a2);
240     tcg_temp_free_ptr(a3);
241 }
242 
243 /* Generate a call to a gvec-style helper with five vector operands.  */
244 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
245                         uint32_t cofs, uint32_t xofs, uint32_t oprsz,
246                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
247 {
248     TCGv_ptr a0, a1, a2, a3, a4;
249     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
250 
251     a0 = tcg_temp_ebb_new_ptr();
252     a1 = tcg_temp_ebb_new_ptr();
253     a2 = tcg_temp_ebb_new_ptr();
254     a3 = tcg_temp_ebb_new_ptr();
255     a4 = tcg_temp_ebb_new_ptr();
256 
257     tcg_gen_addi_ptr(a0, tcg_env, dofs);
258     tcg_gen_addi_ptr(a1, tcg_env, aofs);
259     tcg_gen_addi_ptr(a2, tcg_env, bofs);
260     tcg_gen_addi_ptr(a3, tcg_env, cofs);
261     tcg_gen_addi_ptr(a4, tcg_env, xofs);
262 
263     fn(a0, a1, a2, a3, a4, desc);
264 
265     tcg_temp_free_ptr(a0);
266     tcg_temp_free_ptr(a1);
267     tcg_temp_free_ptr(a2);
268     tcg_temp_free_ptr(a3);
269     tcg_temp_free_ptr(a4);
270 }
271 
272 /* Generate a call to a gvec-style helper with three vector operands
273    and an extra pointer operand.  */
274 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
275                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
276                         int32_t data, gen_helper_gvec_2_ptr *fn)
277 {
278     TCGv_ptr a0, a1;
279     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
280 
281     a0 = tcg_temp_ebb_new_ptr();
282     a1 = tcg_temp_ebb_new_ptr();
283 
284     tcg_gen_addi_ptr(a0, tcg_env, dofs);
285     tcg_gen_addi_ptr(a1, tcg_env, aofs);
286 
287     fn(a0, a1, ptr, desc);
288 
289     tcg_temp_free_ptr(a0);
290     tcg_temp_free_ptr(a1);
291 }
292 
293 /* Generate a call to a gvec-style helper with three vector operands
294    and an extra pointer operand.  */
295 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
296                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
297                         int32_t data, gen_helper_gvec_3_ptr *fn)
298 {
299     TCGv_ptr a0, a1, a2;
300     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
301 
302     a0 = tcg_temp_ebb_new_ptr();
303     a1 = tcg_temp_ebb_new_ptr();
304     a2 = tcg_temp_ebb_new_ptr();
305 
306     tcg_gen_addi_ptr(a0, tcg_env, dofs);
307     tcg_gen_addi_ptr(a1, tcg_env, aofs);
308     tcg_gen_addi_ptr(a2, tcg_env, bofs);
309 
310     fn(a0, a1, a2, ptr, desc);
311 
312     tcg_temp_free_ptr(a0);
313     tcg_temp_free_ptr(a1);
314     tcg_temp_free_ptr(a2);
315 }
316 
317 /* Generate a call to a gvec-style helper with four vector operands
318    and an extra pointer operand.  */
319 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
320                         uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
321                         uint32_t maxsz, int32_t data,
322                         gen_helper_gvec_4_ptr *fn)
323 {
324     TCGv_ptr a0, a1, a2, a3;
325     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
326 
327     a0 = tcg_temp_ebb_new_ptr();
328     a1 = tcg_temp_ebb_new_ptr();
329     a2 = tcg_temp_ebb_new_ptr();
330     a3 = tcg_temp_ebb_new_ptr();
331 
332     tcg_gen_addi_ptr(a0, tcg_env, dofs);
333     tcg_gen_addi_ptr(a1, tcg_env, aofs);
334     tcg_gen_addi_ptr(a2, tcg_env, bofs);
335     tcg_gen_addi_ptr(a3, tcg_env, cofs);
336 
337     fn(a0, a1, a2, a3, ptr, desc);
338 
339     tcg_temp_free_ptr(a0);
340     tcg_temp_free_ptr(a1);
341     tcg_temp_free_ptr(a2);
342     tcg_temp_free_ptr(a3);
343 }
344 
345 /* Generate a call to a gvec-style helper with five vector operands
346    and an extra pointer operand.  */
347 void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
348                         uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
349                         uint32_t oprsz, uint32_t maxsz, int32_t data,
350                         gen_helper_gvec_5_ptr *fn)
351 {
352     TCGv_ptr a0, a1, a2, a3, a4;
353     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
354 
355     a0 = tcg_temp_ebb_new_ptr();
356     a1 = tcg_temp_ebb_new_ptr();
357     a2 = tcg_temp_ebb_new_ptr();
358     a3 = tcg_temp_ebb_new_ptr();
359     a4 = tcg_temp_ebb_new_ptr();
360 
361     tcg_gen_addi_ptr(a0, tcg_env, dofs);
362     tcg_gen_addi_ptr(a1, tcg_env, aofs);
363     tcg_gen_addi_ptr(a2, tcg_env, bofs);
364     tcg_gen_addi_ptr(a3, tcg_env, cofs);
365     tcg_gen_addi_ptr(a4, tcg_env, eofs);
366 
367     fn(a0, a1, a2, a3, a4, ptr, desc);
368 
369     tcg_temp_free_ptr(a0);
370     tcg_temp_free_ptr(a1);
371     tcg_temp_free_ptr(a2);
372     tcg_temp_free_ptr(a3);
373     tcg_temp_free_ptr(a4);
374 }
375 
376 /* Return true if we want to implement something of OPRSZ bytes
377    in units of LNSZ.  This limits the expansion of inline code.  */
378 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
379 {
380     uint32_t q, r;
381 
382     if (oprsz < lnsz) {
383         return false;
384     }
385 
386     q = oprsz / lnsz;
387     r = oprsz % lnsz;
388     tcg_debug_assert((r & 7) == 0);
389 
390     if (lnsz < 16) {
391         /* For sizes below 16, accept no remainder. */
392         if (r != 0) {
393             return false;
394         }
395     } else {
396         /*
397          * Recall that ARM SVE allows vector sizes that are not a
398          * power of 2, but always a multiple of 16.  The intent is
399          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
400          * In addition, expand_clr needs to handle a multiple of 8.
401          * Thus we can handle the tail with one more operation per
402          * diminishing power of 2.
403          */
404         q += ctpop32(r);
405     }
406 
407     return q <= MAX_UNROLL;
408 }
409 
/* Forward declaration: do_dup_store() and do_dup() call expand_clr()
   ahead of its definition further down.  */
static void expand_clr(TCGv_ptr dbase, uint32_t dofs, uint32_t maxsz);
411 
412 /* Duplicate C as per VECE.  */
413 uint64_t (dup_const)(unsigned vece, uint64_t c)
414 {
415     switch (vece) {
416     case MO_8:
417         return 0x0101010101010101ull * (uint8_t)c;
418     case MO_16:
419         return 0x0001000100010001ull * (uint16_t)c;
420     case MO_32:
421         return 0x0000000100000001ull * (uint32_t)c;
422     case MO_64:
423         return c;
424     default:
425         g_assert_not_reached();
426     }
427 }
428 
429 /* Duplicate IN into OUT as per VECE.  */
430 void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
431 {
432     switch (vece) {
433     case MO_8:
434         tcg_gen_ext8u_i32(out, in);
435         tcg_gen_muli_i32(out, out, 0x01010101);
436         break;
437     case MO_16:
438         tcg_gen_deposit_i32(out, in, in, 16, 16);
439         break;
440     case MO_32:
441         tcg_gen_mov_i32(out, in);
442         break;
443     default:
444         g_assert_not_reached();
445     }
446 }
447 
448 void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
449 {
450     switch (vece) {
451     case MO_8:
452         tcg_gen_ext8u_i64(out, in);
453         tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
454         break;
455     case MO_16:
456         tcg_gen_ext16u_i64(out, in);
457         tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
458         break;
459     case MO_32:
460         tcg_gen_deposit_i64(out, in, in, 32, 32);
461         break;
462     case MO_64:
463         tcg_gen_mov_i64(out, in);
464         break;
465     default:
466         g_assert_not_reached();
467     }
468 }
469 
470 /* Select a supported vector type for implementing an operation on SIZE
471  * bytes.  If OP is 0, assume that the real operation to be performed is
472  * required by all backends.  Otherwise, make sure than OP can be performed
473  * on elements of size VECE in the selected type.  Do not select V64 if
474  * PREFER_I64 is true.  Return 0 if no vector type is selected.
475  */
476 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
477                                   uint32_t size, bool prefer_i64)
478 {
479     /*
480      * Recall that ARM SVE allows vector sizes that are not a
481      * power of 2, but always a multiple of 16.  The intent is
482      * that e.g. size == 80 would be expanded with 2x32 + 1x16.
483      * It is hard to imagine a case in which v256 is supported
484      * but v128 is not, but check anyway.
485      * In addition, expand_clr needs to handle a multiple of 8.
486      */
487     if (TCG_TARGET_HAS_v256 &&
488         check_size_impl(size, 32) &&
489         tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
490         (!(size & 16) ||
491          (TCG_TARGET_HAS_v128 &&
492           tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
493         (!(size & 8) ||
494          (TCG_TARGET_HAS_v64 &&
495           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
496         return TCG_TYPE_V256;
497     }
498     if (TCG_TARGET_HAS_v128 &&
499         check_size_impl(size, 16) &&
500         tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
501         (!(size & 8) ||
502          (TCG_TARGET_HAS_v64 &&
503           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
504         return TCG_TYPE_V128;
505     }
506     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
507         && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
508         return TCG_TYPE_V64;
509     }
510     return 0;
511 }
512 
513 static void do_dup_store(TCGType type, TCGv_ptr dbase, uint32_t dofs,
514                          uint32_t oprsz, uint32_t maxsz, TCGv_vec t_vec)
515 {
516     uint32_t i = 0;
517 
518     tcg_debug_assert(oprsz >= 8);
519 
520     /*
521      * This may be expand_clr for the tail of an operation, e.g.
522      * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
523      * are misaligned wrt the maximum vector size, so do that first.
524      */
525     if (dofs & 8) {
526         tcg_gen_stl_vec(t_vec, dbase, dofs + i, TCG_TYPE_V64);
527         i += 8;
528     }
529 
530     switch (type) {
531     case TCG_TYPE_V256:
532         /*
533          * Recall that ARM SVE allows vector sizes that are not a
534          * power of 2, but always a multiple of 16.  The intent is
535          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
536          */
537         for (; i + 32 <= oprsz; i += 32) {
538             tcg_gen_stl_vec(t_vec, dbase, dofs + i, TCG_TYPE_V256);
539         }
540         /* fallthru */
541     case TCG_TYPE_V128:
542         for (; i + 16 <= oprsz; i += 16) {
543             tcg_gen_stl_vec(t_vec, dbase, dofs + i, TCG_TYPE_V128);
544         }
545         break;
546     case TCG_TYPE_V64:
547         for (; i < oprsz; i += 8) {
548             tcg_gen_stl_vec(t_vec, dbase, dofs + i, TCG_TYPE_V64);
549         }
550         break;
551     default:
552         g_assert_not_reached();
553     }
554 
555     if (oprsz < maxsz) {
556         expand_clr(dbase, dofs + oprsz, maxsz - oprsz);
557     }
558 }
559 
560 /*
561  * Set OPRSZ bytes at DBASE + DOFS to replications of IN_32, IN_64 or IN_C.
562  * Only one of IN_32 or IN_64 may be set;
563  * IN_C is used if IN_32 and IN_64 are unset.
564  */
565 static void do_dup(unsigned vece, TCGv_ptr dbase, uint32_t dofs,
566                    uint32_t oprsz, uint32_t maxsz,
567                    TCGv_i32 in_32, TCGv_i64 in_64, uint64_t in_c)
568 {
569     TCGType type;
570     TCGv_i64 t_64;
571     TCGv_i32 t_32, t_desc;
572     TCGv_ptr t_ptr;
573     uint32_t i;
574 
575     assert(vece <= (in_32 ? MO_32 : MO_64));
576     assert(in_32 == NULL || in_64 == NULL);
577 
578     /* If we're storing 0, expand oprsz to maxsz.  */
579     if (in_32 == NULL && in_64 == NULL) {
580         in_c = dup_const(vece, in_c);
581         if (in_c == 0) {
582             oprsz = maxsz;
583             vece = MO_8;
584         } else if (in_c == dup_const(MO_8, in_c)) {
585             vece = MO_8;
586         }
587     }
588 
589     /* Implement inline with a vector type, if possible.
590      * Prefer integer when 64-bit host and no variable dup.
591      */
592     type = choose_vector_type(NULL, vece, oprsz,
593                               (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
594                                && (in_64 == NULL || vece == MO_64)));
595     if (type != 0) {
596         TCGv_vec t_vec = tcg_temp_new_vec(type);
597 
598         if (in_32) {
599             tcg_gen_dup_i32_vec(vece, t_vec, in_32);
600         } else if (in_64) {
601             tcg_gen_dup_i64_vec(vece, t_vec, in_64);
602         } else {
603             tcg_gen_dupi_vec(vece, t_vec, in_c);
604         }
605         do_dup_store(type, dbase, dofs, oprsz, maxsz, t_vec);
606         return;
607     }
608 
609     /* Otherwise, inline with an integer type, unless "large".  */
610     if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
611         t_64 = NULL;
612         t_32 = NULL;
613 
614         if (in_32) {
615             /* We are given a 32-bit variable input.  For a 64-bit host,
616                use a 64-bit operation unless the 32-bit operation would
617                be simple enough.  */
618             if (TCG_TARGET_REG_BITS == 64
619                 && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
620                 t_64 = tcg_temp_ebb_new_i64();
621                 tcg_gen_extu_i32_i64(t_64, in_32);
622                 tcg_gen_dup_i64(vece, t_64, t_64);
623             } else {
624                 t_32 = tcg_temp_ebb_new_i32();
625                 tcg_gen_dup_i32(vece, t_32, in_32);
626             }
627         } else if (in_64) {
628             /* We are given a 64-bit variable input.  */
629             t_64 = tcg_temp_ebb_new_i64();
630             tcg_gen_dup_i64(vece, t_64, in_64);
631         } else {
632             /* We are given a constant input.  */
633             /* For 64-bit hosts, use 64-bit constants for "simple" constants
634                or when we'd need too many 32-bit stores, or when a 64-bit
635                constant is really required.  */
636             if (vece == MO_64
637                 || (TCG_TARGET_REG_BITS == 64
638                     && (in_c == 0 || in_c == -1
639                         || !check_size_impl(oprsz, 4)))) {
640                 t_64 = tcg_constant_i64(in_c);
641             } else {
642                 t_32 = tcg_constant_i32(in_c);
643             }
644         }
645 
646         /* Implement inline if we picked an implementation size above.  */
647         if (t_32) {
648             for (i = 0; i < oprsz; i += 4) {
649                 tcg_gen_st_i32(t_32, dbase, dofs + i);
650             }
651             tcg_temp_free_i32(t_32);
652             goto done;
653         }
654         if (t_64) {
655             for (i = 0; i < oprsz; i += 8) {
656                 tcg_gen_st_i64(t_64, dbase, dofs + i);
657             }
658             tcg_temp_free_i64(t_64);
659             goto done;
660         }
661     }
662 
663     /* Otherwise implement out of line.  */
664     t_ptr = tcg_temp_ebb_new_ptr();
665     tcg_gen_addi_ptr(t_ptr, dbase, dofs);
666 
667     /*
668      * This may be expand_clr for the tail of an operation, e.g.
669      * oprsz == 8 && maxsz == 64.  The size of the clear is misaligned
670      * wrt simd_desc and will assert.  Simply pass all replicated byte
671      * stores through to memset.
672      */
673     if (oprsz == maxsz && vece == MO_8) {
674         TCGv_ptr t_size = tcg_constant_ptr(oprsz);
675         TCGv_i32 t_val;
676 
677         if (in_32) {
678             t_val = in_32;
679         } else if (in_64) {
680             t_val = tcg_temp_ebb_new_i32();
681             tcg_gen_extrl_i64_i32(t_val, in_64);
682         } else {
683             t_val = tcg_constant_i32(in_c);
684         }
685         gen_helper_memset(t_ptr, t_ptr, t_val, t_size);
686 
687         if (in_64) {
688             tcg_temp_free_i32(t_val);
689         }
690         tcg_temp_free_ptr(t_ptr);
691         return;
692     }
693 
694     t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0));
695 
696     if (vece == MO_64) {
697         if (in_64) {
698             gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
699         } else {
700             t_64 = tcg_constant_i64(in_c);
701             gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
702         }
703     } else {
704         typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
705         static dup_fn * const fns[3] = {
706             gen_helper_gvec_dup8,
707             gen_helper_gvec_dup16,
708             gen_helper_gvec_dup32
709         };
710 
711         if (in_32) {
712             fns[vece](t_ptr, t_desc, in_32);
713         } else if (in_64) {
714             t_32 = tcg_temp_ebb_new_i32();
715             tcg_gen_extrl_i64_i32(t_32, in_64);
716             fns[vece](t_ptr, t_desc, t_32);
717             tcg_temp_free_i32(t_32);
718         } else {
719             if (vece == MO_8) {
720                 in_c &= 0xff;
721             } else if (vece == MO_16) {
722                 in_c &= 0xffff;
723             }
724             t_32 = tcg_constant_i32(in_c);
725             fns[vece](t_ptr, t_desc, t_32);
726         }
727     }
728 
729     tcg_temp_free_ptr(t_ptr);
730     return;
731 
732  done:
733     if (oprsz < maxsz) {
734         expand_clr(dbase, dofs + oprsz, maxsz - oprsz);
735     }
736 }
737 
738 /* Likewise, but with zero.  */
739 static void expand_clr(TCGv_ptr dbase, uint32_t dofs, uint32_t maxsz)
740 {
741     do_dup(MO_8, dbase, dofs, maxsz, maxsz, NULL, NULL, 0);
742 }
743 
744 /* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
745 static void expand_2_i32(TCGv_ptr dbase, uint32_t dofs, TCGv_ptr abase,
746                          uint32_t aofs, uint32_t oprsz, bool load_dest,
747                          void (*fni)(TCGv_i32, TCGv_i32))
748 {
749     TCGv_i32 t0 = tcg_temp_new_i32();
750     TCGv_i32 t1 = tcg_temp_new_i32();
751     uint32_t i;
752 
753     for (i = 0; i < oprsz; i += 4) {
754         tcg_gen_ld_i32(t0, abase, aofs + i);
755         if (load_dest) {
756             tcg_gen_ld_i32(t1, dbase, dofs + i);
757         }
758         fni(t1, t0);
759         tcg_gen_st_i32(t1, dbase, dofs + i);
760     }
761     tcg_temp_free_i32(t0);
762     tcg_temp_free_i32(t1);
763 }
764 
765 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
766                           int32_t c, bool load_dest,
767                           void (*fni)(TCGv_i32, TCGv_i32, int32_t))
768 {
769     TCGv_i32 t0 = tcg_temp_new_i32();
770     TCGv_i32 t1 = tcg_temp_new_i32();
771     uint32_t i;
772 
773     for (i = 0; i < oprsz; i += 4) {
774         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
775         if (load_dest) {
776             tcg_gen_ld_i32(t1, tcg_env, dofs + i);
777         }
778         fni(t1, t0, c);
779         tcg_gen_st_i32(t1, tcg_env, dofs + i);
780     }
781     tcg_temp_free_i32(t0);
782     tcg_temp_free_i32(t1);
783 }
784 
785 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
786                           TCGv_i32 c, bool scalar_first,
787                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
788 {
789     TCGv_i32 t0 = tcg_temp_new_i32();
790     TCGv_i32 t1 = tcg_temp_new_i32();
791     uint32_t i;
792 
793     for (i = 0; i < oprsz; i += 4) {
794         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
795         if (scalar_first) {
796             fni(t1, c, t0);
797         } else {
798             fni(t1, t0, c);
799         }
800         tcg_gen_st_i32(t1, tcg_env, dofs + i);
801     }
802     tcg_temp_free_i32(t0);
803     tcg_temp_free_i32(t1);
804 }
805 
806 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
807 static void expand_3_i32(TCGv_ptr dbase, uint32_t dofs,
808                          TCGv_ptr abase, uint32_t aofs,
809                          TCGv_ptr bbase, uint32_t bofs,
810                          uint32_t oprsz, bool load_dest,
811                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
812 {
813     TCGv_i32 t0 = tcg_temp_new_i32();
814     TCGv_i32 t1 = tcg_temp_new_i32();
815     TCGv_i32 t2 = tcg_temp_new_i32();
816     uint32_t i;
817 
818     for (i = 0; i < oprsz; i += 4) {
819         tcg_gen_ld_i32(t0, abase, aofs + i);
820         tcg_gen_ld_i32(t1, bbase, bofs + i);
821         if (load_dest) {
822             tcg_gen_ld_i32(t2, dbase, dofs + i);
823         }
824         fni(t2, t0, t1);
825         tcg_gen_st_i32(t2, dbase, dofs + i);
826     }
827     tcg_temp_free_i32(t2);
828     tcg_temp_free_i32(t1);
829     tcg_temp_free_i32(t0);
830 }
831 
832 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
833                           uint32_t oprsz, int32_t c,
834                           bool load_dest, bool write_aofs,
835                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
836 {
837     TCGv_i32 t0 = tcg_temp_new_i32();
838     TCGv_i32 t1 = tcg_temp_new_i32();
839     TCGv_i32 t2 = tcg_temp_new_i32();
840     uint32_t i;
841 
842     for (i = 0; i < oprsz; i += 4) {
843         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
844         tcg_gen_ld_i32(t1, tcg_env, bofs + i);
845         if (load_dest) {
846             tcg_gen_ld_i32(t2, tcg_env, dofs + i);
847         }
848         fni(t2, t0, t1, c);
849         tcg_gen_st_i32(t2, tcg_env, dofs + i);
850         if (write_aofs) {
851             tcg_gen_st_i32(t0, tcg_env, aofs + i);
852         }
853     }
854     tcg_temp_free_i32(t0);
855     tcg_temp_free_i32(t1);
856     tcg_temp_free_i32(t2);
857 }
858 
859 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
860 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
861                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
862                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
863 {
864     TCGv_i32 t0 = tcg_temp_new_i32();
865     TCGv_i32 t1 = tcg_temp_new_i32();
866     TCGv_i32 t2 = tcg_temp_new_i32();
867     TCGv_i32 t3 = tcg_temp_new_i32();
868     uint32_t i;
869 
870     for (i = 0; i < oprsz; i += 4) {
871         tcg_gen_ld_i32(t1, tcg_env, aofs + i);
872         tcg_gen_ld_i32(t2, tcg_env, bofs + i);
873         tcg_gen_ld_i32(t3, tcg_env, cofs + i);
874         fni(t0, t1, t2, t3);
875         tcg_gen_st_i32(t0, tcg_env, dofs + i);
876         if (write_aofs) {
877             tcg_gen_st_i32(t1, tcg_env, aofs + i);
878         }
879     }
880     tcg_temp_free_i32(t3);
881     tcg_temp_free_i32(t2);
882     tcg_temp_free_i32(t1);
883     tcg_temp_free_i32(t0);
884 }
885 
886 static void expand_4i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
887                           uint32_t cofs, uint32_t oprsz, int32_t c,
888                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32,
889                                       int32_t))
890 {
891     TCGv_i32 t0 = tcg_temp_new_i32();
892     TCGv_i32 t1 = tcg_temp_new_i32();
893     TCGv_i32 t2 = tcg_temp_new_i32();
894     TCGv_i32 t3 = tcg_temp_new_i32();
895     uint32_t i;
896 
897     for (i = 0; i < oprsz; i += 4) {
898         tcg_gen_ld_i32(t1, tcg_env, aofs + i);
899         tcg_gen_ld_i32(t2, tcg_env, bofs + i);
900         tcg_gen_ld_i32(t3, tcg_env, cofs + i);
901         fni(t0, t1, t2, t3, c);
902         tcg_gen_st_i32(t0, tcg_env, dofs + i);
903     }
904     tcg_temp_free_i32(t3);
905     tcg_temp_free_i32(t2);
906     tcg_temp_free_i32(t1);
907     tcg_temp_free_i32(t0);
908 }
909 
910 /* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
911 static void expand_2_i64(TCGv_ptr dbase, uint32_t dofs, TCGv_ptr abase,
912                          uint32_t aofs, uint32_t oprsz, bool load_dest,
913                          void (*fni)(TCGv_i64, TCGv_i64))
914 {
915     TCGv_i64 t0 = tcg_temp_new_i64();
916     TCGv_i64 t1 = tcg_temp_new_i64();
917     uint32_t i;
918 
919     for (i = 0; i < oprsz; i += 8) {
920         tcg_gen_ld_i64(t0, abase, aofs + i);
921         if (load_dest) {
922             tcg_gen_ld_i64(t1, dbase, dofs + i);
923         }
924         fni(t1, t0);
925         tcg_gen_st_i64(t1, dbase, dofs + i);
926     }
927     tcg_temp_free_i64(t0);
928     tcg_temp_free_i64(t1);
929 }
930 
931 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
932                           int64_t c, bool load_dest,
933                           void (*fni)(TCGv_i64, TCGv_i64, int64_t))
934 {
935     TCGv_i64 t0 = tcg_temp_new_i64();
936     TCGv_i64 t1 = tcg_temp_new_i64();
937     uint32_t i;
938 
939     for (i = 0; i < oprsz; i += 8) {
940         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
941         if (load_dest) {
942             tcg_gen_ld_i64(t1, tcg_env, dofs + i);
943         }
944         fni(t1, t0, c);
945         tcg_gen_st_i64(t1, tcg_env, dofs + i);
946     }
947     tcg_temp_free_i64(t0);
948     tcg_temp_free_i64(t1);
949 }
950 
951 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
952                           TCGv_i64 c, bool scalar_first,
953                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
954 {
955     TCGv_i64 t0 = tcg_temp_new_i64();
956     TCGv_i64 t1 = tcg_temp_new_i64();
957     uint32_t i;
958 
959     for (i = 0; i < oprsz; i += 8) {
960         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
961         if (scalar_first) {
962             fni(t1, c, t0);
963         } else {
964             fni(t1, t0, c);
965         }
966         tcg_gen_st_i64(t1, tcg_env, dofs + i);
967     }
968     tcg_temp_free_i64(t0);
969     tcg_temp_free_i64(t1);
970 }
971 
972 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
973 static void expand_3_i64(TCGv_ptr dbase, uint32_t dofs,
974                          TCGv_ptr abase, uint32_t aofs,
975                          TCGv_ptr bbase, uint32_t bofs,
976                          uint32_t oprsz, bool load_dest,
977                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
978 {
979     TCGv_i64 t0 = tcg_temp_new_i64();
980     TCGv_i64 t1 = tcg_temp_new_i64();
981     TCGv_i64 t2 = tcg_temp_new_i64();
982     uint32_t i;
983 
984     for (i = 0; i < oprsz; i += 8) {
985         tcg_gen_ld_i64(t0, abase, aofs + i);
986         tcg_gen_ld_i64(t1, bbase, bofs + i);
987         if (load_dest) {
988             tcg_gen_ld_i64(t2, dbase, dofs + i);
989         }
990         fni(t2, t0, t1);
991         tcg_gen_st_i64(t2, dbase, dofs + i);
992     }
993     tcg_temp_free_i64(t2);
994     tcg_temp_free_i64(t1);
995     tcg_temp_free_i64(t0);
996 }
997 
998 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
999                           uint32_t oprsz, int64_t c,
1000                           bool load_dest, bool write_aofs,
1001                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
1002 {
1003     TCGv_i64 t0 = tcg_temp_new_i64();
1004     TCGv_i64 t1 = tcg_temp_new_i64();
1005     TCGv_i64 t2 = tcg_temp_new_i64();
1006     uint32_t i;
1007 
1008     for (i = 0; i < oprsz; i += 8) {
1009         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
1010         tcg_gen_ld_i64(t1, tcg_env, bofs + i);
1011         if (load_dest) {
1012             tcg_gen_ld_i64(t2, tcg_env, dofs + i);
1013         }
1014         fni(t2, t0, t1, c);
1015         tcg_gen_st_i64(t2, tcg_env, dofs + i);
1016         if (write_aofs) {
1017             tcg_gen_st_i64(t0, tcg_env, aofs + i);
1018         }
1019     }
1020     tcg_temp_free_i64(t0);
1021     tcg_temp_free_i64(t1);
1022     tcg_temp_free_i64(t2);
1023 }
1024 
1025 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
1026 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1027                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
1028                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
1029 {
1030     TCGv_i64 t0 = tcg_temp_new_i64();
1031     TCGv_i64 t1 = tcg_temp_new_i64();
1032     TCGv_i64 t2 = tcg_temp_new_i64();
1033     TCGv_i64 t3 = tcg_temp_new_i64();
1034     uint32_t i;
1035 
1036     for (i = 0; i < oprsz; i += 8) {
1037         tcg_gen_ld_i64(t1, tcg_env, aofs + i);
1038         tcg_gen_ld_i64(t2, tcg_env, bofs + i);
1039         tcg_gen_ld_i64(t3, tcg_env, cofs + i);
1040         fni(t0, t1, t2, t3);
1041         tcg_gen_st_i64(t0, tcg_env, dofs + i);
1042         if (write_aofs) {
1043             tcg_gen_st_i64(t1, tcg_env, aofs + i);
1044         }
1045     }
1046     tcg_temp_free_i64(t3);
1047     tcg_temp_free_i64(t2);
1048     tcg_temp_free_i64(t1);
1049     tcg_temp_free_i64(t0);
1050 }
1051 
1052 static void expand_4i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1053                           uint32_t cofs, uint32_t oprsz, int64_t c,
1054                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64,
1055                                       int64_t))
1056 {
1057     TCGv_i64 t0 = tcg_temp_new_i64();
1058     TCGv_i64 t1 = tcg_temp_new_i64();
1059     TCGv_i64 t2 = tcg_temp_new_i64();
1060     TCGv_i64 t3 = tcg_temp_new_i64();
1061     uint32_t i;
1062 
1063     for (i = 0; i < oprsz; i += 8) {
1064         tcg_gen_ld_i64(t1, tcg_env, aofs + i);
1065         tcg_gen_ld_i64(t2, tcg_env, bofs + i);
1066         tcg_gen_ld_i64(t3, tcg_env, cofs + i);
1067         fni(t0, t1, t2, t3, c);
1068         tcg_gen_st_i64(t0, tcg_env, dofs + i);
1069     }
1070     tcg_temp_free_i64(t3);
1071     tcg_temp_free_i64(t2);
1072     tcg_temp_free_i64(t1);
1073     tcg_temp_free_i64(t0);
1074 }
1075 
1076 /* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
1077 static void expand_2_vec(unsigned vece, TCGv_ptr dbase, uint32_t dofs,
1078                          TCGv_ptr abase, uint32_t aofs,
1079                          uint32_t oprsz, uint32_t tysz, TCGType type,
1080                          bool load_dest,
1081                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
1082 {
1083     for (uint32_t i = 0; i < oprsz; i += tysz) {
1084         TCGv_vec t0 = tcg_temp_new_vec(type);
1085         TCGv_vec t1 = tcg_temp_new_vec(type);
1086 
1087         tcg_gen_ld_vec(t0, abase, aofs + i);
1088         if (load_dest) {
1089             tcg_gen_ld_vec(t1, dbase, dofs + i);
1090         }
1091         fni(vece, t1, t0);
1092         tcg_gen_st_vec(t1, dbase, dofs + i);
1093     }
1094 }
1095 
1096 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
1097    using host vectors.  */
1098 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1099                           uint32_t oprsz, uint32_t tysz, TCGType type,
1100                           int64_t c, bool load_dest,
1101                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
1102 {
1103     for (uint32_t i = 0; i < oprsz; i += tysz) {
1104         TCGv_vec t0 = tcg_temp_new_vec(type);
1105         TCGv_vec t1 = tcg_temp_new_vec(type);
1106 
1107         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1108         if (load_dest) {
1109             tcg_gen_ld_vec(t1, tcg_env, dofs + i);
1110         }
1111         fni(vece, t1, t0, c);
1112         tcg_gen_st_vec(t1, tcg_env, dofs + i);
1113     }
1114 }
1115 
1116 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1117                           uint32_t oprsz, uint32_t tysz, TCGType type,
1118                           TCGv_vec c, bool scalar_first,
1119                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1120 {
1121     for (uint32_t i = 0; i < oprsz; i += tysz) {
1122         TCGv_vec t0 = tcg_temp_new_vec(type);
1123         TCGv_vec t1 = tcg_temp_new_vec(type);
1124 
1125         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1126         if (scalar_first) {
1127             fni(vece, t1, c, t0);
1128         } else {
1129             fni(vece, t1, t0, c);
1130         }
1131         tcg_gen_st_vec(t1, tcg_env, dofs + i);
1132     }
1133 }
1134 
1135 /* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
1136 static void expand_3_vec(unsigned vece, TCGv_ptr dbase, uint32_t dofs,
1137                          TCGv_ptr abase, uint32_t aofs,
1138                          TCGv_ptr bbase, uint32_t bofs, uint32_t oprsz,
1139                          uint32_t tysz, TCGType type, bool load_dest,
1140                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1141 {
1142     for (uint32_t i = 0; i < oprsz; i += tysz) {
1143         TCGv_vec t0 = tcg_temp_new_vec(type);
1144         TCGv_vec t1 = tcg_temp_new_vec(type);
1145         TCGv_vec t2 = tcg_temp_new_vec(type);
1146 
1147         tcg_gen_ld_vec(t0, abase, aofs + i);
1148         tcg_gen_ld_vec(t1, bbase, bofs + i);
1149         if (load_dest) {
1150             tcg_gen_ld_vec(t2, dbase, dofs + i);
1151         }
1152         fni(vece, t2, t0, t1);
1153         tcg_gen_st_vec(t2, dbase, dofs + i);
1154     }
1155 }
1156 
1157 /*
1158  * Expand OPSZ bytes worth of three-vector operands and an immediate operand
1159  * using host vectors.
1160  */
1161 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1162                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
1163                           TCGType type, int64_t c,
1164                           bool load_dest, bool write_aofs,
1165                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
1166                                       int64_t))
1167 {
1168     for (uint32_t i = 0; i < oprsz; i += tysz) {
1169         TCGv_vec t0 = tcg_temp_new_vec(type);
1170         TCGv_vec t1 = tcg_temp_new_vec(type);
1171         TCGv_vec t2 = tcg_temp_new_vec(type);
1172 
1173         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1174         tcg_gen_ld_vec(t1, tcg_env, bofs + i);
1175         if (load_dest) {
1176             tcg_gen_ld_vec(t2, tcg_env, dofs + i);
1177         }
1178         fni(vece, t2, t0, t1, c);
1179         tcg_gen_st_vec(t2, tcg_env, dofs + i);
1180         if (write_aofs) {
1181             tcg_gen_st_vec(t0, tcg_env, aofs + i);
1182         }
1183     }
1184 }
1185 
1186 /* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
1187 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1188                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1189                          uint32_t tysz, TCGType type, bool write_aofs,
1190                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1191                                      TCGv_vec, TCGv_vec))
1192 {
1193     for (uint32_t i = 0; i < oprsz; i += tysz) {
1194         TCGv_vec t0 = tcg_temp_new_vec(type);
1195         TCGv_vec t1 = tcg_temp_new_vec(type);
1196         TCGv_vec t2 = tcg_temp_new_vec(type);
1197         TCGv_vec t3 = tcg_temp_new_vec(type);
1198 
1199         tcg_gen_ld_vec(t1, tcg_env, aofs + i);
1200         tcg_gen_ld_vec(t2, tcg_env, bofs + i);
1201         tcg_gen_ld_vec(t3, tcg_env, cofs + i);
1202         fni(vece, t0, t1, t2, t3);
1203         tcg_gen_st_vec(t0, tcg_env, dofs + i);
1204         if (write_aofs) {
1205             tcg_gen_st_vec(t1, tcg_env, aofs + i);
1206         }
1207     }
1208 }
1209 
1210 /*
1211  * Expand OPSZ bytes worth of four-vector operands and an immediate operand
1212  * using host vectors.
1213  */
1214 static void expand_4i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1215                           uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1216                           uint32_t tysz, TCGType type, int64_t c,
1217                           void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1218                                      TCGv_vec, TCGv_vec, int64_t))
1219 {
1220     for (uint32_t i = 0; i < oprsz; i += tysz) {
1221         TCGv_vec t0 = tcg_temp_new_vec(type);
1222         TCGv_vec t1 = tcg_temp_new_vec(type);
1223         TCGv_vec t2 = tcg_temp_new_vec(type);
1224         TCGv_vec t3 = tcg_temp_new_vec(type);
1225 
1226         tcg_gen_ld_vec(t1, tcg_env, aofs + i);
1227         tcg_gen_ld_vec(t2, tcg_env, bofs + i);
1228         tcg_gen_ld_vec(t3, tcg_env, cofs + i);
1229         fni(vece, t0, t1, t2, t3, c);
1230         tcg_gen_st_vec(t0, tcg_env, dofs + i);
1231     }
1232 }
1233 
1234 /* Expand a vector two-operand operation.  */
1235 void tcg_gen_gvec_2_var(TCGv_ptr dbase, uint32_t dofs,
1236                         TCGv_ptr abase, uint32_t aofs,
1237                         uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
1238 {
1239     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1240     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1241     TCGType type;
1242     uint32_t some;
1243 
1244     check_size_align(oprsz, maxsz, dofs | aofs);
1245     check_overlap_2(dbase, dofs, abase, aofs, maxsz);
1246 
1247     type = 0;
1248     if (g->fniv) {
1249         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1250     }
1251     switch (type) {
1252     case TCG_TYPE_V256:
1253         /* Recall that ARM SVE allows vector sizes that are not a
1254          * power of 2, but always a multiple of 16.  The intent is
1255          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1256          */
1257         some = QEMU_ALIGN_DOWN(oprsz, 32);
1258         expand_2_vec(g->vece, dbase, dofs, abase, aofs, some, 32,
1259                      TCG_TYPE_V256, g->load_dest, g->fniv);
1260         if (some == oprsz) {
1261             break;
1262         }
1263         dofs += some;
1264         aofs += some;
1265         oprsz -= some;
1266         maxsz -= some;
1267         /* fallthru */
1268     case TCG_TYPE_V128:
1269         expand_2_vec(g->vece, dbase, dofs, abase, aofs, oprsz, 16,
1270                      TCG_TYPE_V128, g->load_dest, g->fniv);
1271         break;
1272     case TCG_TYPE_V64:
1273         expand_2_vec(g->vece, dbase, dofs, abase, aofs, oprsz, 8,
1274                      TCG_TYPE_V64, g->load_dest, g->fniv);
1275         break;
1276 
1277     case 0:
1278         if (g->fni8 && check_size_impl(oprsz, 8)) {
1279             expand_2_i64(dbase, dofs, abase, aofs,
1280                          oprsz, g->load_dest, g->fni8);
1281         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1282             expand_2_i32(dbase, dofs, abase, aofs,
1283                          oprsz, g->load_dest, g->fni4);
1284         } else {
1285             assert(g->fno != NULL);
1286             expand_2_ool(dbase, dofs, abase, aofs,
1287                          oprsz, maxsz, g->data, g->fno);
1288             oprsz = maxsz;
1289         }
1290         break;
1291 
1292     default:
1293         g_assert_not_reached();
1294     }
1295     tcg_swap_vecop_list(hold_list);
1296 
1297     if (oprsz < maxsz) {
1298         expand_clr(dbase, dofs + oprsz, maxsz - oprsz);
1299     }
1300 }
1301 
1302 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
1303                     uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
1304 {
1305     tcg_gen_gvec_2_var(tcg_env, dofs, tcg_env, aofs, oprsz, maxsz, g);
1306 }
1307 
1308 /* Expand a vector operation with two vectors and an immediate.  */
1309 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1310                      uint32_t maxsz, int64_t c, const GVecGen2i *g)
1311 {
1312     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1313     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1314     TCGType type;
1315     uint32_t some;
1316 
1317     check_size_align(oprsz, maxsz, dofs | aofs);
1318     check_overlap_2(tcg_env, dofs, tcg_env, aofs, maxsz);
1319 
1320     type = 0;
1321     if (g->fniv) {
1322         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1323     }
1324     switch (type) {
1325     case TCG_TYPE_V256:
1326         /* Recall that ARM SVE allows vector sizes that are not a
1327          * power of 2, but always a multiple of 16.  The intent is
1328          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1329          */
1330         some = QEMU_ALIGN_DOWN(oprsz, 32);
1331         expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1332                       c, g->load_dest, g->fniv);
1333         if (some == oprsz) {
1334             break;
1335         }
1336         dofs += some;
1337         aofs += some;
1338         oprsz -= some;
1339         maxsz -= some;
1340         /* fallthru */
1341     case TCG_TYPE_V128:
1342         expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1343                       c, g->load_dest, g->fniv);
1344         break;
1345     case TCG_TYPE_V64:
1346         expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1347                       c, g->load_dest, g->fniv);
1348         break;
1349 
1350     case 0:
1351         if (g->fni8 && check_size_impl(oprsz, 8)) {
1352             expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1353         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1354             expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1355         } else {
1356             if (g->fno) {
1357                 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1358             } else {
1359                 TCGv_i64 tcg_c = tcg_constant_i64(c);
1360                 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1361                                     maxsz, c, g->fnoi);
1362             }
1363             oprsz = maxsz;
1364         }
1365         break;
1366 
1367     default:
1368         g_assert_not_reached();
1369     }
1370     tcg_swap_vecop_list(hold_list);
1371 
1372     if (oprsz < maxsz) {
1373         expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
1374     }
1375 }
1376 
1377 /* Expand a vector operation with two vectors and a scalar.  */
1378 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1379                      uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1380 {
1381     TCGType type;
1382 
1383     check_size_align(oprsz, maxsz, dofs | aofs);
1384     check_overlap_2(tcg_env, dofs, tcg_env, aofs, maxsz);
1385 
1386     type = 0;
1387     if (g->fniv) {
1388         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1389     }
1390     if (type != 0) {
1391         const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1392         const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1393         TCGv_vec t_vec = tcg_temp_new_vec(type);
1394         uint32_t some;
1395 
1396         tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1397 
1398         switch (type) {
1399         case TCG_TYPE_V256:
1400             /* Recall that ARM SVE allows vector sizes that are not a
1401              * power of 2, but always a multiple of 16.  The intent is
1402              * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1403              */
1404             some = QEMU_ALIGN_DOWN(oprsz, 32);
1405             expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1406                           t_vec, g->scalar_first, g->fniv);
1407             if (some == oprsz) {
1408                 break;
1409             }
1410             dofs += some;
1411             aofs += some;
1412             oprsz -= some;
1413             maxsz -= some;
1414             /* fallthru */
1415 
1416         case TCG_TYPE_V128:
1417             expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1418                           t_vec, g->scalar_first, g->fniv);
1419             break;
1420 
1421         case TCG_TYPE_V64:
1422             expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1423                           t_vec, g->scalar_first, g->fniv);
1424             break;
1425 
1426         default:
1427             g_assert_not_reached();
1428         }
1429         tcg_temp_free_vec(t_vec);
1430         tcg_swap_vecop_list(hold_list);
1431     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1432         TCGv_i64 t64 = tcg_temp_new_i64();
1433 
1434         tcg_gen_dup_i64(g->vece, t64, c);
1435         expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1436         tcg_temp_free_i64(t64);
1437     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1438         TCGv_i32 t32 = tcg_temp_new_i32();
1439 
1440         tcg_gen_extrl_i64_i32(t32, c);
1441         tcg_gen_dup_i32(g->vece, t32, t32);
1442         expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1443         tcg_temp_free_i32(t32);
1444     } else {
1445         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1446         return;
1447     }
1448 
1449     if (oprsz < maxsz) {
1450         expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
1451     }
1452 }
1453 
1454 /* Expand a vector three-operand operation.  */
1455 void tcg_gen_gvec_3_var(TCGv_ptr dbase, uint32_t dofs,
1456                         TCGv_ptr abase, uint32_t aofs,
1457                         TCGv_ptr bbase, uint32_t bofs,
1458                         uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1459 {
1460     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1461     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1462     TCGType type;
1463     uint32_t some;
1464 
1465     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1466     check_overlap_3(dbase, dofs, abase, aofs, bbase, bofs, maxsz);
1467 
1468     type = 0;
1469     if (g->fniv) {
1470         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1471     }
1472     switch (type) {
1473     case TCG_TYPE_V256:
1474         /* Recall that ARM SVE allows vector sizes that are not a
1475          * power of 2, but always a multiple of 16.  The intent is
1476          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1477          */
1478         some = QEMU_ALIGN_DOWN(oprsz, 32);
1479         expand_3_vec(g->vece, dbase, dofs, abase, aofs, bbase, bofs,
1480                      some, 32, TCG_TYPE_V256, g->load_dest, g->fniv);
1481         if (some == oprsz) {
1482             break;
1483         }
1484         dofs += some;
1485         aofs += some;
1486         bofs += some;
1487         oprsz -= some;
1488         maxsz -= some;
1489         /* fallthru */
1490     case TCG_TYPE_V128:
1491         expand_3_vec(g->vece, dbase, dofs, abase, aofs, bbase, bofs,
1492                      oprsz, 16, TCG_TYPE_V128, g->load_dest, g->fniv);
1493         break;
1494     case TCG_TYPE_V64:
1495         expand_3_vec(g->vece, dbase, dofs, abase, aofs, bbase, bofs,
1496                      oprsz, 8, TCG_TYPE_V64, g->load_dest, g->fniv);
1497         break;
1498 
1499     case 0:
1500         if (g->fni8 && check_size_impl(oprsz, 8)) {
1501             expand_3_i64(dbase, dofs, abase, aofs, bbase, bofs,
1502                          oprsz, g->load_dest, g->fni8);
1503         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1504             expand_3_i32(dbase, dofs, abase, aofs, bbase, bofs,
1505                          oprsz, g->load_dest, g->fni4);
1506         } else {
1507             assert(g->fno != NULL);
1508             expand_3_ool(dbase, dofs, abase, aofs, bbase, bofs,
1509                          oprsz, maxsz, g->data, g->fno);
1510             oprsz = maxsz;
1511         }
1512         break;
1513 
1514     default:
1515         g_assert_not_reached();
1516     }
1517     tcg_swap_vecop_list(hold_list);
1518 
1519     if (oprsz < maxsz) {
1520         expand_clr(dbase, dofs + oprsz, maxsz - oprsz);
1521     }
1522 }
1523 
1524 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1525                     uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1526 {
1527     tcg_gen_gvec_3_var(tcg_env, dofs, tcg_env, aofs, tcg_env, bofs,
1528                        oprsz, maxsz, g);
1529 }
1530 
1531 /* Expand a vector operation with three vectors and an immediate.  */
1532 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1533                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1534                      const GVecGen3i *g)
1535 {
1536     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1537     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1538     TCGType type;
1539     uint32_t some;
1540 
1541     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1542     check_overlap_3(tcg_env, dofs, tcg_env, aofs, tcg_env, bofs, maxsz);
1543 
1544     type = 0;
1545     if (g->fniv) {
1546         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1547     }
1548     switch (type) {
1549     case TCG_TYPE_V256:
1550         /*
1551          * Recall that ARM SVE allows vector sizes that are not a
1552          * power of 2, but always a multiple of 16.  The intent is
1553          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1554          */
1555         some = QEMU_ALIGN_DOWN(oprsz, 32);
1556         expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1557                       c, g->load_dest, g->write_aofs, g->fniv);
1558         if (some == oprsz) {
1559             break;
1560         }
1561         dofs += some;
1562         aofs += some;
1563         bofs += some;
1564         oprsz -= some;
1565         maxsz -= some;
1566         /* fallthru */
1567     case TCG_TYPE_V128:
1568         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1569                       c, g->load_dest, g->write_aofs, g->fniv);
1570         break;
1571     case TCG_TYPE_V64:
1572         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1573                       c, g->load_dest, g->write_aofs, g->fniv);
1574         break;
1575 
1576     case 0:
1577         if (g->fni8 && check_size_impl(oprsz, 8)) {
1578             expand_3i_i64(dofs, aofs, bofs, oprsz, c,
1579                           g->load_dest, g->write_aofs, g->fni8);
1580         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1581             expand_3i_i32(dofs, aofs, bofs, oprsz, c,
1582                           g->load_dest, g->write_aofs, g->fni4);
1583         } else {
1584             assert(g->fno != NULL);
1585             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
1586             oprsz = maxsz;
1587         }
1588         break;
1589 
1590     default:
1591         g_assert_not_reached();
1592     }
1593     tcg_swap_vecop_list(hold_list);
1594 
1595     if (oprsz < maxsz) {
1596         expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
1597     }
1598 }
1599 
1600 /* Expand a vector four-operand operation.  */
1601 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1602                     uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1603 {
1604     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1605     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1606     TCGType type;
1607     uint32_t some;
1608 
1609     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1610     check_overlap_4(tcg_env, dofs, tcg_env, aofs,
1611                     tcg_env, bofs, tcg_env, cofs, maxsz);
1612 
1613     type = 0;
1614     if (g->fniv) {
1615         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1616     }
1617     switch (type) {
1618     case TCG_TYPE_V256:
1619         /* Recall that ARM SVE allows vector sizes that are not a
1620          * power of 2, but always a multiple of 16.  The intent is
1621          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1622          */
1623         some = QEMU_ALIGN_DOWN(oprsz, 32);
1624         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1625                      32, TCG_TYPE_V256, g->write_aofs, g->fniv);
1626         if (some == oprsz) {
1627             break;
1628         }
1629         dofs += some;
1630         aofs += some;
1631         bofs += some;
1632         cofs += some;
1633         oprsz -= some;
1634         maxsz -= some;
1635         /* fallthru */
1636     case TCG_TYPE_V128:
1637         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1638                      16, TCG_TYPE_V128, g->write_aofs, g->fniv);
1639         break;
1640     case TCG_TYPE_V64:
1641         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1642                      8, TCG_TYPE_V64, g->write_aofs, g->fniv);
1643         break;
1644 
1645     case 0:
1646         if (g->fni8 && check_size_impl(oprsz, 8)) {
1647             expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
1648                          g->write_aofs, g->fni8);
1649         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1650             expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
1651                          g->write_aofs, g->fni4);
1652         } else {
1653             assert(g->fno != NULL);
1654             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1655                                oprsz, maxsz, g->data, g->fno);
1656             oprsz = maxsz;
1657         }
1658         break;
1659 
1660     default:
1661         g_assert_not_reached();
1662     }
1663     tcg_swap_vecop_list(hold_list);
1664 
1665     if (oprsz < maxsz) {
1666         expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
1667     }
1668 }
1669 
1670 /* Expand a vector four-operand operation.  */
1671 void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1672                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1673                      const GVecGen4i *g)
1674 {
1675     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1676     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1677     TCGType type;
1678     uint32_t some;
1679 
1680     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1681     check_overlap_4(tcg_env, dofs, tcg_env, aofs,
1682                     tcg_env, bofs, tcg_env, cofs, maxsz);
1683 
1684     type = 0;
1685     if (g->fniv) {
1686         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1687     }
1688     switch (type) {
1689     case TCG_TYPE_V256:
1690         /*
1691          * Recall that ARM SVE allows vector sizes that are not a
1692          * power of 2, but always a multiple of 16.  The intent is
1693          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1694          */
1695         some = QEMU_ALIGN_DOWN(oprsz, 32);
1696         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, some,
1697                       32, TCG_TYPE_V256, c, g->fniv);
1698         if (some == oprsz) {
1699             break;
1700         }
1701         dofs += some;
1702         aofs += some;
1703         bofs += some;
1704         cofs += some;
1705         oprsz -= some;
1706         maxsz -= some;
1707         /* fallthru */
1708     case TCG_TYPE_V128:
1709         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1710                        16, TCG_TYPE_V128, c, g->fniv);
1711         break;
1712     case TCG_TYPE_V64:
1713         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1714                       8, TCG_TYPE_V64, c, g->fniv);
1715         break;
1716 
1717     case 0:
1718         if (g->fni8 && check_size_impl(oprsz, 8)) {
1719             expand_4i_i64(dofs, aofs, bofs, cofs, oprsz, c, g->fni8);
1720         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1721             expand_4i_i32(dofs, aofs, bofs, cofs, oprsz, c, g->fni4);
1722         } else {
1723             assert(g->fno != NULL);
1724             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1725                                oprsz, maxsz, c, g->fno);
1726             oprsz = maxsz;
1727         }
1728         break;
1729 
1730     default:
1731         g_assert_not_reached();
1732     }
1733     tcg_swap_vecop_list(hold_list);
1734 
1735     if (oprsz < maxsz) {
1736         expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
1737     }
1738 }
1739 
1740 /*
1741  * Expand specific vector operations.
1742  */
1743 
1744 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
1745 {
1746     tcg_gen_mov_vec(a, b);
1747 }
1748 
1749 void tcg_gen_gvec_mov_var(unsigned vece, TCGv_ptr dbase, uint32_t dofs,
1750                           TCGv_ptr abase, uint32_t aofs,
1751                           uint32_t oprsz, uint32_t maxsz)
1752 {
1753     static const GVecGen2 g = {
1754         .fni8 = tcg_gen_mov_i64,
1755         .fniv = vec_mov2,
1756         .fno = gen_helper_gvec_mov,
1757         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1758     };
1759 
1760     if (dofs == aofs && dbase == abase) {
1761         check_size_align(oprsz, maxsz, dofs);
1762         if (oprsz < maxsz) {
1763             expand_clr(dbase, dofs + oprsz, maxsz - oprsz);
1764         }
1765         return;
1766     }
1767 
1768     tcg_gen_gvec_2_var(dbase, dofs, abase, aofs, oprsz, maxsz, &g);
1769 }
1770 
1771 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1772                       uint32_t oprsz, uint32_t maxsz)
1773 {
1774     tcg_gen_gvec_mov_var(vece, tcg_env, dofs, tcg_env, aofs, oprsz, maxsz);
1775 }
1776 
1777 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
1778                           uint32_t maxsz, TCGv_i32 in)
1779 {
1780     check_size_align(oprsz, maxsz, dofs);
1781     tcg_debug_assert(vece <= MO_32);
1782     do_dup(vece, tcg_env, dofs, oprsz, maxsz, in, NULL, 0);
1783 }
1784 
1785 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
1786                           uint32_t maxsz, TCGv_i64 in)
1787 {
1788     check_size_align(oprsz, maxsz, dofs);
1789     tcg_debug_assert(vece <= MO_64);
1790     do_dup(vece, tcg_env, dofs, oprsz, maxsz, NULL, in, 0);
1791 }
1792 
1793 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1794                           uint32_t oprsz, uint32_t maxsz)
1795 {
1796     check_size_align(oprsz, maxsz, dofs);
1797     if (vece <= MO_64) {
1798         TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
1799         if (type != 0) {
1800             TCGv_vec t_vec = tcg_temp_new_vec(type);
1801             tcg_gen_dup_mem_vec(vece, t_vec, tcg_env, aofs);
1802             do_dup_store(type, tcg_env, dofs, oprsz, maxsz, t_vec);
1803         } else if (vece <= MO_32) {
1804             TCGv_i32 in = tcg_temp_ebb_new_i32();
1805             switch (vece) {
1806             case MO_8:
1807                 tcg_gen_ld8u_i32(in, tcg_env, aofs);
1808                 break;
1809             case MO_16:
1810                 tcg_gen_ld16u_i32(in, tcg_env, aofs);
1811                 break;
1812             default:
1813                 tcg_gen_ld_i32(in, tcg_env, aofs);
1814                 break;
1815             }
1816             do_dup(vece, tcg_env, dofs, oprsz, maxsz, in, NULL, 0);
1817             tcg_temp_free_i32(in);
1818         } else {
1819             TCGv_i64 in = tcg_temp_ebb_new_i64();
1820             tcg_gen_ld_i64(in, tcg_env, aofs);
1821             do_dup(vece, tcg_env, dofs, oprsz, maxsz, NULL, in, 0);
1822             tcg_temp_free_i64(in);
1823         }
1824     } else if (vece == 4) {
1825         /* 128-bit duplicate.  */
1826         int i;
1827 
1828         tcg_debug_assert(oprsz >= 16);
1829         if (TCG_TARGET_HAS_v128) {
1830             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1831 
1832             tcg_gen_ld_vec(in, tcg_env, aofs);
1833             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1834                 tcg_gen_st_vec(in, tcg_env, dofs + i);
1835             }
1836         } else {
1837             TCGv_i64 in0 = tcg_temp_ebb_new_i64();
1838             TCGv_i64 in1 = tcg_temp_ebb_new_i64();
1839 
1840             tcg_gen_ld_i64(in0, tcg_env, aofs);
1841             tcg_gen_ld_i64(in1, tcg_env, aofs + 8);
1842             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1843                 tcg_gen_st_i64(in0, tcg_env, dofs + i);
1844                 tcg_gen_st_i64(in1, tcg_env, dofs + i + 8);
1845             }
1846             tcg_temp_free_i64(in0);
1847             tcg_temp_free_i64(in1);
1848         }
1849         if (oprsz < maxsz) {
1850             expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
1851         }
1852     } else if (vece == 5) {
1853         /* 256-bit duplicate.  */
1854         int i;
1855 
1856         tcg_debug_assert(oprsz >= 32);
1857         tcg_debug_assert(oprsz % 32 == 0);
1858         if (TCG_TARGET_HAS_v256) {
1859             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
1860 
1861             tcg_gen_ld_vec(in, tcg_env, aofs);
1862             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1863                 tcg_gen_st_vec(in, tcg_env, dofs + i);
1864             }
1865         } else if (TCG_TARGET_HAS_v128) {
1866             TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
1867             TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
1868 
1869             tcg_gen_ld_vec(in0, tcg_env, aofs);
1870             tcg_gen_ld_vec(in1, tcg_env, aofs + 16);
1871             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1872                 tcg_gen_st_vec(in0, tcg_env, dofs + i);
1873                 tcg_gen_st_vec(in1, tcg_env, dofs + i + 16);
1874             }
1875         } else {
1876             TCGv_i64 in[4];
1877             int j;
1878 
1879             for (j = 0; j < 4; ++j) {
1880                 in[j] = tcg_temp_ebb_new_i64();
1881                 tcg_gen_ld_i64(in[j], tcg_env, aofs + j * 8);
1882             }
1883             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1884                 for (j = 0; j < 4; ++j) {
1885                     tcg_gen_st_i64(in[j], tcg_env, dofs + i + j * 8);
1886                 }
1887             }
1888             for (j = 0; j < 4; ++j) {
1889                 tcg_temp_free_i64(in[j]);
1890             }
1891         }
1892         if (oprsz < maxsz) {
1893             expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
1894         }
1895     } else {
1896         g_assert_not_reached();
1897     }
1898 }
1899 
1900 void tcg_gen_gvec_dup_imm_var(unsigned vece, TCGv_ptr dbase, uint32_t dofs,
1901                               uint32_t oprsz, uint32_t maxsz, uint64_t x)
1902 {
1903     check_size_align(oprsz, maxsz, dofs);
1904     do_dup(vece, dbase, dofs, oprsz, maxsz, NULL, NULL, x);
1905 }
1906 
1907 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
1908                           uint32_t maxsz, uint64_t x)
1909 {
1910     tcg_gen_gvec_dup_imm_var(vece, tcg_env, dofs, oprsz, maxsz, x);
1911 }
1912 
1913 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1914                       uint32_t oprsz, uint32_t maxsz)
1915 {
1916     static const GVecGen2 g = {
1917         .fni8 = tcg_gen_not_i64,
1918         .fniv = tcg_gen_not_vec,
1919         .fno = gen_helper_gvec_not,
1920         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1921     };
1922     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1923 }
1924 
1925 /* Perform a vector addition using normal addition and a mask.  The mask
1926    should be the sign bit of each lane.  This 6-operation form is more
1927    efficient than separate additions when there are 4 or more lanes in
1928    the 64-bit operation.  */
1929 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1930 {
1931     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
1932     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
1933     TCGv_i64 t3 = tcg_temp_ebb_new_i64();
1934 
1935     tcg_gen_andc_i64(t1, a, m);
1936     tcg_gen_andc_i64(t2, b, m);
1937     tcg_gen_xor_i64(t3, a, b);
1938     tcg_gen_add_i64(d, t1, t2);
1939     tcg_gen_and_i64(t3, t3, m);
1940     tcg_gen_xor_i64(d, d, t3);
1941 
1942     tcg_temp_free_i64(t1);
1943     tcg_temp_free_i64(t2);
1944     tcg_temp_free_i64(t3);
1945 }
1946 
1947 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1948 {
1949     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
1950     gen_addv_mask(d, a, b, m);
1951 }
1952 
1953 void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1954 {
1955     TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
1956     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
1957     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
1958     TCGv_i32 t3 = tcg_temp_ebb_new_i32();
1959 
1960     tcg_gen_andc_i32(t1, a, m);
1961     tcg_gen_andc_i32(t2, b, m);
1962     tcg_gen_xor_i32(t3, a, b);
1963     tcg_gen_add_i32(d, t1, t2);
1964     tcg_gen_and_i32(t3, t3, m);
1965     tcg_gen_xor_i32(d, d, t3);
1966 
1967     tcg_temp_free_i32(t1);
1968     tcg_temp_free_i32(t2);
1969     tcg_temp_free_i32(t3);
1970 }
1971 
1972 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1973 {
1974     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
1975     gen_addv_mask(d, a, b, m);
1976 }
1977 
1978 void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1979 {
1980     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
1981     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
1982 
1983     tcg_gen_andi_i32(t1, a, ~0xffff);
1984     tcg_gen_add_i32(t2, a, b);
1985     tcg_gen_add_i32(t1, t1, b);
1986     tcg_gen_deposit_i32(d, t1, t2, 0, 16);
1987 
1988     tcg_temp_free_i32(t1);
1989     tcg_temp_free_i32(t2);
1990 }
1991 
1992 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1993 {
1994     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
1995     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
1996 
1997     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1998     tcg_gen_add_i64(t2, a, b);
1999     tcg_gen_add_i64(t1, t1, b);
2000     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2001 
2002     tcg_temp_free_i64(t1);
2003     tcg_temp_free_i64(t2);
2004 }
2005 
2006 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
2007 
2008 void tcg_gen_gvec_add_var(unsigned vece, TCGv_ptr dbase, uint32_t dofs,
2009                           TCGv_ptr abase, uint32_t aofs,
2010                           TCGv_ptr bbase, uint32_t bofs,
2011                           uint32_t oprsz, uint32_t maxsz)
2012 {
2013     static const GVecGen3 g[4] = {
2014         { .fni8 = tcg_gen_vec_add8_i64,
2015           .fniv = tcg_gen_add_vec,
2016           .fno = gen_helper_gvec_add8,
2017           .opt_opc = vecop_list_add,
2018           .vece = MO_8 },
2019         { .fni8 = tcg_gen_vec_add16_i64,
2020           .fniv = tcg_gen_add_vec,
2021           .fno = gen_helper_gvec_add16,
2022           .opt_opc = vecop_list_add,
2023           .vece = MO_16 },
2024         { .fni4 = tcg_gen_add_i32,
2025           .fniv = tcg_gen_add_vec,
2026           .fno = gen_helper_gvec_add32,
2027           .opt_opc = vecop_list_add,
2028           .vece = MO_32 },
2029         { .fni8 = tcg_gen_add_i64,
2030           .fniv = tcg_gen_add_vec,
2031           .fno = gen_helper_gvec_add64,
2032           .opt_opc = vecop_list_add,
2033           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2034           .vece = MO_64 },
2035     };
2036 
2037     tcg_debug_assert(vece <= MO_64);
2038     tcg_gen_gvec_3_var(dbase, dofs, abase, aofs, bbase, bofs,
2039                        oprsz, maxsz, &g[vece]);
2040 }
2041 
2042 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
2043                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2044 {
2045     tcg_gen_gvec_add_var(vece, tcg_env, dofs, tcg_env, aofs, tcg_env, bofs,
2046                          oprsz, maxsz);
2047 }
2048 
2049 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
2050                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2051 {
2052     static const GVecGen2s g[4] = {
2053         { .fni8 = tcg_gen_vec_add8_i64,
2054           .fniv = tcg_gen_add_vec,
2055           .fno = gen_helper_gvec_adds8,
2056           .opt_opc = vecop_list_add,
2057           .vece = MO_8 },
2058         { .fni8 = tcg_gen_vec_add16_i64,
2059           .fniv = tcg_gen_add_vec,
2060           .fno = gen_helper_gvec_adds16,
2061           .opt_opc = vecop_list_add,
2062           .vece = MO_16 },
2063         { .fni4 = tcg_gen_add_i32,
2064           .fniv = tcg_gen_add_vec,
2065           .fno = gen_helper_gvec_adds32,
2066           .opt_opc = vecop_list_add,
2067           .vece = MO_32 },
2068         { .fni8 = tcg_gen_add_i64,
2069           .fniv = tcg_gen_add_vec,
2070           .fno = gen_helper_gvec_adds64,
2071           .opt_opc = vecop_list_add,
2072           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2073           .vece = MO_64 },
2074     };
2075 
2076     tcg_debug_assert(vece <= MO_64);
2077     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2078 }
2079 
/* Vector add of the immediate @c: wrap it as a constant, then add.  */
void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_adds(vece, dofs, aofs, tcg_constant_i64(c), oprsz, maxsz);
}
2086 
2087 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
2088 
2089 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
2090                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2091 {
2092     static const GVecGen2s g[4] = {
2093         { .fni8 = tcg_gen_vec_sub8_i64,
2094           .fniv = tcg_gen_sub_vec,
2095           .fno = gen_helper_gvec_subs8,
2096           .opt_opc = vecop_list_sub,
2097           .vece = MO_8 },
2098         { .fni8 = tcg_gen_vec_sub16_i64,
2099           .fniv = tcg_gen_sub_vec,
2100           .fno = gen_helper_gvec_subs16,
2101           .opt_opc = vecop_list_sub,
2102           .vece = MO_16 },
2103         { .fni4 = tcg_gen_sub_i32,
2104           .fniv = tcg_gen_sub_vec,
2105           .fno = gen_helper_gvec_subs32,
2106           .opt_opc = vecop_list_sub,
2107           .vece = MO_32 },
2108         { .fni8 = tcg_gen_sub_i64,
2109           .fniv = tcg_gen_sub_vec,
2110           .fno = gen_helper_gvec_subs64,
2111           .opt_opc = vecop_list_sub,
2112           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2113           .vece = MO_64 },
2114     };
2115 
2116     tcg_debug_assert(vece <= MO_64);
2117     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2118 }
2119 
2120 /* Perform a vector subtraction using normal subtraction and a mask.
2121    Compare gen_addv_mask above.  */
2122 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
2123 {
2124     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2125     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2126     TCGv_i64 t3 = tcg_temp_ebb_new_i64();
2127 
2128     tcg_gen_or_i64(t1, a, m);
2129     tcg_gen_andc_i64(t2, b, m);
2130     tcg_gen_eqv_i64(t3, a, b);
2131     tcg_gen_sub_i64(d, t1, t2);
2132     tcg_gen_and_i64(t3, t3, m);
2133     tcg_gen_xor_i64(d, d, t3);
2134 
2135     tcg_temp_free_i64(t1);
2136     tcg_temp_free_i64(t2);
2137     tcg_temp_free_i64(t3);
2138 }
2139 
2140 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2141 {
2142     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2143     gen_subv_mask(d, a, b, m);
2144 }
2145 
2146 void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2147 {
2148     TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
2149     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
2150     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
2151     TCGv_i32 t3 = tcg_temp_ebb_new_i32();
2152 
2153     tcg_gen_or_i32(t1, a, m);
2154     tcg_gen_andc_i32(t2, b, m);
2155     tcg_gen_eqv_i32(t3, a, b);
2156     tcg_gen_sub_i32(d, t1, t2);
2157     tcg_gen_and_i32(t3, t3, m);
2158     tcg_gen_xor_i32(d, d, t3);
2159 
2160     tcg_temp_free_i32(t1);
2161     tcg_temp_free_i32(t2);
2162     tcg_temp_free_i32(t3);
2163 }
2164 
2165 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2166 {
2167     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2168     gen_subv_mask(d, a, b, m);
2169 }
2170 
2171 void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2172 {
2173     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
2174     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
2175 
2176     tcg_gen_andi_i32(t1, b, ~0xffff);
2177     tcg_gen_sub_i32(t2, a, b);
2178     tcg_gen_sub_i32(t1, a, t1);
2179     tcg_gen_deposit_i32(d, t1, t2, 0, 16);
2180 
2181     tcg_temp_free_i32(t1);
2182     tcg_temp_free_i32(t2);
2183 }
2184 
2185 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2186 {
2187     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2188     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2189 
2190     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2191     tcg_gen_sub_i64(t2, a, b);
2192     tcg_gen_sub_i64(t1, a, t1);
2193     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2194 
2195     tcg_temp_free_i64(t1);
2196     tcg_temp_free_i64(t2);
2197 }
2198 
2199 void tcg_gen_gvec_sub_var(unsigned vece, TCGv_ptr dbase, uint32_t dofs,
2200                           TCGv_ptr abase, uint32_t aofs,
2201                           TCGv_ptr bbase, uint32_t bofs,
2202                           uint32_t oprsz, uint32_t maxsz)
2203 {
2204     static const GVecGen3 g[4] = {
2205         { .fni8 = tcg_gen_vec_sub8_i64,
2206           .fniv = tcg_gen_sub_vec,
2207           .fno = gen_helper_gvec_sub8,
2208           .opt_opc = vecop_list_sub,
2209           .vece = MO_8 },
2210         { .fni8 = tcg_gen_vec_sub16_i64,
2211           .fniv = tcg_gen_sub_vec,
2212           .fno = gen_helper_gvec_sub16,
2213           .opt_opc = vecop_list_sub,
2214           .vece = MO_16 },
2215         { .fni4 = tcg_gen_sub_i32,
2216           .fniv = tcg_gen_sub_vec,
2217           .fno = gen_helper_gvec_sub32,
2218           .opt_opc = vecop_list_sub,
2219           .vece = MO_32 },
2220         { .fni8 = tcg_gen_sub_i64,
2221           .fniv = tcg_gen_sub_vec,
2222           .fno = gen_helper_gvec_sub64,
2223           .opt_opc = vecop_list_sub,
2224           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2225           .vece = MO_64 },
2226     };
2227 
2228     tcg_debug_assert(vece <= MO_64);
2229     tcg_gen_gvec_3_var(dbase, dofs, abase, aofs, bbase, bofs,
2230                        oprsz, maxsz, &g[vece]);
2231 }
2232 
2233 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
2234                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2235 {
2236     tcg_gen_gvec_sub_var(vece, tcg_env, dofs, tcg_env, aofs, tcg_env, bofs,
2237                          oprsz, maxsz);
2238 }
2239 
2240 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
2241 
2242 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
2243                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2244 {
2245     static const GVecGen3 g[4] = {
2246         { .fniv = tcg_gen_mul_vec,
2247           .fno = gen_helper_gvec_mul8,
2248           .opt_opc = vecop_list_mul,
2249           .vece = MO_8 },
2250         { .fniv = tcg_gen_mul_vec,
2251           .fno = gen_helper_gvec_mul16,
2252           .opt_opc = vecop_list_mul,
2253           .vece = MO_16 },
2254         { .fni4 = tcg_gen_mul_i32,
2255           .fniv = tcg_gen_mul_vec,
2256           .fno = gen_helper_gvec_mul32,
2257           .opt_opc = vecop_list_mul,
2258           .vece = MO_32 },
2259         { .fni8 = tcg_gen_mul_i64,
2260           .fniv = tcg_gen_mul_vec,
2261           .fno = gen_helper_gvec_mul64,
2262           .opt_opc = vecop_list_mul,
2263           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2264           .vece = MO_64 },
2265     };
2266 
2267     tcg_debug_assert(vece <= MO_64);
2268     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2269 }
2270 
2271 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
2272                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2273 {
2274     static const GVecGen2s g[4] = {
2275         { .fniv = tcg_gen_mul_vec,
2276           .fno = gen_helper_gvec_muls8,
2277           .opt_opc = vecop_list_mul,
2278           .vece = MO_8 },
2279         { .fniv = tcg_gen_mul_vec,
2280           .fno = gen_helper_gvec_muls16,
2281           .opt_opc = vecop_list_mul,
2282           .vece = MO_16 },
2283         { .fni4 = tcg_gen_mul_i32,
2284           .fniv = tcg_gen_mul_vec,
2285           .fno = gen_helper_gvec_muls32,
2286           .opt_opc = vecop_list_mul,
2287           .vece = MO_32 },
2288         { .fni8 = tcg_gen_mul_i64,
2289           .fniv = tcg_gen_mul_vec,
2290           .fno = gen_helper_gvec_muls64,
2291           .opt_opc = vecop_list_mul,
2292           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2293           .vece = MO_64 },
2294     };
2295 
2296     tcg_debug_assert(vece <= MO_64);
2297     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2298 }
2299 
/* Vector multiply by the immediate @c: wrap it as a constant first.  */
void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_muls(vece, dofs, aofs, tcg_constant_i64(c), oprsz, maxsz);
}
2306 
2307 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2308                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2309 {
2310     static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
2311     static const GVecGen3 g[4] = {
2312         { .fniv = tcg_gen_ssadd_vec,
2313           .fno = gen_helper_gvec_ssadd8,
2314           .opt_opc = vecop_list,
2315           .vece = MO_8 },
2316         { .fniv = tcg_gen_ssadd_vec,
2317           .fno = gen_helper_gvec_ssadd16,
2318           .opt_opc = vecop_list,
2319           .vece = MO_16 },
2320         { .fniv = tcg_gen_ssadd_vec,
2321           .fno = gen_helper_gvec_ssadd32,
2322           .opt_opc = vecop_list,
2323           .vece = MO_32 },
2324         { .fniv = tcg_gen_ssadd_vec,
2325           .fno = gen_helper_gvec_ssadd64,
2326           .opt_opc = vecop_list,
2327           .vece = MO_64 },
2328     };
2329     tcg_debug_assert(vece <= MO_64);
2330     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2331 }
2332 
2333 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
2334                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2335 {
2336     static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
2337     static const GVecGen3 g[4] = {
2338         { .fniv = tcg_gen_sssub_vec,
2339           .fno = gen_helper_gvec_sssub8,
2340           .opt_opc = vecop_list,
2341           .vece = MO_8 },
2342         { .fniv = tcg_gen_sssub_vec,
2343           .fno = gen_helper_gvec_sssub16,
2344           .opt_opc = vecop_list,
2345           .vece = MO_16 },
2346         { .fniv = tcg_gen_sssub_vec,
2347           .fno = gen_helper_gvec_sssub32,
2348           .opt_opc = vecop_list,
2349           .vece = MO_32 },
2350         { .fniv = tcg_gen_sssub_vec,
2351           .fno = gen_helper_gvec_sssub64,
2352           .opt_opc = vecop_list,
2353           .vece = MO_64 },
2354     };
2355     tcg_debug_assert(vece <= MO_64);
2356     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2357 }
2358 
2359 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2360 {
2361     TCGv_i32 max = tcg_constant_i32(-1);
2362     tcg_gen_add_i32(d, a, b);
2363     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
2364 }
2365 
2366 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2367 {
2368     TCGv_i64 max = tcg_constant_i64(-1);
2369     tcg_gen_add_i64(d, a, b);
2370     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
2371 }
2372 
2373 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2374                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2375 {
2376     static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
2377     static const GVecGen3 g[4] = {
2378         { .fniv = tcg_gen_usadd_vec,
2379           .fno = gen_helper_gvec_usadd8,
2380           .opt_opc = vecop_list,
2381           .vece = MO_8 },
2382         { .fniv = tcg_gen_usadd_vec,
2383           .fno = gen_helper_gvec_usadd16,
2384           .opt_opc = vecop_list,
2385           .vece = MO_16 },
2386         { .fni4 = tcg_gen_usadd_i32,
2387           .fniv = tcg_gen_usadd_vec,
2388           .fno = gen_helper_gvec_usadd32,
2389           .opt_opc = vecop_list,
2390           .vece = MO_32 },
2391         { .fni8 = tcg_gen_usadd_i64,
2392           .fniv = tcg_gen_usadd_vec,
2393           .fno = gen_helper_gvec_usadd64,
2394           .opt_opc = vecop_list,
2395           .vece = MO_64 }
2396     };
2397     tcg_debug_assert(vece <= MO_64);
2398     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2399 }
2400 
2401 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2402 {
2403     TCGv_i32 min = tcg_constant_i32(0);
2404     tcg_gen_sub_i32(d, a, b);
2405     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
2406 }
2407 
2408 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2409 {
2410     TCGv_i64 min = tcg_constant_i64(0);
2411     tcg_gen_sub_i64(d, a, b);
2412     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
2413 }
2414 
2415 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
2416                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2417 {
2418     static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2419     static const GVecGen3 g[4] = {
2420         { .fniv = tcg_gen_ussub_vec,
2421           .fno = gen_helper_gvec_ussub8,
2422           .opt_opc = vecop_list,
2423           .vece = MO_8 },
2424         { .fniv = tcg_gen_ussub_vec,
2425           .fno = gen_helper_gvec_ussub16,
2426           .opt_opc = vecop_list,
2427           .vece = MO_16 },
2428         { .fni4 = tcg_gen_ussub_i32,
2429           .fniv = tcg_gen_ussub_vec,
2430           .fno = gen_helper_gvec_ussub32,
2431           .opt_opc = vecop_list,
2432           .vece = MO_32 },
2433         { .fni8 = tcg_gen_ussub_i64,
2434           .fniv = tcg_gen_ussub_vec,
2435           .fno = gen_helper_gvec_ussub64,
2436           .opt_opc = vecop_list,
2437           .vece = MO_64 }
2438     };
2439     tcg_debug_assert(vece <= MO_64);
2440     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2441 }
2442 
2443 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2444                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2445 {
2446     static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2447     static const GVecGen3 g[4] = {
2448         { .fniv = tcg_gen_smin_vec,
2449           .fno = gen_helper_gvec_smin8,
2450           .opt_opc = vecop_list,
2451           .vece = MO_8 },
2452         { .fniv = tcg_gen_smin_vec,
2453           .fno = gen_helper_gvec_smin16,
2454           .opt_opc = vecop_list,
2455           .vece = MO_16 },
2456         { .fni4 = tcg_gen_smin_i32,
2457           .fniv = tcg_gen_smin_vec,
2458           .fno = gen_helper_gvec_smin32,
2459           .opt_opc = vecop_list,
2460           .vece = MO_32 },
2461         { .fni8 = tcg_gen_smin_i64,
2462           .fniv = tcg_gen_smin_vec,
2463           .fno = gen_helper_gvec_smin64,
2464           .opt_opc = vecop_list,
2465           .vece = MO_64 }
2466     };
2467     tcg_debug_assert(vece <= MO_64);
2468     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2469 }
2470 
2471 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2472                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2473 {
2474     static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2475     static const GVecGen3 g[4] = {
2476         { .fniv = tcg_gen_umin_vec,
2477           .fno = gen_helper_gvec_umin8,
2478           .opt_opc = vecop_list,
2479           .vece = MO_8 },
2480         { .fniv = tcg_gen_umin_vec,
2481           .fno = gen_helper_gvec_umin16,
2482           .opt_opc = vecop_list,
2483           .vece = MO_16 },
2484         { .fni4 = tcg_gen_umin_i32,
2485           .fniv = tcg_gen_umin_vec,
2486           .fno = gen_helper_gvec_umin32,
2487           .opt_opc = vecop_list,
2488           .vece = MO_32 },
2489         { .fni8 = tcg_gen_umin_i64,
2490           .fniv = tcg_gen_umin_vec,
2491           .fno = gen_helper_gvec_umin64,
2492           .opt_opc = vecop_list,
2493           .vece = MO_64 }
2494     };
2495     tcg_debug_assert(vece <= MO_64);
2496     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2497 }
2498 
2499 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2500                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2501 {
2502     static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2503     static const GVecGen3 g[4] = {
2504         { .fniv = tcg_gen_smax_vec,
2505           .fno = gen_helper_gvec_smax8,
2506           .opt_opc = vecop_list,
2507           .vece = MO_8 },
2508         { .fniv = tcg_gen_smax_vec,
2509           .fno = gen_helper_gvec_smax16,
2510           .opt_opc = vecop_list,
2511           .vece = MO_16 },
2512         { .fni4 = tcg_gen_smax_i32,
2513           .fniv = tcg_gen_smax_vec,
2514           .fno = gen_helper_gvec_smax32,
2515           .opt_opc = vecop_list,
2516           .vece = MO_32 },
2517         { .fni8 = tcg_gen_smax_i64,
2518           .fniv = tcg_gen_smax_vec,
2519           .fno = gen_helper_gvec_smax64,
2520           .opt_opc = vecop_list,
2521           .vece = MO_64 }
2522     };
2523     tcg_debug_assert(vece <= MO_64);
2524     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2525 }
2526 
2527 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2528                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2529 {
2530     static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2531     static const GVecGen3 g[4] = {
2532         { .fniv = tcg_gen_umax_vec,
2533           .fno = gen_helper_gvec_umax8,
2534           .opt_opc = vecop_list,
2535           .vece = MO_8 },
2536         { .fniv = tcg_gen_umax_vec,
2537           .fno = gen_helper_gvec_umax16,
2538           .opt_opc = vecop_list,
2539           .vece = MO_16 },
2540         { .fni4 = tcg_gen_umax_i32,
2541           .fniv = tcg_gen_umax_vec,
2542           .fno = gen_helper_gvec_umax32,
2543           .opt_opc = vecop_list,
2544           .vece = MO_32 },
2545         { .fni8 = tcg_gen_umax_i64,
2546           .fniv = tcg_gen_umax_vec,
2547           .fno = gen_helper_gvec_umax64,
2548           .opt_opc = vecop_list,
2549           .vece = MO_64 }
2550     };
2551     tcg_debug_assert(vece <= MO_64);
2552     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2553 }
2554 
2555 /* Perform a vector negation using normal negation and a mask.
2556    Compare gen_subv_mask above.  */
2557 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2558 {
2559     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2560     TCGv_i64 t3 = tcg_temp_ebb_new_i64();
2561 
2562     tcg_gen_andc_i64(t3, m, b);
2563     tcg_gen_andc_i64(t2, b, m);
2564     tcg_gen_sub_i64(d, m, t2);
2565     tcg_gen_xor_i64(d, d, t3);
2566 
2567     tcg_temp_free_i64(t2);
2568     tcg_temp_free_i64(t3);
2569 }
2570 
2571 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2572 {
2573     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2574     gen_negv_mask(d, b, m);
2575 }
2576 
2577 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2578 {
2579     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2580     gen_negv_mask(d, b, m);
2581 }
2582 
2583 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2584 {
2585     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2586     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2587 
2588     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2589     tcg_gen_neg_i64(t2, b);
2590     tcg_gen_neg_i64(t1, t1);
2591     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2592 
2593     tcg_temp_free_i64(t1);
2594     tcg_temp_free_i64(t2);
2595 }
2596 
2597 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2598                       uint32_t oprsz, uint32_t maxsz)
2599 {
2600     static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2601     static const GVecGen2 g[4] = {
2602         { .fni8 = tcg_gen_vec_neg8_i64,
2603           .fniv = tcg_gen_neg_vec,
2604           .fno = gen_helper_gvec_neg8,
2605           .opt_opc = vecop_list,
2606           .vece = MO_8 },
2607         { .fni8 = tcg_gen_vec_neg16_i64,
2608           .fniv = tcg_gen_neg_vec,
2609           .fno = gen_helper_gvec_neg16,
2610           .opt_opc = vecop_list,
2611           .vece = MO_16 },
2612         { .fni4 = tcg_gen_neg_i32,
2613           .fniv = tcg_gen_neg_vec,
2614           .fno = gen_helper_gvec_neg32,
2615           .opt_opc = vecop_list,
2616           .vece = MO_32 },
2617         { .fni8 = tcg_gen_neg_i64,
2618           .fniv = tcg_gen_neg_vec,
2619           .fno = gen_helper_gvec_neg64,
2620           .opt_opc = vecop_list,
2621           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2622           .vece = MO_64 },
2623     };
2624 
2625     tcg_debug_assert(vece <= MO_64);
2626     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2627 }
2628 
2629 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2630 {
2631     TCGv_i64 t = tcg_temp_ebb_new_i64();
2632     int nbit = 8 << vece;
2633 
2634     /* Create -1 for each negative element.  */
2635     tcg_gen_shri_i64(t, b, nbit - 1);
2636     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2637     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2638 
2639     /*
2640      * Invert (via xor -1) and add one.
2641      * Because of the ordering the msb is cleared,
2642      * so we never have carry into the next element.
2643      */
2644     tcg_gen_xor_i64(d, b, t);
2645     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2646     tcg_gen_add_i64(d, d, t);
2647 
2648     tcg_temp_free_i64(t);
2649 }
2650 
2651 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2652 {
2653     gen_absv_mask(d, b, MO_8);
2654 }
2655 
2656 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2657 {
2658     gen_absv_mask(d, b, MO_16);
2659 }
2660 
2661 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2662                       uint32_t oprsz, uint32_t maxsz)
2663 {
2664     static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2665     static const GVecGen2 g[4] = {
2666         { .fni8 = tcg_gen_vec_abs8_i64,
2667           .fniv = tcg_gen_abs_vec,
2668           .fno = gen_helper_gvec_abs8,
2669           .opt_opc = vecop_list,
2670           .vece = MO_8 },
2671         { .fni8 = tcg_gen_vec_abs16_i64,
2672           .fniv = tcg_gen_abs_vec,
2673           .fno = gen_helper_gvec_abs16,
2674           .opt_opc = vecop_list,
2675           .vece = MO_16 },
2676         { .fni4 = tcg_gen_abs_i32,
2677           .fniv = tcg_gen_abs_vec,
2678           .fno = gen_helper_gvec_abs32,
2679           .opt_opc = vecop_list,
2680           .vece = MO_32 },
2681         { .fni8 = tcg_gen_abs_i64,
2682           .fniv = tcg_gen_abs_vec,
2683           .fno = gen_helper_gvec_abs64,
2684           .opt_opc = vecop_list,
2685           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2686           .vece = MO_64 },
2687     };
2688 
2689     tcg_debug_assert(vece <= MO_64);
2690     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2691 }
2692 
2693 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2694                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2695 {
2696     static const GVecGen3 g = {
2697         .fni8 = tcg_gen_and_i64,
2698         .fniv = tcg_gen_and_vec,
2699         .fno = gen_helper_gvec_and,
2700         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2701     };
2702 
2703     if (aofs == bofs) {
2704         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2705     } else {
2706         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2707     }
2708 }
2709 
2710 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2711                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2712 {
2713     static const GVecGen3 g = {
2714         .fni8 = tcg_gen_or_i64,
2715         .fniv = tcg_gen_or_vec,
2716         .fno = gen_helper_gvec_or,
2717         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2718     };
2719 
2720     if (aofs == bofs) {
2721         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2722     } else {
2723         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2724     }
2725 }
2726 
2727 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2728                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2729 {
2730     static const GVecGen3 g = {
2731         .fni8 = tcg_gen_xor_i64,
2732         .fniv = tcg_gen_xor_vec,
2733         .fno = gen_helper_gvec_xor,
2734         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2735     };
2736 
2737     if (aofs == bofs) {
2738         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2739     } else {
2740         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2741     }
2742 }
2743 
2744 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2745                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2746 {
2747     static const GVecGen3 g = {
2748         .fni8 = tcg_gen_andc_i64,
2749         .fniv = tcg_gen_andc_vec,
2750         .fno = gen_helper_gvec_andc,
2751         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2752     };
2753 
2754     if (aofs == bofs) {
2755         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2756     } else {
2757         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2758     }
2759 }
2760 
2761 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2762                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2763 {
2764     static const GVecGen3 g = {
2765         .fni8 = tcg_gen_orc_i64,
2766         .fniv = tcg_gen_orc_vec,
2767         .fno = gen_helper_gvec_orc,
2768         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2769     };
2770 
2771     if (aofs == bofs) {
2772         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2773     } else {
2774         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2775     }
2776 }
2777 
2778 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2779                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2780 {
2781     static const GVecGen3 g = {
2782         .fni8 = tcg_gen_nand_i64,
2783         .fniv = tcg_gen_nand_vec,
2784         .fno = gen_helper_gvec_nand,
2785         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2786     };
2787 
2788     if (aofs == bofs) {
2789         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2790     } else {
2791         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2792     }
2793 }
2794 
2795 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2796                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2797 {
2798     static const GVecGen3 g = {
2799         .fni8 = tcg_gen_nor_i64,
2800         .fniv = tcg_gen_nor_vec,
2801         .fno = gen_helper_gvec_nor,
2802         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2803     };
2804 
2805     if (aofs == bofs) {
2806         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2807     } else {
2808         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2809     }
2810 }
2811 
2812 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2813                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2814 {
2815     static const GVecGen3 g = {
2816         .fni8 = tcg_gen_eqv_i64,
2817         .fniv = tcg_gen_eqv_vec,
2818         .fno = gen_helper_gvec_eqv,
2819         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2820     };
2821 
2822     if (aofs == bofs) {
2823         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2824     } else {
2825         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2826     }
2827 }
2828 
2829 static const GVecGen2s gop_ands = {
2830     .fni8 = tcg_gen_and_i64,
2831     .fniv = tcg_gen_and_vec,
2832     .fno = gen_helper_gvec_ands,
2833     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2834     .vece = MO_64
2835 };
2836 
2837 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2838                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2839 {
2840     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2841     tcg_gen_dup_i64(vece, tmp, c);
2842     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2843     tcg_temp_free_i64(tmp);
2844 }
2845 
2846 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2847                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2848 {
2849     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2850     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2851 }
2852 
2853 void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
2854                         TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2855 {
2856     static GVecGen2s g = {
2857         .fni8 = tcg_gen_andc_i64,
2858         .fniv = tcg_gen_andc_vec,
2859         .fno = gen_helper_gvec_andcs,
2860         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2861         .vece = MO_64
2862     };
2863 
2864     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2865     tcg_gen_dup_i64(vece, tmp, c);
2866     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &g);
2867     tcg_temp_free_i64(tmp);
2868 }
2869 
2870 static const GVecGen2s gop_xors = {
2871     .fni8 = tcg_gen_xor_i64,
2872     .fniv = tcg_gen_xor_vec,
2873     .fno = gen_helper_gvec_xors,
2874     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2875     .vece = MO_64
2876 };
2877 
2878 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2879                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2880 {
2881     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2882     tcg_gen_dup_i64(vece, tmp, c);
2883     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2884     tcg_temp_free_i64(tmp);
2885 }
2886 
2887 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2888                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2889 {
2890     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2891     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2892 }
2893 
2894 static const GVecGen2s gop_ors = {
2895     .fni8 = tcg_gen_or_i64,
2896     .fniv = tcg_gen_or_vec,
2897     .fno = gen_helper_gvec_ors,
2898     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2899     .vece = MO_64
2900 };
2901 
2902 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2903                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2904 {
2905     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2906     tcg_gen_dup_i64(vece, tmp, c);
2907     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2908     tcg_temp_free_i64(tmp);
2909 }
2910 
2911 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2912                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2913 {
2914     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2915     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2916 }
2917 
2918 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2919 {
2920     uint64_t mask = dup_const(MO_8, 0xff << c);
2921     tcg_gen_shli_i64(d, a, c);
2922     tcg_gen_andi_i64(d, d, mask);
2923 }
2924 
2925 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2926 {
2927     uint64_t mask = dup_const(MO_16, 0xffff << c);
2928     tcg_gen_shli_i64(d, a, c);
2929     tcg_gen_andi_i64(d, d, mask);
2930 }
2931 
2932 void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2933 {
2934     uint32_t mask = dup_const(MO_8, 0xff << c);
2935     tcg_gen_shli_i32(d, a, c);
2936     tcg_gen_andi_i32(d, d, mask);
2937 }
2938 
2939 void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2940 {
2941     uint32_t mask = dup_const(MO_16, 0xffff << c);
2942     tcg_gen_shli_i32(d, a, c);
2943     tcg_gen_andi_i32(d, d, mask);
2944 }
2945 
2946 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2947                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2948 {
2949     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2950     static const GVecGen2i g[4] = {
2951         { .fni8 = tcg_gen_vec_shl8i_i64,
2952           .fniv = tcg_gen_shli_vec,
2953           .fno = gen_helper_gvec_shl8i,
2954           .opt_opc = vecop_list,
2955           .vece = MO_8 },
2956         { .fni8 = tcg_gen_vec_shl16i_i64,
2957           .fniv = tcg_gen_shli_vec,
2958           .fno = gen_helper_gvec_shl16i,
2959           .opt_opc = vecop_list,
2960           .vece = MO_16 },
2961         { .fni4 = tcg_gen_shli_i32,
2962           .fniv = tcg_gen_shli_vec,
2963           .fno = gen_helper_gvec_shl32i,
2964           .opt_opc = vecop_list,
2965           .vece = MO_32 },
2966         { .fni8 = tcg_gen_shli_i64,
2967           .fniv = tcg_gen_shli_vec,
2968           .fno = gen_helper_gvec_shl64i,
2969           .opt_opc = vecop_list,
2970           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2971           .vece = MO_64 },
2972     };
2973 
2974     tcg_debug_assert(vece <= MO_64);
2975     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2976     if (shift == 0) {
2977         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2978     } else {
2979         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2980     }
2981 }
2982 
2983 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2984 {
2985     uint64_t mask = dup_const(MO_8, 0xff >> c);
2986     tcg_gen_shri_i64(d, a, c);
2987     tcg_gen_andi_i64(d, d, mask);
2988 }
2989 
2990 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2991 {
2992     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2993     tcg_gen_shri_i64(d, a, c);
2994     tcg_gen_andi_i64(d, d, mask);
2995 }
2996 
2997 void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2998 {
2999     uint32_t mask = dup_const(MO_8, 0xff >> c);
3000     tcg_gen_shri_i32(d, a, c);
3001     tcg_gen_andi_i32(d, d, mask);
3002 }
3003 
3004 void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
3005 {
3006     uint32_t mask = dup_const(MO_16, 0xffff >> c);
3007     tcg_gen_shri_i32(d, a, c);
3008     tcg_gen_andi_i32(d, d, mask);
3009 }
3010 
3011 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
3012                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
3013 {
3014     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
3015     static const GVecGen2i g[4] = {
3016         { .fni8 = tcg_gen_vec_shr8i_i64,
3017           .fniv = tcg_gen_shri_vec,
3018           .fno = gen_helper_gvec_shr8i,
3019           .opt_opc = vecop_list,
3020           .vece = MO_8 },
3021         { .fni8 = tcg_gen_vec_shr16i_i64,
3022           .fniv = tcg_gen_shri_vec,
3023           .fno = gen_helper_gvec_shr16i,
3024           .opt_opc = vecop_list,
3025           .vece = MO_16 },
3026         { .fni4 = tcg_gen_shri_i32,
3027           .fniv = tcg_gen_shri_vec,
3028           .fno = gen_helper_gvec_shr32i,
3029           .opt_opc = vecop_list,
3030           .vece = MO_32 },
3031         { .fni8 = tcg_gen_shri_i64,
3032           .fniv = tcg_gen_shri_vec,
3033           .fno = gen_helper_gvec_shr64i,
3034           .opt_opc = vecop_list,
3035           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3036           .vece = MO_64 },
3037     };
3038 
3039     tcg_debug_assert(vece <= MO_64);
3040     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3041     if (shift == 0) {
3042         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3043     } else {
3044         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3045     }
3046 }
3047 
3048 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3049 {
3050     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
3051     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
3052     TCGv_i64 s = tcg_temp_ebb_new_i64();
3053 
3054     tcg_gen_shri_i64(d, a, c);
3055     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
3056     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
3057     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
3058     tcg_gen_or_i64(d, d, s);         /* include sign extension */
3059     tcg_temp_free_i64(s);
3060 }
3061 
3062 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3063 {
3064     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
3065     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
3066     TCGv_i64 s = tcg_temp_ebb_new_i64();
3067 
3068     tcg_gen_shri_i64(d, a, c);
3069     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
3070     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
3071     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
3072     tcg_gen_or_i64(d, d, s);         /* include sign extension */
3073     tcg_temp_free_i64(s);
3074 }
3075 
3076 void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
3077 {
3078     uint32_t s_mask = dup_const(MO_8, 0x80 >> c);
3079     uint32_t c_mask = dup_const(MO_8, 0xff >> c);
3080     TCGv_i32 s = tcg_temp_ebb_new_i32();
3081 
3082     tcg_gen_shri_i32(d, a, c);
3083     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
3084     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
3085     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
3086     tcg_gen_or_i32(d, d, s);         /* include sign extension */
3087     tcg_temp_free_i32(s);
3088 }
3089 
3090 void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
3091 {
3092     uint32_t s_mask = dup_const(MO_16, 0x8000 >> c);
3093     uint32_t c_mask = dup_const(MO_16, 0xffff >> c);
3094     TCGv_i32 s = tcg_temp_ebb_new_i32();
3095 
3096     tcg_gen_shri_i32(d, a, c);
3097     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
3098     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
3099     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
3100     tcg_gen_or_i32(d, d, s);         /* include sign extension */
3101     tcg_temp_free_i32(s);
3102 }
3103 
3104 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
3105                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
3106 {
3107     static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
3108     static const GVecGen2i g[4] = {
3109         { .fni8 = tcg_gen_vec_sar8i_i64,
3110           .fniv = tcg_gen_sari_vec,
3111           .fno = gen_helper_gvec_sar8i,
3112           .opt_opc = vecop_list,
3113           .vece = MO_8 },
3114         { .fni8 = tcg_gen_vec_sar16i_i64,
3115           .fniv = tcg_gen_sari_vec,
3116           .fno = gen_helper_gvec_sar16i,
3117           .opt_opc = vecop_list,
3118           .vece = MO_16 },
3119         { .fni4 = tcg_gen_sari_i32,
3120           .fniv = tcg_gen_sari_vec,
3121           .fno = gen_helper_gvec_sar32i,
3122           .opt_opc = vecop_list,
3123           .vece = MO_32 },
3124         { .fni8 = tcg_gen_sari_i64,
3125           .fniv = tcg_gen_sari_vec,
3126           .fno = gen_helper_gvec_sar64i,
3127           .opt_opc = vecop_list,
3128           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3129           .vece = MO_64 },
3130     };
3131 
3132     tcg_debug_assert(vece <= MO_64);
3133     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3134     if (shift == 0) {
3135         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3136     } else {
3137         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3138     }
3139 }
3140 
3141 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3142 {
3143     uint64_t mask = dup_const(MO_8, 0xff << c);
3144 
3145     tcg_gen_shli_i64(d, a, c);
3146     tcg_gen_shri_i64(a, a, 8 - c);
3147     tcg_gen_andi_i64(d, d, mask);
3148     tcg_gen_andi_i64(a, a, ~mask);
3149     tcg_gen_or_i64(d, d, a);
3150 }
3151 
3152 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3153 {
3154     uint64_t mask = dup_const(MO_16, 0xffff << c);
3155 
3156     tcg_gen_shli_i64(d, a, c);
3157     tcg_gen_shri_i64(a, a, 16 - c);
3158     tcg_gen_andi_i64(d, d, mask);
3159     tcg_gen_andi_i64(a, a, ~mask);
3160     tcg_gen_or_i64(d, d, a);
3161 }
3162 
3163 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
3164                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
3165 {
3166     static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
3167     static const GVecGen2i g[4] = {
3168         { .fni8 = tcg_gen_vec_rotl8i_i64,
3169           .fniv = tcg_gen_rotli_vec,
3170           .fno = gen_helper_gvec_rotl8i,
3171           .opt_opc = vecop_list,
3172           .vece = MO_8 },
3173         { .fni8 = tcg_gen_vec_rotl16i_i64,
3174           .fniv = tcg_gen_rotli_vec,
3175           .fno = gen_helper_gvec_rotl16i,
3176           .opt_opc = vecop_list,
3177           .vece = MO_16 },
3178         { .fni4 = tcg_gen_rotli_i32,
3179           .fniv = tcg_gen_rotli_vec,
3180           .fno = gen_helper_gvec_rotl32i,
3181           .opt_opc = vecop_list,
3182           .vece = MO_32 },
3183         { .fni8 = tcg_gen_rotli_i64,
3184           .fniv = tcg_gen_rotli_vec,
3185           .fno = gen_helper_gvec_rotl64i,
3186           .opt_opc = vecop_list,
3187           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3188           .vece = MO_64 },
3189     };
3190 
3191     tcg_debug_assert(vece <= MO_64);
3192     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3193     if (shift == 0) {
3194         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3195     } else {
3196         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3197     }
3198 }
3199 
3200 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
3201                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
3202 {
3203     tcg_debug_assert(vece <= MO_64);
3204     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3205     tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
3206                        oprsz, maxsz);
3207 }
3208 
/*
 * Specialized generation of vector shifts by a non-constant scalar.
 */
3212 
/*
 * Expansion descriptor for shifts/rotates of a vector by a scalar
 * TCGv_i32 count, consumed by do_gvec_shifts.
 */
typedef struct {
    /* Integer expansion used for MO_32 elements. */
    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
    /* Integer expansion used for MO_64 elements. */
    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
    /* Vector expansion taking the shift count as a TCGv_i32 scalar. */
    void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
    /* Vector expansion taking the shift count as a vector. */
    void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
    /* Out-of-line helpers, indexed by vece. */
    gen_helper_gvec_2 *fno[4];
    /* Opcode list passed to choose_vector_type before using fniv_s. */
    TCGOpcode s_list[2];
    /* Opcode list passed to choose_vector_type before using fniv_v. */
    TCGOpcode v_list[2];
} GVecGen2sh;
3222 
3223 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3224                            uint32_t oprsz, uint32_t tysz, TCGType type,
3225                            TCGv_i32 shift,
3226                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
3227 {
3228     for (uint32_t i = 0; i < oprsz; i += tysz) {
3229         TCGv_vec t0 = tcg_temp_new_vec(type);
3230         TCGv_vec t1 = tcg_temp_new_vec(type);
3231 
3232         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
3233         fni(vece, t1, t0, shift);
3234         tcg_gen_st_vec(t1, tcg_env, dofs + i);
3235     }
3236 }
3237 
/*
 * Expand a shift/rotate of every element of the vector at AOFS by the
 * scalar SHIFT, storing the result at DOFS.  G supplies the expansion
 * functions.  The strategies are tried in this order:
 *   1. a vector op taking the scalar shift directly (g->s_list, g->fniv_s);
 *   2. a vector op taking a vector shift, with SHIFT duplicated into a
 *      vector first (g->v_list, g->fniv_v);
 *   3. an integer expansion for MO_32 / MO_64 (g->fni4, g->fni8);
 *   4. the out-of-line helper g->fno[vece].
 * For strategies 1-3 the bytes between OPRSZ and MAXSZ are cleared at
 * clear_tail; the helper path hands MAXSZ to the helper in the
 * descriptor and returns directly.
 */
static void
do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
               uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(tcg_env, dofs, tcg_env, aofs, maxsz);

    /* If the backend has a scalar expansion, great.  */
    type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
    if (type) {
        /* Save the current vecop list; it is restored after the switch. */
        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
        switch (type) {
        case TCG_TYPE_V256:
            /* Handle whole 32-byte vectors; any remainder goes to V128. */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2sh_vec(vece, dofs, aofs, some, 32,
                           TCG_TYPE_V256, shift, g->fniv_s);
            if (some == oprsz) {
                break;
            }
            /* Advance past the part already done. */
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */
        case TCG_TYPE_V128:
            expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
                           TCG_TYPE_V128, shift, g->fniv_s);
            break;
        case TCG_TYPE_V64:
            expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
                           TCG_TYPE_V64, shift, g->fniv_s);
            break;
        default:
            g_assert_not_reached();
        }
        tcg_swap_vecop_list(hold_list);
        goto clear_tail;
    }

    /* If the backend supports variable vector shifts, also cool.  */
    type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
    if (type) {
        /* Save the current vecop list; it is restored after the switch. */
        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
        TCGv_vec v_shift = tcg_temp_new_vec(type);

        /* Duplicate the scalar shift into every element of v_shift. */
        if (vece == MO_64) {
            /* 64-bit elements need the 32-bit shift zero-extended first. */
            TCGv_i64 sh64 = tcg_temp_ebb_new_i64();
            tcg_gen_extu_i32_i64(sh64, shift);
            tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
            tcg_temp_free_i64(sh64);
        } else {
            tcg_gen_dup_i32_vec(vece, v_shift, shift);
        }

        switch (type) {
        case TCG_TYPE_V256:
            /* Handle whole 32-byte vectors; any remainder goes to V128. */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                          v_shift, false, g->fniv_v);
            if (some == oprsz) {
                break;
            }
            /* Advance past the part already done. */
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */
        case TCG_TYPE_V128:
            expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          v_shift, false, g->fniv_v);
            break;
        case TCG_TYPE_V64:
            expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          v_shift, false, g->fniv_v);
            break;
        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(v_shift);
        tcg_swap_vecop_list(hold_list);
        goto clear_tail;
    }

    /* Otherwise fall back to integral... */
    if (vece == MO_32 && check_size_impl(oprsz, 4)) {
        expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
    } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
        /* The 64-bit expansion takes a 64-bit shift operand. */
        TCGv_i64 sh64 = tcg_temp_ebb_new_i64();
        tcg_gen_extu_i32_i64(sh64, shift);
        expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
        tcg_temp_free_i64(sh64);
    } else {
        /* ... or to the out-of-line helper for this element size. */
        TCGv_ptr a0 = tcg_temp_ebb_new_ptr();
        TCGv_ptr a1 = tcg_temp_ebb_new_ptr();
        TCGv_i32 desc = tcg_temp_ebb_new_i32();

        /*
         * Build the descriptor at run time: the shift count goes above
         * SIMD_DATA_SHIFT, OR'ed with the constant oprsz/maxsz encoding.
         */
        tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
        tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
        tcg_gen_addi_ptr(a0, tcg_env, dofs);
        tcg_gen_addi_ptr(a1, tcg_env, aofs);

        g->fno[vece](a0, a1, desc);

        tcg_temp_free_ptr(a0);
        tcg_temp_free_ptr(a1);
        tcg_temp_free_i32(desc);
        /* Skip clear_tail: maxsz was handed to the helper via desc. */
        return;
    }

 clear_tail:
    /* Zero the bytes between the operation size and the maximum size. */
    if (oprsz < maxsz) {
        expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
    }
}
3355 
3356 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
3357                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3358 {
3359     static const GVecGen2sh g = {
3360         .fni4 = tcg_gen_shl_i32,
3361         .fni8 = tcg_gen_shl_i64,
3362         .fniv_s = tcg_gen_shls_vec,
3363         .fniv_v = tcg_gen_shlv_vec,
3364         .fno = {
3365             gen_helper_gvec_shl8i,
3366             gen_helper_gvec_shl16i,
3367             gen_helper_gvec_shl32i,
3368             gen_helper_gvec_shl64i,
3369         },
3370         .s_list = { INDEX_op_shls_vec, 0 },
3371         .v_list = { INDEX_op_shlv_vec, 0 },
3372     };
3373 
3374     tcg_debug_assert(vece <= MO_64);
3375     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3376 }
3377 
3378 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3379                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3380 {
3381     static const GVecGen2sh g = {
3382         .fni4 = tcg_gen_shr_i32,
3383         .fni8 = tcg_gen_shr_i64,
3384         .fniv_s = tcg_gen_shrs_vec,
3385         .fniv_v = tcg_gen_shrv_vec,
3386         .fno = {
3387             gen_helper_gvec_shr8i,
3388             gen_helper_gvec_shr16i,
3389             gen_helper_gvec_shr32i,
3390             gen_helper_gvec_shr64i,
3391         },
3392         .s_list = { INDEX_op_shrs_vec, 0 },
3393         .v_list = { INDEX_op_shrv_vec, 0 },
3394     };
3395 
3396     tcg_debug_assert(vece <= MO_64);
3397     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3398 }
3399 
3400 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
3401                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3402 {
3403     static const GVecGen2sh g = {
3404         .fni4 = tcg_gen_sar_i32,
3405         .fni8 = tcg_gen_sar_i64,
3406         .fniv_s = tcg_gen_sars_vec,
3407         .fniv_v = tcg_gen_sarv_vec,
3408         .fno = {
3409             gen_helper_gvec_sar8i,
3410             gen_helper_gvec_sar16i,
3411             gen_helper_gvec_sar32i,
3412             gen_helper_gvec_sar64i,
3413         },
3414         .s_list = { INDEX_op_sars_vec, 0 },
3415         .v_list = { INDEX_op_sarv_vec, 0 },
3416     };
3417 
3418     tcg_debug_assert(vece <= MO_64);
3419     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3420 }
3421 
3422 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
3423                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3424 {
3425     static const GVecGen2sh g = {
3426         .fni4 = tcg_gen_rotl_i32,
3427         .fni8 = tcg_gen_rotl_i64,
3428         .fniv_s = tcg_gen_rotls_vec,
3429         .fniv_v = tcg_gen_rotlv_vec,
3430         .fno = {
3431             gen_helper_gvec_rotl8i,
3432             gen_helper_gvec_rotl16i,
3433             gen_helper_gvec_rotl32i,
3434             gen_helper_gvec_rotl64i,
3435         },
3436         .s_list = { INDEX_op_rotls_vec, 0 },
3437         .v_list = { INDEX_op_rotlv_vec, 0 },
3438     };
3439 
3440     tcg_debug_assert(vece <= MO_64);
3441     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3442 }
3443 
3444 void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3445                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3446 {
3447     TCGv_i32 tmp = tcg_temp_ebb_new_i32();
3448 
3449     tcg_gen_neg_i32(tmp, shift);
3450     tcg_gen_andi_i32(tmp, tmp, (8 << vece) - 1);
3451     tcg_gen_gvec_rotls(vece, dofs, aofs, tmp, oprsz, maxsz);
3452     tcg_temp_free_i32(tmp);
3453 }
3454 
/*
 * Expand D = A << (B % element bits)
 *
 * Unlike scalar shifts, where it is easy for the target front end
 * to include the modulo as part of the expansion, the vector forms
 * have the modulo applied here.  If the target naturally includes
 * the modulo as part of the operation, great!  If the target has
 * some other behaviour from out-of-range shifts, then it could not
 * use this function anyway, and would need to do its own expansion
 * with custom functions.
 */
3465 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
3466                                  TCGv_vec a, TCGv_vec b)
3467 {
3468     TCGv_vec t = tcg_temp_new_vec_matching(d);
3469     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3470 
3471     tcg_gen_and_vec(vece, t, b, m);
3472     tcg_gen_shlv_vec(vece, d, a, t);
3473     tcg_temp_free_vec(t);
3474 }
3475 
3476 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3477 {
3478     TCGv_i32 t = tcg_temp_ebb_new_i32();
3479 
3480     tcg_gen_andi_i32(t, b, 31);
3481     tcg_gen_shl_i32(d, a, t);
3482     tcg_temp_free_i32(t);
3483 }
3484 
3485 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3486 {
3487     TCGv_i64 t = tcg_temp_ebb_new_i64();
3488 
3489     tcg_gen_andi_i64(t, b, 63);
3490     tcg_gen_shl_i64(d, a, t);
3491     tcg_temp_free_i64(t);
3492 }
3493 
3494 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3495                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3496 {
3497     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
3498     static const GVecGen3 g[4] = {
3499         { .fniv = tcg_gen_shlv_mod_vec,
3500           .fno = gen_helper_gvec_shl8v,
3501           .opt_opc = vecop_list,
3502           .vece = MO_8 },
3503         { .fniv = tcg_gen_shlv_mod_vec,
3504           .fno = gen_helper_gvec_shl16v,
3505           .opt_opc = vecop_list,
3506           .vece = MO_16 },
3507         { .fni4 = tcg_gen_shl_mod_i32,
3508           .fniv = tcg_gen_shlv_mod_vec,
3509           .fno = gen_helper_gvec_shl32v,
3510           .opt_opc = vecop_list,
3511           .vece = MO_32 },
3512         { .fni8 = tcg_gen_shl_mod_i64,
3513           .fniv = tcg_gen_shlv_mod_vec,
3514           .fno = gen_helper_gvec_shl64v,
3515           .opt_opc = vecop_list,
3516           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3517           .vece = MO_64 },
3518     };
3519 
3520     tcg_debug_assert(vece <= MO_64);
3521     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3522 }
3523 
3524 /*
3525  * Similarly for logical right shifts.
3526  */
3527 
3528 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
3529                                  TCGv_vec a, TCGv_vec b)
3530 {
3531     TCGv_vec t = tcg_temp_new_vec_matching(d);
3532     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3533 
3534     tcg_gen_and_vec(vece, t, b, m);
3535     tcg_gen_shrv_vec(vece, d, a, t);
3536     tcg_temp_free_vec(t);
3537 }
3538 
3539 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3540 {
3541     TCGv_i32 t = tcg_temp_ebb_new_i32();
3542 
3543     tcg_gen_andi_i32(t, b, 31);
3544     tcg_gen_shr_i32(d, a, t);
3545     tcg_temp_free_i32(t);
3546 }
3547 
3548 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3549 {
3550     TCGv_i64 t = tcg_temp_ebb_new_i64();
3551 
3552     tcg_gen_andi_i64(t, b, 63);
3553     tcg_gen_shr_i64(d, a, t);
3554     tcg_temp_free_i64(t);
3555 }
3556 
3557 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3558                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3559 {
3560     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
3561     static const GVecGen3 g[4] = {
3562         { .fniv = tcg_gen_shrv_mod_vec,
3563           .fno = gen_helper_gvec_shr8v,
3564           .opt_opc = vecop_list,
3565           .vece = MO_8 },
3566         { .fniv = tcg_gen_shrv_mod_vec,
3567           .fno = gen_helper_gvec_shr16v,
3568           .opt_opc = vecop_list,
3569           .vece = MO_16 },
3570         { .fni4 = tcg_gen_shr_mod_i32,
3571           .fniv = tcg_gen_shrv_mod_vec,
3572           .fno = gen_helper_gvec_shr32v,
3573           .opt_opc = vecop_list,
3574           .vece = MO_32 },
3575         { .fni8 = tcg_gen_shr_mod_i64,
3576           .fniv = tcg_gen_shrv_mod_vec,
3577           .fno = gen_helper_gvec_shr64v,
3578           .opt_opc = vecop_list,
3579           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3580           .vece = MO_64 },
3581     };
3582 
3583     tcg_debug_assert(vece <= MO_64);
3584     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3585 }
3586 
3587 /*
3588  * Similarly for arithmetic right shifts.
3589  */
3590 
3591 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3592                                  TCGv_vec a, TCGv_vec b)
3593 {
3594     TCGv_vec t = tcg_temp_new_vec_matching(d);
3595     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3596 
3597     tcg_gen_and_vec(vece, t, b, m);
3598     tcg_gen_sarv_vec(vece, d, a, t);
3599     tcg_temp_free_vec(t);
3600 }
3601 
3602 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3603 {
3604     TCGv_i32 t = tcg_temp_ebb_new_i32();
3605 
3606     tcg_gen_andi_i32(t, b, 31);
3607     tcg_gen_sar_i32(d, a, t);
3608     tcg_temp_free_i32(t);
3609 }
3610 
3611 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3612 {
3613     TCGv_i64 t = tcg_temp_ebb_new_i64();
3614 
3615     tcg_gen_andi_i64(t, b, 63);
3616     tcg_gen_sar_i64(d, a, t);
3617     tcg_temp_free_i64(t);
3618 }
3619 
3620 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3621                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3622 {
3623     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3624     static const GVecGen3 g[4] = {
3625         { .fniv = tcg_gen_sarv_mod_vec,
3626           .fno = gen_helper_gvec_sar8v,
3627           .opt_opc = vecop_list,
3628           .vece = MO_8 },
3629         { .fniv = tcg_gen_sarv_mod_vec,
3630           .fno = gen_helper_gvec_sar16v,
3631           .opt_opc = vecop_list,
3632           .vece = MO_16 },
3633         { .fni4 = tcg_gen_sar_mod_i32,
3634           .fniv = tcg_gen_sarv_mod_vec,
3635           .fno = gen_helper_gvec_sar32v,
3636           .opt_opc = vecop_list,
3637           .vece = MO_32 },
3638         { .fni8 = tcg_gen_sar_mod_i64,
3639           .fniv = tcg_gen_sarv_mod_vec,
3640           .fno = gen_helper_gvec_sar64v,
3641           .opt_opc = vecop_list,
3642           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3643           .vece = MO_64 },
3644     };
3645 
3646     tcg_debug_assert(vece <= MO_64);
3647     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3648 }
3649 
3650 /*
3651  * Similarly for rotates.
3652  */
3653 
3654 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
3655                                   TCGv_vec a, TCGv_vec b)
3656 {
3657     TCGv_vec t = tcg_temp_new_vec_matching(d);
3658     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3659 
3660     tcg_gen_and_vec(vece, t, b, m);
3661     tcg_gen_rotlv_vec(vece, d, a, t);
3662     tcg_temp_free_vec(t);
3663 }
3664 
3665 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3666 {
3667     TCGv_i32 t = tcg_temp_ebb_new_i32();
3668 
3669     tcg_gen_andi_i32(t, b, 31);
3670     tcg_gen_rotl_i32(d, a, t);
3671     tcg_temp_free_i32(t);
3672 }
3673 
3674 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3675 {
3676     TCGv_i64 t = tcg_temp_ebb_new_i64();
3677 
3678     tcg_gen_andi_i64(t, b, 63);
3679     tcg_gen_rotl_i64(d, a, t);
3680     tcg_temp_free_i64(t);
3681 }
3682 
3683 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3684                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3685 {
3686     static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
3687     static const GVecGen3 g[4] = {
3688         { .fniv = tcg_gen_rotlv_mod_vec,
3689           .fno = gen_helper_gvec_rotl8v,
3690           .opt_opc = vecop_list,
3691           .vece = MO_8 },
3692         { .fniv = tcg_gen_rotlv_mod_vec,
3693           .fno = gen_helper_gvec_rotl16v,
3694           .opt_opc = vecop_list,
3695           .vece = MO_16 },
3696         { .fni4 = tcg_gen_rotl_mod_i32,
3697           .fniv = tcg_gen_rotlv_mod_vec,
3698           .fno = gen_helper_gvec_rotl32v,
3699           .opt_opc = vecop_list,
3700           .vece = MO_32 },
3701         { .fni8 = tcg_gen_rotl_mod_i64,
3702           .fniv = tcg_gen_rotlv_mod_vec,
3703           .fno = gen_helper_gvec_rotl64v,
3704           .opt_opc = vecop_list,
3705           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3706           .vece = MO_64 },
3707     };
3708 
3709     tcg_debug_assert(vece <= MO_64);
3710     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3711 }
3712 
3713 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
3714                                   TCGv_vec a, TCGv_vec b)
3715 {
3716     TCGv_vec t = tcg_temp_new_vec_matching(d);
3717     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3718 
3719     tcg_gen_and_vec(vece, t, b, m);
3720     tcg_gen_rotrv_vec(vece, d, a, t);
3721     tcg_temp_free_vec(t);
3722 }
3723 
3724 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3725 {
3726     TCGv_i32 t = tcg_temp_ebb_new_i32();
3727 
3728     tcg_gen_andi_i32(t, b, 31);
3729     tcg_gen_rotr_i32(d, a, t);
3730     tcg_temp_free_i32(t);
3731 }
3732 
3733 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3734 {
3735     TCGv_i64 t = tcg_temp_ebb_new_i64();
3736 
3737     tcg_gen_andi_i64(t, b, 63);
3738     tcg_gen_rotr_i64(d, a, t);
3739     tcg_temp_free_i64(t);
3740 }
3741 
3742 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3743                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3744 {
3745     static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
3746     static const GVecGen3 g[4] = {
3747         { .fniv = tcg_gen_rotrv_mod_vec,
3748           .fno = gen_helper_gvec_rotr8v,
3749           .opt_opc = vecop_list,
3750           .vece = MO_8 },
3751         { .fniv = tcg_gen_rotrv_mod_vec,
3752           .fno = gen_helper_gvec_rotr16v,
3753           .opt_opc = vecop_list,
3754           .vece = MO_16 },
3755         { .fni4 = tcg_gen_rotr_mod_i32,
3756           .fniv = tcg_gen_rotrv_mod_vec,
3757           .fno = gen_helper_gvec_rotr32v,
3758           .opt_opc = vecop_list,
3759           .vece = MO_32 },
3760         { .fni8 = tcg_gen_rotr_mod_i64,
3761           .fniv = tcg_gen_rotrv_mod_vec,
3762           .fno = gen_helper_gvec_rotr64v,
3763           .opt_opc = vecop_list,
3764           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3765           .vece = MO_64 },
3766     };
3767 
3768     tcg_debug_assert(vece <= MO_64);
3769     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3770 }
3771 
3772 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
3773 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3774                            uint32_t oprsz, TCGCond cond)
3775 {
3776     TCGv_i32 t0 = tcg_temp_ebb_new_i32();
3777     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
3778     uint32_t i;
3779 
3780     for (i = 0; i < oprsz; i += 4) {
3781         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
3782         tcg_gen_ld_i32(t1, tcg_env, bofs + i);
3783         tcg_gen_negsetcond_i32(cond, t0, t0, t1);
3784         tcg_gen_st_i32(t0, tcg_env, dofs + i);
3785     }
3786     tcg_temp_free_i32(t1);
3787     tcg_temp_free_i32(t0);
3788 }
3789 
3790 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3791                            uint32_t oprsz, TCGCond cond)
3792 {
3793     TCGv_i64 t0 = tcg_temp_ebb_new_i64();
3794     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
3795     uint32_t i;
3796 
3797     for (i = 0; i < oprsz; i += 8) {
3798         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
3799         tcg_gen_ld_i64(t1, tcg_env, bofs + i);
3800         tcg_gen_negsetcond_i64(cond, t0, t0, t1);
3801         tcg_gen_st_i64(t0, tcg_env, dofs + i);
3802     }
3803     tcg_temp_free_i64(t1);
3804     tcg_temp_free_i64(t0);
3805 }
3806 
3807 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3808                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3809                            TCGType type, TCGCond cond)
3810 {
3811     for (uint32_t i = 0; i < oprsz; i += tysz) {
3812         TCGv_vec t0 = tcg_temp_new_vec(type);
3813         TCGv_vec t1 = tcg_temp_new_vec(type);
3814         TCGv_vec t2 = tcg_temp_new_vec(type);
3815 
3816         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
3817         tcg_gen_ld_vec(t1, tcg_env, bofs + i);
3818         tcg_gen_cmp_vec(cond, vece, t2, t0, t1);
3819         tcg_gen_st_vec(t2, tcg_env, dofs + i);
3820     }
3821 }
3822 
3823 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3824                       uint32_t aofs, uint32_t bofs,
3825                       uint32_t oprsz, uint32_t maxsz)
3826 {
3827     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3828     static gen_helper_gvec_3 * const eq_fn[4] = {
3829         gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3830         gen_helper_gvec_eq32, gen_helper_gvec_eq64
3831     };
3832     static gen_helper_gvec_3 * const ne_fn[4] = {
3833         gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3834         gen_helper_gvec_ne32, gen_helper_gvec_ne64
3835     };
3836     static gen_helper_gvec_3 * const lt_fn[4] = {
3837         gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3838         gen_helper_gvec_lt32, gen_helper_gvec_lt64
3839     };
3840     static gen_helper_gvec_3 * const le_fn[4] = {
3841         gen_helper_gvec_le8, gen_helper_gvec_le16,
3842         gen_helper_gvec_le32, gen_helper_gvec_le64
3843     };
3844     static gen_helper_gvec_3 * const ltu_fn[4] = {
3845         gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3846         gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3847     };
3848     static gen_helper_gvec_3 * const leu_fn[4] = {
3849         gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3850         gen_helper_gvec_leu32, gen_helper_gvec_leu64
3851     };
3852     static gen_helper_gvec_3 * const * const fns[16] = {
3853         [TCG_COND_EQ] = eq_fn,
3854         [TCG_COND_NE] = ne_fn,
3855         [TCG_COND_LT] = lt_fn,
3856         [TCG_COND_LE] = le_fn,
3857         [TCG_COND_LTU] = ltu_fn,
3858         [TCG_COND_LEU] = leu_fn,
3859     };
3860 
3861     const TCGOpcode *hold_list;
3862     TCGType type;
3863     uint32_t some;
3864 
3865     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3866     check_overlap_3(tcg_env, dofs, tcg_env, aofs, tcg_env, bofs, maxsz);
3867 
3868     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3869         do_dup(MO_8, tcg_env, dofs, oprsz, maxsz,
3870                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3871         return;
3872     }
3873 
3874     /*
3875      * Implement inline with a vector type, if possible.
3876      * Prefer integer when 64-bit host and 64-bit comparison.
3877      */
3878     hold_list = tcg_swap_vecop_list(cmp_list);
3879     type = choose_vector_type(cmp_list, vece, oprsz,
3880                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3881     switch (type) {
3882     case TCG_TYPE_V256:
3883         /* Recall that ARM SVE allows vector sizes that are not a
3884          * power of 2, but always a multiple of 16.  The intent is
3885          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3886          */
3887         some = QEMU_ALIGN_DOWN(oprsz, 32);
3888         expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3889         if (some == oprsz) {
3890             break;
3891         }
3892         dofs += some;
3893         aofs += some;
3894         bofs += some;
3895         oprsz -= some;
3896         maxsz -= some;
3897         /* fallthru */
3898     case TCG_TYPE_V128:
3899         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3900         break;
3901     case TCG_TYPE_V64:
3902         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3903         break;
3904 
3905     case 0:
3906         if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3907             expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3908         } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3909             expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3910         } else {
3911             gen_helper_gvec_3 * const *fn = fns[cond];
3912 
3913             if (fn == NULL) {
3914                 uint32_t tmp;
3915                 tmp = aofs, aofs = bofs, bofs = tmp;
3916                 cond = tcg_swap_cond(cond);
3917                 fn = fns[cond];
3918                 assert(fn != NULL);
3919             }
3920             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3921             oprsz = maxsz;
3922         }
3923         break;
3924 
3925     default:
3926         g_assert_not_reached();
3927     }
3928     tcg_swap_vecop_list(hold_list);
3929 
3930     if (oprsz < maxsz) {
3931         expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
3932     }
3933 }
3934 
3935 static void expand_cmps_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3936                             uint32_t oprsz, uint32_t tysz, TCGType type,
3937                             TCGCond cond, TCGv_vec c)
3938 {
3939     TCGv_vec t0 = tcg_temp_new_vec(type);
3940     TCGv_vec t1 = tcg_temp_new_vec(type);
3941     uint32_t i;
3942 
3943     for (i = 0; i < oprsz; i += tysz) {
3944         tcg_gen_ld_vec(t1, tcg_env, aofs + i);
3945         tcg_gen_cmp_vec(cond, vece, t0, t1, c);
3946         tcg_gen_st_vec(t0, tcg_env, dofs + i);
3947     }
3948 }
3949 
3950 void tcg_gen_gvec_cmps(TCGCond cond, unsigned vece, uint32_t dofs,
3951                        uint32_t aofs, TCGv_i64 c,
3952                        uint32_t oprsz, uint32_t maxsz)
3953 {
3954     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3955     static gen_helper_gvec_2i * const eq_fn[4] = {
3956         gen_helper_gvec_eqs8, gen_helper_gvec_eqs16,
3957         gen_helper_gvec_eqs32, gen_helper_gvec_eqs64
3958     };
3959     static gen_helper_gvec_2i * const lt_fn[4] = {
3960         gen_helper_gvec_lts8, gen_helper_gvec_lts16,
3961         gen_helper_gvec_lts32, gen_helper_gvec_lts64
3962     };
3963     static gen_helper_gvec_2i * const le_fn[4] = {
3964         gen_helper_gvec_les8, gen_helper_gvec_les16,
3965         gen_helper_gvec_les32, gen_helper_gvec_les64
3966     };
3967     static gen_helper_gvec_2i * const ltu_fn[4] = {
3968         gen_helper_gvec_ltus8, gen_helper_gvec_ltus16,
3969         gen_helper_gvec_ltus32, gen_helper_gvec_ltus64
3970     };
3971     static gen_helper_gvec_2i * const leu_fn[4] = {
3972         gen_helper_gvec_leus8, gen_helper_gvec_leus16,
3973         gen_helper_gvec_leus32, gen_helper_gvec_leus64
3974     };
3975     static gen_helper_gvec_2i * const * const fns[16] = {
3976         [TCG_COND_EQ] = eq_fn,
3977         [TCG_COND_LT] = lt_fn,
3978         [TCG_COND_LE] = le_fn,
3979         [TCG_COND_LTU] = ltu_fn,
3980         [TCG_COND_LEU] = leu_fn,
3981     };
3982 
3983     TCGType type;
3984 
3985     check_size_align(oprsz, maxsz, dofs | aofs);
3986     check_overlap_2(tcg_env, dofs, tcg_env, aofs, maxsz);
3987 
3988     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3989         do_dup(MO_8, tcg_env, dofs, oprsz, maxsz,
3990                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3991         return;
3992     }
3993 
3994     /*
3995      * Implement inline with a vector type, if possible.
3996      * Prefer integer when 64-bit host and 64-bit comparison.
3997      */
3998     type = choose_vector_type(cmp_list, vece, oprsz,
3999                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
4000     if (type != 0) {
4001         const TCGOpcode *hold_list = tcg_swap_vecop_list(cmp_list);
4002         TCGv_vec t_vec = tcg_temp_new_vec(type);
4003         uint32_t some;
4004 
4005         tcg_gen_dup_i64_vec(vece, t_vec, c);
4006         switch (type) {
4007         case TCG_TYPE_V256:
4008             some = QEMU_ALIGN_DOWN(oprsz, 32);
4009             expand_cmps_vec(vece, dofs, aofs, some, 32,
4010                             TCG_TYPE_V256, cond, t_vec);
4011             aofs += some;
4012             dofs += some;
4013             oprsz -= some;
4014             maxsz -= some;
4015             /* fallthru */
4016 
4017         case TCG_TYPE_V128:
4018             some = QEMU_ALIGN_DOWN(oprsz, 16);
4019             expand_cmps_vec(vece, dofs, aofs, some, 16,
4020                             TCG_TYPE_V128, cond, t_vec);
4021             break;
4022 
4023         case TCG_TYPE_V64:
4024             some = QEMU_ALIGN_DOWN(oprsz, 8);
4025             expand_cmps_vec(vece, dofs, aofs, some, 8,
4026                             TCG_TYPE_V64, cond, t_vec);
4027             break;
4028 
4029         default:
4030             g_assert_not_reached();
4031         }
4032         tcg_temp_free_vec(t_vec);
4033         tcg_swap_vecop_list(hold_list);
4034     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
4035         TCGv_i64 t0 = tcg_temp_ebb_new_i64();
4036         uint32_t i;
4037 
4038         for (i = 0; i < oprsz; i += 8) {
4039             tcg_gen_ld_i64(t0, tcg_env, aofs + i);
4040             tcg_gen_negsetcond_i64(cond, t0, t0, c);
4041             tcg_gen_st_i64(t0, tcg_env, dofs + i);
4042         }
4043         tcg_temp_free_i64(t0);
4044     } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
4045         TCGv_i32 t0 = tcg_temp_ebb_new_i32();
4046         TCGv_i32 t1 = tcg_temp_ebb_new_i32();
4047         uint32_t i;
4048 
4049         tcg_gen_extrl_i64_i32(t1, c);
4050         for (i = 0; i < oprsz; i += 4) {
4051             tcg_gen_ld_i32(t0, tcg_env, aofs + i);
4052             tcg_gen_negsetcond_i32(cond, t0, t0, t1);
4053             tcg_gen_st_i32(t0, tcg_env, dofs + i);
4054         }
4055         tcg_temp_free_i32(t0);
4056         tcg_temp_free_i32(t1);
4057     } else {
4058         gen_helper_gvec_2i * const *fn = fns[cond];
4059         bool inv = false;
4060 
4061         if (fn == NULL) {
4062             cond = tcg_invert_cond(cond);
4063             fn = fns[cond];
4064             assert(fn != NULL);
4065             inv = true;
4066         }
4067         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, inv, fn[vece]);
4068         return;
4069     }
4070 
4071     if (oprsz < maxsz) {
4072         expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
4073     }
4074 }
4075 
4076 void tcg_gen_gvec_cmpi(TCGCond cond, unsigned vece, uint32_t dofs,
4077                        uint32_t aofs, int64_t c,
4078                        uint32_t oprsz, uint32_t maxsz)
4079 {
4080     TCGv_i64 tmp = tcg_constant_i64(c);
4081     tcg_gen_gvec_cmps(cond, vece, dofs, aofs, tmp, oprsz, maxsz);
4082 }
4083 
4084 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
4085 {
4086     TCGv_i64 t = tcg_temp_ebb_new_i64();
4087 
4088     tcg_gen_and_i64(t, b, a);
4089     tcg_gen_andc_i64(d, c, a);
4090     tcg_gen_or_i64(d, d, t);
4091     tcg_temp_free_i64(t);
4092 }
4093 
4094 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
4095                          uint32_t bofs, uint32_t cofs,
4096                          uint32_t oprsz, uint32_t maxsz)
4097 {
4098     static const GVecGen4 g = {
4099         .fni8 = tcg_gen_bitsel_i64,
4100         .fniv = tcg_gen_bitsel_vec,
4101         .fno = gen_helper_gvec_bitsel,
4102     };
4103 
4104     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
4105 }
4106