xref: /openbmc/qemu/tcg/tcg-op-gvec.c (revision 61b01bbc)
1 /*
2  * Generic vector operation expansion
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "qemu-common.h"
22 #include "tcg.h"
23 #include "tcg-op.h"
24 #include "tcg-op-gvec.h"
25 #include "tcg-gvec-desc.h"
26 
27 #define MAX_UNROLL  4
28 
29 /* Verify vector size and alignment rules.  OFS should be the OR of all
30    of the operand offsets so that we can check them all at once.  */
31 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
32 {
33     uint32_t opr_align = oprsz >= 16 ? 15 : 7;
34     uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
35     tcg_debug_assert(oprsz > 0);
36     tcg_debug_assert(oprsz <= maxsz);
37     tcg_debug_assert((oprsz & opr_align) == 0);
38     tcg_debug_assert((maxsz & max_align) == 0);
39     tcg_debug_assert((ofs & max_align) == 0);
40 }
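/*
 * For example, oprsz == 8 with maxsz == 8 only requires the offsets to be
 * multiples of 8, whereas oprsz == 32 with maxsz == 64 requires every
 * offset and both sizes to be multiples of 16.
 */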
41 
42 /* Verify vector overlap rules for two operands.  */
43 static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
44 {
45     tcg_debug_assert(d == a || d + s <= a || a + s <= d);
46 }
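/*
 * In other words, two operands must either be exactly the same region
 * (an in-place operation) or must not overlap at all; e.g. with s == 16,
 * d == a is fine, d == a + 16 is fine, but d == a + 8 is rejected.
 */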
47 
48 /* Verify vector overlap rules for three operands.  */
49 static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
50 {
51     check_overlap_2(d, a, s);
52     check_overlap_2(d, b, s);
53     check_overlap_2(a, b, s);
54 }
55 
56 /* Verify vector overlap rules for four operands.  */
57 static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
58                             uint32_t c, uint32_t s)
59 {
60     check_overlap_2(d, a, s);
61     check_overlap_2(d, b, s);
62     check_overlap_2(d, c, s);
63     check_overlap_2(a, b, s);
64     check_overlap_2(a, c, s);
65     check_overlap_2(b, c, s);
66 }
67 
68 /* Create a descriptor from components.  */
69 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
70 {
71     uint32_t desc = 0;
72 
73     assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
74     assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
75     assert(data == sextract32(data, 0, SIMD_DATA_BITS));
76 
77     oprsz = (oprsz / 8) - 1;
78     maxsz = (maxsz / 8) - 1;
79     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
80     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
81     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
82 
83     return desc;
84 }
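/*
 * Illustrative sketch (not part of this file): an out-of-line helper
 * receives the descriptor built above as its final argument and decodes
 * it with the accessors from tcg-gvec-desc.h (simd_oprsz, simd_maxsz,
 * simd_data).  The helper name and the tail handling below are
 * hypothetical; the real helpers live in tcg-runtime-gvec.c.
 *
 *     void helper_gvec_example_add8(void *d, void *a, void *b, uint32_t desc)
 *     {
 *         intptr_t oprsz = simd_oprsz(desc);   // bytes to operate on
 *         intptr_t maxsz = simd_maxsz(desc);   // total bytes in the register
 *         intptr_t i;
 *
 *         for (i = 0; i < oprsz; i++) {
 *             ((uint8_t *)d)[i] = ((uint8_t *)a)[i] + ((uint8_t *)b)[i];
 *         }
 *         if (maxsz > oprsz) {                 // clear the balance of the register
 *             memset((char *)d + oprsz, 0, maxsz - oprsz);
 *         }
 *     }
 */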
85 
86 /* Generate a call to a gvec-style helper with two vector operands.  */
87 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
88                         uint32_t oprsz, uint32_t maxsz, int32_t data,
89                         gen_helper_gvec_2 *fn)
90 {
91     TCGv_ptr a0, a1;
92     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
93 
94     a0 = tcg_temp_new_ptr();
95     a1 = tcg_temp_new_ptr();
96 
97     tcg_gen_addi_ptr(a0, cpu_env, dofs);
98     tcg_gen_addi_ptr(a1, cpu_env, aofs);
99 
100     fn(a0, a1, desc);
101 
102     tcg_temp_free_ptr(a0);
103     tcg_temp_free_ptr(a1);
104     tcg_temp_free_i32(desc);
105 }
106 
107 /* Generate a call to a gvec-style helper with two vector operands
108    and one scalar operand.  */
109 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
110                          uint32_t oprsz, uint32_t maxsz, int32_t data,
111                          gen_helper_gvec_2i *fn)
112 {
113     TCGv_ptr a0, a1;
114     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
115 
116     a0 = tcg_temp_new_ptr();
117     a1 = tcg_temp_new_ptr();
118 
119     tcg_gen_addi_ptr(a0, cpu_env, dofs);
120     tcg_gen_addi_ptr(a1, cpu_env, aofs);
121 
122     fn(a0, a1, c, desc);
123 
124     tcg_temp_free_ptr(a0);
125     tcg_temp_free_ptr(a1);
126     tcg_temp_free_i32(desc);
127 }
128 
129 /* Generate a call to a gvec-style helper with three vector operands.  */
130 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
131                         uint32_t oprsz, uint32_t maxsz, int32_t data,
132                         gen_helper_gvec_3 *fn)
133 {
134     TCGv_ptr a0, a1, a2;
135     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
136 
137     a0 = tcg_temp_new_ptr();
138     a1 = tcg_temp_new_ptr();
139     a2 = tcg_temp_new_ptr();
140 
141     tcg_gen_addi_ptr(a0, cpu_env, dofs);
142     tcg_gen_addi_ptr(a1, cpu_env, aofs);
143     tcg_gen_addi_ptr(a2, cpu_env, bofs);
144 
145     fn(a0, a1, a2, desc);
146 
147     tcg_temp_free_ptr(a0);
148     tcg_temp_free_ptr(a1);
149     tcg_temp_free_ptr(a2);
150     tcg_temp_free_i32(desc);
151 }
152 
153 /* Generate a call to a gvec-style helper with four vector operands.  */
154 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
155                         uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
156                         int32_t data, gen_helper_gvec_4 *fn)
157 {
158     TCGv_ptr a0, a1, a2, a3;
159     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
160 
161     a0 = tcg_temp_new_ptr();
162     a1 = tcg_temp_new_ptr();
163     a2 = tcg_temp_new_ptr();
164     a3 = tcg_temp_new_ptr();
165 
166     tcg_gen_addi_ptr(a0, cpu_env, dofs);
167     tcg_gen_addi_ptr(a1, cpu_env, aofs);
168     tcg_gen_addi_ptr(a2, cpu_env, bofs);
169     tcg_gen_addi_ptr(a3, cpu_env, cofs);
170 
171     fn(a0, a1, a2, a3, desc);
172 
173     tcg_temp_free_ptr(a0);
174     tcg_temp_free_ptr(a1);
175     tcg_temp_free_ptr(a2);
176     tcg_temp_free_ptr(a3);
177     tcg_temp_free_i32(desc);
178 }
179 
180 /* Generate a call to a gvec-style helper with five vector operands.  */
181 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
182                         uint32_t cofs, uint32_t xofs, uint32_t oprsz,
183                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
184 {
185     TCGv_ptr a0, a1, a2, a3, a4;
186     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
187 
188     a0 = tcg_temp_new_ptr();
189     a1 = tcg_temp_new_ptr();
190     a2 = tcg_temp_new_ptr();
191     a3 = tcg_temp_new_ptr();
192     a4 = tcg_temp_new_ptr();
193 
194     tcg_gen_addi_ptr(a0, cpu_env, dofs);
195     tcg_gen_addi_ptr(a1, cpu_env, aofs);
196     tcg_gen_addi_ptr(a2, cpu_env, bofs);
197     tcg_gen_addi_ptr(a3, cpu_env, cofs);
198     tcg_gen_addi_ptr(a4, cpu_env, xofs);
199 
200     fn(a0, a1, a2, a3, a4, desc);
201 
202     tcg_temp_free_ptr(a0);
203     tcg_temp_free_ptr(a1);
204     tcg_temp_free_ptr(a2);
205     tcg_temp_free_ptr(a3);
206     tcg_temp_free_ptr(a4);
207     tcg_temp_free_i32(desc);
208 }
209 
210 /* Generate a call to a gvec-style helper with two vector operands
211    and an extra pointer operand.  */
212 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
213                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
214                         int32_t data, gen_helper_gvec_2_ptr *fn)
215 {
216     TCGv_ptr a0, a1;
217     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
218 
219     a0 = tcg_temp_new_ptr();
220     a1 = tcg_temp_new_ptr();
221 
222     tcg_gen_addi_ptr(a0, cpu_env, dofs);
223     tcg_gen_addi_ptr(a1, cpu_env, aofs);
224 
225     fn(a0, a1, ptr, desc);
226 
227     tcg_temp_free_ptr(a0);
228     tcg_temp_free_ptr(a1);
229     tcg_temp_free_i32(desc);
230 }
231 
232 /* Generate a call to a gvec-style helper with three vector operands
233    and an extra pointer operand.  */
234 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
235                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
236                         int32_t data, gen_helper_gvec_3_ptr *fn)
237 {
238     TCGv_ptr a0, a1, a2;
239     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
240 
241     a0 = tcg_temp_new_ptr();
242     a1 = tcg_temp_new_ptr();
243     a2 = tcg_temp_new_ptr();
244 
245     tcg_gen_addi_ptr(a0, cpu_env, dofs);
246     tcg_gen_addi_ptr(a1, cpu_env, aofs);
247     tcg_gen_addi_ptr(a2, cpu_env, bofs);
248 
249     fn(a0, a1, a2, ptr, desc);
250 
251     tcg_temp_free_ptr(a0);
252     tcg_temp_free_ptr(a1);
253     tcg_temp_free_ptr(a2);
254     tcg_temp_free_i32(desc);
255 }
256 
257 /* Generate a call to a gvec-style helper with four vector operands
258    and an extra pointer operand.  */
259 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
260                         uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
261                         uint32_t maxsz, int32_t data,
262                         gen_helper_gvec_4_ptr *fn)
263 {
264     TCGv_ptr a0, a1, a2, a3;
265     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
266 
267     a0 = tcg_temp_new_ptr();
268     a1 = tcg_temp_new_ptr();
269     a2 = tcg_temp_new_ptr();
270     a3 = tcg_temp_new_ptr();
271 
272     tcg_gen_addi_ptr(a0, cpu_env, dofs);
273     tcg_gen_addi_ptr(a1, cpu_env, aofs);
274     tcg_gen_addi_ptr(a2, cpu_env, bofs);
275     tcg_gen_addi_ptr(a3, cpu_env, cofs);
276 
277     fn(a0, a1, a2, a3, ptr, desc);
278 
279     tcg_temp_free_ptr(a0);
280     tcg_temp_free_ptr(a1);
281     tcg_temp_free_ptr(a2);
282     tcg_temp_free_ptr(a3);
283     tcg_temp_free_i32(desc);
284 }
285 
286 /* Return true if we want to expand an operation of OPRSZ bytes inline,
287    in units of LNSZ bytes.  This limits the amount of inline code.  */
288 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
289 {
290     uint32_t lnct = oprsz / lnsz;
291     return lnct >= 1 && lnct <= MAX_UNROLL;
292 }
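/*
 * For example, with MAX_UNROLL == 4, oprsz == 32 in 8-byte units gives
 * lnct == 4 and may be expanded inline, while oprsz == 64 in 8-byte units
 * gives lnct == 8 and is left to an out-of-line helper.
 */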
293 
294 static void expand_clr(uint32_t dofs, uint32_t maxsz);
295 
296 /* Duplicate C as per VECE.  */
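/* The parenthesized function name prevents expansion of the dup_const()
   macro in tcg.h, which handles compile-time-constant arguments inline and
   falls back to this out-of-line version otherwise.  */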
297 uint64_t (dup_const)(unsigned vece, uint64_t c)
298 {
299     switch (vece) {
300     case MO_8:
301         return 0x0101010101010101ull * (uint8_t)c;
302     case MO_16:
303         return 0x0001000100010001ull * (uint16_t)c;
304     case MO_32:
305         return 0x0000000100000001ull * (uint32_t)c;
306     case MO_64:
307         return c;
308     default:
309         g_assert_not_reached();
310     }
311 }
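/*
 * For example, dup_const(MO_8, 0x7f) == 0x7f7f7f7f7f7f7f7full and
 * dup_const(MO_16, 0x1234) == 0x1234123412341234ull.
 */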
312 
313 /* Duplicate IN into OUT as per VECE.  */
314 static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
315 {
316     switch (vece) {
317     case MO_8:
318         tcg_gen_ext8u_i32(out, in);
319         tcg_gen_muli_i32(out, out, 0x01010101);
320         break;
321     case MO_16:
322         tcg_gen_deposit_i32(out, in, in, 16, 16);
323         break;
324     case MO_32:
325         tcg_gen_mov_i32(out, in);
326         break;
327     default:
328         g_assert_not_reached();
329     }
330 }
331 
332 static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
333 {
334     switch (vece) {
335     case MO_8:
336         tcg_gen_ext8u_i64(out, in);
337         tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
338         break;
339     case MO_16:
340         tcg_gen_ext16u_i64(out, in);
341         tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
342         break;
343     case MO_32:
344         tcg_gen_deposit_i64(out, in, in, 32, 32);
345         break;
346     case MO_64:
347         tcg_gen_mov_i64(out, in);
348         break;
349     default:
350         g_assert_not_reached();
351     }
352 }
353 
354 /* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
355  * Only one of IN_32 or IN_64 may be set;
356  * IN_C is used if IN_32 and IN_64 are unset.
357  */
358 static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
359                    uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
360                    uint64_t in_c)
361 {
362     TCGType type;
363     TCGv_i64 t_64;
364     TCGv_i32 t_32, t_desc;
365     TCGv_ptr t_ptr;
366     uint32_t i;
367 
368     assert(vece <= (in_32 ? MO_32 : MO_64));
369     assert(in_32 == NULL || in_64 == NULL);
370 
371     /* If we're storing 0, expand oprsz to maxsz.  */
372     if (in_32 == NULL && in_64 == NULL) {
373         in_c = dup_const(vece, in_c);
374         if (in_c == 0) {
375             oprsz = maxsz;
376         }
377     }
378 
379     type = 0;
380     if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
381         type = TCG_TYPE_V256;
382     } else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
383         type = TCG_TYPE_V128;
384     } else if (TCG_TARGET_HAS_v64 && check_size_impl(oprsz, 8)
385                /* Prefer integer when 64-bit host and no variable dup.  */
386                && !(TCG_TARGET_REG_BITS == 64 && in_32 == NULL
387                     && (in_64 == NULL || vece == MO_64))) {
388         type = TCG_TYPE_V64;
389     }
390 
391     /* Implement inline with a vector type, if possible.  */
392     if (type != 0) {
393         TCGv_vec t_vec = tcg_temp_new_vec(type);
394 
395         if (in_32) {
396             tcg_gen_dup_i32_vec(vece, t_vec, in_32);
397         } else if (in_64) {
398             tcg_gen_dup_i64_vec(vece, t_vec, in_64);
399         } else {
400             switch (vece) {
401             case MO_8:
402                 tcg_gen_dup8i_vec(t_vec, in_c);
403                 break;
404             case MO_16:
405                 tcg_gen_dup16i_vec(t_vec, in_c);
406                 break;
407             case MO_32:
408                 tcg_gen_dup32i_vec(t_vec, in_c);
409                 break;
410             default:
411                 tcg_gen_dup64i_vec(t_vec, in_c);
412                 break;
413             }
414         }
415 
416         i = 0;
417         if (TCG_TARGET_HAS_v256) {
418             for (; i + 32 <= oprsz; i += 32) {
419                 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
420             }
421         }
422         if (TCG_TARGET_HAS_v128) {
423             for (; i + 16 <= oprsz; i += 16) {
424                 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
425             }
426         }
427         if (TCG_TARGET_HAS_v64) {
428             for (; i < oprsz; i += 8) {
429                 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
430             }
431         }
432         tcg_temp_free_vec(t_vec);
433         goto done;
434     }
435 
436     /* Otherwise, inline with an integer type, unless "large".  */
437     if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
438         t_64 = NULL;
439         t_32 = NULL;
440 
441         if (in_32) {
442             /* We are given a 32-bit variable input.  For a 64-bit host,
443                use a 64-bit operation unless the 32-bit operation would
444                be simple enough.  */
445             if (TCG_TARGET_REG_BITS == 64
446                 && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
447                 t_64 = tcg_temp_new_i64();
448                 tcg_gen_extu_i32_i64(t_64, in_32);
449                 gen_dup_i64(vece, t_64, t_64);
450             } else {
451                 t_32 = tcg_temp_new_i32();
452                 gen_dup_i32(vece, t_32, in_32);
453             }
454         } else if (in_64) {
455             /* We are given a 64-bit variable input.  */
456             t_64 = tcg_temp_new_i64();
457             gen_dup_i64(vece, t_64, in_64);
458         } else {
459             /* We are given a constant input.  */
460             /* For 64-bit hosts, use 64-bit constants for "simple" constants
461                or when we'd need too many 32-bit stores, or when a 64-bit
462                constant is really required.  */
463             if (vece == MO_64
464                 || (TCG_TARGET_REG_BITS == 64
465                     && (in_c == 0 || in_c == -1
466                         || !check_size_impl(oprsz, 4)))) {
467                 t_64 = tcg_const_i64(in_c);
468             } else {
469                 t_32 = tcg_const_i32(in_c);
470             }
471         }
472 
473         /* Implement inline if we picked an implementation size above.  */
474         if (t_32) {
475             for (i = 0; i < oprsz; i += 4) {
476                 tcg_gen_st_i32(t_32, cpu_env, dofs + i);
477             }
478             tcg_temp_free_i32(t_32);
479             goto done;
480         }
481         if (t_64) {
482             for (i = 0; i < oprsz; i += 8) {
483                 tcg_gen_st_i64(t_64, cpu_env, dofs + i);
484             }
485             tcg_temp_free_i64(t_64);
486             goto done;
487         }
488     }
489 
490     /* Otherwise implement out of line.  */
491     t_ptr = tcg_temp_new_ptr();
492     tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
493     t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));
494 
495     if (vece == MO_64) {
496         if (in_64) {
497             gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
498         } else {
499             t_64 = tcg_const_i64(in_c);
500             gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
501             tcg_temp_free_i64(t_64);
502         }
503     } else {
504         typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
505         static dup_fn * const fns[3] = {
506             gen_helper_gvec_dup8,
507             gen_helper_gvec_dup16,
508             gen_helper_gvec_dup32
509         };
510 
511         if (in_32) {
512             fns[vece](t_ptr, t_desc, in_32);
513         } else {
514             t_32 = tcg_temp_new_i32();
515             if (in_64) {
516                 tcg_gen_extrl_i64_i32(t_32, in_64);
517             } else if (vece == MO_8) {
518                 tcg_gen_movi_i32(t_32, in_c & 0xff);
519             } else if (vece == MO_16) {
520                 tcg_gen_movi_i32(t_32, in_c & 0xffff);
521             } else {
522                 tcg_gen_movi_i32(t_32, in_c);
523             }
524             fns[vece](t_ptr, t_desc, t_32);
525             tcg_temp_free_i32(t_32);
526         }
527     }
528 
529     tcg_temp_free_ptr(t_ptr);
530     tcg_temp_free_i32(t_desc);
531     return;
532 
533  done:
534     if (oprsz < maxsz) {
535         expand_clr(dofs + oprsz, maxsz - oprsz);
536     }
537 }
538 
539 /* Likewise, but storing a constant zero: clear MAXSZ bytes at DOFS.  */
540 static void expand_clr(uint32_t dofs, uint32_t maxsz)
541 {
542     do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
543 }
544 
545 /* Expand OPRSZ bytes worth of two-operand operations using i32 elements.  */
546 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
547                          void (*fni)(TCGv_i32, TCGv_i32))
548 {
549     TCGv_i32 t0 = tcg_temp_new_i32();
550     uint32_t i;
551 
552     for (i = 0; i < oprsz; i += 4) {
553         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
554         fni(t0, t0);
555         tcg_gen_st_i32(t0, cpu_env, dofs + i);
556     }
557     tcg_temp_free_i32(t0);
558 }
559 
560 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
561                           int32_t c, bool load_dest,
562                           void (*fni)(TCGv_i32, TCGv_i32, int32_t))
563 {
564     TCGv_i32 t0 = tcg_temp_new_i32();
565     TCGv_i32 t1 = tcg_temp_new_i32();
566     uint32_t i;
567 
568     for (i = 0; i < oprsz; i += 4) {
569         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
570         if (load_dest) {
571             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
572         }
573         fni(t1, t0, c);
574         tcg_gen_st_i32(t1, cpu_env, dofs + i);
575     }
576     tcg_temp_free_i32(t0);
577     tcg_temp_free_i32(t1);
578 }
579 
580 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
581                           TCGv_i32 c, bool scalar_first,
582                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
583 {
584     TCGv_i32 t0 = tcg_temp_new_i32();
585     TCGv_i32 t1 = tcg_temp_new_i32();
586     uint32_t i;
587 
588     for (i = 0; i < oprsz; i += 4) {
589         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
590         if (scalar_first) {
591             fni(t1, c, t0);
592         } else {
593             fni(t1, t0, c);
594         }
595         tcg_gen_st_i32(t1, cpu_env, dofs + i);
596     }
597     tcg_temp_free_i32(t0);
598     tcg_temp_free_i32(t1);
599 }
600 
601 /* Expand OPRSZ bytes worth of three-operand operations using i32 elements.  */
602 static void expand_3_i32(uint32_t dofs, uint32_t aofs,
603                          uint32_t bofs, uint32_t oprsz, bool load_dest,
604                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
605 {
606     TCGv_i32 t0 = tcg_temp_new_i32();
607     TCGv_i32 t1 = tcg_temp_new_i32();
608     TCGv_i32 t2 = tcg_temp_new_i32();
609     uint32_t i;
610 
611     for (i = 0; i < oprsz; i += 4) {
612         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
613         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
614         if (load_dest) {
615             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
616         }
617         fni(t2, t0, t1);
618         tcg_gen_st_i32(t2, cpu_env, dofs + i);
619     }
620     tcg_temp_free_i32(t2);
621     tcg_temp_free_i32(t1);
622     tcg_temp_free_i32(t0);
623 }
624 
625 /* Expand OPRSZ bytes worth of four-operand operations using i32 elements.  */
626 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
627                          uint32_t cofs, uint32_t oprsz,
628                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
629 {
630     TCGv_i32 t0 = tcg_temp_new_i32();
631     TCGv_i32 t1 = tcg_temp_new_i32();
632     TCGv_i32 t2 = tcg_temp_new_i32();
633     TCGv_i32 t3 = tcg_temp_new_i32();
634     uint32_t i;
635 
636     for (i = 0; i < oprsz; i += 4) {
637         tcg_gen_ld_i32(t1, cpu_env, aofs + i);
638         tcg_gen_ld_i32(t2, cpu_env, bofs + i);
639         tcg_gen_ld_i32(t3, cpu_env, cofs + i);
640         fni(t0, t1, t2, t3);
641         tcg_gen_st_i32(t0, cpu_env, dofs + i);
642     }
643     tcg_temp_free_i32(t3);
644     tcg_temp_free_i32(t2);
645     tcg_temp_free_i32(t1);
646     tcg_temp_free_i32(t0);
647 }
648 
649 /* Expand OPRSZ bytes worth of two-operand operations using i64 elements.  */
650 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
651                          void (*fni)(TCGv_i64, TCGv_i64))
652 {
653     TCGv_i64 t0 = tcg_temp_new_i64();
654     uint32_t i;
655 
656     for (i = 0; i < oprsz; i += 8) {
657         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
658         fni(t0, t0);
659         tcg_gen_st_i64(t0, cpu_env, dofs + i);
660     }
661     tcg_temp_free_i64(t0);
662 }
663 
664 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
665                           int64_t c, bool load_dest,
666                           void (*fni)(TCGv_i64, TCGv_i64, int64_t))
667 {
668     TCGv_i64 t0 = tcg_temp_new_i64();
669     TCGv_i64 t1 = tcg_temp_new_i64();
670     uint32_t i;
671 
672     for (i = 0; i < oprsz; i += 8) {
673         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
674         if (load_dest) {
675             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
676         }
677         fni(t1, t0, c);
678         tcg_gen_st_i64(t1, cpu_env, dofs + i);
679     }
680     tcg_temp_free_i64(t0);
681     tcg_temp_free_i64(t1);
682 }
683 
684 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
685                           TCGv_i64 c, bool scalar_first,
686                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
687 {
688     TCGv_i64 t0 = tcg_temp_new_i64();
689     TCGv_i64 t1 = tcg_temp_new_i64();
690     uint32_t i;
691 
692     for (i = 0; i < oprsz; i += 8) {
693         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
694         if (scalar_first) {
695             fni(t1, c, t0);
696         } else {
697             fni(t1, t0, c);
698         }
699         tcg_gen_st_i64(t1, cpu_env, dofs + i);
700     }
701     tcg_temp_free_i64(t0);
702     tcg_temp_free_i64(t1);
703 }
704 
705 /* Expand OPRSZ bytes worth of three-operand operations using i64 elements.  */
706 static void expand_3_i64(uint32_t dofs, uint32_t aofs,
707                          uint32_t bofs, uint32_t oprsz, bool load_dest,
708                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
709 {
710     TCGv_i64 t0 = tcg_temp_new_i64();
711     TCGv_i64 t1 = tcg_temp_new_i64();
712     TCGv_i64 t2 = tcg_temp_new_i64();
713     uint32_t i;
714 
715     for (i = 0; i < oprsz; i += 8) {
716         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
717         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
718         if (load_dest) {
719             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
720         }
721         fni(t2, t0, t1);
722         tcg_gen_st_i64(t2, cpu_env, dofs + i);
723     }
724     tcg_temp_free_i64(t2);
725     tcg_temp_free_i64(t1);
726     tcg_temp_free_i64(t0);
727 }
728 
729 /* Expand OPRSZ bytes worth of four-operand operations using i64 elements.  */
730 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
731                          uint32_t cofs, uint32_t oprsz,
732                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
733 {
734     TCGv_i64 t0 = tcg_temp_new_i64();
735     TCGv_i64 t1 = tcg_temp_new_i64();
736     TCGv_i64 t2 = tcg_temp_new_i64();
737     TCGv_i64 t3 = tcg_temp_new_i64();
738     uint32_t i;
739 
740     for (i = 0; i < oprsz; i += 8) {
741         tcg_gen_ld_i64(t1, cpu_env, aofs + i);
742         tcg_gen_ld_i64(t2, cpu_env, bofs + i);
743         tcg_gen_ld_i64(t3, cpu_env, cofs + i);
744         fni(t0, t1, t2, t3);
745         tcg_gen_st_i64(t0, cpu_env, dofs + i);
746     }
747     tcg_temp_free_i64(t3);
748     tcg_temp_free_i64(t2);
749     tcg_temp_free_i64(t1);
750     tcg_temp_free_i64(t0);
751 }
752 
753 /* Expand OPRSZ bytes worth of two-operand operations using host vectors.  */
754 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
755                          uint32_t oprsz, uint32_t tysz, TCGType type,
756                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
757 {
758     TCGv_vec t0 = tcg_temp_new_vec(type);
759     uint32_t i;
760 
761     for (i = 0; i < oprsz; i += tysz) {
762         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
763         fni(vece, t0, t0);
764         tcg_gen_st_vec(t0, cpu_env, dofs + i);
765     }
766     tcg_temp_free_vec(t0);
767 }
768 
769 /* Expand OPRSZ bytes worth of operations on two vector operands and an
770    immediate operand, using host vectors.  */
771 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
772                           uint32_t oprsz, uint32_t tysz, TCGType type,
773                           int64_t c, bool load_dest,
774                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
775 {
776     TCGv_vec t0 = tcg_temp_new_vec(type);
777     TCGv_vec t1 = tcg_temp_new_vec(type);
778     uint32_t i;
779 
780     for (i = 0; i < oprsz; i += tysz) {
781         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
782         if (load_dest) {
783             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
784         }
785         fni(vece, t1, t0, c);
786         tcg_gen_st_vec(t1, cpu_env, dofs + i);
787     }
788     tcg_temp_free_vec(t0);
789     tcg_temp_free_vec(t1);
790 }
791 
792 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
793                           uint32_t oprsz, uint32_t tysz, TCGType type,
794                           TCGv_vec c, bool scalar_first,
795                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
796 {
797     TCGv_vec t0 = tcg_temp_new_vec(type);
798     TCGv_vec t1 = tcg_temp_new_vec(type);
799     uint32_t i;
800 
801     for (i = 0; i < oprsz; i += tysz) {
802         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
803         if (scalar_first) {
804             fni(vece, t1, c, t0);
805         } else {
806             fni(vece, t1, t0, c);
807         }
808         tcg_gen_st_vec(t1, cpu_env, dofs + i);
809     }
810     tcg_temp_free_vec(t0);
811     tcg_temp_free_vec(t1);
812 }
813 
814 /* Expand OPRSZ bytes worth of three-operand operations using host vectors.  */
815 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
816                          uint32_t bofs, uint32_t oprsz,
817                          uint32_t tysz, TCGType type, bool load_dest,
818                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
819 {
820     TCGv_vec t0 = tcg_temp_new_vec(type);
821     TCGv_vec t1 = tcg_temp_new_vec(type);
822     TCGv_vec t2 = tcg_temp_new_vec(type);
823     uint32_t i;
824 
825     for (i = 0; i < oprsz; i += tysz) {
826         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
827         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
828         if (load_dest) {
829             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
830         }
831         fni(vece, t2, t0, t1);
832         tcg_gen_st_vec(t2, cpu_env, dofs + i);
833     }
834     tcg_temp_free_vec(t2);
835     tcg_temp_free_vec(t1);
836     tcg_temp_free_vec(t0);
837 }
838 
839 /* Expand OPRSZ bytes worth of four-operand operations using host vectors.  */
840 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
841                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
842                          uint32_t tysz, TCGType type,
843                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
844                                      TCGv_vec, TCGv_vec))
845 {
846     TCGv_vec t0 = tcg_temp_new_vec(type);
847     TCGv_vec t1 = tcg_temp_new_vec(type);
848     TCGv_vec t2 = tcg_temp_new_vec(type);
849     TCGv_vec t3 = tcg_temp_new_vec(type);
850     uint32_t i;
851 
852     for (i = 0; i < oprsz; i += tysz) {
853         tcg_gen_ld_vec(t1, cpu_env, aofs + i);
854         tcg_gen_ld_vec(t2, cpu_env, bofs + i);
855         tcg_gen_ld_vec(t3, cpu_env, cofs + i);
856         fni(vece, t0, t1, t2, t3);
857         tcg_gen_st_vec(t0, cpu_env, dofs + i);
858     }
859     tcg_temp_free_vec(t3);
860     tcg_temp_free_vec(t2);
861     tcg_temp_free_vec(t1);
862     tcg_temp_free_vec(t0);
863 }
864 
865 /* Expand a vector two-operand operation.  */
866 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
867                     uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
868 {
869     check_size_align(oprsz, maxsz, dofs | aofs);
870     check_overlap_2(dofs, aofs, maxsz);
871 
872     /* Recall that ARM SVE allows vector sizes that are not a power of 2.
873        Expand with successively smaller host vector sizes.  The intent is
874        that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
875     /* ??? For maxsz > oprsz, the host may be able to use an opr-sized
876        operation, zeroing the balance of the register.  We can then
877        use a max-sized store to implement the clearing without an extra
878        store operation.  This is true for aarch64 and x86_64 hosts.  */
879 
880     if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
881         && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
882         uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
883         expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
884         if (some == oprsz) {
885             goto done;
886         }
887         dofs += some;
888         aofs += some;
889         oprsz -= some;
890         maxsz -= some;
891     }
892 
893     if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
894         && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
895         expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
896     } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
897                && g->fniv && check_size_impl(oprsz, 8)
898                && (!g->opc
899                    || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
900         expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
901     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
902         expand_2_i64(dofs, aofs, oprsz, g->fni8);
903     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
904         expand_2_i32(dofs, aofs, oprsz, g->fni4);
905     } else {
906         assert(g->fno != NULL);
907         tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
908         return;
909     }
910 
911  done:
912     if (oprsz < maxsz) {
913         expand_clr(dofs + oprsz, maxsz - oprsz);
914     }
915 }
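/*
 * Illustrative sketch (hypothetical, not part of this file): a front end
 * describes a two-operand operation once and lets tcg_gen_gvec_2() choose
 * between host-vector, 64-bit integer and out-of-line expansion.  The name
 * gen_helper_gvec_example is a stand-in for a real helper the front end
 * would provide; see tcg_gen_gvec_not() below for a concrete instance of
 * this pattern.
 *
 *     static const GVecGen2 example_op = {
 *         .fni8 = tcg_gen_not_i64,           // 64-bit integer fallback
 *         .fniv = tcg_gen_not_vec,           // host vector expansion
 *         .fno = gen_helper_gvec_example,    // out-of-line fallback (hypothetical)
 *         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
 *     };
 *     tcg_gen_gvec_2(dofs, aofs, 16, 16, &example_op);
 */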
916 
917 /* Expand a vector operation with two vectors and an immediate.  */
918 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
919                      uint32_t maxsz, int64_t c, const GVecGen2i *g)
920 {
921     check_size_align(oprsz, maxsz, dofs | aofs);
922     check_overlap_2(dofs, aofs, maxsz);
923 
924     /* Recall that ARM SVE allows vector sizes that are not a power of 2.
925        Expand with successively smaller host vector sizes.  The intent is
926        that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
927 
928     if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
929         && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
930         uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
931         expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
932                       c, g->load_dest, g->fniv);
933         if (some == oprsz) {
934             goto done;
935         }
936         dofs += some;
937         aofs += some;
938         oprsz -= some;
939         maxsz -= some;
940     }
941 
942     if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
943         && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
944         expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
945                       c, g->load_dest, g->fniv);
946     } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
947                && g->fniv && check_size_impl(oprsz, 8)
948                && (!g->opc
949                    || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
950         expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
951                       c, g->load_dest, g->fniv);
952     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
953         expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
954     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
955         expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
956     } else {
957         if (g->fno) {
958             tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
959         } else {
960             TCGv_i64 tcg_c = tcg_const_i64(c);
961             tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz, maxsz, c, g->fnoi);
962             tcg_temp_free_i64(tcg_c);
963         }
964         return;
965     }
966 
967  done:
968     if (oprsz < maxsz) {
969         expand_clr(dofs + oprsz, maxsz - oprsz);
970     }
971 }
972 
973 /* Expand a vector operation with two vectors and a scalar.  */
974 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
975                      uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
976 {
977     TCGType type;
978 
979     check_size_align(oprsz, maxsz, dofs | aofs);
980     check_overlap_2(dofs, aofs, maxsz);
981 
982     type = 0;
983     if (g->fniv) {
984         if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
985             type = TCG_TYPE_V256;
986         } else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
987             type = TCG_TYPE_V128;
988         } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
989                && check_size_impl(oprsz, 8)) {
990             type = TCG_TYPE_V64;
991         }
992     }
993     if (type != 0) {
994         TCGv_vec t_vec = tcg_temp_new_vec(type);
995 
996         tcg_gen_dup_i64_vec(g->vece, t_vec, c);
997 
998         /* Recall that ARM SVE allows vector sizes that are not a power of 2.
999            Expand with successively smaller host vector sizes.  The intent is
1000            that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
1001         switch (type) {
1002         case TCG_TYPE_V256:
1003             {
1004                 uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
1005                 expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1006                               t_vec, g->scalar_first, g->fniv);
1007                 if (some == oprsz) {
1008                     break;
1009                 }
1010                 dofs += some;
1011                 aofs += some;
1012                 oprsz -= some;
1013                 maxsz -= some;
1014             }
1015             /* fallthru */
1016 
1017         case TCG_TYPE_V128:
1018             expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1019                           t_vec, g->scalar_first, g->fniv);
1020             break;
1021 
1022         case TCG_TYPE_V64:
1023             expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1024                           t_vec, g->scalar_first, g->fniv);
1025             break;
1026 
1027         default:
1028             g_assert_not_reached();
1029         }
1030         tcg_temp_free_vec(t_vec);
1031     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1032         TCGv_i64 t64 = tcg_temp_new_i64();
1033 
1034         gen_dup_i64(g->vece, t64, c);
1035         expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1036         tcg_temp_free_i64(t64);
1037     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1038         TCGv_i32 t32 = tcg_temp_new_i32();
1039 
1040         tcg_gen_extrl_i64_i32(t32, c);
1041         gen_dup_i32(g->vece, t32, t32);
1042         expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1043         tcg_temp_free_i32(t32);
1044     } else {
1045         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1046         return;
1047     }
1048 
1049     if (oprsz < maxsz) {
1050         expand_clr(dofs + oprsz, maxsz - oprsz);
1051     }
1052 }
1053 
1054 /* Expand a vector three-operand operation.  */
1055 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1056                     uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1057 {
1058     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1059     check_overlap_3(dofs, aofs, bofs, maxsz);
1060 
1061     /* Recall that ARM SVE allows vector sizes that are not a power of 2.
1062        Expand with successively smaller host vector sizes.  The intent is
1063        that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
1064 
1065     if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
1066         && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
1067         uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
1068         expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1069                      g->load_dest, g->fniv);
1070         if (some == oprsz) {
1071             goto done;
1072         }
1073         dofs += some;
1074         aofs += some;
1075         bofs += some;
1076         oprsz -= some;
1077         maxsz -= some;
1078     }
1079 
1080     if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
1081         && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
1082         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1083                      g->load_dest, g->fniv);
1084     } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
1085                && g->fniv && check_size_impl(oprsz, 8)
1086                && (!g->opc
1087                    || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
1088         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1089                      g->load_dest, g->fniv);
1090     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1091         expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1092     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1093         expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1094     } else {
1095         assert(g->fno != NULL);
1096         tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, g->data, g->fno);
1097     }
1098 
1099  done:
1100     if (oprsz < maxsz) {
1101         expand_clr(dofs + oprsz, maxsz - oprsz);
1102     }
1103 }
1104 
1105 /* Expand a vector four-operand operation.  */
1106 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1107                     uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1108 {
1109     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1110     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1111 
1112     /* Recall that ARM SVE allows vector sizes that are not a power of 2.
1113        Expand with successively smaller host vector sizes.  The intent is
1114        that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
1115 
1116     if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
1117         && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
1118         uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
1119         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1120                      32, TCG_TYPE_V256, g->fniv);
1121         if (some == oprsz) {
1122             goto done;
1123         }
1124         dofs += some;
1125         aofs += some;
1126         bofs += some;
1127         cofs += some;
1128         oprsz -= some;
1129         maxsz -= some;
1130     }
1131 
1132     if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
1133         && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
1134         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1135                      16, TCG_TYPE_V128, g->fniv);
1136     } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
1137                && g->fniv && check_size_impl(oprsz, 8)
1138                 && (!g->opc
1139                     || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
1140         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1141                      8, TCG_TYPE_V64, g->fniv);
1142     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1143         expand_4_i64(dofs, aofs, bofs, cofs, oprsz, g->fni8);
1144     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1145         expand_4_i32(dofs, aofs, bofs, cofs, oprsz, g->fni4);
1146     } else {
1147         assert(g->fno != NULL);
1148         tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1149                            oprsz, maxsz, g->data, g->fno);
1150         return;
1151     }
1152 
1153  done:
1154     if (oprsz < maxsz) {
1155         expand_clr(dofs + oprsz, maxsz - oprsz);
1156     }
1157 }
1158 
1159 /*
1160  * Expand specific vector operations.
1161  */
1162 
1163 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
1164 {
1165     tcg_gen_mov_vec(a, b);
1166 }
1167 
1168 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1169                       uint32_t oprsz, uint32_t maxsz)
1170 {
1171     static const GVecGen2 g = {
1172         .fni8 = tcg_gen_mov_i64,
1173         .fniv = vec_mov2,
1174         .fno = gen_helper_gvec_mov,
1175         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1176     };
1177     if (dofs != aofs) {
1178         tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1179     } else {
1180         check_size_align(oprsz, maxsz, dofs);
1181         if (oprsz < maxsz) {
1182             expand_clr(dofs + oprsz, maxsz - oprsz);
1183         }
1184     }
1185 }
1186 
1187 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
1188                           uint32_t maxsz, TCGv_i32 in)
1189 {
1190     check_size_align(oprsz, maxsz, dofs);
1191     tcg_debug_assert(vece <= MO_32);
1192     do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1193 }
1194 
1195 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
1196                           uint32_t maxsz, TCGv_i64 in)
1197 {
1198     check_size_align(oprsz, maxsz, dofs);
1199     tcg_debug_assert(vece <= MO_64);
1200     do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1201 }
1202 
1203 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1204                           uint32_t oprsz, uint32_t maxsz)
1205 {
1206     if (vece <= MO_32) {
1207         TCGv_i32 in = tcg_temp_new_i32();
1208         switch (vece) {
1209         case MO_8:
1210             tcg_gen_ld8u_i32(in, cpu_env, aofs);
1211             break;
1212         case MO_16:
1213             tcg_gen_ld16u_i32(in, cpu_env, aofs);
1214             break;
1215         case MO_32:
1216             tcg_gen_ld_i32(in, cpu_env, aofs);
1217             break;
1218         }
1219         tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in);
1220         tcg_temp_free_i32(in);
1221     } else if (vece == MO_64) {
1222         TCGv_i64 in = tcg_temp_new_i64();
1223         tcg_gen_ld_i64(in, cpu_env, aofs);
1224         tcg_gen_gvec_dup_i64(MO_64, dofs, oprsz, maxsz, in);
1225         tcg_temp_free_i64(in);
1226     } else {
1227         /* 128-bit duplicate.  */
1228         /* ??? Dup to 256-bit vector.  */
1229         int i;
1230 
1231         tcg_debug_assert(vece == 4);
1232         tcg_debug_assert(oprsz >= 16);
1233         if (TCG_TARGET_HAS_v128) {
1234             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1235 
1236             tcg_gen_ld_vec(in, cpu_env, aofs);
1237             for (i = 0; i < oprsz; i += 16) {
1238                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1239             }
1240             tcg_temp_free_vec(in);
1241         } else {
1242             TCGv_i64 in0 = tcg_temp_new_i64();
1243             TCGv_i64 in1 = tcg_temp_new_i64();
1244 
1245             tcg_gen_ld_i64(in0, cpu_env, aofs);
1246             tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
1247             for (i = 0; i < oprsz; i += 16) {
1248                 tcg_gen_st_i64(in0, cpu_env, dofs + i);
1249                 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
1250             }
1251             tcg_temp_free_i64(in0);
1252             tcg_temp_free_i64(in1);
1253         }
1254     }
1255 }
1256 
1257 void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
1258                          uint32_t maxsz, uint64_t x)
1259 {
1260     check_size_align(oprsz, maxsz, dofs);
1261     do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
1262 }
1263 
1264 void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
1265                          uint32_t maxsz, uint32_t x)
1266 {
1267     check_size_align(oprsz, maxsz, dofs);
1268     do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
1269 }
1270 
1271 void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
1272                          uint32_t maxsz, uint16_t x)
1273 {
1274     check_size_align(oprsz, maxsz, dofs);
1275     do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
1276 }
1277 
1278 void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
1279                          uint32_t maxsz, uint8_t x)
1280 {
1281     check_size_align(oprsz, maxsz, dofs);
1282     do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
1283 }
1284 
1285 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1286                       uint32_t oprsz, uint32_t maxsz)
1287 {
1288     static const GVecGen2 g = {
1289         .fni8 = tcg_gen_not_i64,
1290         .fniv = tcg_gen_not_vec,
1291         .fno = gen_helper_gvec_not,
1292         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1293     };
1294     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1295 }
1296 
1297 /* Perform a vector addition using normal addition and a mask.  The mask
1298    should be the sign bit of each lane.  This 6-operation form is more
1299    efficient than separate additions when there are 4 or more lanes in
1300    the 64-bit operation.  */
1301 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1302 {
1303     TCGv_i64 t1 = tcg_temp_new_i64();
1304     TCGv_i64 t2 = tcg_temp_new_i64();
1305     TCGv_i64 t3 = tcg_temp_new_i64();
1306 
1307     tcg_gen_andc_i64(t1, a, m);
1308     tcg_gen_andc_i64(t2, b, m);
1309     tcg_gen_xor_i64(t3, a, b);
1310     tcg_gen_add_i64(d, t1, t2);
1311     tcg_gen_and_i64(t3, t3, m);
1312     tcg_gen_xor_i64(d, d, t3);
1313 
1314     tcg_temp_free_i64(t1);
1315     tcg_temp_free_i64(t2);
1316     tcg_temp_free_i64(t3);
1317 }
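/*
 * The masking above keeps carries from crossing lane boundaries: clearing
 * the sign bit of each lane before the add means no lane can carry into
 * its neighbour, and the final xor restores the correct sign bit of each
 * lane from a ^ b and the carry that reached it.  A host-side equivalent
 * for 8-bit lanes (illustrative only) is:
 *
 *     uint64_t swar_add8(uint64_t a, uint64_t b)
 *     {
 *         uint64_t m = 0x8080808080808080ull;
 *         return ((a & ~m) + (b & ~m)) ^ ((a ^ b) & m);
 *     }
 */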
1318 
1319 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1320 {
1321     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1322     gen_addv_mask(d, a, b, m);
1323     tcg_temp_free_i64(m);
1324 }
1325 
1326 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1327 {
1328     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1329     gen_addv_mask(d, a, b, m);
1330     tcg_temp_free_i64(m);
1331 }
1332 
1333 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1334 {
1335     TCGv_i64 t1 = tcg_temp_new_i64();
1336     TCGv_i64 t2 = tcg_temp_new_i64();
1337 
1338     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1339     tcg_gen_add_i64(t2, a, b);
1340     tcg_gen_add_i64(t1, t1, b);
1341     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1342 
1343     tcg_temp_free_i64(t1);
1344     tcg_temp_free_i64(t2);
1345 }
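/* With only two 32-bit lanes it is cheaper to avoid the mask trick:
   t2 = a + b gives an exact low lane; t1 = (a & ~0xffffffff) + b gives an
   exact high lane, because the low half of the masked addend is zero and
   so cannot generate a carry into the high half; the deposit then merges
   the two halves.  */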
1346 
1347 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1348                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1349 {
1350     static const GVecGen3 g[4] = {
1351         { .fni8 = tcg_gen_vec_add8_i64,
1352           .fniv = tcg_gen_add_vec,
1353           .fno = gen_helper_gvec_add8,
1354           .opc = INDEX_op_add_vec,
1355           .vece = MO_8 },
1356         { .fni8 = tcg_gen_vec_add16_i64,
1357           .fniv = tcg_gen_add_vec,
1358           .fno = gen_helper_gvec_add16,
1359           .opc = INDEX_op_add_vec,
1360           .vece = MO_16 },
1361         { .fni4 = tcg_gen_add_i32,
1362           .fniv = tcg_gen_add_vec,
1363           .fno = gen_helper_gvec_add32,
1364           .opc = INDEX_op_add_vec,
1365           .vece = MO_32 },
1366         { .fni8 = tcg_gen_add_i64,
1367           .fniv = tcg_gen_add_vec,
1368           .fno = gen_helper_gvec_add64,
1369           .opc = INDEX_op_add_vec,
1370           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1371           .vece = MO_64 },
1372     };
1373 
1374     tcg_debug_assert(vece <= MO_64);
1375     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1376 }
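/*
 * Illustrative sketch (hypothetical offsets): a front end emitting a
 * 16-byte vector add of 32-bit lanes, where vreg_ofs() stands in for the
 * target-specific offsetof() into CPUArchState.
 *
 *     tcg_gen_gvec_add(MO_32, vreg_ofs(rd), vreg_ofs(rn), vreg_ofs(rm),
 *                      16, 16);
 */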
1377 
1378 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1379                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1380 {
1381     static const GVecGen2s g[4] = {
1382         { .fni8 = tcg_gen_vec_add8_i64,
1383           .fniv = tcg_gen_add_vec,
1384           .fno = gen_helper_gvec_adds8,
1385           .opc = INDEX_op_add_vec,
1386           .vece = MO_8 },
1387         { .fni8 = tcg_gen_vec_add16_i64,
1388           .fniv = tcg_gen_add_vec,
1389           .fno = gen_helper_gvec_adds16,
1390           .opc = INDEX_op_add_vec,
1391           .vece = MO_16 },
1392         { .fni4 = tcg_gen_add_i32,
1393           .fniv = tcg_gen_add_vec,
1394           .fno = gen_helper_gvec_adds32,
1395           .opc = INDEX_op_add_vec,
1396           .vece = MO_32 },
1397         { .fni8 = tcg_gen_add_i64,
1398           .fniv = tcg_gen_add_vec,
1399           .fno = gen_helper_gvec_adds64,
1400           .opc = INDEX_op_add_vec,
1401           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1402           .vece = MO_64 },
1403     };
1404 
1405     tcg_debug_assert(vece <= MO_64);
1406     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1407 }
1408 
1409 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
1410                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1411 {
1412     TCGv_i64 tmp = tcg_const_i64(c);
1413     tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
1414     tcg_temp_free_i64(tmp);
1415 }
1416 
1417 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1418                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1419 {
1420     static const GVecGen2s g[4] = {
1421         { .fni8 = tcg_gen_vec_sub8_i64,
1422           .fniv = tcg_gen_sub_vec,
1423           .fno = gen_helper_gvec_subs8,
1424           .opc = INDEX_op_sub_vec,
1425           .vece = MO_8 },
1426         { .fni8 = tcg_gen_vec_sub16_i64,
1427           .fniv = tcg_gen_sub_vec,
1428           .fno = gen_helper_gvec_subs16,
1429           .opc = INDEX_op_sub_vec,
1430           .vece = MO_16 },
1431         { .fni4 = tcg_gen_sub_i32,
1432           .fniv = tcg_gen_sub_vec,
1433           .fno = gen_helper_gvec_subs32,
1434           .opc = INDEX_op_sub_vec,
1435           .vece = MO_32 },
1436         { .fni8 = tcg_gen_sub_i64,
1437           .fniv = tcg_gen_sub_vec,
1438           .fno = gen_helper_gvec_subs64,
1439           .opc = INDEX_op_sub_vec,
1440           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1441           .vece = MO_64 },
1442     };
1443 
1444     tcg_debug_assert(vece <= MO_64);
1445     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1446 }
1447 
1448 /* Perform a vector subtraction using normal subtraction and a mask.
1449    Compare gen_addv_mask above.  */
1450 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1451 {
1452     TCGv_i64 t1 = tcg_temp_new_i64();
1453     TCGv_i64 t2 = tcg_temp_new_i64();
1454     TCGv_i64 t3 = tcg_temp_new_i64();
1455 
1456     tcg_gen_or_i64(t1, a, m);
1457     tcg_gen_andc_i64(t2, b, m);
1458     tcg_gen_eqv_i64(t3, a, b);
1459     tcg_gen_sub_i64(d, t1, t2);
1460     tcg_gen_and_i64(t3, t3, m);
1461     tcg_gen_xor_i64(d, d, t3);
1462 
1463     tcg_temp_free_i64(t1);
1464     tcg_temp_free_i64(t2);
1465     tcg_temp_free_i64(t3);
1466 }
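/*
 * Host-side equivalent for 8-bit lanes (illustrative only): setting the
 * sign bit of each lane of A guarantees that every lane can absorb its own
 * borrow without borrowing from the lane above, and the final xor with
 * ~(a ^ b) & m restores the true sign bit of each lane.
 *
 *     uint64_t swar_sub8(uint64_t a, uint64_t b)
 *     {
 *         uint64_t m = 0x8080808080808080ull;
 *         return ((a | m) - (b & ~m)) ^ (~(a ^ b) & m);
 *     }
 */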
1467 
1468 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1469 {
1470     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1471     gen_subv_mask(d, a, b, m);
1472     tcg_temp_free_i64(m);
1473 }
1474 
1475 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1476 {
1477     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1478     gen_subv_mask(d, a, b, m);
1479     tcg_temp_free_i64(m);
1480 }
1481 
1482 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1483 {
1484     TCGv_i64 t1 = tcg_temp_new_i64();
1485     TCGv_i64 t2 = tcg_temp_new_i64();
1486 
1487     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1488     tcg_gen_sub_i64(t2, a, b);
1489     tcg_gen_sub_i64(t1, a, t1);
1490     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1491 
1492     tcg_temp_free_i64(t1);
1493     tcg_temp_free_i64(t2);
1494 }
1495 
1496 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1497                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1498 {
1499     static const GVecGen3 g[4] = {
1500         { .fni8 = tcg_gen_vec_sub8_i64,
1501           .fniv = tcg_gen_sub_vec,
1502           .fno = gen_helper_gvec_sub8,
1503           .opc = INDEX_op_sub_vec,
1504           .vece = MO_8 },
1505         { .fni8 = tcg_gen_vec_sub16_i64,
1506           .fniv = tcg_gen_sub_vec,
1507           .fno = gen_helper_gvec_sub16,
1508           .opc = INDEX_op_sub_vec,
1509           .vece = MO_16 },
1510         { .fni4 = tcg_gen_sub_i32,
1511           .fniv = tcg_gen_sub_vec,
1512           .fno = gen_helper_gvec_sub32,
1513           .opc = INDEX_op_sub_vec,
1514           .vece = MO_32 },
1515         { .fni8 = tcg_gen_sub_i64,
1516           .fniv = tcg_gen_sub_vec,
1517           .fno = gen_helper_gvec_sub64,
1518           .opc = INDEX_op_sub_vec,
1519           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1520           .vece = MO_64 },
1521     };
1522 
1523     tcg_debug_assert(vece <= MO_64);
1524     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1525 }
1526 
1527 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
1528                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1529 {
1530     static const GVecGen3 g[4] = {
1531         { .fniv = tcg_gen_mul_vec,
1532           .fno = gen_helper_gvec_mul8,
1533           .opc = INDEX_op_mul_vec,
1534           .vece = MO_8 },
1535         { .fniv = tcg_gen_mul_vec,
1536           .fno = gen_helper_gvec_mul16,
1537           .opc = INDEX_op_mul_vec,
1538           .vece = MO_16 },
1539         { .fni4 = tcg_gen_mul_i32,
1540           .fniv = tcg_gen_mul_vec,
1541           .fno = gen_helper_gvec_mul32,
1542           .opc = INDEX_op_mul_vec,
1543           .vece = MO_32 },
1544         { .fni8 = tcg_gen_mul_i64,
1545           .fniv = tcg_gen_mul_vec,
1546           .fno = gen_helper_gvec_mul64,
1547           .opc = INDEX_op_mul_vec,
1548           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1549           .vece = MO_64 },
1550     };
1551 
1552     tcg_debug_assert(vece <= MO_64);
1553     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1554 }
1555 
1556 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
1557                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1558 {
1559     static const GVecGen2s g[4] = {
1560         { .fniv = tcg_gen_mul_vec,
1561           .fno = gen_helper_gvec_muls8,
1562           .opc = INDEX_op_mul_vec,
1563           .vece = MO_8 },
1564         { .fniv = tcg_gen_mul_vec,
1565           .fno = gen_helper_gvec_muls16,
1566           .opc = INDEX_op_mul_vec,
1567           .vece = MO_16 },
1568         { .fni4 = tcg_gen_mul_i32,
1569           .fniv = tcg_gen_mul_vec,
1570           .fno = gen_helper_gvec_muls32,
1571           .opc = INDEX_op_mul_vec,
1572           .vece = MO_32 },
1573         { .fni8 = tcg_gen_mul_i64,
1574           .fniv = tcg_gen_mul_vec,
1575           .fno = gen_helper_gvec_muls64,
1576           .opc = INDEX_op_mul_vec,
1577           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1578           .vece = MO_64 },
1579     };
1580 
1581     tcg_debug_assert(vece <= MO_64);
1582     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1583 }
1584 
1585 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
1586                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1587 {
1588     TCGv_i64 tmp = tcg_const_i64(c);
1589     tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
1590     tcg_temp_free_i64(tmp);
1591 }
1592 
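/*
 * Note that, unlike tcg_gen_gvec_andi and friends further below, the
 * multiply immediate is not pre-replicated with dup_const; the scalar
 * is applied per element by the 2s expansion according to VECE.
 * Sketch only:
 *
 *     tcg_gen_gvec_muli(MO_16, dofs, aofs, 3, 16, 16);
 *     // multiplies each 16-bit lane by 3
 */
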
1593 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1594                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1595 {
1596     static const GVecGen3 g[4] = {
1597         { .fno = gen_helper_gvec_ssadd8, .vece = MO_8 },
1598         { .fno = gen_helper_gvec_ssadd16, .vece = MO_16 },
1599         { .fno = gen_helper_gvec_ssadd32, .vece = MO_32 },
1600         { .fno = gen_helper_gvec_ssadd64, .vece = MO_64 }
1601     };
1602     tcg_debug_assert(vece <= MO_64);
1603     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1604 }
1605 
1606 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
1607                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1608 {
1609     static const GVecGen3 g[4] = {
1610         { .fno = gen_helper_gvec_sssub8, .vece = MO_8 },
1611         { .fno = gen_helper_gvec_sssub16, .vece = MO_16 },
1612         { .fno = gen_helper_gvec_sssub32, .vece = MO_32 },
1613         { .fno = gen_helper_gvec_sssub64, .vece = MO_64 }
1614     };
1615     tcg_debug_assert(vece <= MO_64);
1616     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1617 }
1618 
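/*
 * The signed saturating add/sub above have no inline expansion
 * (.fni4/.fni8/.fniv are all unset) and always go through the
 * out-of-line helpers; only the unsigned forms below get an inline
 * fallback for 32- and 64-bit lanes.
 */
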
/* Unsigned saturating add: if the sum wrapped around (d < a), clamp to ~0.  */
1619 static void tcg_gen_vec_usadd32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1620 {
1621     TCGv_i32 max = tcg_const_i32(-1);
1622     tcg_gen_add_i32(d, a, b);
1623     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
1624     tcg_temp_free_i32(max);
1625 }
1626 
/* As above, saturating the full 64-bit value; despite the "32" in the name,
   this is the MO_64 expansion used by tcg_gen_gvec_usadd below.  */
1627 static void tcg_gen_vec_usadd32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1628 {
1629     TCGv_i64 max = tcg_const_i64(-1);
1630     tcg_gen_add_i64(d, a, b);
1631     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
1632     tcg_temp_free_i64(max);
1633 }
1634 
1635 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1636                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1637 {
1638     static const GVecGen3 g[4] = {
1639         { .fno = gen_helper_gvec_usadd8, .vece = MO_8 },
1640         { .fno = gen_helper_gvec_usadd16, .vece = MO_16 },
1641         { .fni4 = tcg_gen_vec_usadd32_i32,
1642           .fno = gen_helper_gvec_usadd32,
1643           .vece = MO_32 },
1644         { .fni8 = tcg_gen_vec_usadd32_i64,
1645           .fno = gen_helper_gvec_usadd64,
1646           .vece = MO_64 }
1647     };
1648     tcg_debug_assert(vece <= MO_64);
1649     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1650 }
1651 
/* Unsigned saturating subtract: if a < b, clamp the result to zero.  */
1652 static void tcg_gen_vec_ussub32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1653 {
1654     TCGv_i32 min = tcg_const_i32(0);
1655     tcg_gen_sub_i32(d, a, b);
1656     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
1657     tcg_temp_free_i32(min);
1658 }
1659 
/* As above, on the full 64-bit value; this is the MO_64 expansion used by
   tcg_gen_gvec_ussub below.  */
1660 static void tcg_gen_vec_ussub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1661 {
1662     TCGv_i64 min = tcg_const_i64(0);
1663     tcg_gen_sub_i64(d, a, b);
1664     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
1665     tcg_temp_free_i64(min);
1666 }
1667 
1668 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
1669                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1670 {
1671     static const GVecGen3 g[4] = {
1672         { .fno = gen_helper_gvec_ussub8, .vece = MO_8 },
1673         { .fno = gen_helper_gvec_ussub16, .vece = MO_16 },
1674         { .fni4 = tcg_gen_vec_ussub32_i32,
1675           .fno = gen_helper_gvec_ussub32,
1676           .vece = MO_32 },
1677         { .fni8 = tcg_gen_vec_ussub32_i64,
1678           .fno = gen_helper_gvec_ussub64,
1679           .vece = MO_64 }
1680     };
1681     tcg_debug_assert(vece <= MO_64);
1682     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1683 }
1684 
1685 /* Perform a vector negation using normal negation and a mask.
1686    Compare gen_subv_mask above.  */
1687 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
1688 {
1689     TCGv_i64 t2 = tcg_temp_new_i64();
1690     TCGv_i64 t3 = tcg_temp_new_i64();
1691 
1692     tcg_gen_andc_i64(t3, m, b);
1693     tcg_gen_andc_i64(t2, b, m);
1694     tcg_gen_sub_i64(d, m, t2);
1695     tcg_gen_xor_i64(d, d, t3);
1696 
1697     tcg_temp_free_i64(t2);
1698     tcg_temp_free_i64(t3);
1699 }
1700 
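/*
 * Worked example of the negation mask above (illustration only).
 * With m holding each lane's MSB, per lane:
 *
 *     d = (m - (b & ~m)) ^ (m & ~b);
 *
 * Subtracting only the low bits of b from m keeps every borrow inside
 * its own lane; the final xor with (m & ~b) restores the true sign bit
 * of -b, exactly as in gen_subv_mask.
 */
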
1701 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
1702 {
1703     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1704     gen_negv_mask(d, b, m);
1705     tcg_temp_free_i64(m);
1706 }
1707 
1708 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
1709 {
1710     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1711     gen_negv_mask(d, b, m);
1712     tcg_temp_free_i64(m);
1713 }
1714 
1715 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
1716 {
1717     TCGv_i64 t1 = tcg_temp_new_i64();
1718     TCGv_i64 t2 = tcg_temp_new_i64();
1719 
1720     tcg_gen_andi_i64(t1, b, ~0xffffffffull); /* high half of b only */
1721     tcg_gen_neg_i64(t2, b);                  /* low half correct; high half garbage */
1722     tcg_gen_neg_i64(t1, t1);                 /* high half correct, no borrow from low */
1723     tcg_gen_deposit_i64(d, t1, t2, 0, 32);   /* merge the two halves */
1724 
1725     tcg_temp_free_i64(t1);
1726     tcg_temp_free_i64(t2);
1727 }
1728 
1729 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
1730                       uint32_t oprsz, uint32_t maxsz)
1731 {
1732     static const GVecGen2 g[4] = {
1733         { .fni8 = tcg_gen_vec_neg8_i64,
1734           .fniv = tcg_gen_neg_vec,
1735           .fno = gen_helper_gvec_neg8,
1736           .opc = INDEX_op_neg_vec,
1737           .vece = MO_8 },
1738         { .fni8 = tcg_gen_vec_neg16_i64,
1739           .fniv = tcg_gen_neg_vec,
1740           .fno = gen_helper_gvec_neg16,
1741           .opc = INDEX_op_neg_vec,
1742           .vece = MO_16 },
1743         { .fni4 = tcg_gen_neg_i32,
1744           .fniv = tcg_gen_neg_vec,
1745           .fno = gen_helper_gvec_neg32,
1746           .opc = INDEX_op_neg_vec,
1747           .vece = MO_32 },
1748         { .fni8 = tcg_gen_neg_i64,
1749           .fniv = tcg_gen_neg_vec,
1750           .fno = gen_helper_gvec_neg64,
1751           .opc = INDEX_op_neg_vec,
1752           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1753           .vece = MO_64 },
1754     };
1755 
1756     tcg_debug_assert(vece <= MO_64);
1757     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
1758 }
1759 
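/*
 * For the bitwise operations below the element size is irrelevant, so
 * each uses a single GVecGen3 description; the VECE argument is unused
 * and accepted only for interface uniformity.
 */
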
1760 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
1761                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1762 {
1763     static const GVecGen3 g = {
1764         .fni8 = tcg_gen_and_i64,
1765         .fniv = tcg_gen_and_vec,
1766         .fno = gen_helper_gvec_and,
1767         .opc = INDEX_op_and_vec,
1768         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1769     };
1770     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1771 }
1772 
1773 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
1774                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1775 {
1776     static const GVecGen3 g = {
1777         .fni8 = tcg_gen_or_i64,
1778         .fniv = tcg_gen_or_vec,
1779         .fno = gen_helper_gvec_or,
1780         .opc = INDEX_op_or_vec,
1781         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1782     };
1783     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1784 }
1785 
1786 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
1787                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1788 {
1789     static const GVecGen3 g = {
1790         .fni8 = tcg_gen_xor_i64,
1791         .fniv = tcg_gen_xor_vec,
1792         .fno = gen_helper_gvec_xor,
1793         .opc = INDEX_op_xor_vec,
1794         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1795     };
1796     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1797 }
1798 
1799 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
1800                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1801 {
1802     static const GVecGen3 g = {
1803         .fni8 = tcg_gen_andc_i64,
1804         .fniv = tcg_gen_andc_vec,
1805         .fno = gen_helper_gvec_andc,
1806         .opc = INDEX_op_andc_vec,
1807         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1808     };
1809     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1810 }
1811 
1812 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
1813                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1814 {
1815     static const GVecGen3 g = {
1816         .fni8 = tcg_gen_orc_i64,
1817         .fniv = tcg_gen_orc_vec,
1818         .fno = gen_helper_gvec_orc,
1819         .opc = INDEX_op_orc_vec,
1820         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1821     };
1822     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1823 }
1824 
1825 static const GVecGen2s gop_ands = {
1826     .fni8 = tcg_gen_and_i64,
1827     .fniv = tcg_gen_and_vec,
1828     .fno = gen_helper_gvec_ands,
1829     .opc = INDEX_op_and_vec,
1830     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1831     .vece = MO_64
1832 };
1833 
1834 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
1835                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1836 {
1837     TCGv_i64 tmp = tcg_temp_new_i64();
1838     gen_dup_i64(vece, tmp, c);
1839     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
1840     tcg_temp_free_i64(tmp);
1841 }
1842 
1843 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
1844                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1845 {
1846     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
1847     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
1848     tcg_temp_free_i64(tmp);
1849 }
1850 
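/*
 * For the logical immediate forms the constant is replicated across
 * the vector with dup_const.  Sketch only:
 *
 *     tcg_gen_gvec_andi(MO_16, dofs, aofs, 0x00ff, 16, 16);
 *     // ands every 16-bit lane with 0x00ff, i.e. with the 64-bit
 *     // constant 0x00ff00ff00ff00ff
 */
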
1851 static const GVecGen2s gop_xors = {
1852     .fni8 = tcg_gen_xor_i64,
1853     .fniv = tcg_gen_xor_vec,
1854     .fno = gen_helper_gvec_xors,
1855     .opc = INDEX_op_xor_vec,
1856     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1857     .vece = MO_64
1858 };
1859 
1860 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
1861                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1862 {
1863     TCGv_i64 tmp = tcg_temp_new_i64();
1864     gen_dup_i64(vece, tmp, c);
1865     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
1866     tcg_temp_free_i64(tmp);
1867 }
1868 
1869 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
1870                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1871 {
1872     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
1873     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
1874     tcg_temp_free_i64(tmp);
1875 }
1876 
1877 static const GVecGen2s gop_ors = {
1878     .fni8 = tcg_gen_or_i64,
1879     .fniv = tcg_gen_or_vec,
1880     .fno = gen_helper_gvec_ors,
1881     .opc = INDEX_op_or_vec,
1882     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1883     .vece = MO_64
1884 };
1885 
1886 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
1887                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1888 {
1889     TCGv_i64 tmp = tcg_temp_new_i64();
1890     gen_dup_i64(vece, tmp, c);
1891     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
1892     tcg_temp_free_i64(tmp);
1893 }
1894 
1895 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
1896                       int64_t c, uint32_t oprsz, uint32_t maxsz)
1897 {
1898     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
1899     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
1900     tcg_temp_free_i64(tmp);
1901 }
1902 
/* Shift all lanes at once with a single 64-bit shift, then mask off the bits
   that were shifted across a lane boundary.  */
1903 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
1904 {
1905     uint64_t mask = dup_const(MO_8, 0xff << c);
1906     tcg_gen_shli_i64(d, a, c);
1907     tcg_gen_andi_i64(d, d, mask);
1908 }
1909 
1910 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
1911 {
1912     uint64_t mask = dup_const(MO_16, 0xffff << c);
1913     tcg_gen_shli_i64(d, a, c);
1914     tcg_gen_andi_i64(d, d, mask);
1915 }
1916 
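/*
 * Worked example of the shift-and-mask trick above (illustration only):
 * for MO_8 and c == 1 the mask is dup_const(MO_8, 0xfe), i.e.
 * 0xfefefefefefefefe.  The 64-bit shift moves each byte's top bit into
 * bit 0 of the byte above it, and the AND with the mask clears exactly
 * those cross-lane bits.
 */
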
1917 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
1918                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
1919 {
1920     static const GVecGen2i g[4] = {
1921         { .fni8 = tcg_gen_vec_shl8i_i64,
1922           .fniv = tcg_gen_shli_vec,
1923           .fno = gen_helper_gvec_shl8i,
1924           .opc = INDEX_op_shli_vec,
1925           .vece = MO_8 },
1926         { .fni8 = tcg_gen_vec_shl16i_i64,
1927           .fniv = tcg_gen_shli_vec,
1928           .fno = gen_helper_gvec_shl16i,
1929           .opc = INDEX_op_shli_vec,
1930           .vece = MO_16 },
1931         { .fni4 = tcg_gen_shli_i32,
1932           .fniv = tcg_gen_shli_vec,
1933           .fno = gen_helper_gvec_shl32i,
1934           .opc = INDEX_op_shli_vec,
1935           .vece = MO_32 },
1936         { .fni8 = tcg_gen_shli_i64,
1937           .fniv = tcg_gen_shli_vec,
1938           .fno = gen_helper_gvec_shl64i,
1939           .opc = INDEX_op_shli_vec,
1940           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1941           .vece = MO_64 },
1942     };
1943 
1944     tcg_debug_assert(vece <= MO_64);
1945     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
1946     if (shift == 0) {
1947         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
1948     } else {
1949         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
1950     }
1951 }
1952 
/* As above, shifting right: the mask clears the bits that crossed in from
   the lane above.  */
1953 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
1954 {
1955     uint64_t mask = dup_const(MO_8, 0xff >> c);
1956     tcg_gen_shri_i64(d, a, c);
1957     tcg_gen_andi_i64(d, d, mask);
1958 }
1959 
1960 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
1961 {
1962     uint64_t mask = dup_const(MO_16, 0xffff >> c);
1963     tcg_gen_shri_i64(d, a, c);
1964     tcg_gen_andi_i64(d, d, mask);
1965 }
1966 
1967 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
1968                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
1969 {
1970     static const GVecGen2i g[4] = {
1971         { .fni8 = tcg_gen_vec_shr8i_i64,
1972           .fniv = tcg_gen_shri_vec,
1973           .fno = gen_helper_gvec_shr8i,
1974           .opc = INDEX_op_shri_vec,
1975           .vece = MO_8 },
1976         { .fni8 = tcg_gen_vec_shr16i_i64,
1977           .fniv = tcg_gen_shri_vec,
1978           .fno = gen_helper_gvec_shr16i,
1979           .opc = INDEX_op_shri_vec,
1980           .vece = MO_16 },
1981         { .fni4 = tcg_gen_shri_i32,
1982           .fniv = tcg_gen_shri_vec,
1983           .fno = gen_helper_gvec_shr32i,
1984           .opc = INDEX_op_shri_vec,
1985           .vece = MO_32 },
1986         { .fni8 = tcg_gen_shri_i64,
1987           .fniv = tcg_gen_shri_vec,
1988           .fno = gen_helper_gvec_shr64i,
1989           .opc = INDEX_op_shri_vec,
1990           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1991           .vece = MO_64 },
1992     };
1993 
1994     tcg_debug_assert(vece <= MO_64);
1995     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
1996     if (shift == 0) {
1997         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
1998     } else {
1999         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2000     }
2001 }
2002 
2003 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2004 {
2005     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2006     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2007     TCGv_i64 s = tcg_temp_new_i64();
2008 
2009     tcg_gen_shri_i64(d, a, c);
2010     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2011     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2012     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2013     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2014     tcg_temp_free_i64(s);
2015 }
2016 
2017 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2018 {
2019     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2020     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2021     TCGv_i64 s = tcg_temp_new_i64();
2022 
2023     tcg_gen_shri_i64(d, a, c);
2024     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2025     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2026     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2027     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2028     tcg_temp_free_i64(s);
2029 }
2030 
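/*
 * Worked example of the arithmetic-shift trick above (illustration
 * only): for MO_8 and c == 3,
 *
 *     s_mask = dup_const(MO_8, 0x10);   // sign bit after shifting right 3
 *     c_mask = dup_const(MO_8, 0x1f);   // surviving value bits
 *
 * A set sign bit is multiplied by (2 << 3) - 2 == 14, giving
 * 0x10 * 14 == 0xe0, which fills bits 7..5 of that lane without
 * overflowing into the next byte; ORing this over the masked value
 * yields the sign-extended result.
 */
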
2031 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2032                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2033 {
2034     static const GVecGen2i g[4] = {
2035         { .fni8 = tcg_gen_vec_sar8i_i64,
2036           .fniv = tcg_gen_sari_vec,
2037           .fno = gen_helper_gvec_sar8i,
2038           .opc = INDEX_op_sari_vec,
2039           .vece = MO_8 },
2040         { .fni8 = tcg_gen_vec_sar16i_i64,
2041           .fniv = tcg_gen_sari_vec,
2042           .fno = gen_helper_gvec_sar16i,
2043           .opc = INDEX_op_sari_vec,
2044           .vece = MO_16 },
2045         { .fni4 = tcg_gen_sari_i32,
2046           .fniv = tcg_gen_sari_vec,
2047           .fno = gen_helper_gvec_sar32i,
2048           .opc = INDEX_op_sari_vec,
2049           .vece = MO_32 },
2050         { .fni8 = tcg_gen_sari_i64,
2051           .fniv = tcg_gen_sari_vec,
2052           .fno = gen_helper_gvec_sar64i,
2053           .opc = INDEX_op_sari_vec,
2054           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2055           .vece = MO_64 },
2056     };
2057 
2058     tcg_debug_assert(vece <= MO_64);
2059     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2060     if (shift == 0) {
2061         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2062     } else {
2063         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2064     }
2065 }
2066 
2067 /* Expand OPRSZ bytes worth of three-vector comparisons using i32 elements.  */
2068 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
2069                            uint32_t oprsz, TCGCond cond)
2070 {
2071     TCGv_i32 t0 = tcg_temp_new_i32();
2072     TCGv_i32 t1 = tcg_temp_new_i32();
2073     uint32_t i;
2074 
2075     for (i = 0; i < oprsz; i += 4) {
2076         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
2077         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
2078         tcg_gen_setcond_i32(cond, t0, t0, t1);
2079         tcg_gen_neg_i32(t0, t0);
2080         tcg_gen_st_i32(t0, cpu_env, dofs + i);
2081     }
2082     tcg_temp_free_i32(t1);
2083     tcg_temp_free_i32(t0);
2084 }
2085 
2086 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
2087                            uint32_t oprsz, TCGCond cond)
2088 {
2089     TCGv_i64 t0 = tcg_temp_new_i64();
2090     TCGv_i64 t1 = tcg_temp_new_i64();
2091     uint32_t i;
2092 
2093     for (i = 0; i < oprsz; i += 8) {
2094         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
2095         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
2096         tcg_gen_setcond_i64(cond, t0, t0, t1);
2097         tcg_gen_neg_i64(t0, t0);
2098         tcg_gen_st_i64(t0, cpu_env, dofs + i);
2099     }
2100     tcg_temp_free_i64(t1);
2101     tcg_temp_free_i64(t0);
2102 }
2103 
2104 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2105                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
2106                            TCGType type, TCGCond cond)
2107 {
2108     TCGv_vec t0 = tcg_temp_new_vec(type);
2109     TCGv_vec t1 = tcg_temp_new_vec(type);
2110     uint32_t i;
2111 
2112     for (i = 0; i < oprsz; i += tysz) {
2113         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2114         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
2115         tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
2116         tcg_gen_st_vec(t0, cpu_env, dofs + i);
2117     }
2118     tcg_temp_free_vec(t1);
2119     tcg_temp_free_vec(t0);
2120 }
2121 
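/*
 * Note that tcg_gen_cmp_vec already produces the 0/-1 per-element
 * result, so the vector path needs no fixup, whereas the integer paths
 * above negate the 0/1 result of setcond to match that convention.
 */
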
2122 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
2123                       uint32_t aofs, uint32_t bofs,
2124                       uint32_t oprsz, uint32_t maxsz)
2125 {
2126     static gen_helper_gvec_3 * const eq_fn[4] = {
2127         gen_helper_gvec_eq8, gen_helper_gvec_eq16,
2128         gen_helper_gvec_eq32, gen_helper_gvec_eq64
2129     };
2130     static gen_helper_gvec_3 * const ne_fn[4] = {
2131         gen_helper_gvec_ne8, gen_helper_gvec_ne16,
2132         gen_helper_gvec_ne32, gen_helper_gvec_ne64
2133     };
2134     static gen_helper_gvec_3 * const lt_fn[4] = {
2135         gen_helper_gvec_lt8, gen_helper_gvec_lt16,
2136         gen_helper_gvec_lt32, gen_helper_gvec_lt64
2137     };
2138     static gen_helper_gvec_3 * const le_fn[4] = {
2139         gen_helper_gvec_le8, gen_helper_gvec_le16,
2140         gen_helper_gvec_le32, gen_helper_gvec_le64
2141     };
2142     static gen_helper_gvec_3 * const ltu_fn[4] = {
2143         gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
2144         gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
2145     };
2146     static gen_helper_gvec_3 * const leu_fn[4] = {
2147         gen_helper_gvec_leu8, gen_helper_gvec_leu16,
2148         gen_helper_gvec_leu32, gen_helper_gvec_leu64
2149     };
2150     static gen_helper_gvec_3 * const * const fns[16] = {
2151         [TCG_COND_EQ] = eq_fn,
2152         [TCG_COND_NE] = ne_fn,
2153         [TCG_COND_LT] = lt_fn,
2154         [TCG_COND_LE] = le_fn,
2155         [TCG_COND_LTU] = ltu_fn,
2156         [TCG_COND_LEU] = leu_fn,
2157     };
2158 
2159     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
2160     check_overlap_3(dofs, aofs, bofs, maxsz);
2161 
2162     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
2163         do_dup(MO_8, dofs, oprsz, maxsz,
2164                NULL, NULL, -(cond == TCG_COND_ALWAYS));
2165         return;
2166     }
2167 
2168     /* Recall that ARM SVE allows vector sizes that are not a power of 2.
2169        Expand with successively smaller host vector sizes.  The intent is
2170        that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
2171 
2172     if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)
2173         && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V256, vece)) {
2174         uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
2175         expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
2176         if (some == oprsz) {
2177             goto done;
2178         }
2179         dofs += some;
2180         aofs += some;
2181         bofs += some;
2182         oprsz -= some;
2183         maxsz -= some;
2184     }
2185 
2186     if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)
2187         && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V128, vece)) {
2188         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
2189     } else if (TCG_TARGET_HAS_v64
2190                && check_size_impl(oprsz, 8)
2191                && (TCG_TARGET_REG_BITS == 32 || vece != MO_64)
2192                && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V64, vece)) {
2193         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
2194     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
2195         expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
2196     } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
2197         expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
2198     } else {
2199         gen_helper_gvec_3 * const *fn = fns[cond];
2200 
        /* The out-of-line helpers cover only one sense of each comparison;
           swap the operands and reverse the condition to reach one of the
           implemented cases.  */
2201         if (fn == NULL) {
2202             uint32_t tmp;
2203             tmp = aofs, aofs = bofs, bofs = tmp;
2204             cond = tcg_swap_cond(cond);
2205             fn = fns[cond];
2206             assert(fn != NULL);
2207         }
2208         tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
2209         return;
2210     }
2211 
2212  done:
2213     if (oprsz < maxsz) {
2214         expand_clr(dofs + oprsz, maxsz - oprsz);
2215     }
2216 }
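
/*
 * Usage sketch (hypothetical front end; the offsets and sizes are made
 * up for illustration):
 *
 *     // d[i] = (a[i] < b[i]) ? -1 : 0 over 32-bit lanes
 *     tcg_gen_gvec_cmp(TCG_COND_LT, MO_32,
 *                      offsetof(CPUFooState, vreg[rd]),
 *                      offsetof(CPUFooState, vreg[rn]),
 *                      offsetof(CPUFooState, vreg[rm]),
 *                      16, 16);
 */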
2217