/*
 *  ARM translation: AArch32 Neon instructions
 *
 *  Copyright (c) 2003 Fabrice Bellard
 *  Copyright (c) 2005-2007 CodeSourcery
 *  Copyright (c) 2007 OpenedHand, Ltd.
 *  Copyright (c) 2020 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "tcg/tcg-op.h"
#include "tcg/tcg-op-gvec.h"
#include "exec/exec-all.h"
#include "exec/gen-icount.h"
#include "translate.h"
#include "translate-a32.h"

/* Include the generated Neon decoder */
#include "decode-neon-dp.c.inc"
#include "decode-neon-ls.c.inc"
#include "decode-neon-shared.c.inc"

static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
{
    TCGv_ptr ret = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg));
    return ret;
}

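/*
 * The neon_load_element/neon_store_element helpers (and their 64-bit
 * variants) move one element of a D register to or from a TCG
 * temporary, at the offset computed by neon_element_offset(), which
 * also accounts for host endianness. Sub-word elements are
 * zero-extended on load and truncated on store.
 */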
static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i32(var, cpu_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i32(var, cpu_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld_i32(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i64(var, cpu_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i64(var, cpu_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld32u_i64(var, cpu_env, offset);
        break;
    case MO_UQ:
        tcg_gen_ld_i64(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i32(var, cpu_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i32(var, cpu_env, offset);
        break;
    case MO_32:
        tcg_gen_st_i32(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i64(var, cpu_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i64(var, cpu_env, offset);
        break;
    case MO_32:
        tcg_gen_st32_i64(var, cpu_env, offset);
        break;
    case MO_64:
        tcg_gen_st_i64(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm,
                         int data, gen_helper_gvec_4 *fn_gvec)
{
    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
        return false;
    }

    /*
     * UNDEF accesses to odd registers for each bit of Q.
     * Q will be 0b111 for all-Q-reg instructions; other values
     * indicate a mix of Q- and D-reg inputs.
     */
    if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    int opr_sz = q ? 16 : 8;
    tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd),
                       vfp_reg_offset(1, vn),
                       vfp_reg_offset(1, vm),
                       vfp_reg_offset(1, vd),
                       opr_sz, opr_sz, data, fn_gvec);
    return true;
}
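
/*
 * In do_neon_ddda() and do_neon_ddda_fpst() the q argument is a 3-bit
 * mask: bit 2 checks vd, bit 1 vn, bit 0 vm. All-Q-register forms pass
 * a->q * 7 (0b111) so that every register number must be even, while
 * the by-scalar forms below pass a->q * 6 because vm names a D-register
 * scalar there and may legitimately be odd.
 */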

static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm,
                              int data, ARMFPStatusFlavour fp_flavour,
                              gen_helper_gvec_4_ptr *fn_gvec_ptr)
{
    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
        return false;
    }

    /*
     * UNDEF accesses to odd registers for each bit of Q.
     * Q will be 0b111 for all-Q-reg instructions; other values
     * indicate a mix of Q- and D-reg inputs.
     */
    if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    int opr_sz = q ? 16 : 8;
    TCGv_ptr fpst = fpstatus_ptr(fp_flavour);

    tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd),
                       vfp_reg_offset(1, vn),
                       vfp_reg_offset(1, vm),
                       vfp_reg_offset(1, vd),
                       fpst, opr_sz, opr_sz, data, fn_gvec_ptr);
    return true;
}

static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
{
    if (!dc_isar_feature(aa32_vcma, s)) {
        return false;
    }
    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
                                 FPST_STD_F16, gen_helper_gvec_fcmlah);
    }
    return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
                             FPST_STD, gen_helper_gvec_fcmlas);
}

static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
{
    int opr_sz;
    TCGv_ptr fpst;
    gen_helper_gvec_3_ptr *fn_gvec_ptr;

    if (!dc_isar_feature(aa32_vcma, s)
        || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
    fn_gvec_ptr = (a->size == MO_16) ?
        gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz, a->rot,
                       fn_gvec_ptr);
    return true;
}

static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_sdot_b);
}

static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_udot_b);
}

static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_usdot_b);
}

static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
{
    if (!dc_isar_feature(aa32_bf16, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_bfdot);
}

static bool trans_VFML(DisasContext *s, arg_VFML *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        (a->vd & 0x10)) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->vm),
                       cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_a32);
    return true;
}
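
/*
 * Note that VFML is a widening operation: vn and vm hold half-width
 * inputs, which is why vfp_reg_offset() is called with a->q as its
 * "dp" argument; a->s is passed through as helper data and selects
 * the multiply-subtract (VFMSL) rather than multiply-add (VFMAL) form.
 */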

static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
{
    int data = (a->index << 2) | a->rot;

    if (!dc_isar_feature(aa32_vcma, s)) {
        return false;
    }
    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
                                 FPST_STD_F16, gen_helper_gvec_fcmlah_idx);
    }
    return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
                             FPST_STD, gen_helper_gvec_fcmlas_idx);
}

static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_sdot_idx_b);
}

static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_udot_idx_b);
}

static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_usdot_idx_b);
}

static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_sudot_idx_b);
}

static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a)
{
    if (!dc_isar_feature(aa32_bf16, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_bfdot_idx);
}

static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->rm),
                       cpu_env, opr_sz, opr_sz,
                       (a->index << 2) | a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_idx_a32);
    return true;
}

static struct {
    int nregs;
    int interleave;
    int spacing;
} const neon_ls_element_type[11] = {
    {1, 4, 1},
    {1, 4, 2},
    {4, 1, 1},
    {2, 2, 2},
    {1, 3, 1},
    {1, 3, 2},
    {3, 1, 1},
    {1, 1, 1},
    {1, 2, 1},
    {1, 2, 2},
    {2, 1, 1}
};
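
/*
 * The table above is indexed by the instruction's 4-bit "type" field.
 * For example, itype 7 (0b0111) is VLD1/VST1 with one register,
 * itype 2 (0b0010) is VLD1/VST1 with four registers, and itype 3
 * (0b0011) is VLD2/VST2 with two register pairs at spacing 2.
 */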

static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
                                      int stride)
{
    if (rm != 15) {
        TCGv_i32 base;

        base = load_reg(s, rn);
        if (rm == 13) {
            tcg_gen_addi_i32(base, base, stride);
        } else {
            TCGv_i32 index;
            index = load_reg(s, rm);
            tcg_gen_add_i32(base, base, index);
        }
        store_reg(s, rn, base);
    }
}
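
/*
 * Rm encodes the writeback mode: Rm == 15 (PC) means no writeback,
 * Rm == 13 (SP) means post-increment Rn by the transfer size, and
 * any other Rm means post-increment Rn by that register's value.
 */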

static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
{
    /* Neon load/store multiple structures */
    int nregs, interleave, spacing, reg, n;
    MemOp mop, align, endian;
    int mmu_idx = get_mem_index(s);
    int size = a->size;
    TCGv_i64 tmp64;
    TCGv_i32 addr;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }
    if (a->itype > 10) {
        return false;
    }
    /* Catch UNDEF cases for bad values of align field */
    switch (a->itype & 0xc) {
    case 4:
        if (a->align >= 2) {
            return false;
        }
        break;
    case 8:
        if (a->align == 3) {
            return false;
        }
        break;
    default:
        break;
    }
    nregs = neon_ls_element_type[a->itype].nregs;
    interleave = neon_ls_element_type[a->itype].interleave;
    spacing = neon_ls_element_type[a->itype].spacing;
    if (size == 3 && (interleave | spacing) != 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* Single bytes are endian-neutral, so treat them as little-endian. */
    endian = s->be_data;
    if (size == 0) {
        endian = MO_LE;
    }

    /* Enforce alignment requested by the instruction */
    if (a->align) {
        align = pow2_align(a->align + 2); /* 4 << a->align, i.e. 8/16/32 bytes */
    } else {
        align = s->align_mem ? MO_ALIGN : 0;
    }

    /*
     * Consecutive little-endian elements from a single register
     * can be promoted to a larger little-endian operation.
     */
    if (interleave == 1 && endian == MO_LE) {
        /* Retain any natural alignment. */
        if (align == MO_ALIGN) {
            align = pow2_align(size);
        }
        size = 3;
    }
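
    /*
     * For example, a VLD1.8 of a whole D register can now be done as
     * one 8-byte little-endian load instead of eight byte loads.
     */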

    tmp64 = tcg_temp_new_i64();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);

    mop = endian | size | align;
    for (reg = 0; reg < nregs; reg++) {
        for (n = 0; n < 8 >> size; n++) {
            int xs;
            for (xs = 0; xs < interleave; xs++) {
                int tt = a->vd + reg + spacing * xs;

                if (a->l) {
                    gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop);
                    neon_store_element64(tt, n, size, tmp64);
                } else {
                    neon_load_element64(tmp64, tt, n, size);
                    gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop);
                }
                tcg_gen_addi_i32(addr, addr, 1 << size);

                /* Subsequent memory operations inherit alignment */
                mop &= ~MO_AMASK;
            }
        }
    }

    gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
    return true;
}

static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
{
    /* Neon load single structure to all lanes */
    int reg, stride, vec_size;
    int vd = a->vd;
    int size = a->size;
    int nregs = a->n + 1;
    TCGv_i32 addr, tmp;
    MemOp mop, align;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    align = 0;
    if (size == 3) {
        if (nregs != 4 || a->a == 0) {
            return false;
        }
        /* For VLD4, size == 3 && a == 1 means 32-bit elements at 16-byte alignment */
        size = MO_32;
        align = MO_ALIGN_16;
    } else if (a->a) {
        switch (nregs) {
        case 1:
            if (size == 0) {
                return false;
            }
            align = MO_ALIGN;
            break;
        case 2:
            align = pow2_align(size + 1);
            break;
        case 3:
            return false;
        case 4:
            if (size == 2) {
                align = pow2_align(3);
            } else {
                align = pow2_align(size + 2);
            }
            break;
        default:
            g_assert_not_reached();
        }
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * VLD1 to all lanes: T bit indicates how many Dregs to write.
     * VLD2/3/4 to all lanes: T bit indicates register stride.
     */
    stride = a->t ? 2 : 1;
    vec_size = nregs == 1 ? stride * 8 : 8;
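    /*
     * E.g. VLD1.32 {d0[], d1[]} has nregs == 1 and t == 1, so vec_size
     * is 16 and the dup below fills both D registers in one operation.
     */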
    mop = size | align;
    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);
    for (reg = 0; reg < nregs; reg++) {
        gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop);
        if ((vd & 1) && vec_size == 16) {
            /*
             * We cannot write 16 bytes at once because the
             * destination is unaligned.
             */
            tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
                                 8, 8, tmp);
            tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
                             neon_full_reg_offset(vd), 8, 8);
        } else {
            tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
                                 vec_size, vec_size, tmp);
        }
        tcg_gen_addi_i32(addr, addr, 1 << size);
        vd += stride;

        /* Subsequent memory operations inherit alignment */
        mop &= ~MO_AMASK;
    }

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);

    return true;
}

static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
{
    /* Neon load/store single structure to one lane */
    int reg;
    int nregs = a->n + 1;
    int vd = a->vd;
    TCGv_i32 addr, tmp;
    MemOp mop;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    /* Catch the UNDEF cases. This is unavoidably a bit messy. */
    switch (nregs) {
    case 1:
        if (a->stride != 1) {
            return false;
        }
        if (((a->align & (1 << a->size)) != 0) ||
            (a->size == 2 && (a->align == 1 || a->align == 2))) {
            return false;
        }
        break;
    case 2:
        if (a->size == 2 && (a->align & 2) != 0) {
            return false;
        }
        break;
    case 3:
        if (a->align != 0) {
            return false;
        }
        break;
    case 4:
        if (a->size == 2 && a->align == 3) {
            return false;
        }
        break;
    default:
        g_assert_not_reached();
    }
    if ((vd + a->stride * (nregs - 1)) > 31) {
        /*
         * Attempts to write off the end of the register file are
         * UNPREDICTABLE; we choose to UNDEF because otherwise we would
         * access off the end of the array that holds the register data.
         */
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* Pick up SCTLR settings */
    mop = finalize_memop(s, a->size);

    if (a->align) {
        MemOp align_op;

        switch (nregs) {
        case 1:
            /* For VLD1, use natural alignment. */
            align_op = MO_ALIGN;
            break;
        case 2:
            /* For VLD2, use double alignment. */
            align_op = pow2_align(a->size + 1);
            break;
        case 4:
            if (a->size == MO_32) {
                /*
                 * For VLD4.32, align = 1 is double alignment, align = 2 is
                 * quad alignment; align = 3 is rejected above.
                 */
                align_op = pow2_align(a->size + a->align);
            } else {
                /* For VLD4.8 and VLD4.16, we want quad alignment. */
                align_op = pow2_align(a->size + 2);
            }
            break;
        default:
            /* For VLD3, the alignment field is zero and rejected above. */
            g_assert_not_reached();
        }

        mop = (mop & ~MO_AMASK) | align_op;
    }

    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);

    for (reg = 0; reg < nregs; reg++) {
        if (a->l) {
            gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
            neon_store_element(vd, a->reg_idx, a->size, tmp);
        } else { /* Store */
            neon_load_element(tmp, vd, a->reg_idx, a->size);
            gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
        }
        vd += a->stride;
        tcg_gen_addi_i32(addr, addr, 1 << a->size);

        /* Subsequent memory operations inherit alignment */
        mop &= ~MO_AMASK;
    }

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);

    return true;
}

static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
{
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_full_reg_offset(a->vd);
    int rn_ofs = neon_full_reg_offset(a->vn);
    int rm_ofs = neon_full_reg_offset(a->vm);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
    return true;
}

#define DO_3SAME(INSN, FUNC)                                            \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        return do_3same(s, a, FUNC);                                    \
    }

DO_3SAME(VADD, tcg_gen_gvec_add)
DO_3SAME(VSUB, tcg_gen_gvec_sub)
DO_3SAME(VAND, tcg_gen_gvec_and)
DO_3SAME(VBIC, tcg_gen_gvec_andc)
DO_3SAME(VORR, tcg_gen_gvec_or)
DO_3SAME(VORN, tcg_gen_gvec_orc)
DO_3SAME(VEOR, tcg_gen_gvec_xor)
DO_3SAME(VSHL_S, gen_gvec_sshl)
DO_3SAME(VSHL_U, gen_gvec_ushl)
DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
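
/*
 * The *_qc expanders used for VQADD/VQSUB also set the cumulative
 * saturation (QC) flag in FPSCR when a result saturates, which is
 * why they cannot be plain gvec add/sub operations.
 */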

/* These insns are all gvec_bitsel but with the inputs in various orders. */
#define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
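
/*
 * With bitsel(sel, a, b) == (sel & a) | (~sel & b), the operand
 * orders above give the architected results:
 *   VBSL: rd = (rd & rn) | (~rd & rm)
 *   VBIT: rd = (rm & rn) | (~rm & rd)
 *   VBIF: rd = (rm & rd) | (~rm & rn)
 */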

#define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size == 3) {                                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)

#define DO_3SAME_CMP(INSN, COND)                                        \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
    }                                                                   \
    DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)

DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
DO_3SAME_CMP(VCEQ, TCG_COND_EQ)

#define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
                         uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
    {                                                                      \
        tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
    }

WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)

static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
{
    if (a->size != 0) {
        return false;
    }
    return do_3same(s, a, gen_VMUL_p_3s);
}

#define DO_VQRDMLAH(INSN, FUNC)                                         \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_rdm, s)) {                            \
            return false;                                               \
        }                                                               \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)

#define DO_SHA1(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha1, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)

#define DO_SHA2(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha2, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)

#define DO_3SAME_64(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 op = { .fni8 = FUNC };                    \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

#define DO_3SAME_64_ENV(INSN, FUNC)                                     \
    static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }                                                                   \
    DO_3SAME_64(INSN, gen_##INSN##_elt)

DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)

#define DO_3SAME_32(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_helper_neon_##FUNC##8 },                      \
            { .fni4 = gen_helper_neon_##FUNC##16 },                     \
            { .fni4 = gen_helper_neon_##FUNC##32 },                     \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

/*
 * Some helper functions need to be passed the cpu_env. In order
 * to use those with the gvec APIs like tcg_gen_gvec_3() we need
 * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
 * and which call a NeonGenTwoOpEnvFn().
 */
#define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
    static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }

#define DO_3SAME_32_ENV(INSN, FUNC)                                     \
    WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_##INSN##_tramp8 },                            \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_32(VHADD_S, hadd_s)
DO_3SAME_32(VHADD_U, hadd_u)
DO_3SAME_32(VHSUB_S, hsub_s)
DO_3SAME_32(VHSUB_U, hsub_u)
DO_3SAME_32(VRHADD_S, rhadd_s)
DO_3SAME_32(VRHADD_U, rhadd_u)
DO_3SAME_32(VRSHL_S, rshl_s)
DO_3SAME_32(VRSHL_U, rshl_u)

DO_3SAME_32_ENV(VQSHL_S, qshl_s)
DO_3SAME_32_ENV(VQSHL_U, qshl_u)
DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)

static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
{
    /* Operations handled pairwise 32 bits at a time */
    TCGv_i32 tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (a->size == 3) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */

    /*
     * Note that we have to be careful not to clobber the source operands
     * in the "vm == vd" case by storing the result of the first pass too
     * early. Since Q is 0 there are always just two passes, so instead
     * of a complicated loop over each pass we just unroll.
     */
    tmp = tcg_temp_new_i32();
    tmp2 = tcg_temp_new_i32();
    tmp3 = tcg_temp_new_i32();

    read_neon_element32(tmp, a->vn, 0, MO_32);
    read_neon_element32(tmp2, a->vn, 1, MO_32);
    fn(tmp, tmp, tmp2);

    read_neon_element32(tmp3, a->vm, 0, MO_32);
    read_neon_element32(tmp2, a->vm, 1, MO_32);
    fn(tmp3, tmp3, tmp2);

    write_neon_element32(tmp, a->vd, 0, MO_32);
    write_neon_element32(tmp3, a->vd, 1, MO_32);

    return true;
}

#define DO_3SAME_PAIR(INSN, func)                                       \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        static NeonGenTwoOpFn * const fns[] = {                         \
            gen_helper_neon_##func##8,                                  \
            gen_helper_neon_##func##16,                                 \
            gen_helper_neon_##func##32,                                 \
        };                                                              \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same_pair(s, a, fns[a->size]);                       \
    }

/* 32-bit pairwise ops end up the same as the elementwise versions.  */
#define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
#define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
#define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
#define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
#define gen_helper_neon_padd_u32  tcg_gen_add_i32
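
/*
 * This works because do_3same_pair() applies fn directly to the two
 * 32-bit lanes of each source register, so for 32-bit elements an
 * ordinary two-operand op already computes the pairwise result.
 */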

DO_3SAME_PAIR(VPMAX_S, pmax_s)
DO_3SAME_PAIR(VPMIN_S, pmin_s)
DO_3SAME_PAIR(VPMAX_U, pmax_u)
DO_3SAME_PAIR(VPMIN_U, pmin_u)
DO_3SAME_PAIR(VPADD, padd_u)

#define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[2] = {                                \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_VQDMULH(VQDMULH, qdmulh)
DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)

#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rn_ofs, uint32_t rm_ofs,              \
                         uint32_t oprsz, uint32_t maxsz)                \
    {                                                                   \
        TCGv_ptr fpst = fpstatus_ptr(FPST);                             \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst,                \
                           oprsz, maxsz, 0, FUNC);                      \
    }

#define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
    WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
    WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
    {                                                                   \
        if (a->size == MO_16) {                                         \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
                return false;                                           \
            }                                                           \
            return do_3same(s, a, gen_##INSN##_fp16_3s);                \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
    }

DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)

WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)

static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_3same(s, a, gen_VMAXNM_fp16_3s);
    }
    return do_3same(s, a, gen_VMAXNM_fp32_3s);
}

static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_3same(s, a, gen_VMINNM_fp16_3s);
    }
    return do_3same(s, a, gen_VMINNM_fp32_3s);
}

static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
                             gen_helper_gvec_3_ptr *fn)
{
    /* FP pairwise operations */
    TCGv_ptr fpstatus;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */

    fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpstatus, 8, 8, 0, fn);

    return true;
}

/*
 * For all the functions using this macro, size == 1 means fp16,
 * which requires the fp16 arithmetic extension (checked via
 * aa32_fp16_arith below).
 */
#define DO_3S_FP_PAIR(INSN,FUNC)                                    \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
    {                                                               \
        if (a->size == MO_16) {                                     \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {             \
                return false;                                       \
            }                                                       \
            return do_3same_fp_pair(s, a, FUNC##h);                 \
        }                                                           \
        return do_3same_fp_pair(s, a, FUNC##s);                     \
    }

DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)

static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
{
    /* Handle a 2-reg-shift insn which can be vectorized. */
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_full_reg_offset(a->vd);
    int rm_ofs = neon_full_reg_offset(a->vm);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
    return true;
}

#define DO_2SH(INSN, FUNC)                                              \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_vector_2sh(s, a, FUNC);                               \
    }

DO_2SH(VSHL, tcg_gen_gvec_shli)
DO_2SH(VSLI, gen_gvec_sli)
DO_2SH(VSRI, gen_gvec_sri)
DO_2SH(VSRA_S, gen_gvec_ssra)
DO_2SH(VSRA_U, gen_gvec_usra)
DO_2SH(VRSHR_S, gen_gvec_srshr)
DO_2SH(VRSHR_U, gen_gvec_urshr)
DO_2SH(VRSRA_S, gen_gvec_srsra)
DO_2SH(VRSRA_U, gen_gvec_ursra)

static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Signed shift out of range results in all-sign-bits */
    a->shift = MIN(a->shift, (8 << a->size) - 1);
    return do_vector_2sh(s, a, tcg_gen_gvec_sari);
}
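
/*
 * For example, VSHR.S8 #8 is clamped to a shift by 7 here, which
 * still fills every result element with copies of its sign bit, as
 * the architecture requires for out-of-range signed shifts.
 */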

static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                            int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
}

static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Shift out of range is architecturally valid and results in zero. */
    if (a->shift >= (8 << a->size)) {
        return do_vector_2sh(s, a, gen_zero_rd_2sh);
    } else {
        return do_vector_2sh(s, a, tcg_gen_gvec_shri);
    }
}

static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
                             NeonGenTwo64OpEnvFn *fn)
{
    /*
     * 2-reg-and-shift operations, size == 3 case, where the
     * function needs to be passed cpu_env.
     */
    TCGv_i64 constimm;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * To avoid excessive duplication of ops we implement shift
     * by immediate using the variable shift operations.
     */
    constimm = tcg_constant_i64(dup_const(a->size, a->shift));

    for (pass = 0; pass < a->q + 1; pass++) {
        TCGv_i64 tmp = tcg_temp_new_i64();

        read_neon_element64(tmp, a->vm, pass, MO_64);
        fn(tmp, cpu_env, tmp, constimm);
        write_neon_element64(tmp, a->vd, pass, MO_64);
    }
    return true;
}

static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
                             NeonGenTwoOpEnvFn *fn)
{
    /*
     * 2-reg-and-shift operations, size < 3 case, where the
     * helper needs to be passed cpu_env.
     */
    TCGv_i32 constimm, tmp;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * To avoid excessive duplication of ops we implement shift
     * by immediate using the variable shift operations.
     */
    constimm = tcg_constant_i32(dup_const(a->size, a->shift));
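    /*
     * E.g. for a size == MO_8 shift by 3, dup_const() yields
     * 0x03030303, so every byte lane of the helper input sees the
     * same shift count.
     */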
1384     tmp = tcg_temp_new_i32();
1385 
1386     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1387         read_neon_element32(tmp, a->vm, pass, MO_32);
1388         fn(tmp, cpu_env, tmp, constimm);
1389         write_neon_element32(tmp, a->vd, pass, MO_32);
1390     }
1391     return true;
1392 }
1393 
1394 #define DO_2SHIFT_ENV(INSN, FUNC)                                       \
1395     static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
1396     {                                                                   \
1397         return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
1398     }                                                                   \
1399     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1400     {                                                                   \
1401         static NeonGenTwoOpEnvFn * const fns[] = {                      \
1402             gen_helper_neon_##FUNC##8,                                  \
1403             gen_helper_neon_##FUNC##16,                                 \
1404             gen_helper_neon_##FUNC##32,                                 \
1405         };                                                              \
1406         assert(a->size < ARRAY_SIZE(fns));                              \
1407         return do_2shift_env_32(s, a, fns[a->size]);                    \
1408     }
1409 
1410 DO_2SHIFT_ENV(VQSHLU, qshlu_s)
1411 DO_2SHIFT_ENV(VQSHL_U, qshl_u)
1412 DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1413 
1414 static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1415                                 NeonGenTwo64OpFn *shiftfn,
1416                                 NeonGenNarrowEnvFn *narrowfn)
1417 {
1418     /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1419     TCGv_i64 constimm, rm1, rm2;
1420     TCGv_i32 rd;
1421 
1422     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1423         return false;
1424     }
1425 
1426     /* UNDEF accesses to D16-D31 if they don't exist. */
1427     if (!dc_isar_feature(aa32_simd_r32, s) &&
1428         ((a->vd | a->vm) & 0x10)) {
1429         return false;
1430     }
1431 
1432     if (a->vm & 1) {
1433         return false;
1434     }
1435 
1436     if (!vfp_access_check(s)) {
1437         return true;
1438     }
1439 
1440     /*
1441      * This is always a right shift, and the shiftfn is always a
1442      * left-shift helper, which thus needs the negated shift count.
1443      */
1444     constimm = tcg_constant_i64(-a->shift);
1445     rm1 = tcg_temp_new_i64();
1446     rm2 = tcg_temp_new_i64();
1447     rd = tcg_temp_new_i32();
1448 
1449     /* Load both inputs first to avoid potential overwrite if rm == rd */
1450     read_neon_element64(rm1, a->vm, 0, MO_64);
1451     read_neon_element64(rm2, a->vm, 1, MO_64);
1452 
1453     shiftfn(rm1, rm1, constimm);
1454     narrowfn(rd, cpu_env, rm1);
1455     write_neon_element32(rd, a->vd, 0, MO_32);
1456 
1457     shiftfn(rm2, rm2, constimm);
1458     narrowfn(rd, cpu_env, rm2);
1459     write_neon_element32(rd, a->vd, 1, MO_32);
1460 
1461     return true;
1462 }
1463 
1464 static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1465                                 NeonGenTwoOpFn *shiftfn,
1466                                 NeonGenNarrowEnvFn *narrowfn)
1467 {
1468     /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1469     TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1470     TCGv_i64 rtmp;
1471     uint32_t imm;
1472 
1473     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1474         return false;
1475     }
1476 
1477     /* UNDEF accesses to D16-D31 if they don't exist. */
1478     if (!dc_isar_feature(aa32_simd_r32, s) &&
1479         ((a->vd | a->vm) & 0x10)) {
1480         return false;
1481     }
1482 
1483     if (a->vm & 1) {
1484         return false;
1485     }
1486 
1487     if (!vfp_access_check(s)) {
1488         return true;
1489     }
1490 
1491     /*
1492      * This is always a right shift, and the shiftfn is always a
1493      * left-shift helper, which thus needs the negated shift count
1494      * duplicated into each lane of the immediate value.
1495      */
1496     if (a->size == 1) {
1497         imm = (uint16_t)(-a->shift);
1498         imm |= imm << 16;
1499     } else {
1500         /* size == 2 */
1501         imm = -a->shift;
1502     }
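         /*
          * Worked example: size == 1, shift == 5 gives imm = 0xfffbfffb,
          * i.e. -5 in each 16-bit lane, so the left-shift helper ends up
          * shifting every lane right by 5.
          */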
1503     constimm = tcg_constant_i32(imm);
1504 
1505     /* Load all inputs first to avoid potential overwrite */
1506     rm1 = tcg_temp_new_i32();
1507     rm2 = tcg_temp_new_i32();
1508     rm3 = tcg_temp_new_i32();
1509     rm4 = tcg_temp_new_i32();
1510     read_neon_element32(rm1, a->vm, 0, MO_32);
1511     read_neon_element32(rm2, a->vm, 1, MO_32);
1512     read_neon_element32(rm3, a->vm, 2, MO_32);
1513     read_neon_element32(rm4, a->vm, 3, MO_32);
1514     rtmp = tcg_temp_new_i64();
1515 
1516     shiftfn(rm1, rm1, constimm);
1517     shiftfn(rm2, rm2, constimm);
1518 
1519     tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1520 
1521     narrowfn(rm1, cpu_env, rtmp);
1522     write_neon_element32(rm1, a->vd, 0, MO_32);
1523 
1524     shiftfn(rm3, rm3, constimm);
1525     shiftfn(rm4, rm4, constimm);
1526 
1527     tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1528 
1529     narrowfn(rm3, cpu_env, rtmp);
1530     write_neon_element32(rm3, a->vd, 1, MO_32);
1531     return true;
1532 }
1533 
1534 #define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
1535     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1536     {                                                                   \
1537         return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
1538     }
1539 #define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
1540     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1541     {                                                                   \
1542         return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
1543     }
1544 
1545 static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1546 {
1547     tcg_gen_extrl_i64_i32(dest, src);
1548 }
1549 
1550 static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1551 {
1552     gen_helper_neon_narrow_u16(dest, src);
1553 }
1554 
1555 static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1556 {
1557     gen_helper_neon_narrow_u8(dest, src);
1558 }
1559 
1560 DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
1561 DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
1562 DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)
1563 
1564 DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
1565 DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
1566 DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)
1567 
1568 DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
1569 DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
1570 DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)
1571 
1572 DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
1573 DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
1574 DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
1575 DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
1576 DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
1577 DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)
1578 
1579 DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
1580 DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
1581 DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)
1582 
1583 DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
1584 DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
1585 DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
1586 
1587 DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
1588 DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
1589 DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1590 
1591 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1592                          NeonGenWidenFn *widenfn, bool u)
1593 {
1594     TCGv_i64 tmp;
1595     TCGv_i32 rm0, rm1;
1596     uint64_t widen_mask = 0;
1597 
1598     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1599         return false;
1600     }
1601 
1602     /* UNDEF accesses to D16-D31 if they don't exist. */
1603     if (!dc_isar_feature(aa32_simd_r32, s) &&
1604         ((a->vd | a->vm) & 0x10)) {
1605         return false;
1606     }
1607 
1608     if (a->vd & 1) {
1609         return false;
1610     }
1611 
1612     if (!vfp_access_check(s)) {
1613         return true;
1614     }
1615 
1616     /*
1617      * This is a widen-and-shift operation. The shift is always less
1618      * than the width of the source type, so after widening the input
1619      * vector we can simply shift the whole 64-bit widened register,
1620      * and then clear the potential overflow bits resulting from left
1621      * bits of the narrow input appearing as right bits of the left
1622      * neighbour narrow input. Calculate a mask of bits to clear.
1623      */
1624     if ((a->shift != 0) && (a->size < 2 || u)) {
1625         int esize = 8 << a->size;
1626         widen_mask = MAKE_64BIT_MASK(0, esize);
1627         widen_mask >>= esize - a->shift;
1628         widen_mask = dup_const(a->size + 1, widen_mask);
1629     }
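         /*
          * Worked example: size == 0 (8-bit lanes), shift == 3 gives
          * esize = 8, widen_mask = 0xff >> 5 = 0x07, duplicated to
          * 0x0007000700070007: the low 3 bits of each 16-bit lane, which
          * is where the top bits of the right-hand neighbour land after
          * the shift.
          */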
1630 
1631     rm0 = tcg_temp_new_i32();
1632     rm1 = tcg_temp_new_i32();
1633     read_neon_element32(rm0, a->vm, 0, MO_32);
1634     read_neon_element32(rm1, a->vm, 1, MO_32);
1635     tmp = tcg_temp_new_i64();
1636 
1637     widenfn(tmp, rm0);
1638     if (a->shift != 0) {
1639         tcg_gen_shli_i64(tmp, tmp, a->shift);
1640         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1641     }
1642     write_neon_element64(tmp, a->vd, 0, MO_64);
1643 
1644     widenfn(tmp, rm1);
1645     if (a->shift != 0) {
1646         tcg_gen_shli_i64(tmp, tmp, a->shift);
1647         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1648     }
1649     write_neon_element64(tmp, a->vd, 1, MO_64);
1650     return true;
1651 }
1652 
1653 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1654 {
1655     static NeonGenWidenFn * const widenfn[] = {
1656         gen_helper_neon_widen_s8,
1657         gen_helper_neon_widen_s16,
1658         tcg_gen_ext_i32_i64,
1659     };
1660     return do_vshll_2sh(s, a, widenfn[a->size], false);
1661 }
1662 
1663 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1664 {
1665     static NeonGenWidenFn * const widenfn[] = {
1666         gen_helper_neon_widen_u8,
1667         gen_helper_neon_widen_u16,
1668         tcg_gen_extu_i32_i64,
1669     };
1670     return do_vshll_2sh(s, a, widenfn[a->size], true);
1671 }
1672 
1673 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1674                       gen_helper_gvec_2_ptr *fn)
1675 {
1676     /* FP operations in 2-reg-and-shift group */
1677     int vec_size = a->q ? 16 : 8;
1678     int rd_ofs = neon_full_reg_offset(a->vd);
1679     int rm_ofs = neon_full_reg_offset(a->vm);
1680     TCGv_ptr fpst;
1681 
1682     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1683         return false;
1684     }
1685 
1686     if (a->size == MO_16) {
1687         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1688             return false;
1689         }
1690     }
1691 
1692     /* UNDEF accesses to D16-D31 if they don't exist. */
1693     if (!dc_isar_feature(aa32_simd_r32, s) &&
1694         ((a->vd | a->vm) & 0x10)) {
1695         return false;
1696     }
1697 
1698     if ((a->vm | a->vd) & a->q) {
1699         return false;
1700     }
1701 
1702     if (!vfp_access_check(s)) {
1703         return true;
1704     }
1705 
1706     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1707     tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
1708     return true;
1709 }
1710 
1711 #define DO_FP_2SH(INSN, FUNC)                                           \
1712     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1713     {                                                                   \
1714         return do_fp_2sh(s, a, FUNC);                                   \
1715     }
1716 
1717 DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
1718 DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
1719 DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
1720 DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
1721 
1722 DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
1723 DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
1724 DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
1725 DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1726 
1727 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1728                         GVecGen2iFn *fn)
1729 {
1730     uint64_t imm;
1731     int reg_ofs, vec_size;
1732 
1733     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1734         return false;
1735     }
1736 
1737     /* UNDEF accesses to D16-D31 if they don't exist. */
1738     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1739         return false;
1740     }
1741 
1742     if (a->vd & a->q) {
1743         return false;
1744     }
1745 
1746     if (!vfp_access_check(s)) {
1747         return true;
1748     }
1749 
1750     reg_ofs = neon_full_reg_offset(a->vd);
1751     vec_size = a->q ? 16 : 8;
1752     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1753 
1754     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1755     return true;
1756 }
1757 
1758 static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1759                         int64_t c, uint32_t oprsz, uint32_t maxsz)
1760 {
1761     tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1762 }
1763 
1764 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1765 {
1766     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1767     GVecGen2iFn *fn;
1768 
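         /* Odd cmode values below 12 are the VORR (op=0) / VBIC (op=1) forms */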
1769     if ((a->cmode & 1) && a->cmode < 12) {
1770         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1771         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1772     } else {
1773         /* There is one unallocated cmode/op combination in this space */
1774         if (a->cmode == 15 && a->op == 1) {
1775             return false;
1776         }
1777         fn = gen_VMOV_1r;
1778     }
1779     return do_1reg_imm(s, a, fn);
1780 }
1781 
1782 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1783                            NeonGenWidenFn *widenfn,
1784                            NeonGenTwo64OpFn *opfn,
1785                            int src1_mop, int src2_mop)
1786 {
1787     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */
1788     TCGv_i64 rn0_64, rn1_64, rm_64;
1789 
1790     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1791         return false;
1792     }
1793 
1794     /* UNDEF accesses to D16-D31 if they don't exist. */
1795     if (!dc_isar_feature(aa32_simd_r32, s) &&
1796         ((a->vd | a->vn | a->vm) & 0x10)) {
1797         return false;
1798     }
1799 
1800     if (!opfn) {
1801         /* size == 3 case, which is an entirely different insn group */
1802         return false;
1803     }
1804 
1805     if ((a->vd & 1) || (src1_mop == MO_UQ && (a->vn & 1))) {
1806         return false;
1807     }
1808 
1809     if (!vfp_access_check(s)) {
1810         return true;
1811     }
1812 
1813     rn0_64 = tcg_temp_new_i64();
1814     rn1_64 = tcg_temp_new_i64();
1815     rm_64 = tcg_temp_new_i64();
1816 
1817     if (src1_mop >= 0) {
1818         read_neon_element64(rn0_64, a->vn, 0, src1_mop);
1819     } else {
1820         TCGv_i32 tmp = tcg_temp_new_i32();
1821         read_neon_element32(tmp, a->vn, 0, MO_32);
1822         widenfn(rn0_64, tmp);
1823     }
1824     if (src2_mop >= 0) {
1825         read_neon_element64(rm_64, a->vm, 0, src2_mop);
1826     } else {
1827         TCGv_i32 tmp = tcg_temp_new_i32();
1828         read_neon_element32(tmp, a->vm, 0, MO_32);
1829         widenfn(rm_64, tmp);
1830     }
1831 
1832     opfn(rn0_64, rn0_64, rm_64);
1833 
1834     /*
1835      * Load second pass inputs before storing the first pass result, to
1836      * avoid incorrect results if a narrow input overlaps with the result.
1837      */
1838     if (src1_mop >= 0) {
1839         read_neon_element64(rn1_64, a->vn, 1, src1_mop);
1840     } else {
1841         TCGv_i32 tmp = tcg_temp_new_i32();
1842         read_neon_element32(tmp, a->vn, 1, MO_32);
1843         widenfn(rn1_64, tmp);
1844     }
1845     if (src2_mop >= 0) {
1846         read_neon_element64(rm_64, a->vm, 1, src2_mop);
1847     } else {
1848         TCGv_i32 tmp = tcg_temp_new_i32();
1849         read_neon_element32(tmp, a->vm, 1, MO_32);
1850         widenfn(rm_64, tmp);
1851     }
1852 
1853     write_neon_element64(rn0_64, a->vd, 0, MO_64);
1854 
1855     opfn(rn1_64, rn1_64, rm_64);
1856     write_neon_element64(rn1_64, a->vd, 1, MO_64);
1857 
1858     return true;
1859 }
1860 
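     /*
      * SRC1WIDE selects the VADDW/VSUBW forms, whose first operand is already
      * wide (MO_UQ).  When the narrow elements are 32 bits they can be loaded
      * directly with the right extension (MO_32 | SIGN), so narrow_mop is used
      * instead of a widenfn; -1 means "widen via widenfn".
      */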
1861 #define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN)                        \
1862     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1863     {                                                                   \
1864         static NeonGenWidenFn * const widenfn[] = {                     \
1865             gen_helper_neon_widen_##S##8,                               \
1866             gen_helper_neon_widen_##S##16,                              \
1867             NULL, NULL,                                                 \
1868         };                                                              \
1869         static NeonGenTwo64OpFn * const addfn[] = {                     \
1870             gen_helper_neon_##OP##l_u16,                                \
1871             gen_helper_neon_##OP##l_u32,                                \
1872             tcg_gen_##OP##_i64,                                         \
1873             NULL,                                                       \
1874         };                                                              \
1875         int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1;          \
1876         return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size],   \
1877                               SRC1WIDE ? MO_UQ : narrow_mop,            \
1878                               narrow_mop);                              \
1879     }
1880 
1881 DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
1882 DO_PREWIDEN(VADDL_U, u, add, false, 0)
1883 DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
1884 DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
1885 DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
1886 DO_PREWIDEN(VADDW_U, u, add, true, 0)
1887 DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
1888 DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
1889 
1890 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
1891                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
1892 {
1893     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
1894     TCGv_i64 rn_64, rm_64;
1895     TCGv_i32 rd0, rd1;
1896 
1897     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1898         return false;
1899     }
1900 
1901     /* UNDEF accesses to D16-D31 if they don't exist. */
1902     if (!dc_isar_feature(aa32_simd_r32, s) &&
1903         ((a->vd | a->vn | a->vm) & 0x10)) {
1904         return false;
1905     }
1906 
1907     if (!opfn || !narrowfn) {
1908         /* size == 3 case, which is an entirely different insn group */
1909         return false;
1910     }
1911 
1912     if ((a->vn | a->vm) & 1) {
1913         return false;
1914     }
1915 
1916     if (!vfp_access_check(s)) {
1917         return true;
1918     }
1919 
1920     rn_64 = tcg_temp_new_i64();
1921     rm_64 = tcg_temp_new_i64();
1922     rd0 = tcg_temp_new_i32();
1923     rd1 = tcg_temp_new_i32();
1924 
1925     read_neon_element64(rn_64, a->vn, 0, MO_64);
1926     read_neon_element64(rm_64, a->vm, 0, MO_64);
1927 
1928     opfn(rn_64, rn_64, rm_64);
1929 
1930     narrowfn(rd0, rn_64);
1931 
1932     read_neon_element64(rn_64, a->vn, 1, MO_64);
1933     read_neon_element64(rm_64, a->vm, 1, MO_64);
1934 
1935     opfn(rn_64, rn_64, rm_64);
1936 
1937     narrowfn(rd1, rn_64);
1938 
1939     write_neon_element32(rd0, a->vd, 0, MO_32);
1940     write_neon_element32(rd1, a->vd, 1, MO_32);
1941 
1942     return true;
1943 }
1944 
1945 #define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
1946     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1947     {                                                                   \
1948         static NeonGenTwo64OpFn * const addfn[] = {                     \
1949             gen_helper_neon_##OP##l_u16,                                \
1950             gen_helper_neon_##OP##l_u32,                                \
1951             tcg_gen_##OP##_i64,                                         \
1952             NULL,                                                       \
1953         };                                                              \
1954         static NeonGenNarrowFn * const narrowfn[] = {                   \
1955             gen_helper_neon_##NARROWTYPE##_high_u8,                     \
1956             gen_helper_neon_##NARROWTYPE##_high_u16,                    \
1957             EXTOP,                                                      \
1958             NULL,                                                       \
1959         };                                                              \
1960         return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]);   \
1961     }
1962 
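     /*
      * Round to nearest: add half the weight of the discarded low 32 bits
      * before taking the high half.
      */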
1963 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
1964 {
1965     tcg_gen_addi_i64(rn, rn, 1u << 31);
1966     tcg_gen_extrh_i64_i32(rd, rn);
1967 }
1968 
1969 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
1970 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
1971 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
1972 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
1973 
1974 static bool do_long_3d(DisasContext *s, arg_3diff *a,
1975                        NeonGenTwoOpWidenFn *opfn,
1976                        NeonGenTwo64OpFn *accfn)
1977 {
1978     /*
1979      * 3-regs different lengths, long operations.
1980      * These perform an operation on two inputs that returns a double-width
1981      * result, and then possibly perform an accumulation operation of
1982      * that result into the double-width destination.
1983      */
1984     TCGv_i64 rd0, rd1, tmp;
1985     TCGv_i32 rn, rm;
1986 
1987     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1988         return false;
1989     }
1990 
1991     /* UNDEF accesses to D16-D31 if they don't exist. */
1992     if (!dc_isar_feature(aa32_simd_r32, s) &&
1993         ((a->vd | a->vn | a->vm) & 0x10)) {
1994         return false;
1995     }
1996 
1997     if (!opfn) {
1998         /* size == 3 case, which is an entirely different insn group */
1999         return false;
2000     }
2001 
2002     if (a->vd & 1) {
2003         return false;
2004     }
2005 
2006     if (!vfp_access_check(s)) {
2007         return true;
2008     }
2009 
2010     rd0 = tcg_temp_new_i64();
2011     rd1 = tcg_temp_new_i64();
2012 
2013     rn = tcg_temp_new_i32();
2014     rm = tcg_temp_new_i32();
2015     read_neon_element32(rn, a->vn, 0, MO_32);
2016     read_neon_element32(rm, a->vm, 0, MO_32);
2017     opfn(rd0, rn, rm);
2018 
2019     read_neon_element32(rn, a->vn, 1, MO_32);
2020     read_neon_element32(rm, a->vm, 1, MO_32);
2021     opfn(rd1, rn, rm);
2022 
2023     /* Don't store results until after all loads: they might overlap */
2024     if (accfn) {
2025         tmp = tcg_temp_new_i64();
2026         read_neon_element64(tmp, a->vd, 0, MO_64);
2027         accfn(rd0, tmp, rd0);
2028         read_neon_element64(tmp, a->vd, 1, MO_64);
2029         accfn(rd1, tmp, rd1);
2030     }
2031 
2032     write_neon_element64(rd0, a->vd, 0, MO_64);
2033     write_neon_element64(rd1, a->vd, 1, MO_64);
2034 
2035     return true;
2036 }
2037 
2038 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2039 {
2040     static NeonGenTwoOpWidenFn * const opfn[] = {
2041         gen_helper_neon_abdl_s16,
2042         gen_helper_neon_abdl_s32,
2043         gen_helper_neon_abdl_s64,
2044         NULL,
2045     };
2046 
2047     return do_long_3d(s, a, opfn[a->size], NULL);
2048 }
2049 
2050 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2051 {
2052     static NeonGenTwoOpWidenFn * const opfn[] = {
2053         gen_helper_neon_abdl_u16,
2054         gen_helper_neon_abdl_u32,
2055         gen_helper_neon_abdl_u64,
2056         NULL,
2057     };
2058 
2059     return do_long_3d(s, a, opfn[a->size], NULL);
2060 }
2061 
2062 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2063 {
2064     static NeonGenTwoOpWidenFn * const opfn[] = {
2065         gen_helper_neon_abdl_s16,
2066         gen_helper_neon_abdl_s32,
2067         gen_helper_neon_abdl_s64,
2068         NULL,
2069     };
2070     static NeonGenTwo64OpFn * const addfn[] = {
2071         gen_helper_neon_addl_u16,
2072         gen_helper_neon_addl_u32,
2073         tcg_gen_add_i64,
2074         NULL,
2075     };
2076 
2077     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2078 }
2079 
2080 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2081 {
2082     static NeonGenTwoOpWidenFn * const opfn[] = {
2083         gen_helper_neon_abdl_u16,
2084         gen_helper_neon_abdl_u32,
2085         gen_helper_neon_abdl_u64,
2086         NULL,
2087     };
2088     static NeonGenTwo64OpFn * const addfn[] = {
2089         gen_helper_neon_addl_u16,
2090         gen_helper_neon_addl_u32,
2091         tcg_gen_add_i64,
2092         NULL,
2093     };
2094 
2095     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2096 }
2097 
2098 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2099 {
2100     TCGv_i32 lo = tcg_temp_new_i32();
2101     TCGv_i32 hi = tcg_temp_new_i32();
2102 
2103     tcg_gen_muls2_i32(lo, hi, rn, rm);
2104     tcg_gen_concat_i32_i64(rd, lo, hi);
2105 }
2106 
2107 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2108 {
2109     TCGv_i32 lo = tcg_temp_new_i32();
2110     TCGv_i32 hi = tcg_temp_new_i32();
2111 
2112     tcg_gen_mulu2_i32(lo, hi, rn, rm);
2113     tcg_gen_concat_i32_i64(rd, lo, hi);
2114 }
2115 
2116 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2117 {
2118     static NeonGenTwoOpWidenFn * const opfn[] = {
2119         gen_helper_neon_mull_s8,
2120         gen_helper_neon_mull_s16,
2121         gen_mull_s32,
2122         NULL,
2123     };
2124 
2125     return do_long_3d(s, a, opfn[a->size], NULL);
2126 }
2127 
2128 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2129 {
2130     static NeonGenTwoOpWidenFn * const opfn[] = {
2131         gen_helper_neon_mull_u8,
2132         gen_helper_neon_mull_u16,
2133         gen_mull_u32,
2134         NULL,
2135     };
2136 
2137     return do_long_3d(s, a, opfn[a->size], NULL);
2138 }
2139 
2140 #define DO_VMLAL(INSN, MULL, ACC)                                       \
2141     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2142     {                                                                   \
2143         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2144             gen_helper_neon_##MULL##8,                                  \
2145             gen_helper_neon_##MULL##16,                                 \
2146             gen_##MULL##32,                                             \
2147             NULL,                                                       \
2148         };                                                              \
2149         static NeonGenTwo64OpFn * const accfn[] = {                     \
2150             gen_helper_neon_##ACC##l_u16,                               \
2151             gen_helper_neon_##ACC##l_u32,                               \
2152             tcg_gen_##ACC##_i64,                                        \
2153             NULL,                                                       \
2154         };                                                              \
2155         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2156     }
2157 
2158 DO_VMLAL(VMLAL_S, mull_s, add)
2159 DO_VMLAL(VMLAL_U, mull_u, add)
2160 DO_VMLAL(VMLSL_S, mull_s, sub)
2161 DO_VMLAL(VMLSL_U, mull_u, sub)
2162 
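     /*
      * VQDMULL is 2 * (rn * rm) with saturation: do the widening multiply,
      * then double the product via a saturating add of it to itself (which
      * also sets QC if the doubling overflows).
      */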
2163 static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2164 {
2165     gen_helper_neon_mull_s16(rd, rn, rm);
2166     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
2167 }
2168 
2169 static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2170 {
2171     gen_mull_s32(rd, rn, rm);
2172     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
2173 }
2174 
2175 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2176 {
2177     static NeonGenTwoOpWidenFn * const opfn[] = {
2178         NULL,
2179         gen_VQDMULL_16,
2180         gen_VQDMULL_32,
2181         NULL,
2182     };
2183 
2184     return do_long_3d(s, a, opfn[a->size], NULL);
2185 }
2186 
2187 static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2188 {
2189     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2190 }
2191 
2192 static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2193 {
2194     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2195 }
2196 
2197 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2198 {
2199     static NeonGenTwoOpWidenFn * const opfn[] = {
2200         NULL,
2201         gen_VQDMULL_16,
2202         gen_VQDMULL_32,
2203         NULL,
2204     };
2205     static NeonGenTwo64OpFn * const accfn[] = {
2206         NULL,
2207         gen_VQDMLAL_acc_16,
2208         gen_VQDMLAL_acc_32,
2209         NULL,
2210     };
2211 
2212     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2213 }
2214 
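     /*
      * VQDMLSL accumulates the negated product: negate the doubled product
      * (per 32-bit lane for the 16-bit insn) and reuse the saturating-add
      * helpers.
      */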
2215 static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2216 {
2217     gen_helper_neon_negl_u32(rm, rm);
2218     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2219 }
2220 
2221 static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2222 {
2223     tcg_gen_neg_i64(rm, rm);
2224     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2225 }
2226 
2227 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2228 {
2229     static NeonGenTwoOpWidenFn * const opfn[] = {
2230         NULL,
2231         gen_VQDMULL_16,
2232         gen_VQDMULL_32,
2233         NULL,
2234     };
2235     static NeonGenTwo64OpFn * const accfn[] = {
2236         NULL,
2237         gen_VQDMLSL_acc_16,
2238         gen_VQDMLSL_acc_32,
2239         NULL,
2240     };
2241 
2242     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2243 }
2244 
2245 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2246 {
2247     gen_helper_gvec_3 *fn_gvec;
2248 
2249     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2250         return false;
2251     }
2252 
2253     /* UNDEF accesses to D16-D31 if they don't exist. */
2254     if (!dc_isar_feature(aa32_simd_r32, s) &&
2255         ((a->vd | a->vn | a->vm) & 0x10)) {
2256         return false;
2257     }
2258 
2259     if (a->vd & 1) {
2260         return false;
2261     }
2262 
2263     switch (a->size) {
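         /* size 0 is VMULL.P8 (8x8->16); size 2 is VMULL.P64 (64x64->128) */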
2264     case 0:
2265         fn_gvec = gen_helper_neon_pmull_h;
2266         break;
2267     case 2:
2268         if (!dc_isar_feature(aa32_pmull, s)) {
2269             return false;
2270         }
2271         fn_gvec = gen_helper_gvec_pmull_q;
2272         break;
2273     default:
2274         return false;
2275     }
2276 
2277     if (!vfp_access_check(s)) {
2278         return true;
2279     }
2280 
2281     tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
2282                        neon_full_reg_offset(a->vn),
2283                        neon_full_reg_offset(a->vm),
2284                        16, 16, 0, fn_gvec);
2285     return true;
2286 }
2287 
2288 static void gen_neon_dup_low16(TCGv_i32 var)
2289 {
2290     TCGv_i32 tmp = tcg_temp_new_i32();
2291     tcg_gen_ext16u_i32(var, var);
2292     tcg_gen_shli_i32(tmp, var, 16);
2293     tcg_gen_or_i32(var, var, tmp);
2294 }
2295 
2296 static void gen_neon_dup_high16(TCGv_i32 var)
2297 {
2298     TCGv_i32 tmp = tcg_temp_new_i32();
2299     tcg_gen_andi_i32(var, var, 0xffff0000);
2300     tcg_gen_shri_i32(tmp, var, 16);
2301     tcg_gen_or_i32(var, var, tmp);
2302 }
2303 
2304 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2305 {
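         /*
          * reg is the M:Vm scalar encoding: for a 16-bit scalar, bits [2:0]
          * are the D register, M selects the 32-bit word and bit 3 selects
          * the half of that word; for a 32-bit scalar, bits [3:0] are the
          * register and M is the element index.
          */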
2306     TCGv_i32 tmp = tcg_temp_new_i32();
2307     if (size == MO_16) {
2308         read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
2309         if (reg & 8) {
2310             gen_neon_dup_high16(tmp);
2311         } else {
2312             gen_neon_dup_low16(tmp);
2313         }
2314     } else {
2315         read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
2316     }
2317     return tmp;
2318 }
2319 
2320 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2321                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2322 {
2323     /*
2324      * Two registers and a scalar: perform an operation between
2325      * the input elements and the scalar, and then possibly
2326      * perform an accumulation operation of that result into the
2327      * destination.
2328      */
2329     TCGv_i32 scalar, tmp;
2330     int pass;
2331 
2332     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2333         return false;
2334     }
2335 
2336     /* UNDEF accesses to D16-D31 if they don't exist. */
2337     if (!dc_isar_feature(aa32_simd_r32, s) &&
2338         ((a->vd | a->vn | a->vm) & 0x10)) {
2339         return false;
2340     }
2341 
2342     if (!opfn) {
2343         /* Bad size (including size == 3, which is a different insn group) */
2344         return false;
2345     }
2346 
2347     if (a->q && ((a->vd | a->vn) & 1)) {
2348         return false;
2349     }
2350 
2351     if (!vfp_access_check(s)) {
2352         return true;
2353     }
2354 
2355     scalar = neon_get_scalar(a->size, a->vm);
2356     tmp = tcg_temp_new_i32();
2357 
2358     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2359         read_neon_element32(tmp, a->vn, pass, MO_32);
2360         opfn(tmp, tmp, scalar);
2361         if (accfn) {
2362             TCGv_i32 rd = tcg_temp_new_i32();
2363             read_neon_element32(rd, a->vd, pass, MO_32);
2364             accfn(tmp, rd, tmp);
2365         }
2366         write_neon_element32(tmp, a->vd, pass, MO_32);
2367     }
2368     return true;
2369 }
2370 
2371 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2372 {
2373     static NeonGenTwoOpFn * const opfn[] = {
2374         NULL,
2375         gen_helper_neon_mul_u16,
2376         tcg_gen_mul_i32,
2377         NULL,
2378     };
2379 
2380     return do_2scalar(s, a, opfn[a->size], NULL);
2381 }
2382 
2383 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2384 {
2385     static NeonGenTwoOpFn * const opfn[] = {
2386         NULL,
2387         gen_helper_neon_mul_u16,
2388         tcg_gen_mul_i32,
2389         NULL,
2390     };
2391     static NeonGenTwoOpFn * const accfn[] = {
2392         NULL,
2393         gen_helper_neon_add_u16,
2394         tcg_gen_add_i32,
2395         NULL,
2396     };
2397 
2398     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2399 }
2400 
2401 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2402 {
2403     static NeonGenTwoOpFn * const opfn[] = {
2404         NULL,
2405         gen_helper_neon_mul_u16,
2406         tcg_gen_mul_i32,
2407         NULL,
2408     };
2409     static NeonGenTwoOpFn * const accfn[] = {
2410         NULL,
2411         gen_helper_neon_sub_u16,
2412         tcg_gen_sub_i32,
2413         NULL,
2414     };
2415 
2416     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2417 }
2418 
2419 static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
2420                               gen_helper_gvec_3_ptr *fn)
2421 {
2422     /* Two registers and a scalar, using gvec */
2423     int vec_size = a->q ? 16 : 8;
2424     int rd_ofs = neon_full_reg_offset(a->vd);
2425     int rn_ofs = neon_full_reg_offset(a->vn);
2426     int rm_ofs;
2427     int idx;
2428     TCGv_ptr fpstatus;
2429 
2430     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2431         return false;
2432     }
2433 
2434     /* UNDEF accesses to D16-D31 if they don't exist. */
2435     if (!dc_isar_feature(aa32_simd_r32, s) &&
2436         ((a->vd | a->vn | a->vm) & 0x10)) {
2437         return false;
2438     }
2439 
2440     if (!fn) {
2441         /* Bad size (including size == 3, which is a different insn group) */
2442         return false;
2443     }
2444 
2445     if (a->q && ((a->vd | a->vn) & 1)) {
2446         return false;
2447     }
2448 
2449     if (!vfp_access_check(s)) {
2450         return true;
2451     }
2452 
2453     /* a->vm is M:Vm, which encodes both register and index */
2454     idx = extract32(a->vm, a->size + 2, 2);
2455     a->vm = extract32(a->vm, 0, a->size + 2);
2456     rm_ofs = neon_full_reg_offset(a->vm);
2457 
2458     fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
2459     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
2460                        vec_size, vec_size, idx, fn);
2461     return true;
2462 }
2463 
2464 #define DO_VMUL_F_2sc(NAME, FUNC)                                       \
2465     static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a)   \
2466     {                                                                   \
2467         static gen_helper_gvec_3_ptr * const opfn[] = {                 \
2468             NULL,                                                       \
2469             gen_helper_##FUNC##_h,                                      \
2470             gen_helper_##FUNC##_s,                                      \
2471             NULL,                                                       \
2472         };                                                              \
2473         if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
2474             return false;                                               \
2475         }                                                               \
2476         return do_2scalar_fp_vec(s, a, opfn[a->size]);                  \
2477     }
2478 
2479 DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
2480 DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
2481 DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
2482 
2483 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2484 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2485 WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2486 WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2487 
2488 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2489 {
2490     static NeonGenTwoOpFn * const opfn[] = {
2491         NULL,
2492         gen_VQDMULH_16,
2493         gen_VQDMULH_32,
2494         NULL,
2495     };
2496 
2497     return do_2scalar(s, a, opfn[a->size], NULL);
2498 }
2499 
2500 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2501 {
2502     static NeonGenTwoOpFn * const opfn[] = {
2503         NULL,
2504         gen_VQRDMULH_16,
2505         gen_VQRDMULH_32,
2506         NULL,
2507     };
2508 
2509     return do_2scalar(s, a, opfn[a->size], NULL);
2510 }
2511 
2512 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2513                             NeonGenThreeOpEnvFn *opfn)
2514 {
2515     /*
2516      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2517      * performs a kind of fused op-then-accumulate using a helper
2518      * function that takes all of rd, rn and the scalar at once.
2519      */
2520     TCGv_i32 scalar, rn, rd;
2521     int pass;
2522 
2523     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2524         return false;
2525     }
2526 
2527     if (!dc_isar_feature(aa32_rdm, s)) {
2528         return false;
2529     }
2530 
2531     /* UNDEF accesses to D16-D31 if they don't exist. */
2532     if (!dc_isar_feature(aa32_simd_r32, s) &&
2533         ((a->vd | a->vn | a->vm) & 0x10)) {
2534         return false;
2535     }
2536 
2537     if (!opfn) {
2538         /* Bad size (including size == 3, which is a different insn group) */
2539         return false;
2540     }
2541 
2542     if (a->q && ((a->vd | a->vn) & 1)) {
2543         return false;
2544     }
2545 
2546     if (!vfp_access_check(s)) {
2547         return true;
2548     }
2549 
2550     scalar = neon_get_scalar(a->size, a->vm);
2551     rn = tcg_temp_new_i32();
2552     rd = tcg_temp_new_i32();
2553 
2554     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2555         read_neon_element32(rn, a->vn, pass, MO_32);
2556         read_neon_element32(rd, a->vd, pass, MO_32);
2557         opfn(rd, cpu_env, rn, scalar, rd);
2558         write_neon_element32(rd, a->vd, pass, MO_32);
2559     }
2560     return true;
2561 }
2562 
2563 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2564 {
2565     static NeonGenThreeOpEnvFn * const opfn[] = {
2566         NULL,
2567         gen_helper_neon_qrdmlah_s16,
2568         gen_helper_neon_qrdmlah_s32,
2569         NULL,
2570     };
2571     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2572 }
2573 
2574 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2575 {
2576     static NeonGenThreeOpEnvFn * const opfn[] = {
2577         NULL,
2578         gen_helper_neon_qrdmlsh_s16,
2579         gen_helper_neon_qrdmlsh_s32,
2580         NULL,
2581     };
2582     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2583 }
2584 
2585 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2586                             NeonGenTwoOpWidenFn *opfn,
2587                             NeonGenTwo64OpFn *accfn)
2588 {
2589     /*
2590      * Two registers and a scalar, long operations: perform an
2591      * operation on the input elements and the scalar which produces
2592      * a double-width result, and then possibly perform an accumulation
2593      * operation of that result into the destination.
2594      */
2595     TCGv_i32 scalar, rn;
2596     TCGv_i64 rn0_64, rn1_64;
2597 
2598     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2599         return false;
2600     }
2601 
2602     /* UNDEF accesses to D16-D31 if they don't exist. */
2603     if (!dc_isar_feature(aa32_simd_r32, s) &&
2604         ((a->vd | a->vn | a->vm) & 0x10)) {
2605         return false;
2606     }
2607 
2608     if (!opfn) {
2609         /* Bad size (including size == 3, which is a different insn group) */
2610         return false;
2611     }
2612 
2613     if (a->vd & 1) {
2614         return false;
2615     }
2616 
2617     if (!vfp_access_check(s)) {
2618         return true;
2619     }
2620 
2621     scalar = neon_get_scalar(a->size, a->vm);
2622 
2623     /* Load all inputs before writing any outputs, in case of overlap */
2624     rn = tcg_temp_new_i32();
2625     read_neon_element32(rn, a->vn, 0, MO_32);
2626     rn0_64 = tcg_temp_new_i64();
2627     opfn(rn0_64, rn, scalar);
2628 
2629     read_neon_element32(rn, a->vn, 1, MO_32);
2630     rn1_64 = tcg_temp_new_i64();
2631     opfn(rn1_64, rn, scalar);
2632 
2633     if (accfn) {
2634         TCGv_i64 t64 = tcg_temp_new_i64();
2635         read_neon_element64(t64, a->vd, 0, MO_64);
2636         accfn(rn0_64, t64, rn0_64);
2637         read_neon_element64(t64, a->vd, 1, MO_64);
2638         accfn(rn1_64, t64, rn1_64);
2639     }
2640 
2641     write_neon_element64(rn0_64, a->vd, 0, MO_64);
2642     write_neon_element64(rn1_64, a->vd, 1, MO_64);
2643     return true;
2644 }
2645 
2646 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2647 {
2648     static NeonGenTwoOpWidenFn * const opfn[] = {
2649         NULL,
2650         gen_helper_neon_mull_s16,
2651         gen_mull_s32,
2652         NULL,
2653     };
2654 
2655     return do_2scalar_long(s, a, opfn[a->size], NULL);
2656 }
2657 
2658 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2659 {
2660     static NeonGenTwoOpWidenFn * const opfn[] = {
2661         NULL,
2662         gen_helper_neon_mull_u16,
2663         gen_mull_u32,
2664         NULL,
2665     };
2666 
2667     return do_2scalar_long(s, a, opfn[a->size], NULL);
2668 }
2669 
2670 #define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2671     static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2672     {                                                                   \
2673         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2674             NULL,                                                       \
2675             gen_helper_neon_##MULL##16,                                 \
2676             gen_##MULL##32,                                             \
2677             NULL,                                                       \
2678         };                                                              \
2679         static NeonGenTwo64OpFn * const accfn[] = {                     \
2680             NULL,                                                       \
2681             gen_helper_neon_##ACC##l_u32,                               \
2682             tcg_gen_##ACC##_i64,                                        \
2683             NULL,                                                       \
2684         };                                                              \
2685         return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2686     }
2687 
2688 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2689 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2690 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2691 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2692 
2693 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2694 {
2695     static NeonGenTwoOpWidenFn * const opfn[] = {
2696         NULL,
2697         gen_VQDMULL_16,
2698         gen_VQDMULL_32,
2699         NULL,
2700     };
2701 
2702     return do_2scalar_long(s, a, opfn[a->size], NULL);
2703 }
2704 
2705 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2706 {
2707     static NeonGenTwoOpWidenFn * const opfn[] = {
2708         NULL,
2709         gen_VQDMULL_16,
2710         gen_VQDMULL_32,
2711         NULL,
2712     };
2713     static NeonGenTwo64OpFn * const accfn[] = {
2714         NULL,
2715         gen_VQDMLAL_acc_16,
2716         gen_VQDMLAL_acc_32,
2717         NULL,
2718     };
2719 
2720     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2721 }
2722 
2723 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2724 {
2725     static NeonGenTwoOpWidenFn * const opfn[] = {
2726         NULL,
2727         gen_VQDMULL_16,
2728         gen_VQDMULL_32,
2729         NULL,
2730     };
2731     static NeonGenTwo64OpFn * const accfn[] = {
2732         NULL,
2733         gen_VQDMLSL_acc_16,
2734         gen_VQDMLSL_acc_32,
2735         NULL,
2736     };
2737 
2738     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2739 }
2740 
2741 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2742 {
2743     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2744         return false;
2745     }
2746 
2747     /* UNDEF accesses to D16-D31 if they don't exist. */
2748     if (!dc_isar_feature(aa32_simd_r32, s) &&
2749         ((a->vd | a->vn | a->vm) & 0x10)) {
2750         return false;
2751     }
2752 
2753     if ((a->vn | a->vm | a->vd) & a->q) {
2754         return false;
2755     }
2756 
2757     if (a->imm > 7 && !a->q) {
2758         return false;
2759     }
2760 
2761     if (!vfp_access_check(s)) {
2762         return true;
2763     }
2764 
2765     if (!a->q) {
2766         /* Extract 64 bits from <Vm:Vn> */
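             /* i.e. dest = (Vm:Vn) >> (imm * 8), as one 128-bit extract */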
2767         TCGv_i64 left, right, dest;
2768 
2769         left = tcg_temp_new_i64();
2770         right = tcg_temp_new_i64();
2771         dest = tcg_temp_new_i64();
2772 
2773         read_neon_element64(right, a->vn, 0, MO_64);
2774         read_neon_element64(left, a->vm, 0, MO_64);
2775         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2776         write_neon_element64(dest, a->vd, 0, MO_64);
2777     } else {
2778         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2779         TCGv_i64 left, middle, right, destleft, destright;
2780 
2781         left = tcg_temp_new_i64();
2782         middle = tcg_temp_new_i64();
2783         right = tcg_temp_new_i64();
2784         destleft = tcg_temp_new_i64();
2785         destright = tcg_temp_new_i64();
2786 
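             /*
              * Each destination double-word is a 128-bit extract from the
              * pair of adjacent source double-words that straddle it.
              */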
2787         if (a->imm < 8) {
2788             read_neon_element64(right, a->vn, 0, MO_64);
2789             read_neon_element64(middle, a->vn, 1, MO_64);
2790             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2791             read_neon_element64(left, a->vm, 0, MO_64);
2792             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2793         } else {
2794             read_neon_element64(right, a->vn, 1, MO_64);
2795             read_neon_element64(middle, a->vm, 0, MO_64);
2796             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2797             read_neon_element64(left, a->vm, 1, MO_64);
2798             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2799         }
2800 
2801         write_neon_element64(destright, a->vd, 0, MO_64);
2802         write_neon_element64(destleft, a->vd, 1, MO_64);
2803     }
2804     return true;
2805 }
2806 
2807 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2808 {
2809     TCGv_i64 val, def;
2810     TCGv_i32 desc;
2811 
2812     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2813         return false;
2814     }
2815 
2816     /* UNDEF accesses to D16-D31 if they don't exist. */
2817     if (!dc_isar_feature(aa32_simd_r32, s) &&
2818         ((a->vd | a->vn | a->vm) & 0x10)) {
2819         return false;
2820     }
2821 
2822     if ((a->vn + a->len + 1) > 32) {
2823         /*
2824          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2825          * helper function running off the end of the register file.
2826          */
2827         return false;
2828     }
2829 
2830     if (!vfp_access_check(s)) {
2831         return true;
2832     }
2833 
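         /* The helper decodes the table base register and length from desc */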
2834     desc = tcg_constant_i32((a->vn << 2) | a->len);
2835     def = tcg_temp_new_i64();
2836     if (a->op) {
2837         read_neon_element64(def, a->vd, 0, MO_64);
2838     } else {
2839         tcg_gen_movi_i64(def, 0);
2840     }
2841     val = tcg_temp_new_i64();
2842     read_neon_element64(val, a->vm, 0, MO_64);
2843 
2844     gen_helper_neon_tbl(val, cpu_env, desc, val, def);
2845     write_neon_element64(val, a->vd, 0, MO_64);
2846     return true;
2847 }
2848 
2849 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
2850 {
2851     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2852         return false;
2853     }
2854 
2855     /* UNDEF accesses to D16-D31 if they don't exist. */
2856     if (!dc_isar_feature(aa32_simd_r32, s) &&
2857         ((a->vd | a->vm) & 0x10)) {
2858         return false;
2859     }
2860 
2861     if (a->vd & a->q) {
2862         return false;
2863     }
2864 
2865     if (!vfp_access_check(s)) {
2866         return true;
2867     }
2868 
2869     tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
2870                          neon_element_offset(a->vm, a->index, a->size),
2871                          a->q ? 16 : 8, a->q ? 16 : 8);
2872     return true;
2873 }
2874 
2875 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
2876 {
2877     int pass, half;
2878     TCGv_i32 tmp[2];
2879 
2880     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2881         return false;
2882     }
2883 
2884     /* UNDEF accesses to D16-D31 if they don't exist. */
2885     if (!dc_isar_feature(aa32_simd_r32, s) &&
2886         ((a->vd | a->vm) & 0x10)) {
2887         return false;
2888     }
2889 
2890     if ((a->vd | a->vm) & a->q) {
2891         return false;
2892     }
2893 
2894     if (a->size == 3) {
2895         return false;
2896     }
2897 
2898     if (!vfp_access_check(s)) {
2899         return true;
2900     }
2901 
2902     tmp[0] = tcg_temp_new_i32();
2903     tmp[1] = tcg_temp_new_i32();
2904 
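         /*
          * Reverse each 64-bit group by swapping its two 32-bit words
          * (tmp[1] is written to the even element) and, for the smaller
          * element sizes, also reversing within each word.
          */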
2905     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
2906         for (half = 0; half < 2; half++) {
2907             read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
2908             switch (a->size) {
2909             case 0:
2910                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
2911                 break;
2912             case 1:
2913                 gen_swap_half(tmp[half], tmp[half]);
2914                 break;
2915             case 2:
2916                 break;
2917             default:
2918                 g_assert_not_reached();
2919             }
2920         }
2921         write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
2922         write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
2923     }
2924     return true;
2925 }
2926 
2927 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
2928                               NeonGenWidenFn *widenfn,
2929                               NeonGenTwo64OpFn *opfn,
2930                               NeonGenTwo64OpFn *accfn)
2931 {
2932     /*
2933      * Pairwise long operations: widen both halves of the pair,
2934      * combine the pairs with the opfn, and then possibly accumulate
2935      * into the destination with the accfn.
2936      */
2937     int pass;
2938 
2939     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2940         return false;
2941     }
2942 
2943     /* UNDEF accesses to D16-D31 if they don't exist. */
2944     if (!dc_isar_feature(aa32_simd_r32, s) &&
2945         ((a->vd | a->vm) & 0x10)) {
2946         return false;
2947     }
2948 
2949     if ((a->vd | a->vm) & a->q) {
2950         return false;
2951     }
2952 
2953     if (!widenfn) {
2954         return false;
2955     }
2956 
2957     if (!vfp_access_check(s)) {
2958         return true;
2959     }
2960 
2961     for (pass = 0; pass < a->q + 1; pass++) {
2962         TCGv_i32 tmp;
2963         TCGv_i64 rm0_64, rm1_64, rd_64;
2964 
2965         rm0_64 = tcg_temp_new_i64();
2966         rm1_64 = tcg_temp_new_i64();
2967         rd_64 = tcg_temp_new_i64();
2968 
2969         tmp = tcg_temp_new_i32();
2970         read_neon_element32(tmp, a->vm, pass * 2, MO_32);
2971         widenfn(rm0_64, tmp);
2972         read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
2973         widenfn(rm1_64, tmp);
2974 
2975         opfn(rd_64, rm0_64, rm1_64);
2976 
2977         if (accfn) {
2978             TCGv_i64 tmp64 = tcg_temp_new_i64();
2979             read_neon_element64(tmp64, a->vd, pass, MO_64);
2980             accfn(rd_64, tmp64, rd_64);
2981         }
2982         write_neon_element64(rd_64, a->vd, pass, MO_64);
2983     }
2984     return true;
2985 }
2986 
2987 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
2988 {
2989     static NeonGenWidenFn * const widenfn[] = {
2990         gen_helper_neon_widen_s8,
2991         gen_helper_neon_widen_s16,
2992         tcg_gen_ext_i32_i64,
2993         NULL,
2994     };
2995     static NeonGenTwo64OpFn * const opfn[] = {
2996         gen_helper_neon_paddl_u16,
2997         gen_helper_neon_paddl_u32,
2998         tcg_gen_add_i64,
2999         NULL,
3000     };
3001 
3002     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3003 }
3004 
3005 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3006 {
3007     static NeonGenWidenFn * const widenfn[] = {
3008         gen_helper_neon_widen_u8,
3009         gen_helper_neon_widen_u16,
3010         tcg_gen_extu_i32_i64,
3011         NULL,
3012     };
3013     static NeonGenTwo64OpFn * const opfn[] = {
3014         gen_helper_neon_paddl_u16,
3015         gen_helper_neon_paddl_u32,
3016         tcg_gen_add_i64,
3017         NULL,
3018     };
3019 
3020     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3021 }
3022 
3023 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3024 {
3025     static NeonGenWidenFn * const widenfn[] = {
3026         gen_helper_neon_widen_s8,
3027         gen_helper_neon_widen_s16,
3028         tcg_gen_ext_i32_i64,
3029         NULL,
3030     };
3031     static NeonGenTwo64OpFn * const opfn[] = {
3032         gen_helper_neon_paddl_u16,
3033         gen_helper_neon_paddl_u32,
3034         tcg_gen_add_i64,
3035         NULL,
3036     };
3037     static NeonGenTwo64OpFn * const accfn[] = {
3038         gen_helper_neon_addl_u16,
3039         gen_helper_neon_addl_u32,
3040         tcg_gen_add_i64,
3041         NULL,
3042     };
3043 
3044     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3045                              accfn[a->size]);
3046 }
3047 
3048 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3049 {
3050     static NeonGenWidenFn * const widenfn[] = {
3051         gen_helper_neon_widen_u8,
3052         gen_helper_neon_widen_u16,
3053         tcg_gen_extu_i32_i64,
3054         NULL,
3055     };
3056     static NeonGenTwo64OpFn * const opfn[] = {
3057         gen_helper_neon_paddl_u16,
3058         gen_helper_neon_paddl_u32,
3059         tcg_gen_add_i64,
3060         NULL,
3061     };
3062     static NeonGenTwo64OpFn * const accfn[] = {
3063         gen_helper_neon_addl_u16,
3064         gen_helper_neon_addl_u32,
3065         tcg_gen_add_i64,
3066         NULL,
3067     };
3068 
3069     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3070                              accfn[a->size]);
3071 }
3072 
3073 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3074 
3075 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
3076                        ZipFn *fn)
3077 {
3078     TCGv_ptr pd, pm;
3079 
3080     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3081         return false;
3082     }
3083 
3084     /* UNDEF accesses to D16-D31 if they don't exist. */
3085     if (!dc_isar_feature(aa32_simd_r32, s) &&
3086         ((a->vd | a->vm) & 0x10)) {
3087         return false;
3088     }
3089 
3090     if ((a->vd | a->vm) & a->q) {
3091         return false;
3092     }
3093 
3094     if (!fn) {
3095         /* Bad size or size/q combination */
3096         return false;
3097     }
3098 
3099     if (!vfp_access_check(s)) {
3100         return true;
3101     }
3102 
3103     pd = vfp_reg_ptr(true, a->vd);
3104     pm = vfp_reg_ptr(true, a->vm);
3105     fn(pd, pm);
3106     return true;
3107 }
3108 
3109 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3110 {
3111     static ZipFn * const fn[2][4] = {
3112         {
3113             gen_helper_neon_unzip8,
3114             gen_helper_neon_unzip16,
3115             NULL,
3116             NULL,
3117         }, {
3118             gen_helper_neon_qunzip8,
3119             gen_helper_neon_qunzip16,
3120             gen_helper_neon_qunzip32,
3121             NULL,
3122         }
3123     };
3124     return do_zip_uzp(s, a, fn[a->q][a->size]);
3125 }
3126 
3127 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3128 {
3129     static ZipFn * const fn[2][4] = {
3130         {
3131             gen_helper_neon_zip8,
3132             gen_helper_neon_zip16,
3133             NULL,
3134             NULL,
3135         }, {
3136             gen_helper_neon_qzip8,
3137             gen_helper_neon_qzip16,
3138             gen_helper_neon_qzip32,
3139             NULL,
3140         }
3141     };
3142     return do_zip_uzp(s, a, fn[a->q][a->size]);
3143 }
3144 
3145 static bool do_vmovn(DisasContext *s, arg_2misc *a,
3146                      NeonGenNarrowEnvFn *narrowfn)
3147 {
3148     TCGv_i64 rm;
3149     TCGv_i32 rd0, rd1;
3150 
3151     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3152         return false;
3153     }
3154 
3155     /* UNDEF accesses to D16-D31 if they don't exist. */
3156     if (!dc_isar_feature(aa32_simd_r32, s) &&
3157         ((a->vd | a->vm) & 0x10)) {
3158         return false;
3159     }
3160 
3161     if (a->vm & 1) {
3162         return false;
3163     }
3164 
3165     if (!narrowfn) {
3166         return false;
3167     }
3168 
3169     if (!vfp_access_check(s)) {
3170         return true;
3171     }
3172 
3173     rm = tcg_temp_new_i64();
3174     rd0 = tcg_temp_new_i32();
3175     rd1 = tcg_temp_new_i32();
3176 
3177     read_neon_element64(rm, a->vm, 0, MO_64);
3178     narrowfn(rd0, cpu_env, rm);
3179     read_neon_element64(rm, a->vm, 1, MO_64);
3180     narrowfn(rd1, cpu_env, rm);
3181     write_neon_element32(rd0, a->vd, 0, MO_32);
3182     write_neon_element32(rd1, a->vd, 1, MO_32);
3183     return true;
3184 }
3185 
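/*
 * Expand the trans functions for the narrowing insns: FUNC##8,
 * FUNC##16 and FUNC##32 name the per-size narrowing helpers, and
 * the NULL entry makes do_vmovn() reject the invalid size == 3.
 */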
3186 #define DO_VMOVN(INSN, FUNC)                                    \
3187     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3188     {                                                           \
3189         static NeonGenNarrowEnvFn * const narrowfn[] = {        \
3190             FUNC##8,                                            \
3191             FUNC##16,                                           \
3192             FUNC##32,                                           \
3193             NULL,                                               \
3194         };                                                      \
3195         return do_vmovn(s, a, narrowfn[a->size]);               \
3196     }
3197 
3198 DO_VMOVN(VMOVN, gen_neon_narrow_u)
3199 DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
3200 DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
3201 DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
3202 
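/*
 * VSHLL in the 2-reg-misc encoding: widen each element of Dm and
 * shift it left by the element width (8 << size bits). Both source
 * words are read before Qd is written, so vd == vm is safe. Shifting
 * the whole 64-bit value at once is correct because the widened
 * lanes are zero-extended: the shift moves each lane's bits only
 * into its own zeroed top half, never into a neighbouring lane.
 */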
3203 static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
3204 {
3205     TCGv_i32 rm0, rm1;
3206     TCGv_i64 rd;
3207     static NeonGenWidenFn * const widenfns[] = {
3208         gen_helper_neon_widen_u8,
3209         gen_helper_neon_widen_u16,
3210         tcg_gen_extu_i32_i64,
3211         NULL,
3212     };
3213     NeonGenWidenFn *widenfn = widenfns[a->size];
3214 
3215     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3216         return false;
3217     }
3218 
3219     /* UNDEF accesses to D16-D31 if they don't exist. */
3220     if (!dc_isar_feature(aa32_simd_r32, s) &&
3221         ((a->vd | a->vm) & 0x10)) {
3222         return false;
3223     }
3224 
3225     if (a->vd & 1) {
3226         return false;
3227     }
3228 
3229     if (!widenfn) {
3230         return false;
3231     }
3232 
3233     if (!vfp_access_check(s)) {
3234         return true;
3235     }
3236 
3237     rd = tcg_temp_new_i64();
3238     rm0 = tcg_temp_new_i32();
3239     rm1 = tcg_temp_new_i32();
3240 
3241     read_neon_element32(rm0, a->vm, 0, MO_32);
3242     read_neon_element32(rm1, a->vm, 1, MO_32);
3243 
3244     widenfn(rd, rm0);
3245     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3246     write_neon_element64(rd, a->vd, 0, MO_64);
3247     widenfn(rd, rm1);
3248     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3249     write_neon_element64(rd, a->vd, 1, MO_64);
3250     return true;
3251 }
3252 
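/*
 * VCVT from single-precision to BFloat16: narrow the four f32
 * elements of Qm into four bfloat16 values packed into Dd. Each
 * call to the bfcvt_pair helper converts one 64-bit pair of floats,
 * and both pairs are read before the destination is written.
 */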
3253 static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a)
3254 {
3255     TCGv_ptr fpst;
3256     TCGv_i64 tmp;
3257     TCGv_i32 dst0, dst1;
3258 
3259     if (!dc_isar_feature(aa32_bf16, s)) {
3260         return false;
3261     }
3262 
3263     /* UNDEF accesses to D16-D31 if they don't exist. */
3264     if (!dc_isar_feature(aa32_simd_r32, s) &&
3265         ((a->vd | a->vm) & 0x10)) {
3266         return false;
3267     }
3268 
3269     if ((a->vm & 1) || (a->size != 1)) {
3270         return false;
3271     }
3272 
3273     if (!vfp_access_check(s)) {
3274         return true;
3275     }
3276 
3277     fpst = fpstatus_ptr(FPST_STD);
3278     tmp = tcg_temp_new_i64();
3279     dst0 = tcg_temp_new_i32();
3280     dst1 = tcg_temp_new_i32();
3281 
3282     read_neon_element64(tmp, a->vm, 0, MO_64);
3283     gen_helper_bfcvt_pair(dst0, tmp, fpst);
3284 
3285     read_neon_element64(tmp, a->vm, 1, MO_64);
3286     gen_helper_bfcvt_pair(dst1, tmp, fpst);
3287 
3288     write_neon_element32(dst0, a->vd, 0, MO_32);
3289     write_neon_element32(dst1, a->vd, 1, MO_32);
3290     return true;
3291 }
3292 
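/*
 * VCVT from f32 to f16: narrow the four single-precision elements
 * of Qm into four half-precision values packed into Dd. The reads
 * and writes are ordered so that all four source elements have been
 * read before Dd is written, keeping the insn correct when Dd
 * overlaps Qm.
 */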
3293 static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
3294 {
3295     TCGv_ptr fpst;
3296     TCGv_i32 ahp, tmp, tmp2, tmp3;
3297 
3298     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3299         !dc_isar_feature(aa32_fp16_spconv, s)) {
3300         return false;
3301     }
3302 
3303     /* UNDEF accesses to D16-D31 if they don't exist. */
3304     if (!dc_isar_feature(aa32_simd_r32, s) &&
3305         ((a->vd | a->vm) & 0x10)) {
3306         return false;
3307     }
3308 
3309     if ((a->vm & 1) || (a->size != 1)) {
3310         return false;
3311     }
3312 
3313     if (!vfp_access_check(s)) {
3314         return true;
3315     }
3316 
3317     fpst = fpstatus_ptr(FPST_STD);
3318     ahp = get_ahp_flag();
3319     tmp = tcg_temp_new_i32();
3320     read_neon_element32(tmp, a->vm, 0, MO_32);
3321     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3322     tmp2 = tcg_temp_new_i32();
3323     read_neon_element32(tmp2, a->vm, 1, MO_32);
3324     gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
3325     tcg_gen_shli_i32(tmp2, tmp2, 16);
3326     tcg_gen_or_i32(tmp2, tmp2, tmp);
3327     read_neon_element32(tmp, a->vm, 2, MO_32);
3328     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3329     tmp3 = tcg_temp_new_i32();
3330     read_neon_element32(tmp3, a->vm, 3, MO_32);
3331     write_neon_element32(tmp2, a->vd, 0, MO_32);
3332     gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
3333     tcg_gen_shli_i32(tmp3, tmp3, 16);
3334     tcg_gen_or_i32(tmp3, tmp3, tmp);
3335     write_neon_element32(tmp3, a->vd, 1, MO_32);
3336     return true;
3337 }
3338 
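/*
 * VCVT from f16 to f32: widen the four half-precision values packed
 * in Dm into the four single-precision elements of Qd. Both source
 * words are read up front, so the insn is correct even when Qd
 * overlaps Dm.
 */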
3339 static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
3340 {
3341     TCGv_ptr fpst;
3342     TCGv_i32 ahp, tmp, tmp2, tmp3;
3343 
3344     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3345         !dc_isar_feature(aa32_fp16_spconv, s)) {
3346         return false;
3347     }
3348 
3349     /* UNDEF accesses to D16-D31 if they don't exist. */
3350     if (!dc_isar_feature(aa32_simd_r32, s) &&
3351         ((a->vd | a->vm) & 0x10)) {
3352         return false;
3353     }
3354 
3355     if ((a->vd & 1) || (a->size != 1)) {
3356         return false;
3357     }
3358 
3359     if (!vfp_access_check(s)) {
3360         return true;
3361     }
3362 
3363     fpst = fpstatus_ptr(FPST_STD);
3364     ahp = get_ahp_flag();
3365     tmp3 = tcg_temp_new_i32();
3366     tmp2 = tcg_temp_new_i32();
3367     tmp = tcg_temp_new_i32();
3368     read_neon_element32(tmp, a->vm, 0, MO_32);
3369     read_neon_element32(tmp2, a->vm, 1, MO_32);
3370     tcg_gen_ext16u_i32(tmp3, tmp);
3371     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3372     write_neon_element32(tmp3, a->vd, 0, MO_32);
3373     tcg_gen_shri_i32(tmp, tmp, 16);
3374     gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
3375     write_neon_element32(tmp, a->vd, 1, MO_32);
3376     tcg_gen_ext16u_i32(tmp3, tmp2);
3377     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3378     write_neon_element32(tmp3, a->vd, 2, MO_32);
3379     tcg_gen_shri_i32(tmp2, tmp2, 16);
3380     gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
3381     write_neon_element32(tmp2, a->vd, 3, MO_32);
3382     return true;
3383 }
3384 
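/*
 * Common code for 2-reg-misc operations which expand to a single
 * gvec call: after the usual UNDEF checks, fn() processes the whole
 * vector at once, with oprsz and maxsz covering 8 bytes for D insns
 * and 16 bytes for Q insns.
 */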
3385 static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
3386 {
3387     int vec_size = a->q ? 16 : 8;
3388     int rd_ofs = neon_full_reg_offset(a->vd);
3389     int rm_ofs = neon_full_reg_offset(a->vm);
3390 
3391     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3392         return false;
3393     }
3394 
3395     /* UNDEF accesses to D16-D31 if they don't exist. */
3396     if (!dc_isar_feature(aa32_simd_r32, s) &&
3397         ((a->vd | a->vm) & 0x10)) {
3398         return false;
3399     }
3400 
3401     if (a->size == 3) {
3402         return false;
3403     }
3404 
3405     if ((a->vd | a->vm) & a->q) {
3406         return false;
3407     }
3408 
3409     if (!vfp_access_check(s)) {
3410         return true;
3411     }
3412 
3413     fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);
3414 
3415     return true;
3416 }
3417 
3418 #define DO_2MISC_VEC(INSN, FN)                                  \
3419     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3420     {                                                           \
3421         return do_2misc_vec(s, a, FN);                          \
3422     }
3423 
3424 DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
3425 DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
3426 DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
3427 DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
3428 DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
3429 DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
3430 DO_2MISC_VEC(VCLT0, gen_gvec_clt0)
3431 
3432 static bool trans_VMVN(DisasContext *s, arg_2misc *a)
3433 {
3434     if (a->size != 0) {
3435         return false;
3436     }
3437     return do_2misc_vec(s, a, tcg_gen_gvec_not);
3438 }
3439 
3440 #define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3441     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3442                          uint32_t rm_ofs, uint32_t oprsz,               \
3443                          uint32_t maxsz)                                \
3444     {                                                                   \
3445         tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
3446                            DATA, FUNC);                                 \
3447     }
3448 
3449 #define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3450     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3451                          uint32_t rm_ofs, uint32_t oprsz,               \
3452                          uint32_t maxsz)                                \
3453     {                                                                   \
3454         tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
3455     }
3456 
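/*
 * The DATA value distinguishes the decrypting variants, which share
 * a helper with their encrypting counterparts: AESD and AESIMC are
 * crypto_aese and crypto_aesmc with DATA set to 1.
 */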
3457 WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
3458 WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
3459 WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
3460 WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
3461 WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
3462 WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
3463 WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
3464 
3465 #define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
3466     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3467     {                                                           \
3468         if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) {  \
3469             return false;                                       \
3470         }                                                       \
3471         return do_2misc_vec(s, a, gen_##INSN);                  \
3472     }
3473 
3474 DO_2M_CRYPTO(AESE, aa32_aes, 0)
3475 DO_2M_CRYPTO(AESD, aa32_aes, 0)
3476 DO_2M_CRYPTO(AESMC, aa32_aes, 0)
3477 DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
3478 DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
3479 DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
3480 DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
3481 
3482 static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
3483 {
3484     TCGv_i32 tmp;
3485     int pass;
3486 
3487     /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
3488     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3489         return false;
3490     }
3491 
3492     /* UNDEF accesses to D16-D31 if they don't exist. */
3493     if (!dc_isar_feature(aa32_simd_r32, s) &&
3494         ((a->vd | a->vm) & 0x10)) {
3495         return false;
3496     }
3497 
3498     if (!fn) {
3499         return false;
3500     }
3501 
3502     if ((a->vd | a->vm) & a->q) {
3503         return false;
3504     }
3505 
3506     if (!vfp_access_check(s)) {
3507         return true;
3508     }
3509 
3510     tmp = tcg_temp_new_i32();
3511     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3512         read_neon_element32(tmp, a->vm, pass, MO_32);
3513         fn(tmp, tmp);
3514         write_neon_element32(tmp, a->vd, pass, MO_32);
3515     }
3516     return true;
3517 }
3518 
3519 static bool trans_VREV32(DisasContext *s, arg_2misc *a)
3520 {
3521     static NeonGenOneOpFn * const fn[] = {
3522         tcg_gen_bswap32_i32,
3523         gen_swap_half,
3524         NULL,
3525         NULL,
3526     };
3527     return do_2misc(s, a, fn[a->size]);
3528 }
3529 
3530 static bool trans_VREV16(DisasContext *s, arg_2misc *a)
3531 {
3532     if (a->size != 0) {
3533         return false;
3534     }
3535     return do_2misc(s, a, gen_rev16);
3536 }
3537 
3538 static bool trans_VCLS(DisasContext *s, arg_2misc *a)
3539 {
3540     static NeonGenOneOpFn * const fn[] = {
3541         gen_helper_neon_cls_s8,
3542         gen_helper_neon_cls_s16,
3543         gen_helper_neon_cls_s32,
3544         NULL,
3545     };
3546     return do_2misc(s, a, fn[a->size]);
3547 }
3548 
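/*
 * The third argument to tcg_gen_clzi_i32() gives the result for a
 * zero input; 32 matches the VCLZ semantics.
 */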
3549 static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
3550 {
3551     tcg_gen_clzi_i32(rd, rm, 32);
3552 }
3553 
3554 static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
3555 {
3556     static NeonGenOneOpFn * const fn[] = {
3557         gen_helper_neon_clz_u8,
3558         gen_helper_neon_clz_u16,
3559         do_VCLZ_32,
3560         NULL,
3561     };
3562     return do_2misc(s, a, fn[a->size]);
3563 }
3564 
3565 static bool trans_VCNT(DisasContext *s, arg_2misc *a)
3566 {
3567     if (a->size != 0) {
3568         return false;
3569     }
3570     return do_2misc(s, a, gen_helper_neon_cnt_u8);
3571 }
3572 
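/*
 * Float VABS and VNEG need no FP status pointer: they are pure bit
 * operations which clear or flip the sign bit of each element.
 */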
3573 static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3574                        uint32_t oprsz, uint32_t maxsz)
3575 {
3576     tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
3577                       vece == MO_16 ? 0x7fff : 0x7fffffff,
3578                       oprsz, maxsz);
3579 }
3580 
3581 static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
3582 {
3583     if (a->size == MO_16) {
3584         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3585             return false;
3586         }
3587     } else if (a->size != MO_32) {
3588         return false;
3589     }
3590     return do_2misc_vec(s, a, gen_VABS_F);
3591 }
3592 
3593 static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3594                        uint32_t oprsz, uint32_t maxsz)
3595 {
3596     tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
3597                       vece == MO_16 ? 0x8000 : 0x80000000,
3598                       oprsz, maxsz);
3599 }
3600 
3601 static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
3602 {
3603     if (a->size == MO_16) {
3604         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3605             return false;
3606         }
3607     } else if (a->size != MO_32) {
3608         return false;
3609     }
3610     return do_2misc_vec(s, a, gen_VNEG_F);
3611 }
3612 
3613 static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
3614 {
3615     if (a->size != 2) {
3616         return false;
3617     }
3618     return do_2misc(s, a, gen_helper_recpe_u32);
3619 }
3620 
3621 static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
3622 {
3623     if (a->size != 2) {
3624         return false;
3625     }
3626     return do_2misc(s, a, gen_helper_rsqrte_u32);
3627 }
3628 
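/*
 * The saturating qabs/qneg helpers take cpu_env so that they can
 * set the QC flag when they saturate; wrap them into the plain
 * NeonGenOneOpFn signature that do_2misc() expects.
 */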
3629 #define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \
3630     static void WRAPNAME(TCGv_i32 d, TCGv_i32 m)        \
3631     {                                                   \
3632         FUNC(d, cpu_env, m);                            \
3633     }
3634 
3635 WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
3636 WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
3637 WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
3638 WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
3639 WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
3640 WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)
3641 
3642 static bool trans_VQABS(DisasContext *s, arg_2misc *a)
3643 {
3644     static NeonGenOneOpFn * const fn[] = {
3645         gen_VQABS_s8,
3646         gen_VQABS_s16,
3647         gen_VQABS_s32,
3648         NULL,
3649     };
3650     return do_2misc(s, a, fn[a->size]);
3651 }
3652 
3653 static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
3654 {
3655     static NeonGenOneOpFn * const fn[] = {
3656         gen_VQNEG_s8,
3657         gen_VQNEG_s16,
3658         gen_VQNEG_s32,
3659         NULL,
3660     };
3661     return do_2misc(s, a, fn[a->size]);
3662 }
3663 
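/*
 * Expand a gvec-based fp 2-reg-misc op from a half-precision and a
 * single-precision helper. The fp16 form is gated on the fp16
 * arithmetic extension, and the fp status pointer selects the
 * matching standard FPSCR (FPST_STD_F16 for fp16, FPST_STD
 * otherwise).
 */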
3664 #define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC)                             \
3665     static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3666                            uint32_t rm_ofs,                             \
3667                            uint32_t oprsz, uint32_t maxsz)              \
3668     {                                                                   \
3669         static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3670             NULL, HFUNC, SFUNC, NULL,                                   \
3671         };                                                              \
3672         TCGv_ptr fpst;                                                  \
3673         fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);   \
3674         tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0,       \
3675                            fns[vece]);                                  \
3676     }                                                                   \
3677     static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3678     {                                                                   \
3679         if (a->size == MO_16) {                                         \
3680             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3681                 return false;                                           \
3682             }                                                           \
3683         } else if (a->size != MO_32) {                                  \
3684             return false;                                               \
3685         }                                                               \
3686         return do_2misc_vec(s, a, gen_##INSN);                          \
3687     }
3688 
3689 DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
3690 DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
3691 DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
3692 DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
3693 DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
3694 DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
3695 DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
3696 DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
3697 DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
3698 DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
3699 DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)
3700 
3701 DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)
3702 
3703 static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
3704 {
3705     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
3706         return false;
3707     }
3708     return trans_VRINTX_impl(s, a);
3709 }
3710 
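/*
 * Like DO_2MISC_FP_VEC, but for the v8 insns which take an explicit
 * rounding mode: the mode is converted with arm_rmode_to_sf() and
 * handed to the helper as the simd data value.
 */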
3711 #define DO_VEC_RMODE(INSN, RMODE, OP)                                   \
3712     static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3713                            uint32_t rm_ofs,                             \
3714                            uint32_t oprsz, uint32_t maxsz)              \
3715     {                                                                   \
3716         static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3717             NULL,                                                       \
3718             gen_helper_gvec_##OP##h,                                    \
3719             gen_helper_gvec_##OP##s,                                    \
3720             NULL,                                                       \
3721         };                                                              \
3722         TCGv_ptr fpst;                                                  \
3723         fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);   \
3724         tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz,          \
3725                            arm_rmode_to_sf(RMODE), fns[vece]);          \
3726     }                                                                   \
3727     static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3728     {                                                                   \
3729         if (!arm_dc_feature(s, ARM_FEATURE_V8)) {                       \
3730             return false;                                               \
3731         }                                                               \
3732         if (a->size == MO_16) {                                         \
3733             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3734                 return false;                                           \
3735             }                                                           \
3736         } else if (a->size != MO_32) {                                  \
3737             return false;                                               \
3738         }                                                               \
3739         return do_2misc_vec(s, a, gen_##INSN);                          \
3740     }
3741 
3742 DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
3743 DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
3744 DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
3745 DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
3746 DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
3747 DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
3748 DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
3749 DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)
3750 
3751 DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
3752 DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
3753 DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
3754 DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
3755 DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)
3756 
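/*
 * VSWP exchanges Dd/Qd with Dm/Qm. Each 64-bit pass reads both
 * registers before writing either, so vd == vm simply leaves the
 * registers unchanged.
 */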
3757 static bool trans_VSWP(DisasContext *s, arg_2misc *a)
3758 {
3759     TCGv_i64 rm, rd;
3760     int pass;
3761 
3762     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3763         return false;
3764     }
3765 
3766     /* UNDEF accesses to D16-D31 if they don't exist. */
3767     if (!dc_isar_feature(aa32_simd_r32, s) &&
3768         ((a->vd | a->vm) & 0x10)) {
3769         return false;
3770     }
3771 
3772     if (a->size != 0) {
3773         return false;
3774     }
3775 
3776     if ((a->vd | a->vm) & a->q) {
3777         return false;
3778     }
3779 
3780     if (!vfp_access_check(s)) {
3781         return true;
3782     }
3783 
3784     rm = tcg_temp_new_i64();
3785     rd = tcg_temp_new_i64();
3786     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3787         read_neon_element64(rm, a->vm, pass, MO_64);
3788         read_neon_element64(rd, a->vd, pass, MO_64);
3789         write_neon_element64(rm, a->vd, pass, MO_64);
3790         write_neon_element64(rd, a->vm, pass, MO_64);
3791     }
3792     return true;
3793 }
3794 
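/*
 * In-place transposition helpers for VTRN: the lanes of the two
 * words are treated as 2x2 matrices and transposed. trans_VTRN()
 * calls these with t0 holding the Vm element and t1 the Vd element,
 * then writes the results back swapped, which implements the
 * architectural "exchange Vd[2i+1] with Vm[2i]" behaviour.
 */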
3795 static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
3796 {
3797     TCGv_i32 rd, tmp;
3798 
3799     rd = tcg_temp_new_i32();
3800     tmp = tcg_temp_new_i32();
3801 
3802     tcg_gen_shli_i32(rd, t0, 8);
3803     tcg_gen_andi_i32(rd, rd, 0xff00ff00);
3804     tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
3805     tcg_gen_or_i32(rd, rd, tmp);
3806 
3807     tcg_gen_shri_i32(t1, t1, 8);
3808     tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
3809     tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
3810     tcg_gen_or_i32(t1, t1, tmp);
3811     tcg_gen_mov_i32(t0, rd);
3812 }
3813 
3814 static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
3815 {
3816     TCGv_i32 rd, tmp;
3817 
3818     rd = tcg_temp_new_i32();
3819     tmp = tcg_temp_new_i32();
3820 
3821     tcg_gen_shli_i32(rd, t0, 16);
3822     tcg_gen_andi_i32(tmp, t1, 0xffff);
3823     tcg_gen_or_i32(rd, rd, tmp);
3824     tcg_gen_shri_i32(t1, t1, 16);
3825     tcg_gen_andi_i32(tmp, t0, 0xffff0000);
3826     tcg_gen_or_i32(t1, t1, tmp);
3827     tcg_gen_mov_i32(t0, rd);
3828 }
3829 
3830 static bool trans_VTRN(DisasContext *s, arg_2misc *a)
3831 {
3832     TCGv_i32 tmp, tmp2;
3833     int pass;
3834 
3835     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3836         return false;
3837     }
3838 
3839     /* UNDEF accesses to D16-D31 if they don't exist. */
3840     if (!dc_isar_feature(aa32_simd_r32, s) &&
3841         ((a->vd | a->vm) & 0x10)) {
3842         return false;
3843     }
3844 
3845     if ((a->vd | a->vm) & a->q) {
3846         return false;
3847     }
3848 
3849     if (a->size == 3) {
3850         return false;
3851     }
3852 
3853     if (!vfp_access_check(s)) {
3854         return true;
3855     }
3856 
3857     tmp = tcg_temp_new_i32();
3858     tmp2 = tcg_temp_new_i32();
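    /* For 32-bit elements we can swap Vd[pass + 1] with Vm[pass] directly. */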
3859     if (a->size == MO_32) {
3860         for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
3861             read_neon_element32(tmp, a->vm, pass, MO_32);
3862             read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
3863             write_neon_element32(tmp2, a->vm, pass, MO_32);
3864             write_neon_element32(tmp, a->vd, pass + 1, MO_32);
3865         }
3866     } else {
3867         for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3868             read_neon_element32(tmp, a->vm, pass, MO_32);
3869             read_neon_element32(tmp2, a->vd, pass, MO_32);
3870             if (a->size == MO_8) {
3871                 gen_neon_trn_u8(tmp, tmp2);
3872             } else {
3873                 gen_neon_trn_u16(tmp, tmp2);
3874             }
3875             write_neon_element32(tmp2, a->vm, pass, MO_32);
3876             write_neon_element32(tmp, a->vd, pass, MO_32);
3877         }
3878     }
3879     return true;
3880 }
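/*
 * The 8-bit integer matrix multiply insns VSMMLA, VUMMLA and
 * VUSMMLA are gated on the i8mm feature and exist only in the
 * Q-register form; the bfloat16 VMMLA and VFMA variants below are
 * gated on aa32_bf16.
 */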
3881 
3882 static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a)
3883 {
3884     if (!dc_isar_feature(aa32_i8mm, s)) {
3885         return false;
3886     }
3887     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
3888                         gen_helper_gvec_smmla_b);
3889 }
3890 
3891 static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a)
3892 {
3893     if (!dc_isar_feature(aa32_i8mm, s)) {
3894         return false;
3895     }
3896     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
3897                         gen_helper_gvec_ummla_b);
3898 }
3899 
3900 static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a)
3901 {
3902     if (!dc_isar_feature(aa32_i8mm, s)) {
3903         return false;
3904     }
3905     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
3906                         gen_helper_gvec_usmmla_b);
3907 }
3908 
3909 static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a)
3910 {
3911     if (!dc_isar_feature(aa32_bf16, s)) {
3912         return false;
3913     }
3914     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
3915                         gen_helper_gvec_bfmmla);
3916 }
3917 
3918 static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a)
3919 {
3920     if (!dc_isar_feature(aa32_bf16, s)) {
3921         return false;
3922     }
3923     return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD,
3924                              gen_helper_gvec_bfmlal);
3925 }
3926 
3927 static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a)
3928 {
3929     if (!dc_isar_feature(aa32_bf16, s)) {
3930         return false;
3931     }
3932     return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm,
3933                              (a->index << 1) | a->q, FPST_STD,
3934                              gen_helper_gvec_bfmlal_idx);
3935 }
3936