xref: /openbmc/qemu/target/arm/tcg/translate-neon.c (revision 4860af2c4fc4632c180ba902758f6a7f66ed9ac1)
1 /*
2  *  ARM translation: AArch32 Neon instructions
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *  Copyright (c) 2005-2007 CodeSourcery
6  *  Copyright (c) 2007 OpenedHand, Ltd.
7  *  Copyright (c) 2020 Linaro, Ltd.
8  *
9  * This library is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * This library is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
21  */
22 
23 #include "qemu/osdep.h"
24 #include "translate.h"
25 #include "translate-a32.h"
26 
27 /* Include the generated Neon decoder */
28 #include "decode-neon-dp.c.inc"
29 #include "decode-neon-ls.c.inc"
30 #include "decode-neon-shared.c.inc"
31 
32 static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
33 {
34     TCGv_ptr ret = tcg_temp_new_ptr();
35     tcg_gen_addi_ptr(ret, tcg_env, vfp_reg_offset(dp, reg));
36     return ret;
37 }
38 
39 static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
40 {
41     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
42 
43     switch (mop) {
44     case MO_UB:
45         tcg_gen_ld8u_i32(var, tcg_env, offset);
46         break;
47     case MO_UW:
48         tcg_gen_ld16u_i32(var, tcg_env, offset);
49         break;
50     case MO_UL:
51         tcg_gen_ld_i32(var, tcg_env, offset);
52         break;
53     default:
54         g_assert_not_reached();
55     }
56 }
57 
58 static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
59 {
60     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
61 
62     switch (mop) {
63     case MO_UB:
64         tcg_gen_ld8u_i64(var, tcg_env, offset);
65         break;
66     case MO_UW:
67         tcg_gen_ld16u_i64(var, tcg_env, offset);
68         break;
69     case MO_UL:
70         tcg_gen_ld32u_i64(var, tcg_env, offset);
71         break;
72     case MO_UQ:
73         tcg_gen_ld_i64(var, tcg_env, offset);
74         break;
75     default:
76         g_assert_not_reached();
77     }
78 }
79 
80 static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
81 {
82     long offset = neon_element_offset(reg, ele, size);
83 
84     switch (size) {
85     case MO_8:
86         tcg_gen_st8_i32(var, tcg_env, offset);
87         break;
88     case MO_16:
89         tcg_gen_st16_i32(var, tcg_env, offset);
90         break;
91     case MO_32:
92         tcg_gen_st_i32(var, tcg_env, offset);
93         break;
94     default:
95         g_assert_not_reached();
96     }
97 }
98 
99 static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
100 {
101     long offset = neon_element_offset(reg, ele, size);
102 
103     switch (size) {
104     case MO_8:
105         tcg_gen_st8_i64(var, tcg_env, offset);
106         break;
107     case MO_16:
108         tcg_gen_st16_i64(var, tcg_env, offset);
109         break;
110     case MO_32:
111         tcg_gen_st32_i64(var, tcg_env, offset);
112         break;
113     case MO_64:
114         tcg_gen_st_i64(var, tcg_env, offset);
115         break;
116     default:
117         g_assert_not_reached();
118     }
119 }
120 
121 static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm,
122                          int data, gen_helper_gvec_4 *fn_gvec)
123 {
124     /* UNDEF accesses to D16-D31 if they don't exist. */
125     if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
126         return false;
127     }
128 
129     /*
130      * UNDEF accesses to odd registers for each bit of Q.
131      * Q will be 0b111 for all Q-reg instructions, otherwise
132      * when we have mixed Q- and D-reg inputs.
133      */
134     if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
135         return false;
136     }
137 
138     if (!vfp_access_check(s)) {
139         return true;
140     }
141 
142     int opr_sz = q ? 16 : 8;
143     tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd),
144                        vfp_reg_offset(1, vn),
145                        vfp_reg_offset(1, vm),
146                        vfp_reg_offset(1, vd),
147                        opr_sz, opr_sz, data, fn_gvec);
148     return true;
149 }
150 
151 static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm,
152                               int data, ARMFPStatusFlavour fp_flavour,
153                               gen_helper_gvec_4_ptr *fn_gvec_ptr)
154 {
155     /* UNDEF accesses to D16-D31 if they don't exist. */
156     if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
157         return false;
158     }
159 
160     /*
161      * UNDEF accesses to odd registers for each bit of Q.
162      * Q will be 0b111 for all Q-reg instructions, otherwise
163      * when we have mixed Q- and D-reg inputs.
164      */
165     if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
166         return false;
167     }
168 
169     if (!vfp_access_check(s)) {
170         return true;
171     }
172 
173     int opr_sz = q ? 16 : 8;
174     TCGv_ptr fpst = fpstatus_ptr(fp_flavour);
175 
176     tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd),
177                        vfp_reg_offset(1, vn),
178                        vfp_reg_offset(1, vm),
179                        vfp_reg_offset(1, vd),
180                        fpst, opr_sz, opr_sz, data, fn_gvec_ptr);
181     return true;
182 }
183 
184 static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
185 {
186     if (!dc_isar_feature(aa32_vcma, s)) {
187         return false;
188     }
189     if (a->size == MO_16) {
190         if (!dc_isar_feature(aa32_fp16_arith, s)) {
191             return false;
192         }
193         return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
194                                  FPST_STD_F16, gen_helper_gvec_fcmlah);
195     }
196     return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
197                              FPST_STD, gen_helper_gvec_fcmlas);
198 }
199 
200 static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
201 {
202     int opr_sz;
203     TCGv_ptr fpst;
204     gen_helper_gvec_3_ptr *fn_gvec_ptr;
205 
206     if (!dc_isar_feature(aa32_vcma, s)
207         || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
208         return false;
209     }
210 
211     /* UNDEF accesses to D16-D31 if they don't exist. */
212     if (!dc_isar_feature(aa32_simd_r32, s) &&
213         ((a->vd | a->vn | a->vm) & 0x10)) {
214         return false;
215     }
216 
217     if ((a->vn | a->vm | a->vd) & a->q) {
218         return false;
219     }
220 
221     if (!vfp_access_check(s)) {
222         return true;
223     }
224 
225     opr_sz = (1 + a->q) * 8;
226     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
227     fn_gvec_ptr = (a->size == MO_16) ?
228         gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
229     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
230                        vfp_reg_offset(1, a->vn),
231                        vfp_reg_offset(1, a->vm),
232                        fpst, opr_sz, opr_sz, a->rot,
233                        fn_gvec_ptr);
234     return true;
235 }
236 
237 static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a)
238 {
239     if (!dc_isar_feature(aa32_dp, s)) {
240         return false;
241     }
242     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
243                         gen_helper_gvec_sdot_b);
244 }
245 
246 static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a)
247 {
248     if (!dc_isar_feature(aa32_dp, s)) {
249         return false;
250     }
251     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
252                         gen_helper_gvec_udot_b);
253 }
254 
255 static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
256 {
257     if (!dc_isar_feature(aa32_i8mm, s)) {
258         return false;
259     }
260     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
261                         gen_helper_gvec_usdot_b);
262 }
263 
264 static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
265 {
266     if (!dc_isar_feature(aa32_bf16, s)) {
267         return false;
268     }
269     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
270                         gen_helper_gvec_bfdot);
271 }
272 
273 static bool trans_VFML(DisasContext *s, arg_VFML *a)
274 {
275     int opr_sz;
276 
277     if (!dc_isar_feature(aa32_fhm, s)) {
278         return false;
279     }
280 
281     /* UNDEF accesses to D16-D31 if they don't exist. */
282     if (!dc_isar_feature(aa32_simd_r32, s) &&
283         (a->vd & 0x10)) {
284         return false;
285     }
286 
287     if (a->vd & a->q) {
288         return false;
289     }
290 
291     if (!vfp_access_check(s)) {
292         return true;
293     }
294 
295     opr_sz = (1 + a->q) * 8;
296     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
297                        vfp_reg_offset(a->q, a->vn),
298                        vfp_reg_offset(a->q, a->vm),
299                        tcg_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
300                        gen_helper_gvec_fmlal_a32);
301     return true;
302 }
303 
304 static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
305 {
306     int data = (a->index << 2) | a->rot;
307 
308     if (!dc_isar_feature(aa32_vcma, s)) {
309         return false;
310     }
311     if (a->size == MO_16) {
312         if (!dc_isar_feature(aa32_fp16_arith, s)) {
313             return false;
314         }
315         return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
316                                  FPST_STD_F16, gen_helper_gvec_fcmlah_idx);
317     }
318     return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
319                              FPST_STD, gen_helper_gvec_fcmlas_idx);
320 }
321 
322 static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a)
323 {
324     if (!dc_isar_feature(aa32_dp, s)) {
325         return false;
326     }
327     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
328                         gen_helper_gvec_sdot_idx_b);
329 }
330 
331 static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a)
332 {
333     if (!dc_isar_feature(aa32_dp, s)) {
334         return false;
335     }
336     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
337                         gen_helper_gvec_udot_idx_b);
338 }
339 
340 static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a)
341 {
342     if (!dc_isar_feature(aa32_i8mm, s)) {
343         return false;
344     }
345     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
346                         gen_helper_gvec_usdot_idx_b);
347 }
348 
349 static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
350 {
351     if (!dc_isar_feature(aa32_i8mm, s)) {
352         return false;
353     }
354     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
355                         gen_helper_gvec_sudot_idx_b);
356 }
357 
358 static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a)
359 {
360     if (!dc_isar_feature(aa32_bf16, s)) {
361         return false;
362     }
363     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
364                         gen_helper_gvec_bfdot_idx);
365 }
366 
367 static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
368 {
369     int opr_sz;
370 
371     if (!dc_isar_feature(aa32_fhm, s)) {
372         return false;
373     }
374 
375     /* UNDEF accesses to D16-D31 if they don't exist. */
376     if (!dc_isar_feature(aa32_simd_r32, s) &&
377         ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
378         return false;
379     }
380 
381     if (a->vd & a->q) {
382         return false;
383     }
384 
385     if (!vfp_access_check(s)) {
386         return true;
387     }
388 
389     opr_sz = (1 + a->q) * 8;
390     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
391                        vfp_reg_offset(a->q, a->vn),
392                        vfp_reg_offset(a->q, a->rm),
393                        tcg_env, opr_sz, opr_sz,
394                        (a->index << 2) | a->s, /* is_2 == 0 */
395                        gen_helper_gvec_fmlal_idx_a32);
396     return true;
397 }
398 
/*
 * Register layout for each "itype" value of the Neon load/store
 * multiple structures instructions, indexed by itype (0..10).
 *
 * As consumed by trans_VLDST_multiple():
 *   nregs      - outer loop count over base registers
 *   interleave - number of registers visited per element
 *   spacing    - register number step between those registers
 */
static const struct {
    int nregs;
    int interleave;
    int spacing;
} neon_ls_element_type[11] = {
    [0]  = { .nregs = 1, .interleave = 4, .spacing = 1 },
    [1]  = { .nregs = 1, .interleave = 4, .spacing = 2 },
    [2]  = { .nregs = 4, .interleave = 1, .spacing = 1 },
    [3]  = { .nregs = 2, .interleave = 2, .spacing = 2 },
    [4]  = { .nregs = 1, .interleave = 3, .spacing = 1 },
    [5]  = { .nregs = 1, .interleave = 3, .spacing = 2 },
    [6]  = { .nregs = 3, .interleave = 1, .spacing = 1 },
    [7]  = { .nregs = 1, .interleave = 1, .spacing = 1 },
    [8]  = { .nregs = 1, .interleave = 2, .spacing = 1 },
    [9]  = { .nregs = 1, .interleave = 2, .spacing = 2 },
    [10] = { .nregs = 2, .interleave = 1, .spacing = 1 },
};
416 
417 static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
418                                       int stride)
419 {
420     if (rm != 15) {
421         TCGv_i32 base;
422 
423         base = load_reg(s, rn);
424         if (rm == 13) {
425             tcg_gen_addi_i32(base, base, stride);
426         } else {
427             TCGv_i32 index;
428             index = load_reg(s, rm);
429             tcg_gen_add_i32(base, base, index);
430         }
431         store_reg(s, rn, base);
432     }
433 }
434 
/*
 * Translate a Neon "load/store multiple structures" instruction.
 *
 * The register layout (nregs / interleave / spacing) comes from
 * neon_ls_element_type[a->itype]; a->l selects load (non-zero) or store.
 *
 * Returns false when the encoding is rejected (the UNDEF cases below),
 * and true once the instruction has been dealt with, which includes the
 * early exit taken when vfp_access_check() fails.
 */
435 static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
436 {
437     /* Neon load/store multiple structures */
438     int nregs, interleave, spacing, reg, n;
439     MemOp mop, align, endian;
440     int mmu_idx = get_mem_index(s);
441     int size = a->size;
442     TCGv_i64 tmp64;
443     TCGv_i32 addr;
444 
445     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
446         return false;
447     }
448 
449     /* UNDEF accesses to D16-D31 if they don't exist */
450     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
451         return false;
452     }
    /* neon_ls_element_type[] has 11 entries, so itype must be 0..10. */
453     if (a->itype > 10) {
454         return false;
455     }
456     /* Catch UNDEF cases for bad values of align field */
457     switch (a->itype & 0xc) {
458     case 4:
459         if (a->align >= 2) {
460             return false;
461         }
462         break;
463     case 8:
464         if (a->align == 3) {
465             return false;
466         }
467         break;
468     default:
469         break;
470     }
471     nregs = neon_ls_element_type[a->itype].nregs;
472     interleave = neon_ls_element_type[a->itype].interleave;
473     spacing = neon_ls_element_type[a->itype].spacing;
    /* 64-bit elements are only accepted with interleave == spacing == 1. */
474     if (size == 3 && (interleave | spacing) != 1) {
475         return false;
476     }
477 
478     if (!vfp_access_check(s)) {
479         return true;
480     }
481 
482     /* For our purposes, bytes are always little-endian.  */
483     endian = s->be_data;
484     if (size == 0) {
485         endian = MO_LE;
486     }
487 
488     /* Enforce alignment requested by the instruction */
489     if (a->align) {
        /*
         * NOTE(review): elsewhere in this file pow2_align() is passed the
         * log2 of a byte count, so this requests 2 ** (a->align + 2) bytes.
         */
490         align = pow2_align(a->align + 2); /* i.e. 4 << a->align bytes */
491     } else {
492         align = s->align_mem ? MO_ALIGN : 0;
493     }
494 
495     /*
496      * Consecutive little-endian elements from a single register
497      * can be promoted to a larger little-endian operation.
498      */
499     if (interleave == 1 && endian == MO_LE) {
500         /* Retain any natural alignment. */
501         if (align == MO_ALIGN) {
502             align = pow2_align(size);
503         }
        /* With size == 3 the element loop below runs once per register. */
504         size = 3;
505     }
506 
507     tmp64 = tcg_temp_new_i64();
508     addr = tcg_temp_new_i32();
509     load_reg_var(s, addr, a->rn);
510 
511     mop = endian | size | align;
    /*
     * Walk registers, then elements within a register, then the
     * interleaved registers for that element; the address advances by
     * one element (1 << size bytes) after every access.
     */
512     for (reg = 0; reg < nregs; reg++) {
513         for (n = 0; n < 8 >> size; n++) {
514             int xs;
515             for (xs = 0; xs < interleave; xs++) {
                /* D register accessed in this step. */
516                 int tt = a->vd + reg + spacing * xs;
517 
518                 if (a->l) {
519                     gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop);
520                     neon_store_element64(tt, n, size, tmp64);
521                 } else {
522                     neon_load_element64(tmp64, tt, n, size);
523                     gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop);
524                 }
525                 tcg_gen_addi_i32(addr, addr, 1 << size);
526 
527                 /* Subsequent memory operations inherit alignment */
528                 mop &= ~MO_AMASK;
529             }
530         }
531     }
532 
    /* Immediate writeback amount: nregs * interleave registers of 8 bytes. */
533     gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
534     return true;
535 }
536 
/*
 * Translate a Neon "load single structure to all lanes" instruction.
 *
 * One element is loaded from memory per destination register group and
 * replicated across the destination with tcg_gen_gvec_dup_i32().
 *
 * Returns false when the encoding is rejected, and true once the
 * instruction has been dealt with, which includes the early exit taken
 * when vfp_access_check() fails.
 */
537 static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
538 {
539     /* Neon load single structure to all lanes */
540     int reg, stride, vec_size;
541     int vd = a->vd;
542     int size = a->size;
    /* The n field encodes the structure size minus one (VLD1..VLD4). */
543     int nregs = a->n + 1;
544     TCGv_i32 addr, tmp;
545     MemOp mop, align;
546 
547     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
548         return false;
549     }
550 
551     /* UNDEF accesses to D16-D31 if they don't exist */
552     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
553         return false;
554     }
555 
    /*
     * Work out the alignment requirement from the 'a' bit, rejecting
     * the size/nregs combinations that are not accepted.
     */
556     align = 0;
557     if (size == 3) {
558         if (nregs != 4 || a->a == 0) {
559             return false;
560         }
561         /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
562         size = MO_32;
563         align = MO_ALIGN_16;
564     } else if (a->a) {
565         switch (nregs) {
566         case 1:
567             if (size == 0) {
568                 return false;
569             }
570             align = MO_ALIGN;
571             break;
572         case 2:
573             align = pow2_align(size + 1);
574             break;
575         case 3:
576             return false;
577         case 4:
578             if (size == 2) {
579                 align = pow2_align(3);
580             } else {
581                 align = pow2_align(size + 2);
582             }
583             break;
584         default:
585             g_assert_not_reached();
586         }
587     }
588 
589     if (!vfp_access_check(s)) {
590         return true;
591     }
592 
593     /*
594      * VLD1 to all lanes: T bit indicates how many Dregs to write.
595      * VLD2/3/4 to all lanes: T bit indicates register stride.
596      */
597     stride = a->t ? 2 : 1;
    /* Only the single-register form can fill 16 bytes in one go. */
598     vec_size = nregs == 1 ? stride * 8 : 8;
599     mop = size | align;
600     tmp = tcg_temp_new_i32();
601     addr = tcg_temp_new_i32();
602     load_reg_var(s, addr, a->rn);
603     for (reg = 0; reg < nregs; reg++) {
604         gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop);
605         if ((vd & 1) && vec_size == 16) {
606             /*
607              * We cannot write 16 bytes at once because the
608              * destination is unaligned.
609              */
610             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
611                                  8, 8, tmp);
612             tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
613                              neon_full_reg_offset(vd), 8, 8);
614         } else {
615             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
616                                  vec_size, vec_size, tmp);
617         }
        /* Advance to the next element in memory and the next register. */
618         tcg_gen_addi_i32(addr, addr, 1 << size);
619         vd += stride;
620 
621         /* Subsequent memory operations inherit alignment */
622         mop &= ~MO_AMASK;
623     }
624 
    /* Immediate writeback amount: one element per structure member. */
625     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);
626 
627     return true;
628 }
629 
/*
 * Translate a Neon "load/store single structure to one lane"
 * instruction.
 *
 * For each of the nregs registers (stepping by a->stride), one element
 * at lane a->reg_idx is loaded from or stored to memory; a->l selects
 * load (non-zero) or store.
 *
 * Returns false when the encoding is rejected, and true once the
 * instruction has been dealt with, which includes the early exit taken
 * when vfp_access_check() fails.
 */
630 static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
631 {
632     /* Neon load/store single structure to one lane */
633     int reg;
    /* The n field encodes the structure size minus one (1..4 registers). */
634     int nregs = a->n + 1;
635     int vd = a->vd;
636     TCGv_i32 addr, tmp;
637     MemOp mop;
638 
639     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
640         return false;
641     }
642 
643     /* UNDEF accesses to D16-D31 if they don't exist */
644     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
645         return false;
646     }
647 
648     /* Catch the UNDEF cases. This is unavoidably a bit messy. */
649     switch (nregs) {
650     case 1:
651         if (a->stride != 1) {
652             return false;
653         }
654         if (((a->align & (1 << a->size)) != 0) ||
655             (a->size == 2 && (a->align == 1 || a->align == 2))) {
656             return false;
657         }
658         break;
659     case 2:
660         if (a->size == 2 && (a->align & 2) != 0) {
661             return false;
662         }
663         break;
664     case 3:
665         if (a->align != 0) {
666             return false;
667         }
668         break;
669     case 4:
670         if (a->size == 2 && a->align == 3) {
671             return false;
672         }
673         break;
674     default:
675         g_assert_not_reached();
676     }
    /* Highest register number this instruction would touch. */
677     if ((vd + a->stride * (nregs - 1)) > 31) {
678         /*
679          * Attempts to write off the end of the register file are
680          * UNPREDICTABLE; we choose to UNDEF because otherwise we would
681          * access off the end of the array that holds the register data.
682          */
683         return false;
684     }
685 
686     if (!vfp_access_check(s)) {
687         return true;
688     }
689 
690     /* Pick up SCTLR settings */
691     mop = finalize_memop(s, a->size);
692 
    /* A non-zero align field overrides the alignment chosen above. */
693     if (a->align) {
694         MemOp align_op;
695 
696         switch (nregs) {
697         case 1:
698             /* For VLD1, use natural alignment. */
699             align_op = MO_ALIGN;
700             break;
701         case 2:
702             /* For VLD2, use double alignment. */
703             align_op = pow2_align(a->size + 1);
704             break;
705         case 4:
706             if (a->size == MO_32) {
707                 /*
708                  * For VLD4.32, align = 1 is double alignment, align = 2 is
709                  * quad alignment; align = 3 is rejected above.
710                  */
711                 align_op = pow2_align(a->size + a->align);
712             } else {
713                 /* For VLD4.8 and VLD4.16, we want quad alignment. */
714                 align_op = pow2_align(a->size + 2);
715             }
716             break;
717         default:
718             /* For VLD3, the alignment field is zero and rejected above. */
719             g_assert_not_reached();
720         }
721 
722         mop = (mop & ~MO_AMASK) | align_op;
723     }
724 
725     tmp = tcg_temp_new_i32();
726     addr = tcg_temp_new_i32();
727     load_reg_var(s, addr, a->rn);
728 
    /* One element per register; the address advances by one element. */
729     for (reg = 0; reg < nregs; reg++) {
730         if (a->l) {
731             gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
732             neon_store_element(vd, a->reg_idx, a->size, tmp);
733         } else { /* Store */
734             neon_load_element(tmp, vd, a->reg_idx, a->size);
735             gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
736         }
737         vd += a->stride;
738         tcg_gen_addi_i32(addr, addr, 1 << a->size);
739 
740         /* Subsequent memory operations inherit alignment */
741         mop &= ~MO_AMASK;
742     }
743 
    /* Immediate writeback amount: one element per structure member. */
744     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);
745 
746     return true;
747 }
748 
749 static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
750 {
751     int vec_size = a->q ? 16 : 8;
752     int rd_ofs = neon_full_reg_offset(a->vd);
753     int rn_ofs = neon_full_reg_offset(a->vn);
754     int rm_ofs = neon_full_reg_offset(a->vm);
755 
756     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
757         return false;
758     }
759 
760     /* UNDEF accesses to D16-D31 if they don't exist. */
761     if (!dc_isar_feature(aa32_simd_r32, s) &&
762         ((a->vd | a->vn | a->vm) & 0x10)) {
763         return false;
764     }
765 
766     if ((a->vn | a->vm | a->vd) & a->q) {
767         return false;
768     }
769 
770     if (!vfp_access_check(s)) {
771         return true;
772     }
773 
774     fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
775     return true;
776 }
777 
778 #define DO_3SAME(INSN, FUNC)                                            \
779     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
780     {                                                                   \
781         return do_3same(s, a, FUNC);                                    \
782     }
783 
784 DO_3SAME(VADD, tcg_gen_gvec_add)
785 DO_3SAME(VSUB, tcg_gen_gvec_sub)
786 DO_3SAME(VAND, tcg_gen_gvec_and)
787 DO_3SAME(VBIC, tcg_gen_gvec_andc)
788 DO_3SAME(VORR, tcg_gen_gvec_or)
789 DO_3SAME(VORN, tcg_gen_gvec_orc)
790 DO_3SAME(VEOR, tcg_gen_gvec_xor)
791 DO_3SAME(VSHL_S, gen_gvec_sshl)
792 DO_3SAME(VSHL_U, gen_gvec_ushl)
793 DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
794 DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
795 DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
796 DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
797 DO_3SAME(VRSHL_S, gen_gvec_srshl)
798 DO_3SAME(VRSHL_U, gen_gvec_urshl)
799 DO_3SAME(VQSHL_S, gen_neon_sqshl)
800 DO_3SAME(VQSHL_U, gen_neon_uqshl)
801 DO_3SAME(VQRSHL_S, gen_neon_sqrshl)
802 DO_3SAME(VQRSHL_U, gen_neon_uqrshl)
803 
804 /* These insns are all gvec_bitsel but with the inputs in various orders. */
805 #define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
806     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
807                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
808                                 uint32_t oprsz, uint32_t maxsz)         \
809     {                                                                   \
810         tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
811     }                                                                   \
812     DO_3SAME(INSN, gen_##INSN##_3s)
813 
814 DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
815 DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
816 DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
817 
818 #define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
819     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
820     {                                                                   \
821         if (a->size == 3) {                                             \
822             return false;                                               \
823         }                                                               \
824         return do_3same(s, a, FUNC);                                    \
825     }
826 
827 DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
828 DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
829 DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
830 DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
831 DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
832 DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
833 DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
834 DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
835 DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
836 DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
837 DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
838 DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)
839 DO_3SAME_NO_SZ_3(VPADD, gen_gvec_addp)
840 DO_3SAME_NO_SZ_3(VPMAX_S, gen_gvec_smaxp)
841 DO_3SAME_NO_SZ_3(VPMIN_S, gen_gvec_sminp)
842 DO_3SAME_NO_SZ_3(VPMAX_U, gen_gvec_umaxp)
843 DO_3SAME_NO_SZ_3(VPMIN_U, gen_gvec_uminp)
844 DO_3SAME_NO_SZ_3(VHADD_S, gen_gvec_shadd)
845 DO_3SAME_NO_SZ_3(VHADD_U, gen_gvec_uhadd)
846 DO_3SAME_NO_SZ_3(VHSUB_S, gen_gvec_shsub)
847 DO_3SAME_NO_SZ_3(VHSUB_U, gen_gvec_uhsub)
848 DO_3SAME_NO_SZ_3(VRHADD_S, gen_gvec_srhadd)
849 DO_3SAME_NO_SZ_3(VRHADD_U, gen_gvec_urhadd)
850 
851 #define DO_3SAME_CMP(INSN, COND)                                        \
852     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
853                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
854                                 uint32_t oprsz, uint32_t maxsz)         \
855     {                                                                   \
856         tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
857     }                                                                   \
858     DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)
859 
860 DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
861 DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
862 DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
863 DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
864 DO_3SAME_CMP(VCEQ, TCG_COND_EQ)
865 
/*
 * Define WRAPNAME() with the six-argument expander signature used by
 * do_3same(), forwarding to the out-of-line helper FUNC through
 * tcg_gen_gvec_3_ool().  The element size argument is ignored and the
 * helper data word is always 0.
 */
#define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
    static void WRAPNAME(unsigned vece, uint32_t dofs, uint32_t nofs,      \
                         uint32_t mofs, uint32_t oprsz, uint32_t maxsz)    \
    {                                                                      \
        tcg_gen_gvec_3_ool(dofs, nofs, mofs,                               \
                           oprsz, maxsz, 0, FUNC);                         \
    }
872 
873 WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)
874 
875 static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
876 {
877     if (a->size != 0) {
878         return false;
879     }
880     return do_3same(s, a, gen_VMUL_p_3s);
881 }
882 
883 #define DO_VQRDMLAH(INSN, FUNC)                                         \
884     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
885     {                                                                   \
886         if (!dc_isar_feature(aa32_rdm, s)) {                            \
887             return false;                                               \
888         }                                                               \
889         if (a->size != 1 && a->size != 2) {                             \
890             return false;                                               \
891         }                                                               \
892         return do_3same(s, a, FUNC);                                    \
893     }
894 
895 DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
896 DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)
897 
898 #define DO_SHA1(NAME, FUNC)                                             \
899     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
900     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
901     {                                                                   \
902         if (!dc_isar_feature(aa32_sha1, s)) {                           \
903             return false;                                               \
904         }                                                               \
905         return do_3same(s, a, gen_##NAME##_3s);                         \
906     }
907 
908 DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
909 DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
910 DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
911 DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)
912 
913 #define DO_SHA2(NAME, FUNC)                                             \
914     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
915     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
916     {                                                                   \
917         if (!dc_isar_feature(aa32_sha2, s)) {                           \
918             return false;                                               \
919         }                                                               \
920         return do_3same(s, a, gen_##NAME##_3s);                         \
921     }
922 
923 DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
924 DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
925 DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)
926 
/*
 * Adapter for helpers that take tcg_env as an extra argument.
 *
 * gvec APIs such as tcg_gen_gvec_3() expect a NeonGenTwoOpFn-style
 * callback (destination plus two sources).  WRAP_ENV_FN() defines
 * such a callback, WRAPNAME, which forwards to the
 * NeonGenTwoOpEnvFn-style FUNC and supplies tcg_env itself.
 */
#define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
    static void WRAPNAME(TCGv_i32 rd, TCGv_i32 rn, TCGv_i32 rm)         \
    {                                                                   \
        FUNC(rd, tcg_env, rn, rm);                                      \
    }
938 
939 #define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
940     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
941     { return a->size >= 1 && a->size <= 2 && do_3same(s, a, FUNC); }
942 
943 DO_3SAME_VQDMULH(VQDMULH, gen_gvec_sqdmulh_qc)
944 DO_3SAME_VQDMULH(VQRDMULH, gen_gvec_sqrdmulh_qc)
945 
/*
 * Define WRAPNAME as a three-operand gvec expander that invokes the
 * out-of-line helper FUNC through tcg_gen_gvec_3_ptr(), passing the
 * float status pointer selected by FPST and a data value of 0.
 * The vece argument is accepted for prototype compatibility only.
 */
#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rn_ofs, uint32_t rm_ofs,              \
                         uint32_t oprsz, uint32_t maxsz)                \
    {                                                                   \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus_ptr(FPST),  \
                           oprsz, maxsz, 0, FUNC);                      \
    }
955 
956 #define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
957     WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
958     WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
959     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
960     {                                                                   \
961         if (a->size == MO_16) {                                         \
962             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
963                 return false;                                           \
964             }                                                           \
965             return do_3same(s, a, gen_##INSN##_fp16_3s);                \
966         }                                                               \
967         return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
968     }
969 
970 
971 DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
972 DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
973 DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
974 DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
975 DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
976 DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
977 DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
978 DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
979 DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
980 DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
981 DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
982 DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
983 DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
984 DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
985 DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
986 DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
987 DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)
988 DO_3S_FP_GVEC(VPADD, gen_helper_gvec_faddp_s, gen_helper_gvec_faddp_h)
989 DO_3S_FP_GVEC(VPMAX, gen_helper_gvec_fmaxp_s, gen_helper_gvec_fmaxp_h)
990 DO_3S_FP_GVEC(VPMIN, gen_helper_gvec_fminp_s, gen_helper_gvec_fminp_h)
991 
992 WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
993 WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
994 WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
995 WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)
996 
997 static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
998 {
999     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1000         return false;
1001     }
1002 
1003     if (a->size == MO_16) {
1004         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1005             return false;
1006         }
1007         return do_3same(s, a, gen_VMAXNM_fp16_3s);
1008     }
1009     return do_3same(s, a, gen_VMAXNM_fp32_3s);
1010 }
1011 
1012 static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
1013 {
1014     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1015         return false;
1016     }
1017 
1018     if (a->size == MO_16) {
1019         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1020             return false;
1021         }
1022         return do_3same(s, a, gen_VMINNM_fp16_3s);
1023     }
1024     return do_3same(s, a, gen_VMINNM_fp32_3s);
1025 }
1026 
1027 static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
1028 {
1029     /* Handle a 2-reg-shift insn which can be vectorized. */
1030     int vec_size = a->q ? 16 : 8;
1031     int rd_ofs = neon_full_reg_offset(a->vd);
1032     int rm_ofs = neon_full_reg_offset(a->vm);
1033 
1034     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1035         return false;
1036     }
1037 
1038     /* UNDEF accesses to D16-D31 if they don't exist. */
1039     if (!dc_isar_feature(aa32_simd_r32, s) &&
1040         ((a->vd | a->vm) & 0x10)) {
1041         return false;
1042     }
1043 
1044     if ((a->vm | a->vd) & a->q) {
1045         return false;
1046     }
1047 
1048     if (!vfp_access_check(s)) {
1049         return true;
1050     }
1051 
1052     fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
1053     return true;
1054 }
1055 
1056 #define DO_2SH(INSN, FUNC)                                              \
1057     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1058     {                                                                   \
1059         return do_vector_2sh(s, a, FUNC);                               \
1060     }                                                                   \
1061 
1062 DO_2SH(VSHL, tcg_gen_gvec_shli)
1063 DO_2SH(VSLI, gen_gvec_sli)
1064 DO_2SH(VSRI, gen_gvec_sri)
1065 DO_2SH(VSRA_S, gen_gvec_ssra)
1066 DO_2SH(VSRA_U, gen_gvec_usra)
1067 DO_2SH(VRSHR_S, gen_gvec_srshr)
1068 DO_2SH(VRSHR_U, gen_gvec_urshr)
1069 DO_2SH(VRSRA_S, gen_gvec_srsra)
1070 DO_2SH(VRSRA_U, gen_gvec_ursra)
1071 
1072 static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
1073 {
1074     /* Signed shift out of range results in all-sign-bits */
1075     a->shift = MIN(a->shift, (8 << a->size) - 1);
1076     return do_vector_2sh(s, a, tcg_gen_gvec_sari);
1077 }
1078 
1079 static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
1080                             int64_t shift, uint32_t oprsz, uint32_t maxsz)
1081 {
1082     tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
1083 }
1084 
1085 static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
1086 {
1087     /* Shift out of range is architecturally valid and results in zero. */
1088     if (a->shift >= (8 << a->size)) {
1089         return do_vector_2sh(s, a, gen_zero_rd_2sh);
1090     } else {
1091         return do_vector_2sh(s, a, tcg_gen_gvec_shri);
1092     }
1093 }
1094 
1095 static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
1096                              NeonGenTwo64OpEnvFn *fn)
1097 {
1098     /*
1099      * 2-reg-and-shift operations, size == 3 case, where the
1100      * function needs to be passed tcg_env.
1101      */
1102     TCGv_i64 constimm;
1103     int pass;
1104 
1105     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1106         return false;
1107     }
1108 
1109     /* UNDEF accesses to D16-D31 if they don't exist. */
1110     if (!dc_isar_feature(aa32_simd_r32, s) &&
1111         ((a->vd | a->vm) & 0x10)) {
1112         return false;
1113     }
1114 
1115     if ((a->vm | a->vd) & a->q) {
1116         return false;
1117     }
1118 
1119     if (!vfp_access_check(s)) {
1120         return true;
1121     }
1122 
1123     /*
1124      * To avoid excessive duplication of ops we implement shift
1125      * by immediate using the variable shift operations.
1126      */
1127     constimm = tcg_constant_i64(dup_const(a->size, a->shift));
1128 
1129     for (pass = 0; pass < a->q + 1; pass++) {
1130         TCGv_i64 tmp = tcg_temp_new_i64();
1131 
1132         read_neon_element64(tmp, a->vm, pass, MO_64);
1133         fn(tmp, tcg_env, tmp, constimm);
1134         write_neon_element64(tmp, a->vd, pass, MO_64);
1135     }
1136     return true;
1137 }
1138 
1139 static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
1140                              NeonGenTwoOpEnvFn *fn)
1141 {
1142     /*
1143      * 2-reg-and-shift operations, size < 3 case, where the
1144      * helper needs to be passed tcg_env.
1145      */
1146     TCGv_i32 constimm, tmp;
1147     int pass;
1148 
1149     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1150         return false;
1151     }
1152 
1153     /* UNDEF accesses to D16-D31 if they don't exist. */
1154     if (!dc_isar_feature(aa32_simd_r32, s) &&
1155         ((a->vd | a->vm) & 0x10)) {
1156         return false;
1157     }
1158 
1159     if ((a->vm | a->vd) & a->q) {
1160         return false;
1161     }
1162 
1163     if (!vfp_access_check(s)) {
1164         return true;
1165     }
1166 
1167     /*
1168      * To avoid excessive duplication of ops we implement shift
1169      * by immediate using the variable shift operations.
1170      */
1171     constimm = tcg_constant_i32(dup_const(a->size, a->shift));
1172     tmp = tcg_temp_new_i32();
1173 
1174     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1175         read_neon_element32(tmp, a->vm, pass, MO_32);
1176         fn(tmp, tcg_env, tmp, constimm);
1177         write_neon_element32(tmp, a->vd, pass, MO_32);
1178     }
1179     return true;
1180 }
1181 
1182 #define DO_2SHIFT_ENV(INSN, FUNC)                                       \
1183     static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
1184     {                                                                   \
1185         return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
1186     }                                                                   \
1187     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1188     {                                                                   \
1189         static NeonGenTwoOpEnvFn * const fns[] = {                      \
1190             gen_helper_neon_##FUNC##8,                                  \
1191             gen_helper_neon_##FUNC##16,                                 \
1192             gen_helper_neon_##FUNC##32,                                 \
1193         };                                                              \
1194         assert(a->size < ARRAY_SIZE(fns));                              \
1195         return do_2shift_env_32(s, a, fns[a->size]);                    \
1196     }
1197 
1198 DO_2SHIFT_ENV(VQSHLU, qshlu_s)
1199 DO_2SHIFT_ENV(VQSHL_U, qshl_u)
1200 DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1201 
1202 static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1203                                 NeonGenTwo64OpFn *shiftfn,
1204                                 NeonGenNarrowEnvFn *narrowfn)
1205 {
1206     /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1207     TCGv_i64 constimm, rm1, rm2;
1208     TCGv_i32 rd;
1209 
1210     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1211         return false;
1212     }
1213 
1214     /* UNDEF accesses to D16-D31 if they don't exist. */
1215     if (!dc_isar_feature(aa32_simd_r32, s) &&
1216         ((a->vd | a->vm) & 0x10)) {
1217         return false;
1218     }
1219 
1220     if (a->vm & 1) {
1221         return false;
1222     }
1223 
1224     if (!vfp_access_check(s)) {
1225         return true;
1226     }
1227 
1228     /*
1229      * This is always a right shift, and the shiftfn is always a
1230      * left-shift helper, which thus needs the negated shift count.
1231      */
1232     constimm = tcg_constant_i64(-a->shift);
1233     rm1 = tcg_temp_new_i64();
1234     rm2 = tcg_temp_new_i64();
1235     rd = tcg_temp_new_i32();
1236 
1237     /* Load both inputs first to avoid potential overwrite if rm == rd */
1238     read_neon_element64(rm1, a->vm, 0, MO_64);
1239     read_neon_element64(rm2, a->vm, 1, MO_64);
1240 
1241     shiftfn(rm1, rm1, constimm);
1242     narrowfn(rd, tcg_env, rm1);
1243     write_neon_element32(rd, a->vd, 0, MO_32);
1244 
1245     shiftfn(rm2, rm2, constimm);
1246     narrowfn(rd, tcg_env, rm2);
1247     write_neon_element32(rd, a->vd, 1, MO_32);
1248 
1249     return true;
1250 }
1251 
1252 static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1253                                 NeonGenTwoOpFn *shiftfn,
1254                                 NeonGenNarrowEnvFn *narrowfn)
1255 {
1256     /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1257     TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1258     TCGv_i64 rtmp;
1259     uint32_t imm;
1260 
1261     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1262         return false;
1263     }
1264 
1265     /* UNDEF accesses to D16-D31 if they don't exist. */
1266     if (!dc_isar_feature(aa32_simd_r32, s) &&
1267         ((a->vd | a->vm) & 0x10)) {
1268         return false;
1269     }
1270 
1271     if (a->vm & 1) {
1272         return false;
1273     }
1274 
1275     if (!vfp_access_check(s)) {
1276         return true;
1277     }
1278 
1279     /*
1280      * This is always a right shift, and the shiftfn is always a
1281      * left-shift helper, which thus needs the negated shift count
1282      * duplicated into each lane of the immediate value.
1283      */
1284     if (a->size == 1) {
1285         imm = (uint16_t)(-a->shift);
1286         imm |= imm << 16;
1287     } else {
1288         /* size == 2 */
1289         imm = -a->shift;
1290     }
1291     constimm = tcg_constant_i32(imm);
1292 
1293     /* Load all inputs first to avoid potential overwrite */
1294     rm1 = tcg_temp_new_i32();
1295     rm2 = tcg_temp_new_i32();
1296     rm3 = tcg_temp_new_i32();
1297     rm4 = tcg_temp_new_i32();
1298     read_neon_element32(rm1, a->vm, 0, MO_32);
1299     read_neon_element32(rm2, a->vm, 1, MO_32);
1300     read_neon_element32(rm3, a->vm, 2, MO_32);
1301     read_neon_element32(rm4, a->vm, 3, MO_32);
1302     rtmp = tcg_temp_new_i64();
1303 
1304     shiftfn(rm1, rm1, constimm);
1305     shiftfn(rm2, rm2, constimm);
1306 
1307     tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1308 
1309     narrowfn(rm1, tcg_env, rtmp);
1310     write_neon_element32(rm1, a->vd, 0, MO_32);
1311 
1312     shiftfn(rm3, rm3, constimm);
1313     shiftfn(rm4, rm4, constimm);
1314 
1315     tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1316 
1317     narrowfn(rm3, tcg_env, rtmp);
1318     write_neon_element32(rm3, a->vd, 1, MO_32);
1319     return true;
1320 }
1321 
/*
 * Define trans_<INSN>_2sh for a narrowing shift. FUNC is the shift
 * generator and NARROWFUNC the narrowing generator; the _64 flavour
 * dispatches to do_2shift_narrow_64() and the _32 flavour to
 * do_2shift_narrow_32().
 */
#define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s,                     \
                                   arg_2reg_shift *a)                   \
    {                                                                   \
        return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
    }

#define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s,                     \
                                   arg_2reg_shift *a)                   \
    {                                                                   \
        return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
    }
1332 
1333 static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1334 {
1335     tcg_gen_extrl_i64_i32(dest, src);
1336 }
1337 
1338 static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1339 {
1340     gen_helper_neon_narrow_u16(dest, src);
1341 }
1342 
1343 static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1344 {
1345     gen_helper_neon_narrow_u8(dest, src);
1346 }
1347 
1348 DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
1349 DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
1350 DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)
1351 
1352 DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
1353 DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
1354 DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)
1355 
1356 DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
1357 DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
1358 DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)
1359 
1360 DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
1361 DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
1362 DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
1363 DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
1364 DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
1365 DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)
1366 
1367 DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
1368 DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
1369 DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)
1370 
1371 DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
1372 DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
1373 DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
1374 
1375 DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
1376 DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
1377 DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1378 
1379 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1380                          NeonGenWidenFn *widenfn, bool u)
1381 {
1382     TCGv_i64 tmp;
1383     TCGv_i32 rm0, rm1;
1384     uint64_t widen_mask = 0;
1385 
1386     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1387         return false;
1388     }
1389 
1390     /* UNDEF accesses to D16-D31 if they don't exist. */
1391     if (!dc_isar_feature(aa32_simd_r32, s) &&
1392         ((a->vd | a->vm) & 0x10)) {
1393         return false;
1394     }
1395 
1396     if (a->vd & 1) {
1397         return false;
1398     }
1399 
1400     if (!vfp_access_check(s)) {
1401         return true;
1402     }
1403 
1404     /*
1405      * This is a widen-and-shift operation. The shift is always less
1406      * than the width of the source type, so after widening the input
1407      * vector we can simply shift the whole 64-bit widened register,
1408      * and then clear the potential overflow bits resulting from left
1409      * bits of the narrow input appearing as right bits of the left
1410      * neighbour narrow input. Calculate a mask of bits to clear.
1411      */
1412     if ((a->shift != 0) && (a->size < 2 || u)) {
1413         int esize = 8 << a->size;
1414         widen_mask = MAKE_64BIT_MASK(0, esize);
1415         widen_mask >>= esize - a->shift;
1416         widen_mask = dup_const(a->size + 1, widen_mask);
1417     }
1418 
1419     rm0 = tcg_temp_new_i32();
1420     rm1 = tcg_temp_new_i32();
1421     read_neon_element32(rm0, a->vm, 0, MO_32);
1422     read_neon_element32(rm1, a->vm, 1, MO_32);
1423     tmp = tcg_temp_new_i64();
1424 
1425     widenfn(tmp, rm0);
1426     if (a->shift != 0) {
1427         tcg_gen_shli_i64(tmp, tmp, a->shift);
1428         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1429     }
1430     write_neon_element64(tmp, a->vd, 0, MO_64);
1431 
1432     widenfn(tmp, rm1);
1433     if (a->shift != 0) {
1434         tcg_gen_shli_i64(tmp, tmp, a->shift);
1435         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1436     }
1437     write_neon_element64(tmp, a->vd, 1, MO_64);
1438     return true;
1439 }
1440 
1441 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1442 {
1443     static NeonGenWidenFn * const widenfn[] = {
1444         gen_helper_neon_widen_s8,
1445         gen_helper_neon_widen_s16,
1446         tcg_gen_ext_i32_i64,
1447     };
1448     return do_vshll_2sh(s, a, widenfn[a->size], false);
1449 }
1450 
1451 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1452 {
1453     static NeonGenWidenFn * const widenfn[] = {
1454         gen_helper_neon_widen_u8,
1455         gen_helper_neon_widen_u16,
1456         tcg_gen_extu_i32_i64,
1457     };
1458     return do_vshll_2sh(s, a, widenfn[a->size], true);
1459 }
1460 
1461 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1462                       gen_helper_gvec_2_ptr *fn)
1463 {
1464     /* FP operations in 2-reg-and-shift group */
1465     int vec_size = a->q ? 16 : 8;
1466     int rd_ofs = neon_full_reg_offset(a->vd);
1467     int rm_ofs = neon_full_reg_offset(a->vm);
1468     TCGv_ptr fpst;
1469 
1470     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1471         return false;
1472     }
1473 
1474     if (a->size == MO_16) {
1475         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1476             return false;
1477         }
1478     }
1479 
1480     /* UNDEF accesses to D16-D31 if they don't exist. */
1481     if (!dc_isar_feature(aa32_simd_r32, s) &&
1482         ((a->vd | a->vm) & 0x10)) {
1483         return false;
1484     }
1485 
1486     if ((a->vm | a->vd) & a->q) {
1487         return false;
1488     }
1489 
1490     if (!vfp_access_check(s)) {
1491         return true;
1492     }
1493 
1494     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1495     tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
1496     return true;
1497 }
1498 
1499 #define DO_FP_2SH(INSN, FUNC)                                           \
1500     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1501     {                                                                   \
1502         return do_fp_2sh(s, a, FUNC);                                   \
1503     }
1504 
1505 DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
1506 DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
1507 DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
1508 DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
1509 
1510 DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
1511 DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
1512 DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
1513 DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1514 
1515 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1516                         GVecGen2iFn *fn)
1517 {
1518     uint64_t imm;
1519     int reg_ofs, vec_size;
1520 
1521     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1522         return false;
1523     }
1524 
1525     /* UNDEF accesses to D16-D31 if they don't exist. */
1526     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1527         return false;
1528     }
1529 
1530     if (a->vd & a->q) {
1531         return false;
1532     }
1533 
1534     if (!vfp_access_check(s)) {
1535         return true;
1536     }
1537 
1538     reg_ofs = neon_full_reg_offset(a->vd);
1539     vec_size = a->q ? 16 : 8;
1540     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1541 
1542     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1543     return true;
1544 }
1545 
1546 static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1547                         int64_t c, uint32_t oprsz, uint32_t maxsz)
1548 {
1549     tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1550 }
1551 
1552 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1553 {
1554     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1555     GVecGen2iFn *fn;
1556 
1557     if ((a->cmode & 1) && a->cmode < 12) {
1558         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1559         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1560     } else {
1561         /* There is one unallocated cmode/op combination in this space */
1562         if (a->cmode == 15 && a->op == 1) {
1563             return false;
1564         }
1565         fn = gen_VMOV_1r;
1566     }
1567     return do_1reg_imm(s, a, fn);
1568 }
1569 
1570 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1571                            NeonGenWidenFn *widenfn,
1572                            NeonGenTwo64OpFn *opfn,
1573                            int src1_mop, int src2_mop)
1574 {
1575     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VAADW/VSUBW) */
1576     TCGv_i64 rn0_64, rn1_64, rm_64;
1577 
1578     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1579         return false;
1580     }
1581 
1582     /* UNDEF accesses to D16-D31 if they don't exist. */
1583     if (!dc_isar_feature(aa32_simd_r32, s) &&
1584         ((a->vd | a->vn | a->vm) & 0x10)) {
1585         return false;
1586     }
1587 
1588     if (!opfn) {
1589         /* size == 3 case, which is an entirely different insn group */
1590         return false;
1591     }
1592 
1593     if ((a->vd & 1) || (src1_mop == MO_UQ && (a->vn & 1))) {
1594         return false;
1595     }
1596 
1597     if (!vfp_access_check(s)) {
1598         return true;
1599     }
1600 
1601     rn0_64 = tcg_temp_new_i64();
1602     rn1_64 = tcg_temp_new_i64();
1603     rm_64 = tcg_temp_new_i64();
1604 
1605     if (src1_mop >= 0) {
1606         read_neon_element64(rn0_64, a->vn, 0, src1_mop);
1607     } else {
1608         TCGv_i32 tmp = tcg_temp_new_i32();
1609         read_neon_element32(tmp, a->vn, 0, MO_32);
1610         widenfn(rn0_64, tmp);
1611     }
1612     if (src2_mop >= 0) {
1613         read_neon_element64(rm_64, a->vm, 0, src2_mop);
1614     } else {
1615         TCGv_i32 tmp = tcg_temp_new_i32();
1616         read_neon_element32(tmp, a->vm, 0, MO_32);
1617         widenfn(rm_64, tmp);
1618     }
1619 
1620     opfn(rn0_64, rn0_64, rm_64);
1621 
1622     /*
1623      * Load second pass inputs before storing the first pass result, to
1624      * avoid incorrect results if a narrow input overlaps with the result.
1625      */
1626     if (src1_mop >= 0) {
1627         read_neon_element64(rn1_64, a->vn, 1, src1_mop);
1628     } else {
1629         TCGv_i32 tmp = tcg_temp_new_i32();
1630         read_neon_element32(tmp, a->vn, 1, MO_32);
1631         widenfn(rn1_64, tmp);
1632     }
1633     if (src2_mop >= 0) {
1634         read_neon_element64(rm_64, a->vm, 1, src2_mop);
1635     } else {
1636         TCGv_i32 tmp = tcg_temp_new_i32();
1637         read_neon_element32(tmp, a->vm, 1, MO_32);
1638         widenfn(rm_64, tmp);
1639     }
1640 
1641     write_neon_element64(rn0_64, a->vd, 0, MO_64);
1642 
1643     opfn(rn1_64, rn1_64, rm_64);
1644     write_neon_element64(rn1_64, a->vd, 1, MO_64);
1645 
1646     return true;
1647 }
1648 
1649 #define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN)                        \
1650     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1651     {                                                                   \
1652         static NeonGenWidenFn * const widenfn[] = {                     \
1653             gen_helper_neon_widen_##S##8,                               \
1654             gen_helper_neon_widen_##S##16,                              \
1655             NULL, NULL,                                                 \
1656         };                                                              \
1657         static NeonGenTwo64OpFn * const addfn[] = {                     \
1658             gen_helper_neon_##OP##l_u16,                                \
1659             gen_helper_neon_##OP##l_u32,                                \
1660             tcg_gen_##OP##_i64,                                         \
1661             NULL,                                                       \
1662         };                                                              \
1663         int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1;          \
1664         return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size],   \
1665                               SRC1WIDE ? MO_UQ : narrow_mop,             \
1666                               narrow_mop);                              \
1667     }
1668 
1669 DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
1670 DO_PREWIDEN(VADDL_U, u, add, false, 0)
1671 DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
1672 DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
1673 DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
1674 DO_PREWIDEN(VADDW_U, u, add, true, 0)
1675 DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
1676 DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
1677 
1678 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
1679                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
1680 {
1681     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
1682     TCGv_i64 rn_64, rm_64;
1683     TCGv_i32 rd0, rd1;
1684 
1685     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1686         return false;
1687     }
1688 
1689     /* UNDEF accesses to D16-D31 if they don't exist. */
1690     if (!dc_isar_feature(aa32_simd_r32, s) &&
1691         ((a->vd | a->vn | a->vm) & 0x10)) {
1692         return false;
1693     }
1694 
1695     if (!opfn || !narrowfn) {
1696         /* size == 3 case, which is an entirely different insn group */
1697         return false;
1698     }
1699 
1700     if ((a->vn | a->vm) & 1) {
1701         return false;
1702     }
1703 
1704     if (!vfp_access_check(s)) {
1705         return true;
1706     }
1707 
1708     rn_64 = tcg_temp_new_i64();
1709     rm_64 = tcg_temp_new_i64();
1710     rd0 = tcg_temp_new_i32();
1711     rd1 = tcg_temp_new_i32();
1712 
1713     read_neon_element64(rn_64, a->vn, 0, MO_64);
1714     read_neon_element64(rm_64, a->vm, 0, MO_64);
1715 
1716     opfn(rn_64, rn_64, rm_64);
1717 
1718     narrowfn(rd0, rn_64);
1719 
1720     read_neon_element64(rn_64, a->vn, 1, MO_64);
1721     read_neon_element64(rm_64, a->vm, 1, MO_64);
1722 
1723     opfn(rn_64, rn_64, rm_64);
1724 
1725     narrowfn(rd1, rn_64);
1726 
1727     write_neon_element32(rd0, a->vd, 0, MO_32);
1728     write_neon_element32(rd1, a->vd, 1, MO_32);
1729 
1730     return true;
1731 }
1732 
/*
 * Define trans_<INSN>_3d for a narrowing 3-reg-different insn.
 *   OP         - 'add' or 'sub', selecting the wide operation
 *   NARROWTYPE - prefix of the *_high_u8/_high_u16 narrowing helpers
 *   EXTOP      - narrowing function used for the a->size == 2 case
 * The size == 3 table entries are NULL, which do_narrow_3d() rejects.
 */
#define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
    static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
    {                                                                   \
        static NeonGenTwo64OpFn * const op_tbl[] = {                    \
            gen_helper_neon_##OP##l_u16,                                \
            gen_helper_neon_##OP##l_u32,                                \
            tcg_gen_##OP##_i64,                                         \
            NULL,                                                       \
        };                                                              \
        static NeonGenNarrowFn * const narrow_tbl[] = {                 \
            gen_helper_neon_##NARROWTYPE##_high_u8,                     \
            gen_helper_neon_##NARROWTYPE##_high_u16,                    \
            EXTOP,                                                      \
            NULL,                                                       \
        };                                                              \
        NeonGenTwo64OpFn *opfn = op_tbl[a->size];                       \
        NeonGenNarrowFn *narrowfn = narrow_tbl[a->size];                \
                                                                        \
        return do_narrow_3d(s, a, opfn, narrowfn);                      \
    }
1750 
1751 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
1752 {
1753     tcg_gen_addi_i64(rn, rn, 1u << 31);
1754     tcg_gen_extrh_i64_i32(rd, rn);
1755 }
1756 
1757 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
1758 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
1759 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
1760 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
1761 
1762 static bool do_long_3d(DisasContext *s, arg_3diff *a,
1763                        NeonGenTwoOpWidenFn *opfn,
1764                        NeonGenTwo64OpFn *accfn)
1765 {
1766     /*
1767      * 3-regs different lengths, long operations.
1768      * These perform an operation on two inputs that returns a double-width
1769      * result, and then possibly perform an accumulation operation of
1770      * that result into the double-width destination.
1771      */
1772     TCGv_i64 rd0, rd1, tmp;
1773     TCGv_i32 rn, rm;
1774 
1775     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1776         return false;
1777     }
1778 
1779     /* UNDEF accesses to D16-D31 if they don't exist. */
1780     if (!dc_isar_feature(aa32_simd_r32, s) &&
1781         ((a->vd | a->vn | a->vm) & 0x10)) {
1782         return false;
1783     }
1784 
1785     if (!opfn) {
1786         /* size == 3 case, which is an entirely different insn group */
1787         return false;
1788     }
1789 
1790     if (a->vd & 1) {
1791         return false;
1792     }
1793 
1794     if (!vfp_access_check(s)) {
1795         return true;
1796     }
1797 
1798     rd0 = tcg_temp_new_i64();
1799     rd1 = tcg_temp_new_i64();
1800 
1801     rn = tcg_temp_new_i32();
1802     rm = tcg_temp_new_i32();
1803     read_neon_element32(rn, a->vn, 0, MO_32);
1804     read_neon_element32(rm, a->vm, 0, MO_32);
1805     opfn(rd0, rn, rm);
1806 
1807     read_neon_element32(rn, a->vn, 1, MO_32);
1808     read_neon_element32(rm, a->vm, 1, MO_32);
1809     opfn(rd1, rn, rm);
1810 
1811     /* Don't store results until after all loads: they might overlap */
1812     if (accfn) {
1813         tmp = tcg_temp_new_i64();
1814         read_neon_element64(tmp, a->vd, 0, MO_64);
1815         accfn(rd0, tmp, rd0);
1816         read_neon_element64(tmp, a->vd, 1, MO_64);
1817         accfn(rd1, tmp, rd1);
1818     }
1819 
1820     write_neon_element64(rd0, a->vd, 0, MO_64);
1821     write_neon_element64(rd1, a->vd, 1, MO_64);
1822 
1823     return true;
1824 }
1825 
1826 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
1827 {
1828     static NeonGenTwoOpWidenFn * const opfn[] = {
1829         gen_helper_neon_abdl_s16,
1830         gen_helper_neon_abdl_s32,
1831         gen_helper_neon_abdl_s64,
1832         NULL,
1833     };
1834 
1835     return do_long_3d(s, a, opfn[a->size], NULL);
1836 }
1837 
1838 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
1839 {
1840     static NeonGenTwoOpWidenFn * const opfn[] = {
1841         gen_helper_neon_abdl_u16,
1842         gen_helper_neon_abdl_u32,
1843         gen_helper_neon_abdl_u64,
1844         NULL,
1845     };
1846 
1847     return do_long_3d(s, a, opfn[a->size], NULL);
1848 }
1849 
1850 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
1851 {
1852     static NeonGenTwoOpWidenFn * const opfn[] = {
1853         gen_helper_neon_abdl_s16,
1854         gen_helper_neon_abdl_s32,
1855         gen_helper_neon_abdl_s64,
1856         NULL,
1857     };
1858     static NeonGenTwo64OpFn * const addfn[] = {
1859         gen_helper_neon_addl_u16,
1860         gen_helper_neon_addl_u32,
1861         tcg_gen_add_i64,
1862         NULL,
1863     };
1864 
1865     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
1866 }
1867 
1868 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
1869 {
1870     static NeonGenTwoOpWidenFn * const opfn[] = {
1871         gen_helper_neon_abdl_u16,
1872         gen_helper_neon_abdl_u32,
1873         gen_helper_neon_abdl_u64,
1874         NULL,
1875     };
1876     static NeonGenTwo64OpFn * const addfn[] = {
1877         gen_helper_neon_addl_u16,
1878         gen_helper_neon_addl_u32,
1879         tcg_gen_add_i64,
1880         NULL,
1881     };
1882 
1883     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
1884 }
1885 
1886 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
1887 {
1888     TCGv_i32 lo = tcg_temp_new_i32();
1889     TCGv_i32 hi = tcg_temp_new_i32();
1890 
1891     tcg_gen_muls2_i32(lo, hi, rn, rm);
1892     tcg_gen_concat_i32_i64(rd, lo, hi);
1893 }
1894 
1895 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
1896 {
1897     TCGv_i32 lo = tcg_temp_new_i32();
1898     TCGv_i32 hi = tcg_temp_new_i32();
1899 
1900     tcg_gen_mulu2_i32(lo, hi, rn, rm);
1901     tcg_gen_concat_i32_i64(rd, lo, hi);
1902 }
1903 
1904 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
1905 {
1906     static NeonGenTwoOpWidenFn * const opfn[] = {
1907         gen_helper_neon_mull_s8,
1908         gen_helper_neon_mull_s16,
1909         gen_mull_s32,
1910         NULL,
1911     };
1912 
1913     return do_long_3d(s, a, opfn[a->size], NULL);
1914 }
1915 
1916 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
1917 {
1918     static NeonGenTwoOpWidenFn * const opfn[] = {
1919         gen_helper_neon_mull_u8,
1920         gen_helper_neon_mull_u16,
1921         gen_mull_u32,
1922         NULL,
1923     };
1924 
1925     return do_long_3d(s, a, opfn[a->size], NULL);
1926 }
1927 
1928 #define DO_VMLAL(INSN,MULL,ACC)                                         \
1929     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1930     {                                                                   \
1931         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
1932             gen_helper_neon_##MULL##8,                                  \
1933             gen_helper_neon_##MULL##16,                                 \
1934             gen_##MULL##32,                                             \
1935             NULL,                                                       \
1936         };                                                              \
1937         static NeonGenTwo64OpFn * const accfn[] = {                     \
1938             gen_helper_neon_##ACC##l_u16,                               \
1939             gen_helper_neon_##ACC##l_u32,                               \
1940             tcg_gen_##ACC##_i64,                                        \
1941             NULL,                                                       \
1942         };                                                              \
1943         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
1944     }
1945 
1946 DO_VMLAL(VMLAL_S,mull_s,add)
1947 DO_VMLAL(VMLAL_U,mull_u,add)
1948 DO_VMLAL(VMLSL_S,mull_s,sub)
1949 DO_VMLAL(VMLSL_U,mull_u,sub)
1950 
1951 static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
1952 {
1953     gen_helper_neon_mull_s16(rd, rn, rm);
1954     gen_helper_neon_addl_saturate_s32(rd, tcg_env, rd, rd);
1955 }
1956 
1957 static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
1958 {
1959     gen_mull_s32(rd, rn, rm);
1960     gen_helper_neon_addl_saturate_s64(rd, tcg_env, rd, rd);
1961 }
1962 
1963 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
1964 {
1965     static NeonGenTwoOpWidenFn * const opfn[] = {
1966         NULL,
1967         gen_VQDMULL_16,
1968         gen_VQDMULL_32,
1969         NULL,
1970     };
1971 
1972     return do_long_3d(s, a, opfn[a->size], NULL);
1973 }
1974 
1975 static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
1976 {
1977     gen_helper_neon_addl_saturate_s32(rd, tcg_env, rn, rm);
1978 }
1979 
1980 static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
1981 {
1982     gen_helper_neon_addl_saturate_s64(rd, tcg_env, rn, rm);
1983 }
1984 
1985 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
1986 {
1987     static NeonGenTwoOpWidenFn * const opfn[] = {
1988         NULL,
1989         gen_VQDMULL_16,
1990         gen_VQDMULL_32,
1991         NULL,
1992     };
1993     static NeonGenTwo64OpFn * const accfn[] = {
1994         NULL,
1995         gen_VQDMLAL_acc_16,
1996         gen_VQDMLAL_acc_32,
1997         NULL,
1998     };
1999 
2000     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2001 }
2002 
2003 static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2004 {
2005     gen_helper_neon_negl_u32(rm, rm);
2006     gen_helper_neon_addl_saturate_s32(rd, tcg_env, rn, rm);
2007 }
2008 
2009 static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2010 {
2011     tcg_gen_neg_i64(rm, rm);
2012     gen_helper_neon_addl_saturate_s64(rd, tcg_env, rn, rm);
2013 }
2014 
2015 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2016 {
2017     static NeonGenTwoOpWidenFn * const opfn[] = {
2018         NULL,
2019         gen_VQDMULL_16,
2020         gen_VQDMULL_32,
2021         NULL,
2022     };
2023     static NeonGenTwo64OpFn * const accfn[] = {
2024         NULL,
2025         gen_VQDMLSL_acc_16,
2026         gen_VQDMLSL_acc_32,
2027         NULL,
2028     };
2029 
2030     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2031 }
2032 
2033 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2034 {
2035     gen_helper_gvec_3 *fn_gvec;
2036 
2037     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2038         return false;
2039     }
2040 
2041     /* UNDEF accesses to D16-D31 if they don't exist. */
2042     if (!dc_isar_feature(aa32_simd_r32, s) &&
2043         ((a->vd | a->vn | a->vm) & 0x10)) {
2044         return false;
2045     }
2046 
2047     if (a->vd & 1) {
2048         return false;
2049     }
2050 
2051     switch (a->size) {
2052     case 0:
2053         fn_gvec = gen_helper_neon_pmull_h;
2054         break;
2055     case 2:
2056         if (!dc_isar_feature(aa32_pmull, s)) {
2057             return false;
2058         }
2059         fn_gvec = gen_helper_gvec_pmull_q;
2060         break;
2061     default:
2062         return false;
2063     }
2064 
2065     if (!vfp_access_check(s)) {
2066         return true;
2067     }
2068 
2069     tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
2070                        neon_full_reg_offset(a->vn),
2071                        neon_full_reg_offset(a->vm),
2072                        16, 16, 0, fn_gvec);
2073     return true;
2074 }
2075 
2076 static void gen_neon_dup_low16(TCGv_i32 var)
2077 {
2078     TCGv_i32 tmp = tcg_temp_new_i32();
2079     tcg_gen_ext16u_i32(var, var);
2080     tcg_gen_shli_i32(tmp, var, 16);
2081     tcg_gen_or_i32(var, var, tmp);
2082 }
2083 
2084 static void gen_neon_dup_high16(TCGv_i32 var)
2085 {
2086     TCGv_i32 tmp = tcg_temp_new_i32();
2087     tcg_gen_andi_i32(var, var, 0xffff0000);
2088     tcg_gen_shri_i32(tmp, var, 16);
2089     tcg_gen_or_i32(var, var, tmp);
2090 }
2091 
2092 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2093 {
2094     TCGv_i32 tmp = tcg_temp_new_i32();
2095     if (size == MO_16) {
2096         read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
2097         if (reg & 8) {
2098             gen_neon_dup_high16(tmp);
2099         } else {
2100             gen_neon_dup_low16(tmp);
2101         }
2102     } else {
2103         read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
2104     }
2105     return tmp;
2106 }
2107 
2108 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2109                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2110 {
2111     /*
2112      * Two registers and a scalar: perform an operation between
2113      * the input elements and the scalar, and then possibly
2114      * perform an accumulation operation of that result into the
2115      * destination.
2116      */
2117     TCGv_i32 scalar, tmp;
2118     int pass;
2119 
2120     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2121         return false;
2122     }
2123 
2124     /* UNDEF accesses to D16-D31 if they don't exist. */
2125     if (!dc_isar_feature(aa32_simd_r32, s) &&
2126         ((a->vd | a->vn | a->vm) & 0x10)) {
2127         return false;
2128     }
2129 
2130     if (!opfn) {
2131         /* Bad size (including size == 3, which is a different insn group) */
2132         return false;
2133     }
2134 
2135     if (a->q && ((a->vd | a->vn) & 1)) {
2136         return false;
2137     }
2138 
2139     if (!vfp_access_check(s)) {
2140         return true;
2141     }
2142 
2143     scalar = neon_get_scalar(a->size, a->vm);
2144     tmp = tcg_temp_new_i32();
2145 
2146     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2147         read_neon_element32(tmp, a->vn, pass, MO_32);
2148         opfn(tmp, tmp, scalar);
2149         if (accfn) {
2150             TCGv_i32 rd = tcg_temp_new_i32();
2151             read_neon_element32(rd, a->vd, pass, MO_32);
2152             accfn(tmp, rd, tmp);
2153         }
2154         write_neon_element32(tmp, a->vd, pass, MO_32);
2155     }
2156     return true;
2157 }
2158 
2159 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2160 {
2161     static NeonGenTwoOpFn * const opfn[] = {
2162         NULL,
2163         gen_helper_neon_mul_u16,
2164         tcg_gen_mul_i32,
2165         NULL,
2166     };
2167 
2168     return do_2scalar(s, a, opfn[a->size], NULL);
2169 }
2170 
2171 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2172 {
2173     static NeonGenTwoOpFn * const opfn[] = {
2174         NULL,
2175         gen_helper_neon_mul_u16,
2176         tcg_gen_mul_i32,
2177         NULL,
2178     };
2179     static NeonGenTwoOpFn * const accfn[] = {
2180         NULL,
2181         gen_helper_neon_add_u16,
2182         tcg_gen_add_i32,
2183         NULL,
2184     };
2185 
2186     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2187 }
2188 
2189 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2190 {
2191     static NeonGenTwoOpFn * const opfn[] = {
2192         NULL,
2193         gen_helper_neon_mul_u16,
2194         tcg_gen_mul_i32,
2195         NULL,
2196     };
2197     static NeonGenTwoOpFn * const accfn[] = {
2198         NULL,
2199         gen_helper_neon_sub_u16,
2200         tcg_gen_sub_i32,
2201         NULL,
2202     };
2203 
2204     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2205 }
2206 
2207 static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
2208                               gen_helper_gvec_3_ptr *fn)
2209 {
2210     /* Two registers and a scalar, using gvec */
2211     int vec_size = a->q ? 16 : 8;
2212     int rd_ofs = neon_full_reg_offset(a->vd);
2213     int rn_ofs = neon_full_reg_offset(a->vn);
2214     int rm_ofs;
2215     int idx;
2216     TCGv_ptr fpstatus;
2217 
2218     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2219         return false;
2220     }
2221 
2222     /* UNDEF accesses to D16-D31 if they don't exist. */
2223     if (!dc_isar_feature(aa32_simd_r32, s) &&
2224         ((a->vd | a->vn | a->vm) & 0x10)) {
2225         return false;
2226     }
2227 
2228     if (!fn) {
2229         /* Bad size (including size == 3, which is a different insn group) */
2230         return false;
2231     }
2232 
2233     if (a->q && ((a->vd | a->vn) & 1)) {
2234         return false;
2235     }
2236 
2237     if (!vfp_access_check(s)) {
2238         return true;
2239     }
2240 
2241     /* a->vm is M:Vm, which encodes both register and index */
2242     idx = extract32(a->vm, a->size + 2, 2);
2243     a->vm = extract32(a->vm, 0, a->size + 2);
2244     rm_ofs = neon_full_reg_offset(a->vm);
2245 
2246     fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
2247     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
2248                        vec_size, vec_size, idx, fn);
2249     return true;
2250 }
2251 
2252 #define DO_VMUL_F_2sc(NAME, FUNC)                                       \
2253     static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a)   \
2254     {                                                                   \
2255         static gen_helper_gvec_3_ptr * const opfn[] = {                 \
2256             NULL,                                                       \
2257             gen_helper_##FUNC##_h,                                      \
2258             gen_helper_##FUNC##_s,                                      \
2259             NULL,                                                       \
2260         };                                                              \
2261         if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
2262             return false;                                               \
2263         }                                                               \
2264         return do_2scalar_fp_vec(s, a, opfn[a->size]);                  \
2265     }
2266 
2267 DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
2268 DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
2269 DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
2270 
2271 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2272 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2273 WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2274 WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2275 
2276 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2277 {
2278     static NeonGenTwoOpFn * const opfn[] = {
2279         NULL,
2280         gen_VQDMULH_16,
2281         gen_VQDMULH_32,
2282         NULL,
2283     };
2284 
2285     return do_2scalar(s, a, opfn[a->size], NULL);
2286 }
2287 
2288 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2289 {
2290     static NeonGenTwoOpFn * const opfn[] = {
2291         NULL,
2292         gen_VQRDMULH_16,
2293         gen_VQRDMULH_32,
2294         NULL,
2295     };
2296 
2297     return do_2scalar(s, a, opfn[a->size], NULL);
2298 }
2299 
2300 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2301                             NeonGenThreeOpEnvFn *opfn)
2302 {
2303     /*
2304      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2305      * performs a kind of fused op-then-accumulate using a helper
2306      * function that takes all of rd, rn and the scalar at once.
2307      */
2308     TCGv_i32 scalar, rn, rd;
2309     int pass;
2310 
2311     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2312         return false;
2313     }
2314 
2315     if (!dc_isar_feature(aa32_rdm, s)) {
2316         return false;
2317     }
2318 
2319     /* UNDEF accesses to D16-D31 if they don't exist. */
2320     if (!dc_isar_feature(aa32_simd_r32, s) &&
2321         ((a->vd | a->vn | a->vm) & 0x10)) {
2322         return false;
2323     }
2324 
2325     if (!opfn) {
2326         /* Bad size (including size == 3, which is a different insn group) */
2327         return false;
2328     }
2329 
2330     if (a->q && ((a->vd | a->vn) & 1)) {
2331         return false;
2332     }
2333 
2334     if (!vfp_access_check(s)) {
2335         return true;
2336     }
2337 
2338     scalar = neon_get_scalar(a->size, a->vm);
2339     rn = tcg_temp_new_i32();
2340     rd = tcg_temp_new_i32();
2341 
2342     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2343         read_neon_element32(rn, a->vn, pass, MO_32);
2344         read_neon_element32(rd, a->vd, pass, MO_32);
2345         opfn(rd, tcg_env, rn, scalar, rd);
2346         write_neon_element32(rd, a->vd, pass, MO_32);
2347     }
2348     return true;
2349 }
2350 
2351 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2352 {
2353     static NeonGenThreeOpEnvFn *opfn[] = {
2354         NULL,
2355         gen_helper_neon_qrdmlah_s16,
2356         gen_helper_neon_qrdmlah_s32,
2357         NULL,
2358     };
2359     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2360 }
2361 
2362 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2363 {
2364     static NeonGenThreeOpEnvFn *opfn[] = {
2365         NULL,
2366         gen_helper_neon_qrdmlsh_s16,
2367         gen_helper_neon_qrdmlsh_s32,
2368         NULL,
2369     };
2370     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2371 }
2372 
2373 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2374                             NeonGenTwoOpWidenFn *opfn,
2375                             NeonGenTwo64OpFn *accfn)
2376 {
2377     /*
2378      * Two registers and a scalar, long operations: perform an
2379      * operation on the input elements and the scalar which produces
2380      * a double-width result, and then possibly perform an accumulation
2381      * operation of that result into the destination.
2382      */
2383     TCGv_i32 scalar, rn;
2384     TCGv_i64 rn0_64, rn1_64;
2385 
2386     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2387         return false;
2388     }
2389 
2390     /* UNDEF accesses to D16-D31 if they don't exist. */
2391     if (!dc_isar_feature(aa32_simd_r32, s) &&
2392         ((a->vd | a->vn | a->vm) & 0x10)) {
2393         return false;
2394     }
2395 
2396     if (!opfn) {
2397         /* Bad size (including size == 3, which is a different insn group) */
2398         return false;
2399     }
2400 
2401     if (a->vd & 1) {
2402         return false;
2403     }
2404 
2405     if (!vfp_access_check(s)) {
2406         return true;
2407     }
2408 
2409     scalar = neon_get_scalar(a->size, a->vm);
2410 
2411     /* Load all inputs before writing any outputs, in case of overlap */
2412     rn = tcg_temp_new_i32();
2413     read_neon_element32(rn, a->vn, 0, MO_32);
2414     rn0_64 = tcg_temp_new_i64();
2415     opfn(rn0_64, rn, scalar);
2416 
2417     read_neon_element32(rn, a->vn, 1, MO_32);
2418     rn1_64 = tcg_temp_new_i64();
2419     opfn(rn1_64, rn, scalar);
2420 
2421     if (accfn) {
2422         TCGv_i64 t64 = tcg_temp_new_i64();
2423         read_neon_element64(t64, a->vd, 0, MO_64);
2424         accfn(rn0_64, t64, rn0_64);
2425         read_neon_element64(t64, a->vd, 1, MO_64);
2426         accfn(rn1_64, t64, rn1_64);
2427     }
2428 
2429     write_neon_element64(rn0_64, a->vd, 0, MO_64);
2430     write_neon_element64(rn1_64, a->vd, 1, MO_64);
2431     return true;
2432 }
2433 
2434 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2435 {
2436     static NeonGenTwoOpWidenFn * const opfn[] = {
2437         NULL,
2438         gen_helper_neon_mull_s16,
2439         gen_mull_s32,
2440         NULL,
2441     };
2442 
2443     return do_2scalar_long(s, a, opfn[a->size], NULL);
2444 }
2445 
2446 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2447 {
2448     static NeonGenTwoOpWidenFn * const opfn[] = {
2449         NULL,
2450         gen_helper_neon_mull_u16,
2451         gen_mull_u32,
2452         NULL,
2453     };
2454 
2455     return do_2scalar_long(s, a, opfn[a->size], NULL);
2456 }
2457 
2458 #define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2459     static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2460     {                                                                   \
2461         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2462             NULL,                                                       \
2463             gen_helper_neon_##MULL##16,                                 \
2464             gen_##MULL##32,                                             \
2465             NULL,                                                       \
2466         };                                                              \
2467         static NeonGenTwo64OpFn * const accfn[] = {                     \
2468             NULL,                                                       \
2469             gen_helper_neon_##ACC##l_u32,                               \
2470             tcg_gen_##ACC##_i64,                                        \
2471             NULL,                                                       \
2472         };                                                              \
2473         return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2474     }
2475 
2476 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2477 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2478 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2479 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2480 
2481 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2482 {
2483     static NeonGenTwoOpWidenFn * const opfn[] = {
2484         NULL,
2485         gen_VQDMULL_16,
2486         gen_VQDMULL_32,
2487         NULL,
2488     };
2489 
2490     return do_2scalar_long(s, a, opfn[a->size], NULL);
2491 }
2492 
2493 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2494 {
2495     static NeonGenTwoOpWidenFn * const opfn[] = {
2496         NULL,
2497         gen_VQDMULL_16,
2498         gen_VQDMULL_32,
2499         NULL,
2500     };
2501     static NeonGenTwo64OpFn * const accfn[] = {
2502         NULL,
2503         gen_VQDMLAL_acc_16,
2504         gen_VQDMLAL_acc_32,
2505         NULL,
2506     };
2507 
2508     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2509 }
2510 
2511 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2512 {
2513     static NeonGenTwoOpWidenFn * const opfn[] = {
2514         NULL,
2515         gen_VQDMULL_16,
2516         gen_VQDMULL_32,
2517         NULL,
2518     };
2519     static NeonGenTwo64OpFn * const accfn[] = {
2520         NULL,
2521         gen_VQDMLSL_acc_16,
2522         gen_VQDMLSL_acc_32,
2523         NULL,
2524     };
2525 
2526     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2527 }
2528 
2529 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2530 {
2531     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2532         return false;
2533     }
2534 
2535     /* UNDEF accesses to D16-D31 if they don't exist. */
2536     if (!dc_isar_feature(aa32_simd_r32, s) &&
2537         ((a->vd | a->vn | a->vm) & 0x10)) {
2538         return false;
2539     }
2540 
2541     if ((a->vn | a->vm | a->vd) & a->q) {
2542         return false;
2543     }
2544 
2545     if (a->imm > 7 && !a->q) {
2546         return false;
2547     }
2548 
2549     if (!vfp_access_check(s)) {
2550         return true;
2551     }
2552 
2553     if (!a->q) {
2554         /* Extract 64 bits from <Vm:Vn> */
2555         TCGv_i64 left, right, dest;
2556 
2557         left = tcg_temp_new_i64();
2558         right = tcg_temp_new_i64();
2559         dest = tcg_temp_new_i64();
2560 
2561         read_neon_element64(right, a->vn, 0, MO_64);
2562         read_neon_element64(left, a->vm, 0, MO_64);
2563         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2564         write_neon_element64(dest, a->vd, 0, MO_64);
2565     } else {
2566         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2567         TCGv_i64 left, middle, right, destleft, destright;
2568 
2569         left = tcg_temp_new_i64();
2570         middle = tcg_temp_new_i64();
2571         right = tcg_temp_new_i64();
2572         destleft = tcg_temp_new_i64();
2573         destright = tcg_temp_new_i64();
2574 
2575         if (a->imm < 8) {
2576             read_neon_element64(right, a->vn, 0, MO_64);
2577             read_neon_element64(middle, a->vn, 1, MO_64);
2578             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2579             read_neon_element64(left, a->vm, 0, MO_64);
2580             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2581         } else {
2582             read_neon_element64(right, a->vn, 1, MO_64);
2583             read_neon_element64(middle, a->vm, 0, MO_64);
2584             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2585             read_neon_element64(left, a->vm, 1, MO_64);
2586             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2587         }
2588 
2589         write_neon_element64(destright, a->vd, 0, MO_64);
2590         write_neon_element64(destleft, a->vd, 1, MO_64);
2591     }
2592     return true;
2593 }
2594 
2595 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2596 {
2597     TCGv_i64 val, def;
2598     TCGv_i32 desc;
2599 
2600     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2601         return false;
2602     }
2603 
2604     /* UNDEF accesses to D16-D31 if they don't exist. */
2605     if (!dc_isar_feature(aa32_simd_r32, s) &&
2606         ((a->vd | a->vn | a->vm) & 0x10)) {
2607         return false;
2608     }
2609 
2610     if ((a->vn + a->len + 1) > 32) {
2611         /*
2612          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2613          * helper function running off the end of the register file.
2614          */
2615         return false;
2616     }
2617 
2618     if (!vfp_access_check(s)) {
2619         return true;
2620     }
2621 
2622     desc = tcg_constant_i32((a->vn << 2) | a->len);
2623     def = tcg_temp_new_i64();
2624     if (a->op) {
2625         read_neon_element64(def, a->vd, 0, MO_64);
2626     } else {
2627         tcg_gen_movi_i64(def, 0);
2628     }
2629     val = tcg_temp_new_i64();
2630     read_neon_element64(val, a->vm, 0, MO_64);
2631 
2632     gen_helper_neon_tbl(val, tcg_env, desc, val, def);
2633     write_neon_element64(val, a->vd, 0, MO_64);
2634     return true;
2635 }
2636 
2637 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
2638 {
2639     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2640         return false;
2641     }
2642 
2643     /* UNDEF accesses to D16-D31 if they don't exist. */
2644     if (!dc_isar_feature(aa32_simd_r32, s) &&
2645         ((a->vd | a->vm) & 0x10)) {
2646         return false;
2647     }
2648 
2649     if (a->vd & a->q) {
2650         return false;
2651     }
2652 
2653     if (!vfp_access_check(s)) {
2654         return true;
2655     }
2656 
2657     tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
2658                          neon_element_offset(a->vm, a->index, a->size),
2659                          a->q ? 16 : 8, a->q ? 16 : 8);
2660     return true;
2661 }
2662 
2663 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
2664 {
2665     int pass, half;
2666     TCGv_i32 tmp[2];
2667 
2668     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2669         return false;
2670     }
2671 
2672     /* UNDEF accesses to D16-D31 if they don't exist. */
2673     if (!dc_isar_feature(aa32_simd_r32, s) &&
2674         ((a->vd | a->vm) & 0x10)) {
2675         return false;
2676     }
2677 
2678     if ((a->vd | a->vm) & a->q) {
2679         return false;
2680     }
2681 
2682     if (a->size == 3) {
2683         return false;
2684     }
2685 
2686     if (!vfp_access_check(s)) {
2687         return true;
2688     }
2689 
2690     tmp[0] = tcg_temp_new_i32();
2691     tmp[1] = tcg_temp_new_i32();
2692 
2693     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
2694         for (half = 0; half < 2; half++) {
2695             read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
2696             switch (a->size) {
2697             case 0:
2698                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
2699                 break;
2700             case 1:
2701                 gen_swap_half(tmp[half], tmp[half]);
2702                 break;
2703             case 2:
2704                 break;
2705             default:
2706                 g_assert_not_reached();
2707             }
2708         }
2709         write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
2710         write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
2711     }
2712     return true;
2713 }
2714 
2715 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
2716                               NeonGenWidenFn *widenfn,
2717                               NeonGenTwo64OpFn *opfn,
2718                               NeonGenTwo64OpFn *accfn)
2719 {
2720     /*
2721      * Pairwise long operations: widen both halves of the pair,
2722      * combine the pairs with the opfn, and then possibly accumulate
2723      * into the destination with the accfn.
2724      */
2725     int pass;
2726 
2727     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2728         return false;
2729     }
2730 
2731     /* UNDEF accesses to D16-D31 if they don't exist. */
2732     if (!dc_isar_feature(aa32_simd_r32, s) &&
2733         ((a->vd | a->vm) & 0x10)) {
2734         return false;
2735     }
2736 
2737     if ((a->vd | a->vm) & a->q) {
2738         return false;
2739     }
2740 
2741     if (!widenfn) {
2742         return false;
2743     }
2744 
2745     if (!vfp_access_check(s)) {
2746         return true;
2747     }
2748 
2749     for (pass = 0; pass < a->q + 1; pass++) {
2750         TCGv_i32 tmp;
2751         TCGv_i64 rm0_64, rm1_64, rd_64;
2752 
2753         rm0_64 = tcg_temp_new_i64();
2754         rm1_64 = tcg_temp_new_i64();
2755         rd_64 = tcg_temp_new_i64();
2756 
2757         tmp = tcg_temp_new_i32();
2758         read_neon_element32(tmp, a->vm, pass * 2, MO_32);
2759         widenfn(rm0_64, tmp);
2760         read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
2761         widenfn(rm1_64, tmp);
2762 
2763         opfn(rd_64, rm0_64, rm1_64);
2764 
2765         if (accfn) {
2766             TCGv_i64 tmp64 = tcg_temp_new_i64();
2767             read_neon_element64(tmp64, a->vd, pass, MO_64);
2768             accfn(rd_64, tmp64, rd_64);
2769         }
2770         write_neon_element64(rd_64, a->vd, pass, MO_64);
2771     }
2772     return true;
2773 }
2774 
2775 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
2776 {
2777     static NeonGenWidenFn * const widenfn[] = {
2778         gen_helper_neon_widen_s8,
2779         gen_helper_neon_widen_s16,
2780         tcg_gen_ext_i32_i64,
2781         NULL,
2782     };
2783     static NeonGenTwo64OpFn * const opfn[] = {
2784         gen_helper_neon_paddl_u16,
2785         gen_helper_neon_paddl_u32,
2786         tcg_gen_add_i64,
2787         NULL,
2788     };
2789 
2790     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
2791 }
2792 
2793 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
2794 {
2795     static NeonGenWidenFn * const widenfn[] = {
2796         gen_helper_neon_widen_u8,
2797         gen_helper_neon_widen_u16,
2798         tcg_gen_extu_i32_i64,
2799         NULL,
2800     };
2801     static NeonGenTwo64OpFn * const opfn[] = {
2802         gen_helper_neon_paddl_u16,
2803         gen_helper_neon_paddl_u32,
2804         tcg_gen_add_i64,
2805         NULL,
2806     };
2807 
2808     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
2809 }
2810 
2811 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
2812 {
2813     static NeonGenWidenFn * const widenfn[] = {
2814         gen_helper_neon_widen_s8,
2815         gen_helper_neon_widen_s16,
2816         tcg_gen_ext_i32_i64,
2817         NULL,
2818     };
2819     static NeonGenTwo64OpFn * const opfn[] = {
2820         gen_helper_neon_paddl_u16,
2821         gen_helper_neon_paddl_u32,
2822         tcg_gen_add_i64,
2823         NULL,
2824     };
2825     static NeonGenTwo64OpFn * const accfn[] = {
2826         gen_helper_neon_addl_u16,
2827         gen_helper_neon_addl_u32,
2828         tcg_gen_add_i64,
2829         NULL,
2830     };
2831 
2832     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
2833                              accfn[a->size]);
2834 }
2835 
2836 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
2837 {
2838     static NeonGenWidenFn * const widenfn[] = {
2839         gen_helper_neon_widen_u8,
2840         gen_helper_neon_widen_u16,
2841         tcg_gen_extu_i32_i64,
2842         NULL,
2843     };
2844     static NeonGenTwo64OpFn * const opfn[] = {
2845         gen_helper_neon_paddl_u16,
2846         gen_helper_neon_paddl_u32,
2847         tcg_gen_add_i64,
2848         NULL,
2849     };
2850     static NeonGenTwo64OpFn * const accfn[] = {
2851         gen_helper_neon_addl_u16,
2852         gen_helper_neon_addl_u32,
2853         tcg_gen_add_i64,
2854         NULL,
2855     };
2856 
2857     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
2858                              accfn[a->size]);
2859 }
2860 
2861 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
2862 
2863 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
2864                        ZipFn *fn)
2865 {
2866     TCGv_ptr pd, pm;
2867 
2868     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2869         return false;
2870     }
2871 
2872     /* UNDEF accesses to D16-D31 if they don't exist. */
2873     if (!dc_isar_feature(aa32_simd_r32, s) &&
2874         ((a->vd | a->vm) & 0x10)) {
2875         return false;
2876     }
2877 
2878     if ((a->vd | a->vm) & a->q) {
2879         return false;
2880     }
2881 
2882     if (!fn) {
2883         /* Bad size or size/q combination */
2884         return false;
2885     }
2886 
2887     if (!vfp_access_check(s)) {
2888         return true;
2889     }
2890 
2891     pd = vfp_reg_ptr(true, a->vd);
2892     pm = vfp_reg_ptr(true, a->vm);
2893     fn(pd, pm);
2894     return true;
2895 }
2896 
2897 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
2898 {
2899     static ZipFn * const fn[2][4] = {
2900         {
2901             gen_helper_neon_unzip8,
2902             gen_helper_neon_unzip16,
2903             NULL,
2904             NULL,
2905         }, {
2906             gen_helper_neon_qunzip8,
2907             gen_helper_neon_qunzip16,
2908             gen_helper_neon_qunzip32,
2909             NULL,
2910         }
2911     };
2912     return do_zip_uzp(s, a, fn[a->q][a->size]);
2913 }
2914 
2915 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
2916 {
2917     static ZipFn * const fn[2][4] = {
2918         {
2919             gen_helper_neon_zip8,
2920             gen_helper_neon_zip16,
2921             NULL,
2922             NULL,
2923         }, {
2924             gen_helper_neon_qzip8,
2925             gen_helper_neon_qzip16,
2926             gen_helper_neon_qzip32,
2927             NULL,
2928         }
2929     };
2930     return do_zip_uzp(s, a, fn[a->q][a->size]);
2931 }
2932 
2933 static bool do_vmovn(DisasContext *s, arg_2misc *a,
2934                      NeonGenNarrowEnvFn *narrowfn)
2935 {
2936     TCGv_i64 rm;
2937     TCGv_i32 rd0, rd1;
2938 
2939     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2940         return false;
2941     }
2942 
2943     /* UNDEF accesses to D16-D31 if they don't exist. */
2944     if (!dc_isar_feature(aa32_simd_r32, s) &&
2945         ((a->vd | a->vm) & 0x10)) {
2946         return false;
2947     }
2948 
2949     if (a->vm & 1) {
2950         return false;
2951     }
2952 
2953     if (!narrowfn) {
2954         return false;
2955     }
2956 
2957     if (!vfp_access_check(s)) {
2958         return true;
2959     }
2960 
2961     rm = tcg_temp_new_i64();
2962     rd0 = tcg_temp_new_i32();
2963     rd1 = tcg_temp_new_i32();
2964 
2965     read_neon_element64(rm, a->vm, 0, MO_64);
2966     narrowfn(rd0, tcg_env, rm);
2967     read_neon_element64(rm, a->vm, 1, MO_64);
2968     narrowfn(rd1, tcg_env, rm);
2969     write_neon_element32(rd0, a->vd, 0, MO_32);
2970     write_neon_element32(rd1, a->vd, 1, MO_32);
2971     return true;
2972 }
2973 
2974 #define DO_VMOVN(INSN, FUNC)                                    \
2975     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
2976     {                                                           \
2977         static NeonGenNarrowEnvFn * const narrowfn[] = {        \
2978             FUNC##8,                                            \
2979             FUNC##16,                                           \
2980             FUNC##32,                                           \
2981             NULL,                                               \
2982         };                                                      \
2983         return do_vmovn(s, a, narrowfn[a->size]);               \
2984     }
2985 
2986 DO_VMOVN(VMOVN, gen_neon_narrow_u)
2987 DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
2988 DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
2989 DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
2990 
2991 static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
2992 {
2993     TCGv_i32 rm0, rm1;
2994     TCGv_i64 rd;
2995     static NeonGenWidenFn * const widenfns[] = {
2996         gen_helper_neon_widen_u8,
2997         gen_helper_neon_widen_u16,
2998         tcg_gen_extu_i32_i64,
2999         NULL,
3000     };
3001     NeonGenWidenFn *widenfn = widenfns[a->size];
3002 
3003     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3004         return false;
3005     }
3006 
3007     /* UNDEF accesses to D16-D31 if they don't exist. */
3008     if (!dc_isar_feature(aa32_simd_r32, s) &&
3009         ((a->vd | a->vm) & 0x10)) {
3010         return false;
3011     }
3012 
3013     if (a->vd & 1) {
3014         return false;
3015     }
3016 
3017     if (!widenfn) {
3018         return false;
3019     }
3020 
3021     if (!vfp_access_check(s)) {
3022         return true;
3023     }
3024 
3025     rd = tcg_temp_new_i64();
3026     rm0 = tcg_temp_new_i32();
3027     rm1 = tcg_temp_new_i32();
3028 
3029     read_neon_element32(rm0, a->vm, 0, MO_32);
3030     read_neon_element32(rm1, a->vm, 1, MO_32);
3031 
3032     widenfn(rd, rm0);
3033     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3034     write_neon_element64(rd, a->vd, 0, MO_64);
3035     widenfn(rd, rm1);
3036     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3037     write_neon_element64(rd, a->vd, 1, MO_64);
3038     return true;
3039 }
3040 
3041 static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a)
3042 {
3043     TCGv_ptr fpst;
3044     TCGv_i64 tmp;
3045     TCGv_i32 dst0, dst1;
3046 
3047     if (!dc_isar_feature(aa32_bf16, s)) {
3048         return false;
3049     }
3050 
3051     /* UNDEF accesses to D16-D31 if they don't exist. */
3052     if (!dc_isar_feature(aa32_simd_r32, s) &&
3053         ((a->vd | a->vm) & 0x10)) {
3054         return false;
3055     }
3056 
3057     if ((a->vm & 1) || (a->size != 1)) {
3058         return false;
3059     }
3060 
3061     if (!vfp_access_check(s)) {
3062         return true;
3063     }
3064 
3065     fpst = fpstatus_ptr(FPST_STD);
3066     tmp = tcg_temp_new_i64();
3067     dst0 = tcg_temp_new_i32();
3068     dst1 = tcg_temp_new_i32();
3069 
3070     read_neon_element64(tmp, a->vm, 0, MO_64);
3071     gen_helper_bfcvt_pair(dst0, tmp, fpst);
3072 
3073     read_neon_element64(tmp, a->vm, 1, MO_64);
3074     gen_helper_bfcvt_pair(dst1, tmp, fpst);
3075 
3076     write_neon_element32(dst0, a->vd, 0, MO_32);
3077     write_neon_element32(dst1, a->vd, 1, MO_32);
3078     return true;
3079 }
3080 
3081 static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
3082 {
3083     TCGv_ptr fpst;
3084     TCGv_i32 ahp, tmp, tmp2, tmp3;
3085 
3086     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3087         !dc_isar_feature(aa32_fp16_spconv, s)) {
3088         return false;
3089     }
3090 
3091     /* UNDEF accesses to D16-D31 if they don't exist. */
3092     if (!dc_isar_feature(aa32_simd_r32, s) &&
3093         ((a->vd | a->vm) & 0x10)) {
3094         return false;
3095     }
3096 
3097     if ((a->vm & 1) || (a->size != 1)) {
3098         return false;
3099     }
3100 
3101     if (!vfp_access_check(s)) {
3102         return true;
3103     }
3104 
3105     fpst = fpstatus_ptr(FPST_STD);
3106     ahp = get_ahp_flag();
3107     tmp = tcg_temp_new_i32();
3108     read_neon_element32(tmp, a->vm, 0, MO_32);
3109     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3110     tmp2 = tcg_temp_new_i32();
3111     read_neon_element32(tmp2, a->vm, 1, MO_32);
3112     gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
3113     tcg_gen_shli_i32(tmp2, tmp2, 16);
3114     tcg_gen_or_i32(tmp2, tmp2, tmp);
3115     read_neon_element32(tmp, a->vm, 2, MO_32);
3116     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3117     tmp3 = tcg_temp_new_i32();
3118     read_neon_element32(tmp3, a->vm, 3, MO_32);
3119     write_neon_element32(tmp2, a->vd, 0, MO_32);
3120     gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
3121     tcg_gen_shli_i32(tmp3, tmp3, 16);
3122     tcg_gen_or_i32(tmp3, tmp3, tmp);
3123     write_neon_element32(tmp3, a->vd, 1, MO_32);
3124     return true;
3125 }
3126 
3127 static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
3128 {
3129     TCGv_ptr fpst;
3130     TCGv_i32 ahp, tmp, tmp2, tmp3;
3131 
3132     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3133         !dc_isar_feature(aa32_fp16_spconv, s)) {
3134         return false;
3135     }
3136 
3137     /* UNDEF accesses to D16-D31 if they don't exist. */
3138     if (!dc_isar_feature(aa32_simd_r32, s) &&
3139         ((a->vd | a->vm) & 0x10)) {
3140         return false;
3141     }
3142 
3143     if ((a->vd & 1) || (a->size != 1)) {
3144         return false;
3145     }
3146 
3147     if (!vfp_access_check(s)) {
3148         return true;
3149     }
3150 
3151     fpst = fpstatus_ptr(FPST_STD);
3152     ahp = get_ahp_flag();
3153     tmp3 = tcg_temp_new_i32();
3154     tmp2 = tcg_temp_new_i32();
3155     tmp = tcg_temp_new_i32();
3156     read_neon_element32(tmp, a->vm, 0, MO_32);
3157     read_neon_element32(tmp2, a->vm, 1, MO_32);
3158     tcg_gen_ext16u_i32(tmp3, tmp);
3159     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3160     write_neon_element32(tmp3, a->vd, 0, MO_32);
3161     tcg_gen_shri_i32(tmp, tmp, 16);
3162     gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
3163     write_neon_element32(tmp, a->vd, 1, MO_32);
3164     tcg_gen_ext16u_i32(tmp3, tmp2);
3165     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3166     write_neon_element32(tmp3, a->vd, 2, MO_32);
3167     tcg_gen_shri_i32(tmp2, tmp2, 16);
3168     gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
3169     write_neon_element32(tmp2, a->vd, 3, MO_32);
3170     return true;
3171 }
3172 
3173 static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
3174 {
3175     int vec_size = a->q ? 16 : 8;
3176     int rd_ofs = neon_full_reg_offset(a->vd);
3177     int rm_ofs = neon_full_reg_offset(a->vm);
3178 
3179     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3180         return false;
3181     }
3182 
3183     /* UNDEF accesses to D16-D31 if they don't exist. */
3184     if (!dc_isar_feature(aa32_simd_r32, s) &&
3185         ((a->vd | a->vm) & 0x10)) {
3186         return false;
3187     }
3188 
3189     if (a->size == 3) {
3190         return false;
3191     }
3192 
3193     if ((a->vd | a->vm) & a->q) {
3194         return false;
3195     }
3196 
3197     if (!vfp_access_check(s)) {
3198         return true;
3199     }
3200 
3201     fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);
3202 
3203     return true;
3204 }
3205 
3206 #define DO_2MISC_VEC(INSN, FN)                                  \
3207     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3208     {                                                           \
3209         return do_2misc_vec(s, a, FN);                          \
3210     }
3211 
3212 DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
3213 DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
3214 DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
3215 DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
3216 DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
3217 DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
3218 DO_2MISC_VEC(VCLT0, gen_gvec_clt0)
3219 
3220 static bool trans_VMVN(DisasContext *s, arg_2misc *a)
3221 {
3222     if (a->size != 0) {
3223         return false;
3224     }
3225     return do_2misc_vec(s, a, tcg_gen_gvec_not);
3226 }
3227 
3228 #define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3229     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3230                          uint32_t rm_ofs, uint32_t oprsz,               \
3231                          uint32_t maxsz)                                \
3232     {                                                                   \
3233         tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
3234                            DATA, FUNC);                                 \
3235     }
3236 
3237 #define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3238     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3239                          uint32_t rm_ofs, uint32_t oprsz,               \
3240                          uint32_t maxsz)                                \
3241     {                                                                   \
3242         tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
3243     }
3244 
3245 WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
3246 WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aesd, 0)
3247 WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
3248 WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesimc, 0)
3249 WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
3250 WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
3251 WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
3252 
3253 #define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
3254     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3255     {                                                           \
3256         if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) {  \
3257             return false;                                       \
3258         }                                                       \
3259         return do_2misc_vec(s, a, gen_##INSN);                  \
3260     }
3261 
3262 DO_2M_CRYPTO(AESE, aa32_aes, 0)
3263 DO_2M_CRYPTO(AESD, aa32_aes, 0)
3264 DO_2M_CRYPTO(AESMC, aa32_aes, 0)
3265 DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
3266 DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
3267 DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
3268 DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
3269 
3270 static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
3271 {
3272     TCGv_i32 tmp;
3273     int pass;
3274 
3275     /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
3276     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3277         return false;
3278     }
3279 
3280     /* UNDEF accesses to D16-D31 if they don't exist. */
3281     if (!dc_isar_feature(aa32_simd_r32, s) &&
3282         ((a->vd | a->vm) & 0x10)) {
3283         return false;
3284     }
3285 
3286     if (!fn) {
3287         return false;
3288     }
3289 
3290     if ((a->vd | a->vm) & a->q) {
3291         return false;
3292     }
3293 
3294     if (!vfp_access_check(s)) {
3295         return true;
3296     }
3297 
3298     tmp = tcg_temp_new_i32();
3299     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3300         read_neon_element32(tmp, a->vm, pass, MO_32);
3301         fn(tmp, tmp);
3302         write_neon_element32(tmp, a->vd, pass, MO_32);
3303     }
3304     return true;
3305 }
3306 
3307 static bool trans_VREV32(DisasContext *s, arg_2misc *a)
3308 {
3309     static NeonGenOneOpFn * const fn[] = {
3310         tcg_gen_bswap32_i32,
3311         gen_swap_half,
3312         NULL,
3313         NULL,
3314     };
3315     return do_2misc(s, a, fn[a->size]);
3316 }
3317 
3318 static bool trans_VREV16(DisasContext *s, arg_2misc *a)
3319 {
3320     if (a->size != 0) {
3321         return false;
3322     }
3323     return do_2misc(s, a, gen_rev16);
3324 }
3325 
3326 static bool trans_VCLS(DisasContext *s, arg_2misc *a)
3327 {
3328     static NeonGenOneOpFn * const fn[] = {
3329         gen_helper_neon_cls_s8,
3330         gen_helper_neon_cls_s16,
3331         gen_helper_neon_cls_s32,
3332         NULL,
3333     };
3334     return do_2misc(s, a, fn[a->size]);
3335 }
3336 
3337 static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
3338 {
3339     tcg_gen_clzi_i32(rd, rm, 32);
3340 }
3341 
3342 static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
3343 {
3344     static NeonGenOneOpFn * const fn[] = {
3345         gen_helper_neon_clz_u8,
3346         gen_helper_neon_clz_u16,
3347         do_VCLZ_32,
3348         NULL,
3349     };
3350     return do_2misc(s, a, fn[a->size]);
3351 }
3352 
3353 static bool trans_VCNT(DisasContext *s, arg_2misc *a)
3354 {
3355     if (a->size != 0) {
3356         return false;
3357     }
3358     return do_2misc(s, a, gen_helper_neon_cnt_u8);
3359 }
3360 
3361 static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3362                        uint32_t oprsz, uint32_t maxsz)
3363 {
3364     tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
3365                       vece == MO_16 ? 0x7fff : 0x7fffffff,
3366                       oprsz, maxsz);
3367 }
3368 
3369 static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
3370 {
3371     if (a->size == MO_16) {
3372         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3373             return false;
3374         }
3375     } else if (a->size != MO_32) {
3376         return false;
3377     }
3378     return do_2misc_vec(s, a, gen_VABS_F);
3379 }
3380 
3381 static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3382                        uint32_t oprsz, uint32_t maxsz)
3383 {
3384     tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
3385                       vece == MO_16 ? 0x8000 : 0x80000000,
3386                       oprsz, maxsz);
3387 }
3388 
3389 static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
3390 {
3391     if (a->size == MO_16) {
3392         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3393             return false;
3394         }
3395     } else if (a->size != MO_32) {
3396         return false;
3397     }
3398     return do_2misc_vec(s, a, gen_VNEG_F);
3399 }
3400 
3401 static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
3402 {
3403     if (a->size != 2) {
3404         return false;
3405     }
3406     return do_2misc(s, a, gen_helper_recpe_u32);
3407 }
3408 
3409 static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
3410 {
3411     if (a->size != 2) {
3412         return false;
3413     }
3414     return do_2misc(s, a, gen_helper_rsqrte_u32);
3415 }
3416 
3417 #define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \
3418     static void WRAPNAME(TCGv_i32 d, TCGv_i32 m)        \
3419     {                                                   \
3420         FUNC(d, tcg_env, m);                            \
3421     }
3422 
3423 WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
3424 WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
3425 WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
3426 WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
3427 WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
3428 WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)
3429 
3430 static bool trans_VQABS(DisasContext *s, arg_2misc *a)
3431 {
3432     static NeonGenOneOpFn * const fn[] = {
3433         gen_VQABS_s8,
3434         gen_VQABS_s16,
3435         gen_VQABS_s32,
3436         NULL,
3437     };
3438     return do_2misc(s, a, fn[a->size]);
3439 }
3440 
3441 static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
3442 {
3443     static NeonGenOneOpFn * const fn[] = {
3444         gen_VQNEG_s8,
3445         gen_VQNEG_s16,
3446         gen_VQNEG_s32,
3447         NULL,
3448     };
3449     return do_2misc(s, a, fn[a->size]);
3450 }
3451 
3452 #define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC)                             \
3453     static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3454                            uint32_t rm_ofs,                             \
3455                            uint32_t oprsz, uint32_t maxsz)              \
3456     {                                                                   \
3457         static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3458             NULL, HFUNC, SFUNC, NULL,                                   \
3459         };                                                              \
3460         TCGv_ptr fpst;                                                  \
3461         fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);   \
3462         tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0,       \
3463                            fns[vece]);                                  \
3464     }                                                                   \
3465     static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3466     {                                                                   \
3467         if (a->size == MO_16) {                                         \
3468             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3469                 return false;                                           \
3470             }                                                           \
3471         } else if (a->size != MO_32) {                                  \
3472             return false;                                               \
3473         }                                                               \
3474         return do_2misc_vec(s, a, gen_##INSN);                          \
3475     }
3476 
3477 DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
3478 DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
3479 DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
3480 DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
3481 DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
3482 DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
3483 DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
3484 DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
3485 DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
3486 DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
3487 DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)
3488 
3489 DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)
3490 
3491 static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
3492 {
3493     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
3494         return false;
3495     }
3496     return trans_VRINTX_impl(s, a);
3497 }
3498 
3499 #define DO_VEC_RMODE(INSN, RMODE, OP)                                   \
3500     static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3501                            uint32_t rm_ofs,                             \
3502                            uint32_t oprsz, uint32_t maxsz)              \
3503     {                                                                   \
3504         static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3505             NULL,                                                       \
3506             gen_helper_gvec_##OP##h,                                    \
3507             gen_helper_gvec_##OP##s,                                    \
3508             NULL,                                                       \
3509         };                                                              \
3510         TCGv_ptr fpst;                                                  \
3511         fpst = fpstatus_ptr(vece == 1 ? FPST_STD_F16 : FPST_STD);       \
3512         tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz,          \
3513                            arm_rmode_to_sf(RMODE), fns[vece]);          \
3514     }                                                                   \
3515     static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3516     {                                                                   \
3517         if (!arm_dc_feature(s, ARM_FEATURE_V8)) {                       \
3518             return false;                                               \
3519         }                                                               \
3520         if (a->size == MO_16) {                                         \
3521             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3522                 return false;                                           \
3523             }                                                           \
3524         } else if (a->size != MO_32) {                                  \
3525             return false;                                               \
3526         }                                                               \
3527         return do_2misc_vec(s, a, gen_##INSN);                          \
3528     }
3529 
3530 DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
3531 DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
3532 DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
3533 DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
3534 DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
3535 DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
3536 DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
3537 DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)
3538 
3539 DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
3540 DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
3541 DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
3542 DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
3543 DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)
3544 
3545 static bool trans_VSWP(DisasContext *s, arg_2misc *a)
3546 {
3547     TCGv_i64 rm, rd;
3548     int pass;
3549 
3550     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3551         return false;
3552     }
3553 
3554     /* UNDEF accesses to D16-D31 if they don't exist. */
3555     if (!dc_isar_feature(aa32_simd_r32, s) &&
3556         ((a->vd | a->vm) & 0x10)) {
3557         return false;
3558     }
3559 
3560     if (a->size != 0) {
3561         return false;
3562     }
3563 
3564     if ((a->vd | a->vm) & a->q) {
3565         return false;
3566     }
3567 
3568     if (!vfp_access_check(s)) {
3569         return true;
3570     }
3571 
3572     rm = tcg_temp_new_i64();
3573     rd = tcg_temp_new_i64();
3574     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3575         read_neon_element64(rm, a->vm, pass, MO_64);
3576         read_neon_element64(rd, a->vd, pass, MO_64);
3577         write_neon_element64(rm, a->vd, pass, MO_64);
3578         write_neon_element64(rd, a->vm, pass, MO_64);
3579     }
3580     return true;
3581 }
3582 
3583 static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
3584 {
3585     TCGv_i32 rd, tmp;
3586 
3587     rd = tcg_temp_new_i32();
3588     tmp = tcg_temp_new_i32();
3589 
3590     tcg_gen_shli_i32(rd, t0, 8);
3591     tcg_gen_andi_i32(rd, rd, 0xff00ff00);
3592     tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
3593     tcg_gen_or_i32(rd, rd, tmp);
3594 
3595     tcg_gen_shri_i32(t1, t1, 8);
3596     tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
3597     tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
3598     tcg_gen_or_i32(t1, t1, tmp);
3599     tcg_gen_mov_i32(t0, rd);
3600 }
3601 
3602 static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
3603 {
3604     TCGv_i32 rd, tmp;
3605 
3606     rd = tcg_temp_new_i32();
3607     tmp = tcg_temp_new_i32();
3608 
3609     tcg_gen_shli_i32(rd, t0, 16);
3610     tcg_gen_andi_i32(tmp, t1, 0xffff);
3611     tcg_gen_or_i32(rd, rd, tmp);
3612     tcg_gen_shri_i32(t1, t1, 16);
3613     tcg_gen_andi_i32(tmp, t0, 0xffff0000);
3614     tcg_gen_or_i32(t1, t1, tmp);
3615     tcg_gen_mov_i32(t0, rd);
3616 }
3617 
3618 static bool trans_VTRN(DisasContext *s, arg_2misc *a)
3619 {
3620     TCGv_i32 tmp, tmp2;
3621     int pass;
3622 
3623     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3624         return false;
3625     }
3626 
3627     /* UNDEF accesses to D16-D31 if they don't exist. */
3628     if (!dc_isar_feature(aa32_simd_r32, s) &&
3629         ((a->vd | a->vm) & 0x10)) {
3630         return false;
3631     }
3632 
3633     if ((a->vd | a->vm) & a->q) {
3634         return false;
3635     }
3636 
3637     if (a->size == 3) {
3638         return false;
3639     }
3640 
3641     if (!vfp_access_check(s)) {
3642         return true;
3643     }
3644 
3645     tmp = tcg_temp_new_i32();
3646     tmp2 = tcg_temp_new_i32();
3647     if (a->size == MO_32) {
3648         for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
3649             read_neon_element32(tmp, a->vm, pass, MO_32);
3650             read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
3651             write_neon_element32(tmp2, a->vm, pass, MO_32);
3652             write_neon_element32(tmp, a->vd, pass + 1, MO_32);
3653         }
3654     } else {
3655         for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3656             read_neon_element32(tmp, a->vm, pass, MO_32);
3657             read_neon_element32(tmp2, a->vd, pass, MO_32);
3658             if (a->size == MO_8) {
3659                 gen_neon_trn_u8(tmp, tmp2);
3660             } else {
3661                 gen_neon_trn_u16(tmp, tmp2);
3662             }
3663             write_neon_element32(tmp2, a->vm, pass, MO_32);
3664             write_neon_element32(tmp, a->vd, pass, MO_32);
3665         }
3666     }
3667     return true;
3668 }
3669 
3670 static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a)
3671 {
3672     if (!dc_isar_feature(aa32_i8mm, s)) {
3673         return false;
3674     }
3675     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
3676                         gen_helper_gvec_smmla_b);
3677 }
3678 
3679 static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a)
3680 {
3681     if (!dc_isar_feature(aa32_i8mm, s)) {
3682         return false;
3683     }
3684     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
3685                         gen_helper_gvec_ummla_b);
3686 }
3687 
3688 static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a)
3689 {
3690     if (!dc_isar_feature(aa32_i8mm, s)) {
3691         return false;
3692     }
3693     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
3694                         gen_helper_gvec_usmmla_b);
3695 }
3696 
3697 static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a)
3698 {
3699     if (!dc_isar_feature(aa32_bf16, s)) {
3700         return false;
3701     }
3702     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
3703                         gen_helper_gvec_bfmmla);
3704 }
3705 
3706 static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a)
3707 {
3708     if (!dc_isar_feature(aa32_bf16, s)) {
3709         return false;
3710     }
3711     return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD,
3712                              gen_helper_gvec_bfmlal);
3713 }
3714 
3715 static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a)
3716 {
3717     if (!dc_isar_feature(aa32_bf16, s)) {
3718         return false;
3719     }
3720     return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm,
3721                              (a->index << 1) | a->q, FPST_STD,
3722                              gen_helper_gvec_bfmlal_idx);
3723 }
3724