xref: /openbmc/qemu/target/arm/tcg/translate-neon.c (revision f8ed3648)
1 /*
2  *  ARM translation: AArch32 Neon instructions
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *  Copyright (c) 2005-2007 CodeSourcery
6  *  Copyright (c) 2007 OpenedHand, Ltd.
7  *  Copyright (c) 2020 Linaro, Ltd.
8  *
9  * This library is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * This library is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
21  */
22 
23 #include "qemu/osdep.h"
24 #include "translate.h"
25 #include "translate-a32.h"
26 
27 /* Include the generated Neon decoder */
28 #include "decode-neon-dp.c.inc"
29 #include "decode-neon-ls.c.inc"
30 #include "decode-neon-shared.c.inc"
31 
32 static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
33 {
34     TCGv_ptr ret = tcg_temp_new_ptr();
35     tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg));
36     return ret;
37 }
38 
39 static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
40 {
41     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
42 
43     switch (mop) {
44     case MO_UB:
45         tcg_gen_ld8u_i32(var, cpu_env, offset);
46         break;
47     case MO_UW:
48         tcg_gen_ld16u_i32(var, cpu_env, offset);
49         break;
50     case MO_UL:
51         tcg_gen_ld_i32(var, cpu_env, offset);
52         break;
53     default:
54         g_assert_not_reached();
55     }
56 }
57 
58 static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
59 {
60     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
61 
62     switch (mop) {
63     case MO_UB:
64         tcg_gen_ld8u_i64(var, cpu_env, offset);
65         break;
66     case MO_UW:
67         tcg_gen_ld16u_i64(var, cpu_env, offset);
68         break;
69     case MO_UL:
70         tcg_gen_ld32u_i64(var, cpu_env, offset);
71         break;
72     case MO_UQ:
73         tcg_gen_ld_i64(var, cpu_env, offset);
74         break;
75     default:
76         g_assert_not_reached();
77     }
78 }
79 
80 static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
81 {
82     long offset = neon_element_offset(reg, ele, size);
83 
84     switch (size) {
85     case MO_8:
86         tcg_gen_st8_i32(var, cpu_env, offset);
87         break;
88     case MO_16:
89         tcg_gen_st16_i32(var, cpu_env, offset);
90         break;
91     case MO_32:
92         tcg_gen_st_i32(var, cpu_env, offset);
93         break;
94     default:
95         g_assert_not_reached();
96     }
97 }
98 
99 static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
100 {
101     long offset = neon_element_offset(reg, ele, size);
102 
103     switch (size) {
104     case MO_8:
105         tcg_gen_st8_i64(var, cpu_env, offset);
106         break;
107     case MO_16:
108         tcg_gen_st16_i64(var, cpu_env, offset);
109         break;
110     case MO_32:
111         tcg_gen_st32_i64(var, cpu_env, offset);
112         break;
113     case MO_64:
114         tcg_gen_st_i64(var, cpu_env, offset);
115         break;
116     default:
117         g_assert_not_reached();
118     }
119 }
120 
121 static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm,
122                          int data, gen_helper_gvec_4 *fn_gvec)
123 {
124     /* UNDEF accesses to D16-D31 if they don't exist. */
125     if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
126         return false;
127     }
128 
129     /*
130      * UNDEF accesses to odd registers for each bit of Q.
131      * Q will be 0b111 for all Q-reg instructions, otherwise
132      * when we have mixed Q- and D-reg inputs.
133      */
134     if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
135         return false;
136     }
137 
138     if (!vfp_access_check(s)) {
139         return true;
140     }
141 
142     int opr_sz = q ? 16 : 8;
143     tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd),
144                        vfp_reg_offset(1, vn),
145                        vfp_reg_offset(1, vm),
146                        vfp_reg_offset(1, vd),
147                        opr_sz, opr_sz, data, fn_gvec);
148     return true;
149 }
150 
151 static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm,
152                               int data, ARMFPStatusFlavour fp_flavour,
153                               gen_helper_gvec_4_ptr *fn_gvec_ptr)
154 {
155     /* UNDEF accesses to D16-D31 if they don't exist. */
156     if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
157         return false;
158     }
159 
160     /*
161      * UNDEF accesses to odd registers for each bit of Q.
162      * Q will be 0b111 for all Q-reg instructions, otherwise
163      * when we have mixed Q- and D-reg inputs.
164      */
165     if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
166         return false;
167     }
168 
169     if (!vfp_access_check(s)) {
170         return true;
171     }
172 
173     int opr_sz = q ? 16 : 8;
174     TCGv_ptr fpst = fpstatus_ptr(fp_flavour);
175 
176     tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd),
177                        vfp_reg_offset(1, vn),
178                        vfp_reg_offset(1, vm),
179                        vfp_reg_offset(1, vd),
180                        fpst, opr_sz, opr_sz, data, fn_gvec_ptr);
181     return true;
182 }
183 
184 static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
185 {
186     if (!dc_isar_feature(aa32_vcma, s)) {
187         return false;
188     }
189     if (a->size == MO_16) {
190         if (!dc_isar_feature(aa32_fp16_arith, s)) {
191             return false;
192         }
193         return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
194                                  FPST_STD_F16, gen_helper_gvec_fcmlah);
195     }
196     return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
197                              FPST_STD, gen_helper_gvec_fcmlas);
198 }
199 
200 static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
201 {
202     int opr_sz;
203     TCGv_ptr fpst;
204     gen_helper_gvec_3_ptr *fn_gvec_ptr;
205 
206     if (!dc_isar_feature(aa32_vcma, s)
207         || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
208         return false;
209     }
210 
211     /* UNDEF accesses to D16-D31 if they don't exist. */
212     if (!dc_isar_feature(aa32_simd_r32, s) &&
213         ((a->vd | a->vn | a->vm) & 0x10)) {
214         return false;
215     }
216 
217     if ((a->vn | a->vm | a->vd) & a->q) {
218         return false;
219     }
220 
221     if (!vfp_access_check(s)) {
222         return true;
223     }
224 
225     opr_sz = (1 + a->q) * 8;
226     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
227     fn_gvec_ptr = (a->size == MO_16) ?
228         gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
229     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
230                        vfp_reg_offset(1, a->vn),
231                        vfp_reg_offset(1, a->vm),
232                        fpst, opr_sz, opr_sz, a->rot,
233                        fn_gvec_ptr);
234     return true;
235 }
236 
237 static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a)
238 {
239     if (!dc_isar_feature(aa32_dp, s)) {
240         return false;
241     }
242     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
243                         gen_helper_gvec_sdot_b);
244 }
245 
246 static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a)
247 {
248     if (!dc_isar_feature(aa32_dp, s)) {
249         return false;
250     }
251     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
252                         gen_helper_gvec_udot_b);
253 }
254 
255 static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
256 {
257     if (!dc_isar_feature(aa32_i8mm, s)) {
258         return false;
259     }
260     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
261                         gen_helper_gvec_usdot_b);
262 }
263 
264 static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
265 {
266     if (!dc_isar_feature(aa32_bf16, s)) {
267         return false;
268     }
269     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
270                         gen_helper_gvec_bfdot);
271 }
272 
273 static bool trans_VFML(DisasContext *s, arg_VFML *a)
274 {
275     int opr_sz;
276 
277     if (!dc_isar_feature(aa32_fhm, s)) {
278         return false;
279     }
280 
281     /* UNDEF accesses to D16-D31 if they don't exist. */
282     if (!dc_isar_feature(aa32_simd_r32, s) &&
283         (a->vd & 0x10)) {
284         return false;
285     }
286 
287     if (a->vd & a->q) {
288         return false;
289     }
290 
291     if (!vfp_access_check(s)) {
292         return true;
293     }
294 
295     opr_sz = (1 + a->q) * 8;
296     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
297                        vfp_reg_offset(a->q, a->vn),
298                        vfp_reg_offset(a->q, a->vm),
299                        cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
300                        gen_helper_gvec_fmlal_a32);
301     return true;
302 }
303 
304 static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
305 {
306     int data = (a->index << 2) | a->rot;
307 
308     if (!dc_isar_feature(aa32_vcma, s)) {
309         return false;
310     }
311     if (a->size == MO_16) {
312         if (!dc_isar_feature(aa32_fp16_arith, s)) {
313             return false;
314         }
315         return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
316                                  FPST_STD_F16, gen_helper_gvec_fcmlah_idx);
317     }
318     return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
319                              FPST_STD, gen_helper_gvec_fcmlas_idx);
320 }
321 
322 static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a)
323 {
324     if (!dc_isar_feature(aa32_dp, s)) {
325         return false;
326     }
327     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
328                         gen_helper_gvec_sdot_idx_b);
329 }
330 
331 static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a)
332 {
333     if (!dc_isar_feature(aa32_dp, s)) {
334         return false;
335     }
336     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
337                         gen_helper_gvec_udot_idx_b);
338 }
339 
340 static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a)
341 {
342     if (!dc_isar_feature(aa32_i8mm, s)) {
343         return false;
344     }
345     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
346                         gen_helper_gvec_usdot_idx_b);
347 }
348 
349 static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
350 {
351     if (!dc_isar_feature(aa32_i8mm, s)) {
352         return false;
353     }
354     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
355                         gen_helper_gvec_sudot_idx_b);
356 }
357 
358 static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a)
359 {
360     if (!dc_isar_feature(aa32_bf16, s)) {
361         return false;
362     }
363     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
364                         gen_helper_gvec_bfdot_idx);
365 }
366 
367 static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
368 {
369     int opr_sz;
370 
371     if (!dc_isar_feature(aa32_fhm, s)) {
372         return false;
373     }
374 
375     /* UNDEF accesses to D16-D31 if they don't exist. */
376     if (!dc_isar_feature(aa32_simd_r32, s) &&
377         ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
378         return false;
379     }
380 
381     if (a->vd & a->q) {
382         return false;
383     }
384 
385     if (!vfp_access_check(s)) {
386         return true;
387     }
388 
389     opr_sz = (1 + a->q) * 8;
390     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
391                        vfp_reg_offset(a->q, a->vn),
392                        vfp_reg_offset(a->q, a->rm),
393                        cpu_env, opr_sz, opr_sz,
394                        (a->index << 2) | a->s, /* is_2 == 0 */
395                        gen_helper_gvec_fmlal_idx_a32);
396     return true;
397 }
398 
/*
 * Per-itype parameters for Neon "load/store multiple structures",
 * indexed by the instruction's itype field (valid values 0..10).
 */
static const struct {
    int nregs;
    int interleave;
    int spacing;
} neon_ls_element_type[11] = {
    [0]  = { .nregs = 1, .interleave = 4, .spacing = 1 },
    [1]  = { .nregs = 1, .interleave = 4, .spacing = 2 },
    [2]  = { .nregs = 4, .interleave = 1, .spacing = 1 },
    [3]  = { .nregs = 2, .interleave = 2, .spacing = 2 },
    [4]  = { .nregs = 1, .interleave = 3, .spacing = 1 },
    [5]  = { .nregs = 1, .interleave = 3, .spacing = 2 },
    [6]  = { .nregs = 3, .interleave = 1, .spacing = 1 },
    [7]  = { .nregs = 1, .interleave = 1, .spacing = 1 },
    [8]  = { .nregs = 1, .interleave = 2, .spacing = 1 },
    [9]  = { .nregs = 1, .interleave = 2, .spacing = 2 },
    [10] = { .nregs = 2, .interleave = 1, .spacing = 1 },
};
416 
417 static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
418                                       int stride)
419 {
420     if (rm != 15) {
421         TCGv_i32 base;
422 
423         base = load_reg(s, rn);
424         if (rm == 13) {
425             tcg_gen_addi_i32(base, base, stride);
426         } else {
427             TCGv_i32 index;
428             index = load_reg(s, rm);
429             tcg_gen_add_i32(base, base, index);
430         }
431         store_reg(s, rn, base);
432     }
433 }
434 
435 static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
436 {
437     /* Neon load/store multiple structures */
438     int nregs, interleave, spacing, reg, n;
439     MemOp mop, align, endian;
440     int mmu_idx = get_mem_index(s);
441     int size = a->size;
442     TCGv_i64 tmp64;
443     TCGv_i32 addr;
444 
445     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
446         return false;
447     }
448 
449     /* UNDEF accesses to D16-D31 if they don't exist */
450     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
451         return false;
452     }
453     if (a->itype > 10) {
454         return false;
455     }
456     /* Catch UNDEF cases for bad values of align field */
457     switch (a->itype & 0xc) {
458     case 4:
459         if (a->align >= 2) {
460             return false;
461         }
462         break;
463     case 8:
464         if (a->align == 3) {
465             return false;
466         }
467         break;
468     default:
469         break;
470     }
471     nregs = neon_ls_element_type[a->itype].nregs;
472     interleave = neon_ls_element_type[a->itype].interleave;
473     spacing = neon_ls_element_type[a->itype].spacing;
474     if (size == 3 && (interleave | spacing) != 1) {
475         return false;
476     }
477 
478     if (!vfp_access_check(s)) {
479         return true;
480     }
481 
482     /* For our purposes, bytes are always little-endian.  */
483     endian = s->be_data;
484     if (size == 0) {
485         endian = MO_LE;
486     }
487 
488     /* Enforce alignment requested by the instruction */
489     if (a->align) {
490         align = pow2_align(a->align + 2); /* 4 ** a->align */
491     } else {
492         align = s->align_mem ? MO_ALIGN : 0;
493     }
494 
495     /*
496      * Consecutive little-endian elements from a single register
497      * can be promoted to a larger little-endian operation.
498      */
499     if (interleave == 1 && endian == MO_LE) {
500         /* Retain any natural alignment. */
501         if (align == MO_ALIGN) {
502             align = pow2_align(size);
503         }
504         size = 3;
505     }
506 
507     tmp64 = tcg_temp_new_i64();
508     addr = tcg_temp_new_i32();
509     load_reg_var(s, addr, a->rn);
510 
511     mop = endian | size | align;
512     for (reg = 0; reg < nregs; reg++) {
513         for (n = 0; n < 8 >> size; n++) {
514             int xs;
515             for (xs = 0; xs < interleave; xs++) {
516                 int tt = a->vd + reg + spacing * xs;
517 
518                 if (a->l) {
519                     gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop);
520                     neon_store_element64(tt, n, size, tmp64);
521                 } else {
522                     neon_load_element64(tmp64, tt, n, size);
523                     gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop);
524                 }
525                 tcg_gen_addi_i32(addr, addr, 1 << size);
526 
527                 /* Subsequent memory operations inherit alignment */
528                 mop &= ~MO_AMASK;
529             }
530         }
531     }
532 
533     gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
534     return true;
535 }
536 
537 static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
538 {
539     /* Neon load single structure to all lanes */
540     int reg, stride, vec_size;
541     int vd = a->vd;
542     int size = a->size;
543     int nregs = a->n + 1;
544     TCGv_i32 addr, tmp;
545     MemOp mop, align;
546 
547     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
548         return false;
549     }
550 
551     /* UNDEF accesses to D16-D31 if they don't exist */
552     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
553         return false;
554     }
555 
556     align = 0;
557     if (size == 3) {
558         if (nregs != 4 || a->a == 0) {
559             return false;
560         }
561         /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
562         size = MO_32;
563         align = MO_ALIGN_16;
564     } else if (a->a) {
565         switch (nregs) {
566         case 1:
567             if (size == 0) {
568                 return false;
569             }
570             align = MO_ALIGN;
571             break;
572         case 2:
573             align = pow2_align(size + 1);
574             break;
575         case 3:
576             return false;
577         case 4:
578             if (size == 2) {
579                 align = pow2_align(3);
580             } else {
581                 align = pow2_align(size + 2);
582             }
583             break;
584         default:
585             g_assert_not_reached();
586         }
587     }
588 
589     if (!vfp_access_check(s)) {
590         return true;
591     }
592 
593     /*
594      * VLD1 to all lanes: T bit indicates how many Dregs to write.
595      * VLD2/3/4 to all lanes: T bit indicates register stride.
596      */
597     stride = a->t ? 2 : 1;
598     vec_size = nregs == 1 ? stride * 8 : 8;
599     mop = size | align;
600     tmp = tcg_temp_new_i32();
601     addr = tcg_temp_new_i32();
602     load_reg_var(s, addr, a->rn);
603     for (reg = 0; reg < nregs; reg++) {
604         gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop);
605         if ((vd & 1) && vec_size == 16) {
606             /*
607              * We cannot write 16 bytes at once because the
608              * destination is unaligned.
609              */
610             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
611                                  8, 8, tmp);
612             tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
613                              neon_full_reg_offset(vd), 8, 8);
614         } else {
615             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
616                                  vec_size, vec_size, tmp);
617         }
618         tcg_gen_addi_i32(addr, addr, 1 << size);
619         vd += stride;
620 
621         /* Subsequent memory operations inherit alignment */
622         mop &= ~MO_AMASK;
623     }
624 
625     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);
626 
627     return true;
628 }
629 
630 static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
631 {
632     /* Neon load/store single structure to one lane */
633     int reg;
634     int nregs = a->n + 1;
635     int vd = a->vd;
636     TCGv_i32 addr, tmp;
637     MemOp mop;
638 
639     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
640         return false;
641     }
642 
643     /* UNDEF accesses to D16-D31 if they don't exist */
644     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
645         return false;
646     }
647 
648     /* Catch the UNDEF cases. This is unavoidably a bit messy. */
649     switch (nregs) {
650     case 1:
651         if (a->stride != 1) {
652             return false;
653         }
654         if (((a->align & (1 << a->size)) != 0) ||
655             (a->size == 2 && (a->align == 1 || a->align == 2))) {
656             return false;
657         }
658         break;
659     case 2:
660         if (a->size == 2 && (a->align & 2) != 0) {
661             return false;
662         }
663         break;
664     case 3:
665         if (a->align != 0) {
666             return false;
667         }
668         break;
669     case 4:
670         if (a->size == 2 && a->align == 3) {
671             return false;
672         }
673         break;
674     default:
675         g_assert_not_reached();
676     }
677     if ((vd + a->stride * (nregs - 1)) > 31) {
678         /*
679          * Attempts to write off the end of the register file are
680          * UNPREDICTABLE; we choose to UNDEF because otherwise we would
681          * access off the end of the array that holds the register data.
682          */
683         return false;
684     }
685 
686     if (!vfp_access_check(s)) {
687         return true;
688     }
689 
690     /* Pick up SCTLR settings */
691     mop = finalize_memop(s, a->size);
692 
693     if (a->align) {
694         MemOp align_op;
695 
696         switch (nregs) {
697         case 1:
698             /* For VLD1, use natural alignment. */
699             align_op = MO_ALIGN;
700             break;
701         case 2:
702             /* For VLD2, use double alignment. */
703             align_op = pow2_align(a->size + 1);
704             break;
705         case 4:
706             if (a->size == MO_32) {
707                 /*
708                  * For VLD4.32, align = 1 is double alignment, align = 2 is
709                  * quad alignment; align = 3 is rejected above.
710                  */
711                 align_op = pow2_align(a->size + a->align);
712             } else {
713                 /* For VLD4.8 and VLD.16, we want quad alignment. */
714                 align_op = pow2_align(a->size + 2);
715             }
716             break;
717         default:
718             /* For VLD3, the alignment field is zero and rejected above. */
719             g_assert_not_reached();
720         }
721 
722         mop = (mop & ~MO_AMASK) | align_op;
723     }
724 
725     tmp = tcg_temp_new_i32();
726     addr = tcg_temp_new_i32();
727     load_reg_var(s, addr, a->rn);
728 
729     for (reg = 0; reg < nregs; reg++) {
730         if (a->l) {
731             gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
732             neon_store_element(vd, a->reg_idx, a->size, tmp);
733         } else { /* Store */
734             neon_load_element(tmp, vd, a->reg_idx, a->size);
735             gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
736         }
737         vd += a->stride;
738         tcg_gen_addi_i32(addr, addr, 1 << a->size);
739 
740         /* Subsequent memory operations inherit alignment */
741         mop &= ~MO_AMASK;
742     }
743 
744     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);
745 
746     return true;
747 }
748 
749 static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
750 {
751     int vec_size = a->q ? 16 : 8;
752     int rd_ofs = neon_full_reg_offset(a->vd);
753     int rn_ofs = neon_full_reg_offset(a->vn);
754     int rm_ofs = neon_full_reg_offset(a->vm);
755 
756     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
757         return false;
758     }
759 
760     /* UNDEF accesses to D16-D31 if they don't exist. */
761     if (!dc_isar_feature(aa32_simd_r32, s) &&
762         ((a->vd | a->vn | a->vm) & 0x10)) {
763         return false;
764     }
765 
766     if ((a->vn | a->vm | a->vd) & a->q) {
767         return false;
768     }
769 
770     if (!vfp_access_check(s)) {
771         return true;
772     }
773 
774     fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
775     return true;
776 }
777 
778 #define DO_3SAME(INSN, FUNC)                                            \
779     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
780     {                                                                   \
781         return do_3same(s, a, FUNC);                                    \
782     }
783 
784 DO_3SAME(VADD, tcg_gen_gvec_add)
785 DO_3SAME(VSUB, tcg_gen_gvec_sub)
786 DO_3SAME(VAND, tcg_gen_gvec_and)
787 DO_3SAME(VBIC, tcg_gen_gvec_andc)
788 DO_3SAME(VORR, tcg_gen_gvec_or)
789 DO_3SAME(VORN, tcg_gen_gvec_orc)
790 DO_3SAME(VEOR, tcg_gen_gvec_xor)
791 DO_3SAME(VSHL_S, gen_gvec_sshl)
792 DO_3SAME(VSHL_U, gen_gvec_ushl)
793 DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
794 DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
795 DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
796 DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
797 
798 /* These insns are all gvec_bitsel but with the inputs in various orders. */
799 #define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
800     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
801                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
802                                 uint32_t oprsz, uint32_t maxsz)         \
803     {                                                                   \
804         tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
805     }                                                                   \
806     DO_3SAME(INSN, gen_##INSN##_3s)
807 
808 DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
809 DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
810 DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
811 
812 #define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
813     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
814     {                                                                   \
815         if (a->size == 3) {                                             \
816             return false;                                               \
817         }                                                               \
818         return do_3same(s, a, FUNC);                                    \
819     }
820 
821 DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
822 DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
823 DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
824 DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
825 DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
826 DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
827 DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
828 DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
829 DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
830 DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
831 DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
832 DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)
833 
834 #define DO_3SAME_CMP(INSN, COND)                                        \
835     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
836                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
837                                 uint32_t oprsz, uint32_t maxsz)         \
838     {                                                                   \
839         tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
840     }                                                                   \
841     DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)
842 
843 DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
844 DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
845 DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
846 DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
847 DO_3SAME_CMP(VCEQ, TCG_COND_EQ)
848 
849 #define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
850     static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
851                          uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
852     {                                                                      \
853         tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
854     }
855 
856 WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)
857 
858 static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
859 {
860     if (a->size != 0) {
861         return false;
862     }
863     return do_3same(s, a, gen_VMUL_p_3s);
864 }
865 
866 #define DO_VQRDMLAH(INSN, FUNC)                                         \
867     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
868     {                                                                   \
869         if (!dc_isar_feature(aa32_rdm, s)) {                            \
870             return false;                                               \
871         }                                                               \
872         if (a->size != 1 && a->size != 2) {                             \
873             return false;                                               \
874         }                                                               \
875         return do_3same(s, a, FUNC);                                    \
876     }
877 
878 DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
879 DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)
880 
881 #define DO_SHA1(NAME, FUNC)                                             \
882     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
883     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
884     {                                                                   \
885         if (!dc_isar_feature(aa32_sha1, s)) {                           \
886             return false;                                               \
887         }                                                               \
888         return do_3same(s, a, gen_##NAME##_3s);                         \
889     }
890 
891 DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
892 DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
893 DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
894 DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)
895 
896 #define DO_SHA2(NAME, FUNC)                                             \
897     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
898     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
899     {                                                                   \
900         if (!dc_isar_feature(aa32_sha2, s)) {                           \
901             return false;                                               \
902         }                                                               \
903         return do_3same(s, a, gen_##NAME##_3s);                         \
904     }
905 
906 DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
907 DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
908 DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)
909 
/*
 * Define a gvec expander whose GVecGen3 descriptor has only .fni8 set
 * (to FUNC), plus the matching trans function via DO_3SAME.  The vece
 * argument of the generated expander is not used.
 */
#define DO_3SAME_64(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 op = {                                    \
            .fni8 = FUNC,                                               \
        };                                                              \
                                                                        \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)
919 
920 #define DO_3SAME_64_ENV(INSN, FUNC)                                     \
921     static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
922     {                                                                   \
923         FUNC(d, cpu_env, n, m);                                         \
924     }                                                                   \
925     DO_3SAME_64(INSN, gen_##INSN##_elt)
926 
927 DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
928 DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
929 DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
930 DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
931 DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
932 DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)
933 
/*
 * 3-reg-same insns implemented with per-size helpers operating on
 * 32 bits at a time. FUNC is the helper name stem; the element size
 * (8, 16 or 32) is pasted onto it. 64-bit elements are not accepted.
 */
#define DO_3SAME_32(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        /* Indexed by vece; the MO_64 slot is left zeroed */            \
        static const GVecGen3 by_esize[4] = {                           \
            [MO_8]  = { .fni4 = gen_helper_neon_##FUNC##8 },            \
            [MO_16] = { .fni4 = gen_helper_neon_##FUNC##16 },           \
            [MO_32] = { .fni4 = gen_helper_neon_##FUNC##32 },           \
        };                                                              \
        const GVecGen3 *op = &by_esize[vece];                           \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, op);       \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        return a->size <= 2 && do_3same(s, a, gen_##INSN##_3s);         \
    }
954 
/*
 * The gvec APIs such as tcg_gen_gvec_3() want a per-element callback
 * with the NeonGenTwoOpFn() prototype, but some helper functions must
 * additionally be passed cpu_env, i.e. they are NeonGenTwoOpEnvFn()s.
 * WRAP_ENV_FN emits a thin adaptor of the former kind, named WRAPNAME,
 * which forwards to FUNC with cpu_env inserted.
 */
#define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
    static void WRAPNAME(TCGv_i32 rd, TCGv_i32 rn, TCGv_i32 rm)         \
    {                                                                   \
        FUNC(rd, cpu_env, rn, rm);                                      \
    }
966 
967 #define DO_3SAME_32_ENV(INSN, FUNC)                                     \
968     WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
969     WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
970     WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
971     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
972                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
973                                 uint32_t oprsz, uint32_t maxsz)         \
974     {                                                                   \
975         static const GVecGen3 ops[4] = {                                \
976             { .fni4 = gen_##INSN##_tramp8 },                            \
977             { .fni4 = gen_##INSN##_tramp16 },                           \
978             { .fni4 = gen_##INSN##_tramp32 },                           \
979             { 0 },                                                      \
980         };                                                              \
981         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
982     }                                                                   \
983     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
984     {                                                                   \
985         if (a->size > 2) {                                              \
986             return false;                                               \
987         }                                                               \
988         return do_3same(s, a, gen_##INSN##_3s);                         \
989     }
990 
991 DO_3SAME_32(VHADD_S, hadd_s)
992 DO_3SAME_32(VHADD_U, hadd_u)
993 DO_3SAME_32(VHSUB_S, hsub_s)
994 DO_3SAME_32(VHSUB_U, hsub_u)
995 DO_3SAME_32(VRHADD_S, rhadd_s)
996 DO_3SAME_32(VRHADD_U, rhadd_u)
997 DO_3SAME_32(VRSHL_S, rshl_s)
998 DO_3SAME_32(VRSHL_U, rshl_u)
999 
1000 DO_3SAME_32_ENV(VQSHL_S, qshl_s)
1001 DO_3SAME_32_ENV(VQSHL_U, qshl_u)
1002 DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
1003 DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)
1004 
1005 static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
1006 {
1007     /* Operations handled pairwise 32 bits at a time */
1008     TCGv_i32 tmp, tmp2, tmp3;
1009 
1010     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1011         return false;
1012     }
1013 
1014     /* UNDEF accesses to D16-D31 if they don't exist. */
1015     if (!dc_isar_feature(aa32_simd_r32, s) &&
1016         ((a->vd | a->vn | a->vm) & 0x10)) {
1017         return false;
1018     }
1019 
1020     if (a->size == 3) {
1021         return false;
1022     }
1023 
1024     if (!vfp_access_check(s)) {
1025         return true;
1026     }
1027 
1028     assert(a->q == 0); /* enforced by decode patterns */
1029 
1030     /*
1031      * Note that we have to be careful not to clobber the source operands
1032      * in the "vm == vd" case by storing the result of the first pass too
1033      * early. Since Q is 0 there are always just two passes, so instead
1034      * of a complicated loop over each pass we just unroll.
1035      */
1036     tmp = tcg_temp_new_i32();
1037     tmp2 = tcg_temp_new_i32();
1038     tmp3 = tcg_temp_new_i32();
1039 
1040     read_neon_element32(tmp, a->vn, 0, MO_32);
1041     read_neon_element32(tmp2, a->vn, 1, MO_32);
1042     fn(tmp, tmp, tmp2);
1043 
1044     read_neon_element32(tmp3, a->vm, 0, MO_32);
1045     read_neon_element32(tmp2, a->vm, 1, MO_32);
1046     fn(tmp3, tmp3, tmp2);
1047 
1048     write_neon_element32(tmp, a->vd, 0, MO_32);
1049     write_neon_element32(tmp3, a->vd, 1, MO_32);
1050 
1051     return true;
1052 }
1053 
1054 #define DO_3SAME_PAIR(INSN, func)                                       \
1055     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
1056     {                                                                   \
1057         static NeonGenTwoOpFn * const fns[] = {                         \
1058             gen_helper_neon_##func##8,                                  \
1059             gen_helper_neon_##func##16,                                 \
1060             gen_helper_neon_##func##32,                                 \
1061         };                                                              \
1062         if (a->size > 2) {                                              \
1063             return false;                                               \
1064         }                                                               \
1065         return do_3same_pair(s, a, fns[a->size]);                       \
1066     }
1067 
1068 /* 32-bit pairwise ops end up the same as the elementwise versions.  */
1069 #define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
1070 #define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
1071 #define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
1072 #define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
1073 #define gen_helper_neon_padd_u32  tcg_gen_add_i32
1074 
1075 DO_3SAME_PAIR(VPMAX_S, pmax_s)
1076 DO_3SAME_PAIR(VPMIN_S, pmin_s)
1077 DO_3SAME_PAIR(VPMAX_U, pmax_u)
1078 DO_3SAME_PAIR(VPMIN_U, pmin_u)
1079 DO_3SAME_PAIR(VPADD, padd_u)
1080 
1081 #define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
1082     WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
1083     WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
1084     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
1085                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
1086                                 uint32_t oprsz, uint32_t maxsz)         \
1087     {                                                                   \
1088         static const GVecGen3 ops[2] = {                                \
1089             { .fni4 = gen_##INSN##_tramp16 },                           \
1090             { .fni4 = gen_##INSN##_tramp32 },                           \
1091         };                                                              \
1092         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
1093     }                                                                   \
1094     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
1095     {                                                                   \
1096         if (a->size != 1 && a->size != 2) {                             \
1097             return false;                                               \
1098         }                                                               \
1099         return do_3same(s, a, gen_##INSN##_3s);                         \
1100     }
1101 
1102 DO_3SAME_VQDMULH(VQDMULH, qdmulh)
1103 DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
1104 
/*
 * Define WRAPNAME(), a gvec expander that calls the out-of-line
 * helper FUNC via tcg_gen_gvec_3_ptr(), passing it a pointer to the
 * float status selected by FPST and a data value of 0.
 */
#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rn_ofs, uint32_t rm_ofs,              \
                         uint32_t oprsz, uint32_t maxsz)                \
    {                                                                   \
        TCGv_ptr status = fpstatus_ptr(FPST);                           \
                                                                        \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, status,              \
                           oprsz, maxsz, 0, FUNC);                      \
    }
1114 
1115 #define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
1116     WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
1117     WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
1118     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
1119     {                                                                   \
1120         if (a->size == MO_16) {                                         \
1121             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
1122                 return false;                                           \
1123             }                                                           \
1124             return do_3same(s, a, gen_##INSN##_fp16_3s);                \
1125         }                                                               \
1126         return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
1127     }
1128 
1129 
1130 DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
1131 DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
1132 DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
1133 DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
1134 DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
1135 DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
1136 DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
1137 DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
1138 DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
1139 DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
1140 DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
1141 DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
1142 DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
1143 DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
1144 DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
1145 DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
1146 DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)
1147 
1148 WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
1149 WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
1150 WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
1151 WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)
1152 
1153 static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
1154 {
1155     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1156         return false;
1157     }
1158 
1159     if (a->size == MO_16) {
1160         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1161             return false;
1162         }
1163         return do_3same(s, a, gen_VMAXNM_fp16_3s);
1164     }
1165     return do_3same(s, a, gen_VMAXNM_fp32_3s);
1166 }
1167 
1168 static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
1169 {
1170     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1171         return false;
1172     }
1173 
1174     if (a->size == MO_16) {
1175         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1176             return false;
1177         }
1178         return do_3same(s, a, gen_VMINNM_fp16_3s);
1179     }
1180     return do_3same(s, a, gen_VMINNM_fp32_3s);
1181 }
1182 
1183 static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
1184                              gen_helper_gvec_3_ptr *fn)
1185 {
1186     /* FP pairwise operations */
1187     TCGv_ptr fpstatus;
1188 
1189     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1190         return false;
1191     }
1192 
1193     /* UNDEF accesses to D16-D31 if they don't exist. */
1194     if (!dc_isar_feature(aa32_simd_r32, s) &&
1195         ((a->vd | a->vn | a->vm) & 0x10)) {
1196         return false;
1197     }
1198 
1199     if (!vfp_access_check(s)) {
1200         return true;
1201     }
1202 
1203     assert(a->q == 0); /* enforced by decode patterns */
1204 
1205 
1206     fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1207     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
1208                        vfp_reg_offset(1, a->vn),
1209                        vfp_reg_offset(1, a->vm),
1210                        fpstatus, 8, 8, 0, fn);
1211 
1212     return true;
1213 }
1214 
/*
 * For all the functions using this macro, size == 1 means fp16,
 * which is only accepted when the aa32_fp16_arith feature is
 * present; any other size is handled as fp32.
 */
1219 #define DO_3S_FP_PAIR(INSN,FUNC)                                    \
1220     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
1221     {                                                               \
1222         if (a->size == MO_16) {                                     \
1223             if (!dc_isar_feature(aa32_fp16_arith, s)) {             \
1224                 return false;                                       \
1225             }                                                       \
1226             return do_3same_fp_pair(s, a, FUNC##h);                 \
1227         }                                                           \
1228         return do_3same_fp_pair(s, a, FUNC##s);                     \
1229     }
1230 
1231 DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
1232 DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
1233 DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)
1234 
1235 static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
1236 {
1237     /* Handle a 2-reg-shift insn which can be vectorized. */
1238     int vec_size = a->q ? 16 : 8;
1239     int rd_ofs = neon_full_reg_offset(a->vd);
1240     int rm_ofs = neon_full_reg_offset(a->vm);
1241 
1242     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1243         return false;
1244     }
1245 
1246     /* UNDEF accesses to D16-D31 if they don't exist. */
1247     if (!dc_isar_feature(aa32_simd_r32, s) &&
1248         ((a->vd | a->vm) & 0x10)) {
1249         return false;
1250     }
1251 
1252     if ((a->vm | a->vd) & a->q) {
1253         return false;
1254     }
1255 
1256     if (!vfp_access_check(s)) {
1257         return true;
1258     }
1259 
1260     fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
1261     return true;
1262 }
1263 
1264 #define DO_2SH(INSN, FUNC)                                              \
1265     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1266     {                                                                   \
1267         return do_vector_2sh(s, a, FUNC);                               \
1268     }                                                                   \
1269 
1270 DO_2SH(VSHL, tcg_gen_gvec_shli)
1271 DO_2SH(VSLI, gen_gvec_sli)
1272 DO_2SH(VSRI, gen_gvec_sri)
1273 DO_2SH(VSRA_S, gen_gvec_ssra)
1274 DO_2SH(VSRA_U, gen_gvec_usra)
1275 DO_2SH(VRSHR_S, gen_gvec_srshr)
1276 DO_2SH(VRSHR_U, gen_gvec_urshr)
1277 DO_2SH(VRSRA_S, gen_gvec_srsra)
1278 DO_2SH(VRSRA_U, gen_gvec_ursra)
1279 
1280 static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
1281 {
1282     /* Signed shift out of range results in all-sign-bits */
1283     a->shift = MIN(a->shift, (8 << a->size) - 1);
1284     return do_vector_2sh(s, a, tcg_gen_gvec_sari);
1285 }
1286 
1287 static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
1288                             int64_t shift, uint32_t oprsz, uint32_t maxsz)
1289 {
1290     tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
1291 }
1292 
1293 static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
1294 {
1295     /* Shift out of range is architecturally valid and results in zero. */
1296     if (a->shift >= (8 << a->size)) {
1297         return do_vector_2sh(s, a, gen_zero_rd_2sh);
1298     } else {
1299         return do_vector_2sh(s, a, tcg_gen_gvec_shri);
1300     }
1301 }
1302 
1303 static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
1304                              NeonGenTwo64OpEnvFn *fn)
1305 {
1306     /*
1307      * 2-reg-and-shift operations, size == 3 case, where the
1308      * function needs to be passed cpu_env.
1309      */
1310     TCGv_i64 constimm;
1311     int pass;
1312 
1313     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1314         return false;
1315     }
1316 
1317     /* UNDEF accesses to D16-D31 if they don't exist. */
1318     if (!dc_isar_feature(aa32_simd_r32, s) &&
1319         ((a->vd | a->vm) & 0x10)) {
1320         return false;
1321     }
1322 
1323     if ((a->vm | a->vd) & a->q) {
1324         return false;
1325     }
1326 
1327     if (!vfp_access_check(s)) {
1328         return true;
1329     }
1330 
1331     /*
1332      * To avoid excessive duplication of ops we implement shift
1333      * by immediate using the variable shift operations.
1334      */
1335     constimm = tcg_constant_i64(dup_const(a->size, a->shift));
1336 
1337     for (pass = 0; pass < a->q + 1; pass++) {
1338         TCGv_i64 tmp = tcg_temp_new_i64();
1339 
1340         read_neon_element64(tmp, a->vm, pass, MO_64);
1341         fn(tmp, cpu_env, tmp, constimm);
1342         write_neon_element64(tmp, a->vd, pass, MO_64);
1343     }
1344     return true;
1345 }
1346 
1347 static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
1348                              NeonGenTwoOpEnvFn *fn)
1349 {
1350     /*
1351      * 2-reg-and-shift operations, size < 3 case, where the
1352      * helper needs to be passed cpu_env.
1353      */
1354     TCGv_i32 constimm, tmp;
1355     int pass;
1356 
1357     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1358         return false;
1359     }
1360 
1361     /* UNDEF accesses to D16-D31 if they don't exist. */
1362     if (!dc_isar_feature(aa32_simd_r32, s) &&
1363         ((a->vd | a->vm) & 0x10)) {
1364         return false;
1365     }
1366 
1367     if ((a->vm | a->vd) & a->q) {
1368         return false;
1369     }
1370 
1371     if (!vfp_access_check(s)) {
1372         return true;
1373     }
1374 
1375     /*
1376      * To avoid excessive duplication of ops we implement shift
1377      * by immediate using the variable shift operations.
1378      */
1379     constimm = tcg_constant_i32(dup_const(a->size, a->shift));
1380     tmp = tcg_temp_new_i32();
1381 
1382     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1383         read_neon_element32(tmp, a->vm, pass, MO_32);
1384         fn(tmp, cpu_env, tmp, constimm);
1385         write_neon_element32(tmp, a->vd, pass, MO_32);
1386     }
1387     return true;
1388 }
1389 
1390 #define DO_2SHIFT_ENV(INSN, FUNC)                                       \
1391     static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
1392     {                                                                   \
1393         return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
1394     }                                                                   \
1395     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1396     {                                                                   \
1397         static NeonGenTwoOpEnvFn * const fns[] = {                      \
1398             gen_helper_neon_##FUNC##8,                                  \
1399             gen_helper_neon_##FUNC##16,                                 \
1400             gen_helper_neon_##FUNC##32,                                 \
1401         };                                                              \
1402         assert(a->size < ARRAY_SIZE(fns));                              \
1403         return do_2shift_env_32(s, a, fns[a->size]);                    \
1404     }
1405 
1406 DO_2SHIFT_ENV(VQSHLU, qshlu_s)
1407 DO_2SHIFT_ENV(VQSHL_U, qshl_u)
1408 DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1409 
1410 static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1411                                 NeonGenTwo64OpFn *shiftfn,
1412                                 NeonGenNarrowEnvFn *narrowfn)
1413 {
1414     /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1415     TCGv_i64 constimm, rm1, rm2;
1416     TCGv_i32 rd;
1417 
1418     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1419         return false;
1420     }
1421 
1422     /* UNDEF accesses to D16-D31 if they don't exist. */
1423     if (!dc_isar_feature(aa32_simd_r32, s) &&
1424         ((a->vd | a->vm) & 0x10)) {
1425         return false;
1426     }
1427 
1428     if (a->vm & 1) {
1429         return false;
1430     }
1431 
1432     if (!vfp_access_check(s)) {
1433         return true;
1434     }
1435 
1436     /*
1437      * This is always a right shift, and the shiftfn is always a
1438      * left-shift helper, which thus needs the negated shift count.
1439      */
1440     constimm = tcg_constant_i64(-a->shift);
1441     rm1 = tcg_temp_new_i64();
1442     rm2 = tcg_temp_new_i64();
1443     rd = tcg_temp_new_i32();
1444 
1445     /* Load both inputs first to avoid potential overwrite if rm == rd */
1446     read_neon_element64(rm1, a->vm, 0, MO_64);
1447     read_neon_element64(rm2, a->vm, 1, MO_64);
1448 
1449     shiftfn(rm1, rm1, constimm);
1450     narrowfn(rd, cpu_env, rm1);
1451     write_neon_element32(rd, a->vd, 0, MO_32);
1452 
1453     shiftfn(rm2, rm2, constimm);
1454     narrowfn(rd, cpu_env, rm2);
1455     write_neon_element32(rd, a->vd, 1, MO_32);
1456 
1457     return true;
1458 }
1459 
1460 static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1461                                 NeonGenTwoOpFn *shiftfn,
1462                                 NeonGenNarrowEnvFn *narrowfn)
1463 {
1464     /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1465     TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1466     TCGv_i64 rtmp;
1467     uint32_t imm;
1468 
1469     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1470         return false;
1471     }
1472 
1473     /* UNDEF accesses to D16-D31 if they don't exist. */
1474     if (!dc_isar_feature(aa32_simd_r32, s) &&
1475         ((a->vd | a->vm) & 0x10)) {
1476         return false;
1477     }
1478 
1479     if (a->vm & 1) {
1480         return false;
1481     }
1482 
1483     if (!vfp_access_check(s)) {
1484         return true;
1485     }
1486 
1487     /*
1488      * This is always a right shift, and the shiftfn is always a
1489      * left-shift helper, which thus needs the negated shift count
1490      * duplicated into each lane of the immediate value.
1491      */
1492     if (a->size == 1) {
1493         imm = (uint16_t)(-a->shift);
1494         imm |= imm << 16;
1495     } else {
1496         /* size == 2 */
1497         imm = -a->shift;
1498     }
1499     constimm = tcg_constant_i32(imm);
1500 
1501     /* Load all inputs first to avoid potential overwrite */
1502     rm1 = tcg_temp_new_i32();
1503     rm2 = tcg_temp_new_i32();
1504     rm3 = tcg_temp_new_i32();
1505     rm4 = tcg_temp_new_i32();
1506     read_neon_element32(rm1, a->vm, 0, MO_32);
1507     read_neon_element32(rm2, a->vm, 1, MO_32);
1508     read_neon_element32(rm3, a->vm, 2, MO_32);
1509     read_neon_element32(rm4, a->vm, 3, MO_32);
1510     rtmp = tcg_temp_new_i64();
1511 
1512     shiftfn(rm1, rm1, constimm);
1513     shiftfn(rm2, rm2, constimm);
1514 
1515     tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1516 
1517     narrowfn(rm1, cpu_env, rtmp);
1518     write_neon_element32(rm1, a->vd, 0, MO_32);
1519 
1520     shiftfn(rm3, rm3, constimm);
1521     shiftfn(rm4, rm4, constimm);
1522 
1523     tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1524 
1525     narrowfn(rm3, cpu_env, rtmp);
1526     write_neon_element32(rm3, a->vd, 1, MO_32);
1527     return true;
1528 }
1529 
/*
 * Define trans_<INSN>_2sh() for a narrowing shift insn, given the
 * shift function FUNC and the narrowing function NARROWFUNC.
 * DO_2SN_64 is for 64-bit source elements and DO_2SN_32 for the
 * narrower ones; they differ only in which worker they call.
 */
#define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_64(s, a, (FUNC), (NARROWFUNC));         \
    }

#define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_32(s, a, (FUNC), (NARROWFUNC));         \
    }
1540 
1541 static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1542 {
1543     tcg_gen_extrl_i64_i32(dest, src);
1544 }
1545 
1546 static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1547 {
1548     gen_helper_neon_narrow_u16(dest, src);
1549 }
1550 
1551 static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1552 {
1553     gen_helper_neon_narrow_u8(dest, src);
1554 }
1555 
1556 DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
1557 DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
1558 DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)
1559 
1560 DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
1561 DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
1562 DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)
1563 
1564 DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
1565 DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
1566 DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)
1567 
1568 DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
1569 DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
1570 DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
1571 DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
1572 DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
1573 DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)
1574 
1575 DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
1576 DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
1577 DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)
1578 
1579 DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
1580 DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
1581 DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
1582 
1583 DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
1584 DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
1585 DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1586 
1587 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1588                          NeonGenWidenFn *widenfn, bool u)
1589 {
1590     TCGv_i64 tmp;
1591     TCGv_i32 rm0, rm1;
1592     uint64_t widen_mask = 0;
1593 
1594     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1595         return false;
1596     }
1597 
1598     /* UNDEF accesses to D16-D31 if they don't exist. */
1599     if (!dc_isar_feature(aa32_simd_r32, s) &&
1600         ((a->vd | a->vm) & 0x10)) {
1601         return false;
1602     }
1603 
1604     if (a->vd & 1) {
1605         return false;
1606     }
1607 
1608     if (!vfp_access_check(s)) {
1609         return true;
1610     }
1611 
1612     /*
1613      * This is a widen-and-shift operation. The shift is always less
1614      * than the width of the source type, so after widening the input
1615      * vector we can simply shift the whole 64-bit widened register,
1616      * and then clear the potential overflow bits resulting from left
1617      * bits of the narrow input appearing as right bits of the left
1618      * neighbour narrow input. Calculate a mask of bits to clear.
1619      */
1620     if ((a->shift != 0) && (a->size < 2 || u)) {
1621         int esize = 8 << a->size;
1622         widen_mask = MAKE_64BIT_MASK(0, esize);
1623         widen_mask >>= esize - a->shift;
1624         widen_mask = dup_const(a->size + 1, widen_mask);
1625     }
1626 
1627     rm0 = tcg_temp_new_i32();
1628     rm1 = tcg_temp_new_i32();
1629     read_neon_element32(rm0, a->vm, 0, MO_32);
1630     read_neon_element32(rm1, a->vm, 1, MO_32);
1631     tmp = tcg_temp_new_i64();
1632 
1633     widenfn(tmp, rm0);
1634     if (a->shift != 0) {
1635         tcg_gen_shli_i64(tmp, tmp, a->shift);
1636         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1637     }
1638     write_neon_element64(tmp, a->vd, 0, MO_64);
1639 
1640     widenfn(tmp, rm1);
1641     if (a->shift != 0) {
1642         tcg_gen_shli_i64(tmp, tmp, a->shift);
1643         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1644     }
1645     write_neon_element64(tmp, a->vd, 1, MO_64);
1646     return true;
1647 }
1648 
1649 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1650 {
1651     static NeonGenWidenFn * const widenfn[] = {
1652         gen_helper_neon_widen_s8,
1653         gen_helper_neon_widen_s16,
1654         tcg_gen_ext_i32_i64,
1655     };
1656     return do_vshll_2sh(s, a, widenfn[a->size], false);
1657 }
1658 
1659 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1660 {
1661     static NeonGenWidenFn * const widenfn[] = {
1662         gen_helper_neon_widen_u8,
1663         gen_helper_neon_widen_u16,
1664         tcg_gen_extu_i32_i64,
1665     };
1666     return do_vshll_2sh(s, a, widenfn[a->size], true);
1667 }
1668 
1669 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1670                       gen_helper_gvec_2_ptr *fn)
1671 {
1672     /* FP operations in 2-reg-and-shift group */
1673     int vec_size = a->q ? 16 : 8;
1674     int rd_ofs = neon_full_reg_offset(a->vd);
1675     int rm_ofs = neon_full_reg_offset(a->vm);
1676     TCGv_ptr fpst;
1677 
1678     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1679         return false;
1680     }
1681 
1682     if (a->size == MO_16) {
1683         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1684             return false;
1685         }
1686     }
1687 
1688     /* UNDEF accesses to D16-D31 if they don't exist. */
1689     if (!dc_isar_feature(aa32_simd_r32, s) &&
1690         ((a->vd | a->vm) & 0x10)) {
1691         return false;
1692     }
1693 
1694     if ((a->vm | a->vd) & a->q) {
1695         return false;
1696     }
1697 
1698     if (!vfp_access_check(s)) {
1699         return true;
1700     }
1701 
1702     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1703     tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
1704     return true;
1705 }
1706 
1707 #define DO_FP_2SH(INSN, FUNC)                                           \
1708     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1709     {                                                                   \
1710         return do_fp_2sh(s, a, FUNC);                                   \
1711     }
1712 
1713 DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
1714 DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
1715 DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
1716 DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
1717 
1718 DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
1719 DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
1720 DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
1721 DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1722 
1723 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1724                         GVecGen2iFn *fn)
1725 {
1726     uint64_t imm;
1727     int reg_ofs, vec_size;
1728 
1729     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1730         return false;
1731     }
1732 
1733     /* UNDEF accesses to D16-D31 if they don't exist. */
1734     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1735         return false;
1736     }
1737 
1738     if (a->vd & a->q) {
1739         return false;
1740     }
1741 
1742     if (!vfp_access_check(s)) {
1743         return true;
1744     }
1745 
1746     reg_ofs = neon_full_reg_offset(a->vd);
1747     vec_size = a->q ? 16 : 8;
1748     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1749 
1750     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1751     return true;
1752 }
1753 
1754 static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1755                         int64_t c, uint32_t oprsz, uint32_t maxsz)
1756 {
1757     tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1758 }
1759 
1760 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1761 {
1762     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1763     GVecGen2iFn *fn;
1764 
1765     if ((a->cmode & 1) && a->cmode < 12) {
1766         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1767         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1768     } else {
1769         /* There is one unallocated cmode/op combination in this space */
1770         if (a->cmode == 15 && a->op == 1) {
1771             return false;
1772         }
1773         fn = gen_VMOV_1r;
1774     }
1775     return do_1reg_imm(s, a, fn);
1776 }
1777 
1778 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1779                            NeonGenWidenFn *widenfn,
1780                            NeonGenTwo64OpFn *opfn,
1781                            int src1_mop, int src2_mop)
1782 {
1783     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VAADW/VSUBW) */
1784     TCGv_i64 rn0_64, rn1_64, rm_64;
1785 
1786     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1787         return false;
1788     }
1789 
1790     /* UNDEF accesses to D16-D31 if they don't exist. */
1791     if (!dc_isar_feature(aa32_simd_r32, s) &&
1792         ((a->vd | a->vn | a->vm) & 0x10)) {
1793         return false;
1794     }
1795 
1796     if (!opfn) {
1797         /* size == 3 case, which is an entirely different insn group */
1798         return false;
1799     }
1800 
1801     if ((a->vd & 1) || (src1_mop == MO_UQ && (a->vn & 1))) {
1802         return false;
1803     }
1804 
1805     if (!vfp_access_check(s)) {
1806         return true;
1807     }
1808 
1809     rn0_64 = tcg_temp_new_i64();
1810     rn1_64 = tcg_temp_new_i64();
1811     rm_64 = tcg_temp_new_i64();
1812 
1813     if (src1_mop >= 0) {
1814         read_neon_element64(rn0_64, a->vn, 0, src1_mop);
1815     } else {
1816         TCGv_i32 tmp = tcg_temp_new_i32();
1817         read_neon_element32(tmp, a->vn, 0, MO_32);
1818         widenfn(rn0_64, tmp);
1819     }
1820     if (src2_mop >= 0) {
1821         read_neon_element64(rm_64, a->vm, 0, src2_mop);
1822     } else {
1823         TCGv_i32 tmp = tcg_temp_new_i32();
1824         read_neon_element32(tmp, a->vm, 0, MO_32);
1825         widenfn(rm_64, tmp);
1826     }
1827 
1828     opfn(rn0_64, rn0_64, rm_64);
1829 
1830     /*
1831      * Load second pass inputs before storing the first pass result, to
1832      * avoid incorrect results if a narrow input overlaps with the result.
1833      */
1834     if (src1_mop >= 0) {
1835         read_neon_element64(rn1_64, a->vn, 1, src1_mop);
1836     } else {
1837         TCGv_i32 tmp = tcg_temp_new_i32();
1838         read_neon_element32(tmp, a->vn, 1, MO_32);
1839         widenfn(rn1_64, tmp);
1840     }
1841     if (src2_mop >= 0) {
1842         read_neon_element64(rm_64, a->vm, 1, src2_mop);
1843     } else {
1844         TCGv_i32 tmp = tcg_temp_new_i32();
1845         read_neon_element32(tmp, a->vm, 1, MO_32);
1846         widenfn(rm_64, tmp);
1847     }
1848 
1849     write_neon_element64(rn0_64, a->vd, 0, MO_64);
1850 
1851     opfn(rn1_64, rn1_64, rm_64);
1852     write_neon_element64(rn1_64, a->vd, 1, MO_64);
1853 
1854     return true;
1855 }
1856 
1857 #define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN)                        \
1858     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1859     {                                                                   \
1860         static NeonGenWidenFn * const widenfn[] = {                     \
1861             gen_helper_neon_widen_##S##8,                               \
1862             gen_helper_neon_widen_##S##16,                              \
1863             NULL, NULL,                                                 \
1864         };                                                              \
1865         static NeonGenTwo64OpFn * const addfn[] = {                     \
1866             gen_helper_neon_##OP##l_u16,                                \
1867             gen_helper_neon_##OP##l_u32,                                \
1868             tcg_gen_##OP##_i64,                                         \
1869             NULL,                                                       \
1870         };                                                              \
1871         int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1;          \
1872         return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size],   \
1873                               SRC1WIDE ? MO_UQ : narrow_mop,             \
1874                               narrow_mop);                              \
1875     }
1876 
1877 DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
1878 DO_PREWIDEN(VADDL_U, u, add, false, 0)
1879 DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
1880 DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
1881 DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
1882 DO_PREWIDEN(VADDW_U, u, add, true, 0)
1883 DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
1884 DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
1885 
1886 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
1887                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
1888 {
1889     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
1890     TCGv_i64 rn_64, rm_64;
1891     TCGv_i32 rd0, rd1;
1892 
1893     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1894         return false;
1895     }
1896 
1897     /* UNDEF accesses to D16-D31 if they don't exist. */
1898     if (!dc_isar_feature(aa32_simd_r32, s) &&
1899         ((a->vd | a->vn | a->vm) & 0x10)) {
1900         return false;
1901     }
1902 
1903     if (!opfn || !narrowfn) {
1904         /* size == 3 case, which is an entirely different insn group */
1905         return false;
1906     }
1907 
1908     if ((a->vn | a->vm) & 1) {
1909         return false;
1910     }
1911 
1912     if (!vfp_access_check(s)) {
1913         return true;
1914     }
1915 
1916     rn_64 = tcg_temp_new_i64();
1917     rm_64 = tcg_temp_new_i64();
1918     rd0 = tcg_temp_new_i32();
1919     rd1 = tcg_temp_new_i32();
1920 
1921     read_neon_element64(rn_64, a->vn, 0, MO_64);
1922     read_neon_element64(rm_64, a->vm, 0, MO_64);
1923 
1924     opfn(rn_64, rn_64, rm_64);
1925 
1926     narrowfn(rd0, rn_64);
1927 
1928     read_neon_element64(rn_64, a->vn, 1, MO_64);
1929     read_neon_element64(rm_64, a->vm, 1, MO_64);
1930 
1931     opfn(rn_64, rn_64, rm_64);
1932 
1933     narrowfn(rd1, rn_64);
1934 
1935     write_neon_element32(rd0, a->vd, 0, MO_32);
1936     write_neon_element32(rd1, a->vd, 1, MO_32);
1937 
1938     return true;
1939 }
1940 
/*
 * Define trans_<INSN>_3d() for a narrowing insn.
 *  ARITH:    'add' or 'sub', selects the 64-bit combining operation
 *  NARROW:   stem of the gen_helper_neon_<NARROW>_high_u{8,16} helpers
 *  NARROW32: narrowing function used in the a->size == 2 slot
 * Both tables are indexed by a->size; the NULL entries make
 * do_narrow_3d() reject size == 3.
 */
#define DO_NARROW_3D(INSN, ARITH, NARROW, NARROW32)                     \
    static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
    {                                                                   \
        static NeonGenTwo64OpFn * const arith_tbl[] = {                 \
            gen_helper_neon_##ARITH##l_u16,                             \
            gen_helper_neon_##ARITH##l_u32,                             \
            tcg_gen_##ARITH##_i64,                                      \
            NULL,                                                       \
        };                                                              \
        static NeonGenNarrowFn * const narrow_tbl[] = {                 \
            gen_helper_neon_##NARROW##_high_u8,                         \
            gen_helper_neon_##NARROW##_high_u16,                        \
            NARROW32,                                                   \
            NULL,                                                       \
        };                                                              \
        NeonGenTwo64OpFn *op = arith_tbl[a->size];                      \
        NeonGenNarrowFn *narrow = narrow_tbl[a->size];                  \
                                                                        \
        return do_narrow_3d(s, a, op, narrow);                          \
    }
1958 
1959 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
1960 {
1961     tcg_gen_addi_i64(rn, rn, 1u << 31);
1962     tcg_gen_extrh_i64_i32(rd, rn);
1963 }
1964 
1965 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
1966 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
1967 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
1968 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
1969 
1970 static bool do_long_3d(DisasContext *s, arg_3diff *a,
1971                        NeonGenTwoOpWidenFn *opfn,
1972                        NeonGenTwo64OpFn *accfn)
1973 {
1974     /*
1975      * 3-regs different lengths, long operations.
1976      * These perform an operation on two inputs that returns a double-width
1977      * result, and then possibly perform an accumulation operation of
1978      * that result into the double-width destination.
1979      */
1980     TCGv_i64 rd0, rd1, tmp;
1981     TCGv_i32 rn, rm;
1982 
1983     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1984         return false;
1985     }
1986 
1987     /* UNDEF accesses to D16-D31 if they don't exist. */
1988     if (!dc_isar_feature(aa32_simd_r32, s) &&
1989         ((a->vd | a->vn | a->vm) & 0x10)) {
1990         return false;
1991     }
1992 
1993     if (!opfn) {
1994         /* size == 3 case, which is an entirely different insn group */
1995         return false;
1996     }
1997 
1998     if (a->vd & 1) {
1999         return false;
2000     }
2001 
2002     if (!vfp_access_check(s)) {
2003         return true;
2004     }
2005 
2006     rd0 = tcg_temp_new_i64();
2007     rd1 = tcg_temp_new_i64();
2008 
2009     rn = tcg_temp_new_i32();
2010     rm = tcg_temp_new_i32();
2011     read_neon_element32(rn, a->vn, 0, MO_32);
2012     read_neon_element32(rm, a->vm, 0, MO_32);
2013     opfn(rd0, rn, rm);
2014 
2015     read_neon_element32(rn, a->vn, 1, MO_32);
2016     read_neon_element32(rm, a->vm, 1, MO_32);
2017     opfn(rd1, rn, rm);
2018 
2019     /* Don't store results until after all loads: they might overlap */
2020     if (accfn) {
2021         tmp = tcg_temp_new_i64();
2022         read_neon_element64(tmp, a->vd, 0, MO_64);
2023         accfn(rd0, tmp, rd0);
2024         read_neon_element64(tmp, a->vd, 1, MO_64);
2025         accfn(rd1, tmp, rd1);
2026     }
2027 
2028     write_neon_element64(rd0, a->vd, 0, MO_64);
2029     write_neon_element64(rd1, a->vd, 1, MO_64);
2030 
2031     return true;
2032 }
2033 
2034 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2035 {
2036     static NeonGenTwoOpWidenFn * const opfn[] = {
2037         gen_helper_neon_abdl_s16,
2038         gen_helper_neon_abdl_s32,
2039         gen_helper_neon_abdl_s64,
2040         NULL,
2041     };
2042 
2043     return do_long_3d(s, a, opfn[a->size], NULL);
2044 }
2045 
2046 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2047 {
2048     static NeonGenTwoOpWidenFn * const opfn[] = {
2049         gen_helper_neon_abdl_u16,
2050         gen_helper_neon_abdl_u32,
2051         gen_helper_neon_abdl_u64,
2052         NULL,
2053     };
2054 
2055     return do_long_3d(s, a, opfn[a->size], NULL);
2056 }
2057 
2058 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2059 {
2060     static NeonGenTwoOpWidenFn * const opfn[] = {
2061         gen_helper_neon_abdl_s16,
2062         gen_helper_neon_abdl_s32,
2063         gen_helper_neon_abdl_s64,
2064         NULL,
2065     };
2066     static NeonGenTwo64OpFn * const addfn[] = {
2067         gen_helper_neon_addl_u16,
2068         gen_helper_neon_addl_u32,
2069         tcg_gen_add_i64,
2070         NULL,
2071     };
2072 
2073     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2074 }
2075 
2076 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2077 {
2078     static NeonGenTwoOpWidenFn * const opfn[] = {
2079         gen_helper_neon_abdl_u16,
2080         gen_helper_neon_abdl_u32,
2081         gen_helper_neon_abdl_u64,
2082         NULL,
2083     };
2084     static NeonGenTwo64OpFn * const addfn[] = {
2085         gen_helper_neon_addl_u16,
2086         gen_helper_neon_addl_u32,
2087         tcg_gen_add_i64,
2088         NULL,
2089     };
2090 
2091     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2092 }
2093 
2094 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2095 {
2096     TCGv_i32 lo = tcg_temp_new_i32();
2097     TCGv_i32 hi = tcg_temp_new_i32();
2098 
2099     tcg_gen_muls2_i32(lo, hi, rn, rm);
2100     tcg_gen_concat_i32_i64(rd, lo, hi);
2101 }
2102 
2103 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2104 {
2105     TCGv_i32 lo = tcg_temp_new_i32();
2106     TCGv_i32 hi = tcg_temp_new_i32();
2107 
2108     tcg_gen_mulu2_i32(lo, hi, rn, rm);
2109     tcg_gen_concat_i32_i64(rd, lo, hi);
2110 }
2111 
2112 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2113 {
2114     static NeonGenTwoOpWidenFn * const opfn[] = {
2115         gen_helper_neon_mull_s8,
2116         gen_helper_neon_mull_s16,
2117         gen_mull_s32,
2118         NULL,
2119     };
2120 
2121     return do_long_3d(s, a, opfn[a->size], NULL);
2122 }
2123 
2124 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2125 {
2126     static NeonGenTwoOpWidenFn * const opfn[] = {
2127         gen_helper_neon_mull_u8,
2128         gen_helper_neon_mull_u16,
2129         gen_mull_u32,
2130         NULL,
2131     };
2132 
2133     return do_long_3d(s, a, opfn[a->size], NULL);
2134 }
2135 
2136 #define DO_VMLAL(INSN,MULL,ACC)                                         \
2137     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2138     {                                                                   \
2139         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2140             gen_helper_neon_##MULL##8,                                  \
2141             gen_helper_neon_##MULL##16,                                 \
2142             gen_##MULL##32,                                             \
2143             NULL,                                                       \
2144         };                                                              \
2145         static NeonGenTwo64OpFn * const accfn[] = {                     \
2146             gen_helper_neon_##ACC##l_u16,                               \
2147             gen_helper_neon_##ACC##l_u32,                               \
2148             tcg_gen_##ACC##_i64,                                        \
2149             NULL,                                                       \
2150         };                                                              \
2151         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2152     }
2153 
2154 DO_VMLAL(VMLAL_S,mull_s,add)
2155 DO_VMLAL(VMLAL_U,mull_u,add)
2156 DO_VMLAL(VMLSL_S,mull_s,sub)
2157 DO_VMLAL(VMLSL_U,mull_u,sub)
2158 
2159 static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2160 {
2161     gen_helper_neon_mull_s16(rd, rn, rm);
2162     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
2163 }
2164 
2165 static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2166 {
2167     gen_mull_s32(rd, rn, rm);
2168     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
2169 }
2170 
2171 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2172 {
2173     static NeonGenTwoOpWidenFn * const opfn[] = {
2174         NULL,
2175         gen_VQDMULL_16,
2176         gen_VQDMULL_32,
2177         NULL,
2178     };
2179 
2180     return do_long_3d(s, a, opfn[a->size], NULL);
2181 }
2182 
2183 static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2184 {
2185     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2186 }
2187 
2188 static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2189 {
2190     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2191 }
2192 
2193 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2194 {
2195     static NeonGenTwoOpWidenFn * const opfn[] = {
2196         NULL,
2197         gen_VQDMULL_16,
2198         gen_VQDMULL_32,
2199         NULL,
2200     };
2201     static NeonGenTwo64OpFn * const accfn[] = {
2202         NULL,
2203         gen_VQDMLAL_acc_16,
2204         gen_VQDMLAL_acc_32,
2205         NULL,
2206     };
2207 
2208     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2209 }
2210 
2211 static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2212 {
2213     gen_helper_neon_negl_u32(rm, rm);
2214     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2215 }
2216 
2217 static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2218 {
2219     tcg_gen_neg_i64(rm, rm);
2220     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2221 }
2222 
2223 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2224 {
2225     static NeonGenTwoOpWidenFn * const opfn[] = {
2226         NULL,
2227         gen_VQDMULL_16,
2228         gen_VQDMULL_32,
2229         NULL,
2230     };
2231     static NeonGenTwo64OpFn * const accfn[] = {
2232         NULL,
2233         gen_VQDMLSL_acc_16,
2234         gen_VQDMLSL_acc_32,
2235         NULL,
2236     };
2237 
2238     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2239 }
2240 
2241 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2242 {
2243     gen_helper_gvec_3 *fn_gvec;
2244 
2245     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2246         return false;
2247     }
2248 
2249     /* UNDEF accesses to D16-D31 if they don't exist. */
2250     if (!dc_isar_feature(aa32_simd_r32, s) &&
2251         ((a->vd | a->vn | a->vm) & 0x10)) {
2252         return false;
2253     }
2254 
2255     if (a->vd & 1) {
2256         return false;
2257     }
2258 
2259     switch (a->size) {
2260     case 0:
2261         fn_gvec = gen_helper_neon_pmull_h;
2262         break;
2263     case 2:
2264         if (!dc_isar_feature(aa32_pmull, s)) {
2265             return false;
2266         }
2267         fn_gvec = gen_helper_gvec_pmull_q;
2268         break;
2269     default:
2270         return false;
2271     }
2272 
2273     if (!vfp_access_check(s)) {
2274         return true;
2275     }
2276 
2277     tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
2278                        neon_full_reg_offset(a->vn),
2279                        neon_full_reg_offset(a->vm),
2280                        16, 16, 0, fn_gvec);
2281     return true;
2282 }
2283 
2284 static void gen_neon_dup_low16(TCGv_i32 var)
2285 {
2286     TCGv_i32 tmp = tcg_temp_new_i32();
2287     tcg_gen_ext16u_i32(var, var);
2288     tcg_gen_shli_i32(tmp, var, 16);
2289     tcg_gen_or_i32(var, var, tmp);
2290 }
2291 
2292 static void gen_neon_dup_high16(TCGv_i32 var)
2293 {
2294     TCGv_i32 tmp = tcg_temp_new_i32();
2295     tcg_gen_andi_i32(var, var, 0xffff0000);
2296     tcg_gen_shri_i32(tmp, var, 16);
2297     tcg_gen_or_i32(var, var, tmp);
2298 }
2299 
2300 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2301 {
2302     TCGv_i32 tmp = tcg_temp_new_i32();
2303     if (size == MO_16) {
2304         read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
2305         if (reg & 8) {
2306             gen_neon_dup_high16(tmp);
2307         } else {
2308             gen_neon_dup_low16(tmp);
2309         }
2310     } else {
2311         read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
2312     }
2313     return tmp;
2314 }
2315 
2316 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2317                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2318 {
2319     /*
2320      * Two registers and a scalar: perform an operation between
2321      * the input elements and the scalar, and then possibly
2322      * perform an accumulation operation of that result into the
2323      * destination.
2324      */
2325     TCGv_i32 scalar, tmp;
2326     int pass;
2327 
2328     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2329         return false;
2330     }
2331 
2332     /* UNDEF accesses to D16-D31 if they don't exist. */
2333     if (!dc_isar_feature(aa32_simd_r32, s) &&
2334         ((a->vd | a->vn | a->vm) & 0x10)) {
2335         return false;
2336     }
2337 
2338     if (!opfn) {
2339         /* Bad size (including size == 3, which is a different insn group) */
2340         return false;
2341     }
2342 
2343     if (a->q && ((a->vd | a->vn) & 1)) {
2344         return false;
2345     }
2346 
2347     if (!vfp_access_check(s)) {
2348         return true;
2349     }
2350 
2351     scalar = neon_get_scalar(a->size, a->vm);
2352     tmp = tcg_temp_new_i32();
2353 
2354     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2355         read_neon_element32(tmp, a->vn, pass, MO_32);
2356         opfn(tmp, tmp, scalar);
2357         if (accfn) {
2358             TCGv_i32 rd = tcg_temp_new_i32();
2359             read_neon_element32(rd, a->vd, pass, MO_32);
2360             accfn(tmp, rd, tmp);
2361         }
2362         write_neon_element32(tmp, a->vd, pass, MO_32);
2363     }
2364     return true;
2365 }
2366 
2367 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2368 {
2369     static NeonGenTwoOpFn * const opfn[] = {
2370         NULL,
2371         gen_helper_neon_mul_u16,
2372         tcg_gen_mul_i32,
2373         NULL,
2374     };
2375 
2376     return do_2scalar(s, a, opfn[a->size], NULL);
2377 }
2378 
2379 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2380 {
2381     static NeonGenTwoOpFn * const opfn[] = {
2382         NULL,
2383         gen_helper_neon_mul_u16,
2384         tcg_gen_mul_i32,
2385         NULL,
2386     };
2387     static NeonGenTwoOpFn * const accfn[] = {
2388         NULL,
2389         gen_helper_neon_add_u16,
2390         tcg_gen_add_i32,
2391         NULL,
2392     };
2393 
2394     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2395 }
2396 
2397 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2398 {
2399     static NeonGenTwoOpFn * const opfn[] = {
2400         NULL,
2401         gen_helper_neon_mul_u16,
2402         tcg_gen_mul_i32,
2403         NULL,
2404     };
2405     static NeonGenTwoOpFn * const accfn[] = {
2406         NULL,
2407         gen_helper_neon_sub_u16,
2408         tcg_gen_sub_i32,
2409         NULL,
2410     };
2411 
2412     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2413 }
2414 
2415 static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
2416                               gen_helper_gvec_3_ptr *fn)
2417 {
2418     /* Two registers and a scalar, using gvec */
2419     int vec_size = a->q ? 16 : 8;
2420     int rd_ofs = neon_full_reg_offset(a->vd);
2421     int rn_ofs = neon_full_reg_offset(a->vn);
2422     int rm_ofs;
2423     int idx;
2424     TCGv_ptr fpstatus;
2425 
2426     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2427         return false;
2428     }
2429 
2430     /* UNDEF accesses to D16-D31 if they don't exist. */
2431     if (!dc_isar_feature(aa32_simd_r32, s) &&
2432         ((a->vd | a->vn | a->vm) & 0x10)) {
2433         return false;
2434     }
2435 
2436     if (!fn) {
2437         /* Bad size (including size == 3, which is a different insn group) */
2438         return false;
2439     }
2440 
2441     if (a->q && ((a->vd | a->vn) & 1)) {
2442         return false;
2443     }
2444 
2445     if (!vfp_access_check(s)) {
2446         return true;
2447     }
2448 
2449     /* a->vm is M:Vm, which encodes both register and index */
2450     idx = extract32(a->vm, a->size + 2, 2);
2451     a->vm = extract32(a->vm, 0, a->size + 2);
2452     rm_ofs = neon_full_reg_offset(a->vm);
2453 
2454     fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
2455     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
2456                        vec_size, vec_size, idx, fn);
2457     return true;
2458 }
2459 
2460 #define DO_VMUL_F_2sc(NAME, FUNC)                                       \
2461     static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a)   \
2462     {                                                                   \
2463         static gen_helper_gvec_3_ptr * const opfn[] = {                 \
2464             NULL,                                                       \
2465             gen_helper_##FUNC##_h,                                      \
2466             gen_helper_##FUNC##_s,                                      \
2467             NULL,                                                       \
2468         };                                                              \
2469         if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
2470             return false;                                               \
2471         }                                                               \
2472         return do_2scalar_fp_vec(s, a, opfn[a->size]);                  \
2473     }
2474 
2475 DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
2476 DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
2477 DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
2478 
2479 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2480 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2481 WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2482 WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2483 
2484 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2485 {
2486     static NeonGenTwoOpFn * const opfn[] = {
2487         NULL,
2488         gen_VQDMULH_16,
2489         gen_VQDMULH_32,
2490         NULL,
2491     };
2492 
2493     return do_2scalar(s, a, opfn[a->size], NULL);
2494 }
2495 
2496 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2497 {
2498     static NeonGenTwoOpFn * const opfn[] = {
2499         NULL,
2500         gen_VQRDMULH_16,
2501         gen_VQRDMULH_32,
2502         NULL,
2503     };
2504 
2505     return do_2scalar(s, a, opfn[a->size], NULL);
2506 }
2507 
2508 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2509                             NeonGenThreeOpEnvFn *opfn)
2510 {
2511     /*
2512      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2513      * performs a kind of fused op-then-accumulate using a helper
2514      * function that takes all of rd, rn and the scalar at once.
2515      */
2516     TCGv_i32 scalar, rn, rd;
2517     int pass;
2518 
2519     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2520         return false;
2521     }
2522 
2523     if (!dc_isar_feature(aa32_rdm, s)) {
2524         return false;
2525     }
2526 
2527     /* UNDEF accesses to D16-D31 if they don't exist. */
2528     if (!dc_isar_feature(aa32_simd_r32, s) &&
2529         ((a->vd | a->vn | a->vm) & 0x10)) {
2530         return false;
2531     }
2532 
2533     if (!opfn) {
2534         /* Bad size (including size == 3, which is a different insn group) */
2535         return false;
2536     }
2537 
2538     if (a->q && ((a->vd | a->vn) & 1)) {
2539         return false;
2540     }
2541 
2542     if (!vfp_access_check(s)) {
2543         return true;
2544     }
2545 
2546     scalar = neon_get_scalar(a->size, a->vm);
2547     rn = tcg_temp_new_i32();
2548     rd = tcg_temp_new_i32();
2549 
2550     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2551         read_neon_element32(rn, a->vn, pass, MO_32);
2552         read_neon_element32(rd, a->vd, pass, MO_32);
2553         opfn(rd, cpu_env, rn, scalar, rd);
2554         write_neon_element32(rd, a->vd, pass, MO_32);
2555     }
2556     return true;
2557 }
2558 
2559 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2560 {
2561     static NeonGenThreeOpEnvFn *opfn[] = {
2562         NULL,
2563         gen_helper_neon_qrdmlah_s16,
2564         gen_helper_neon_qrdmlah_s32,
2565         NULL,
2566     };
2567     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2568 }
2569 
2570 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2571 {
2572     static NeonGenThreeOpEnvFn *opfn[] = {
2573         NULL,
2574         gen_helper_neon_qrdmlsh_s16,
2575         gen_helper_neon_qrdmlsh_s32,
2576         NULL,
2577     };
2578     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2579 }
2580 
2581 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2582                             NeonGenTwoOpWidenFn *opfn,
2583                             NeonGenTwo64OpFn *accfn)
2584 {
2585     /*
2586      * Two registers and a scalar, long operations: perform an
2587      * operation on the input elements and the scalar which produces
2588      * a double-width result, and then possibly perform an accumulation
2589      * operation of that result into the destination.
2590      */
2591     TCGv_i32 scalar, rn;
2592     TCGv_i64 rn0_64, rn1_64;
2593 
2594     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2595         return false;
2596     }
2597 
2598     /* UNDEF accesses to D16-D31 if they don't exist. */
2599     if (!dc_isar_feature(aa32_simd_r32, s) &&
2600         ((a->vd | a->vn | a->vm) & 0x10)) {
2601         return false;
2602     }
2603 
2604     if (!opfn) {
2605         /* Bad size (including size == 3, which is a different insn group) */
2606         return false;
2607     }
2608 
2609     if (a->vd & 1) {
2610         return false;
2611     }
2612 
2613     if (!vfp_access_check(s)) {
2614         return true;
2615     }
2616 
2617     scalar = neon_get_scalar(a->size, a->vm);
2618 
2619     /* Load all inputs before writing any outputs, in case of overlap */
2620     rn = tcg_temp_new_i32();
2621     read_neon_element32(rn, a->vn, 0, MO_32);
2622     rn0_64 = tcg_temp_new_i64();
2623     opfn(rn0_64, rn, scalar);
2624 
2625     read_neon_element32(rn, a->vn, 1, MO_32);
2626     rn1_64 = tcg_temp_new_i64();
2627     opfn(rn1_64, rn, scalar);
2628 
2629     if (accfn) {
2630         TCGv_i64 t64 = tcg_temp_new_i64();
2631         read_neon_element64(t64, a->vd, 0, MO_64);
2632         accfn(rn0_64, t64, rn0_64);
2633         read_neon_element64(t64, a->vd, 1, MO_64);
2634         accfn(rn1_64, t64, rn1_64);
2635     }
2636 
2637     write_neon_element64(rn0_64, a->vd, 0, MO_64);
2638     write_neon_element64(rn1_64, a->vd, 1, MO_64);
2639     return true;
2640 }
2641 
2642 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2643 {
2644     static NeonGenTwoOpWidenFn * const opfn[] = {
2645         NULL,
2646         gen_helper_neon_mull_s16,
2647         gen_mull_s32,
2648         NULL,
2649     };
2650 
2651     return do_2scalar_long(s, a, opfn[a->size], NULL);
2652 }
2653 
2654 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2655 {
2656     static NeonGenTwoOpWidenFn * const opfn[] = {
2657         NULL,
2658         gen_helper_neon_mull_u16,
2659         gen_mull_u32,
2660         NULL,
2661     };
2662 
2663     return do_2scalar_long(s, a, opfn[a->size], NULL);
2664 }
2665 
2666 #define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2667     static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2668     {                                                                   \
2669         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2670             NULL,                                                       \
2671             gen_helper_neon_##MULL##16,                                 \
2672             gen_##MULL##32,                                             \
2673             NULL,                                                       \
2674         };                                                              \
2675         static NeonGenTwo64OpFn * const accfn[] = {                     \
2676             NULL,                                                       \
2677             gen_helper_neon_##ACC##l_u32,                               \
2678             tcg_gen_##ACC##_i64,                                        \
2679             NULL,                                                       \
2680         };                                                              \
2681         return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2682     }
2683 
2684 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2685 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2686 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2687 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2688 
2689 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2690 {
2691     static NeonGenTwoOpWidenFn * const opfn[] = {
2692         NULL,
2693         gen_VQDMULL_16,
2694         gen_VQDMULL_32,
2695         NULL,
2696     };
2697 
2698     return do_2scalar_long(s, a, opfn[a->size], NULL);
2699 }
2700 
2701 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2702 {
2703     static NeonGenTwoOpWidenFn * const opfn[] = {
2704         NULL,
2705         gen_VQDMULL_16,
2706         gen_VQDMULL_32,
2707         NULL,
2708     };
2709     static NeonGenTwo64OpFn * const accfn[] = {
2710         NULL,
2711         gen_VQDMLAL_acc_16,
2712         gen_VQDMLAL_acc_32,
2713         NULL,
2714     };
2715 
2716     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2717 }
2718 
2719 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2720 {
2721     static NeonGenTwoOpWidenFn * const opfn[] = {
2722         NULL,
2723         gen_VQDMULL_16,
2724         gen_VQDMULL_32,
2725         NULL,
2726     };
2727     static NeonGenTwo64OpFn * const accfn[] = {
2728         NULL,
2729         gen_VQDMLSL_acc_16,
2730         gen_VQDMLSL_acc_32,
2731         NULL,
2732     };
2733 
2734     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2735 }
2736 
2737 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2738 {
2739     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2740         return false;
2741     }
2742 
2743     /* UNDEF accesses to D16-D31 if they don't exist. */
2744     if (!dc_isar_feature(aa32_simd_r32, s) &&
2745         ((a->vd | a->vn | a->vm) & 0x10)) {
2746         return false;
2747     }
2748 
2749     if ((a->vn | a->vm | a->vd) & a->q) {
2750         return false;
2751     }
2752 
2753     if (a->imm > 7 && !a->q) {
2754         return false;
2755     }
2756 
2757     if (!vfp_access_check(s)) {
2758         return true;
2759     }
2760 
2761     if (!a->q) {
2762         /* Extract 64 bits from <Vm:Vn> */
2763         TCGv_i64 left, right, dest;
2764 
2765         left = tcg_temp_new_i64();
2766         right = tcg_temp_new_i64();
2767         dest = tcg_temp_new_i64();
2768 
2769         read_neon_element64(right, a->vn, 0, MO_64);
2770         read_neon_element64(left, a->vm, 0, MO_64);
2771         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2772         write_neon_element64(dest, a->vd, 0, MO_64);
2773     } else {
2774         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2775         TCGv_i64 left, middle, right, destleft, destright;
2776 
2777         left = tcg_temp_new_i64();
2778         middle = tcg_temp_new_i64();
2779         right = tcg_temp_new_i64();
2780         destleft = tcg_temp_new_i64();
2781         destright = tcg_temp_new_i64();
2782 
2783         if (a->imm < 8) {
2784             read_neon_element64(right, a->vn, 0, MO_64);
2785             read_neon_element64(middle, a->vn, 1, MO_64);
2786             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2787             read_neon_element64(left, a->vm, 0, MO_64);
2788             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2789         } else {
2790             read_neon_element64(right, a->vn, 1, MO_64);
2791             read_neon_element64(middle, a->vm, 0, MO_64);
2792             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2793             read_neon_element64(left, a->vm, 1, MO_64);
2794             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2795         }
2796 
2797         write_neon_element64(destright, a->vd, 0, MO_64);
2798         write_neon_element64(destleft, a->vd, 1, MO_64);
2799     }
2800     return true;
2801 }
2802 
2803 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2804 {
2805     TCGv_i64 val, def;
2806     TCGv_i32 desc;
2807 
2808     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2809         return false;
2810     }
2811 
2812     /* UNDEF accesses to D16-D31 if they don't exist. */
2813     if (!dc_isar_feature(aa32_simd_r32, s) &&
2814         ((a->vd | a->vn | a->vm) & 0x10)) {
2815         return false;
2816     }
2817 
2818     if ((a->vn + a->len + 1) > 32) {
2819         /*
2820          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2821          * helper function running off the end of the register file.
2822          */
2823         return false;
2824     }
2825 
2826     if (!vfp_access_check(s)) {
2827         return true;
2828     }
2829 
2830     desc = tcg_constant_i32((a->vn << 2) | a->len);
2831     def = tcg_temp_new_i64();
2832     if (a->op) {
2833         read_neon_element64(def, a->vd, 0, MO_64);
2834     } else {
2835         tcg_gen_movi_i64(def, 0);
2836     }
2837     val = tcg_temp_new_i64();
2838     read_neon_element64(val, a->vm, 0, MO_64);
2839 
2840     gen_helper_neon_tbl(val, cpu_env, desc, val, def);
2841     write_neon_element64(val, a->vd, 0, MO_64);
2842     return true;
2843 }
2844 
2845 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
2846 {
2847     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2848         return false;
2849     }
2850 
2851     /* UNDEF accesses to D16-D31 if they don't exist. */
2852     if (!dc_isar_feature(aa32_simd_r32, s) &&
2853         ((a->vd | a->vm) & 0x10)) {
2854         return false;
2855     }
2856 
2857     if (a->vd & a->q) {
2858         return false;
2859     }
2860 
2861     if (!vfp_access_check(s)) {
2862         return true;
2863     }
2864 
2865     tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
2866                          neon_element_offset(a->vm, a->index, a->size),
2867                          a->q ? 16 : 8, a->q ? 16 : 8);
2868     return true;
2869 }
2870 
2871 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
2872 {
2873     int pass, half;
2874     TCGv_i32 tmp[2];
2875 
2876     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2877         return false;
2878     }
2879 
2880     /* UNDEF accesses to D16-D31 if they don't exist. */
2881     if (!dc_isar_feature(aa32_simd_r32, s) &&
2882         ((a->vd | a->vm) & 0x10)) {
2883         return false;
2884     }
2885 
2886     if ((a->vd | a->vm) & a->q) {
2887         return false;
2888     }
2889 
2890     if (a->size == 3) {
2891         return false;
2892     }
2893 
2894     if (!vfp_access_check(s)) {
2895         return true;
2896     }
2897 
2898     tmp[0] = tcg_temp_new_i32();
2899     tmp[1] = tcg_temp_new_i32();
2900 
2901     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
2902         for (half = 0; half < 2; half++) {
2903             read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
2904             switch (a->size) {
2905             case 0:
2906                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
2907                 break;
2908             case 1:
2909                 gen_swap_half(tmp[half], tmp[half]);
2910                 break;
2911             case 2:
2912                 break;
2913             default:
2914                 g_assert_not_reached();
2915             }
2916         }
2917         write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
2918         write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
2919     }
2920     return true;
2921 }
2922 
2923 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
2924                               NeonGenWidenFn *widenfn,
2925                               NeonGenTwo64OpFn *opfn,
2926                               NeonGenTwo64OpFn *accfn)
2927 {
2928     /*
2929      * Pairwise long operations: widen both halves of the pair,
2930      * combine the pairs with the opfn, and then possibly accumulate
2931      * into the destination with the accfn.
2932      */
2933     int pass;
2934 
2935     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2936         return false;
2937     }
2938 
2939     /* UNDEF accesses to D16-D31 if they don't exist. */
2940     if (!dc_isar_feature(aa32_simd_r32, s) &&
2941         ((a->vd | a->vm) & 0x10)) {
2942         return false;
2943     }
2944 
2945     if ((a->vd | a->vm) & a->q) {
2946         return false;
2947     }
2948 
2949     if (!widenfn) {
2950         return false;
2951     }
2952 
2953     if (!vfp_access_check(s)) {
2954         return true;
2955     }
2956 
2957     for (pass = 0; pass < a->q + 1; pass++) {
2958         TCGv_i32 tmp;
2959         TCGv_i64 rm0_64, rm1_64, rd_64;
2960 
2961         rm0_64 = tcg_temp_new_i64();
2962         rm1_64 = tcg_temp_new_i64();
2963         rd_64 = tcg_temp_new_i64();
2964 
2965         tmp = tcg_temp_new_i32();
2966         read_neon_element32(tmp, a->vm, pass * 2, MO_32);
2967         widenfn(rm0_64, tmp);
2968         read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
2969         widenfn(rm1_64, tmp);
2970 
2971         opfn(rd_64, rm0_64, rm1_64);
2972 
2973         if (accfn) {
2974             TCGv_i64 tmp64 = tcg_temp_new_i64();
2975             read_neon_element64(tmp64, a->vd, pass, MO_64);
2976             accfn(rd_64, tmp64, rd_64);
2977         }
2978         write_neon_element64(rd_64, a->vd, pass, MO_64);
2979     }
2980     return true;
2981 }
2982 
2983 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
2984 {
2985     static NeonGenWidenFn * const widenfn[] = {
2986         gen_helper_neon_widen_s8,
2987         gen_helper_neon_widen_s16,
2988         tcg_gen_ext_i32_i64,
2989         NULL,
2990     };
2991     static NeonGenTwo64OpFn * const opfn[] = {
2992         gen_helper_neon_paddl_u16,
2993         gen_helper_neon_paddl_u32,
2994         tcg_gen_add_i64,
2995         NULL,
2996     };
2997 
2998     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
2999 }
3000 
3001 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3002 {
3003     static NeonGenWidenFn * const widenfn[] = {
3004         gen_helper_neon_widen_u8,
3005         gen_helper_neon_widen_u16,
3006         tcg_gen_extu_i32_i64,
3007         NULL,
3008     };
3009     static NeonGenTwo64OpFn * const opfn[] = {
3010         gen_helper_neon_paddl_u16,
3011         gen_helper_neon_paddl_u32,
3012         tcg_gen_add_i64,
3013         NULL,
3014     };
3015 
3016     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3017 }
3018 
3019 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3020 {
3021     static NeonGenWidenFn * const widenfn[] = {
3022         gen_helper_neon_widen_s8,
3023         gen_helper_neon_widen_s16,
3024         tcg_gen_ext_i32_i64,
3025         NULL,
3026     };
3027     static NeonGenTwo64OpFn * const opfn[] = {
3028         gen_helper_neon_paddl_u16,
3029         gen_helper_neon_paddl_u32,
3030         tcg_gen_add_i64,
3031         NULL,
3032     };
3033     static NeonGenTwo64OpFn * const accfn[] = {
3034         gen_helper_neon_addl_u16,
3035         gen_helper_neon_addl_u32,
3036         tcg_gen_add_i64,
3037         NULL,
3038     };
3039 
3040     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3041                              accfn[a->size]);
3042 }
3043 
3044 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3045 {
3046     static NeonGenWidenFn * const widenfn[] = {
3047         gen_helper_neon_widen_u8,
3048         gen_helper_neon_widen_u16,
3049         tcg_gen_extu_i32_i64,
3050         NULL,
3051     };
3052     static NeonGenTwo64OpFn * const opfn[] = {
3053         gen_helper_neon_paddl_u16,
3054         gen_helper_neon_paddl_u32,
3055         tcg_gen_add_i64,
3056         NULL,
3057     };
3058     static NeonGenTwo64OpFn * const accfn[] = {
3059         gen_helper_neon_addl_u16,
3060         gen_helper_neon_addl_u32,
3061         tcg_gen_add_i64,
3062         NULL,
3063     };
3064 
3065     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3066                              accfn[a->size]);
3067 }
3068 
3069 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3070 
3071 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
3072                        ZipFn *fn)
3073 {
3074     TCGv_ptr pd, pm;
3075 
3076     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3077         return false;
3078     }
3079 
3080     /* UNDEF accesses to D16-D31 if they don't exist. */
3081     if (!dc_isar_feature(aa32_simd_r32, s) &&
3082         ((a->vd | a->vm) & 0x10)) {
3083         return false;
3084     }
3085 
3086     if ((a->vd | a->vm) & a->q) {
3087         return false;
3088     }
3089 
3090     if (!fn) {
3091         /* Bad size or size/q combination */
3092         return false;
3093     }
3094 
3095     if (!vfp_access_check(s)) {
3096         return true;
3097     }
3098 
3099     pd = vfp_reg_ptr(true, a->vd);
3100     pm = vfp_reg_ptr(true, a->vm);
3101     fn(pd, pm);
3102     return true;
3103 }
3104 
3105 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3106 {
3107     static ZipFn * const fn[2][4] = {
3108         {
3109             gen_helper_neon_unzip8,
3110             gen_helper_neon_unzip16,
3111             NULL,
3112             NULL,
3113         }, {
3114             gen_helper_neon_qunzip8,
3115             gen_helper_neon_qunzip16,
3116             gen_helper_neon_qunzip32,
3117             NULL,
3118         }
3119     };
3120     return do_zip_uzp(s, a, fn[a->q][a->size]);
3121 }
3122 
3123 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3124 {
3125     static ZipFn * const fn[2][4] = {
3126         {
3127             gen_helper_neon_zip8,
3128             gen_helper_neon_zip16,
3129             NULL,
3130             NULL,
3131         }, {
3132             gen_helper_neon_qzip8,
3133             gen_helper_neon_qzip16,
3134             gen_helper_neon_qzip32,
3135             NULL,
3136         }
3137     };
3138     return do_zip_uzp(s, a, fn[a->q][a->size]);
3139 }
3140 
3141 static bool do_vmovn(DisasContext *s, arg_2misc *a,
3142                      NeonGenNarrowEnvFn *narrowfn)
3143 {
3144     TCGv_i64 rm;
3145     TCGv_i32 rd0, rd1;
3146 
3147     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3148         return false;
3149     }
3150 
3151     /* UNDEF accesses to D16-D31 if they don't exist. */
3152     if (!dc_isar_feature(aa32_simd_r32, s) &&
3153         ((a->vd | a->vm) & 0x10)) {
3154         return false;
3155     }
3156 
3157     if (a->vm & 1) {
3158         return false;
3159     }
3160 
3161     if (!narrowfn) {
3162         return false;
3163     }
3164 
3165     if (!vfp_access_check(s)) {
3166         return true;
3167     }
3168 
3169     rm = tcg_temp_new_i64();
3170     rd0 = tcg_temp_new_i32();
3171     rd1 = tcg_temp_new_i32();
3172 
3173     read_neon_element64(rm, a->vm, 0, MO_64);
3174     narrowfn(rd0, cpu_env, rm);
3175     read_neon_element64(rm, a->vm, 1, MO_64);
3176     narrowfn(rd1, cpu_env, rm);
3177     write_neon_element32(rd0, a->vd, 0, MO_32);
3178     write_neon_element32(rd1, a->vd, 1, MO_32);
3179     return true;
3180 }
3181 
3182 #define DO_VMOVN(INSN, FUNC)                                    \
3183     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3184     {                                                           \
3185         static NeonGenNarrowEnvFn * const narrowfn[] = {        \
3186             FUNC##8,                                            \
3187             FUNC##16,                                           \
3188             FUNC##32,                                           \
3189             NULL,                                               \
3190         };                                                      \
3191         return do_vmovn(s, a, narrowfn[a->size]);               \
3192     }
3193 
3194 DO_VMOVN(VMOVN, gen_neon_narrow_u)
3195 DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
3196 DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
3197 DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
3198 
3199 static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
3200 {
3201     TCGv_i32 rm0, rm1;
3202     TCGv_i64 rd;
3203     static NeonGenWidenFn * const widenfns[] = {
3204         gen_helper_neon_widen_u8,
3205         gen_helper_neon_widen_u16,
3206         tcg_gen_extu_i32_i64,
3207         NULL,
3208     };
3209     NeonGenWidenFn *widenfn = widenfns[a->size];
3210 
3211     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3212         return false;
3213     }
3214 
3215     /* UNDEF accesses to D16-D31 if they don't exist. */
3216     if (!dc_isar_feature(aa32_simd_r32, s) &&
3217         ((a->vd | a->vm) & 0x10)) {
3218         return false;
3219     }
3220 
3221     if (a->vd & 1) {
3222         return false;
3223     }
3224 
3225     if (!widenfn) {
3226         return false;
3227     }
3228 
3229     if (!vfp_access_check(s)) {
3230         return true;
3231     }
3232 
3233     rd = tcg_temp_new_i64();
3234     rm0 = tcg_temp_new_i32();
3235     rm1 = tcg_temp_new_i32();
3236 
3237     read_neon_element32(rm0, a->vm, 0, MO_32);
3238     read_neon_element32(rm1, a->vm, 1, MO_32);
3239 
3240     widenfn(rd, rm0);
3241     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3242     write_neon_element64(rd, a->vd, 0, MO_64);
3243     widenfn(rd, rm1);
3244     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3245     write_neon_element64(rd, a->vd, 1, MO_64);
3246     return true;
3247 }
3248 
3249 static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a)
3250 {
3251     TCGv_ptr fpst;
3252     TCGv_i64 tmp;
3253     TCGv_i32 dst0, dst1;
3254 
3255     if (!dc_isar_feature(aa32_bf16, s)) {
3256         return false;
3257     }
3258 
3259     /* UNDEF accesses to D16-D31 if they don't exist. */
3260     if (!dc_isar_feature(aa32_simd_r32, s) &&
3261         ((a->vd | a->vm) & 0x10)) {
3262         return false;
3263     }
3264 
3265     if ((a->vm & 1) || (a->size != 1)) {
3266         return false;
3267     }
3268 
3269     if (!vfp_access_check(s)) {
3270         return true;
3271     }
3272 
3273     fpst = fpstatus_ptr(FPST_STD);
3274     tmp = tcg_temp_new_i64();
3275     dst0 = tcg_temp_new_i32();
3276     dst1 = tcg_temp_new_i32();
3277 
3278     read_neon_element64(tmp, a->vm, 0, MO_64);
3279     gen_helper_bfcvt_pair(dst0, tmp, fpst);
3280 
3281     read_neon_element64(tmp, a->vm, 1, MO_64);
3282     gen_helper_bfcvt_pair(dst1, tmp, fpst);
3283 
3284     write_neon_element32(dst0, a->vd, 0, MO_32);
3285     write_neon_element32(dst1, a->vd, 1, MO_32);
3286     return true;
3287 }
3288 
3289 static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
3290 {
3291     TCGv_ptr fpst;
3292     TCGv_i32 ahp, tmp, tmp2, tmp3;
3293 
3294     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3295         !dc_isar_feature(aa32_fp16_spconv, s)) {
3296         return false;
3297     }
3298 
3299     /* UNDEF accesses to D16-D31 if they don't exist. */
3300     if (!dc_isar_feature(aa32_simd_r32, s) &&
3301         ((a->vd | a->vm) & 0x10)) {
3302         return false;
3303     }
3304 
3305     if ((a->vm & 1) || (a->size != 1)) {
3306         return false;
3307     }
3308 
3309     if (!vfp_access_check(s)) {
3310         return true;
3311     }
3312 
3313     fpst = fpstatus_ptr(FPST_STD);
3314     ahp = get_ahp_flag();
3315     tmp = tcg_temp_new_i32();
3316     read_neon_element32(tmp, a->vm, 0, MO_32);
3317     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3318     tmp2 = tcg_temp_new_i32();
3319     read_neon_element32(tmp2, a->vm, 1, MO_32);
3320     gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
3321     tcg_gen_shli_i32(tmp2, tmp2, 16);
3322     tcg_gen_or_i32(tmp2, tmp2, tmp);
3323     read_neon_element32(tmp, a->vm, 2, MO_32);
3324     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3325     tmp3 = tcg_temp_new_i32();
3326     read_neon_element32(tmp3, a->vm, 3, MO_32);
3327     write_neon_element32(tmp2, a->vd, 0, MO_32);
3328     gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
3329     tcg_gen_shli_i32(tmp3, tmp3, 16);
3330     tcg_gen_or_i32(tmp3, tmp3, tmp);
3331     write_neon_element32(tmp3, a->vd, 1, MO_32);
3332     return true;
3333 }
3334 
3335 static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
3336 {
3337     TCGv_ptr fpst;
3338     TCGv_i32 ahp, tmp, tmp2, tmp3;
3339 
3340     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3341         !dc_isar_feature(aa32_fp16_spconv, s)) {
3342         return false;
3343     }
3344 
3345     /* UNDEF accesses to D16-D31 if they don't exist. */
3346     if (!dc_isar_feature(aa32_simd_r32, s) &&
3347         ((a->vd | a->vm) & 0x10)) {
3348         return false;
3349     }
3350 
3351     if ((a->vd & 1) || (a->size != 1)) {
3352         return false;
3353     }
3354 
3355     if (!vfp_access_check(s)) {
3356         return true;
3357     }
3358 
3359     fpst = fpstatus_ptr(FPST_STD);
3360     ahp = get_ahp_flag();
3361     tmp3 = tcg_temp_new_i32();
3362     tmp2 = tcg_temp_new_i32();
3363     tmp = tcg_temp_new_i32();
3364     read_neon_element32(tmp, a->vm, 0, MO_32);
3365     read_neon_element32(tmp2, a->vm, 1, MO_32);
3366     tcg_gen_ext16u_i32(tmp3, tmp);
3367     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3368     write_neon_element32(tmp3, a->vd, 0, MO_32);
3369     tcg_gen_shri_i32(tmp, tmp, 16);
3370     gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
3371     write_neon_element32(tmp, a->vd, 1, MO_32);
3372     tcg_gen_ext16u_i32(tmp3, tmp2);
3373     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3374     write_neon_element32(tmp3, a->vd, 2, MO_32);
3375     tcg_gen_shri_i32(tmp2, tmp2, 16);
3376     gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
3377     write_neon_element32(tmp2, a->vd, 3, MO_32);
3378     return true;
3379 }
3380 
3381 static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
3382 {
3383     int vec_size = a->q ? 16 : 8;
3384     int rd_ofs = neon_full_reg_offset(a->vd);
3385     int rm_ofs = neon_full_reg_offset(a->vm);
3386 
3387     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3388         return false;
3389     }
3390 
3391     /* UNDEF accesses to D16-D31 if they don't exist. */
3392     if (!dc_isar_feature(aa32_simd_r32, s) &&
3393         ((a->vd | a->vm) & 0x10)) {
3394         return false;
3395     }
3396 
3397     if (a->size == 3) {
3398         return false;
3399     }
3400 
3401     if ((a->vd | a->vm) & a->q) {
3402         return false;
3403     }
3404 
3405     if (!vfp_access_check(s)) {
3406         return true;
3407     }
3408 
3409     fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);
3410 
3411     return true;
3412 }
3413 
3414 #define DO_2MISC_VEC(INSN, FN)                                  \
3415     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3416     {                                                           \
3417         return do_2misc_vec(s, a, FN);                          \
3418     }
3419 
3420 DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
3421 DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
3422 DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
3423 DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
3424 DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
3425 DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
3426 DO_2MISC_VEC(VCLT0, gen_gvec_clt0)
3427 
3428 static bool trans_VMVN(DisasContext *s, arg_2misc *a)
3429 {
3430     if (a->size != 0) {
3431         return false;
3432     }
3433     return do_2misc_vec(s, a, tcg_gen_gvec_not);
3434 }
3435 
3436 #define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3437     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3438                          uint32_t rm_ofs, uint32_t oprsz,               \
3439                          uint32_t maxsz)                                \
3440     {                                                                   \
3441         tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
3442                            DATA, FUNC);                                 \
3443     }
3444 
3445 #define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3446     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3447                          uint32_t rm_ofs, uint32_t oprsz,               \
3448                          uint32_t maxsz)                                \
3449     {                                                                   \
3450         tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
3451     }
3452 
3453 WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
3454 WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
3455 WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
3456 WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
3457 WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
3458 WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
3459 WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
3460 
3461 #define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
3462     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3463     {                                                           \
3464         if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) {  \
3465             return false;                                       \
3466         }                                                       \
3467         return do_2misc_vec(s, a, gen_##INSN);                  \
3468     }
3469 
3470 DO_2M_CRYPTO(AESE, aa32_aes, 0)
3471 DO_2M_CRYPTO(AESD, aa32_aes, 0)
3472 DO_2M_CRYPTO(AESMC, aa32_aes, 0)
3473 DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
3474 DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
3475 DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
3476 DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
3477 
3478 static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
3479 {
3480     TCGv_i32 tmp;
3481     int pass;
3482 
3483     /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
3484     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3485         return false;
3486     }
3487 
3488     /* UNDEF accesses to D16-D31 if they don't exist. */
3489     if (!dc_isar_feature(aa32_simd_r32, s) &&
3490         ((a->vd | a->vm) & 0x10)) {
3491         return false;
3492     }
3493 
3494     if (!fn) {
3495         return false;
3496     }
3497 
3498     if ((a->vd | a->vm) & a->q) {
3499         return false;
3500     }
3501 
3502     if (!vfp_access_check(s)) {
3503         return true;
3504     }
3505 
3506     tmp = tcg_temp_new_i32();
3507     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3508         read_neon_element32(tmp, a->vm, pass, MO_32);
3509         fn(tmp, tmp);
3510         write_neon_element32(tmp, a->vd, pass, MO_32);
3511     }
3512     return true;
3513 }
3514 
3515 static bool trans_VREV32(DisasContext *s, arg_2misc *a)
3516 {
3517     static NeonGenOneOpFn * const fn[] = {
3518         tcg_gen_bswap32_i32,
3519         gen_swap_half,
3520         NULL,
3521         NULL,
3522     };
3523     return do_2misc(s, a, fn[a->size]);
3524 }
3525 
3526 static bool trans_VREV16(DisasContext *s, arg_2misc *a)
3527 {
3528     if (a->size != 0) {
3529         return false;
3530     }
3531     return do_2misc(s, a, gen_rev16);
3532 }
3533 
3534 static bool trans_VCLS(DisasContext *s, arg_2misc *a)
3535 {
3536     static NeonGenOneOpFn * const fn[] = {
3537         gen_helper_neon_cls_s8,
3538         gen_helper_neon_cls_s16,
3539         gen_helper_neon_cls_s32,
3540         NULL,
3541     };
3542     return do_2misc(s, a, fn[a->size]);
3543 }
3544 
3545 static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
3546 {
3547     tcg_gen_clzi_i32(rd, rm, 32);
3548 }
3549 
3550 static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
3551 {
3552     static NeonGenOneOpFn * const fn[] = {
3553         gen_helper_neon_clz_u8,
3554         gen_helper_neon_clz_u16,
3555         do_VCLZ_32,
3556         NULL,
3557     };
3558     return do_2misc(s, a, fn[a->size]);
3559 }
3560 
3561 static bool trans_VCNT(DisasContext *s, arg_2misc *a)
3562 {
3563     if (a->size != 0) {
3564         return false;
3565     }
3566     return do_2misc(s, a, gen_helper_neon_cnt_u8);
3567 }
3568 
3569 static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3570                        uint32_t oprsz, uint32_t maxsz)
3571 {
3572     tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
3573                       vece == MO_16 ? 0x7fff : 0x7fffffff,
3574                       oprsz, maxsz);
3575 }
3576 
3577 static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
3578 {
3579     if (a->size == MO_16) {
3580         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3581             return false;
3582         }
3583     } else if (a->size != MO_32) {
3584         return false;
3585     }
3586     return do_2misc_vec(s, a, gen_VABS_F);
3587 }
3588 
3589 static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3590                        uint32_t oprsz, uint32_t maxsz)
3591 {
3592     tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
3593                       vece == MO_16 ? 0x8000 : 0x80000000,
3594                       oprsz, maxsz);
3595 }
3596 
3597 static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
3598 {
3599     if (a->size == MO_16) {
3600         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3601             return false;
3602         }
3603     } else if (a->size != MO_32) {
3604         return false;
3605     }
3606     return do_2misc_vec(s, a, gen_VNEG_F);
3607 }
3608 
3609 static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
3610 {
3611     if (a->size != 2) {
3612         return false;
3613     }
3614     return do_2misc(s, a, gen_helper_recpe_u32);
3615 }
3616 
3617 static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
3618 {
3619     if (a->size != 2) {
3620         return false;
3621     }
3622     return do_2misc(s, a, gen_helper_rsqrte_u32);
3623 }
3624 
3625 #define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \
3626     static void WRAPNAME(TCGv_i32 d, TCGv_i32 m)        \
3627     {                                                   \
3628         FUNC(d, cpu_env, m);                            \
3629     }
3630 
3631 WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
3632 WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
3633 WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
3634 WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
3635 WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
3636 WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)
3637 
3638 static bool trans_VQABS(DisasContext *s, arg_2misc *a)
3639 {
3640     static NeonGenOneOpFn * const fn[] = {
3641         gen_VQABS_s8,
3642         gen_VQABS_s16,
3643         gen_VQABS_s32,
3644         NULL,
3645     };
3646     return do_2misc(s, a, fn[a->size]);
3647 }
3648 
3649 static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
3650 {
3651     static NeonGenOneOpFn * const fn[] = {
3652         gen_VQNEG_s8,
3653         gen_VQNEG_s16,
3654         gen_VQNEG_s32,
3655         NULL,
3656     };
3657     return do_2misc(s, a, fn[a->size]);
3658 }
3659 
3660 #define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC)                             \
3661     static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3662                            uint32_t rm_ofs,                             \
3663                            uint32_t oprsz, uint32_t maxsz)              \
3664     {                                                                   \
3665         static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3666             NULL, HFUNC, SFUNC, NULL,                                   \
3667         };                                                              \
3668         TCGv_ptr fpst;                                                  \
3669         fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);   \
3670         tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0,       \
3671                            fns[vece]);                                  \
3672     }                                                                   \
3673     static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3674     {                                                                   \
3675         if (a->size == MO_16) {                                         \
3676             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3677                 return false;                                           \
3678             }                                                           \
3679         } else if (a->size != MO_32) {                                  \
3680             return false;                                               \
3681         }                                                               \
3682         return do_2misc_vec(s, a, gen_##INSN);                          \
3683     }
3684 
3685 DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
3686 DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
3687 DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
3688 DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
3689 DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
3690 DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
3691 DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
3692 DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
3693 DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
3694 DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
3695 DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)
3696 
3697 DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)
3698 
3699 static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
3700 {
3701     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
3702         return false;
3703     }
3704     return trans_VRINTX_impl(s, a);
3705 }
3706 
3707 #define DO_VEC_RMODE(INSN, RMODE, OP)                                   \
3708     static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3709                            uint32_t rm_ofs,                             \
3710                            uint32_t oprsz, uint32_t maxsz)              \
3711     {                                                                   \
3712         static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3713             NULL,                                                       \
3714             gen_helper_gvec_##OP##h,                                    \
3715             gen_helper_gvec_##OP##s,                                    \
3716             NULL,                                                       \
3717         };                                                              \
3718         TCGv_ptr fpst;                                                  \
3719         fpst = fpstatus_ptr(vece == 1 ? FPST_STD_F16 : FPST_STD);       \
3720         tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz,          \
3721                            arm_rmode_to_sf(RMODE), fns[vece]);          \
3722     }                                                                   \
3723     static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3724     {                                                                   \
3725         if (!arm_dc_feature(s, ARM_FEATURE_V8)) {                       \
3726             return false;                                               \
3727         }                                                               \
3728         if (a->size == MO_16) {                                         \
3729             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3730                 return false;                                           \
3731             }                                                           \
3732         } else if (a->size != MO_32) {                                  \
3733             return false;                                               \
3734         }                                                               \
3735         return do_2misc_vec(s, a, gen_##INSN);                          \
3736     }
3737 
3738 DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
3739 DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
3740 DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
3741 DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
3742 DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
3743 DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
3744 DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
3745 DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)
3746 
3747 DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
3748 DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
3749 DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
3750 DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
3751 DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)
3752 
3753 static bool trans_VSWP(DisasContext *s, arg_2misc *a)
3754 {
3755     TCGv_i64 rm, rd;
3756     int pass;
3757 
3758     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3759         return false;
3760     }
3761 
3762     /* UNDEF accesses to D16-D31 if they don't exist. */
3763     if (!dc_isar_feature(aa32_simd_r32, s) &&
3764         ((a->vd | a->vm) & 0x10)) {
3765         return false;
3766     }
3767 
3768     if (a->size != 0) {
3769         return false;
3770     }
3771 
3772     if ((a->vd | a->vm) & a->q) {
3773         return false;
3774     }
3775 
3776     if (!vfp_access_check(s)) {
3777         return true;
3778     }
3779 
3780     rm = tcg_temp_new_i64();
3781     rd = tcg_temp_new_i64();
3782     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3783         read_neon_element64(rm, a->vm, pass, MO_64);
3784         read_neon_element64(rd, a->vd, pass, MO_64);
3785         write_neon_element64(rm, a->vd, pass, MO_64);
3786         write_neon_element64(rd, a->vm, pass, MO_64);
3787     }
3788     return true;
3789 }
3790 
3791 static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
3792 {
3793     TCGv_i32 rd, tmp;
3794 
3795     rd = tcg_temp_new_i32();
3796     tmp = tcg_temp_new_i32();
3797 
3798     tcg_gen_shli_i32(rd, t0, 8);
3799     tcg_gen_andi_i32(rd, rd, 0xff00ff00);
3800     tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
3801     tcg_gen_or_i32(rd, rd, tmp);
3802 
3803     tcg_gen_shri_i32(t1, t1, 8);
3804     tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
3805     tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
3806     tcg_gen_or_i32(t1, t1, tmp);
3807     tcg_gen_mov_i32(t0, rd);
3808 }
3809 
3810 static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
3811 {
3812     TCGv_i32 rd, tmp;
3813 
3814     rd = tcg_temp_new_i32();
3815     tmp = tcg_temp_new_i32();
3816 
3817     tcg_gen_shli_i32(rd, t0, 16);
3818     tcg_gen_andi_i32(tmp, t1, 0xffff);
3819     tcg_gen_or_i32(rd, rd, tmp);
3820     tcg_gen_shri_i32(t1, t1, 16);
3821     tcg_gen_andi_i32(tmp, t0, 0xffff0000);
3822     tcg_gen_or_i32(t1, t1, tmp);
3823     tcg_gen_mov_i32(t0, rd);
3824 }
3825 
3826 static bool trans_VTRN(DisasContext *s, arg_2misc *a)
3827 {
3828     TCGv_i32 tmp, tmp2;
3829     int pass;
3830 
3831     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3832         return false;
3833     }
3834 
3835     /* UNDEF accesses to D16-D31 if they don't exist. */
3836     if (!dc_isar_feature(aa32_simd_r32, s) &&
3837         ((a->vd | a->vm) & 0x10)) {
3838         return false;
3839     }
3840 
3841     if ((a->vd | a->vm) & a->q) {
3842         return false;
3843     }
3844 
3845     if (a->size == 3) {
3846         return false;
3847     }
3848 
3849     if (!vfp_access_check(s)) {
3850         return true;
3851     }
3852 
3853     tmp = tcg_temp_new_i32();
3854     tmp2 = tcg_temp_new_i32();
3855     if (a->size == MO_32) {
3856         for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
3857             read_neon_element32(tmp, a->vm, pass, MO_32);
3858             read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
3859             write_neon_element32(tmp2, a->vm, pass, MO_32);
3860             write_neon_element32(tmp, a->vd, pass + 1, MO_32);
3861         }
3862     } else {
3863         for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3864             read_neon_element32(tmp, a->vm, pass, MO_32);
3865             read_neon_element32(tmp2, a->vd, pass, MO_32);
3866             if (a->size == MO_8) {
3867                 gen_neon_trn_u8(tmp, tmp2);
3868             } else {
3869                 gen_neon_trn_u16(tmp, tmp2);
3870             }
3871             write_neon_element32(tmp2, a->vm, pass, MO_32);
3872             write_neon_element32(tmp, a->vd, pass, MO_32);
3873         }
3874     }
3875     return true;
3876 }
3877 
3878 static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a)
3879 {
3880     if (!dc_isar_feature(aa32_i8mm, s)) {
3881         return false;
3882     }
3883     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
3884                         gen_helper_gvec_smmla_b);
3885 }
3886 
3887 static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a)
3888 {
3889     if (!dc_isar_feature(aa32_i8mm, s)) {
3890         return false;
3891     }
3892     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
3893                         gen_helper_gvec_ummla_b);
3894 }
3895 
3896 static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a)
3897 {
3898     if (!dc_isar_feature(aa32_i8mm, s)) {
3899         return false;
3900     }
3901     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
3902                         gen_helper_gvec_usmmla_b);
3903 }
3904 
3905 static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a)
3906 {
3907     if (!dc_isar_feature(aa32_bf16, s)) {
3908         return false;
3909     }
3910     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
3911                         gen_helper_gvec_bfmmla);
3912 }
3913 
3914 static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a)
3915 {
3916     if (!dc_isar_feature(aa32_bf16, s)) {
3917         return false;
3918     }
3919     return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD,
3920                              gen_helper_gvec_bfmlal);
3921 }
3922 
3923 static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a)
3924 {
3925     if (!dc_isar_feature(aa32_bf16, s)) {
3926         return false;
3927     }
3928     return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm,
3929                              (a->index << 1) | a->q, FPST_STD,
3930                              gen_helper_gvec_bfmlal_idx);
3931 }
3932