/*
 *  ARM translation: AArch32 Neon instructions
 *
 *  Copyright (c) 2003 Fabrice Bellard
 *  Copyright (c) 2005-2007 CodeSourcery
 *  Copyright (c) 2007 OpenedHand, Ltd.
 *  Copyright (c) 2020 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "translate.h"
#include "translate-a32.h"

/* Include the generated Neon decoder */
#include "decode-neon-dp.c.inc"
#include "decode-neon-ls.c.inc"
#include "decode-neon-shared.c.inc"

static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
{
    TCGv_ptr ret = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(ret, tcg_env, vfp_reg_offset(dp, reg));
    return ret;
}

static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i32(var, tcg_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i32(var, tcg_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld_i32(var, tcg_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i64(var, tcg_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i64(var, tcg_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld32u_i64(var, tcg_env, offset);
        break;
    case MO_UQ:
        tcg_gen_ld_i64(var, tcg_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i32(var, tcg_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i32(var, tcg_env, offset);
        break;
    case MO_32:
        tcg_gen_st_i32(var, tcg_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i64(var, tcg_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i64(var, tcg_env, offset);
        break;
    case MO_32:
        tcg_gen_st32_i64(var, tcg_env, offset);
        break;
    case MO_64:
        tcg_gen_st_i64(var, tcg_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

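/*
 * Callers encode which operands are Q-regs in the 'q' argument:
 * e.g. trans_VCMLA below passes a->q * 7 (0b111, all three operands
 * are Q-regs in the Q form), while trans_VCMLA_scalar passes a->q * 6
 * (0b110, because the scalar Vm operand is always indexed as a D-reg).
 */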
static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm,
                         int data, gen_helper_gvec_4 *fn_gvec)
{
    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
        return false;
    }

    /*
     * UNDEF accesses to odd registers for each bit of Q.
     * Q is 0b111 for instructions where all three operands are Q-regs;
     * when we have mixed Q- and D-reg inputs, only the bits for the
     * Q-reg operands are set.
     */
    if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    int opr_sz = q ? 16 : 8;
    tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd),
                       vfp_reg_offset(1, vn),
                       vfp_reg_offset(1, vm),
                       vfp_reg_offset(1, vd),
                       opr_sz, opr_sz, data, fn_gvec);
    return true;
}

static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm,
                              int data, ARMFPStatusFlavour fp_flavour,
                              gen_helper_gvec_4_ptr *fn_gvec_ptr)
{
    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
        return false;
    }

    /*
     * UNDEF accesses to odd registers for each bit of Q.
     * Q is 0b111 for instructions where all three operands are Q-regs;
     * when we have mixed Q- and D-reg inputs, only the bits for the
     * Q-reg operands are set.
     */
    if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    int opr_sz = q ? 16 : 8;
    TCGv_ptr fpst = fpstatus_ptr(fp_flavour);

    tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd),
                       vfp_reg_offset(1, vn),
                       vfp_reg_offset(1, vm),
                       vfp_reg_offset(1, vd),
                       fpst, opr_sz, opr_sz, data, fn_gvec_ptr);
    return true;
}

static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
{
    if (!dc_isar_feature(aa32_vcma, s)) {
        return false;
    }
    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
                                 FPST_STD_F16, gen_helper_gvec_fcmlah);
    }
    return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
                             FPST_STD, gen_helper_gvec_fcmlas);
}

static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
{
    int opr_sz;
    TCGv_ptr fpst;
    gen_helper_gvec_3_ptr *fn_gvec_ptr;

    if (!dc_isar_feature(aa32_vcma, s)
        || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
    fn_gvec_ptr = (a->size == MO_16) ?
        gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz, a->rot,
                       fn_gvec_ptr);
    return true;
}

static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_sdot_b);
}

static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_udot_b);
}

static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_usdot_b);
}

static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
{
    if (!dc_isar_feature(aa32_bf16, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_bfdot);
}

static bool trans_VFML(DisasContext *s, arg_VFML *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        (a->vd & 0x10)) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->vm),
                       tcg_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_a32);
    return true;
}

static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
{
    int data = (a->index << 2) | a->rot;

    if (!dc_isar_feature(aa32_vcma, s)) {
        return false;
    }
    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
                                 FPST_STD_F16, gen_helper_gvec_fcmlah_idx);
    }
    return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
                             FPST_STD, gen_helper_gvec_fcmlas_idx);
}

static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_sdot_idx_b);
}

static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_udot_idx_b);
}

static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_usdot_idx_b);
}

static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_sudot_idx_b);
}

static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a)
{
    if (!dc_isar_feature(aa32_bf16, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_bfdot_idx);
}

static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->rm),
                       tcg_env, opr_sz, opr_sz,
                       (a->index << 2) | a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_idx_a32);
    return true;
}

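/*
 * Table of load/store layouts, indexed by the itype field of the
 * instruction: 'interleave' is the number of registers each structure
 * spans and 'spacing' is the register stride between its elements,
 * while 'nregs' is the number of structure groups transferred.
 * E.g. itype 7 ({1, 1, 1}) is a one-register VLD1/VST1, and itype 0
 * ({1, 4, 1}) is VLD4/VST4 over four consecutive D-regs.
 */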
static struct {
    int nregs;
    int interleave;
    int spacing;
} const neon_ls_element_type[11] = {
    {1, 4, 1},
    {1, 4, 2},
    {4, 1, 1},
    {2, 2, 2},
    {1, 3, 1},
    {1, 3, 2},
    {3, 1, 1},
    {1, 1, 1},
    {1, 2, 1},
    {1, 2, 2},
    {2, 1, 1}
};

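/*
 * Base register writeback: rm == 15 means no writeback; rm == 13 means
 * post-increment by the transfer size (passed in as 'stride'); any
 * other rm means post-increment by that general-purpose register.
 */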
static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
                                      int stride)
{
    if (rm != 15) {
        TCGv_i32 base;

        base = load_reg(s, rn);
        if (rm == 13) {
            tcg_gen_addi_i32(base, base, stride);
        } else {
            TCGv_i32 index;
            index = load_reg(s, rm);
            tcg_gen_add_i32(base, base, index);
        }
        store_reg(s, rn, base);
    }
}

static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
{
    /* Neon load/store multiple structures */
    int nregs, interleave, spacing, reg, n;
    MemOp mop, align, endian;
    int mmu_idx = get_mem_index(s);
    int size = a->size;
    TCGv_i64 tmp64;
    TCGv_i32 addr;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }
    if (a->itype > 10) {
        return false;
    }
    /* Catch UNDEF cases for bad values of align field */
    switch (a->itype & 0xc) {
    case 4:
        if (a->align >= 2) {
            return false;
        }
        break;
    case 8:
        if (a->align == 3) {
            return false;
        }
        break;
    default:
        break;
    }
    nregs = neon_ls_element_type[a->itype].nregs;
    interleave = neon_ls_element_type[a->itype].interleave;
    spacing = neon_ls_element_type[a->itype].spacing;
    if (size == 3 && (interleave | spacing) != 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* For our purposes, bytes are always little-endian.  */
    endian = s->be_data;
    if (size == 0) {
        endian = MO_LE;
    }

    /* Enforce alignment requested by the instruction */
    if (a->align) {
        align = pow2_align(a->align + 2); /* 4 << a->align */
    } else {
        align = s->align_mem ? MO_ALIGN : 0;
    }

    /*
     * Consecutive little-endian elements from a single register
     * can be promoted to a larger little-endian operation.
     */
    if (interleave == 1 && endian == MO_LE) {
        /* Retain any natural alignment. */
        if (align == MO_ALIGN) {
            align = pow2_align(size);
        }
        size = 3;
    }
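
    /*
     * E.g. a byte-wise VLD1 (interleave == 1) is then performed as one
     * 64-bit load per register rather than eight single-byte loads.
     */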

    tmp64 = tcg_temp_new_i64();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);

    mop = endian | size | align;
    for (reg = 0; reg < nregs; reg++) {
        for (n = 0; n < 8 >> size; n++) {
            int xs;
            for (xs = 0; xs < interleave; xs++) {
                int tt = a->vd + reg + spacing * xs;

                if (a->l) {
                    gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop);
                    neon_store_element64(tt, n, size, tmp64);
                } else {
                    neon_load_element64(tmp64, tt, n, size);
                    gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop);
                }
                tcg_gen_addi_i32(addr, addr, 1 << size);

                /* Subsequent memory operations inherit alignment */
                mop &= ~MO_AMASK;
            }
        }
    }

    gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
    return true;
}

static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
{
    /* Neon load single structure to all lanes */
    int reg, stride, vec_size;
    int vd = a->vd;
    int size = a->size;
    int nregs = a->n + 1;
    TCGv_i32 addr, tmp;
    MemOp mop, align;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    align = 0;
    if (size == 3) {
        if (nregs != 4 || a->a == 0) {
            return false;
        }
        /* For VLD4, size == 3 with a == 1 means 32 bits at 16-byte alignment */
        size = MO_32;
        align = MO_ALIGN_16;
    } else if (a->a) {
        switch (nregs) {
        case 1:
            if (size == 0) {
                return false;
            }
            align = MO_ALIGN;
            break;
        case 2:
            align = pow2_align(size + 1);
            break;
        case 3:
            return false;
        case 4:
            if (size == 2) {
                align = pow2_align(3);
            } else {
                align = pow2_align(size + 2);
            }
            break;
        default:
            g_assert_not_reached();
        }
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * VLD1 to all lanes: T bit indicates how many Dregs to write.
     * VLD2/3/4 to all lanes: T bit indicates register stride.
     */
    stride = a->t ? 2 : 1;
    vec_size = nregs == 1 ? stride * 8 : 8;
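    /*
     * E.g. a two-register VLD1 to all lanes has nregs == 1 and t == 1,
     * so the one loaded element is duplicated across vec_size == 16
     * bytes, covering both D-regs.
     */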
    mop = size | align;
    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);
    for (reg = 0; reg < nregs; reg++) {
        gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop);
        if ((vd & 1) && vec_size == 16) {
            /*
             * We cannot write 16 bytes at once because the
             * destination is unaligned.
             */
            tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
                                 8, 8, tmp);
            tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
                             neon_full_reg_offset(vd), 8, 8);
        } else {
            tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
                                 vec_size, vec_size, tmp);
        }
        tcg_gen_addi_i32(addr, addr, 1 << size);
        vd += stride;

        /* Subsequent memory operations inherit alignment */
        mop &= ~MO_AMASK;
    }

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);

    return true;
}

static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
{
    /* Neon load/store single structure to one lane */
    int reg;
    int nregs = a->n + 1;
    int vd = a->vd;
    TCGv_i32 addr, tmp;
    MemOp mop;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    /* Catch the UNDEF cases. This is unavoidably a bit messy. */
    switch (nregs) {
    case 1:
        if (a->stride != 1) {
            return false;
        }
        if (((a->align & (1 << a->size)) != 0) ||
            (a->size == 2 && (a->align == 1 || a->align == 2))) {
            return false;
        }
        break;
    case 2:
        if (a->size == 2 && (a->align & 2) != 0) {
            return false;
        }
        break;
    case 3:
        if (a->align != 0) {
            return false;
        }
        break;
    case 4:
        if (a->size == 2 && a->align == 3) {
            return false;
        }
        break;
    default:
        g_assert_not_reached();
    }
    if ((vd + a->stride * (nregs - 1)) > 31) {
        /*
         * Attempts to write off the end of the register file are
         * UNPREDICTABLE; we choose to UNDEF because otherwise we would
         * access off the end of the array that holds the register data.
         */
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* Pick up SCTLR settings */
    mop = finalize_memop(s, a->size);

    if (a->align) {
        MemOp align_op;

        switch (nregs) {
        case 1:
            /* For VLD1, use natural alignment. */
            align_op = MO_ALIGN;
            break;
        case 2:
            /* For VLD2, use double alignment. */
            align_op = pow2_align(a->size + 1);
            break;
        case 4:
            if (a->size == MO_32) {
                /*
                 * For VLD4.32, align = 1 is double alignment, align = 2 is
                 * quad alignment; align = 3 is rejected above.
                 */
                align_op = pow2_align(a->size + a->align);
            } else {
                /* For VLD4.8 and VLD4.16, we want quad alignment. */
                align_op = pow2_align(a->size + 2);
            }
            break;
        default:
            /* For VLD3, the alignment field is zero and rejected above. */
            g_assert_not_reached();
        }

        mop = (mop & ~MO_AMASK) | align_op;
    }

    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);

    for (reg = 0; reg < nregs; reg++) {
        if (a->l) {
            gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
            neon_store_element(vd, a->reg_idx, a->size, tmp);
        } else { /* Store */
            neon_load_element(tmp, vd, a->reg_idx, a->size);
            gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
        }
        vd += a->stride;
        tcg_gen_addi_i32(addr, addr, 1 << a->size);

        /* Subsequent memory operations inherit alignment */
        mop &= ~MO_AMASK;
    }

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);

    return true;
}

static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
{
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_full_reg_offset(a->vd);
    int rn_ofs = neon_full_reg_offset(a->vn);
    int rm_ofs = neon_full_reg_offset(a->vm);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
    return true;
}

#define DO_3SAME(INSN, FUNC)                                            \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        return do_3same(s, a, FUNC);                                    \
    }
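
/*
 * For instance, DO_3SAME(VADD, tcg_gen_gvec_add) below expands to:
 *
 *     static bool trans_VADD_3s(DisasContext *s, arg_3same *a)
 *     {
 *         return do_3same(s, a, tcg_gen_gvec_add);
 *     }
 */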

DO_3SAME(VADD, tcg_gen_gvec_add)
DO_3SAME(VSUB, tcg_gen_gvec_sub)
DO_3SAME(VAND, tcg_gen_gvec_and)
DO_3SAME(VBIC, tcg_gen_gvec_andc)
DO_3SAME(VORR, tcg_gen_gvec_or)
DO_3SAME(VORN, tcg_gen_gvec_orc)
DO_3SAME(VEOR, tcg_gen_gvec_xor)
DO_3SAME(VSHL_S, gen_gvec_sshl)
DO_3SAME(VSHL_U, gen_gvec_ushl)
DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)

/* These insns are all gvec_bitsel but with the inputs in various orders. */
#define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
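
/*
 * tcg_gen_gvec_bitsel(vece, d, a, b, c) computes d = (b & a) | (c & ~a),
 * with the first source operand (O1 here) acting as the bit mask: so
 * VBSL selects with the destination, VBIT and VBIF select with Vm.
 */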

#define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size == 3) {                                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)
DO_3SAME_NO_SZ_3(VPADD, gen_gvec_addp)
DO_3SAME_NO_SZ_3(VPMAX_S, gen_gvec_smaxp)
DO_3SAME_NO_SZ_3(VPMIN_S, gen_gvec_sminp)
DO_3SAME_NO_SZ_3(VPMAX_U, gen_gvec_umaxp)
DO_3SAME_NO_SZ_3(VPMIN_U, gen_gvec_uminp)

#define DO_3SAME_CMP(INSN, COND)                                        \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
    }                                                                   \
    DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)

DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
DO_3SAME_CMP(VCEQ, TCG_COND_EQ)

#define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
                         uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
    {                                                                      \
        tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
    }

WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)

static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
{
    if (a->size != 0) {
        return false;
    }
    return do_3same(s, a, gen_VMUL_p_3s);
}

#define DO_VQRDMLAH(INSN, FUNC)                                         \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_rdm, s)) {                            \
            return false;                                               \
        }                                                               \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)

#define DO_SHA1(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha1, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)

#define DO_SHA2(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha2, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)

#define DO_3SAME_64(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 op = { .fni8 = FUNC };                    \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

#define DO_3SAME_64_ENV(INSN, FUNC)                                     \
    static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
    {                                                                   \
        FUNC(d, tcg_env, n, m);                                         \
    }                                                                   \
    DO_3SAME_64(INSN, gen_##INSN##_elt)

DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)

#define DO_3SAME_32(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_helper_neon_##FUNC##8 },                      \
            { .fni4 = gen_helper_neon_##FUNC##16 },                     \
            { .fni4 = gen_helper_neon_##FUNC##32 },                     \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

/*
 * Some helper functions need to be passed the tcg_env. In order
 * to use those with the gvec APIs like tcg_gen_gvec_3() we need
 * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
 * and which call a NeonGenTwoOpEnvFn().
 */
#define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
    static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
    {                                                                   \
        FUNC(d, tcg_env, n, m);                                         \
    }
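
/*
 * E.g. WRAP_ENV_FN(gen_VQSHL_S_tramp8, gen_helper_neon_qshl_s8), as
 * generated by DO_3SAME_32_ENV below, defines:
 *
 *     static void gen_VQSHL_S_tramp8(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)
 *     {
 *         gen_helper_neon_qshl_s8(d, tcg_env, n, m);
 *     }
 */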

#define DO_3SAME_32_ENV(INSN, FUNC)                                     \
    WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_##INSN##_tramp8 },                            \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_32(VHADD_S, hadd_s)
DO_3SAME_32(VHADD_U, hadd_u)
DO_3SAME_32(VHSUB_S, hsub_s)
DO_3SAME_32(VHSUB_U, hsub_u)
DO_3SAME_32(VRHADD_S, rhadd_s)
DO_3SAME_32(VRHADD_U, rhadd_u)
DO_3SAME_32(VRSHL_S, rshl_s)
DO_3SAME_32(VRSHL_U, rshl_u)

DO_3SAME_32_ENV(VQSHL_S, qshl_s)
DO_3SAME_32_ENV(VQSHL_U, qshl_u)
DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)

#define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[2] = {                                \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_VQDMULH(VQDMULH, qdmulh)
DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)

#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rn_ofs, uint32_t rm_ofs,              \
                         uint32_t oprsz, uint32_t maxsz)                \
    {                                                                   \
        TCGv_ptr fpst = fpstatus_ptr(FPST);                             \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst,                \
                           oprsz, maxsz, 0, FUNC);                      \
    }

#define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
    WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
    WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
    {                                                                   \
        if (a->size == MO_16) {                                         \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
                return false;                                           \
            }                                                           \
            return do_3same(s, a, gen_##INSN##_fp16_3s);                \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
    }


DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)
DO_3S_FP_GVEC(VPADD, gen_helper_gvec_faddp_s, gen_helper_gvec_faddp_h)
DO_3S_FP_GVEC(VPMAX, gen_helper_gvec_fmaxp_s, gen_helper_gvec_fmaxp_h)
DO_3S_FP_GVEC(VPMIN, gen_helper_gvec_fminp_s, gen_helper_gvec_fminp_h)

WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)

static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_3same(s, a, gen_VMAXNM_fp16_3s);
    }
    return do_3same(s, a, gen_VMAXNM_fp32_3s);
}

static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_3same(s, a, gen_VMINNM_fp16_3s);
    }
    return do_3same(s, a, gen_VMINNM_fp32_3s);
}

static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
{
    /* Handle a 2-reg-shift insn which can be vectorized. */
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_full_reg_offset(a->vd);
    int rm_ofs = neon_full_reg_offset(a->vm);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
    return true;
}

#define DO_2SH(INSN, FUNC)                                              \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_vector_2sh(s, a, FUNC);                               \
    }                                                                   \

DO_2SH(VSHL, tcg_gen_gvec_shli)
DO_2SH(VSLI, gen_gvec_sli)
DO_2SH(VSRI, gen_gvec_sri)
DO_2SH(VSRA_S, gen_gvec_ssra)
DO_2SH(VSRA_U, gen_gvec_usra)
DO_2SH(VRSHR_S, gen_gvec_srshr)
DO_2SH(VRSHR_U, gen_gvec_urshr)
DO_2SH(VRSRA_S, gen_gvec_srsra)
DO_2SH(VRSRA_U, gen_gvec_ursra)

static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Signed shift out of range results in all-sign-bits */
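    /* E.g. VSHR.S8 #8 is performed as VSHR.S8 #7, which gives the same result. */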
    a->shift = MIN(a->shift, (8 << a->size) - 1);
    return do_vector_2sh(s, a, tcg_gen_gvec_sari);
}

static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                            int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
}

static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Shift out of range is architecturally valid and results in zero. */
    if (a->shift >= (8 << a->size)) {
        return do_vector_2sh(s, a, gen_zero_rd_2sh);
    } else {
        return do_vector_2sh(s, a, tcg_gen_gvec_shri);
    }
}

static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
                             NeonGenTwo64OpEnvFn *fn)
{
    /*
     * 2-reg-and-shift operations, size == 3 case, where the
     * function needs to be passed tcg_env.
     */
    TCGv_i64 constimm;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * To avoid excessive duplication of ops we implement shift
     * by immediate using the variable shift operations.
     */
    constimm = tcg_constant_i64(dup_const(a->size, a->shift));

    for (pass = 0; pass < a->q + 1; pass++) {
        TCGv_i64 tmp = tcg_temp_new_i64();

        read_neon_element64(tmp, a->vm, pass, MO_64);
        fn(tmp, tcg_env, tmp, constimm);
        write_neon_element64(tmp, a->vd, pass, MO_64);
    }
    return true;
}

static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
                             NeonGenTwoOpEnvFn *fn)
{
    /*
     * 2-reg-and-shift operations, size < 3 case, where the
     * helper needs to be passed tcg_env.
     */
    TCGv_i32 constimm, tmp;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * To avoid excessive duplication of ops we implement shift
     * by immediate using the variable shift operations.
     */
    constimm = tcg_constant_i32(dup_const(a->size, a->shift));
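    /* E.g. size == 1, shift == 3 gives constimm == 0x00030003. */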
    tmp = tcg_temp_new_i32();

    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
        read_neon_element32(tmp, a->vm, pass, MO_32);
        fn(tmp, tcg_env, tmp, constimm);
        write_neon_element32(tmp, a->vd, pass, MO_32);
    }
    return true;
}

#define DO_2SHIFT_ENV(INSN, FUNC)                                       \
    static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
    {                                                                   \
        return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
    }                                                                   \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        static NeonGenTwoOpEnvFn * const fns[] = {                      \
            gen_helper_neon_##FUNC##8,                                  \
            gen_helper_neon_##FUNC##16,                                 \
            gen_helper_neon_##FUNC##32,                                 \
        };                                                              \
        assert(a->size < ARRAY_SIZE(fns));                              \
        return do_2shift_env_32(s, a, fns[a->size]);                    \
    }

DO_2SHIFT_ENV(VQSHLU, qshlu_s)
DO_2SHIFT_ENV(VQSHL_U, qshl_u)
DO_2SHIFT_ENV(VQSHL_S, qshl_s)

static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
                                NeonGenTwo64OpFn *shiftfn,
                                NeonGenNarrowEnvFn *narrowfn)
{
    /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
    TCGv_i64 constimm, rm1, rm2;
    TCGv_i32 rd;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vm & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * This is always a right shift, and the shiftfn is always a
     * left-shift helper, which thus needs the negated shift count.
     */
    constimm = tcg_constant_i64(-a->shift);
    rm1 = tcg_temp_new_i64();
    rm2 = tcg_temp_new_i64();
    rd = tcg_temp_new_i32();

    /* Load both inputs first to avoid potential overwrite if rm == rd */
    read_neon_element64(rm1, a->vm, 0, MO_64);
    read_neon_element64(rm2, a->vm, 1, MO_64);

    shiftfn(rm1, rm1, constimm);
    narrowfn(rd, tcg_env, rm1);
    write_neon_element32(rd, a->vd, 0, MO_32);

    shiftfn(rm2, rm2, constimm);
    narrowfn(rd, tcg_env, rm2);
    write_neon_element32(rd, a->vd, 1, MO_32);

    return true;
}

static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
                                NeonGenTwoOpFn *shiftfn,
                                NeonGenNarrowEnvFn *narrowfn)
{
    /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
    TCGv_i32 constimm, rm1, rm2, rm3, rm4;
    TCGv_i64 rtmp;
    uint32_t imm;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vm & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * This is always a right shift, and the shiftfn is always a
     * left-shift helper, which thus needs the negated shift count
     * duplicated into each lane of the immediate value.
     */
    if (a->size == 1) {
        imm = (uint16_t)(-a->shift);
        imm |= imm << 16;
    } else {
        /* size == 2 */
        imm = -a->shift;
    }
    constimm = tcg_constant_i32(imm);
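    /* E.g. size == 1, shift == 5 gives imm == 0xfffbfffb, i.e. -5 per lane. */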

    /* Load all inputs first to avoid potential overwrite */
    rm1 = tcg_temp_new_i32();
    rm2 = tcg_temp_new_i32();
    rm3 = tcg_temp_new_i32();
    rm4 = tcg_temp_new_i32();
    read_neon_element32(rm1, a->vm, 0, MO_32);
    read_neon_element32(rm2, a->vm, 1, MO_32);
    read_neon_element32(rm3, a->vm, 2, MO_32);
    read_neon_element32(rm4, a->vm, 3, MO_32);
    rtmp = tcg_temp_new_i64();

    shiftfn(rm1, rm1, constimm);
    shiftfn(rm2, rm2, constimm);

    tcg_gen_concat_i32_i64(rtmp, rm1, rm2);

    narrowfn(rm1, tcg_env, rtmp);
    write_neon_element32(rm1, a->vd, 0, MO_32);

    shiftfn(rm3, rm3, constimm);
    shiftfn(rm4, rm4, constimm);

    tcg_gen_concat_i32_i64(rtmp, rm3, rm4);

    narrowfn(rm3, tcg_env, rtmp);
    write_neon_element32(rm3, a->vd, 1, MO_32);
    return true;
}

#define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
    }
#define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
    }

static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    tcg_gen_extrl_i64_i32(dest, src);
}

static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    gen_helper_neon_narrow_u16(dest, src);
}

static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    gen_helper_neon_narrow_u8(dest, src);
}

DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)

DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)

DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)

DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)

DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)

DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)

DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)

static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
                         NeonGenWidenFn *widenfn, bool u)
{
    TCGv_i64 tmp;
    TCGv_i32 rm0, rm1;
    uint64_t widen_mask = 0;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vd & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

1492     /*
1493      * This is a widen-and-shift operation. The shift is always less
1494      * than the width of the source type, so after widening the input
1495      * vector we can simply shift the whole 64-bit widened register,
1496      * then clear the bits that cross an element boundary: the high
1497      * (left) bits of each narrow input would otherwise appear as the
1498      * low (right) bits of the next lane up. Calculate a mask of bits to clear.
1499      */
1500     if ((a->shift != 0) && (a->size < 2 || u)) {
1501         int esize = 8 << a->size;
1502         widen_mask = MAKE_64BIT_MASK(0, esize);
1503         widen_mask >>= esize - a->shift;
1504         widen_mask = dup_const(a->size + 1, widen_mask);
1505     }
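
    /*
     * Worked example (illustrative): VSHLL.S8 with shift == 3 gives
     * esize == 8, widen_mask == 0xff >> 5 == 0x07, and
     * dup_const(MO_16, 0x07) == 0x0007000700070007.  After the widened
     * 16-bit lanes are shifted left by 3, ANDing with ~widen_mask
     * clears the three sign bits that spilled from each lane into the
     * low bits of its upper neighbour.
     */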
1506 
1507     rm0 = tcg_temp_new_i32();
1508     rm1 = tcg_temp_new_i32();
1509     read_neon_element32(rm0, a->vm, 0, MO_32);
1510     read_neon_element32(rm1, a->vm, 1, MO_32);
1511     tmp = tcg_temp_new_i64();
1512 
1513     widenfn(tmp, rm0);
1514     if (a->shift != 0) {
1515         tcg_gen_shli_i64(tmp, tmp, a->shift);
1516         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1517     }
1518     write_neon_element64(tmp, a->vd, 0, MO_64);
1519 
1520     widenfn(tmp, rm1);
1521     if (a->shift != 0) {
1522         tcg_gen_shli_i64(tmp, tmp, a->shift);
1523         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1524     }
1525     write_neon_element64(tmp, a->vd, 1, MO_64);
1526     return true;
1527 }
1528 
1529 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1530 {
1531     static NeonGenWidenFn * const widenfn[] = {
1532         gen_helper_neon_widen_s8,
1533         gen_helper_neon_widen_s16,
1534         tcg_gen_ext_i32_i64,
1535     };
1536     return do_vshll_2sh(s, a, widenfn[a->size], false);
1537 }
1538 
1539 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1540 {
1541     static NeonGenWidenFn * const widenfn[] = {
1542         gen_helper_neon_widen_u8,
1543         gen_helper_neon_widen_u16,
1544         tcg_gen_extu_i32_i64,
1545     };
1546     return do_vshll_2sh(s, a, widenfn[a->size], true);
1547 }
1548 
1549 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1550                       gen_helper_gvec_2_ptr *fn)
1551 {
1552     /* FP operations in 2-reg-and-shift group */
1553     int vec_size = a->q ? 16 : 8;
1554     int rd_ofs = neon_full_reg_offset(a->vd);
1555     int rm_ofs = neon_full_reg_offset(a->vm);
1556     TCGv_ptr fpst;
1557 
1558     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1559         return false;
1560     }
1561 
1562     if (a->size == MO_16) {
1563         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1564             return false;
1565         }
1566     }
1567 
1568     /* UNDEF accesses to D16-D31 if they don't exist. */
1569     if (!dc_isar_feature(aa32_simd_r32, s) &&
1570         ((a->vd | a->vm) & 0x10)) {
1571         return false;
1572     }
1573 
1574     if ((a->vm | a->vd) & a->q) {
1575         return false;
1576     }
1577 
1578     if (!vfp_access_check(s)) {
1579         return true;
1580     }
1581 
1582     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1583     tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
1584     return true;
1585 }
1586 
1587 #define DO_FP_2SH(INSN, FUNC)                                           \
1588     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1589     {                                                                   \
1590         return do_fp_2sh(s, a, FUNC);                                   \
1591     }
1592 
1593 DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
1594 DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
1595 DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
1596 DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
1597 
1598 DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
1599 DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
1600 DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
1601 DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1602 
1603 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1604                         GVecGen2iFn *fn)
1605 {
1606     uint64_t imm;
1607     int reg_ofs, vec_size;
1608 
1609     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1610         return false;
1611     }
1612 
1613     /* UNDEF accesses to D16-D31 if they don't exist. */
1614     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1615         return false;
1616     }
1617 
1618     if (a->vd & a->q) {
1619         return false;
1620     }
1621 
1622     if (!vfp_access_check(s)) {
1623         return true;
1624     }
1625 
1626     reg_ofs = neon_full_reg_offset(a->vd);
1627     vec_size = a->q ? 16 : 8;
1628     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1629 
1630     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1631     return true;
1632 }
1633 
1634 static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1635                         int64_t c, uint32_t oprsz, uint32_t maxsz)
1636 {
1637     tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1638 }
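
/*
 * Note (illustrative): gen_VMOV_1r deliberately ignores 'vece' and
 * 'aofs' and uses only the 64-bit constant: VMOV (immediate) has no
 * source operand, it just duplicates the expanded immediate across the
 * destination.
 */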
1639 
1640 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1641 {
1642     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1643     GVecGen2iFn *fn;
1644 
1645     if ((a->cmode & 1) && a->cmode < 12) {
1646         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1647         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1648     } else {
1649         /* There is one unallocated cmode/op combination in this space */
1650         if (a->cmode == 15 && a->op == 1) {
1651             return false;
1652         }
1653         fn = gen_VMOV_1r;
1654     }
1655     return do_1reg_imm(s, a, fn);
1656 }
1657 
1658 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1659                            NeonGenWidenFn *widenfn,
1660                            NeonGenTwo64OpFn *opfn,
1661                            int src1_mop, int src2_mop)
1662 {
1663     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */
1664     TCGv_i64 rn0_64, rn1_64, rm_64;
1665 
1666     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1667         return false;
1668     }
1669 
1670     /* UNDEF accesses to D16-D31 if they don't exist. */
1671     if (!dc_isar_feature(aa32_simd_r32, s) &&
1672         ((a->vd | a->vn | a->vm) & 0x10)) {
1673         return false;
1674     }
1675 
1676     if (!opfn) {
1677         /* size == 3 case, which is an entirely different insn group */
1678         return false;
1679     }
1680 
1681     if ((a->vd & 1) || (src1_mop == MO_UQ && (a->vn & 1))) {
1682         return false;
1683     }
1684 
1685     if (!vfp_access_check(s)) {
1686         return true;
1687     }
1688 
1689     rn0_64 = tcg_temp_new_i64();
1690     rn1_64 = tcg_temp_new_i64();
1691     rm_64 = tcg_temp_new_i64();
1692 
1693     if (src1_mop >= 0) {
1694         read_neon_element64(rn0_64, a->vn, 0, src1_mop);
1695     } else {
1696         TCGv_i32 tmp = tcg_temp_new_i32();
1697         read_neon_element32(tmp, a->vn, 0, MO_32);
1698         widenfn(rn0_64, tmp);
1699     }
1700     if (src2_mop >= 0) {
1701         read_neon_element64(rm_64, a->vm, 0, src2_mop);
1702     } else {
1703         TCGv_i32 tmp = tcg_temp_new_i32();
1704         read_neon_element32(tmp, a->vm, 0, MO_32);
1705         widenfn(rm_64, tmp);
1706     }
1707 
1708     opfn(rn0_64, rn0_64, rm_64);
1709 
1710     /*
1711      * Load second pass inputs before storing the first pass result, to
1712      * avoid incorrect results if a narrow input overlaps with the result.
1713      */
1714     if (src1_mop >= 0) {
1715         read_neon_element64(rn1_64, a->vn, 1, src1_mop);
1716     } else {
1717         TCGv_i32 tmp = tcg_temp_new_i32();
1718         read_neon_element32(tmp, a->vn, 1, MO_32);
1719         widenfn(rn1_64, tmp);
1720     }
1721     if (src2_mop >= 0) {
1722         read_neon_element64(rm_64, a->vm, 1, src2_mop);
1723     } else {
1724         TCGv_i32 tmp = tcg_temp_new_i32();
1725         read_neon_element32(tmp, a->vm, 1, MO_32);
1726         widenfn(rm_64, tmp);
1727     }
1728 
1729     write_neon_element64(rn0_64, a->vd, 0, MO_64);
1730 
1731     opfn(rn1_64, rn1_64, rm_64);
1732     write_neon_element64(rn1_64, a->vd, 1, MO_64);
1733 
1734     return true;
1735 }
1736 
1737 #define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN)                        \
1738     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1739     {                                                                   \
1740         static NeonGenWidenFn * const widenfn[] = {                     \
1741             gen_helper_neon_widen_##S##8,                               \
1742             gen_helper_neon_widen_##S##16,                              \
1743             NULL, NULL,                                                 \
1744         };                                                              \
1745         static NeonGenTwo64OpFn * const addfn[] = {                     \
1746             gen_helper_neon_##OP##l_u16,                                \
1747             gen_helper_neon_##OP##l_u32,                                \
1748             tcg_gen_##OP##_i64,                                         \
1749             NULL,                                                       \
1750         };                                                              \
1751         int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1;          \
1752         return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size],   \
1753                               SRC1WIDE ? MO_UQ : narrow_mop,            \
1754                               narrow_mop);                              \
1755     }
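
/*
 * For illustration: DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN) gives a
 * trans_VADDW_S_3d() that calls do_prewiden_3d() with src1_mop == MO_UQ
 * (VADDW's first operand is already wide), while the narrow operand uses
 * src2_mop == (MO_32 | MO_SIGN) for size == MO_32, reading it directly
 * as a sign-extended element; for the 8- and 16-bit sizes src2_mop is
 * -1, so the gen_helper_neon_widen_s* widenfn is used instead.
 */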
1756 
1757 DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
1758 DO_PREWIDEN(VADDL_U, u, add, false, 0)
1759 DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
1760 DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
1761 DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
1762 DO_PREWIDEN(VADDW_U, u, add, true, 0)
1763 DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
1764 DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
1765 
1766 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
1767                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
1768 {
1769     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
1770     TCGv_i64 rn_64, rm_64;
1771     TCGv_i32 rd0, rd1;
1772 
1773     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1774         return false;
1775     }
1776 
1777     /* UNDEF accesses to D16-D31 if they don't exist. */
1778     if (!dc_isar_feature(aa32_simd_r32, s) &&
1779         ((a->vd | a->vn | a->vm) & 0x10)) {
1780         return false;
1781     }
1782 
1783     if (!opfn || !narrowfn) {
1784         /* size == 3 case, which is an entirely different insn group */
1785         return false;
1786     }
1787 
1788     if ((a->vn | a->vm) & 1) {
1789         return false;
1790     }
1791 
1792     if (!vfp_access_check(s)) {
1793         return true;
1794     }
1795 
1796     rn_64 = tcg_temp_new_i64();
1797     rm_64 = tcg_temp_new_i64();
1798     rd0 = tcg_temp_new_i32();
1799     rd1 = tcg_temp_new_i32();
1800 
1801     read_neon_element64(rn_64, a->vn, 0, MO_64);
1802     read_neon_element64(rm_64, a->vm, 0, MO_64);
1803 
1804     opfn(rn_64, rn_64, rm_64);
1805 
1806     narrowfn(rd0, rn_64);
1807 
1808     read_neon_element64(rn_64, a->vn, 1, MO_64);
1809     read_neon_element64(rm_64, a->vm, 1, MO_64);
1810 
1811     opfn(rn_64, rn_64, rm_64);
1812 
1813     narrowfn(rd1, rn_64);
1814 
1815     write_neon_element32(rd0, a->vd, 0, MO_32);
1816     write_neon_element32(rd1, a->vd, 1, MO_32);
1817 
1818     return true;
1819 }
1820 
1821 #define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
1822     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1823     {                                                                   \
1824         static NeonGenTwo64OpFn * const addfn[] = {                     \
1825             gen_helper_neon_##OP##l_u16,                                \
1826             gen_helper_neon_##OP##l_u32,                                \
1827             tcg_gen_##OP##_i64,                                         \
1828             NULL,                                                       \
1829         };                                                              \
1830         static NeonGenNarrowFn * const narrowfn[] = {                   \
1831             gen_helper_neon_##NARROWTYPE##_high_u8,                     \
1832             gen_helper_neon_##NARROWTYPE##_high_u16,                    \
1833             EXTOP,                                                      \
1834             NULL,                                                       \
1835         };                                                              \
1836         return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]);   \
1837     }
1838 
1839 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
1840 {
1841     tcg_gen_addi_i64(rn, rn, 1u << 31);
1842     tcg_gen_extrh_i64_i32(rd, rn);
1843 }
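
/*
 * Adding 1u << 31 before taking the high half rounds to nearest instead
 * of truncating: e.g. (illustratively) an input of 0x00000001_80000000
 * narrows to 2 rather than 1.
 */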
1844 
1845 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
1846 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
1847 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
1848 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
1849 
1850 static bool do_long_3d(DisasContext *s, arg_3diff *a,
1851                        NeonGenTwoOpWidenFn *opfn,
1852                        NeonGenTwo64OpFn *accfn)
1853 {
1854     /*
1855      * 3-regs different lengths, long operations.
1856      * These perform an operation on two inputs that returns a double-width
1857      * result, and then possibly perform an accumulation operation of
1858      * that result into the double-width destination.
1859      */
1860     TCGv_i64 rd0, rd1, tmp;
1861     TCGv_i32 rn, rm;
1862 
1863     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1864         return false;
1865     }
1866 
1867     /* UNDEF accesses to D16-D31 if they don't exist. */
1868     if (!dc_isar_feature(aa32_simd_r32, s) &&
1869         ((a->vd | a->vn | a->vm) & 0x10)) {
1870         return false;
1871     }
1872 
1873     if (!opfn) {
1874         /* size == 3 case, which is an entirely different insn group */
1875         return false;
1876     }
1877 
1878     if (a->vd & 1) {
1879         return false;
1880     }
1881 
1882     if (!vfp_access_check(s)) {
1883         return true;
1884     }
1885 
1886     rd0 = tcg_temp_new_i64();
1887     rd1 = tcg_temp_new_i64();
1888 
1889     rn = tcg_temp_new_i32();
1890     rm = tcg_temp_new_i32();
1891     read_neon_element32(rn, a->vn, 0, MO_32);
1892     read_neon_element32(rm, a->vm, 0, MO_32);
1893     opfn(rd0, rn, rm);
1894 
1895     read_neon_element32(rn, a->vn, 1, MO_32);
1896     read_neon_element32(rm, a->vm, 1, MO_32);
1897     opfn(rd1, rn, rm);
1898 
1899     /* Don't store results until after all loads: they might overlap */
1900     if (accfn) {
1901         tmp = tcg_temp_new_i64();
1902         read_neon_element64(tmp, a->vd, 0, MO_64);
1903         accfn(rd0, tmp, rd0);
1904         read_neon_element64(tmp, a->vd, 1, MO_64);
1905         accfn(rd1, tmp, rd1);
1906     }
1907 
1908     write_neon_element64(rd0, a->vd, 0, MO_64);
1909     write_neon_element64(rd1, a->vd, 1, MO_64);
1910 
1911     return true;
1912 }
1913 
1914 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
1915 {
1916     static NeonGenTwoOpWidenFn * const opfn[] = {
1917         gen_helper_neon_abdl_s16,
1918         gen_helper_neon_abdl_s32,
1919         gen_helper_neon_abdl_s64,
1920         NULL,
1921     };
1922 
1923     return do_long_3d(s, a, opfn[a->size], NULL);
1924 }
1925 
1926 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
1927 {
1928     static NeonGenTwoOpWidenFn * const opfn[] = {
1929         gen_helper_neon_abdl_u16,
1930         gen_helper_neon_abdl_u32,
1931         gen_helper_neon_abdl_u64,
1932         NULL,
1933     };
1934 
1935     return do_long_3d(s, a, opfn[a->size], NULL);
1936 }
1937 
1938 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
1939 {
1940     static NeonGenTwoOpWidenFn * const opfn[] = {
1941         gen_helper_neon_abdl_s16,
1942         gen_helper_neon_abdl_s32,
1943         gen_helper_neon_abdl_s64,
1944         NULL,
1945     };
1946     static NeonGenTwo64OpFn * const addfn[] = {
1947         gen_helper_neon_addl_u16,
1948         gen_helper_neon_addl_u32,
1949         tcg_gen_add_i64,
1950         NULL,
1951     };
1952 
1953     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
1954 }
1955 
1956 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
1957 {
1958     static NeonGenTwoOpWidenFn * const opfn[] = {
1959         gen_helper_neon_abdl_u16,
1960         gen_helper_neon_abdl_u32,
1961         gen_helper_neon_abdl_u64,
1962         NULL,
1963     };
1964     static NeonGenTwo64OpFn * const addfn[] = {
1965         gen_helper_neon_addl_u16,
1966         gen_helper_neon_addl_u32,
1967         tcg_gen_add_i64,
1968         NULL,
1969     };
1970 
1971     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
1972 }
1973 
1974 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
1975 {
1976     TCGv_i32 lo = tcg_temp_new_i32();
1977     TCGv_i32 hi = tcg_temp_new_i32();
1978 
1979     tcg_gen_muls2_i32(lo, hi, rn, rm);
1980     tcg_gen_concat_i32_i64(rd, lo, hi);
1981 }
1982 
1983 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
1984 {
1985     TCGv_i32 lo = tcg_temp_new_i32();
1986     TCGv_i32 hi = tcg_temp_new_i32();
1987 
1988     tcg_gen_mulu2_i32(lo, hi, rn, rm);
1989     tcg_gen_concat_i32_i64(rd, lo, hi);
1990 }
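
/*
 * Both wrappers above build the full 64-bit product of two 32-bit
 * values: tcg_gen_muls2_i32/tcg_gen_mulu2_i32 produce the low and high
 * 32-bit halves separately, and tcg_gen_concat_i32_i64 packs them with
 * 'lo' in bits [31:0] and 'hi' in bits [63:32].
 */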
1991 
1992 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
1993 {
1994     static NeonGenTwoOpWidenFn * const opfn[] = {
1995         gen_helper_neon_mull_s8,
1996         gen_helper_neon_mull_s16,
1997         gen_mull_s32,
1998         NULL,
1999     };
2000 
2001     return do_long_3d(s, a, opfn[a->size], NULL);
2002 }
2003 
2004 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2005 {
2006     static NeonGenTwoOpWidenFn * const opfn[] = {
2007         gen_helper_neon_mull_u8,
2008         gen_helper_neon_mull_u16,
2009         gen_mull_u32,
2010         NULL,
2011     };
2012 
2013     return do_long_3d(s, a, opfn[a->size], NULL);
2014 }
2015 
2016 #define DO_VMLAL(INSN,MULL,ACC)                                         \
2017     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2018     {                                                                   \
2019         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2020             gen_helper_neon_##MULL##8,                                  \
2021             gen_helper_neon_##MULL##16,                                 \
2022             gen_##MULL##32,                                             \
2023             NULL,                                                       \
2024         };                                                              \
2025         static NeonGenTwo64OpFn * const accfn[] = {                     \
2026             gen_helper_neon_##ACC##l_u16,                               \
2027             gen_helper_neon_##ACC##l_u32,                               \
2028             tcg_gen_##ACC##_i64,                                        \
2029             NULL,                                                       \
2030         };                                                              \
2031         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2032     }
2033 
2034 DO_VMLAL(VMLAL_S,mull_s,add)
2035 DO_VMLAL(VMLAL_U,mull_u,add)
2036 DO_VMLAL(VMLSL_S,mull_s,sub)
2037 DO_VMLAL(VMLSL_U,mull_u,sub)
2038 
2039 static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2040 {
2041     gen_helper_neon_mull_s16(rd, rn, rm);
2042     gen_helper_neon_addl_saturate_s32(rd, tcg_env, rd, rd);
2043 }
2044 
2045 static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2046 {
2047     gen_mull_s32(rd, rn, rm);
2048     gen_helper_neon_addl_saturate_s64(rd, tcg_env, rd, rd);
2049 }
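
/*
 * The "doubling" in VQDMULL comes from the saturating addition of the
 * product to itself: gen_helper_neon_addl_saturate_s32/_s64 computes
 * rd + rd with signed saturation, setting the QC flag if it saturates.
 */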
2050 
2051 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2052 {
2053     static NeonGenTwoOpWidenFn * const opfn[] = {
2054         NULL,
2055         gen_VQDMULL_16,
2056         gen_VQDMULL_32,
2057         NULL,
2058     };
2059 
2060     return do_long_3d(s, a, opfn[a->size], NULL);
2061 }
2062 
2063 static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2064 {
2065     gen_helper_neon_addl_saturate_s32(rd, tcg_env, rn, rm);
2066 }
2067 
2068 static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2069 {
2070     gen_helper_neon_addl_saturate_s64(rd, tcg_env, rn, rm);
2071 }
2072 
2073 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2074 {
2075     static NeonGenTwoOpWidenFn * const opfn[] = {
2076         NULL,
2077         gen_VQDMULL_16,
2078         gen_VQDMULL_32,
2079         NULL,
2080     };
2081     static NeonGenTwo64OpFn * const accfn[] = {
2082         NULL,
2083         gen_VQDMLAL_acc_16,
2084         gen_VQDMLAL_acc_32,
2085         NULL,
2086     };
2087 
2088     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2089 }
2090 
2091 static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2092 {
2093     gen_helper_neon_negl_u32(rm, rm);
2094     gen_helper_neon_addl_saturate_s32(rd, tcg_env, rn, rm);
2095 }
2096 
2097 static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2098 {
2099     tcg_gen_neg_i64(rm, rm);
2100     gen_helper_neon_addl_saturate_s64(rd, tcg_env, rn, rm);
2101 }
2102 
2103 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2104 {
2105     static NeonGenTwoOpWidenFn * const opfn[] = {
2106         NULL,
2107         gen_VQDMULL_16,
2108         gen_VQDMULL_32,
2109         NULL,
2110     };
2111     static NeonGenTwo64OpFn * const accfn[] = {
2112         NULL,
2113         gen_VQDMLSL_acc_16,
2114         gen_VQDMLSL_acc_32,
2115         NULL,
2116     };
2117 
2118     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2119 }
2120 
2121 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2122 {
2123     gen_helper_gvec_3 *fn_gvec;
2124 
2125     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2126         return false;
2127     }
2128 
2129     /* UNDEF accesses to D16-D31 if they don't exist. */
2130     if (!dc_isar_feature(aa32_simd_r32, s) &&
2131         ((a->vd | a->vn | a->vm) & 0x10)) {
2132         return false;
2133     }
2134 
2135     if (a->vd & 1) {
2136         return false;
2137     }
2138 
2139     switch (a->size) {
2140     case 0:
2141         fn_gvec = gen_helper_neon_pmull_h;
2142         break;
2143     case 2:
2144         if (!dc_isar_feature(aa32_pmull, s)) {
2145             return false;
2146         }
2147         fn_gvec = gen_helper_gvec_pmull_q;
2148         break;
2149     default:
2150         return false;
2151     }
2152 
2153     if (!vfp_access_check(s)) {
2154         return true;
2155     }
2156 
2157     tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
2158                        neon_full_reg_offset(a->vn),
2159                        neon_full_reg_offset(a->vm),
2160                        16, 16, 0, fn_gvec);
2161     return true;
2162 }
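
/*
 * VMULL.P8 (size 0) is part of baseline Neon, while VMULL.P64 (size 2)
 * belongs to the crypto extension, hence the extra aa32_pmull check.
 */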
2163 
2164 static void gen_neon_dup_low16(TCGv_i32 var)
2165 {
2166     TCGv_i32 tmp = tcg_temp_new_i32();
2167     tcg_gen_ext16u_i32(var, var);
2168     tcg_gen_shli_i32(tmp, var, 16);
2169     tcg_gen_or_i32(var, var, tmp);
2170 }
2171 
2172 static void gen_neon_dup_high16(TCGv_i32 var)
2173 {
2174     TCGv_i32 tmp = tcg_temp_new_i32();
2175     tcg_gen_andi_i32(var, var, 0xffff0000);
2176     tcg_gen_shri_i32(tmp, var, 16);
2177     tcg_gen_or_i32(var, var, tmp);
2178 }
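
/*
 * E.g. (illustratively) gen_neon_dup_low16 turns 0xAAAABBBB into
 * 0xBBBBBBBB and gen_neon_dup_high16 turns it into 0xAAAAAAAA: each
 * broadcasts one 16-bit half across the 32-bit value.
 */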
2179 
2180 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2181 {
2182     TCGv_i32 tmp = tcg_temp_new_i32();
2183     if (size == MO_16) {
2184         read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
2185         if (reg & 8) {
2186             gen_neon_dup_high16(tmp);
2187         } else {
2188             gen_neon_dup_low16(tmp);
2189         }
2190     } else {
2191         read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
2192     }
2193     return tmp;
2194 }
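
/*
 * Decode sketch for neon_get_scalar(): 'reg' is the M:Vm field.  For a
 * 32-bit scalar, Vm (reg & 15) names the D register and M (reg >> 4)
 * selects element 0 or 1.  For a 16-bit scalar, Vm<2:0> names the
 * register, M selects the 32-bit word, Vm<3> (reg & 8) selects its
 * high or low half, and the chosen half is then duplicated across the
 * returned 32-bit value.
 */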
2195 
2196 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2197                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2198 {
2199     /*
2200      * Two registers and a scalar: perform an operation between
2201      * the input elements and the scalar, and then possibly
2202      * perform an accumulation operation of that result into the
2203      * destination.
2204      */
2205     TCGv_i32 scalar, tmp;
2206     int pass;
2207 
2208     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2209         return false;
2210     }
2211 
2212     /* UNDEF accesses to D16-D31 if they don't exist. */
2213     if (!dc_isar_feature(aa32_simd_r32, s) &&
2214         ((a->vd | a->vn | a->vm) & 0x10)) {
2215         return false;
2216     }
2217 
2218     if (!opfn) {
2219         /* Bad size (including size == 3, which is a different insn group) */
2220         return false;
2221     }
2222 
2223     if (a->q && ((a->vd | a->vn) & 1)) {
2224         return false;
2225     }
2226 
2227     if (!vfp_access_check(s)) {
2228         return true;
2229     }
2230 
2231     scalar = neon_get_scalar(a->size, a->vm);
2232     tmp = tcg_temp_new_i32();
2233 
2234     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2235         read_neon_element32(tmp, a->vn, pass, MO_32);
2236         opfn(tmp, tmp, scalar);
2237         if (accfn) {
2238             TCGv_i32 rd = tcg_temp_new_i32();
2239             read_neon_element32(rd, a->vd, pass, MO_32);
2240             accfn(tmp, rd, tmp);
2241         }
2242         write_neon_element32(tmp, a->vd, pass, MO_32);
2243     }
2244     return true;
2245 }
2246 
2247 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2248 {
2249     static NeonGenTwoOpFn * const opfn[] = {
2250         NULL,
2251         gen_helper_neon_mul_u16,
2252         tcg_gen_mul_i32,
2253         NULL,
2254     };
2255 
2256     return do_2scalar(s, a, opfn[a->size], NULL);
2257 }
2258 
2259 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2260 {
2261     static NeonGenTwoOpFn * const opfn[] = {
2262         NULL,
2263         gen_helper_neon_mul_u16,
2264         tcg_gen_mul_i32,
2265         NULL,
2266     };
2267     static NeonGenTwoOpFn * const accfn[] = {
2268         NULL,
2269         gen_helper_neon_add_u16,
2270         tcg_gen_add_i32,
2271         NULL,
2272     };
2273 
2274     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2275 }
2276 
2277 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2278 {
2279     static NeonGenTwoOpFn * const opfn[] = {
2280         NULL,
2281         gen_helper_neon_mul_u16,
2282         tcg_gen_mul_i32,
2283         NULL,
2284     };
2285     static NeonGenTwoOpFn * const accfn[] = {
2286         NULL,
2287         gen_helper_neon_sub_u16,
2288         tcg_gen_sub_i32,
2289         NULL,
2290     };
2291 
2292     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2293 }
2294 
2295 static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
2296                               gen_helper_gvec_3_ptr *fn)
2297 {
2298     /* Two registers and a scalar, using gvec */
2299     int vec_size = a->q ? 16 : 8;
2300     int rd_ofs = neon_full_reg_offset(a->vd);
2301     int rn_ofs = neon_full_reg_offset(a->vn);
2302     int rm_ofs;
2303     int idx;
2304     TCGv_ptr fpstatus;
2305 
2306     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2307         return false;
2308     }
2309 
2310     /* UNDEF accesses to D16-D31 if they don't exist. */
2311     if (!dc_isar_feature(aa32_simd_r32, s) &&
2312         ((a->vd | a->vn | a->vm) & 0x10)) {
2313         return false;
2314     }
2315 
2316     if (!fn) {
2317         /* Bad size (including size == 3, which is a different insn group) */
2318         return false;
2319     }
2320 
2321     if (a->q && ((a->vd | a->vn) & 1)) {
2322         return false;
2323     }
2324 
2325     if (!vfp_access_check(s)) {
2326         return true;
2327     }
2328 
2329     /* a->vm is M:Vm, which encodes both register and index */
2330     idx = extract32(a->vm, a->size + 2, 2);
2331     a->vm = extract32(a->vm, 0, a->size + 2);
2332     rm_ofs = neon_full_reg_offset(a->vm);
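    /*
     * E.g. (illustratively) for size == MO_16 this takes idx from the
     * two-bit M:Vm<3> field and the register number from Vm<2:0>; for
     * size == MO_32, idx is just the M bit and the register is Vm<3:0>.
     */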
2333 
2334     fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
2335     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
2336                        vec_size, vec_size, idx, fn);
2337     return true;
2338 }
2339 
2340 #define DO_VMUL_F_2sc(NAME, FUNC)                                       \
2341     static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a)   \
2342     {                                                                   \
2343         static gen_helper_gvec_3_ptr * const opfn[] = {                 \
2344             NULL,                                                       \
2345             gen_helper_##FUNC##_h,                                      \
2346             gen_helper_##FUNC##_s,                                      \
2347             NULL,                                                       \
2348         };                                                              \
2349         if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
2350             return false;                                               \
2351         }                                                               \
2352         return do_2scalar_fp_vec(s, a, opfn[a->size]);                  \
2353     }
2354 
2355 DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
2356 DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
2357 DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
2358 
2359 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2360 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2361 WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2362 WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2363 
2364 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2365 {
2366     static NeonGenTwoOpFn * const opfn[] = {
2367         NULL,
2368         gen_VQDMULH_16,
2369         gen_VQDMULH_32,
2370         NULL,
2371     };
2372 
2373     return do_2scalar(s, a, opfn[a->size], NULL);
2374 }
2375 
2376 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2377 {
2378     static NeonGenTwoOpFn * const opfn[] = {
2379         NULL,
2380         gen_VQRDMULH_16,
2381         gen_VQRDMULH_32,
2382         NULL,
2383     };
2384 
2385     return do_2scalar(s, a, opfn[a->size], NULL);
2386 }
2387 
2388 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2389                             NeonGenThreeOpEnvFn *opfn)
2390 {
2391     /*
2392      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2393      * performs a kind of fused op-then-accumulate using a helper
2394      * function that takes all of rd, rn and the scalar at once.
2395      */
2396     TCGv_i32 scalar, rn, rd;
2397     int pass;
2398 
2399     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2400         return false;
2401     }
2402 
2403     if (!dc_isar_feature(aa32_rdm, s)) {
2404         return false;
2405     }
2406 
2407     /* UNDEF accesses to D16-D31 if they don't exist. */
2408     if (!dc_isar_feature(aa32_simd_r32, s) &&
2409         ((a->vd | a->vn | a->vm) & 0x10)) {
2410         return false;
2411     }
2412 
2413     if (!opfn) {
2414         /* Bad size (including size == 3, which is a different insn group) */
2415         return false;
2416     }
2417 
2418     if (a->q && ((a->vd | a->vn) & 1)) {
2419         return false;
2420     }
2421 
2422     if (!vfp_access_check(s)) {
2423         return true;
2424     }
2425 
2426     scalar = neon_get_scalar(a->size, a->vm);
2427     rn = tcg_temp_new_i32();
2428     rd = tcg_temp_new_i32();
2429 
2430     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2431         read_neon_element32(rn, a->vn, pass, MO_32);
2432         read_neon_element32(rd, a->vd, pass, MO_32);
2433         opfn(rd, tcg_env, rn, scalar, rd);
2434         write_neon_element32(rd, a->vd, pass, MO_32);
2435     }
2436     return true;
2437 }
2438 
2439 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2440 {
2441     static NeonGenThreeOpEnvFn *opfn[] = {
2442         NULL,
2443         gen_helper_neon_qrdmlah_s16,
2444         gen_helper_neon_qrdmlah_s32,
2445         NULL,
2446     };
2447     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2448 }
2449 
2450 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2451 {
2452     static NeonGenThreeOpEnvFn *opfn[] = {
2453         NULL,
2454         gen_helper_neon_qrdmlsh_s16,
2455         gen_helper_neon_qrdmlsh_s32,
2456         NULL,
2457     };
2458     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2459 }
2460 
2461 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2462                             NeonGenTwoOpWidenFn *opfn,
2463                             NeonGenTwo64OpFn *accfn)
2464 {
2465     /*
2466      * Two registers and a scalar, long operations: perform an
2467      * operation on the input elements and the scalar which produces
2468      * a double-width result, and then possibly perform an accumulation
2469      * operation of that result into the destination.
2470      */
2471     TCGv_i32 scalar, rn;
2472     TCGv_i64 rn0_64, rn1_64;
2473 
2474     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2475         return false;
2476     }
2477 
2478     /* UNDEF accesses to D16-D31 if they don't exist. */
2479     if (!dc_isar_feature(aa32_simd_r32, s) &&
2480         ((a->vd | a->vn | a->vm) & 0x10)) {
2481         return false;
2482     }
2483 
2484     if (!opfn) {
2485         /* Bad size (including size == 3, which is a different insn group) */
2486         return false;
2487     }
2488 
2489     if (a->vd & 1) {
2490         return false;
2491     }
2492 
2493     if (!vfp_access_check(s)) {
2494         return true;
2495     }
2496 
2497     scalar = neon_get_scalar(a->size, a->vm);
2498 
2499     /* Load all inputs before writing any outputs, in case of overlap */
2500     rn = tcg_temp_new_i32();
2501     read_neon_element32(rn, a->vn, 0, MO_32);
2502     rn0_64 = tcg_temp_new_i64();
2503     opfn(rn0_64, rn, scalar);
2504 
2505     read_neon_element32(rn, a->vn, 1, MO_32);
2506     rn1_64 = tcg_temp_new_i64();
2507     opfn(rn1_64, rn, scalar);
2508 
2509     if (accfn) {
2510         TCGv_i64 t64 = tcg_temp_new_i64();
2511         read_neon_element64(t64, a->vd, 0, MO_64);
2512         accfn(rn0_64, t64, rn0_64);
2513         read_neon_element64(t64, a->vd, 1, MO_64);
2514         accfn(rn1_64, t64, rn1_64);
2515     }
2516 
2517     write_neon_element64(rn0_64, a->vd, 0, MO_64);
2518     write_neon_element64(rn1_64, a->vd, 1, MO_64);
2519     return true;
2520 }
2521 
2522 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2523 {
2524     static NeonGenTwoOpWidenFn * const opfn[] = {
2525         NULL,
2526         gen_helper_neon_mull_s16,
2527         gen_mull_s32,
2528         NULL,
2529     };
2530 
2531     return do_2scalar_long(s, a, opfn[a->size], NULL);
2532 }
2533 
2534 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2535 {
2536     static NeonGenTwoOpWidenFn * const opfn[] = {
2537         NULL,
2538         gen_helper_neon_mull_u16,
2539         gen_mull_u32,
2540         NULL,
2541     };
2542 
2543     return do_2scalar_long(s, a, opfn[a->size], NULL);
2544 }
2545 
2546 #define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2547     static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2548     {                                                                   \
2549         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2550             NULL,                                                       \
2551             gen_helper_neon_##MULL##16,                                 \
2552             gen_##MULL##32,                                             \
2553             NULL,                                                       \
2554         };                                                              \
2555         static NeonGenTwo64OpFn * const accfn[] = {                     \
2556             NULL,                                                       \
2557             gen_helper_neon_##ACC##l_u32,                               \
2558             tcg_gen_##ACC##_i64,                                        \
2559             NULL,                                                       \
2560         };                                                              \
2561         return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2562     }
2563 
2564 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2565 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2566 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2567 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2568 
2569 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2570 {
2571     static NeonGenTwoOpWidenFn * const opfn[] = {
2572         NULL,
2573         gen_VQDMULL_16,
2574         gen_VQDMULL_32,
2575         NULL,
2576     };
2577 
2578     return do_2scalar_long(s, a, opfn[a->size], NULL);
2579 }
2580 
2581 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2582 {
2583     static NeonGenTwoOpWidenFn * const opfn[] = {
2584         NULL,
2585         gen_VQDMULL_16,
2586         gen_VQDMULL_32,
2587         NULL,
2588     };
2589     static NeonGenTwo64OpFn * const accfn[] = {
2590         NULL,
2591         gen_VQDMLAL_acc_16,
2592         gen_VQDMLAL_acc_32,
2593         NULL,
2594     };
2595 
2596     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2597 }
2598 
2599 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2600 {
2601     static NeonGenTwoOpWidenFn * const opfn[] = {
2602         NULL,
2603         gen_VQDMULL_16,
2604         gen_VQDMULL_32,
2605         NULL,
2606     };
2607     static NeonGenTwo64OpFn * const accfn[] = {
2608         NULL,
2609         gen_VQDMLSL_acc_16,
2610         gen_VQDMLSL_acc_32,
2611         NULL,
2612     };
2613 
2614     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2615 }
2616 
2617 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2618 {
2619     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2620         return false;
2621     }
2622 
2623     /* UNDEF accesses to D16-D31 if they don't exist. */
2624     if (!dc_isar_feature(aa32_simd_r32, s) &&
2625         ((a->vd | a->vn | a->vm) & 0x10)) {
2626         return false;
2627     }
2628 
2629     if ((a->vn | a->vm | a->vd) & a->q) {
2630         return false;
2631     }
2632 
2633     if (a->imm > 7 && !a->q) {
2634         return false;
2635     }
2636 
2637     if (!vfp_access_check(s)) {
2638         return true;
2639     }
2640 
2641     if (!a->q) {
2642         /* Extract 64 bits from <Vm:Vn> */
2643         TCGv_i64 left, right, dest;
2644 
2645         left = tcg_temp_new_i64();
2646         right = tcg_temp_new_i64();
2647         dest = tcg_temp_new_i64();
2648 
2649         read_neon_element64(right, a->vn, 0, MO_64);
2650         read_neon_element64(left, a->vm, 0, MO_64);
2651         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2652         write_neon_element64(dest, a->vd, 0, MO_64);
2653     } else {
2654         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2655         TCGv_i64 left, middle, right, destleft, destright;
2656 
2657         left = tcg_temp_new_i64();
2658         middle = tcg_temp_new_i64();
2659         right = tcg_temp_new_i64();
2660         destleft = tcg_temp_new_i64();
2661         destright = tcg_temp_new_i64();
2662 
2663         if (a->imm < 8) {
2664             read_neon_element64(right, a->vn, 0, MO_64);
2665             read_neon_element64(middle, a->vn, 1, MO_64);
2666             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2667             read_neon_element64(left, a->vm, 0, MO_64);
2668             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2669         } else {
2670             read_neon_element64(right, a->vn, 1, MO_64);
2671             read_neon_element64(middle, a->vm, 0, MO_64);
2672             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2673             read_neon_element64(left, a->vm, 1, MO_64);
2674             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2675         }
2676 
2677         write_neon_element64(destright, a->vd, 0, MO_64);
2678         write_neon_element64(destleft, a->vd, 1, MO_64);
2679     }
2680     return true;
2681 }
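
/*
 * Sketch of the extraction above: tcg_gen_extract2_i64(dest, right,
 * left, pos) yields the low 64 bits of the 128-bit value <left:right>
 * shifted right by 'pos' bits.  For the 64-bit case with imm == 3, say,
 * the result is bytes 3..10 of <Vm:Vn>, which is exactly VEXT's
 * byte-wise extraction.
 */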
2682 
2683 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2684 {
2685     TCGv_i64 val, def;
2686     TCGv_i32 desc;
2687 
2688     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2689         return false;
2690     }
2691 
2692     /* UNDEF accesses to D16-D31 if they don't exist. */
2693     if (!dc_isar_feature(aa32_simd_r32, s) &&
2694         ((a->vd | a->vn | a->vm) & 0x10)) {
2695         return false;
2696     }
2697 
2698     if ((a->vn + a->len + 1) > 32) {
2699         /*
2700          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2701          * helper function running off the end of the register file.
2702          */
2703         return false;
2704     }
2705 
2706     if (!vfp_access_check(s)) {
2707         return true;
2708     }
2709 
2710     desc = tcg_constant_i32((a->vn << 2) | a->len);
2711     def = tcg_temp_new_i64();
2712     if (a->op) {
2713         read_neon_element64(def, a->vd, 0, MO_64);
2714     } else {
2715         tcg_gen_movi_i64(def, 0);
2716     }
2717     val = tcg_temp_new_i64();
2718     read_neon_element64(val, a->vm, 0, MO_64);
2719 
2720     gen_helper_neon_tbl(val, tcg_env, desc, val, def);
2721     write_neon_element64(val, a->vd, 0, MO_64);
2722     return true;
2723 }
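
/*
 * The 'desc' constant packs the first table register (vn) and the
 * number of table registers minus one (len) for gen_helper_neon_tbl;
 * out-of-range indexes select bytes from 'def', which is the old Vd
 * value for VTBX (a->op set) and zero for VTBL.
 */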
2724 
2725 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
2726 {
2727     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2728         return false;
2729     }
2730 
2731     /* UNDEF accesses to D16-D31 if they don't exist. */
2732     if (!dc_isar_feature(aa32_simd_r32, s) &&
2733         ((a->vd | a->vm) & 0x10)) {
2734         return false;
2735     }
2736 
2737     if (a->vd & a->q) {
2738         return false;
2739     }
2740 
2741     if (!vfp_access_check(s)) {
2742         return true;
2743     }
2744 
2745     tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
2746                          neon_element_offset(a->vm, a->index, a->size),
2747                          a->q ? 16 : 8, a->q ? 16 : 8);
2748     return true;
2749 }
2750 
2751 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
2752 {
2753     int pass, half;
2754     TCGv_i32 tmp[2];
2755 
2756     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2757         return false;
2758     }
2759 
2760     /* UNDEF accesses to D16-D31 if they don't exist. */
2761     if (!dc_isar_feature(aa32_simd_r32, s) &&
2762         ((a->vd | a->vm) & 0x10)) {
2763         return false;
2764     }
2765 
2766     if ((a->vd | a->vm) & a->q) {
2767         return false;
2768     }
2769 
2770     if (a->size == 3) {
2771         return false;
2772     }
2773 
2774     if (!vfp_access_check(s)) {
2775         return true;
2776     }
2777 
2778     tmp[0] = tcg_temp_new_i32();
2779     tmp[1] = tcg_temp_new_i32();
2780 
2781     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
2782         for (half = 0; half < 2; half++) {
2783             read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
2784             switch (a->size) {
2785             case 0:
2786                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
2787                 break;
2788             case 1:
2789                 gen_swap_half(tmp[half], tmp[half]);
2790                 break;
2791             case 2:
2792                 break;
2793             default:
2794                 g_assert_not_reached();
2795             }
2796         }
2797         write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
2798         write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
2799     }
2800     return true;
2801 }
2802 
2803 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
2804                               NeonGenWidenFn *widenfn,
2805                               NeonGenTwo64OpFn *opfn,
2806                               NeonGenTwo64OpFn *accfn)
2807 {
2808     /*
2809      * Pairwise long operations: widen both halves of the pair,
2810      * combine the pairs with the opfn, and then possibly accumulate
2811      * into the destination with the accfn.
2812      */
2813     int pass;
2814 
2815     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2816         return false;
2817     }
2818 
2819     /* UNDEF accesses to D16-D31 if they don't exist. */
2820     if (!dc_isar_feature(aa32_simd_r32, s) &&
2821         ((a->vd | a->vm) & 0x10)) {
2822         return false;
2823     }
2824 
2825     if ((a->vd | a->vm) & a->q) {
2826         return false;
2827     }
2828 
2829     if (!widenfn) {
2830         return false;
2831     }
2832 
2833     if (!vfp_access_check(s)) {
2834         return true;
2835     }
2836 
2837     for (pass = 0; pass < a->q + 1; pass++) {
2838         TCGv_i32 tmp;
2839         TCGv_i64 rm0_64, rm1_64, rd_64;
2840 
2841         rm0_64 = tcg_temp_new_i64();
2842         rm1_64 = tcg_temp_new_i64();
2843         rd_64 = tcg_temp_new_i64();
2844 
2845         tmp = tcg_temp_new_i32();
2846         read_neon_element32(tmp, a->vm, pass * 2, MO_32);
2847         widenfn(rm0_64, tmp);
2848         read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
2849         widenfn(rm1_64, tmp);
2850 
2851         opfn(rd_64, rm0_64, rm1_64);
2852 
2853         if (accfn) {
2854             TCGv_i64 tmp64 = tcg_temp_new_i64();
2855             read_neon_element64(tmp64, a->vd, pass, MO_64);
2856             accfn(rd_64, tmp64, rd_64);
2857         }
2858         write_neon_element64(rd_64, a->vd, pass, MO_64);
2859     }
2860     return true;
2861 }
2862 
2863 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
2864 {
2865     static NeonGenWidenFn * const widenfn[] = {
2866         gen_helper_neon_widen_s8,
2867         gen_helper_neon_widen_s16,
2868         tcg_gen_ext_i32_i64,
2869         NULL,
2870     };
2871     static NeonGenTwo64OpFn * const opfn[] = {
2872         gen_helper_neon_paddl_u16,
2873         gen_helper_neon_paddl_u32,
2874         tcg_gen_add_i64,
2875         NULL,
2876     };
2877 
2878     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
2879 }
2880 
2881 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
2882 {
2883     static NeonGenWidenFn * const widenfn[] = {
2884         gen_helper_neon_widen_u8,
2885         gen_helper_neon_widen_u16,
2886         tcg_gen_extu_i32_i64,
2887         NULL,
2888     };
2889     static NeonGenTwo64OpFn * const opfn[] = {
2890         gen_helper_neon_paddl_u16,
2891         gen_helper_neon_paddl_u32,
2892         tcg_gen_add_i64,
2893         NULL,
2894     };
2895 
2896     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
2897 }
2898 
2899 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
2900 {
2901     static NeonGenWidenFn * const widenfn[] = {
2902         gen_helper_neon_widen_s8,
2903         gen_helper_neon_widen_s16,
2904         tcg_gen_ext_i32_i64,
2905         NULL,
2906     };
2907     static NeonGenTwo64OpFn * const opfn[] = {
2908         gen_helper_neon_paddl_u16,
2909         gen_helper_neon_paddl_u32,
2910         tcg_gen_add_i64,
2911         NULL,
2912     };
2913     static NeonGenTwo64OpFn * const accfn[] = {
2914         gen_helper_neon_addl_u16,
2915         gen_helper_neon_addl_u32,
2916         tcg_gen_add_i64,
2917         NULL,
2918     };
2919 
2920     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
2921                              accfn[a->size]);
2922 }
2923 
2924 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
2925 {
2926     static NeonGenWidenFn * const widenfn[] = {
2927         gen_helper_neon_widen_u8,
2928         gen_helper_neon_widen_u16,
2929         tcg_gen_extu_i32_i64,
2930         NULL,
2931     };
2932     static NeonGenTwo64OpFn * const opfn[] = {
2933         gen_helper_neon_paddl_u16,
2934         gen_helper_neon_paddl_u32,
2935         tcg_gen_add_i64,
2936         NULL,
2937     };
2938     static NeonGenTwo64OpFn * const accfn[] = {
2939         gen_helper_neon_addl_u16,
2940         gen_helper_neon_addl_u32,
2941         tcg_gen_add_i64,
2942         NULL,
2943     };
2944 
2945     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
2946                              accfn[a->size]);
2947 }
2948 
2949 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
2950 
2951 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
2952                        ZipFn *fn)
2953 {
2954     TCGv_ptr pd, pm;
2955 
2956     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2957         return false;
2958     }
2959 
2960     /* UNDEF accesses to D16-D31 if they don't exist. */
2961     if (!dc_isar_feature(aa32_simd_r32, s) &&
2962         ((a->vd | a->vm) & 0x10)) {
2963         return false;
2964     }
2965 
2966     if ((a->vd | a->vm) & a->q) {
2967         return false;
2968     }
2969 
2970     if (!fn) {
2971         /* Bad size or size/q combination */
2972         return false;
2973     }
2974 
2975     if (!vfp_access_check(s)) {
2976         return true;
2977     }
2978 
2979     pd = vfp_reg_ptr(true, a->vd);
2980     pm = vfp_reg_ptr(true, a->vm);
2981     fn(pd, pm);
2982     return true;
2983 }
2984 
2985 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
2986 {
2987     static ZipFn * const fn[2][4] = {
2988         {
2989             gen_helper_neon_unzip8,
2990             gen_helper_neon_unzip16,
2991             NULL,
2992             NULL,
2993         }, {
2994             gen_helper_neon_qunzip8,
2995             gen_helper_neon_qunzip16,
2996             gen_helper_neon_qunzip32,
2997             NULL,
2998         }
2999     };
3000     return do_zip_uzp(s, a, fn[a->q][a->size]);
3001 }
3002 
3003 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3004 {
3005     static ZipFn * const fn[2][4] = {
3006         {
3007             gen_helper_neon_zip8,
3008             gen_helper_neon_zip16,
3009             NULL,
3010             NULL,
3011         }, {
3012             gen_helper_neon_qzip8,
3013             gen_helper_neon_qzip16,
3014             gen_helper_neon_qzip32,
3015             NULL,
3016         }
3017     };
3018     return do_zip_uzp(s, a, fn[a->q][a->size]);
3019 }
3020 
3021 static bool do_vmovn(DisasContext *s, arg_2misc *a,
3022                      NeonGenNarrowEnvFn *narrowfn)
3023 {
3024     TCGv_i64 rm;
3025     TCGv_i32 rd0, rd1;
3026 
3027     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3028         return false;
3029     }
3030 
3031     /* UNDEF accesses to D16-D31 if they don't exist. */
3032     if (!dc_isar_feature(aa32_simd_r32, s) &&
3033         ((a->vd | a->vm) & 0x10)) {
3034         return false;
3035     }
3036 
3037     if (a->vm & 1) {
3038         return false;
3039     }
3040 
3041     if (!narrowfn) {
3042         return false;
3043     }
3044 
3045     if (!vfp_access_check(s)) {
3046         return true;
3047     }
3048 
3049     rm = tcg_temp_new_i64();
3050     rd0 = tcg_temp_new_i32();
3051     rd1 = tcg_temp_new_i32();
3052 
3053     read_neon_element64(rm, a->vm, 0, MO_64);
3054     narrowfn(rd0, tcg_env, rm);
3055     read_neon_element64(rm, a->vm, 1, MO_64);
3056     narrowfn(rd1, tcg_env, rm);
3057     write_neon_element32(rd0, a->vd, 0, MO_32);
3058     write_neon_element32(rd1, a->vd, 1, MO_32);
3059     return true;
3060 }
3061 
3062 #define DO_VMOVN(INSN, FUNC)                                    \
3063     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3064     {                                                           \
3065         static NeonGenNarrowEnvFn * const narrowfn[] = {        \
3066             FUNC##8,                                            \
3067             FUNC##16,                                           \
3068             FUNC##32,                                           \
3069             NULL,                                               \
3070         };                                                      \
3071         return do_vmovn(s, a, narrowfn[a->size]);               \
3072     }
3073 
3074 DO_VMOVN(VMOVN, gen_neon_narrow_u)
3075 DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
3076 DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
3077 DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
3078 
3079 static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
3080 {
3081     TCGv_i32 rm0, rm1;
3082     TCGv_i64 rd;
3083     static NeonGenWidenFn * const widenfns[] = {
3084         gen_helper_neon_widen_u8,
3085         gen_helper_neon_widen_u16,
3086         tcg_gen_extu_i32_i64,
3087         NULL,
3088     };
3089     NeonGenWidenFn *widenfn = widenfns[a->size];
3090 
3091     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3092         return false;
3093     }
3094 
3095     /* UNDEF accesses to D16-D31 if they don't exist. */
3096     if (!dc_isar_feature(aa32_simd_r32, s) &&
3097         ((a->vd | a->vm) & 0x10)) {
3098         return false;
3099     }
3100 
3101     if (a->vd & 1) {
3102         return false;
3103     }
3104 
3105     if (!widenfn) {
3106         return false;
3107     }
3108 
3109     if (!vfp_access_check(s)) {
3110         return true;
3111     }
3112 
3113     rd = tcg_temp_new_i64();
3114     rm0 = tcg_temp_new_i32();
3115     rm1 = tcg_temp_new_i32();
3116 
3117     read_neon_element32(rm0, a->vm, 0, MO_32);
3118     read_neon_element32(rm1, a->vm, 1, MO_32);
3119 
3120     widenfn(rd, rm0);
3121     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3122     write_neon_element64(rd, a->vd, 0, MO_64);
3123     widenfn(rd, rm1);
3124     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3125     write_neon_element64(rd, a->vd, 1, MO_64);
3126     return true;
3127 }
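
/*
 * Note this is the 2-reg-misc VSHLL, whose shift amount is fixed at the
 * element size (8 << a->size), unlike the 2-reg-shift form handled by
 * do_vshll_2sh().  Unsigned widen functions are used for every type:
 * with the shift equal to the element width the result is the same
 * either way, and since the widened lanes are zero-extended, the bits
 * that cross into a neighbouring lane are all zero, so no masking is
 * needed.
 */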
3128 
3129 static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a)
3130 {
3131     TCGv_ptr fpst;
3132     TCGv_i64 tmp;
3133     TCGv_i32 dst0, dst1;
3134 
3135     if (!dc_isar_feature(aa32_bf16, s)) {
3136         return false;
3137     }
3138 
3139     /* UNDEF accesses to D16-D31 if they don't exist. */
3140     if (!dc_isar_feature(aa32_simd_r32, s) &&
3141         ((a->vd | a->vm) & 0x10)) {
3142         return false;
3143     }
3144 
3145     if ((a->vm & 1) || (a->size != 1)) {
3146         return false;
3147     }
3148 
3149     if (!vfp_access_check(s)) {
3150         return true;
3151     }
3152 
3153     fpst = fpstatus_ptr(FPST_STD);
3154     tmp = tcg_temp_new_i64();
3155     dst0 = tcg_temp_new_i32();
3156     dst1 = tcg_temp_new_i32();
3157 
3158     read_neon_element64(tmp, a->vm, 0, MO_64);
3159     gen_helper_bfcvt_pair(dst0, tmp, fpst);
3160 
3161     read_neon_element64(tmp, a->vm, 1, MO_64);
3162     gen_helper_bfcvt_pair(dst1, tmp, fpst);
3163 
3164     write_neon_element32(dst0, a->vd, 0, MO_32);
3165     write_neon_element32(dst1, a->vd, 1, MO_32);
3166     return true;
3167 }
3168 
static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
{
    TCGv_ptr fpst;
    TCGv_i32 ahp, tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
        !dc_isar_feature(aa32_fp16_spconv, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fpst = fpstatus_ptr(FPST_STD);
    ahp = get_ahp_flag();
    tmp = tcg_temp_new_i32();
    read_neon_element32(tmp, a->vm, 0, MO_32);
    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
    tmp2 = tcg_temp_new_i32();
    read_neon_element32(tmp2, a->vm, 1, MO_32);
    gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
    tcg_gen_shli_i32(tmp2, tmp2, 16);
    tcg_gen_or_i32(tmp2, tmp2, tmp);
    read_neon_element32(tmp, a->vm, 2, MO_32);
    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
    tmp3 = tcg_temp_new_i32();
    read_neon_element32(tmp3, a->vm, 3, MO_32);
    write_neon_element32(tmp2, a->vd, 0, MO_32);
    gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
    tcg_gen_shli_i32(tmp3, tmp3, 16);
    tcg_gen_or_i32(tmp3, tmp3, tmp);
    write_neon_element32(tmp3, a->vd, 1, MO_32);
    return true;
}

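/*
 * VCVT.F32.F16: widen the four f16 elements of Dm to four f32
 * elements in Qd.  Both source words are read up front for the same
 * reason as above: Dm may overlap either half of Qd.
 */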
static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
{
    TCGv_ptr fpst;
    TCGv_i32 ahp, tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
        !dc_isar_feature(aa32_fp16_spconv, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vd & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fpst = fpstatus_ptr(FPST_STD);
    ahp = get_ahp_flag();
    tmp3 = tcg_temp_new_i32();
    tmp2 = tcg_temp_new_i32();
    tmp = tcg_temp_new_i32();
    read_neon_element32(tmp, a->vm, 0, MO_32);
    read_neon_element32(tmp2, a->vm, 1, MO_32);
    tcg_gen_ext16u_i32(tmp3, tmp);
    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
    write_neon_element32(tmp3, a->vd, 0, MO_32);
    tcg_gen_shri_i32(tmp, tmp, 16);
    gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
    write_neon_element32(tmp, a->vd, 1, MO_32);
    tcg_gen_ext16u_i32(tmp3, tmp2);
    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
    write_neon_element32(tmp3, a->vd, 2, MO_32);
    tcg_gen_shri_i32(tmp2, tmp2, 16);
    gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
    write_neon_element32(tmp2, a->vd, 3, MO_32);
    return true;
}

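/*
 * Shared helper for 2-reg-misc operations that can be expressed as a
 * single gvec expansion: perform the usual Neon UNDEF checks (feature
 * present, D16-D31 availability, no 64-bit element size, Q register
 * alignment) and then expand over the whole vector.
 */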
static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
{
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_full_reg_offset(a->vd);
    int rm_ofs = neon_full_reg_offset(a->vm);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->size == 3) {
        return false;
    }

    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);

    return true;
}

#define DO_2MISC_VEC(INSN, FN)                                  \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        return do_2misc_vec(s, a, FN);                          \
    }

DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
DO_2MISC_VEC(VCLT0, gen_gvec_clt0)

static bool trans_VMVN(DisasContext *s, arg_2misc *a)
{
    if (a->size != 0) {
        return false;
    }
    return do_2misc_vec(s, a, tcg_gen_gvec_not);
}

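/*
 * Adapt out-of-line crypto helpers to the GVecGen2Fn signature that
 * do_2misc_vec() expects.  The 3-operand wrapper passes Dd as both
 * destination and first source, for insns like AESE where the
 * destination is also an input; vece is unused because each helper
 * is specific to one element size.
 */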
#define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rm_ofs, uint32_t oprsz,               \
                         uint32_t maxsz)                                \
    {                                                                   \
        tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
                           DATA, FUNC);                                 \
    }

#define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rm_ofs, uint32_t oprsz,               \
                         uint32_t maxsz)                                \
    {                                                                   \
        tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
    }

WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aesd, 0)
WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesimc, 0)
WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)

#define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) {  \
            return false;                                       \
        }                                                       \
        return do_2misc_vec(s, a, gen_##INSN);                  \
    }

DO_2M_CRYPTO(AESE, aa32_aes, 0)
DO_2M_CRYPTO(AESD, aa32_aes, 0)
DO_2M_CRYPTO(AESMC, aa32_aes, 0)
DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)

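/*
 * Fallback for 2-reg-misc operations with no gvec expansion: iterate
 * over the vector one 32-bit word per pass.  The NeonGenOneOpFn
 * operates on a whole word, so helpers for 8- and 16-bit element
 * sizes must handle the sub-word lanes themselves.
 */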
static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
{
    TCGv_i32 tmp;
    int pass;

    /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (!fn) {
        return false;
    }

    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    tmp = tcg_temp_new_i32();
    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
        read_neon_element32(tmp, a->vm, pass, MO_32);
        fn(tmp, tmp);
        write_neon_element32(tmp, a->vd, pass, MO_32);
    }
    return true;
}

static bool trans_VREV32(DisasContext *s, arg_2misc *a)
{
    static NeonGenOneOpFn * const fn[] = {
        tcg_gen_bswap32_i32,
        gen_swap_half,
        NULL,
        NULL,
    };
    return do_2misc(s, a, fn[a->size]);
}

static bool trans_VREV16(DisasContext *s, arg_2misc *a)
{
    if (a->size != 0) {
        return false;
    }
    return do_2misc(s, a, gen_rev16);
}

static bool trans_VCLS(DisasContext *s, arg_2misc *a)
{
    static NeonGenOneOpFn * const fn[] = {
        gen_helper_neon_cls_s8,
        gen_helper_neon_cls_s16,
        gen_helper_neon_cls_s32,
        NULL,
    };
    return do_2misc(s, a, fn[a->size]);
}

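/*
 * tcg_gen_clzi_i32() takes the value to return when the input is
 * zero; passing 32 gives the CLZ result the architecture requires
 * for a 32-bit element.
 */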
static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
{
    tcg_gen_clzi_i32(rd, rm, 32);
}

static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
{
    static NeonGenOneOpFn * const fn[] = {
        gen_helper_neon_clz_u8,
        gen_helper_neon_clz_u16,
        do_VCLZ_32,
        NULL,
    };
    return do_2misc(s, a, fn[a->size]);
}

static bool trans_VCNT(DisasContext *s, arg_2misc *a)
{
    if (a->size != 0) {
        return false;
    }
    return do_2misc(s, a, gen_helper_neon_cnt_u8);
}

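/*
 * Float VABS is a pure bit operation: clear the sign bit of each
 * element.  gvec replicates the immediate across every lane of the
 * given element size, so one mask covers the whole vector.
 */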
static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                       uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
                      vece == MO_16 ? 0x7fff : 0x7fffffff,
                      oprsz, maxsz);
}

static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
{
    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
    } else if (a->size != MO_32) {
        return false;
    }
    return do_2misc_vec(s, a, gen_VABS_F);
}

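/* Likewise, float VNEG just flips the sign bit of each element. */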
static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                       uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
                      vece == MO_16 ? 0x8000 : 0x80000000,
                      oprsz, maxsz);
}

static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
{
    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
    } else if (a->size != MO_32) {
        return false;
    }
    return do_2misc_vec(s, a, gen_VNEG_F);
}

static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
{
    if (a->size != 2) {
        return false;
    }
    return do_2misc(s, a, gen_helper_recpe_u32);
}

static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
{
    if (a->size != 2) {
        return false;
    }
    return do_2misc(s, a, gen_helper_rsqrte_u32);
}

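/*
 * The saturating qabs/qneg helpers take tcg_env because they may set
 * the QC (cumulative saturation) flag, e.g. for VQABS of INT_MIN.
 */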
#define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \
    static void WRAPNAME(TCGv_i32 d, TCGv_i32 m)        \
    {                                                   \
        FUNC(d, tcg_env, m);                            \
    }

WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)

static bool trans_VQABS(DisasContext *s, arg_2misc *a)
{
    static NeonGenOneOpFn * const fn[] = {
        gen_VQABS_s8,
        gen_VQABS_s16,
        gen_VQABS_s32,
        NULL,
    };
    return do_2misc(s, a, fn[a->size]);
}

static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
{
    static NeonGenOneOpFn * const fn[] = {
        gen_VQNEG_s8,
        gen_VQNEG_s16,
        gen_VQNEG_s32,
        NULL,
    };
    return do_2misc(s, a, fn[a->size]);
}

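/*
 * Generate both the gvec expander and the trans function for a
 * floating-point 2-reg-misc op with f16 and f32 variants.  Neon fp
 * operations use the "standard FPSCR value", hence FPST_STD and
 * FPST_STD_F16; the f16 variant additionally requires the fp16
 * arithmetic feature.
 */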
#define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC)                             \
    static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
                           uint32_t rm_ofs,                             \
                           uint32_t oprsz, uint32_t maxsz)              \
    {                                                                   \
        static gen_helper_gvec_2_ptr * const fns[4] = {                 \
            NULL, HFUNC, SFUNC, NULL,                                   \
        };                                                              \
        TCGv_ptr fpst;                                                  \
        fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);   \
        tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0,       \
                           fns[vece]);                                  \
    }                                                                   \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
    {                                                                   \
        if (a->size == MO_16) {                                         \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
                return false;                                           \
            }                                                           \
        } else if (a->size != MO_32) {                                  \
            return false;                                               \
        }                                                               \
        return do_2misc_vec(s, a, gen_##INSN);                          \
    }

DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)

DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)

static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }
    return trans_VRINTX_impl(s, a);
}

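/*
 * As DO_2MISC_FP_VEC, but for the v8 round-to-integral and
 * directed-rounding conversion ops: the required rounding mode is
 * converted via arm_rmode_to_sf() and passed to the helper as its
 * immediate data argument.
 */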
#define DO_VEC_RMODE(INSN, RMODE, OP)                                   \
    static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
                           uint32_t rm_ofs,                             \
                           uint32_t oprsz, uint32_t maxsz)              \
    {                                                                   \
        static gen_helper_gvec_2_ptr * const fns[4] = {                 \
            NULL,                                                       \
            gen_helper_gvec_##OP##h,                                    \
            gen_helper_gvec_##OP##s,                                    \
            NULL,                                                       \
        };                                                              \
        TCGv_ptr fpst;                                                  \
        fpst = fpstatus_ptr(vece == 1 ? FPST_STD_F16 : FPST_STD);       \
        tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz,          \
                           arm_rmode_to_sf(RMODE), fns[vece]);          \
    }                                                                   \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
    {                                                                   \
        if (!arm_dc_feature(s, ARM_FEATURE_V8)) {                       \
            return false;                                               \
        }                                                               \
        if (a->size == MO_16) {                                         \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
                return false;                                           \
            }                                                           \
        } else if (a->size != MO_32) {                                  \
            return false;                                               \
        }                                                               \
        return do_2misc_vec(s, a, gen_##INSN);                          \
    }

DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)

DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)

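/*
 * VSWP exchanges the contents of Dd/Dm (or Qd/Qm), 64 bits per pass;
 * the encoding requires size == 0.
 */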
static bool trans_VSWP(DisasContext *s, arg_2misc *a)
{
    TCGv_i64 rm, rd;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->size != 0) {
        return false;
    }

    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    rm = tcg_temp_new_i64();
    rd = tcg_temp_new_i64();
    for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
        read_neon_element64(rm, a->vm, pass, MO_64);
        read_neon_element64(rd, a->vd, pass, MO_64);
        write_neon_element64(rm, a->vd, pass, MO_64);
        write_neon_element64(rd, a->vm, pass, MO_64);
    }
    return true;
}

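/*
 * Element transpose helpers for VTRN.  With input bytes
 *   t0 = {A3,A2,A1,A0}, t1 = {B3,B2,B1,B0}   (A0/B0 least significant)
 * gen_neon_trn_u8 produces
 *   t0 = {A2,B2,A0,B0}, t1 = {A3,B3,A1,B1}
 * i.e. each 2x2 matrix of adjacent elements across the two words is
 * transposed in place; gen_neon_trn_u16 does the same for halfwords.
 */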
static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
{
    TCGv_i32 rd, tmp;

    rd = tcg_temp_new_i32();
    tmp = tcg_temp_new_i32();

    tcg_gen_shli_i32(rd, t0, 8);
    tcg_gen_andi_i32(rd, rd, 0xff00ff00);
    tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
    tcg_gen_or_i32(rd, rd, tmp);

    tcg_gen_shri_i32(t1, t1, 8);
    tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
    tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
    tcg_gen_or_i32(t1, t1, tmp);
    tcg_gen_mov_i32(t0, rd);
}

static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
{
    TCGv_i32 rd, tmp;

    rd = tcg_temp_new_i32();
    tmp = tcg_temp_new_i32();

    tcg_gen_shli_i32(rd, t0, 16);
    tcg_gen_andi_i32(tmp, t1, 0xffff);
    tcg_gen_or_i32(rd, rd, tmp);
    tcg_gen_shri_i32(t1, t1, 16);
    tcg_gen_andi_i32(tmp, t0, 0xffff0000);
    tcg_gen_or_i32(t1, t1, tmp);
    tcg_gen_mov_i32(t0, rd);
}

static bool trans_VTRN(DisasContext *s, arg_2misc *a)
{
    TCGv_i32 tmp, tmp2;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (a->size == 3) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    tmp = tcg_temp_new_i32();
    tmp2 = tcg_temp_new_i32();
    if (a->size == MO_32) {
        for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
            read_neon_element32(tmp, a->vm, pass, MO_32);
            read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
            write_neon_element32(tmp2, a->vm, pass, MO_32);
            write_neon_element32(tmp, a->vd, pass + 1, MO_32);
        }
    } else {
        for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
            read_neon_element32(tmp, a->vm, pass, MO_32);
            read_neon_element32(tmp2, a->vd, pass, MO_32);
            if (a->size == MO_8) {
                gen_neon_trn_u8(tmp, tmp2);
            } else {
                gen_neon_trn_u16(tmp, tmp2);
            }
            write_neon_element32(tmp2, a->vm, pass, MO_32);
            write_neon_element32(tmp, a->vd, pass, MO_32);
        }
    }
    return true;
}

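/*
 * Matrix multiply-accumulate insns from the I8MM and BF16 extensions.
 * These delegate to do_neon_ddda() (or its fpst variant), which
 * handles the common operand checks before the gvec expansion.
 */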
static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_smmla_b);
}

static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_ummla_b);
}

static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_usmmla_b);
}

static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a)
{
    if (!dc_isar_feature(aa32_bf16, s)) {
        return false;
    }
    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_bfmmla);
}

static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a)
{
    if (!dc_isar_feature(aa32_bf16, s)) {
        return false;
    }
    return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD,
                             gen_helper_gvec_bfmlal);
}

static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a)
{
    if (!dc_isar_feature(aa32_bf16, s)) {
        return false;
    }
    return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm,
                             (a->index << 1) | a->q, FPST_STD,
                             gen_helper_gvec_bfmlal_idx);
}