/*
 *  ARM translation: AArch32 Neon instructions
 *
 *  Copyright (c) 2003 Fabrice Bellard
 *  Copyright (c) 2005-2007 CodeSourcery
 *  Copyright (c) 2007 OpenedHand, Ltd.
 *  Copyright (c) 2020 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "tcg/tcg-op.h"
#include "tcg/tcg-op-gvec.h"
#include "exec/exec-all.h"
#include "translate.h"
#include "translate-a32.h"

/* Include the generated Neon decoder */
#include "decode-neon-dp.c.inc"
#include "decode-neon-ls.c.inc"
#include "decode-neon-shared.c.inc"

static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
{
    TCGv_ptr ret = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg));
    return ret;
}

static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i32(var, cpu_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i32(var, cpu_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld_i32(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i64(var, cpu_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i64(var, cpu_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld32u_i64(var, cpu_env, offset);
        break;
    case MO_UQ:
        tcg_gen_ld_i64(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i32(var, cpu_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i32(var, cpu_env, offset);
        break;
    case MO_32:
        tcg_gen_st_i32(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i64(var, cpu_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i64(var, cpu_env, offset);
        break;
    case MO_32:
        tcg_gen_st32_i64(var, cpu_env, offset);
        break;
    case MO_64:
        tcg_gen_st_i64(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm,
                         int data, gen_helper_gvec_4 *fn_gvec)
{
    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
        return false;
    }

    /*
     * UNDEF accesses to odd registers for each bit of Q.
     * Q will be 0b111 for all Q-reg instructions; other values
     * arise only when we have mixed Q- and D-reg inputs.
     */
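    /*
     * Worked example (illustrative): a Q-reg insn passes q == 0b111,
     * so an odd vd such as 3 gives ((3 & 1) * 4) & 7 != 0 and we UNDEF.
     * The indexed ("scalar") forms pass q == 0b110 because Vm stays a
     * D-reg index even for Q-reg operations.
     */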
    if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    int opr_sz = q ? 16 : 8;
    tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd),
                       vfp_reg_offset(1, vn),
                       vfp_reg_offset(1, vm),
                       vfp_reg_offset(1, vd),
                       opr_sz, opr_sz, data, fn_gvec);
    return true;
}

static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm,
                              int data, ARMFPStatusFlavour fp_flavour,
                              gen_helper_gvec_4_ptr *fn_gvec_ptr)
{
    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
        return false;
    }

    /*
     * UNDEF accesses to odd registers for each bit of Q.
     * Q will be 0b111 for all Q-reg instructions; other values
     * arise only when we have mixed Q- and D-reg inputs.
     */
    if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    int opr_sz = q ? 16 : 8;
    TCGv_ptr fpst = fpstatus_ptr(fp_flavour);

    tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd),
                       vfp_reg_offset(1, vn),
                       vfp_reg_offset(1, vm),
                       vfp_reg_offset(1, vd),
                       fpst, opr_sz, opr_sz, data, fn_gvec_ptr);
    return true;
}

static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
{
    if (!dc_isar_feature(aa32_vcma, s)) {
        return false;
    }
    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
                                 FPST_STD_F16, gen_helper_gvec_fcmlah);
    }
    return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
                             FPST_STD, gen_helper_gvec_fcmlas);
}

static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
{
    int opr_sz;
    TCGv_ptr fpst;
    gen_helper_gvec_3_ptr *fn_gvec_ptr;

    if (!dc_isar_feature(aa32_vcma, s)
        || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
    fn_gvec_ptr = (a->size == MO_16) ?
        gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz, a->rot,
                       fn_gvec_ptr);
    return true;
}

static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_sdot_b);
}

static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_udot_b);
}

static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_usdot_b);
}

static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
{
    if (!dc_isar_feature(aa32_bf16, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_bfdot);
}

static bool trans_VFML(DisasContext *s, arg_VFML *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        (a->vd & 0x10)) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->vm),
                       cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_a32);
    return true;
}

static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
{
    int data = (a->index << 2) | a->rot;

    if (!dc_isar_feature(aa32_vcma, s)) {
        return false;
    }
    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
                                 FPST_STD_F16, gen_helper_gvec_fcmlah_idx);
    }
    return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
                             FPST_STD, gen_helper_gvec_fcmlas_idx);
}

static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_sdot_idx_b);
}

static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_udot_idx_b);
}

static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_usdot_idx_b);
}

static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_sudot_idx_b);
}

static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a)
{
    if (!dc_isar_feature(aa32_bf16, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_bfdot_idx);
}

static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->rm),
                       cpu_env, opr_sz, opr_sz,
                       (a->index << 2) | a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_idx_a32);
    return true;
}

/* Register layouts for VLDn/VSTn (multiple structures), indexed by itype. */
static struct {
    int nregs;
    int interleave;
    int spacing;
} const neon_ls_element_type[11] = {
    {1, 4, 1},                  /* VLD4/VST4, 4 regs, inc 1 */
    {1, 4, 2},                  /* VLD4/VST4, 4 regs, inc 2 */
    {4, 1, 1},                  /* VLD1/VST1, 4 regs */
    {2, 2, 2},                  /* VLD2/VST2, 4 regs, inc 2 */
    {1, 3, 1},                  /* VLD3/VST3, 3 regs, inc 1 */
    {1, 3, 2},                  /* VLD3/VST3, 3 regs, inc 2 */
    {3, 1, 1},                  /* VLD1/VST1, 3 regs */
    {1, 1, 1},                  /* VLD1/VST1, 1 reg */
    {1, 2, 1},                  /* VLD2/VST2, 2 regs, inc 1 */
    {1, 2, 2},                  /* VLD2/VST2, 2 regs, inc 2 */
    {2, 1, 1}                   /* VLD1/VST1, 2 regs */
};

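/*
 * Base-register writeback convention shared by the Neon load/store
 * insns below: Rm == 15 means no writeback; Rm == 13 means writeback
 * of the immediate stride (the amount of data transferred); any other
 * Rm means writeback of that index register.
 */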
static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
                                      int stride)
{
    if (rm != 15) {
        TCGv_i32 base;

        base = load_reg(s, rn);
        if (rm == 13) {
            tcg_gen_addi_i32(base, base, stride);
        } else {
            TCGv_i32 index;
            index = load_reg(s, rm);
            tcg_gen_add_i32(base, base, index);
        }
        store_reg(s, rn, base);
    }
}

static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
{
    /* Neon load/store multiple structures */
    int nregs, interleave, spacing, reg, n;
    MemOp mop, align, endian;
    int mmu_idx = get_mem_index(s);
    int size = a->size;
    TCGv_i64 tmp64;
    TCGv_i32 addr;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }
    if (a->itype > 10) {
        return false;
    }
    /* Catch UNDEF cases for bad values of align field */
    switch (a->itype & 0xc) {
    case 4:
        if (a->align >= 2) {
            return false;
        }
        break;
    case 8:
        if (a->align == 3) {
            return false;
        }
        break;
    default:
        break;
    }
    nregs = neon_ls_element_type[a->itype].nregs;
    interleave = neon_ls_element_type[a->itype].interleave;
    spacing = neon_ls_element_type[a->itype].spacing;
    if (size == 3 && (interleave | spacing) != 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* For our purposes, bytes are always little-endian.  */
    endian = s->be_data;
    if (size == 0) {
        endian = MO_LE;
    }

    /* Enforce alignment requested by the instruction */
    if (a->align) {
        align = pow2_align(a->align + 2); /* 4 << a->align bytes */
    } else {
        align = s->align_mem ? MO_ALIGN : 0;
    }

    /*
     * Consecutive little-endian elements from a single register
     * can be promoted to a larger little-endian operation.
     */
    if (interleave == 1 && endian == MO_LE) {
        /* Retain any natural alignment. */
        if (align == MO_ALIGN) {
            align = pow2_align(size);
        }
        size = 3;
    }
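    /*
     * For example, VLD1.8 {d0}, [r0] (interleave == 1) is then executed
     * as a single 64-bit load instead of eight byte loads.
     */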

    tmp64 = tcg_temp_new_i64();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);

    mop = endian | size | align;
    for (reg = 0; reg < nregs; reg++) {
        for (n = 0; n < 8 >> size; n++) {
            int xs;
            for (xs = 0; xs < interleave; xs++) {
                int tt = a->vd + reg + spacing * xs;

                if (a->l) {
                    gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop);
                    neon_store_element64(tt, n, size, tmp64);
                } else {
                    neon_load_element64(tmp64, tt, n, size);
                    gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop);
                }
                tcg_gen_addi_i32(addr, addr, 1 << size);

                /* Subsequent memory operations inherit alignment */
                mop &= ~MO_AMASK;
            }
        }
    }

    gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
    return true;
}

static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
{
    /* Neon load single structure to all lanes */
    int reg, stride, vec_size;
    int vd = a->vd;
    int size = a->size;
    int nregs = a->n + 1;
    TCGv_i32 addr, tmp;
    MemOp mop, align;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    align = 0;
    if (size == 3) {
        if (nregs != 4 || a->a == 0) {
            return false;
        }
        /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
        size = MO_32;
        align = MO_ALIGN_16;
    } else if (a->a) {
        switch (nregs) {
        case 1:
            if (size == 0) {
                return false;
            }
            align = MO_ALIGN;
            break;
        case 2:
            align = pow2_align(size + 1);
            break;
        case 3:
            return false;
        case 4:
            if (size == 2) {
                align = pow2_align(3);
            } else {
                align = pow2_align(size + 2);
            }
            break;
        default:
            g_assert_not_reached();
        }
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * VLD1 to all lanes: T bit indicates how many Dregs to write.
     * VLD2/3/4 to all lanes: T bit indicates register stride.
     */
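    /*
     * E.g. VLD1.32 {d0[]-d1[]}, [r0] has T == 1 and writes two Dregs,
     * while VLD4.8 {d0[], d2[], d4[], d6[]}, [r0] has T == 1 and uses
     * a register stride of 2.
     */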
    stride = a->t ? 2 : 1;
    vec_size = nregs == 1 ? stride * 8 : 8;
    mop = size | align;
    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);
    for (reg = 0; reg < nregs; reg++) {
        gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop);
        if ((vd & 1) && vec_size == 16) {
            /*
             * We cannot write 16 bytes at once because the
             * destination is unaligned.
             */
            tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
                                 8, 8, tmp);
            tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
                             neon_full_reg_offset(vd), 8, 8);
        } else {
            tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
                                 vec_size, vec_size, tmp);
        }
        tcg_gen_addi_i32(addr, addr, 1 << size);
        vd += stride;

        /* Subsequent memory operations inherit alignment */
        mop &= ~MO_AMASK;
    }

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);

    return true;
}

static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
{
    /* Neon load/store single structure to one lane */
    int reg;
    int nregs = a->n + 1;
    int vd = a->vd;
    TCGv_i32 addr, tmp;
    MemOp mop;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    /* Catch the UNDEF cases. This is unavoidably a bit messy. */
    switch (nregs) {
    case 1:
        if (a->stride != 1) {
            return false;
        }
        if (((a->align & (1 << a->size)) != 0) ||
            (a->size == 2 && (a->align == 1 || a->align == 2))) {
            return false;
        }
        break;
    case 2:
        if (a->size == 2 && (a->align & 2) != 0) {
            return false;
        }
        break;
    case 3:
        if (a->align != 0) {
            return false;
        }
        break;
    case 4:
        if (a->size == 2 && a->align == 3) {
            return false;
        }
        break;
    default:
        g_assert_not_reached();
    }
    if ((vd + a->stride * (nregs - 1)) > 31) {
        /*
         * Attempts to write off the end of the register file are
         * UNPREDICTABLE; we choose to UNDEF because otherwise we would
         * access off the end of the array that holds the register data.
         */
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* Pick up SCTLR settings */
    mop = finalize_memop(s, a->size);

    if (a->align) {
        MemOp align_op;

        switch (nregs) {
        case 1:
            /* For VLD1, use natural alignment. */
            align_op = MO_ALIGN;
            break;
        case 2:
            /* For VLD2, use double alignment. */
            align_op = pow2_align(a->size + 1);
            break;
        case 4:
            if (a->size == MO_32) {
                /*
                 * For VLD4.32, align = 1 is double alignment, align = 2 is
                 * quad alignment; align = 3 is rejected above.
                 */
                align_op = pow2_align(a->size + a->align);
            } else {
                /* For VLD4.8 and VLD4.16, we want quad alignment. */
                align_op = pow2_align(a->size + 2);
            }
            break;
        default:
            /* For VLD3, the alignment field is zero and rejected above. */
            g_assert_not_reached();
        }

        mop = (mop & ~MO_AMASK) | align_op;
    }

    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);

    for (reg = 0; reg < nregs; reg++) {
        if (a->l) {
            gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
            neon_store_element(vd, a->reg_idx, a->size, tmp);
        } else { /* Store */
            neon_load_element(tmp, vd, a->reg_idx, a->size);
            gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
        }
        vd += a->stride;
        tcg_gen_addi_i32(addr, addr, 1 << a->size);

        /* Subsequent memory operations inherit alignment */
        mop &= ~MO_AMASK;
    }

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);

    return true;
}

static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
{
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_full_reg_offset(a->vd);
    int rn_ofs = neon_full_reg_offset(a->vn);
    int rm_ofs = neon_full_reg_offset(a->vm);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
    return true;
}

#define DO_3SAME(INSN, FUNC)                                            \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        return do_3same(s, a, FUNC);                                    \
    }

DO_3SAME(VADD, tcg_gen_gvec_add)
DO_3SAME(VSUB, tcg_gen_gvec_sub)
DO_3SAME(VAND, tcg_gen_gvec_and)
DO_3SAME(VBIC, tcg_gen_gvec_andc)
DO_3SAME(VORR, tcg_gen_gvec_or)
DO_3SAME(VORN, tcg_gen_gvec_orc)
DO_3SAME(VEOR, tcg_gen_gvec_xor)
DO_3SAME(VSHL_S, gen_gvec_sshl)
DO_3SAME(VSHL_U, gen_gvec_ushl)
DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)

/* These insns are all gvec_bitsel but with the inputs in various orders. */
#define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)

#define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size == 3) {                                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)

#define DO_3SAME_CMP(INSN, COND)                                        \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
    }                                                                   \
    DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)

DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
DO_3SAME_CMP(VCEQ, TCG_COND_EQ)

#define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
                         uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
    {                                                                      \
        tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
    }

WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)

static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
{
    if (a->size != 0) {
        return false;
    }
    return do_3same(s, a, gen_VMUL_p_3s);
}

#define DO_VQRDMLAH(INSN, FUNC)                                         \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_rdm, s)) {                            \
            return false;                                               \
        }                                                               \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)

#define DO_SHA1(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha1, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)

#define DO_SHA2(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha2, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)

#define DO_3SAME_64(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 op = { .fni8 = FUNC };                    \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

#define DO_3SAME_64_ENV(INSN, FUNC)                                     \
    static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }                                                                   \
    DO_3SAME_64(INSN, gen_##INSN##_elt)

DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)

#define DO_3SAME_32(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_helper_neon_##FUNC##8 },                      \
            { .fni4 = gen_helper_neon_##FUNC##16 },                     \
            { .fni4 = gen_helper_neon_##FUNC##32 },                     \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

/*
 * Some helper functions need to be passed the cpu_env. In order
 * to use those with the gvec APIs like tcg_gen_gvec_3() we need
 * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
 * and which call a NeonGenTwoOpEnvFn().
 */
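/*
 * For instance (illustrative), wrapping gen_helper_neon_qshl_s8, whose
 * signature is (dest, env, src1, src2), yields a (dest, src1, src2)
 * function that a GVecGen3 .fni4 callback can point at.
 */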
#define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
    static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }

#define DO_3SAME_32_ENV(INSN, FUNC)                                     \
    WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_##INSN##_tramp8 },                            \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_32(VHADD_S, hadd_s)
DO_3SAME_32(VHADD_U, hadd_u)
DO_3SAME_32(VHSUB_S, hsub_s)
DO_3SAME_32(VHSUB_U, hsub_u)
DO_3SAME_32(VRHADD_S, rhadd_s)
DO_3SAME_32(VRHADD_U, rhadd_u)
DO_3SAME_32(VRSHL_S, rshl_s)
DO_3SAME_32(VRSHL_U, rshl_u)

DO_3SAME_32_ENV(VQSHL_S, qshl_s)
DO_3SAME_32_ENV(VQSHL_U, qshl_u)
DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)

static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
{
    /* Operations handled pairwise 32 bits at a time */
    TCGv_i32 tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (a->size == 3) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */

    /*
     * Note that we have to be careful not to clobber the source operands
     * in the "vm == vd" case by storing the result of the first pass too
     * early. Since Q is 0 there are always just two passes, so instead
     * of a complicated loop over each pass we just unroll.
     */
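    /*
     * E.g. for VPADD d0, d1, d0 the first-pass result (from vn == d1)
     * must not be written to d0 before the second pass has read
     * vm == d0; hence both results are held in temps until the end.
     */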
    tmp = tcg_temp_new_i32();
    tmp2 = tcg_temp_new_i32();
    tmp3 = tcg_temp_new_i32();

    read_neon_element32(tmp, a->vn, 0, MO_32);
    read_neon_element32(tmp2, a->vn, 1, MO_32);
    fn(tmp, tmp, tmp2);

    read_neon_element32(tmp3, a->vm, 0, MO_32);
    read_neon_element32(tmp2, a->vm, 1, MO_32);
    fn(tmp3, tmp3, tmp2);

    write_neon_element32(tmp, a->vd, 0, MO_32);
    write_neon_element32(tmp3, a->vd, 1, MO_32);

    return true;
}

#define DO_3SAME_PAIR(INSN, func)                                       \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        static NeonGenTwoOpFn * const fns[] = {                         \
            gen_helper_neon_##func##8,                                  \
            gen_helper_neon_##func##16,                                 \
            gen_helper_neon_##func##32,                                 \
        };                                                              \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same_pair(s, a, fns[a->size]);                       \
    }

/* 32-bit pairwise ops end up the same as the elementwise versions.  */
#define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
#define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
#define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
#define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
#define gen_helper_neon_padd_u32  tcg_gen_add_i32

DO_3SAME_PAIR(VPMAX_S, pmax_s)
DO_3SAME_PAIR(VPMIN_S, pmin_s)
DO_3SAME_PAIR(VPMAX_U, pmax_u)
DO_3SAME_PAIR(VPMIN_U, pmin_u)
DO_3SAME_PAIR(VPADD, padd_u)

#define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[2] = {                                \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_VQDMULH(VQDMULH, qdmulh)
DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)

#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rn_ofs, uint32_t rm_ofs,              \
                         uint32_t oprsz, uint32_t maxsz)                \
    {                                                                   \
        TCGv_ptr fpst = fpstatus_ptr(FPST);                             \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst,                \
                           oprsz, maxsz, 0, FUNC);                      \
    }

#define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
    WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
    WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
    {                                                                   \
        if (a->size == MO_16) {                                         \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
                return false;                                           \
            }                                                           \
            return do_3same(s, a, gen_##INSN##_fp16_3s);                \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
    }

DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)

WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)

static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_3same(s, a, gen_VMAXNM_fp16_3s);
    }
    return do_3same(s, a, gen_VMAXNM_fp32_3s);
}

static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_3same(s, a, gen_VMINNM_fp16_3s);
    }
    return do_3same(s, a, gen_VMINNM_fp32_3s);
}

static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
                             gen_helper_gvec_3_ptr *fn)
{
    /* FP pairwise operations */
    TCGv_ptr fpstatus;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */

    fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpstatus, 8, 8, 0, fn);

    return true;
}

/*
 * For all the functions using this macro, size == 1 means fp16,
 * which requires the FP16 arithmetic extension (aa32_fp16_arith).
 */
#define DO_3S_FP_PAIR(INSN,FUNC)                                    \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
    {                                                               \
        if (a->size == MO_16) {                                     \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {             \
                return false;                                       \
            }                                                       \
            return do_3same_fp_pair(s, a, FUNC##h);                 \
        }                                                           \
        return do_3same_fp_pair(s, a, FUNC##s);                     \
    }

DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)

static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
{
    /* Handle a 2-reg-shift insn which can be vectorized. */
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_full_reg_offset(a->vd);
    int rm_ofs = neon_full_reg_offset(a->vm);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
    return true;
}

#define DO_2SH(INSN, FUNC)                                              \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_vector_2sh(s, a, FUNC);                               \
    }

DO_2SH(VSHL, tcg_gen_gvec_shli)
DO_2SH(VSLI, gen_gvec_sli)
DO_2SH(VSRI, gen_gvec_sri)
DO_2SH(VSRA_S, gen_gvec_ssra)
DO_2SH(VSRA_U, gen_gvec_usra)
DO_2SH(VRSHR_S, gen_gvec_srshr)
DO_2SH(VRSHR_U, gen_gvec_urshr)
DO_2SH(VRSRA_S, gen_gvec_srsra)
DO_2SH(VRSRA_U, gen_gvec_ursra)

static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Signed shift out of range results in all-sign-bits */
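    /* e.g. VSHR.S8 #8 is computed as an arithmetic shift by 7 */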
    a->shift = MIN(a->shift, (8 << a->size) - 1);
    return do_vector_2sh(s, a, tcg_gen_gvec_sari);
}

static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                            int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
}

static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Shift out of range is architecturally valid and results in zero. */
    if (a->shift >= (8 << a->size)) {
        return do_vector_2sh(s, a, gen_zero_rd_2sh);
    } else {
        return do_vector_2sh(s, a, tcg_gen_gvec_shri);
    }
}

static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
                             NeonGenTwo64OpEnvFn *fn)
{
    /*
     * 2-reg-and-shift operations, size == 3 case, where the
     * function needs to be passed cpu_env.
     */
    TCGv_i64 constimm;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * To avoid excessive duplication of ops we implement shift
     * by immediate using the variable shift operations.
     */
    constimm = tcg_constant_i64(dup_const(a->size, a->shift));

    for (pass = 0; pass < a->q + 1; pass++) {
        TCGv_i64 tmp = tcg_temp_new_i64();

        read_neon_element64(tmp, a->vm, pass, MO_64);
        fn(tmp, cpu_env, tmp, constimm);
        write_neon_element64(tmp, a->vd, pass, MO_64);
    }
    return true;
}

static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
                             NeonGenTwoOpEnvFn *fn)
{
    /*
     * 2-reg-and-shift operations, size < 3 case, where the
     * helper needs to be passed cpu_env.
     */
    TCGv_i32 constimm, tmp;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * To avoid excessive duplication of ops we implement shift
     * by immediate using the variable shift operations.
     */
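    /* e.g. size == 0 (8-bit lanes), shift == 3 gives constimm 0x03030303 */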
    constimm = tcg_constant_i32(dup_const(a->size, a->shift));
    tmp = tcg_temp_new_i32();

    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
        read_neon_element32(tmp, a->vm, pass, MO_32);
        fn(tmp, cpu_env, tmp, constimm);
        write_neon_element32(tmp, a->vd, pass, MO_32);
    }
    return true;
}

#define DO_2SHIFT_ENV(INSN, FUNC)                                       \
    static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
    {                                                                   \
        return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
    }                                                                   \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        static NeonGenTwoOpEnvFn * const fns[] = {                      \
            gen_helper_neon_##FUNC##8,                                  \
            gen_helper_neon_##FUNC##16,                                 \
            gen_helper_neon_##FUNC##32,                                 \
        };                                                              \
        assert(a->size < ARRAY_SIZE(fns));                              \
        return do_2shift_env_32(s, a, fns[a->size]);                    \
    }

DO_2SHIFT_ENV(VQSHLU, qshlu_s)
DO_2SHIFT_ENV(VQSHL_U, qshl_u)
DO_2SHIFT_ENV(VQSHL_S, qshl_s)

static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
                                NeonGenTwo64OpFn *shiftfn,
                                NeonGenNarrowEnvFn *narrowfn)
{
    /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
    TCGv_i64 constimm, rm1, rm2;
    TCGv_i32 rd;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vm & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * This is always a right shift, and the shiftfn is always a
     * left-shift helper, which thus needs the negated shift count.
     */
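    /* E.g. VSHRN.I64 #16 is performed as gen_ushl_i64 with count -16. */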
    constimm = tcg_constant_i64(-a->shift);
    rm1 = tcg_temp_new_i64();
    rm2 = tcg_temp_new_i64();
    rd = tcg_temp_new_i32();

    /* Load both inputs first to avoid potential overwrite if rm == rd */
    read_neon_element64(rm1, a->vm, 0, MO_64);
    read_neon_element64(rm2, a->vm, 1, MO_64);

    shiftfn(rm1, rm1, constimm);
    narrowfn(rd, cpu_env, rm1);
    write_neon_element32(rd, a->vd, 0, MO_32);

    shiftfn(rm2, rm2, constimm);
    narrowfn(rd, cpu_env, rm2);
    write_neon_element32(rd, a->vd, 1, MO_32);

    return true;
}

static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
                                NeonGenTwoOpFn *shiftfn,
                                NeonGenNarrowEnvFn *narrowfn)
{
    /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
    TCGv_i32 constimm, rm1, rm2, rm3, rm4;
    TCGv_i64 rtmp;
    uint32_t imm;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vm & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * This is always a right shift, and the shiftfn is always a
     * left-shift helper, which thus needs the negated shift count
     * duplicated into each lane of the immediate value.
     */
    if (a->size == 1) {
        imm = (uint16_t)(-a->shift);
        imm |= imm << 16;
    } else {
        /* size == 2 */
        imm = -a->shift;
    }
1502     constimm = tcg_constant_i32(imm);
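    /*
     * Worked example: for size == 1 and shift == 5, imm becomes
     * (uint16_t)-5 = 0xfffb, duplicated to 0xfffbfffb, so each 16-bit
     * lane of constimm holds -5 and the left-shift helper ends up
     * shifting right by 5.
     */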
1503 
1504     /* Load all inputs first to avoid potential overwrite */
1505     rm1 = tcg_temp_new_i32();
1506     rm2 = tcg_temp_new_i32();
1507     rm3 = tcg_temp_new_i32();
1508     rm4 = tcg_temp_new_i32();
1509     read_neon_element32(rm1, a->vm, 0, MO_32);
1510     read_neon_element32(rm2, a->vm, 1, MO_32);
1511     read_neon_element32(rm3, a->vm, 2, MO_32);
1512     read_neon_element32(rm4, a->vm, 3, MO_32);
1513     rtmp = tcg_temp_new_i64();
1514 
1515     shiftfn(rm1, rm1, constimm);
1516     shiftfn(rm2, rm2, constimm);
1517 
1518     tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1519 
1520     narrowfn(rm1, cpu_env, rtmp);
1521     write_neon_element32(rm1, a->vd, 0, MO_32);
1522 
1523     shiftfn(rm3, rm3, constimm);
1524     shiftfn(rm4, rm4, constimm);
1525 
1526     tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1527 
1528     narrowfn(rm3, cpu_env, rtmp);
1529     write_neon_element32(rm3, a->vd, 1, MO_32);
1530     return true;
1531 }
1532 
1533 #define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
1534     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1535     {                                                                   \
1536         return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
1537     }
1538 #define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
1539     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1540     {                                                                   \
1541         return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
1542     }
1543 
1544 static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1545 {
1546     tcg_gen_extrl_i64_i32(dest, src);
1547 }
1548 
1549 static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1550 {
1551     gen_helper_neon_narrow_u16(dest, src);
1552 }
1553 
1554 static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1555 {
1556     gen_helper_neon_narrow_u8(dest, src);
1557 }
1558 
1559 DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
1560 DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
1561 DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)
1562 
1563 DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
1564 DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
1565 DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)
1566 
1567 DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
1568 DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
1569 DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)
1570 
1571 DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
1572 DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
1573 DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
1574 DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
1575 DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
1576 DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)
1577 
1578 DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
1579 DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
1580 DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)
1581 
1582 DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
1583 DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
1584 DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
1585 
1586 DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
1587 DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
1588 DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1589 
1590 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1591                          NeonGenWidenFn *widenfn, bool u)
1592 {
1593     TCGv_i64 tmp;
1594     TCGv_i32 rm0, rm1;
1595     uint64_t widen_mask = 0;
1596 
1597     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1598         return false;
1599     }
1600 
1601     /* UNDEF accesses to D16-D31 if they don't exist. */
1602     if (!dc_isar_feature(aa32_simd_r32, s) &&
1603         ((a->vd | a->vm) & 0x10)) {
1604         return false;
1605     }
1606 
1607     if (a->vd & 1) {
1608         return false;
1609     }
1610 
1611     if (!vfp_access_check(s)) {
1612         return true;
1613     }
1614 
1615     /*
1616      * This is a widen-and-shift operation. The shift is always less
1617      * than the width of the source type, so after widening the input
1618      * vector we can simply shift the whole 64-bit widened register,
1619      * and then clear the bits where the high bits of each narrow
1620      * input would otherwise spill into the low bits of its more
1621      * significant neighbour. Calculate a mask of the bits to clear.
1622      */
1623     if ((a->shift != 0) && (a->size < 2 || u)) {
1624         int esize = 8 << a->size;
1625         widen_mask = MAKE_64BIT_MASK(0, esize);
1626         widen_mask >>= esize - a->shift;
1627         widen_mask = dup_const(a->size + 1, widen_mask);
1628     }
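    /*
     * Worked example: a signed VSHLL with size == 0 (8-bit elements)
     * and shift == 3 gives esize = 8, widen_mask = 0xff >> 5 = 0x07,
     * duplicated across the 16-bit lanes to 0x0007000700070007; the
     * AND with ~widen_mask below then clears the three low bits of
     * each widened lane, which would otherwise hold the sign-extension
     * bits of the lane's less significant neighbour.
     */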
1629 
1630     rm0 = tcg_temp_new_i32();
1631     rm1 = tcg_temp_new_i32();
1632     read_neon_element32(rm0, a->vm, 0, MO_32);
1633     read_neon_element32(rm1, a->vm, 1, MO_32);
1634     tmp = tcg_temp_new_i64();
1635 
1636     widenfn(tmp, rm0);
1637     if (a->shift != 0) {
1638         tcg_gen_shli_i64(tmp, tmp, a->shift);
1639         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1640     }
1641     write_neon_element64(tmp, a->vd, 0, MO_64);
1642 
1643     widenfn(tmp, rm1);
1644     if (a->shift != 0) {
1645         tcg_gen_shli_i64(tmp, tmp, a->shift);
1646         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1647     }
1648     write_neon_element64(tmp, a->vd, 1, MO_64);
1649     return true;
1650 }
1651 
1652 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1653 {
1654     static NeonGenWidenFn * const widenfn[] = {
1655         gen_helper_neon_widen_s8,
1656         gen_helper_neon_widen_s16,
1657         tcg_gen_ext_i32_i64,
1658     };
1659     return do_vshll_2sh(s, a, widenfn[a->size], false);
1660 }
1661 
1662 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1663 {
1664     static NeonGenWidenFn * const widenfn[] = {
1665         gen_helper_neon_widen_u8,
1666         gen_helper_neon_widen_u16,
1667         tcg_gen_extu_i32_i64,
1668     };
1669     return do_vshll_2sh(s, a, widenfn[a->size], true);
1670 }
1671 
1672 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1673                       gen_helper_gvec_2_ptr *fn)
1674 {
1675     /* FP operations in 2-reg-and-shift group */
1676     int vec_size = a->q ? 16 : 8;
1677     int rd_ofs = neon_full_reg_offset(a->vd);
1678     int rm_ofs = neon_full_reg_offset(a->vm);
1679     TCGv_ptr fpst;
1680 
1681     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1682         return false;
1683     }
1684 
1685     if (a->size == MO_16) {
1686         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1687             return false;
1688         }
1689     }
1690 
1691     /* UNDEF accesses to D16-D31 if they don't exist. */
1692     if (!dc_isar_feature(aa32_simd_r32, s) &&
1693         ((a->vd | a->vm) & 0x10)) {
1694         return false;
1695     }
1696 
1697     if ((a->vm | a->vd) & a->q) {
1698         return false;
1699     }
1700 
1701     if (!vfp_access_check(s)) {
1702         return true;
1703     }
1704 
1705     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1706     tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
1707     return true;
1708 }
1709 
1710 #define DO_FP_2SH(INSN, FUNC)                                           \
1711     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1712     {                                                                   \
1713         return do_fp_2sh(s, a, FUNC);                                   \
1714     }
1715 
1716 DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
1717 DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
1718 DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
1719 DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
1720 
1721 DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
1722 DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
1723 DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
1724 DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1725 
1726 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1727                         GVecGen2iFn *fn)
1728 {
1729     uint64_t imm;
1730     int reg_ofs, vec_size;
1731 
1732     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1733         return false;
1734     }
1735 
1736     /* UNDEF accesses to D16-D31 if they don't exist. */
1737     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1738         return false;
1739     }
1740 
1741     if (a->vd & a->q) {
1742         return false;
1743     }
1744 
1745     if (!vfp_access_check(s)) {
1746         return true;
1747     }
1748 
1749     reg_ofs = neon_full_reg_offset(a->vd);
1750     vec_size = a->q ? 16 : 8;
1751     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1752 
1753     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1754     return true;
1755 }
1756 
1757 static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1758                         int64_t c, uint32_t oprsz, uint32_t maxsz)
1759 {
1760     tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1761 }
1762 
1763 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1764 {
1765     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1766     GVecGen2iFn *fn;
1767 
1768     if ((a->cmode & 1) && a->cmode < 12) {
1769         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1770         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1771     } else {
1772         /* There is one unallocated cmode/op combination in this space */
1773         if (a->cmode == 15 && a->op == 1) {
1774             return false;
1775         }
1776         fn = gen_VMOV_1r;
1777     }
1778     return do_1reg_imm(s, a, fn);
1779 }
1780 
1781 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1782                            NeonGenWidenFn *widenfn,
1783                            NeonGenTwo64OpFn *opfn,
1784                            int src1_mop, int src2_mop)
1785 {
1786     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */
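    /*
     * src1_mop/src2_mop: a MemOp with which to load that input directly
     * at the double width, or a negative value meaning "load 32 bits
     * and widen with widenfn".
     */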
1787     TCGv_i64 rn0_64, rn1_64, rm_64;
1788 
1789     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1790         return false;
1791     }
1792 
1793     /* UNDEF accesses to D16-D31 if they don't exist. */
1794     if (!dc_isar_feature(aa32_simd_r32, s) &&
1795         ((a->vd | a->vn | a->vm) & 0x10)) {
1796         return false;
1797     }
1798 
1799     if (!opfn) {
1800         /* size == 3 case, which is an entirely different insn group */
1801         return false;
1802     }
1803 
1804     if ((a->vd & 1) || (src1_mop == MO_UQ && (a->vn & 1))) {
1805         return false;
1806     }
1807 
1808     if (!vfp_access_check(s)) {
1809         return true;
1810     }
1811 
1812     rn0_64 = tcg_temp_new_i64();
1813     rn1_64 = tcg_temp_new_i64();
1814     rm_64 = tcg_temp_new_i64();
1815 
1816     if (src1_mop >= 0) {
1817         read_neon_element64(rn0_64, a->vn, 0, src1_mop);
1818     } else {
1819         TCGv_i32 tmp = tcg_temp_new_i32();
1820         read_neon_element32(tmp, a->vn, 0, MO_32);
1821         widenfn(rn0_64, tmp);
1822     }
1823     if (src2_mop >= 0) {
1824         read_neon_element64(rm_64, a->vm, 0, src2_mop);
1825     } else {
1826         TCGv_i32 tmp = tcg_temp_new_i32();
1827         read_neon_element32(tmp, a->vm, 0, MO_32);
1828         widenfn(rm_64, tmp);
1829     }
1830 
1831     opfn(rn0_64, rn0_64, rm_64);
1832 
1833     /*
1834      * Load second pass inputs before storing the first pass result, to
1835      * avoid incorrect results if a narrow input overlaps with the result.
1836      */
1837     if (src1_mop >= 0) {
1838         read_neon_element64(rn1_64, a->vn, 1, src1_mop);
1839     } else {
1840         TCGv_i32 tmp = tcg_temp_new_i32();
1841         read_neon_element32(tmp, a->vn, 1, MO_32);
1842         widenfn(rn1_64, tmp);
1843     }
1844     if (src2_mop >= 0) {
1845         read_neon_element64(rm_64, a->vm, 1, src2_mop);
1846     } else {
1847         TCGv_i32 tmp = tcg_temp_new_i32();
1848         read_neon_element32(tmp, a->vm, 1, MO_32);
1849         widenfn(rm_64, tmp);
1850     }
1851 
1852     write_neon_element64(rn0_64, a->vd, 0, MO_64);
1853 
1854     opfn(rn1_64, rn1_64, rm_64);
1855     write_neon_element64(rn1_64, a->vd, 1, MO_64);
1856 
1857     return true;
1858 }
1859 
1860 #define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN)                        \
1861     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1862     {                                                                   \
1863         static NeonGenWidenFn * const widenfn[] = {                     \
1864             gen_helper_neon_widen_##S##8,                               \
1865             gen_helper_neon_widen_##S##16,                              \
1866             NULL, NULL,                                                 \
1867         };                                                              \
1868         static NeonGenTwo64OpFn * const addfn[] = {                     \
1869             gen_helper_neon_##OP##l_u16,                                \
1870             gen_helper_neon_##OP##l_u32,                                \
1871             tcg_gen_##OP##_i64,                                         \
1872             NULL,                                                       \
1873         };                                                              \
1874         int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1;          \
1875         return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size],   \
1876                               SRC1WIDE ? MO_UQ : narrow_mop,            \
1877                               narrow_mop);                              \
1878     }
1879 
1880 DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
1881 DO_PREWIDEN(VADDL_U, u, add, false, 0)
1882 DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
1883 DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
1884 DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
1885 DO_PREWIDEN(VADDW_U, u, add, true, 0)
1886 DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
1887 DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
1888 
1889 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
1890                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
1891 {
1892     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
1893     TCGv_i64 rn_64, rm_64;
1894     TCGv_i32 rd0, rd1;
1895 
1896     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1897         return false;
1898     }
1899 
1900     /* UNDEF accesses to D16-D31 if they don't exist. */
1901     if (!dc_isar_feature(aa32_simd_r32, s) &&
1902         ((a->vd | a->vn | a->vm) & 0x10)) {
1903         return false;
1904     }
1905 
1906     if (!opfn || !narrowfn) {
1907         /* size == 3 case, which is an entirely different insn group */
1908         return false;
1909     }
1910 
1911     if ((a->vn | a->vm) & 1) {
1912         return false;
1913     }
1914 
1915     if (!vfp_access_check(s)) {
1916         return true;
1917     }
1918 
1919     rn_64 = tcg_temp_new_i64();
1920     rm_64 = tcg_temp_new_i64();
1921     rd0 = tcg_temp_new_i32();
1922     rd1 = tcg_temp_new_i32();
1923 
1924     read_neon_element64(rn_64, a->vn, 0, MO_64);
1925     read_neon_element64(rm_64, a->vm, 0, MO_64);
1926 
1927     opfn(rn_64, rn_64, rm_64);
1928 
1929     narrowfn(rd0, rn_64);
1930 
1931     read_neon_element64(rn_64, a->vn, 1, MO_64);
1932     read_neon_element64(rm_64, a->vm, 1, MO_64);
1933 
1934     opfn(rn_64, rn_64, rm_64);
1935 
1936     narrowfn(rd1, rn_64);
1937 
1938     write_neon_element32(rd0, a->vd, 0, MO_32);
1939     write_neon_element32(rd1, a->vd, 1, MO_32);
1940 
1941     return true;
1942 }
1943 
1944 #define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
1945     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1946     {                                                                   \
1947         static NeonGenTwo64OpFn * const addfn[] = {                     \
1948             gen_helper_neon_##OP##l_u16,                                \
1949             gen_helper_neon_##OP##l_u32,                                \
1950             tcg_gen_##OP##_i64,                                         \
1951             NULL,                                                       \
1952         };                                                              \
1953         static NeonGenNarrowFn * const narrowfn[] = {                   \
1954             gen_helper_neon_##NARROWTYPE##_high_u8,                     \
1955             gen_helper_neon_##NARROWTYPE##_high_u16,                    \
1956             EXTOP,                                                      \
1957             NULL,                                                       \
1958         };                                                              \
1959         return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]);   \
1960     }
1961 
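/* Round by adding 1 << 31, half the weight of the discarded low 32 bits */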
1962 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
1963 {
1964     tcg_gen_addi_i64(rn, rn, 1u << 31);
1965     tcg_gen_extrh_i64_i32(rd, rn);
1966 }
1967 
1968 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
1969 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
1970 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
1971 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
1972 
1973 static bool do_long_3d(DisasContext *s, arg_3diff *a,
1974                        NeonGenTwoOpWidenFn *opfn,
1975                        NeonGenTwo64OpFn *accfn)
1976 {
1977     /*
1978      * 3-regs different lengths, long operations.
1979      * These perform an operation on two inputs that returns a double-width
1980      * result, and then possibly perform an accumulation operation of
1981      * that result into the double-width destination.
1982      */
1983     TCGv_i64 rd0, rd1, tmp;
1984     TCGv_i32 rn, rm;
1985 
1986     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1987         return false;
1988     }
1989 
1990     /* UNDEF accesses to D16-D31 if they don't exist. */
1991     if (!dc_isar_feature(aa32_simd_r32, s) &&
1992         ((a->vd | a->vn | a->vm) & 0x10)) {
1993         return false;
1994     }
1995 
1996     if (!opfn) {
1997         /* size == 3 case, which is an entirely different insn group */
1998         return false;
1999     }
2000 
2001     if (a->vd & 1) {
2002         return false;
2003     }
2004 
2005     if (!vfp_access_check(s)) {
2006         return true;
2007     }
2008 
2009     rd0 = tcg_temp_new_i64();
2010     rd1 = tcg_temp_new_i64();
2011 
2012     rn = tcg_temp_new_i32();
2013     rm = tcg_temp_new_i32();
2014     read_neon_element32(rn, a->vn, 0, MO_32);
2015     read_neon_element32(rm, a->vm, 0, MO_32);
2016     opfn(rd0, rn, rm);
2017 
2018     read_neon_element32(rn, a->vn, 1, MO_32);
2019     read_neon_element32(rm, a->vm, 1, MO_32);
2020     opfn(rd1, rn, rm);
2021 
2022     /* Don't store results until after all loads: they might overlap */
2023     if (accfn) {
2024         tmp = tcg_temp_new_i64();
2025         read_neon_element64(tmp, a->vd, 0, MO_64);
2026         accfn(rd0, tmp, rd0);
2027         read_neon_element64(tmp, a->vd, 1, MO_64);
2028         accfn(rd1, tmp, rd1);
2029     }
2030 
2031     write_neon_element64(rd0, a->vd, 0, MO_64);
2032     write_neon_element64(rd1, a->vd, 1, MO_64);
2033 
2034     return true;
2035 }
2036 
2037 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2038 {
2039     static NeonGenTwoOpWidenFn * const opfn[] = {
2040         gen_helper_neon_abdl_s16,
2041         gen_helper_neon_abdl_s32,
2042         gen_helper_neon_abdl_s64,
2043         NULL,
2044     };
2045 
2046     return do_long_3d(s, a, opfn[a->size], NULL);
2047 }
2048 
2049 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2050 {
2051     static NeonGenTwoOpWidenFn * const opfn[] = {
2052         gen_helper_neon_abdl_u16,
2053         gen_helper_neon_abdl_u32,
2054         gen_helper_neon_abdl_u64,
2055         NULL,
2056     };
2057 
2058     return do_long_3d(s, a, opfn[a->size], NULL);
2059 }
2060 
2061 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2062 {
2063     static NeonGenTwoOpWidenFn * const opfn[] = {
2064         gen_helper_neon_abdl_s16,
2065         gen_helper_neon_abdl_s32,
2066         gen_helper_neon_abdl_s64,
2067         NULL,
2068     };
2069     static NeonGenTwo64OpFn * const addfn[] = {
2070         gen_helper_neon_addl_u16,
2071         gen_helper_neon_addl_u32,
2072         tcg_gen_add_i64,
2073         NULL,
2074     };
2075 
2076     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2077 }
2078 
2079 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2080 {
2081     static NeonGenTwoOpWidenFn * const opfn[] = {
2082         gen_helper_neon_abdl_u16,
2083         gen_helper_neon_abdl_u32,
2084         gen_helper_neon_abdl_u64,
2085         NULL,
2086     };
2087     static NeonGenTwo64OpFn * const addfn[] = {
2088         gen_helper_neon_addl_u16,
2089         gen_helper_neon_addl_u32,
2090         tcg_gen_add_i64,
2091         NULL,
2092     };
2093 
2094     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2095 }
2096 
2097 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2098 {
2099     TCGv_i32 lo = tcg_temp_new_i32();
2100     TCGv_i32 hi = tcg_temp_new_i32();
2101 
2102     tcg_gen_muls2_i32(lo, hi, rn, rm);
2103     tcg_gen_concat_i32_i64(rd, lo, hi);
2104 }
2105 
2106 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2107 {
2108     TCGv_i32 lo = tcg_temp_new_i32();
2109     TCGv_i32 hi = tcg_temp_new_i32();
2110 
2111     tcg_gen_mulu2_i32(lo, hi, rn, rm);
2112     tcg_gen_concat_i32_i64(rd, lo, hi);
2113 }
2114 
2115 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2116 {
2117     static NeonGenTwoOpWidenFn * const opfn[] = {
2118         gen_helper_neon_mull_s8,
2119         gen_helper_neon_mull_s16,
2120         gen_mull_s32,
2121         NULL,
2122     };
2123 
2124     return do_long_3d(s, a, opfn[a->size], NULL);
2125 }
2126 
2127 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2128 {
2129     static NeonGenTwoOpWidenFn * const opfn[] = {
2130         gen_helper_neon_mull_u8,
2131         gen_helper_neon_mull_u16,
2132         gen_mull_u32,
2133         NULL,
2134     };
2135 
2136     return do_long_3d(s, a, opfn[a->size], NULL);
2137 }
2138 
2139 #define DO_VMLAL(INSN, MULL, ACC)                                       \
2140     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2141     {                                                                   \
2142         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2143             gen_helper_neon_##MULL##8,                                  \
2144             gen_helper_neon_##MULL##16,                                 \
2145             gen_##MULL##32,                                             \
2146             NULL,                                                       \
2147         };                                                              \
2148         static NeonGenTwo64OpFn * const accfn[] = {                     \
2149             gen_helper_neon_##ACC##l_u16,                               \
2150             gen_helper_neon_##ACC##l_u32,                               \
2151             tcg_gen_##ACC##_i64,                                        \
2152             NULL,                                                       \
2153         };                                                              \
2154         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2155     }
2156 
2157 DO_VMLAL(VMLAL_S, mull_s, add)
2158 DO_VMLAL(VMLAL_U, mull_u, add)
2159 DO_VMLAL(VMLSL_S, mull_s, sub)
2160 DO_VMLAL(VMLSL_U, mull_u, sub)
2161 
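/*
 * VQDMULL: a widening signed multiply whose result is then doubled by
 * saturating-adding the product to itself.
 */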
2162 static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2163 {
2164     gen_helper_neon_mull_s16(rd, rn, rm);
2165     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
2166 }
2167 
2168 static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2169 {
2170     gen_mull_s32(rd, rn, rm);
2171     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
2172 }
2173 
2174 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2175 {
2176     static NeonGenTwoOpWidenFn * const opfn[] = {
2177         NULL,
2178         gen_VQDMULL_16,
2179         gen_VQDMULL_32,
2180         NULL,
2181     };
2182 
2183     return do_long_3d(s, a, opfn[a->size], NULL);
2184 }
2185 
2186 static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2187 {
2188     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2189 }
2190 
2191 static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2192 {
2193     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2194 }
2195 
2196 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2197 {
2198     static NeonGenTwoOpWidenFn * const opfn[] = {
2199         NULL,
2200         gen_VQDMULL_16,
2201         gen_VQDMULL_32,
2202         NULL,
2203     };
2204     static NeonGenTwo64OpFn * const accfn[] = {
2205         NULL,
2206         gen_VQDMLAL_acc_16,
2207         gen_VQDMLAL_acc_32,
2208         NULL,
2209     };
2210 
2211     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2212 }
2213 
2214 static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2215 {
2216     gen_helper_neon_negl_u32(rm, rm);
2217     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2218 }
2219 
2220 static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2221 {
2222     tcg_gen_neg_i64(rm, rm);
2223     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2224 }
2225 
2226 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2227 {
2228     static NeonGenTwoOpWidenFn * const opfn[] = {
2229         NULL,
2230         gen_VQDMULL_16,
2231         gen_VQDMULL_32,
2232         NULL,
2233     };
2234     static NeonGenTwo64OpFn * const accfn[] = {
2235         NULL,
2236         gen_VQDMLSL_acc_16,
2237         gen_VQDMLSL_acc_32,
2238         NULL,
2239     };
2240 
2241     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2242 }
2243 
2244 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2245 {
2246     gen_helper_gvec_3 *fn_gvec;
2247 
2248     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2249         return false;
2250     }
2251 
2252     /* UNDEF accesses to D16-D31 if they don't exist. */
2253     if (!dc_isar_feature(aa32_simd_r32, s) &&
2254         ((a->vd | a->vn | a->vm) & 0x10)) {
2255         return false;
2256     }
2257 
2258     if (a->vd & 1) {
2259         return false;
2260     }
2261 
2262     switch (a->size) {
2263     case 0:
2264         fn_gvec = gen_helper_neon_pmull_h;
2265         break;
2266     case 2:
2267         if (!dc_isar_feature(aa32_pmull, s)) {
2268             return false;
2269         }
2270         fn_gvec = gen_helper_gvec_pmull_q;
2271         break;
2272     default:
2273         return false;
2274     }
2275 
2276     if (!vfp_access_check(s)) {
2277         return true;
2278     }
2279 
2280     tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
2281                        neon_full_reg_offset(a->vn),
2282                        neon_full_reg_offset(a->vm),
2283                        16, 16, 0, fn_gvec);
2284     return true;
2285 }
2286 
2287 static void gen_neon_dup_low16(TCGv_i32 var)
2288 {
2289     TCGv_i32 tmp = tcg_temp_new_i32();
2290     tcg_gen_ext16u_i32(var, var);
2291     tcg_gen_shli_i32(tmp, var, 16);
2292     tcg_gen_or_i32(var, var, tmp);
2293 }
2294 
2295 static void gen_neon_dup_high16(TCGv_i32 var)
2296 {
2297     TCGv_i32 tmp = tcg_temp_new_i32();
2298     tcg_gen_andi_i32(var, var, 0xffff0000);
2299     tcg_gen_shri_i32(tmp, var, 16);
2300     tcg_gen_or_i32(var, var, tmp);
2301 }
2302 
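/*
 * "reg" is the M:Vm scalar encoding: for 16-bit scalars, bits [2:0]
 * select the D register, bit 3 the 16-bit half within a 32-bit element
 * and bit 4 the element; for 32-bit scalars, bits [3:0] select the
 * register and bit 4 the element.
 */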
2303 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2304 {
2305     TCGv_i32 tmp = tcg_temp_new_i32();
2306     if (size == MO_16) {
2307         read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
2308         if (reg & 8) {
2309             gen_neon_dup_high16(tmp);
2310         } else {
2311             gen_neon_dup_low16(tmp);
2312         }
2313     } else {
2314         read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
2315     }
2316     return tmp;
2317 }
2318 
2319 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2320                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2321 {
2322     /*
2323      * Two registers and a scalar: perform an operation between
2324      * the input elements and the scalar, and then possibly
2325      * perform an accumulation operation of that result into the
2326      * destination.
2327      */
2328     TCGv_i32 scalar, tmp;
2329     int pass;
2330 
2331     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2332         return false;
2333     }
2334 
2335     /* UNDEF accesses to D16-D31 if they don't exist. */
2336     if (!dc_isar_feature(aa32_simd_r32, s) &&
2337         ((a->vd | a->vn | a->vm) & 0x10)) {
2338         return false;
2339     }
2340 
2341     if (!opfn) {
2342         /* Bad size (including size == 3, which is a different insn group) */
2343         return false;
2344     }
2345 
2346     if (a->q && ((a->vd | a->vn) & 1)) {
2347         return false;
2348     }
2349 
2350     if (!vfp_access_check(s)) {
2351         return true;
2352     }
2353 
2354     scalar = neon_get_scalar(a->size, a->vm);
2355     tmp = tcg_temp_new_i32();
2356 
2357     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2358         read_neon_element32(tmp, a->vn, pass, MO_32);
2359         opfn(tmp, tmp, scalar);
2360         if (accfn) {
2361             TCGv_i32 rd = tcg_temp_new_i32();
2362             read_neon_element32(rd, a->vd, pass, MO_32);
2363             accfn(tmp, rd, tmp);
2364         }
2365         write_neon_element32(tmp, a->vd, pass, MO_32);
2366     }
2367     return true;
2368 }
2369 
2370 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2371 {
2372     static NeonGenTwoOpFn * const opfn[] = {
2373         NULL,
2374         gen_helper_neon_mul_u16,
2375         tcg_gen_mul_i32,
2376         NULL,
2377     };
2378 
2379     return do_2scalar(s, a, opfn[a->size], NULL);
2380 }
2381 
2382 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2383 {
2384     static NeonGenTwoOpFn * const opfn[] = {
2385         NULL,
2386         gen_helper_neon_mul_u16,
2387         tcg_gen_mul_i32,
2388         NULL,
2389     };
2390     static NeonGenTwoOpFn * const accfn[] = {
2391         NULL,
2392         gen_helper_neon_add_u16,
2393         tcg_gen_add_i32,
2394         NULL,
2395     };
2396 
2397     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2398 }
2399 
2400 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2401 {
2402     static NeonGenTwoOpFn * const opfn[] = {
2403         NULL,
2404         gen_helper_neon_mul_u16,
2405         tcg_gen_mul_i32,
2406         NULL,
2407     };
2408     static NeonGenTwoOpFn * const accfn[] = {
2409         NULL,
2410         gen_helper_neon_sub_u16,
2411         tcg_gen_sub_i32,
2412         NULL,
2413     };
2414 
2415     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2416 }
2417 
2418 static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
2419                               gen_helper_gvec_3_ptr *fn)
2420 {
2421     /* Two registers and a scalar, using gvec */
2422     int vec_size = a->q ? 16 : 8;
2423     int rd_ofs = neon_full_reg_offset(a->vd);
2424     int rn_ofs = neon_full_reg_offset(a->vn);
2425     int rm_ofs;
2426     int idx;
2427     TCGv_ptr fpstatus;
2428 
2429     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2430         return false;
2431     }
2432 
2433     /* UNDEF accesses to D16-D31 if they don't exist. */
2434     if (!dc_isar_feature(aa32_simd_r32, s) &&
2435         ((a->vd | a->vn | a->vm) & 0x10)) {
2436         return false;
2437     }
2438 
2439     if (!fn) {
2440         /* Bad size (including size == 3, which is a different insn group) */
2441         return false;
2442     }
2443 
2444     if (a->q && ((a->vd | a->vn) & 1)) {
2445         return false;
2446     }
2447 
2448     if (!vfp_access_check(s)) {
2449         return true;
2450     }
2451 
2452     /* a->vm is M:Vm, which encodes both register and index */
2453     idx = extract32(a->vm, a->size + 2, 2);
2454     a->vm = extract32(a->vm, 0, a->size + 2);
2455     rm_ofs = neon_full_reg_offset(a->vm);
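    /*
     * E.g. for size == MO_16 this takes the scalar index from vm
     * bits [4:3] and the D register from bits [2:0]; for MO_32 the
     * index comes from bits [5:4] and the register from bits [3:0].
     */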
2456 
2457     fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
2458     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
2459                        vec_size, vec_size, idx, fn);
2460     return true;
2461 }
2462 
2463 #define DO_VMUL_F_2sc(NAME, FUNC)                                       \
2464     static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a)   \
2465     {                                                                   \
2466         static gen_helper_gvec_3_ptr * const opfn[] = {                 \
2467             NULL,                                                       \
2468             gen_helper_##FUNC##_h,                                      \
2469             gen_helper_##FUNC##_s,                                      \
2470             NULL,                                                       \
2471         };                                                              \
2472         if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
2473             return false;                                               \
2474         }                                                               \
2475         return do_2scalar_fp_vec(s, a, opfn[a->size]);                  \
2476     }
2477 
2478 DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
2479 DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
2480 DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
2481 
2482 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2483 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2484 WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2485 WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2486 
2487 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2488 {
2489     static NeonGenTwoOpFn * const opfn[] = {
2490         NULL,
2491         gen_VQDMULH_16,
2492         gen_VQDMULH_32,
2493         NULL,
2494     };
2495 
2496     return do_2scalar(s, a, opfn[a->size], NULL);
2497 }
2498 
2499 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2500 {
2501     static NeonGenTwoOpFn * const opfn[] = {
2502         NULL,
2503         gen_VQRDMULH_16,
2504         gen_VQRDMULH_32,
2505         NULL,
2506     };
2507 
2508     return do_2scalar(s, a, opfn[a->size], NULL);
2509 }
2510 
2511 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2512                             NeonGenThreeOpEnvFn *opfn)
2513 {
2514     /*
2515      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2516      * performs a kind of fused op-then-accumulate using a helper
2517      * function that takes all of rd, rn and the scalar at once.
2518      */
2519     TCGv_i32 scalar, rn, rd;
2520     int pass;
2521 
2522     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2523         return false;
2524     }
2525 
2526     if (!dc_isar_feature(aa32_rdm, s)) {
2527         return false;
2528     }
2529 
2530     /* UNDEF accesses to D16-D31 if they don't exist. */
2531     if (!dc_isar_feature(aa32_simd_r32, s) &&
2532         ((a->vd | a->vn | a->vm) & 0x10)) {
2533         return false;
2534     }
2535 
2536     if (!opfn) {
2537         /* Bad size (including size == 3, which is a different insn group) */
2538         return false;
2539     }
2540 
2541     if (a->q && ((a->vd | a->vn) & 1)) {
2542         return false;
2543     }
2544 
2545     if (!vfp_access_check(s)) {
2546         return true;
2547     }
2548 
2549     scalar = neon_get_scalar(a->size, a->vm);
2550     rn = tcg_temp_new_i32();
2551     rd = tcg_temp_new_i32();
2552 
2553     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2554         read_neon_element32(rn, a->vn, pass, MO_32);
2555         read_neon_element32(rd, a->vd, pass, MO_32);
2556         opfn(rd, cpu_env, rn, scalar, rd);
2557         write_neon_element32(rd, a->vd, pass, MO_32);
2558     }
2559     return true;
2560 }
2561 
2562 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2563 {
2564     static NeonGenThreeOpEnvFn *opfn[] = {
2565         NULL,
2566         gen_helper_neon_qrdmlah_s16,
2567         gen_helper_neon_qrdmlah_s32,
2568         NULL,
2569     };
2570     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2571 }
2572 
2573 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2574 {
2575     static NeonGenThreeOpEnvFn *opfn[] = {
2576         NULL,
2577         gen_helper_neon_qrdmlsh_s16,
2578         gen_helper_neon_qrdmlsh_s32,
2579         NULL,
2580     };
2581     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2582 }
2583 
2584 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2585                             NeonGenTwoOpWidenFn *opfn,
2586                             NeonGenTwo64OpFn *accfn)
2587 {
2588     /*
2589      * Two registers and a scalar, long operations: perform an
2590      * operation on the input elements and the scalar which produces
2591      * a double-width result, and then possibly perform an accumulation
2592      * operation of that result into the destination.
2593      */
2594     TCGv_i32 scalar, rn;
2595     TCGv_i64 rn0_64, rn1_64;
2596 
2597     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2598         return false;
2599     }
2600 
2601     /* UNDEF accesses to D16-D31 if they don't exist. */
2602     if (!dc_isar_feature(aa32_simd_r32, s) &&
2603         ((a->vd | a->vn | a->vm) & 0x10)) {
2604         return false;
2605     }
2606 
2607     if (!opfn) {
2608         /* Bad size (including size == 3, which is a different insn group) */
2609         return false;
2610     }
2611 
2612     if (a->vd & 1) {
2613         return false;
2614     }
2615 
2616     if (!vfp_access_check(s)) {
2617         return true;
2618     }
2619 
2620     scalar = neon_get_scalar(a->size, a->vm);
2621 
2622     /* Load all inputs before writing any outputs, in case of overlap */
2623     rn = tcg_temp_new_i32();
2624     read_neon_element32(rn, a->vn, 0, MO_32);
2625     rn0_64 = tcg_temp_new_i64();
2626     opfn(rn0_64, rn, scalar);
2627 
2628     read_neon_element32(rn, a->vn, 1, MO_32);
2629     rn1_64 = tcg_temp_new_i64();
2630     opfn(rn1_64, rn, scalar);
2631 
2632     if (accfn) {
2633         TCGv_i64 t64 = tcg_temp_new_i64();
2634         read_neon_element64(t64, a->vd, 0, MO_64);
2635         accfn(rn0_64, t64, rn0_64);
2636         read_neon_element64(t64, a->vd, 1, MO_64);
2637         accfn(rn1_64, t64, rn1_64);
2638     }
2639 
2640     write_neon_element64(rn0_64, a->vd, 0, MO_64);
2641     write_neon_element64(rn1_64, a->vd, 1, MO_64);
2642     return true;
2643 }
2644 
2645 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2646 {
2647     static NeonGenTwoOpWidenFn * const opfn[] = {
2648         NULL,
2649         gen_helper_neon_mull_s16,
2650         gen_mull_s32,
2651         NULL,
2652     };
2653 
2654     return do_2scalar_long(s, a, opfn[a->size], NULL);
2655 }
2656 
2657 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2658 {
2659     static NeonGenTwoOpWidenFn * const opfn[] = {
2660         NULL,
2661         gen_helper_neon_mull_u16,
2662         gen_mull_u32,
2663         NULL,
2664     };
2665 
2666     return do_2scalar_long(s, a, opfn[a->size], NULL);
2667 }
2668 
2669 #define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2670     static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2671     {                                                                   \
2672         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2673             NULL,                                                       \
2674             gen_helper_neon_##MULL##16,                                 \
2675             gen_##MULL##32,                                             \
2676             NULL,                                                       \
2677         };                                                              \
2678         static NeonGenTwo64OpFn * const accfn[] = {                     \
2679             NULL,                                                       \
2680             gen_helper_neon_##ACC##l_u32,                               \
2681             tcg_gen_##ACC##_i64,                                        \
2682             NULL,                                                       \
2683         };                                                              \
2684         return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2685     }
2686 
2687 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2688 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2689 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2690 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2691 
2692 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2693 {
2694     static NeonGenTwoOpWidenFn * const opfn[] = {
2695         NULL,
2696         gen_VQDMULL_16,
2697         gen_VQDMULL_32,
2698         NULL,
2699     };
2700 
2701     return do_2scalar_long(s, a, opfn[a->size], NULL);
2702 }
2703 
2704 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2705 {
2706     static NeonGenTwoOpWidenFn * const opfn[] = {
2707         NULL,
2708         gen_VQDMULL_16,
2709         gen_VQDMULL_32,
2710         NULL,
2711     };
2712     static NeonGenTwo64OpFn * const accfn[] = {
2713         NULL,
2714         gen_VQDMLAL_acc_16,
2715         gen_VQDMLAL_acc_32,
2716         NULL,
2717     };
2718 
2719     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2720 }
2721 
2722 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2723 {
2724     static NeonGenTwoOpWidenFn * const opfn[] = {
2725         NULL,
2726         gen_VQDMULL_16,
2727         gen_VQDMULL_32,
2728         NULL,
2729     };
2730     static NeonGenTwo64OpFn * const accfn[] = {
2731         NULL,
2732         gen_VQDMLSL_acc_16,
2733         gen_VQDMLSL_acc_32,
2734         NULL,
2735     };
2736 
2737     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2738 }
2739 
2740 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2741 {
2742     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2743         return false;
2744     }
2745 
2746     /* UNDEF accesses to D16-D31 if they don't exist. */
2747     if (!dc_isar_feature(aa32_simd_r32, s) &&
2748         ((a->vd | a->vn | a->vm) & 0x10)) {
2749         return false;
2750     }
2751 
2752     if ((a->vn | a->vm | a->vd) & a->q) {
2753         return false;
2754     }
2755 
2756     if (a->imm > 7 && !a->q) {
2757         return false;
2758     }
2759 
2760     if (!vfp_access_check(s)) {
2761         return true;
2762     }
2763 
2764     if (!a->q) {
2765         /* Extract 64 bits from <Vm:Vn> */
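        /*
         * E.g. with imm == 3 the extract2 below computes
         * (Vn >> 24) | (Vm << 40): the low five bytes of the result
         * are the top five bytes of Vn and the high three bytes are
         * the low three bytes of Vm.
         */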
2766         TCGv_i64 left, right, dest;
2767 
2768         left = tcg_temp_new_i64();
2769         right = tcg_temp_new_i64();
2770         dest = tcg_temp_new_i64();
2771 
2772         read_neon_element64(right, a->vn, 0, MO_64);
2773         read_neon_element64(left, a->vm, 0, MO_64);
2774         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2775         write_neon_element64(dest, a->vd, 0, MO_64);
2776     } else {
2777         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2778         TCGv_i64 left, middle, right, destleft, destright;
2779 
2780         left = tcg_temp_new_i64();
2781         middle = tcg_temp_new_i64();
2782         right = tcg_temp_new_i64();
2783         destleft = tcg_temp_new_i64();
2784         destright = tcg_temp_new_i64();
2785 
2786         if (a->imm < 8) {
2787             read_neon_element64(right, a->vn, 0, MO_64);
2788             read_neon_element64(middle, a->vn, 1, MO_64);
2789             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2790             read_neon_element64(left, a->vm, 0, MO_64);
2791             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2792         } else {
2793             read_neon_element64(right, a->vn, 1, MO_64);
2794             read_neon_element64(middle, a->vm, 0, MO_64);
2795             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2796             read_neon_element64(left, a->vm, 1, MO_64);
2797             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2798         }
2799 
2800         write_neon_element64(destright, a->vd, 0, MO_64);
2801         write_neon_element64(destleft, a->vd, 1, MO_64);
2802     }
2803     return true;
2804 }
2805 
2806 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2807 {
2808     TCGv_i64 val, def;
2809     TCGv_i32 desc;
2810 
2811     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2812         return false;
2813     }
2814 
2815     /* UNDEF accesses to D16-D31 if they don't exist. */
2816     if (!dc_isar_feature(aa32_simd_r32, s) &&
2817         ((a->vd | a->vn | a->vm) & 0x10)) {
2818         return false;
2819     }
2820 
2821     if ((a->vn + a->len + 1) > 32) {
2822         /*
2823          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2824          * helper function running off the end of the register file.
2825          */
2826         return false;
2827     }
2828 
2829     if (!vfp_access_check(s)) {
2830         return true;
2831     }
2832 
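    /* desc packs the table base register index and the table length */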
2833     desc = tcg_constant_i32((a->vn << 2) | a->len);
2834     def = tcg_temp_new_i64();
2835     if (a->op) {
2836         read_neon_element64(def, a->vd, 0, MO_64);
2837     } else {
2838         tcg_gen_movi_i64(def, 0);
2839     }
2840     val = tcg_temp_new_i64();
2841     read_neon_element64(val, a->vm, 0, MO_64);
2842 
2843     gen_helper_neon_tbl(val, cpu_env, desc, val, def);
2844     write_neon_element64(val, a->vd, 0, MO_64);
2845     return true;
2846 }
2847 
2848 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
2849 {
2850     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2851         return false;
2852     }
2853 
2854     /* UNDEF accesses to D16-D31 if they don't exist. */
2855     if (!dc_isar_feature(aa32_simd_r32, s) &&
2856         ((a->vd | a->vm) & 0x10)) {
2857         return false;
2858     }
2859 
2860     if (a->vd & a->q) {
2861         return false;
2862     }
2863 
2864     if (!vfp_access_check(s)) {
2865         return true;
2866     }
2867 
2868     tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
2869                          neon_element_offset(a->vm, a->index, a->size),
2870                          a->q ? 16 : 8, a->q ? 16 : 8);
2871     return true;
2872 }
2873 
2874 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
2875 {
2876     int pass, half;
2877     TCGv_i32 tmp[2];
2878 
2879     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2880         return false;
2881     }
2882 
2883     /* UNDEF accesses to D16-D31 if they don't exist. */
2884     if (!dc_isar_feature(aa32_simd_r32, s) &&
2885         ((a->vd | a->vm) & 0x10)) {
2886         return false;
2887     }
2888 
2889     if ((a->vd | a->vm) & a->q) {
2890         return false;
2891     }
2892 
2893     if (a->size == 3) {
2894         return false;
2895     }
2896 
2897     if (!vfp_access_check(s)) {
2898         return true;
2899     }
2900 
2901     tmp[0] = tcg_temp_new_i32();
2902     tmp[1] = tcg_temp_new_i32();
2903 
2904     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
2905         for (half = 0; half < 2; half++) {
2906             read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
2907             switch (a->size) {
2908             case 0:
2909                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
2910                 break;
2911             case 1:
2912                 gen_swap_half(tmp[half], tmp[half]);
2913                 break;
2914             case 2:
2915                 break;
2916             default:
2917                 g_assert_not_reached();
2918             }
2919         }
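        /*
         * Writing half 1 to element pass * 2 and half 0 to pass * 2 + 1
         * swaps the 32-bit halves; combined with the per-size swaps
         * above this reverses each 64-bit element.
         */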
2920         write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
2921         write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
2922     }
2923     return true;
2924 }
2925 
2926 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
2927                               NeonGenWidenFn *widenfn,
2928                               NeonGenTwo64OpFn *opfn,
2929                               NeonGenTwo64OpFn *accfn)
2930 {
2931     /*
2932      * Pairwise long operations: widen both halves of the pair,
2933      * combine the pairs with the opfn, and then possibly accumulate
2934      * into the destination with the accfn.
2935      */
2936     int pass;
2937 
2938     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2939         return false;
2940     }
2941 
2942     /* UNDEF accesses to D16-D31 if they don't exist. */
2943     if (!dc_isar_feature(aa32_simd_r32, s) &&
2944         ((a->vd | a->vm) & 0x10)) {
2945         return false;
2946     }
2947 
2948     if ((a->vd | a->vm) & a->q) {
2949         return false;
2950     }
2951 
2952     if (!widenfn) {
2953         return false;
2954     }
2955 
2956     if (!vfp_access_check(s)) {
2957         return true;
2958     }
2959 
2960     for (pass = 0; pass < a->q + 1; pass++) {
2961         TCGv_i32 tmp;
2962         TCGv_i64 rm0_64, rm1_64, rd_64;
2963 
2964         rm0_64 = tcg_temp_new_i64();
2965         rm1_64 = tcg_temp_new_i64();
2966         rd_64 = tcg_temp_new_i64();
2967 
2968         tmp = tcg_temp_new_i32();
2969         read_neon_element32(tmp, a->vm, pass * 2, MO_32);
2970         widenfn(rm0_64, tmp);
2971         read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
2972         widenfn(rm1_64, tmp);
2973 
2974         opfn(rd_64, rm0_64, rm1_64);
2975 
2976         if (accfn) {
2977             TCGv_i64 tmp64 = tcg_temp_new_i64();
2978             read_neon_element64(tmp64, a->vd, pass, MO_64);
2979             accfn(rd_64, tmp64, rd_64);
2980         }
2981         write_neon_element64(rd_64, a->vd, pass, MO_64);
2982     }
2983     return true;
2984 }
2985 
2986 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
2987 {
2988     static NeonGenWidenFn * const widenfn[] = {
2989         gen_helper_neon_widen_s8,
2990         gen_helper_neon_widen_s16,
2991         tcg_gen_ext_i32_i64,
2992         NULL,
2993     };
2994     static NeonGenTwo64OpFn * const opfn[] = {
2995         gen_helper_neon_paddl_u16,
2996         gen_helper_neon_paddl_u32,
2997         tcg_gen_add_i64,
2998         NULL,
2999     };
3000 
3001     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3002 }
3003 
3004 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3005 {
3006     static NeonGenWidenFn * const widenfn[] = {
3007         gen_helper_neon_widen_u8,
3008         gen_helper_neon_widen_u16,
3009         tcg_gen_extu_i32_i64,
3010         NULL,
3011     };
3012     static NeonGenTwo64OpFn * const opfn[] = {
3013         gen_helper_neon_paddl_u16,
3014         gen_helper_neon_paddl_u32,
3015         tcg_gen_add_i64,
3016         NULL,
3017     };
3018 
3019     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3020 }
3021 
3022 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3023 {
3024     static NeonGenWidenFn * const widenfn[] = {
3025         gen_helper_neon_widen_s8,
3026         gen_helper_neon_widen_s16,
3027         tcg_gen_ext_i32_i64,
3028         NULL,
3029     };
3030     static NeonGenTwo64OpFn * const opfn[] = {
3031         gen_helper_neon_paddl_u16,
3032         gen_helper_neon_paddl_u32,
3033         tcg_gen_add_i64,
3034         NULL,
3035     };
3036     static NeonGenTwo64OpFn * const accfn[] = {
3037         gen_helper_neon_addl_u16,
3038         gen_helper_neon_addl_u32,
3039         tcg_gen_add_i64,
3040         NULL,
3041     };
3042 
3043     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3044                              accfn[a->size]);
3045 }
3046 
3047 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3048 {
3049     static NeonGenWidenFn * const widenfn[] = {
3050         gen_helper_neon_widen_u8,
3051         gen_helper_neon_widen_u16,
3052         tcg_gen_extu_i32_i64,
3053         NULL,
3054     };
3055     static NeonGenTwo64OpFn * const opfn[] = {
3056         gen_helper_neon_paddl_u16,
3057         gen_helper_neon_paddl_u32,
3058         tcg_gen_add_i64,
3059         NULL,
3060     };
3061     static NeonGenTwo64OpFn * const accfn[] = {
3062         gen_helper_neon_addl_u16,
3063         gen_helper_neon_addl_u32,
3064         tcg_gen_add_i64,
3065         NULL,
3066     };
3067 
3068     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3069                              accfn[a->size]);
3070 }
3071 
3072 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3073 
3074 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
3075                        ZipFn *fn)
3076 {
3077     TCGv_ptr pd, pm;
3078 
3079     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3080         return false;
3081     }
3082 
3083     /* UNDEF accesses to D16-D31 if they don't exist. */
3084     if (!dc_isar_feature(aa32_simd_r32, s) &&
3085         ((a->vd | a->vm) & 0x10)) {
3086         return false;
3087     }
3088 
3089     if ((a->vd | a->vm) & a->q) {
3090         return false;
3091     }
3092 
3093     if (!fn) {
3094         /* Bad size or size/q combination */
3095         return false;
3096     }
3097 
3098     if (!vfp_access_check(s)) {
3099         return true;
3100     }
3101 
3102     pd = vfp_reg_ptr(true, a->vd);
3103     pm = vfp_reg_ptr(true, a->vm);
3104     fn(pd, pm);
3105     return true;
3106 }
3107 
3108 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3109 {
3110     static ZipFn * const fn[2][4] = {
3111         {
3112             gen_helper_neon_unzip8,
3113             gen_helper_neon_unzip16,
3114             NULL,
3115             NULL,
3116         }, {
3117             gen_helper_neon_qunzip8,
3118             gen_helper_neon_qunzip16,
3119             gen_helper_neon_qunzip32,
3120             NULL,
3121         }
3122     };
3123     return do_zip_uzp(s, a, fn[a->q][a->size]);
3124 }
3125 
3126 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3127 {
3128     static ZipFn * const fn[2][4] = {
3129         {
3130             gen_helper_neon_zip8,
3131             gen_helper_neon_zip16,
3132             NULL,
3133             NULL,
3134         }, {
3135             gen_helper_neon_qzip8,
3136             gen_helper_neon_qzip16,
3137             gen_helper_neon_qzip32,
3138             NULL,
3139         }
3140     };
3141     return do_zip_uzp(s, a, fn[a->q][a->size]);
3142 }
3143 
static bool do_vmovn(DisasContext *s, arg_2misc *a,
                     NeonGenNarrowEnvFn *narrowfn)
{
    TCGv_i64 rm;
    TCGv_i32 rd0, rd1;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vm & 1) {
        return false;
    }

    if (!narrowfn) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    rm = tcg_temp_new_i64();
    rd0 = tcg_temp_new_i32();
    rd1 = tcg_temp_new_i32();

    read_neon_element64(rm, a->vm, 0, MO_64);
    narrowfn(rd0, cpu_env, rm);
    read_neon_element64(rm, a->vm, 1, MO_64);
    narrowfn(rd1, cpu_env, rm);
    write_neon_element32(rd0, a->vd, 0, MO_32);
    write_neon_element32(rd1, a->vd, 1, MO_32);
    return true;
}

#define DO_VMOVN(INSN, FUNC)                                    \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        static NeonGenNarrowEnvFn * const narrowfn[] = {        \
            FUNC##8,                                            \
            FUNC##16,                                           \
            FUNC##32,                                           \
            NULL,                                               \
        };                                                      \
        return do_vmovn(s, a, narrowfn[a->size]);               \
    }

DO_VMOVN(VMOVN, gen_neon_narrow_u)
DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)

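/*
 * VSHLL (the 2-reg-misc form): widen each element of Dm to double
 * width, then shift it left by the original element size, writing
 * one 64-bit half of Qd at a time.
 */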
static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
{
    TCGv_i32 rm0, rm1;
    TCGv_i64 rd;
    static NeonGenWidenFn * const widenfns[] = {
        gen_helper_neon_widen_u8,
        gen_helper_neon_widen_u16,
        tcg_gen_extu_i32_i64,
        NULL,
    };
    NeonGenWidenFn *widenfn = widenfns[a->size];

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vd & 1) {
        return false;
    }

    if (!widenfn) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    rd = tcg_temp_new_i64();
    rm0 = tcg_temp_new_i32();
    rm1 = tcg_temp_new_i32();

    read_neon_element32(rm0, a->vm, 0, MO_32);
    read_neon_element32(rm1, a->vm, 1, MO_32);

    widenfn(rd, rm0);
    tcg_gen_shli_i64(rd, rd, 8 << a->size);
    write_neon_element64(rd, a->vd, 0, MO_64);
    widenfn(rd, rm1);
    tcg_gen_shli_i64(rd, rd, 8 << a->size);
    write_neon_element64(rd, a->vd, 1, MO_64);
    return true;
}

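/*
 * VCVT between f32 and bf16: narrow the four f32 elements of Qm to
 * bf16 in Dd, two elements per call of the bfcvt_pair helper.
 */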
static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a)
{
    TCGv_ptr fpst;
    TCGv_i64 tmp;
    TCGv_i32 dst0, dst1;

    if (!dc_isar_feature(aa32_bf16, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fpst = fpstatus_ptr(FPST_STD);
    tmp = tcg_temp_new_i64();
    dst0 = tcg_temp_new_i32();
    dst1 = tcg_temp_new_i32();

    read_neon_element64(tmp, a->vm, 0, MO_64);
    gen_helper_bfcvt_pair(dst0, tmp, fpst);

    read_neon_element64(tmp, a->vm, 1, MO_64);
    gen_helper_bfcvt_pair(dst1, tmp, fpst);

    write_neon_element32(dst0, a->vd, 0, MO_32);
    write_neon_element32(dst1, a->vd, 1, MO_32);
    return true;
}

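/*
 * VCVT from f32 to f16: narrow the four f32 elements of Qm to f16,
 * packed in pairs into Dd. All reads of Vm are done before the first
 * write to Vd, so the result is correct if Vd overlaps Vm.
 */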
static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
{
    TCGv_ptr fpst;
    TCGv_i32 ahp, tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
        !dc_isar_feature(aa32_fp16_spconv, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fpst = fpstatus_ptr(FPST_STD);
    ahp = get_ahp_flag();
    tmp = tcg_temp_new_i32();
    read_neon_element32(tmp, a->vm, 0, MO_32);
    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
    tmp2 = tcg_temp_new_i32();
    read_neon_element32(tmp2, a->vm, 1, MO_32);
    gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
    tcg_gen_shli_i32(tmp2, tmp2, 16);
    tcg_gen_or_i32(tmp2, tmp2, tmp);
    read_neon_element32(tmp, a->vm, 2, MO_32);
    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
    tmp3 = tcg_temp_new_i32();
    read_neon_element32(tmp3, a->vm, 3, MO_32);
    write_neon_element32(tmp2, a->vd, 0, MO_32);
    gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
    tcg_gen_shli_i32(tmp3, tmp3, 16);
    tcg_gen_or_i32(tmp3, tmp3, tmp);
    write_neon_element32(tmp3, a->vd, 1, MO_32);
    return true;
}

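/* VCVT from f16 to f32: widen the four f16 elements of Dm to f32 in Qd. */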
static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
{
    TCGv_ptr fpst;
    TCGv_i32 ahp, tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
        !dc_isar_feature(aa32_fp16_spconv, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vd & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fpst = fpstatus_ptr(FPST_STD);
    ahp = get_ahp_flag();
    tmp3 = tcg_temp_new_i32();
    tmp2 = tcg_temp_new_i32();
    tmp = tcg_temp_new_i32();
    read_neon_element32(tmp, a->vm, 0, MO_32);
    read_neon_element32(tmp2, a->vm, 1, MO_32);
    tcg_gen_ext16u_i32(tmp3, tmp);
    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
    write_neon_element32(tmp3, a->vd, 0, MO_32);
    tcg_gen_shri_i32(tmp, tmp, 16);
    gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
    write_neon_element32(tmp, a->vd, 1, MO_32);
    tcg_gen_ext16u_i32(tmp3, tmp2);
    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
    write_neon_element32(tmp3, a->vd, 2, MO_32);
    tcg_gen_shri_i32(tmp2, tmp2, 16);
    gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
    write_neon_element32(tmp2, a->vd, 3, MO_32);
    return true;
}

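/*
 * Handle a 2-reg-misc operation which can be emitted as a single
 * gvec-style operation on the whole vector.
 */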
static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
{
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_full_reg_offset(a->vd);
    int rm_ofs = neon_full_reg_offset(a->vm);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->size == 3) {
        return false;
    }

    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);

    return true;
}

#define DO_2MISC_VEC(INSN, FN)                                  \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        return do_2misc_vec(s, a, FN);                          \
    }

DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
DO_2MISC_VEC(VCLT0, gen_gvec_clt0)

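/* The VMVN encoding requires size == 0; other values are UNDEF. */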
static bool trans_VMVN(DisasContext *s, arg_2misc *a)
{
    if (a->size != 0) {
        return false;
    }
    return do_2misc_vec(s, a, tcg_gen_gvec_not);
}

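/*
 * The AES and SHA 2-reg-misc instructions are emitted as out-of-line
 * helper calls. The 3-operand wrapper passes rd_ofs twice because Vd
 * is both a source and the destination, and DATA distinguishes the
 * two variants which share a helper (AESE/AESD, AESMC/AESIMC).
 */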
#define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rm_ofs, uint32_t oprsz,               \
                         uint32_t maxsz)                                \
    {                                                                   \
        tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
                           DATA, FUNC);                                 \
    }

#define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rm_ofs, uint32_t oprsz,               \
                         uint32_t maxsz)                                \
    {                                                                   \
        tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
    }

WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)

#define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) {  \
            return false;                                       \
        }                                                       \
        return do_2misc_vec(s, a, gen_##INSN);                  \
    }

DO_2M_CRYPTO(AESE, aa32_aes, 0)
DO_2M_CRYPTO(AESD, aa32_aes, 0)
DO_2M_CRYPTO(AESMC, aa32_aes, 0)
DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)

static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
{
    TCGv_i32 tmp;
    int pass;

    /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (!fn) {
        return false;
    }

    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    tmp = tcg_temp_new_i32();
    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
        read_neon_element32(tmp, a->vm, pass, MO_32);
        fn(tmp, tmp);
        write_neon_element32(tmp, a->vd, pass, MO_32);
    }
    return true;
}

static bool trans_VREV32(DisasContext *s, arg_2misc *a)
{
    static NeonGenOneOpFn * const fn[] = {
        tcg_gen_bswap32_i32,
        gen_swap_half,
        NULL,
        NULL,
    };
    return do_2misc(s, a, fn[a->size]);
}

static bool trans_VREV16(DisasContext *s, arg_2misc *a)
{
    if (a->size != 0) {
        return false;
    }
    return do_2misc(s, a, gen_rev16);
}

static bool trans_VCLS(DisasContext *s, arg_2misc *a)
{
    static NeonGenOneOpFn * const fn[] = {
        gen_helper_neon_cls_s8,
        gen_helper_neon_cls_s16,
        gen_helper_neon_cls_s32,
        NULL,
    };
    return do_2misc(s, a, fn[a->size]);
}

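/*
 * Wrap tcg_gen_clzi_i32() to fit the NeonGenOneOpFn signature;
 * a zero input yields 32, as VCLZ requires.
 */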
static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
{
    tcg_gen_clzi_i32(rd, rm, 32);
}

static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
{
    static NeonGenOneOpFn * const fn[] = {
        gen_helper_neon_clz_u8,
        gen_helper_neon_clz_u16,
        do_VCLZ_32,
        NULL,
    };
    return do_2misc(s, a, fn[a->size]);
}

static bool trans_VCNT(DisasContext *s, arg_2misc *a)
{
    if (a->size != 0) {
        return false;
    }
    return do_2misc(s, a, gen_helper_neon_cnt_u8);
}

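/*
 * Float VABS and VNEG only touch the sign bit, so they can be
 * implemented as plain bitwise ops, with no fp status needed.
 */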
static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                       uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
                      vece == MO_16 ? 0x7fff : 0x7fffffff,
                      oprsz, maxsz);
}

static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
{
    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
    } else if (a->size != MO_32) {
        return false;
    }
    return do_2misc_vec(s, a, gen_VABS_F);
}

static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                       uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
                      vece == MO_16 ? 0x8000 : 0x80000000,
                      oprsz, maxsz);
}

static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
{
    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
    } else if (a->size != MO_32) {
        return false;
    }
    return do_2misc_vec(s, a, gen_VNEG_F);
}

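/* The integer VRECPE and VRSQRTE only exist for 32-bit elements. */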
static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
{
    if (a->size != 2) {
        return false;
    }
    return do_2misc(s, a, gen_helper_recpe_u32);
}

static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
{
    if (a->size != 2) {
        return false;
    }
    return do_2misc(s, a, gen_helper_rsqrte_u32);
}

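/*
 * The saturating qabs/qneg helpers take cpu_env so that they can
 * set QC; wrap them to fit the NeonGenOneOpFn signature.
 */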
#define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \
    static void WRAPNAME(TCGv_i32 d, TCGv_i32 m)        \
    {                                                   \
        FUNC(d, cpu_env, m);                            \
    }

WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)

static bool trans_VQABS(DisasContext *s, arg_2misc *a)
{
    static NeonGenOneOpFn * const fn[] = {
        gen_VQABS_s8,
        gen_VQABS_s16,
        gen_VQABS_s32,
        NULL,
    };
    return do_2misc(s, a, fn[a->size]);
}

static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
{
    static NeonGenOneOpFn * const fn[] = {
        gen_VQNEG_s8,
        gen_VQNEG_s16,
        gen_VQNEG_s32,
        NULL,
    };
    return do_2misc(s, a, fn[a->size]);
}

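/*
 * 2-reg-misc fp operations, emitted as gvec helper calls: vece
 * selects between the f16 and f32 helper, and the f16 forms require
 * the fp16 arithmetic extension.
 */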
#define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC)                             \
    static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
                           uint32_t rm_ofs,                             \
                           uint32_t oprsz, uint32_t maxsz)              \
    {                                                                   \
        static gen_helper_gvec_2_ptr * const fns[4] = {                 \
            NULL, HFUNC, SFUNC, NULL,                                   \
        };                                                              \
        TCGv_ptr fpst;                                                  \
        fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);   \
        tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0,       \
                           fns[vece]);                                  \
    }                                                                   \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
    {                                                                   \
        if (a->size == MO_16) {                                         \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
                return false;                                           \
            }                                                           \
        } else if (a->size != MO_32) {                                  \
            return false;                                               \
        }                                                               \
        return do_2misc_vec(s, a, gen_##INSN);                          \
    }

DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)

DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)

static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }
    return trans_VRINTX_impl(s, a);
}

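/*
 * fp-to-integer conversions and roundings with an explicit rounding
 * mode: the mode is encoded via arm_rmode_to_sf() and passed to the
 * helper as the gvec data argument.
 */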
#define DO_VEC_RMODE(INSN, RMODE, OP)                                   \
    static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
                           uint32_t rm_ofs,                             \
                           uint32_t oprsz, uint32_t maxsz)              \
    {                                                                   \
        static gen_helper_gvec_2_ptr * const fns[4] = {                 \
            NULL,                                                       \
            gen_helper_gvec_##OP##h,                                    \
            gen_helper_gvec_##OP##s,                                    \
            NULL,                                                       \
        };                                                              \
        TCGv_ptr fpst;                                                  \
        fpst = fpstatus_ptr(vece == 1 ? FPST_STD_F16 : FPST_STD);       \
        tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz,          \
                           arm_rmode_to_sf(RMODE), fns[vece]);          \
    }                                                                   \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
    {                                                                   \
        if (!arm_dc_feature(s, ARM_FEATURE_V8)) {                       \
            return false;                                               \
        }                                                               \
        if (a->size == MO_16) {                                         \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
                return false;                                           \
            }                                                           \
        } else if (a->size != MO_32) {                                  \
            return false;                                               \
        }                                                               \
        return do_2misc_vec(s, a, gen_##INSN);                          \
    }

DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)

DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)

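/* VSWP exchanges the contents of Vd and Vm, 64 bits at a time. */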
static bool trans_VSWP(DisasContext *s, arg_2misc *a)
{
    TCGv_i64 rm, rd;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->size != 0) {
        return false;
    }

    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    rm = tcg_temp_new_i64();
    rd = tcg_temp_new_i64();
    for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
        read_neon_element64(rm, a->vm, pass, MO_64);
        read_neon_element64(rd, a->vd, pass, MO_64);
        write_neon_element64(rm, a->vd, pass, MO_64);
        write_neon_element64(rd, a->vm, pass, MO_64);
    }
    return true;
}

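/*
 * VTRN helpers: transpose adjacent 8-bit or 16-bit elements between
 * the two inputs, returning the results in t0 and t1.
 */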
static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
{
    TCGv_i32 rd, tmp;

    rd = tcg_temp_new_i32();
    tmp = tcg_temp_new_i32();

    tcg_gen_shli_i32(rd, t0, 8);
    tcg_gen_andi_i32(rd, rd, 0xff00ff00);
    tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
    tcg_gen_or_i32(rd, rd, tmp);

    tcg_gen_shri_i32(t1, t1, 8);
    tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
    tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
    tcg_gen_or_i32(t1, t1, tmp);
    tcg_gen_mov_i32(t0, rd);
}

static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
{
    TCGv_i32 rd, tmp;

    rd = tcg_temp_new_i32();
    tmp = tcg_temp_new_i32();

    tcg_gen_shli_i32(rd, t0, 16);
    tcg_gen_andi_i32(tmp, t1, 0xffff);
    tcg_gen_or_i32(rd, rd, tmp);
    tcg_gen_shri_i32(t1, t1, 16);
    tcg_gen_andi_i32(tmp, t0, 0xffff0000);
    tcg_gen_or_i32(t1, t1, tmp);
    tcg_gen_mov_i32(t0, rd);
}

static bool trans_VTRN(DisasContext *s, arg_2misc *a)
{
    TCGv_i32 tmp, tmp2;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (a->size == 3) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    tmp = tcg_temp_new_i32();
    tmp2 = tcg_temp_new_i32();
    if (a->size == MO_32) {
        for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
            read_neon_element32(tmp, a->vm, pass, MO_32);
            read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
            write_neon_element32(tmp2, a->vm, pass, MO_32);
            write_neon_element32(tmp, a->vd, pass + 1, MO_32);
        }
    } else {
        for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
            read_neon_element32(tmp, a->vm, pass, MO_32);
            read_neon_element32(tmp2, a->vd, pass, MO_32);
            if (a->size == MO_8) {
                gen_neon_trn_u8(tmp, tmp2);
            } else {
                gen_neon_trn_u16(tmp, tmp2);
            }
            write_neon_element32(tmp2, a->vm, pass, MO_32);
            write_neon_element32(tmp, a->vd, pass, MO_32);
        }
    }
    return true;
}

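/*
 * The I8MM matrix multiply and BFloat16 instructions are gated on
 * their own ID-register feature bits rather than ARM_FEATURE_NEON.
 */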
static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_smmla_b);
}

static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_ummla_b);
}

static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_usmmla_b);
}

static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a)
{
    if (!dc_isar_feature(aa32_bf16, s)) {
        return false;
    }
    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_bfmmla);
}

static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a)
{
    if (!dc_isar_feature(aa32_bf16, s)) {
        return false;
    }
    return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD,
                             gen_helper_gvec_bfmlal);
}

static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a)
{
    if (!dc_isar_feature(aa32_bf16, s)) {
        return false;
    }
    return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm,
                             (a->index << 1) | a->q, FPST_STD,
                             gen_helper_gvec_bfmlal_idx);
}
3935