xref: /openbmc/qemu/tcg/aarch64/tcg-target.c.inc (revision fcc54e7bf56ba627f9b6ac4a32c6b446d2591ccf)
1/*
2 * Initial TCG Implementation for aarch64
3 *
4 * Copyright (c) 2013 Huawei Technologies Duesseldorf GmbH
5 * Written by Claudio Fontana
6 *
7 * This work is licensed under the terms of the GNU GPL, version 2 or
8 * (at your option) any later version.
9 *
10 * See the COPYING file in the top-level directory for details.
11 */
12
13#include "../tcg-ldst.c.inc"
14#include "../tcg-pool.c.inc"
15#include "qemu/bitops.h"
16
/*
 * TCGType values are re-used directly as the SF bit, which controls the
 * size of the operation performed (see the "ext << 31" terms in the
 * instruction emitters below).  Verify at build time that the enum values
 * match: I32 must be 0 and I64 must be 1.
 */
QEMU_BUILD_BUG_ON(TCG_TYPE_I32 != 0 || TCG_TYPE_I64 != 1);
21
22#ifdef CONFIG_DEBUG_TCG
23static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
24    "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
25    "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
26    "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
27    "x24", "x25", "x26", "x27", "x28", "fp", "x30", "sp",
28
29    "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
30    "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
31    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
32    "v24", "v25", "v26", "v27", "v28", "fp", "v30", "v31",
33};
34#endif /* CONFIG_DEBUG_TCG */
35
/*
 * Register allocation preference list.  The order of entries is
 * significant; registers named only in the comments below are
 * deliberately left out of the list.
 */
static const int tcg_target_reg_alloc_order[] = {
    TCG_REG_X20, TCG_REG_X21, TCG_REG_X22, TCG_REG_X23,
    TCG_REG_X24, TCG_REG_X25, TCG_REG_X26, TCG_REG_X27,
    TCG_REG_X28, /* we will reserve this for guest_base if configured */

    TCG_REG_X8, TCG_REG_X9, TCG_REG_X10, TCG_REG_X11,
    TCG_REG_X12, TCG_REG_X13, TCG_REG_X14, TCG_REG_X15,

    /* Call argument registers (see tcg_target_call_iarg_regs) come last. */
    TCG_REG_X0, TCG_REG_X1, TCG_REG_X2, TCG_REG_X3,
    TCG_REG_X4, TCG_REG_X5, TCG_REG_X6, TCG_REG_X7,

    /* X16 reserved as temporary */
    /* X17 reserved as temporary */
    /* X18 reserved by system */
    /* X19 reserved for AREG0 */
    /* X29 reserved as fp */
    /* X30 reserved as temporary */

    TCG_REG_V0, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3,
    TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7,
    /* V8 - V15 are call-saved, and skipped.  */
    TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19,
    TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23,
    TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27,
    TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31,
};
62
/* Registers used to pass integer call arguments, in argument order: X0-X7. */
static const int tcg_target_call_iarg_regs[8] = {
    TCG_REG_X0, TCG_REG_X1, TCG_REG_X2, TCG_REG_X3,
    TCG_REG_X4, TCG_REG_X5, TCG_REG_X6, TCG_REG_X7
};
67
68static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
69{
70    tcg_debug_assert(kind == TCG_CALL_RET_NORMAL);
71    tcg_debug_assert(slot >= 0 && slot <= 1);
72    return TCG_REG_X0 + slot;
73}
74
/*
 * Scratch registers for the backend.  X16, X17 and X30 are the general
 * registers noted as "reserved as temporary" in the allocation order
 * above; TCG_VEC_TMP0 names V31 as the vector scratch register.
 */
#define TCG_REG_TMP0 TCG_REG_X16
#define TCG_REG_TMP1 TCG_REG_X17
#define TCG_REG_TMP2 TCG_REG_X30
#define TCG_VEC_TMP0 TCG_REG_V31

/* Register holding guest_base, when that is configured (see X28 above). */
#define TCG_REG_GUEST_BASE TCG_REG_X28
81
82static bool reloc_pc26(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
83{
84    const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw);
85    ptrdiff_t offset = target - src_rx;
86
87    if (offset == sextract64(offset, 0, 26)) {
88        /* read instruction, mask away previous PC_REL26 parameter contents,
89           set the proper offset, then write back the instruction. */
90        *src_rw = deposit32(*src_rw, 0, 26, offset);
91        return true;
92    }
93    return false;
94}
95
96static bool reloc_pc19(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
97{
98    const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw);
99    ptrdiff_t offset = target - src_rx;
100
101    if (offset == sextract64(offset, 0, 19)) {
102        *src_rw = deposit32(*src_rw, 5, 19, offset);
103        return true;
104    }
105    return false;
106}
107
108static bool reloc_pc14(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
109{
110    const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw);
111    ptrdiff_t offset = target - src_rx;
112
113    if (offset == sextract64(offset, 0, 14)) {
114        *src_rw = deposit32(*src_rw, 5, 14, offset);
115        return true;
116    }
117    return false;
118}
119
120static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
121                        intptr_t value, intptr_t addend)
122{
123    tcg_debug_assert(addend == 0);
124    switch (type) {
125    case R_AARCH64_JUMP26:
126    case R_AARCH64_CALL26:
127        return reloc_pc26(code_ptr, (const tcg_insn_unit *)value);
128    case R_AARCH64_CONDBR19:
129        return reloc_pc19(code_ptr, (const tcg_insn_unit *)value);
130    case R_AARCH64_TSTBR14:
131        return reloc_pc14(code_ptr, (const tcg_insn_unit *)value);
132    default:
133        g_assert_not_reached();
134    }
135}
136
/*
 * Target-specific constant constraint flags, tested by
 * tcg_target_const_match():
 *   AIMM - value (or its negation) satisfies is_aimm()
 *   LIMM - value satisfies is_limm()
 *   ZERO - value is 0
 *   MONE - value is -1
 *   ORRI - value is a replicated 16/32-bit shifted immediate
 *   ANDI - as ORRI, but tested on the inverted value
 *   CMP  - becomes LIMM for test conditions, AIMM otherwise
 */
#define TCG_CT_CONST_AIMM 0x100
#define TCG_CT_CONST_LIMM 0x200
#define TCG_CT_CONST_ZERO 0x400
#define TCG_CT_CONST_MONE 0x800
#define TCG_CT_CONST_ORRI 0x1000
#define TCG_CT_CONST_ANDI 0x2000
#define TCG_CT_CONST_CMP  0x4000

/* Register-set masks: low 32 bits and high 32 bits of a 64-bit set. */
#define ALL_GENERAL_REGS  0xffffffffu
#define ALL_VECTOR_REGS   0xffffffff00000000ull
147
/*
 * Return true if VAL is usable as an add/subtract immediate: a 12-bit
 * value, either unshifted (bits [11:0]) or shifted left by 12
 * (bits [23:12]), with every other bit clear.
 */
static inline bool is_aimm(uint64_t val)
{
    const uint64_t unshifted = 0xfff;
    const uint64_t shifted = unshifted << 12;

    if ((val & ~unshifted) == 0) {
        return true;
    }
    return (val & ~shifted) == 0;
}
153
/*
 * Return true if VAL is accepted as a logical immediate.
 *
 * This is a simplified test that ignores the replication the hardware
 * encoding allows.  It accepts exactly the 64-bit patterns made of one
 * contiguous run of ones,
 *     0....01....1
 *     0..01..10..0
 * together with their bitwise inverses.  All-zeros and all-ones are
 * rejected.
 */
static inline bool is_limm(uint64_t val)
{
    /* Work on the form whose most significant bit is clear. */
    uint64_t bits = (val >> 63) ? ~val : val;
    uint64_t carried;

    if (!bits) {
        return false;
    }

    /*
     * Adding the lowest set bit carries through the bottom run of ones.
     * If that run was the only one, a single bit is left.
     */
    carried = bits + (bits & -bits);
    return (carried & (carried - 1)) == 0;
}
174
/*
 * Return true if V16 is a valid 16-bit shifted immediate, i.e. one of
 * its two bytes is zero.  On success store the encoding in *CMODE
 * (0x8 for the low byte, 0xa for the high byte) and the byte in *IMM8.
 * On failure the outputs are left untouched.
 */
static bool is_shimm16(uint16_t v16, int *cmode, int *imm8)
{
    if ((v16 & 0xff00) == 0) {
        /* Only the low byte is populated (this also covers zero). */
        *cmode = 0x8;
        *imm8 = v16;
        return true;
    }
    if ((v16 & 0x00ff) == 0) {
        /* Only the high byte is populated. */
        *cmode = 0xa;
        *imm8 = v16 >> 8;
        return true;
    }
    return false;
}
189
/*
 * Return true if V32 is a valid 32-bit shifted immediate, i.e. at most
 * one of its four bytes is non-zero.  On success store the encoding in
 * *CMODE (0x0, 0x2, 0x4 or 0x6 for byte 0..3) and the byte in *IMM8.
 * On failure the outputs are left untouched.
 */
static bool is_shimm32(uint32_t v32, int *cmode, int *imm8)
{
    int byte;

    /* Try each byte lane in turn, lowest first. */
    for (byte = 0; byte < 4; byte++) {
        unsigned shift = byte * 8;

        if ((v32 & ~(0xffu << shift)) == 0) {
            *cmode = byte * 2;
            *imm8 = v32 >> shift;
            return true;
        }
    }
    return false;
}
212
/*
 * Return true if V32 is a valid 32-bit "shifting ones" immediate:
 * either 0x0000XXff (cmode 0xc) or 0x00XXffff (cmode 0xd).
 * On success store the encoding in *CMODE and the XX byte in *IMM8.
 * On failure the outputs are left untouched.
 */
static bool is_soimm32(uint32_t v32, int *cmode, int *imm8)
{
    /* 0x0000XXff: upper half clear, low byte all ones. */
    if ((v32 >> 16) == 0 && (v32 & 0xff) == 0xff) {
        *cmode = 0xc;
        *imm8 = v32 >> 8;
        return true;
    }
    /* 0x00XXffff: top byte clear, low half all ones. */
    if ((v32 >> 24) == 0 && (v32 & 0xffff) == 0xffff) {
        *cmode = 0xd;
        *imm8 = v32 >> 16;
        return true;
    }
    return false;
}
227
/*
 * Return true if V32 is a valid float32 immediate: the low 19 bits are
 * zero and bits [30:25] are either 0x20 or 0x1f.
 * On success *CMODE is 0xf and *IMM8 is assembled from bit 31, bit 25
 * and bits [24:19] of V32.
 */
static bool is_fimm32(uint32_t v32, int *cmode, int *imm8)
{
    uint32_t exp_hi;

    if (extract32(v32, 0, 19) != 0) {
        return false;
    }
    exp_hi = extract32(v32, 25, 6);
    if (exp_hi != 0x20 && exp_hi != 0x1f) {
        return false;
    }

    *cmode = 0xf;
    *imm8 = (extract32(v32, 31, 1) << 7)
          | (extract32(v32, 25, 1) << 6)
          | extract32(v32, 19, 6);
    return true;
}
242
/*
 * Return true if V64 is a valid float64 immediate: the low 48 bits are
 * zero and bits [62:54] are either 0x100 or 0x0ff.
 * On success *CMODE is 0xf and *IMM8 is assembled from bit 63, bit 54
 * and bits [53:48] of V64.
 */
static bool is_fimm64(uint64_t v64, int *cmode, int *imm8)
{
    uint64_t exp_hi;

    if (extract64(v64, 0, 48) != 0) {
        return false;
    }
    exp_hi = extract64(v64, 54, 9);
    if (exp_hi != 0x100 && exp_hi != 0x0ff) {
        return false;
    }

    *cmode = 0xf;
    *imm8 = (extract64(v64, 63, 1) << 7)
          | (extract64(v64, 54, 1) << 6)
          | extract64(v64, 48, 6);
    return true;
}
257
/*
 * Return non-zero if v32 can be formed by MOVI+ORR.
 *
 * Each of the upper three bytes of v32 is masked out in turn (byte 3
 * first); if what remains is a shifted or shifting-ones immediate, the
 * MOVI parameters are stored in (cmode, imm8) and the function returns
 * 6, 4 or 2 for byte 3, 2 or 1 respectively.  That value is the cmode
 * for ORR; the ORR imm8 can be had via extraction from v32.
 * Returns 0 if no split works.
 */
static int is_shimm32_pair(uint32_t v32, int *cmode, int *imm8)
{
    int i = 6;

    while (i > 0) {
        /* Drop the one byte that ORR would supply. */
        uint32_t rest = v32 & ~(0xffu << (i * 4));

        if (is_shimm32(rest, cmode, imm8) ||
            is_soimm32(rest, cmode, imm8)) {
            return i;
        }
        i -= 2;
    }
    return i;
}
277
/*
 * Return true if V32 is a valid 16-bit or 32-bit shifted immediate.
 * When both 16-bit halves of V32 are equal the 16-bit test is applied
 * to that half; otherwise the 32-bit test is applied to the whole.
 * (cmode, imm8) are filled in by whichever test is used.
 */
static bool is_shimm1632(uint32_t v32, int *cmode, int *imm8)
{
    if (v32 != deposit32(v32, 16, 16, v32)) {
        return is_shimm32(v32, cmode, imm8);
    }
    return is_shimm16(v32, cmode, imm8);
}
287
288static bool tcg_target_const_match(int64_t val, int ct,
289                                   TCGType type, TCGCond cond, int vece)
290{
291    if (ct & TCG_CT_CONST) {
292        return 1;
293    }
294    if (type == TCG_TYPE_I32) {
295        val = (int32_t)val;
296    }
297
298    if (ct & TCG_CT_CONST_CMP) {
299        if (is_tst_cond(cond)) {
300            ct |= TCG_CT_CONST_LIMM;
301        } else {
302            ct |= TCG_CT_CONST_AIMM;
303        }
304    }
305
306    if ((ct & TCG_CT_CONST_AIMM) && (is_aimm(val) || is_aimm(-val))) {
307        return 1;
308    }
309    if ((ct & TCG_CT_CONST_LIMM) && is_limm(val)) {
310        return 1;
311    }
312    if ((ct & TCG_CT_CONST_ZERO) && val == 0) {
313        return 1;
314    }
315    if ((ct & TCG_CT_CONST_MONE) && val == -1) {
316        return 1;
317    }
318
319    switch (ct & (TCG_CT_CONST_ORRI | TCG_CT_CONST_ANDI)) {
320    case 0:
321        break;
322    case TCG_CT_CONST_ANDI:
323        val = ~val;
324        /* fallthru */
325    case TCG_CT_CONST_ORRI:
326        if (val == deposit64(val, 32, 32, val)) {
327            int cmode, imm8;
328            return is_shimm1632(val, &cmode, &imm8);
329        }
330        break;
331    default:
332        /* Both bits should not be set for the same insn.  */
333        g_assert_not_reached();
334    }
335
336    return 0;
337}
338
/*
 * AArch64 condition codes, as the 4-bit values placed in the condition
 * field of conditional instructions.
 */
enum aarch64_cond_code {
    COND_EQ = 0x0,
    COND_NE = 0x1,
    COND_CS = 0x2,     /* Unsigned greater or equal */
    COND_HS = COND_CS, /* ALIAS: unsigned higher or same */
    COND_CC = 0x3,     /* Unsigned less than */
    COND_LO = COND_CC, /* ALIAS: unsigned lower */
    COND_MI = 0x4,     /* Negative */
    COND_PL = 0x5,     /* Zero or greater */
    COND_VS = 0x6,     /* Overflow */
    COND_VC = 0x7,     /* No overflow */
    COND_HI = 0x8,     /* Unsigned greater than */
    COND_LS = 0x9,     /* Unsigned less or equal */
    COND_GE = 0xa,
    COND_LT = 0xb,
    COND_GT = 0xc,
    COND_LE = 0xd,
    COND_AL = 0xe,
    COND_NV = 0xf, /* behaves like COND_AL here */
};
359
/*
 * Map a TCGCond to the AArch64 condition code that tests it.
 * Conditions not listed here are left zero-initialised (== COND_EQ).
 */
static const enum aarch64_cond_code tcg_cond_to_aarch64[] = {
    [TCG_COND_EQ] = COND_EQ,
    [TCG_COND_NE] = COND_NE,
    [TCG_COND_LT] = COND_LT,
    [TCG_COND_GE] = COND_GE,
    [TCG_COND_LE] = COND_LE,
    [TCG_COND_GT] = COND_GT,
    /* unsigned */
    [TCG_COND_LTU] = COND_LO,
    [TCG_COND_GTU] = COND_HI,
    [TCG_COND_GEU] = COND_HS,
    [TCG_COND_LEU] = COND_LS,
    /* bit test: same codes as EQ/NE */
    [TCG_COND_TSTEQ] = COND_EQ,
    [TCG_COND_TSTNE] = COND_NE,
};
376
/*
 * Load/store kind.  The value is shifted to bit 22 when building the
 * I3312_* opcodes in AArch64Insn.
 */
typedef enum {
    LDST_ST = 0,    /* store */
    LDST_LD = 1,    /* load */
    LDST_LD_S_X = 2,  /* load and sign-extend into Xt */
    LDST_LD_S_W = 3,  /* load and sign-extend into Wt */
} AArch64LdstType;
383
/*
 * Base opcodes for every instruction this backend emits.
 *
 * We encode the format of the insn into the beginning of the name, so that
 * we can have the preprocessor help "typecheck" the insn vs the output
 * function (see tcg_out_insn).  Arm didn't provide us with nice names for
 * the formats, so we use the section number of the architecture reference
 * manual in which the instruction group is described: Innnn_OP belongs to
 * section n.n.nn and is emitted by tcg_out_insn_nnnn().
 *
 * Each value holds only the fixed bits; the emitters OR in the operands.
 */
typedef enum {
    /* Compare and branch (immediate).  */
    I3201_CBZ       = 0x34000000,
    I3201_CBNZ      = 0x35000000,

    /* Conditional branch (immediate).  */
    I3202_B_C       = 0x54000000,

    /* Test and branch (immediate).  */
    I3205_TBZ       = 0x36000000,
    I3205_TBNZ      = 0x37000000,

    /* Unconditional branch (immediate).  */
    I3206_B         = 0x14000000,
    I3206_BL        = 0x94000000,

    /* Unconditional branch (register).  */
    I3207_BR        = 0xd61f0000,
    I3207_BLR       = 0xd63f0000,
    I3207_RET       = 0xd65f0000,

    /* AdvSIMD load/store single structure.  */
    I3303_LD1R      = 0x0d40c000,

    /* Load literal for loading the address at pc-relative offset */
    I3305_LDR       = 0x58000000,
    I3305_LDR_v64   = 0x5c000000,
    I3305_LDR_v128  = 0x9c000000,

    /* Load/store exclusive. */
    I3306_LDXP      = 0xc8600000,
    I3306_STXP      = 0xc8200000,

    /* Load/store register.  Described here as 3.3.12, but the helper
       that emits them can transform to 3.3.10 or 3.3.13.  */
    I3312_STRB      = 0x38000000 | LDST_ST << 22 | MO_8 << 30,
    I3312_STRH      = 0x38000000 | LDST_ST << 22 | MO_16 << 30,
    I3312_STRW      = 0x38000000 | LDST_ST << 22 | MO_32 << 30,
    I3312_STRX      = 0x38000000 | LDST_ST << 22 | MO_64 << 30,

    I3312_LDRB      = 0x38000000 | LDST_LD << 22 | MO_8 << 30,
    I3312_LDRH      = 0x38000000 | LDST_LD << 22 | MO_16 << 30,
    I3312_LDRW      = 0x38000000 | LDST_LD << 22 | MO_32 << 30,
    I3312_LDRX      = 0x38000000 | LDST_LD << 22 | MO_64 << 30,

    I3312_LDRSBW    = 0x38000000 | LDST_LD_S_W << 22 | MO_8 << 30,
    I3312_LDRSHW    = 0x38000000 | LDST_LD_S_W << 22 | MO_16 << 30,

    I3312_LDRSBX    = 0x38000000 | LDST_LD_S_X << 22 | MO_8 << 30,
    I3312_LDRSHX    = 0x38000000 | LDST_LD_S_X << 22 | MO_16 << 30,
    I3312_LDRSWX    = 0x38000000 | LDST_LD_S_X << 22 | MO_32 << 30,

    /* Vector-register forms: 32-bit (S), 64-bit (D) and 128-bit (Q). */
    I3312_LDRVS     = 0x3c000000 | LDST_LD << 22 | MO_32 << 30,
    I3312_STRVS     = 0x3c000000 | LDST_ST << 22 | MO_32 << 30,

    I3312_LDRVD     = 0x3c000000 | LDST_LD << 22 | MO_64 << 30,
    I3312_STRVD     = 0x3c000000 | LDST_ST << 22 | MO_64 << 30,

    I3312_LDRVQ     = 0x3c000000 | 3 << 22 | 0 << 30,
    I3312_STRVQ     = 0x3c000000 | 2 << 22 | 0 << 30,

    /* Bits ORed into an I3312_* opcode to obtain the 3.3.10 / 3.3.13 form. */
    I3312_TO_I3310  = 0x00200800,
    I3312_TO_I3313  = 0x01000000,

    /* Load/store register pair instructions.  */
    I3314_LDP       = 0x28400000,
    I3314_STP       = 0x28000000,

    /* Add/subtract immediate instructions.  */
    I3401_ADDI      = 0x11000000,
    I3401_ADDSI     = 0x31000000,
    I3401_SUBI      = 0x51000000,
    I3401_SUBSI     = 0x71000000,

    /* Bitfield instructions.  */
    I3402_BFM       = 0x33000000,
    I3402_SBFM      = 0x13000000,
    I3402_UBFM      = 0x53000000,

    /* Extract instruction.  */
    I3403_EXTR      = 0x13800000,

    /* Logical immediate instructions.  */
    I3404_ANDI      = 0x12000000,
    I3404_ORRI      = 0x32000000,
    I3404_EORI      = 0x52000000,
    I3404_ANDSI     = 0x72000000,

    /* Move wide immediate instructions.  */
    I3405_MOVN      = 0x12800000,
    I3405_MOVZ      = 0x52800000,
    I3405_MOVK      = 0x72800000,

    /* PC relative addressing instructions.  */
    I3406_ADR       = 0x10000000,
    I3406_ADRP      = 0x90000000,

    /* Add/subtract extended register instructions. */
    I3501_ADD       = 0x0b200000,

    /* Add/subtract shifted register instructions (without a shift).  */
    I3502_ADD       = 0x0b000000,
    I3502_ADDS      = 0x2b000000,
    I3502_SUB       = 0x4b000000,
    I3502_SUBS      = 0x6b000000,

    /* Add/subtract shifted register instructions (with a shift).  */
    I3502S_ADD_LSL  = I3502_ADD,

    /* Add/subtract with carry instructions.  */
    I3503_ADC       = 0x1a000000,
    I3503_SBC       = 0x5a000000,

    /* Conditional select instructions.  */
    I3506_CSEL      = 0x1a800000,
    I3506_CSINC     = 0x1a800400,
    I3506_CSINV     = 0x5a800000,
    I3506_CSNEG     = 0x5a800400,

    /* Data-processing (1 source) instructions.  */
    I3507_CLZ       = 0x5ac01000,
    I3507_RBIT      = 0x5ac00000,
    I3507_REV       = 0x5ac00000, /* + size << 10 */

    /* Data-processing (2 source) instructions.  */
    I3508_LSLV      = 0x1ac02000,
    I3508_LSRV      = 0x1ac02400,
    I3508_ASRV      = 0x1ac02800,
    I3508_RORV      = 0x1ac02c00,
    I3508_SMULH     = 0x9b407c00,
    I3508_UMULH     = 0x9bc07c00,
    I3508_UDIV      = 0x1ac00800,
    I3508_SDIV      = 0x1ac00c00,

    /* Data-processing (3 source) instructions.  */
    I3509_MADD      = 0x1b000000,
    I3509_MSUB      = 0x1b008000,

    /* Logical shifted register instructions (without a shift).  */
    I3510_AND       = 0x0a000000,
    I3510_BIC       = 0x0a200000,
    I3510_ORR       = 0x2a000000,
    I3510_ORN       = 0x2a200000,
    I3510_EOR       = 0x4a000000,
    I3510_EON       = 0x4a200000,
    I3510_ANDS      = 0x6a000000,

    /* Logical shifted register instructions (with a shift).  */
    I3502S_AND_LSR  = I3510_AND | (1 << 22),

    /* AdvSIMD copy */
    I3605_DUP      = 0x0e000400,
    I3605_INS      = 0x4e001c00,
    I3605_UMOV     = 0x0e003c00,

    /* AdvSIMD modified immediate */
    I3606_MOVI      = 0x0f000400,
    I3606_MVNI      = 0x2f000400,
    I3606_BIC       = 0x2f001400,
    I3606_ORR       = 0x0f001400,

    /* AdvSIMD scalar shift by immediate */
    I3609_SSHR      = 0x5f000400,
    I3609_SSRA      = 0x5f001400,
    I3609_SHL       = 0x5f005400,
    I3609_USHR      = 0x7f000400,
    I3609_USRA      = 0x7f001400,
    I3609_SLI       = 0x7f005400,

    /* AdvSIMD scalar three same */
    I3611_SQADD     = 0x5e200c00,
    I3611_SQSUB     = 0x5e202c00,
    I3611_CMGT      = 0x5e203400,
    I3611_CMGE      = 0x5e203c00,
    I3611_SSHL      = 0x5e204400,
    I3611_ADD       = 0x5e208400,
    I3611_CMTST     = 0x5e208c00,
    I3611_UQADD     = 0x7e200c00,
    I3611_UQSUB     = 0x7e202c00,
    I3611_CMHI      = 0x7e203400,
    I3611_CMHS      = 0x7e203c00,
    I3611_USHL      = 0x7e204400,
    I3611_SUB       = 0x7e208400,
    I3611_CMEQ      = 0x7e208c00,

    /* AdvSIMD scalar two-reg misc */
    I3612_CMGT0     = 0x5e208800,
    I3612_CMEQ0     = 0x5e209800,
    I3612_CMLT0     = 0x5e20a800,
    I3612_ABS       = 0x5e20b800,
    I3612_CMGE0     = 0x7e208800,
    I3612_CMLE0     = 0x7e209800,
    I3612_NEG       = 0x7e20b800,

    /* AdvSIMD shift by immediate */
    I3614_SSHR      = 0x0f000400,
    I3614_SSRA      = 0x0f001400,
    I3614_SHL       = 0x0f005400,
    I3614_SLI       = 0x2f005400,
    I3614_USHR      = 0x2f000400,
    I3614_USRA      = 0x2f001400,

    /* AdvSIMD three same.  */
    I3616_ADD       = 0x0e208400,
    I3616_AND       = 0x0e201c00,
    I3616_BIC       = 0x0e601c00,
    I3616_BIF       = 0x2ee01c00,
    I3616_BIT       = 0x2ea01c00,
    I3616_BSL       = 0x2e601c00,
    I3616_EOR       = 0x2e201c00,
    I3616_MUL       = 0x0e209c00,
    I3616_ORR       = 0x0ea01c00,
    I3616_ORN       = 0x0ee01c00,
    I3616_SUB       = 0x2e208400,
    I3616_CMGT      = 0x0e203400,
    I3616_CMGE      = 0x0e203c00,
    I3616_CMTST     = 0x0e208c00,
    I3616_CMHI      = 0x2e203400,
    I3616_CMHS      = 0x2e203c00,
    I3616_CMEQ      = 0x2e208c00,
    I3616_SMAX      = 0x0e206400,
    I3616_SMIN      = 0x0e206c00,
    I3616_SSHL      = 0x0e204400,
    I3616_SQADD     = 0x0e200c00,
    I3616_SQSUB     = 0x0e202c00,
    I3616_UMAX      = 0x2e206400,
    I3616_UMIN      = 0x2e206c00,
    I3616_UQADD     = 0x2e200c00,
    I3616_UQSUB     = 0x2e202c00,
    I3616_USHL      = 0x2e204400,

    /* AdvSIMD two-reg misc.  */
    I3617_CMGT0     = 0x0e208800,
    I3617_CMEQ0     = 0x0e209800,
    I3617_CMLT0     = 0x0e20a800,
    I3617_CMGE0     = 0x2e208800,
    I3617_CMLE0     = 0x2e209800,
    I3617_NOT       = 0x2e205800,
    I3617_ABS       = 0x0e20b800,
    I3617_NEG       = 0x2e20b800,

    /* System instructions.  */
    NOP             = 0xd503201f,
    DMB_ISH         = 0xd50338bf,
    DMB_LD          = 0x00000100,
    DMB_ST          = 0x00000200,

    /* Branch target identification; emitted via tcg_out_bti(). */
    BTI_C           = 0xd503245f,
    BTI_J           = 0xd503249f,
    BTI_JC          = 0xd50324df,
} AArch64Insn;
640
641static inline uint32_t tcg_in32(TCGContext *s)
642{
643    uint32_t v = *(uint32_t *)s->code_ptr;
644    return v;
645}
646
/*
 * Emit an opcode with "type-checking" of the format.
 * tcg_out_insn(s, 3510, ORR, ...) expands to
 * tcg_out_insn_3510(s, I3510_ORR, ...), so naming an opcode that does
 * not exist for the given format fails to compile.
 */
#define tcg_out_insn(S, FMT, OP, ...) \
    glue(tcg_out_insn_,FMT)(S, glue(glue(glue(I,FMT),_),OP), ## __VA_ARGS__)
650
651static void tcg_out_insn_3303(TCGContext *s, AArch64Insn insn, bool q,
652                              TCGReg rt, TCGReg rn, unsigned size)
653{
654    tcg_out32(s, insn | (rt & 0x1f) | (rn << 5) | (size << 10) | (q << 30));
655}
656
657static void tcg_out_insn_3305(TCGContext *s, AArch64Insn insn,
658                              int imm19, TCGReg rt)
659{
660    tcg_out32(s, insn | (imm19 & 0x7ffff) << 5 | rt);
661}
662
663static void tcg_out_insn_3306(TCGContext *s, AArch64Insn insn, TCGReg rs,
664                              TCGReg rt, TCGReg rt2, TCGReg rn)
665{
666    tcg_out32(s, insn | rs << 16 | rt2 << 10 | rn << 5 | rt);
667}
668
669static void tcg_out_insn_3201(TCGContext *s, AArch64Insn insn, TCGType ext,
670                              TCGReg rt, int imm19)
671{
672    tcg_out32(s, insn | ext << 31 | (imm19 & 0x7ffff) << 5 | rt);
673}
674
675static void tcg_out_insn_3202(TCGContext *s, AArch64Insn insn,
676                              TCGCond c, int imm19)
677{
678    tcg_out32(s, insn | tcg_cond_to_aarch64[c] | (imm19 & 0x7ffff) << 5);
679}
680
681static void tcg_out_insn_3205(TCGContext *s, AArch64Insn insn,
682                              TCGReg rt, int imm6, int imm14)
683{
684    insn |= (imm6 & 0x20) << (31 - 5);
685    insn |= (imm6 & 0x1f) << 19;
686    tcg_out32(s, insn | (imm14 & 0x3fff) << 5 | rt);
687}
688
689static void tcg_out_insn_3206(TCGContext *s, AArch64Insn insn, int imm26)
690{
691    tcg_out32(s, insn | (imm26 & 0x03ffffff));
692}
693
694static void tcg_out_insn_3207(TCGContext *s, AArch64Insn insn, TCGReg rn)
695{
696    tcg_out32(s, insn | rn << 5);
697}
698
699static void tcg_out_insn_3314(TCGContext *s, AArch64Insn insn,
700                              TCGReg r1, TCGReg r2, TCGReg rn,
701                              tcg_target_long ofs, bool pre, bool w)
702{
703    insn |= 1u << 31; /* ext */
704    insn |= pre << 24;
705    insn |= w << 23;
706
707    tcg_debug_assert(ofs >= -0x200 && ofs < 0x200 && (ofs & 7) == 0);
708    insn |= (ofs & (0x7f << 3)) << (15 - 3);
709
710    tcg_out32(s, insn | r2 << 10 | rn << 5 | r1);
711}
712
713static void tcg_out_insn_3401(TCGContext *s, AArch64Insn insn, TCGType ext,
714                              TCGReg rd, TCGReg rn, uint64_t aimm)
715{
716    if (aimm > 0xfff) {
717        tcg_debug_assert((aimm & 0xfff) == 0);
718        aimm >>= 12;
719        tcg_debug_assert(aimm <= 0xfff);
720        aimm |= 1 << 12;  /* apply LSL 12 */
721    }
722    tcg_out32(s, insn | ext << 31 | aimm << 10 | rn << 5 | rd);
723}
724
725/* This function can be used for both 3.4.2 (Bitfield) and 3.4.4
726   (Logical immediate).  Both insn groups have N, IMMR and IMMS fields
727   that feed the DecodeBitMasks pseudo function.  */
728static void tcg_out_insn_3402(TCGContext *s, AArch64Insn insn, TCGType ext,
729                              TCGReg rd, TCGReg rn, int n, int immr, int imms)
730{
731    tcg_out32(s, insn | ext << 31 | n << 22 | immr << 16 | imms << 10
732              | rn << 5 | rd);
733}
734
735#define tcg_out_insn_3404  tcg_out_insn_3402
736
737static void tcg_out_insn_3403(TCGContext *s, AArch64Insn insn, TCGType ext,
738                              TCGReg rd, TCGReg rn, TCGReg rm, int imms)
739{
740    tcg_out32(s, insn | ext << 31 | ext << 22 | rm << 16 | imms << 10
741              | rn << 5 | rd);
742}
743
744/* This function is used for the Move (wide immediate) instruction group.
745   Note that SHIFT is a full shift count, not the 2 bit HW field. */
746static void tcg_out_insn_3405(TCGContext *s, AArch64Insn insn, TCGType ext,
747                              TCGReg rd, uint16_t half, unsigned shift)
748{
749    tcg_debug_assert((shift & ~0x30) == 0);
750    tcg_out32(s, insn | ext << 31 | shift << (21 - 4) | half << 5 | rd);
751}
752
753static void tcg_out_insn_3406(TCGContext *s, AArch64Insn insn,
754                              TCGReg rd, int64_t disp)
755{
756    tcg_out32(s, insn | (disp & 3) << 29 | (disp & 0x1ffffc) << (5 - 2) | rd);
757}
758
759static inline void tcg_out_insn_3501(TCGContext *s, AArch64Insn insn,
760                                     TCGType sf, TCGReg rd, TCGReg rn,
761                                     TCGReg rm, int opt, int imm3)
762{
763    tcg_out32(s, insn | sf << 31 | rm << 16 | opt << 13 |
764              imm3 << 10 | rn << 5 | rd);
765}
766
767/* This function is for both 3.5.2 (Add/Subtract shifted register), for
768   the rare occasion when we actually want to supply a shift amount.  */
769static inline void tcg_out_insn_3502S(TCGContext *s, AArch64Insn insn,
770                                      TCGType ext, TCGReg rd, TCGReg rn,
771                                      TCGReg rm, int imm6)
772{
773    tcg_out32(s, insn | ext << 31 | rm << 16 | imm6 << 10 | rn << 5 | rd);
774}
775
776/* This function is for 3.5.2 (Add/subtract shifted register),
777   and 3.5.10 (Logical shifted register), for the vast majorty of cases
778   when we don't want to apply a shift.  Thus it can also be used for
779   3.5.3 (Add/subtract with carry) and 3.5.8 (Data processing 2 source).  */
780static void tcg_out_insn_3502(TCGContext *s, AArch64Insn insn, TCGType ext,
781                              TCGReg rd, TCGReg rn, TCGReg rm)
782{
783    tcg_out32(s, insn | ext << 31 | rm << 16 | rn << 5 | rd);
784}
785
786#define tcg_out_insn_3503  tcg_out_insn_3502
787#define tcg_out_insn_3508  tcg_out_insn_3502
788#define tcg_out_insn_3510  tcg_out_insn_3502
789
790static void tcg_out_insn_3506(TCGContext *s, AArch64Insn insn, TCGType ext,
791                              TCGReg rd, TCGReg rn, TCGReg rm, TCGCond c)
792{
793    tcg_out32(s, insn | ext << 31 | rm << 16 | rn << 5 | rd
794              | tcg_cond_to_aarch64[c] << 12);
795}
796
797static void tcg_out_insn_3507(TCGContext *s, AArch64Insn insn, TCGType ext,
798                              TCGReg rd, TCGReg rn)
799{
800    tcg_out32(s, insn | ext << 31 | rn << 5 | rd);
801}
802
803static void tcg_out_insn_3509(TCGContext *s, AArch64Insn insn, TCGType ext,
804                              TCGReg rd, TCGReg rn, TCGReg rm, TCGReg ra)
805{
806    tcg_out32(s, insn | ext << 31 | rm << 16 | ra << 10 | rn << 5 | rd);
807}
808
809static void tcg_out_insn_3605(TCGContext *s, AArch64Insn insn, bool q,
810                              TCGReg rd, TCGReg rn, int dst_idx, int src_idx)
811{
812    /* Note that bit 11 set means general register input.  Therefore
813       we can handle both register sets with one function.  */
814    tcg_out32(s, insn | q << 30 | (dst_idx << 16) | (src_idx << 11)
815              | (rd & 0x1f) | (~rn & 0x20) << 6 | (rn & 0x1f) << 5);
816}
817
818static void tcg_out_insn_3606(TCGContext *s, AArch64Insn insn, bool q,
819                              TCGReg rd, bool op, int cmode, uint8_t imm8)
820{
821    tcg_out32(s, insn | q << 30 | op << 29 | cmode << 12 | (rd & 0x1f)
822              | (imm8 & 0xe0) << (16 - 5) | (imm8 & 0x1f) << 5);
823}
824
825static void tcg_out_insn_3609(TCGContext *s, AArch64Insn insn,
826                              TCGReg rd, TCGReg rn, unsigned immhb)
827{
828    tcg_out32(s, insn | immhb << 16 | (rn & 0x1f) << 5 | (rd & 0x1f));
829}
830
831static void tcg_out_insn_3611(TCGContext *s, AArch64Insn insn,
832                              unsigned size, TCGReg rd, TCGReg rn, TCGReg rm)
833{
834    tcg_out32(s, insn | (size << 22) | (rm & 0x1f) << 16
835              | (rn & 0x1f) << 5 | (rd & 0x1f));
836}
837
838static void tcg_out_insn_3612(TCGContext *s, AArch64Insn insn,
839                              unsigned size, TCGReg rd, TCGReg rn)
840{
841    tcg_out32(s, insn | (size << 22) | (rn & 0x1f) << 5 | (rd & 0x1f));
842}
843
844static void tcg_out_insn_3614(TCGContext *s, AArch64Insn insn, bool q,
845                              TCGReg rd, TCGReg rn, unsigned immhb)
846{
847    tcg_out32(s, insn | q << 30 | immhb << 16
848              | (rn & 0x1f) << 5 | (rd & 0x1f));
849}
850
851static void tcg_out_insn_3616(TCGContext *s, AArch64Insn insn, bool q,
852                              unsigned size, TCGReg rd, TCGReg rn, TCGReg rm)
853{
854    tcg_out32(s, insn | q << 30 | (size << 22) | (rm & 0x1f) << 16
855              | (rn & 0x1f) << 5 | (rd & 0x1f));
856}
857
858static void tcg_out_insn_3617(TCGContext *s, AArch64Insn insn, bool q,
859                              unsigned size, TCGReg rd, TCGReg rn)
860{
861    tcg_out32(s, insn | q << 30 | (size << 22)
862              | (rn & 0x1f) << 5 | (rd & 0x1f));
863}
864
865static void tcg_out_insn_3310(TCGContext *s, AArch64Insn insn,
866                              TCGReg rd, TCGReg base, TCGType ext,
867                              TCGReg regoff)
868{
869    /* Note the AArch64Insn constants above are for C3.3.12.  Adjust.  */
870    tcg_out32(s, insn | I3312_TO_I3310 | regoff << 16 |
871              0x4000 | ext << 13 | base << 5 | (rd & 0x1f));
872}
873
874static void tcg_out_insn_3312(TCGContext *s, AArch64Insn insn,
875                              TCGReg rd, TCGReg rn, intptr_t offset)
876{
877    tcg_out32(s, insn | (offset & 0x1ff) << 12 | rn << 5 | (rd & 0x1f));
878}
879
880static void tcg_out_insn_3313(TCGContext *s, AArch64Insn insn,
881                              TCGReg rd, TCGReg rn, uintptr_t scaled_uimm)
882{
883    /* Note the AArch64Insn constants above are for C3.3.12.  Adjust.  */
884    tcg_out32(s, insn | I3312_TO_I3313 | scaled_uimm << 10
885              | rn << 5 | (rd & 0x1f));
886}
887
888static void tcg_out_bti(TCGContext *s, AArch64Insn insn)
889{
890    /*
891     * While BTI insns are nops on hosts without FEAT_BTI,
892     * there is no point in emitting them in that case either.
893     */
894    if (cpuinfo & CPUINFO_BTI) {
895        tcg_out32(s, insn);
896    }
897}
898
/*
 * Register to register move using ORR (shifted register with no shift):
 * emits "ORR rd, xzr, rm" at the width selected by EXT.
 */
static void tcg_out_movr(TCGContext *s, TCGType ext, TCGReg rd, TCGReg rm)
{
    tcg_out_insn(s, 3510, ORR, ext, rd, TCG_REG_XZR, rm);
}
904
/*
 * Register to register move using ADDI (move to/from SP):
 * emits "ADD rd, rn, #0" at the width selected by EXT.
 */
static void tcg_out_movr_sp(TCGContext *s, TCGType ext, TCGReg rd, TCGReg rn)
{
    tcg_out_insn(s, 3401, ADDI, ext, rd, rn, 0);
}
910
911/* This function is used for the Logical (immediate) instruction group.
912   The value of LIMM must satisfy IS_LIMM.  See the comment above about
913   only supporting simplified logical immediates.  */
914static void tcg_out_logicali(TCGContext *s, AArch64Insn insn, TCGType ext,
915                             TCGReg rd, TCGReg rn, uint64_t limm)
916{
917    unsigned h, l, r, c;
918
919    tcg_debug_assert(is_limm(limm));
920
921    h = clz64(limm);
922    l = ctz64(limm);
923    if (l == 0) {
924        r = 0;                  /* form 0....01....1 */
925        c = ctz64(~limm) - 1;
926        if (h == 0) {
927            r = clz64(~limm);   /* form 1..10..01..1 */
928            c += r;
929        }
930    } else {
931        r = 64 - l;             /* form 1....10....0 or 0..01..10..0 */
932        c = r - h - 1;
933    }
934    if (ext == TCG_TYPE_I32) {
935        r &= 31;
936        c &= 31;
937    }
938
939    tcg_out_insn_3404(s, insn, ext, rd, rn, ext, r, c);
940}
941
/*
 * Load the constant V64, replicated with element size VECE, into the
 * vector register RD.  TYPE selects the Q bit (set for TCG_TYPE_V128).
 *
 * Encodings are tried from cheapest to most expensive: a single
 * MOVI/MVNI, then a two-instruction MOVI+ORR or MVNI+BIC pair, and
 * finally a load from the constant pool.
 */
static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg rd, int64_t v64)
{
    bool q = type == TCG_TYPE_V128;
    int cmode, imm8, i;

    /* Test all bytes equal first.  */
    if (vece == MO_8) {
        imm8 = (uint8_t)v64;
        tcg_out_insn(s, 3606, MOVI, q, rd, 0, 0xe, imm8);
        return;
    }

    /*
     * Test all bytes 0x00 or 0xff second.  This can match cases that
     * might otherwise take 2 or 3 insns for MO_16 or MO_32 below.
     * Bit I of imm8 is set when byte I of the constant is 0xff.
     */
    for (i = imm8 = 0; i < 8; i++) {
        uint8_t byte = v64 >> (i * 8);
        if (byte == 0xff) {
            imm8 |= 1 << i;
        } else if (byte != 0) {
            goto fail_bytes;
        }
    }
    tcg_out_insn(s, 3606, MOVI, q, rd, 1, 0xe, imm8);
    return;
 fail_bytes:

    /*
     * Tests for various replications.  For each element width, if we
     * cannot find an expansion there's no point checking a larger
     * width because we already know by replication it cannot match.
     */
    if (vece == MO_16) {
        uint16_t v16 = v64;

        if (is_shimm16(v16, &cmode, &imm8)) {
            tcg_out_insn(s, 3606, MOVI, q, rd, 0, cmode, imm8);
            return;
        }
        /* The complement may be encodable even when the value is not. */
        if (is_shimm16(~v16, &cmode, &imm8)) {
            tcg_out_insn(s, 3606, MVNI, q, rd, 0, cmode, imm8);
            return;
        }

        /*
         * Otherwise, all remaining constants can be loaded in two insns:
         * rd = v16 & 0xff, rd |= v16 & 0xff00.
         */
        tcg_out_insn(s, 3606, MOVI, q, rd, 0, 0x8, v16 & 0xff);
        tcg_out_insn(s, 3606, ORR, q, rd, 0, 0xa, v16 >> 8);
        return;
    } else if (vece == MO_32) {
        uint32_t v32 = v64;
        uint32_t n32 = ~v32;

        if (is_shimm32(v32, &cmode, &imm8) ||
            is_soimm32(v32, &cmode, &imm8) ||
            is_fimm32(v32, &cmode, &imm8)) {
            tcg_out_insn(s, 3606, MOVI, q, rd, 0, cmode, imm8);
            return;
        }
        if (is_shimm32(n32, &cmode, &imm8) ||
            is_soimm32(n32, &cmode, &imm8)) {
            tcg_out_insn(s, 3606, MVNI, q, rd, 0, cmode, imm8);
            return;
        }

        /*
         * Restrict the set of constants to those we can load with
         * two instructions.  Others we load from the pool.
         * A non-zero result from is_shimm32_pair is used both as the
         * cmode of the second insn and, times 4, as the bit position
         * of the byte that second insn supplies.
         */
        i = is_shimm32_pair(v32, &cmode, &imm8);
        if (i) {
            tcg_out_insn(s, 3606, MOVI, q, rd, 0, cmode, imm8);
            tcg_out_insn(s, 3606, ORR, q, rd, 0, i, extract32(v32, i * 4, 8));
            return;
        }
        /* Same two-insn scheme on the complement, using MVNI + BIC. */
        i = is_shimm32_pair(n32, &cmode, &imm8);
        if (i) {
            tcg_out_insn(s, 3606, MVNI, q, rd, 0, cmode, imm8);
            tcg_out_insn(s, 3606, BIC, q, rd, 0, i, extract32(n32, i * 4, 8));
            return;
        }
    } else if (is_fimm64(v64, &cmode, &imm8)) {
        tcg_out_insn(s, 3606, MOVI, q, rd, 1, cmode, imm8);
        return;
    }

    /*
     * As a last resort, load from the constant pool.  Sadly there
     * is no LD1R (literal), so store the full 16-byte vector.
     * The pool entry is registered against the current code pointer
     * and the load is emitted with a zero displacement placeholder.
     */
    if (type == TCG_TYPE_V128) {
        new_pool_l2(s, R_AARCH64_CONDBR19, s->code_ptr, 0, v64, v64);
        tcg_out_insn(s, 3305, LDR_v128, 0, rd);
    } else {
        new_pool_label(s, v64, R_AARCH64_CONDBR19, s->code_ptr, 0);
        tcg_out_insn(s, 3305, LDR_v64, 0, rd);
    }
}
1044
1045static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
1046                            TCGReg rd, TCGReg rs)
1047{
1048    int is_q = type - TCG_TYPE_V64;
1049    tcg_out_insn(s, 3605, DUP, is_q, rd, rs, 1 << vece, 0);
1050    return true;
1051}
1052
1053static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
1054                             TCGReg r, TCGReg base, intptr_t offset)
1055{
1056    TCGReg temp = TCG_REG_TMP0;
1057
1058    if (offset < -0xffffff || offset > 0xffffff) {
1059        tcg_out_movi(s, TCG_TYPE_PTR, temp, offset);
1060        tcg_out_insn(s, 3502, ADD, 1, temp, temp, base);
1061        base = temp;
1062    } else {
1063        AArch64Insn add_insn = I3401_ADDI;
1064
1065        if (offset < 0) {
1066            add_insn = I3401_SUBI;
1067            offset = -offset;
1068        }
1069        if (offset & 0xfff000) {
1070            tcg_out_insn_3401(s, add_insn, 1, temp, base, offset & 0xfff000);
1071            base = temp;
1072        }
1073        if (offset & 0xfff) {
1074            tcg_out_insn_3401(s, add_insn, 1, temp, base, offset & 0xfff);
1075            base = temp;
1076        }
1077    }
1078    tcg_out_insn(s, 3303, LD1R, type == TCG_TYPE_V128, r, base, vece);
1079    return true;
1080}
1081
/*
 * Load the integer constant VALUE into the general register RD.
 * Only TCG_TYPE_I32 and TCG_TYPE_I64 are accepted.
 *
 * Strategies are tried in this order:
 *   1. a single MOVZ or MOVN;
 *   2. a logical immediate via ORR with XZR;
 *   3. for 64-bit values, a pc-relative ADR, or ADRP plus optional ADD;
 *   4. MOVZ/MOVN followed by at most one MOVK;
 *   5. a load from the constant pool.
 */
static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
                         tcg_target_long value)
{
    tcg_target_long svalue = value;
    tcg_target_long ivalue = ~value;
    tcg_target_long t0, t1, t2;
    int s0, s1;
    AArch64Insn opc;

    switch (type) {
    case TCG_TYPE_I32:
    case TCG_TYPE_I64:
        tcg_debug_assert(rd < 32);
        break;
    default:
        g_assert_not_reached();
    }

    /* For 32-bit values, discard potential garbage in value.  For 64-bit
       values within [2**31, 2**32-1], we can create smaller sequences by
       interpreting this as a negative 32-bit number, while ensuring that
       the high 32 bits are cleared by setting SF=0.  */
    if (type == TCG_TYPE_I32 || (value & ~0xffffffffull) == 0) {
        svalue = (int32_t)value;
        value = (uint32_t)value;
        ivalue = (uint32_t)ivalue;
        type = TCG_TYPE_I32;
    }

    /* Speed things up by handling the common case of small positive
       and negative values specially.  */
    if ((value & ~0xffffull) == 0) {
        tcg_out_insn(s, 3405, MOVZ, type, rd, value, 0);
        return;
    } else if ((ivalue & ~0xffffull) == 0) {
        tcg_out_insn(s, 3405, MOVN, type, rd, ivalue, 0);
        return;
    }

    /* Check for bitfield immediates.  For the benefit of 32-bit quantities,
       use the sign-extended value.  That lets us match rotated values such
       as 0xff0000ff with the same 64-bit logic matching 0xffffffffff0000ff. */
    if (is_limm(svalue)) {
        tcg_out_logicali(s, I3404_ORRI, type, rd, TCG_REG_XZR, svalue);
        return;
    }

    /* Look for host pointer values within 4G of the PC.  This happens
       often when loading pointers to QEMU's own data structures.  */
    if (type == TCG_TYPE_I64) {
        intptr_t src_rx = (intptr_t)tcg_splitwx_to_rx(s->code_ptr);
        tcg_target_long disp = value - src_rx;
        /* Byte displacement that fits in a signed 21-bit field: ADR. */
        if (disp == sextract64(disp, 0, 21)) {
            tcg_out_insn(s, 3406, ADR, rd, disp);
            return;
        }
        /* Otherwise try a signed 21-bit displacement in 4K pages: ADRP,
           followed by an ADD of the low 12 bits when they are non-zero. */
        disp = (value >> 12) - (src_rx >> 12);
        if (disp == sextract64(disp, 0, 21)) {
            tcg_out_insn(s, 3406, ADRP, rd, disp);
            if (value & 0xfff) {
                tcg_out_insn(s, 3401, ADDI, type, rd, rd, value & 0xfff);
            }
            return;
        }
    }

    /* Would it take fewer insns to begin with MOVN?  */
    if (ctpop64(value) >= 32) {
        t0 = ivalue;
        opc = I3405_MOVN;
    } else {
        t0 = value;
        opc = I3405_MOVZ;
    }
    /* s0 and s1 are the bit positions (multiples of 16) of the lowest
       and next-lowest non-zero halfwords of t0; t1 and t2 are what is
       left of t0 after clearing each of those halfwords in turn.  */
    s0 = ctz64(t0) & (63 & -16);
    t1 = t0 & ~(0xffffull << s0);
    s1 = ctz64(t1) & (63 & -16);
    t2 = t1 & ~(0xffffull << s1);
    /* t2 == 0 means at most two halfwords differ from the MOVZ/MOVN
       background, so MOVZ/MOVN plus an optional MOVK suffices.  */
    if (t2 == 0) {
        tcg_out_insn_3405(s, opc, type, rd, t0 >> s0, s0);
        if (t1 != 0) {
            tcg_out_insn(s, 3405, MOVK, type, rd, value >> s1, s1);
        }
        return;
    }

    /* For more than 2 insns, dump it into the constant pool.  */
    new_pool_label(s, value, R_AARCH64_CONDBR19, s->code_ptr, 0);
    tcg_out_insn(s, 3305, LDR, 0, rd);
}
1172
/*
 * Register exchange hook.  This backend emits nothing and always
 * returns false.
 */
static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
{
    return false;
}
1177
/*
 * Pointer-plus-immediate hook.  It is not expected to be called for
 * this backend, so reaching it is treated as a fatal error.
 */
static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
                             tcg_target_long imm)
{
    /* This function is only used for passing structs by reference. */
    g_assert_not_reached();
}
1184
1185/* Define something more legible for general use.  */
1186#define tcg_out_ldst_r  tcg_out_insn_3310
1187
1188static void tcg_out_ldst(TCGContext *s, AArch64Insn insn, TCGReg rd,
1189                         TCGReg rn, intptr_t offset, int lgsize)
1190{
1191    /* If the offset is naturally aligned and in range, then we can
1192       use the scaled uimm12 encoding */
1193    if (offset >= 0 && !(offset & ((1 << lgsize) - 1))) {
1194        uintptr_t scaled_uimm = offset >> lgsize;
1195        if (scaled_uimm <= 0xfff) {
1196            tcg_out_insn_3313(s, insn, rd, rn, scaled_uimm);
1197            return;
1198        }
1199    }
1200
1201    /* Small signed offsets can use the unscaled encoding.  */
1202    if (offset >= -256 && offset < 256) {
1203        tcg_out_insn_3312(s, insn, rd, rn, offset);
1204        return;
1205    }
1206
1207    /* Worst-case scenario, move offset to temp register, use reg offset.  */
1208    tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, offset);
1209    tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP0);
1210}
1211
1212static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
1213{
1214    if (ret == arg) {
1215        return true;
1216    }
1217    switch (type) {
1218    case TCG_TYPE_I32:
1219    case TCG_TYPE_I64:
1220        if (ret < 32 && arg < 32) {
1221            tcg_out_movr(s, type, ret, arg);
1222            break;
1223        } else if (ret < 32) {
1224            tcg_out_insn(s, 3605, UMOV, type, ret, arg, 0, 0);
1225            break;
1226        } else if (arg < 32) {
1227            tcg_out_insn(s, 3605, INS, 0, ret, arg, 4 << type, 0);
1228            break;
1229        }
1230        /* FALLTHRU */
1231
1232    case TCG_TYPE_V64:
1233        tcg_debug_assert(ret >= 32 && arg >= 32);
1234        tcg_out_insn(s, 3616, ORR, 0, 0, ret, arg, arg);
1235        break;
1236    case TCG_TYPE_V128:
1237        tcg_debug_assert(ret >= 32 && arg >= 32);
1238        tcg_out_insn(s, 3616, ORR, 1, 0, ret, arg, arg);
1239        break;
1240
1241    default:
1242        g_assert_not_reached();
1243    }
1244    return true;
1245}
1246
1247static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1248                       TCGReg base, intptr_t ofs)
1249{
1250    AArch64Insn insn;
1251    int lgsz;
1252
1253    switch (type) {
1254    case TCG_TYPE_I32:
1255        insn = (ret < 32 ? I3312_LDRW : I3312_LDRVS);
1256        lgsz = 2;
1257        break;
1258    case TCG_TYPE_I64:
1259        insn = (ret < 32 ? I3312_LDRX : I3312_LDRVD);
1260        lgsz = 3;
1261        break;
1262    case TCG_TYPE_V64:
1263        insn = I3312_LDRVD;
1264        lgsz = 3;
1265        break;
1266    case TCG_TYPE_V128:
1267        insn = I3312_LDRVQ;
1268        lgsz = 4;
1269        break;
1270    default:
1271        g_assert_not_reached();
1272    }
1273    tcg_out_ldst(s, insn, ret, base, ofs, lgsz);
1274}
1275
1276static void tcg_out_st(TCGContext *s, TCGType type, TCGReg src,
1277                       TCGReg base, intptr_t ofs)
1278{
1279    AArch64Insn insn;
1280    int lgsz;
1281
1282    switch (type) {
1283    case TCG_TYPE_I32:
1284        insn = (src < 32 ? I3312_STRW : I3312_STRVS);
1285        lgsz = 2;
1286        break;
1287    case TCG_TYPE_I64:
1288        insn = (src < 32 ? I3312_STRX : I3312_STRVD);
1289        lgsz = 3;
1290        break;
1291    case TCG_TYPE_V64:
1292        insn = I3312_STRVD;
1293        lgsz = 3;
1294        break;
1295    case TCG_TYPE_V128:
1296        insn = I3312_STRVQ;
1297        lgsz = 4;
1298        break;
1299    default:
1300        g_assert_not_reached();
1301    }
1302    tcg_out_ldst(s, insn, src, base, ofs, lgsz);
1303}
1304
1305static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1306                               TCGReg base, intptr_t ofs)
1307{
1308    if (type <= TCG_TYPE_I64 && val == 0) {
1309        tcg_out_st(s, type, TCG_REG_XZR, base, ofs);
1310        return true;
1311    }
1312    return false;
1313}
1314
1315static inline void tcg_out_bfm(TCGContext *s, TCGType ext, TCGReg rd,
1316                               TCGReg rn, unsigned int a, unsigned int b)
1317{
1318    tcg_out_insn(s, 3402, BFM, ext, rd, rn, ext, a, b);
1319}
1320
1321static inline void tcg_out_ubfm(TCGContext *s, TCGType ext, TCGReg rd,
1322                                TCGReg rn, unsigned int a, unsigned int b)
1323{
1324    tcg_out_insn(s, 3402, UBFM, ext, rd, rn, ext, a, b);
1325}
1326
1327static inline void tcg_out_sbfm(TCGContext *s, TCGType ext, TCGReg rd,
1328                                TCGReg rn, unsigned int a, unsigned int b)
1329{
1330    tcg_out_insn(s, 3402, SBFM, ext, rd, rn, ext, a, b);
1331}
1332
1333static inline void tcg_out_extr(TCGContext *s, TCGType ext, TCGReg rd,
1334                                TCGReg rn, TCGReg rm, unsigned int a)
1335{
1336    tcg_out_insn(s, 3403, EXTR, ext, rd, rn, rm, a);
1337}
1338
1339static inline void tcg_out_shl(TCGContext *s, TCGType ext,
1340                               TCGReg rd, TCGReg rn, unsigned int m)
1341{
1342    int bits = ext ? 64 : 32;
1343    int max = bits - 1;
1344    tcg_out_ubfm(s, ext, rd, rn, (bits - m) & max, (max - m) & max);
1345}
1346
1347static inline void tcg_out_shr(TCGContext *s, TCGType ext,
1348                               TCGReg rd, TCGReg rn, unsigned int m)
1349{
1350    int max = ext ? 63 : 31;
1351    tcg_out_ubfm(s, ext, rd, rn, m & max, max);
1352}
1353
1354static inline void tcg_out_sar(TCGContext *s, TCGType ext,
1355                               TCGReg rd, TCGReg rn, unsigned int m)
1356{
1357    int max = ext ? 63 : 31;
1358    tcg_out_sbfm(s, ext, rd, rn, m & max, max);
1359}
1360
1361static inline void tcg_out_rotr(TCGContext *s, TCGType ext,
1362                                TCGReg rd, TCGReg rn, unsigned int m)
1363{
1364    int max = ext ? 63 : 31;
1365    tcg_out_extr(s, ext, rd, rn, rn, m & max);
1366}
1367
1368static inline void tcg_out_rotl(TCGContext *s, TCGType ext,
1369                                TCGReg rd, TCGReg rn, unsigned int m)
1370{
1371    int max = ext ? 63 : 31;
1372    tcg_out_extr(s, ext, rd, rn, rn, -m & max);
1373}
1374
1375static inline void tcg_out_dep(TCGContext *s, TCGType ext, TCGReg rd,
1376                               TCGReg rn, unsigned lsb, unsigned width)
1377{
1378    unsigned size = ext ? 64 : 32;
1379    unsigned a = (size - lsb) & (size - 1);
1380    unsigned b = width - 1;
1381    tcg_out_bfm(s, ext, rd, rn, a, b);
1382}
1383
1384static void tcg_out_cmp(TCGContext *s, TCGType ext, TCGCond cond, TCGReg a,
1385                        tcg_target_long b, bool const_b)
1386{
1387    if (is_tst_cond(cond)) {
1388        if (!const_b) {
1389            tcg_out_insn(s, 3510, ANDS, ext, TCG_REG_XZR, a, b);
1390        } else {
1391            tcg_out_logicali(s, I3404_ANDSI, ext, TCG_REG_XZR, a, b);
1392        }
1393    } else {
1394        if (!const_b) {
1395            tcg_out_insn(s, 3502, SUBS, ext, TCG_REG_XZR, a, b);
1396        } else if (b >= 0) {
1397            tcg_debug_assert(is_aimm(b));
1398            tcg_out_insn(s, 3401, SUBSI, ext, TCG_REG_XZR, a, b);
1399        } else {
1400            tcg_debug_assert(is_aimm(-b));
1401            tcg_out_insn(s, 3401, ADDSI, ext, TCG_REG_XZR, a, -b);
1402        }
1403    }
1404}
1405
1406static void tcg_out_goto(TCGContext *s, const tcg_insn_unit *target)
1407{
1408    ptrdiff_t offset = tcg_pcrel_diff(s, target) >> 2;
1409    tcg_debug_assert(offset == sextract64(offset, 0, 26));
1410    tcg_out_insn(s, 3206, B, offset);
1411}
1412
1413static void tcg_out_call_int(TCGContext *s, const tcg_insn_unit *target)
1414{
1415    ptrdiff_t offset = tcg_pcrel_diff(s, target) >> 2;
1416    if (offset == sextract64(offset, 0, 26)) {
1417        tcg_out_insn(s, 3206, BL, offset);
1418    } else {
1419        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, (intptr_t)target);
1420        tcg_out_insn(s, 3207, BLR, TCG_REG_TMP0);
1421    }
1422}
1423
/*
 * Emit a call to the helper at TARGET.  INFO is not used by this
 * backend; the work is delegated to tcg_out_call_int().
 */
static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target,
                         const TCGHelperInfo *info)
{
    tcg_out_call_int(s, target);
}
1429
1430static inline void tcg_out_goto_label(TCGContext *s, TCGLabel *l)
1431{
1432    if (!l->has_value) {
1433        tcg_out_reloc(s, s->code_ptr, R_AARCH64_JUMP26, l, 0);
1434        tcg_out_insn(s, 3206, B, 0);
1435    } else {
1436        tcg_out_goto(s, l->u.value_ptr);
1437    }
1438}
1439
1440static void tcg_out_brcond(TCGContext *s, TCGType ext, TCGCond c, TCGArg a,
1441                           TCGArg b, bool b_const, TCGLabel *l)
1442{
1443    int tbit = -1;
1444    bool need_cmp = true;
1445
1446    switch (c) {
1447    case TCG_COND_EQ:
1448    case TCG_COND_NE:
1449        /* cmp xN,0; b.ne L -> cbnz xN,L */
1450        if (b_const && b == 0) {
1451            need_cmp = false;
1452        }
1453        break;
1454    case TCG_COND_LT:
1455    case TCG_COND_GE:
1456        /* cmp xN,0; b.mi L -> tbnz xN,63,L */
1457        if (b_const && b == 0) {
1458            c = (c == TCG_COND_LT ? TCG_COND_TSTNE : TCG_COND_TSTEQ);
1459            tbit = ext ? 63 : 31;
1460            need_cmp = false;
1461        }
1462        break;
1463    case TCG_COND_TSTEQ:
1464    case TCG_COND_TSTNE:
1465        /* tst xN,0xffffffff; b.ne L -> cbnz wN,L */
1466        if (b_const && b == UINT32_MAX) {
1467            c = tcg_tst_eqne_cond(c);
1468            ext = TCG_TYPE_I32;
1469            need_cmp = false;
1470            break;
1471        }
1472        /* tst xN,1<<B; b.ne L -> tbnz xN,B,L */
1473        if (b_const && is_power_of_2(b)) {
1474            tbit = ctz64(b);
1475            need_cmp = false;
1476        }
1477        break;
1478    default:
1479        break;
1480    }
1481
1482    if (need_cmp) {
1483        tcg_out_cmp(s, ext, c, a, b, b_const);
1484        tcg_out_reloc(s, s->code_ptr, R_AARCH64_CONDBR19, l, 0);
1485        tcg_out_insn(s, 3202, B_C, c, 0);
1486        return;
1487    }
1488
1489    if (tbit >= 0) {
1490        tcg_out_reloc(s, s->code_ptr, R_AARCH64_TSTBR14, l, 0);
1491        switch (c) {
1492        case TCG_COND_TSTEQ:
1493            tcg_out_insn(s, 3205, TBZ, a, tbit, 0);
1494            break;
1495        case TCG_COND_TSTNE:
1496            tcg_out_insn(s, 3205, TBNZ, a, tbit, 0);
1497            break;
1498        default:
1499            g_assert_not_reached();
1500        }
1501    } else {
1502        tcg_out_reloc(s, s->code_ptr, R_AARCH64_CONDBR19, l, 0);
1503        switch (c) {
1504        case TCG_COND_EQ:
1505            tcg_out_insn(s, 3201, CBZ, ext, a, 0);
1506            break;
1507        case TCG_COND_NE:
1508            tcg_out_insn(s, 3201, CBNZ, ext, a, 0);
1509            break;
1510        default:
1511            g_assert_not_reached();
1512        }
1513    }
1514}
1515
1516static inline void tcg_out_rev(TCGContext *s, int ext, MemOp s_bits,
1517                               TCGReg rd, TCGReg rn)
1518{
1519    /* REV, REV16, REV32 */
1520    tcg_out_insn_3507(s, I3507_REV | (s_bits << 10), ext, rd, rn);
1521}
1522
1523static inline void tcg_out_sxt(TCGContext *s, TCGType ext, MemOp s_bits,
1524                               TCGReg rd, TCGReg rn)
1525{
1526    /* Using ALIASes SXTB, SXTH, SXTW, of SBFM Xd, Xn, #0, #7|15|31 */
1527    int bits = (8 << s_bits) - 1;
1528    tcg_out_sbfm(s, ext, rd, rn, 0, bits);
1529}
1530
1531static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg rd, TCGReg rn)
1532{
1533    tcg_out_sxt(s, type, MO_8, rd, rn);
1534}
1535
1536static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg rd, TCGReg rn)
1537{
1538    tcg_out_sxt(s, type, MO_16, rd, rn);
1539}
1540
1541static void tcg_out_ext32s(TCGContext *s, TCGReg rd, TCGReg rn)
1542{
1543    tcg_out_sxt(s, TCG_TYPE_I64, MO_32, rd, rn);
1544}
1545
1546static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg rd, TCGReg rn)
1547{
1548    tcg_out_ext32s(s, rd, rn);
1549}
1550
1551static inline void tcg_out_uxt(TCGContext *s, MemOp s_bits,
1552                               TCGReg rd, TCGReg rn)
1553{
1554    /* Using ALIASes UXTB, UXTH of UBFM Wd, Wn, #0, #7|15 */
1555    int bits = (8 << s_bits) - 1;
1556    tcg_out_ubfm(s, 0, rd, rn, 0, bits);
1557}
1558
1559static void tcg_out_ext8u(TCGContext *s, TCGReg rd, TCGReg rn)
1560{
1561    tcg_out_uxt(s, MO_8, rd, rn);
1562}
1563
1564static void tcg_out_ext16u(TCGContext *s, TCGReg rd, TCGReg rn)
1565{
1566    tcg_out_uxt(s, MO_16, rd, rn);
1567}
1568
1569static void tcg_out_ext32u(TCGContext *s, TCGReg rd, TCGReg rn)
1570{
1571    tcg_out_movr(s, TCG_TYPE_I32, rd, rn);
1572}
1573
1574static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg rd, TCGReg rn)
1575{
1576    tcg_out_ext32u(s, rd, rn);
1577}
1578
1579static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg rd, TCGReg rn)
1580{
1581    tcg_out_mov(s, TCG_TYPE_I32, rd, rn);
1582}
1583
1584static void tcg_out_addsubi(TCGContext *s, int ext, TCGReg rd,
1585                            TCGReg rn, int64_t aimm)
1586{
1587    if (aimm >= 0) {
1588        tcg_out_insn(s, 3401, ADDI, ext, rd, rn, aimm);
1589    } else {
1590        tcg_out_insn(s, 3401, SUBI, ext, rd, rn, -aimm);
1591    }
1592}
1593
1594static void tcg_out_addsub2(TCGContext *s, TCGType ext, TCGReg rl,
1595                            TCGReg rh, TCGReg al, TCGReg ah,
1596                            tcg_target_long bl, tcg_target_long bh,
1597                            bool const_bl, bool const_bh, bool sub)
1598{
1599    TCGReg orig_rl = rl;
1600    AArch64Insn insn;
1601
1602    if (rl == ah || (!const_bh && rl == bh)) {
1603        rl = TCG_REG_TMP0;
1604    }
1605
1606    if (const_bl) {
1607        if (bl < 0) {
1608            bl = -bl;
1609            insn = sub ? I3401_ADDSI : I3401_SUBSI;
1610        } else {
1611            insn = sub ? I3401_SUBSI : I3401_ADDSI;
1612        }
1613
1614        if (unlikely(al == TCG_REG_XZR)) {
1615            /* ??? We want to allow al to be zero for the benefit of
1616               negation via subtraction.  However, that leaves open the
1617               possibility of adding 0+const in the low part, and the
1618               immediate add instructions encode XSP not XZR.  Don't try
1619               anything more elaborate here than loading another zero.  */
1620            al = TCG_REG_TMP0;
1621            tcg_out_movi(s, ext, al, 0);
1622        }
1623        tcg_out_insn_3401(s, insn, ext, rl, al, bl);
1624    } else {
1625        tcg_out_insn_3502(s, sub ? I3502_SUBS : I3502_ADDS, ext, rl, al, bl);
1626    }
1627
1628    insn = I3503_ADC;
1629    if (const_bh) {
1630        /* Note that the only two constants we support are 0 and -1, and
1631           that SBC = rn + ~rm + c, so adc -1 is sbc 0, and vice-versa.  */
1632        if ((bh != 0) ^ sub) {
1633            insn = I3503_SBC;
1634        }
1635        bh = TCG_REG_XZR;
1636    } else if (sub) {
1637        insn = I3503_SBC;
1638    }
1639    tcg_out_insn_3503(s, insn, ext, rh, ah, bh);
1640
1641    tcg_out_mov(s, ext, orig_rl, rl);
1642}
1643
1644static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1645{
1646    static const uint32_t sync[] = {
1647        [0 ... TCG_MO_ALL]            = DMB_ISH | DMB_LD | DMB_ST,
1648        [TCG_MO_ST_ST]                = DMB_ISH | DMB_ST,
1649        [TCG_MO_LD_LD]                = DMB_ISH | DMB_LD,
1650        [TCG_MO_LD_ST]                = DMB_ISH | DMB_LD,
1651        [TCG_MO_LD_ST | TCG_MO_LD_LD] = DMB_ISH | DMB_LD,
1652    };
1653    tcg_out32(s, sync[a0 & TCG_MO_ALL]);
1654}
1655
1656static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d,
1657                         TCGReg a0, TCGArg b, bool const_b, bool is_ctz)
1658{
1659    TCGReg a1 = a0;
1660    if (is_ctz) {
1661        a1 = TCG_REG_TMP0;
1662        tcg_out_insn(s, 3507, RBIT, ext, a1, a0);
1663    }
1664    if (const_b && b == (ext ? 64 : 32)) {
1665        tcg_out_insn(s, 3507, CLZ, ext, d, a1);
1666    } else {
1667        AArch64Insn sel = I3506_CSEL;
1668
1669        tcg_out_cmp(s, ext, TCG_COND_NE, a0, 0, 1);
1670        tcg_out_insn(s, 3507, CLZ, ext, TCG_REG_TMP0, a1);
1671
1672        if (const_b) {
1673            if (b == -1) {
1674                b = TCG_REG_XZR;
1675                sel = I3506_CSINV;
1676            } else if (b == 0) {
1677                b = TCG_REG_XZR;
1678            } else {
1679                tcg_out_movi(s, ext, d, b);
1680                b = d;
1681            }
1682        }
1683        tcg_out_insn_3506(s, sel, ext, d, TCG_REG_TMP0, b, TCG_COND_NE);
1684    }
1685}
1686
/*
 * Host addressing information for a guest memory access, filled in
 * by prepare_host_addr() and consumed by the direct load/store
 * emitters as a register-offset address.
 */
typedef struct {
    TCGReg base;            /* base register of the access */
    TCGReg index;           /* register offset added to the base */
    TCGType index_ext;      /* type passed as the extend field for index */
    TCGAtomAlign aa;        /* result of atom_and_align_for_opc() */
} HostAddress;
1693
/*
 * Report whether byte-swapping memory operations are available for
 * MEMOP.  This backend always answers false.
 */
bool tcg_target_has_memory_bswap(MemOp memop)
{
    return false;
}
1698
/*
 * Parameters handed to the common load/store helper-call routines by
 * the slow paths: a single scratch register, TCG_REG_TMP0.
 */
static const TCGLdstHelperParam ldst_helper_param = {
    .ntmp = 1, .tmp = { TCG_REG_TMP0 }
};
1702
1703static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
1704{
1705    MemOp opc = get_memop(lb->oi);
1706
1707    if (!reloc_pc19(lb->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
1708        return false;
1709    }
1710
1711    tcg_out_ld_helper_args(s, lb, &ldst_helper_param);
1712    tcg_out_call_int(s, qemu_ld_helpers[opc & MO_SIZE]);
1713    tcg_out_ld_helper_ret(s, lb, false, &ldst_helper_param);
1714    tcg_out_goto(s, lb->raddr);
1715    return true;
1716}
1717
1718static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
1719{
1720    MemOp opc = get_memop(lb->oi);
1721
1722    if (!reloc_pc19(lb->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
1723        return false;
1724    }
1725
1726    tcg_out_st_helper_args(s, lb, &ldst_helper_param);
1727    tcg_out_call_int(s, qemu_st_helpers[opc & MO_SIZE]);
1728    tcg_out_goto(s, lb->raddr);
1729    return true;
1730}
1731
/*
 * We expect to use a 7-bit scaled negative offset from ENV, so that
 * the TLB mask/table pair can be fetched with a single LDP.
 */
#define MIN_TLB_MASK_TABLE_OFS  -512
1734
/*
 * For system-mode, perform the TLB load and compare.
 * For user-mode, perform any required alignment tests.
 * In both cases, return a TCGLabelQemuLdst structure if the slow path
 * is required and fill in @h with the host address for the fast path.
 *
 * @addr_reg holds the guest address, @oi the memory operation and mmu
 * index, and @is_ld selects between the read and write comparators.
 * The emitted code uses TCG_REG_TMP0, TCG_REG_TMP1 and TCG_REG_TMP2
 * as scratch registers in the softmmu case.
 */
static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
                                           TCGReg addr_reg, MemOpIdx oi,
                                           bool is_ld)
{
    TCGType addr_type = s->addr_type;
    TCGLabelQemuLdst *ldst = NULL;
    MemOp opc = get_memop(oi);
    MemOp s_bits = opc & MO_SIZE;
    unsigned a_mask;

    h->aa = atom_and_align_for_opc(s, opc,
                                   have_lse2 ? MO_ATOM_WITHIN16
                                             : MO_ATOM_IFALIGN,
                                   s_bits == MO_128);
    /* Mask of the address bits that must be zero for the fast path. */
    a_mask = (1 << h->aa.align) - 1;

    if (tcg_use_softmmu) {
        unsigned s_mask = (1u << s_bits) - 1;
        unsigned mem_index = get_mmuidx(oi);
        TCGReg addr_adj;
        TCGType mask_type;
        uint64_t compare_mask;

        ldst = new_ldst_label(s);
        ldst->is_ld = is_ld;
        ldst->oi = oi;
        ldst->addrlo_reg = addr_reg;

        /* The index computation needs 64 bits only for large TLBs. */
        mask_type = (s->page_bits + s->tlb_dyn_max_bits > 32
                     ? TCG_TYPE_I64 : TCG_TYPE_I32);

        /* Load cpu->neg.tlb.f[mmu_idx].{mask,table} into {tmp0,tmp1}. */
        QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0);
        QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 8);
        tcg_out_insn(s, 3314, LDP, TCG_REG_TMP0, TCG_REG_TMP1, TCG_AREG0,
                     tlb_mask_table_ofs(s, mem_index), 1, 0);

        /* Extract the TLB index from the address into TMP0.  */
        tcg_out_insn(s, 3502S, AND_LSR, mask_type == TCG_TYPE_I64,
                     TCG_REG_TMP0, TCG_REG_TMP0, addr_reg,
                     s->page_bits - CPU_TLB_ENTRY_BITS);

        /* Add the tlb_table pointer, forming the CPUTLBEntry address. */
        tcg_out_insn(s, 3502, ADD, 1, TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP0);

        /* Load the tlb comparator into TMP0, and the fast path addend. */
        QEMU_BUILD_BUG_ON(HOST_BIG_ENDIAN);
        tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP1,
                   is_ld ? offsetof(CPUTLBEntry, addr_read)
                         : offsetof(CPUTLBEntry, addr_write));
        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1,
                   offsetof(CPUTLBEntry, addend));

        /*
         * For aligned accesses, we check the first byte and include
         * the alignment bits within the address.  For unaligned access,
         * we check that we don't cross pages using the address of the
         * last byte of the access.
         */
        if (a_mask >= s_mask) {
            addr_adj = addr_reg;
        } else {
            addr_adj = TCG_REG_TMP2;
            tcg_out_insn(s, 3401, ADDI, addr_type,
                         addr_adj, addr_reg, s_mask - a_mask);
        }
        compare_mask = (uint64_t)s->page_mask | a_mask;

        /* Store the page mask part of the address into TMP2.  */
        tcg_out_logicali(s, I3404_ANDI, addr_type, TCG_REG_TMP2,
                         addr_adj, compare_mask);

        /* Perform the address comparison. */
        tcg_out_cmp(s, addr_type, TCG_COND_NE, TCG_REG_TMP0, TCG_REG_TMP2, 0);

        /* If not equal, we jump to the slow path. */
        ldst->label_ptr[0] = s->code_ptr;
        tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);

        /* Fast path address: TLB addend (TMP1) plus the guest address. */
        h->base = TCG_REG_TMP1;
        h->index = addr_reg;
        h->index_ext = addr_type;
    } else {
        /* A slow path is needed only when an alignment test is emitted. */
        if (a_mask) {
            ldst = new_ldst_label(s);

            ldst->is_ld = is_ld;
            ldst->oi = oi;
            ldst->addrlo_reg = addr_reg;

            /* tst addr, #mask */
            tcg_out_logicali(s, I3404_ANDSI, 0, TCG_REG_XZR, addr_reg, a_mask);

            /* b.ne slow_path */
            ldst->label_ptr[0] = s->code_ptr;
            tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);
        }

        if (guest_base || addr_type == TCG_TYPE_I32) {
            /* Guest base register plus the guest address as index. */
            h->base = TCG_REG_GUEST_BASE;
            h->index = addr_reg;
            h->index_ext = addr_type;
        } else {
            /* The 64-bit guest address is used directly as the base. */
            h->base = addr_reg;
            h->index = TCG_REG_XZR;
            h->index_ext = TCG_TYPE_I64;
        }
    }

    return ldst;
}
1852
1853static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp memop, TCGType ext,
1854                                   TCGReg data_r, HostAddress h)
1855{
1856    switch (memop & MO_SSIZE) {
1857    case MO_UB:
1858        tcg_out_ldst_r(s, I3312_LDRB, data_r, h.base, h.index_ext, h.index);
1859        break;
1860    case MO_SB:
1861        tcg_out_ldst_r(s, ext ? I3312_LDRSBX : I3312_LDRSBW,
1862                       data_r, h.base, h.index_ext, h.index);
1863        break;
1864    case MO_UW:
1865        tcg_out_ldst_r(s, I3312_LDRH, data_r, h.base, h.index_ext, h.index);
1866        break;
1867    case MO_SW:
1868        tcg_out_ldst_r(s, (ext ? I3312_LDRSHX : I3312_LDRSHW),
1869                       data_r, h.base, h.index_ext, h.index);
1870        break;
1871    case MO_UL:
1872        tcg_out_ldst_r(s, I3312_LDRW, data_r, h.base, h.index_ext, h.index);
1873        break;
1874    case MO_SL:
1875        tcg_out_ldst_r(s, I3312_LDRSWX, data_r, h.base, h.index_ext, h.index);
1876        break;
1877    case MO_UQ:
1878        tcg_out_ldst_r(s, I3312_LDRX, data_r, h.base, h.index_ext, h.index);
1879        break;
1880    default:
1881        g_assert_not_reached();
1882    }
1883}
1884
1885static void tcg_out_qemu_st_direct(TCGContext *s, MemOp memop,
1886                                   TCGReg data_r, HostAddress h)
1887{
1888    switch (memop & MO_SIZE) {
1889    case MO_8:
1890        tcg_out_ldst_r(s, I3312_STRB, data_r, h.base, h.index_ext, h.index);
1891        break;
1892    case MO_16:
1893        tcg_out_ldst_r(s, I3312_STRH, data_r, h.base, h.index_ext, h.index);
1894        break;
1895    case MO_32:
1896        tcg_out_ldst_r(s, I3312_STRW, data_r, h.base, h.index_ext, h.index);
1897        break;
1898    case MO_64:
1899        tcg_out_ldst_r(s, I3312_STRX, data_r, h.base, h.index_ext, h.index);
1900        break;
1901    default:
1902        g_assert_not_reached();
1903    }
1904}
1905
1906static void tcg_out_qemu_ld(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
1907                            MemOpIdx oi, TCGType data_type)
1908{
1909    TCGLabelQemuLdst *ldst;
1910    HostAddress h;
1911
1912    ldst = prepare_host_addr(s, &h, addr_reg, oi, true);
1913    tcg_out_qemu_ld_direct(s, get_memop(oi), data_type, data_reg, h);
1914
1915    if (ldst) {
1916        ldst->type = data_type;
1917        ldst->datalo_reg = data_reg;
1918        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
1919    }
1920}
1921
1922static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
1923                            MemOpIdx oi, TCGType data_type)
1924{
1925    TCGLabelQemuLdst *ldst;
1926    HostAddress h;
1927
1928    ldst = prepare_host_addr(s, &h, addr_reg, oi, false);
1929    tcg_out_qemu_st_direct(s, get_memop(oi), data_reg, h);
1930
1931    if (ldst) {
1932        ldst->type = data_type;
1933        ldst->datalo_reg = data_reg;
1934        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
1935    }
1936}
1937
1938static void tcg_out_qemu_ldst_i128(TCGContext *s, TCGReg datalo, TCGReg datahi,
1939                                   TCGReg addr_reg, MemOpIdx oi, bool is_ld)
1940{
1941    TCGLabelQemuLdst *ldst;
1942    HostAddress h;
1943    TCGReg base;
1944    bool use_pair;
1945
1946    ldst = prepare_host_addr(s, &h, addr_reg, oi, is_ld);
1947
1948    /* Compose the final address, as LDP/STP have no indexing. */
1949    if (h.index == TCG_REG_XZR) {
1950        base = h.base;
1951    } else {
1952        base = TCG_REG_TMP2;
1953        if (h.index_ext == TCG_TYPE_I32) {
1954            /* add base, base, index, uxtw */
1955            tcg_out_insn(s, 3501, ADD, TCG_TYPE_I64, base,
1956                         h.base, h.index, MO_32, 0);
1957        } else {
1958            /* add base, base, index */
1959            tcg_out_insn(s, 3502, ADD, 1, base, h.base, h.index);
1960        }
1961    }
1962
1963    use_pair = h.aa.atom < MO_128 || have_lse2;
1964
1965    if (!use_pair) {
1966        tcg_insn_unit *branch = NULL;
1967        TCGReg ll, lh, sl, sh;
1968
1969        /*
1970         * If we have already checked for 16-byte alignment, that's all
1971         * we need. Otherwise we have determined that misaligned atomicity
1972         * may be handled with two 8-byte loads.
1973         */
1974        if (h.aa.align < MO_128) {
1975            /*
1976             * TODO: align should be MO_64, so we only need test bit 3,
1977             * which means we could use TBNZ instead of ANDS+B_C.
1978             */
1979            tcg_out_logicali(s, I3404_ANDSI, 0, TCG_REG_XZR, addr_reg, 15);
1980            branch = s->code_ptr;
1981            tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);
1982            use_pair = true;
1983        }
1984
1985        if (is_ld) {
1986            /*
1987             * 16-byte atomicity without LSE2 requires LDXP+STXP loop:
1988             *    ldxp lo, hi, [base]
1989             *    stxp t0, lo, hi, [base]
1990             *    cbnz t0, .-8
1991             * Require no overlap between data{lo,hi} and base.
1992             */
1993            if (datalo == base || datahi == base) {
1994                tcg_out_mov(s, TCG_TYPE_REG, TCG_REG_TMP2, base);
1995                base = TCG_REG_TMP2;
1996            }
1997            ll = sl = datalo;
1998            lh = sh = datahi;
1999        } else {
2000            /*
2001             * 16-byte atomicity without LSE2 requires LDXP+STXP loop:
2002             * 1: ldxp t0, t1, [base]
2003             *    stxp t0, lo, hi, [base]
2004             *    cbnz t0, 1b
2005             */
2006            tcg_debug_assert(base != TCG_REG_TMP0 && base != TCG_REG_TMP1);
2007            ll = TCG_REG_TMP0;
2008            lh = TCG_REG_TMP1;
2009            sl = datalo;
2010            sh = datahi;
2011        }
2012
2013        tcg_out_insn(s, 3306, LDXP, TCG_REG_XZR, ll, lh, base);
2014        tcg_out_insn(s, 3306, STXP, TCG_REG_TMP0, sl, sh, base);
2015        tcg_out_insn(s, 3201, CBNZ, 0, TCG_REG_TMP0, -2);
2016
2017        if (use_pair) {
2018            /* "b .+8", branching across the one insn of use_pair. */
2019            tcg_out_insn(s, 3206, B, 2);
2020            reloc_pc19(branch, tcg_splitwx_to_rx(s->code_ptr));
2021        }
2022    }
2023
2024    if (use_pair) {
2025        if (is_ld) {
2026            tcg_out_insn(s, 3314, LDP, datalo, datahi, base, 0, 1, 0);
2027        } else {
2028            tcg_out_insn(s, 3314, STP, datalo, datahi, base, 0, 1, 0);
2029        }
2030    }
2031
2032    if (ldst) {
2033        ldst->type = TCG_TYPE_I128;
2034        ldst->datalo_reg = datalo;
2035        ldst->datahi_reg = datahi;
2036        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2037    }
2038}
2039
2040static const tcg_insn_unit *tb_ret_addr;
2041
2042static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2043{
2044    const tcg_insn_unit *target;
2045    ptrdiff_t offset;
2046
2047    /* Reuse the zeroing that exists for goto_ptr.  */
2048    if (a0 == 0) {
2049        target = tcg_code_gen_epilogue;
2050    } else {
2051        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X0, a0);
2052        target = tb_ret_addr;
2053    }
2054
2055    offset = tcg_pcrel_diff(s, target) >> 2;
2056    if (offset == sextract64(offset, 0, 26)) {
2057        tcg_out_insn(s, 3206, B, offset);
2058    } else {
2059        /*
2060         * Only x16/x17 generate BTI type Jump (2),
2061         * other registers generate BTI type Jump|Call (3).
2062         */
2063        QEMU_BUILD_BUG_ON(TCG_REG_TMP0 != TCG_REG_X16);
2064        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, (intptr_t)target);
2065        tcg_out_insn(s, 3207, BR, TCG_REG_TMP0);
2066    }
2067}
2068
2069static void tcg_out_goto_tb(TCGContext *s, int which)
2070{
2071    /*
2072     * Direct branch, or indirect address load, will be patched
2073     * by tb_target_set_jmp_target.  Assert indirect load offset
2074     * in range early, regardless of direct branch distance.
2075     */
2076    intptr_t i_off = tcg_pcrel_diff(s, (void *)get_jmp_target_addr(s, which));
2077    tcg_debug_assert(i_off == sextract64(i_off, 0, 21));
2078
2079    set_jmp_insn_offset(s, which);
2080    tcg_out32(s, I3206_B);
2081    tcg_out_insn(s, 3207, BR, TCG_REG_TMP0);
2082    set_jmp_reset_offset(s, which);
2083    tcg_out_bti(s, BTI_J);
2084}
2085
2086void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2087                              uintptr_t jmp_rx, uintptr_t jmp_rw)
2088{
2089    uintptr_t d_addr = tb->jmp_target_addr[n];
2090    ptrdiff_t d_offset = d_addr - jmp_rx;
2091    tcg_insn_unit insn;
2092
2093    /* Either directly branch, or indirect branch load. */
2094    if (d_offset == sextract64(d_offset, 0, 28)) {
2095        insn = deposit32(I3206_B, 0, 26, d_offset >> 2);
2096    } else {
2097        uintptr_t i_addr = (uintptr_t)&tb->jmp_target_addr[n];
2098        ptrdiff_t i_offset = i_addr - jmp_rx;
2099
2100        /* Note that we asserted this in range in tcg_out_goto_tb. */
2101        insn = deposit32(I3305_LDR | TCG_REG_TMP0, 5, 19, i_offset >> 2);
2102    }
2103    qatomic_set((uint32_t *)jmp_rw, insn);
2104    flush_idcache_range(jmp_rx, jmp_rw, 4);
2105}
2106
2107static void tcg_out_op(TCGContext *s, TCGOpcode opc,
2108                       const TCGArg args[TCG_MAX_OP_ARGS],
2109                       const int const_args[TCG_MAX_OP_ARGS])
2110{
2111    /* 99% of the time, we can signal the use of extension registers
2112       by looking to see if the opcode handles 64-bit data.  */
2113    TCGType ext = (tcg_op_defs[opc].flags & TCG_OPF_64BIT) != 0;
2114
2115    /* Hoist the loads of the most common arguments.  */
2116    TCGArg a0 = args[0];
2117    TCGArg a1 = args[1];
2118    TCGArg a2 = args[2];
2119    int c2 = const_args[2];
2120
2121    /* Some operands are defined with "rZ" constraint, a register or
2122       the zero register.  These need not actually test args[I] == 0.  */
2123#define REG0(I)  (const_args[I] ? TCG_REG_XZR : (TCGReg)args[I])
2124
2125    switch (opc) {
2126    case INDEX_op_goto_ptr:
2127        tcg_out_insn(s, 3207, BR, a0);
2128        break;
2129
2130    case INDEX_op_br:
2131        tcg_out_goto_label(s, arg_label(a0));
2132        break;
2133
2134    case INDEX_op_ld8u_i32:
2135    case INDEX_op_ld8u_i64:
2136        tcg_out_ldst(s, I3312_LDRB, a0, a1, a2, 0);
2137        break;
2138    case INDEX_op_ld8s_i32:
2139        tcg_out_ldst(s, I3312_LDRSBW, a0, a1, a2, 0);
2140        break;
2141    case INDEX_op_ld8s_i64:
2142        tcg_out_ldst(s, I3312_LDRSBX, a0, a1, a2, 0);
2143        break;
2144    case INDEX_op_ld16u_i32:
2145    case INDEX_op_ld16u_i64:
2146        tcg_out_ldst(s, I3312_LDRH, a0, a1, a2, 1);
2147        break;
2148    case INDEX_op_ld16s_i32:
2149        tcg_out_ldst(s, I3312_LDRSHW, a0, a1, a2, 1);
2150        break;
2151    case INDEX_op_ld16s_i64:
2152        tcg_out_ldst(s, I3312_LDRSHX, a0, a1, a2, 1);
2153        break;
2154    case INDEX_op_ld_i32:
2155    case INDEX_op_ld32u_i64:
2156        tcg_out_ldst(s, I3312_LDRW, a0, a1, a2, 2);
2157        break;
2158    case INDEX_op_ld32s_i64:
2159        tcg_out_ldst(s, I3312_LDRSWX, a0, a1, a2, 2);
2160        break;
2161    case INDEX_op_ld_i64:
2162        tcg_out_ldst(s, I3312_LDRX, a0, a1, a2, 3);
2163        break;
2164
2165    case INDEX_op_st8_i32:
2166    case INDEX_op_st8_i64:
2167        tcg_out_ldst(s, I3312_STRB, REG0(0), a1, a2, 0);
2168        break;
2169    case INDEX_op_st16_i32:
2170    case INDEX_op_st16_i64:
2171        tcg_out_ldst(s, I3312_STRH, REG0(0), a1, a2, 1);
2172        break;
2173    case INDEX_op_st_i32:
2174    case INDEX_op_st32_i64:
2175        tcg_out_ldst(s, I3312_STRW, REG0(0), a1, a2, 2);
2176        break;
2177    case INDEX_op_st_i64:
2178        tcg_out_ldst(s, I3312_STRX, REG0(0), a1, a2, 3);
2179        break;
2180
2181    case INDEX_op_add_i32:
2182        a2 = (int32_t)a2;
2183        /* FALLTHRU */
2184    case INDEX_op_add_i64:
2185        if (c2) {
2186            tcg_out_addsubi(s, ext, a0, a1, a2);
2187        } else {
2188            tcg_out_insn(s, 3502, ADD, ext, a0, a1, a2);
2189        }
2190        break;
2191
2192    case INDEX_op_sub_i32:
2193        a2 = (int32_t)a2;
2194        /* FALLTHRU */
2195    case INDEX_op_sub_i64:
2196        if (c2) {
2197            tcg_out_addsubi(s, ext, a0, a1, -a2);
2198        } else {
2199            tcg_out_insn(s, 3502, SUB, ext, a0, a1, a2);
2200        }
2201        break;
2202
2203    case INDEX_op_neg_i64:
2204    case INDEX_op_neg_i32:
2205        tcg_out_insn(s, 3502, SUB, ext, a0, TCG_REG_XZR, a1);
2206        break;
2207
2208    case INDEX_op_and_i32:
2209        a2 = (int32_t)a2;
2210        /* FALLTHRU */
2211    case INDEX_op_and_i64:
2212        if (c2) {
2213            tcg_out_logicali(s, I3404_ANDI, ext, a0, a1, a2);
2214        } else {
2215            tcg_out_insn(s, 3510, AND, ext, a0, a1, a2);
2216        }
2217        break;
2218
2219    case INDEX_op_andc_i32:
2220        a2 = (int32_t)a2;
2221        /* FALLTHRU */
2222    case INDEX_op_andc_i64:
2223        if (c2) {
2224            tcg_out_logicali(s, I3404_ANDI, ext, a0, a1, ~a2);
2225        } else {
2226            tcg_out_insn(s, 3510, BIC, ext, a0, a1, a2);
2227        }
2228        break;
2229
2230    case INDEX_op_or_i32:
2231        a2 = (int32_t)a2;
2232        /* FALLTHRU */
2233    case INDEX_op_or_i64:
2234        if (c2) {
2235            tcg_out_logicali(s, I3404_ORRI, ext, a0, a1, a2);
2236        } else {
2237            tcg_out_insn(s, 3510, ORR, ext, a0, a1, a2);
2238        }
2239        break;
2240
2241    case INDEX_op_orc_i32:
2242        a2 = (int32_t)a2;
2243        /* FALLTHRU */
2244    case INDEX_op_orc_i64:
2245        if (c2) {
2246            tcg_out_logicali(s, I3404_ORRI, ext, a0, a1, ~a2);
2247        } else {
2248            tcg_out_insn(s, 3510, ORN, ext, a0, a1, a2);
2249        }
2250        break;
2251
2252    case INDEX_op_xor_i32:
2253        a2 = (int32_t)a2;
2254        /* FALLTHRU */
2255    case INDEX_op_xor_i64:
2256        if (c2) {
2257            tcg_out_logicali(s, I3404_EORI, ext, a0, a1, a2);
2258        } else {
2259            tcg_out_insn(s, 3510, EOR, ext, a0, a1, a2);
2260        }
2261        break;
2262
2263    case INDEX_op_eqv_i32:
2264        a2 = (int32_t)a2;
2265        /* FALLTHRU */
2266    case INDEX_op_eqv_i64:
2267        if (c2) {
2268            tcg_out_logicali(s, I3404_EORI, ext, a0, a1, ~a2);
2269        } else {
2270            tcg_out_insn(s, 3510, EON, ext, a0, a1, a2);
2271        }
2272        break;
2273
2274    case INDEX_op_not_i64:
2275    case INDEX_op_not_i32:
2276        tcg_out_insn(s, 3510, ORN, ext, a0, TCG_REG_XZR, a1);
2277        break;
2278
2279    case INDEX_op_mul_i64:
2280    case INDEX_op_mul_i32:
2281        tcg_out_insn(s, 3509, MADD, ext, a0, a1, a2, TCG_REG_XZR);
2282        break;
2283
2284    case INDEX_op_div_i64:
2285    case INDEX_op_div_i32:
2286        tcg_out_insn(s, 3508, SDIV, ext, a0, a1, a2);
2287        break;
2288    case INDEX_op_divu_i64:
2289    case INDEX_op_divu_i32:
2290        tcg_out_insn(s, 3508, UDIV, ext, a0, a1, a2);
2291        break;
2292
2293    case INDEX_op_rem_i64:
2294    case INDEX_op_rem_i32:
2295        tcg_out_insn(s, 3508, SDIV, ext, TCG_REG_TMP0, a1, a2);
2296        tcg_out_insn(s, 3509, MSUB, ext, a0, TCG_REG_TMP0, a2, a1);
2297        break;
2298    case INDEX_op_remu_i64:
2299    case INDEX_op_remu_i32:
2300        tcg_out_insn(s, 3508, UDIV, ext, TCG_REG_TMP0, a1, a2);
2301        tcg_out_insn(s, 3509, MSUB, ext, a0, TCG_REG_TMP0, a2, a1);
2302        break;
2303
2304    case INDEX_op_shl_i64:
2305    case INDEX_op_shl_i32:
2306        if (c2) {
2307            tcg_out_shl(s, ext, a0, a1, a2);
2308        } else {
2309            tcg_out_insn(s, 3508, LSLV, ext, a0, a1, a2);
2310        }
2311        break;
2312
2313    case INDEX_op_shr_i64:
2314    case INDEX_op_shr_i32:
2315        if (c2) {
2316            tcg_out_shr(s, ext, a0, a1, a2);
2317        } else {
2318            tcg_out_insn(s, 3508, LSRV, ext, a0, a1, a2);
2319        }
2320        break;
2321
2322    case INDEX_op_sar_i64:
2323    case INDEX_op_sar_i32:
2324        if (c2) {
2325            tcg_out_sar(s, ext, a0, a1, a2);
2326        } else {
2327            tcg_out_insn(s, 3508, ASRV, ext, a0, a1, a2);
2328        }
2329        break;
2330
2331    case INDEX_op_rotr_i64:
2332    case INDEX_op_rotr_i32:
2333        if (c2) {
2334            tcg_out_rotr(s, ext, a0, a1, a2);
2335        } else {
2336            tcg_out_insn(s, 3508, RORV, ext, a0, a1, a2);
2337        }
2338        break;
2339
2340    case INDEX_op_rotl_i64:
2341    case INDEX_op_rotl_i32:
2342        if (c2) {
2343            tcg_out_rotl(s, ext, a0, a1, a2);
2344        } else {
2345            tcg_out_insn(s, 3502, SUB, 0, TCG_REG_TMP0, TCG_REG_XZR, a2);
2346            tcg_out_insn(s, 3508, RORV, ext, a0, a1, TCG_REG_TMP0);
2347        }
2348        break;
2349
2350    case INDEX_op_clz_i64:
2351    case INDEX_op_clz_i32:
2352        tcg_out_cltz(s, ext, a0, a1, a2, c2, false);
2353        break;
2354    case INDEX_op_ctz_i64:
2355    case INDEX_op_ctz_i32:
2356        tcg_out_cltz(s, ext, a0, a1, a2, c2, true);
2357        break;
2358
2359    case INDEX_op_brcond_i32:
2360        a1 = (int32_t)a1;
2361        /* FALLTHRU */
2362    case INDEX_op_brcond_i64:
2363        tcg_out_brcond(s, ext, a2, a0, a1, const_args[1], arg_label(args[3]));
2364        break;
2365
2366    case INDEX_op_setcond_i32:
2367        a2 = (int32_t)a2;
2368        /* FALLTHRU */
2369    case INDEX_op_setcond_i64:
2370        tcg_out_cmp(s, ext, args[3], a1, a2, c2);
2371        /* Use CSET alias of CSINC Wd, WZR, WZR, invert(cond).  */
2372        tcg_out_insn(s, 3506, CSINC, TCG_TYPE_I32, a0, TCG_REG_XZR,
2373                     TCG_REG_XZR, tcg_invert_cond(args[3]));
2374        break;
2375
2376    case INDEX_op_negsetcond_i32:
2377        a2 = (int32_t)a2;
2378        /* FALLTHRU */
2379    case INDEX_op_negsetcond_i64:
2380        tcg_out_cmp(s, ext, args[3], a1, a2, c2);
2381        /* Use CSETM alias of CSINV Wd, WZR, WZR, invert(cond).  */
2382        tcg_out_insn(s, 3506, CSINV, ext, a0, TCG_REG_XZR,
2383                     TCG_REG_XZR, tcg_invert_cond(args[3]));
2384        break;
2385
2386    case INDEX_op_movcond_i32:
2387        a2 = (int32_t)a2;
2388        /* FALLTHRU */
2389    case INDEX_op_movcond_i64:
2390        tcg_out_cmp(s, ext, args[5], a1, a2, c2);
2391        tcg_out_insn(s, 3506, CSEL, ext, a0, REG0(3), REG0(4), args[5]);
2392        break;
2393
2394    case INDEX_op_qemu_ld_a32_i32:
2395    case INDEX_op_qemu_ld_a64_i32:
2396    case INDEX_op_qemu_ld_a32_i64:
2397    case INDEX_op_qemu_ld_a64_i64:
2398        tcg_out_qemu_ld(s, a0, a1, a2, ext);
2399        break;
2400    case INDEX_op_qemu_st_a32_i32:
2401    case INDEX_op_qemu_st_a64_i32:
2402    case INDEX_op_qemu_st_a32_i64:
2403    case INDEX_op_qemu_st_a64_i64:
2404        tcg_out_qemu_st(s, REG0(0), a1, a2, ext);
2405        break;
2406    case INDEX_op_qemu_ld_a32_i128:
2407    case INDEX_op_qemu_ld_a64_i128:
2408        tcg_out_qemu_ldst_i128(s, a0, a1, a2, args[3], true);
2409        break;
2410    case INDEX_op_qemu_st_a32_i128:
2411    case INDEX_op_qemu_st_a64_i128:
2412        tcg_out_qemu_ldst_i128(s, REG0(0), REG0(1), a2, args[3], false);
2413        break;
2414
2415    case INDEX_op_bswap64_i64:
2416        tcg_out_rev(s, TCG_TYPE_I64, MO_64, a0, a1);
2417        break;
2418    case INDEX_op_bswap32_i64:
2419        tcg_out_rev(s, TCG_TYPE_I32, MO_32, a0, a1);
2420        if (a2 & TCG_BSWAP_OS) {
2421            tcg_out_ext32s(s, a0, a0);
2422        }
2423        break;
2424    case INDEX_op_bswap32_i32:
2425        tcg_out_rev(s, TCG_TYPE_I32, MO_32, a0, a1);
2426        break;
2427    case INDEX_op_bswap16_i64:
2428    case INDEX_op_bswap16_i32:
2429        tcg_out_rev(s, TCG_TYPE_I32, MO_16, a0, a1);
2430        if (a2 & TCG_BSWAP_OS) {
2431            /* Output must be sign-extended. */
2432            tcg_out_ext16s(s, ext, a0, a0);
2433        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2434            /* Output must be zero-extended, but input isn't. */
2435            tcg_out_ext16u(s, a0, a0);
2436        }
2437        break;
2438
2439    case INDEX_op_deposit_i64:
2440    case INDEX_op_deposit_i32:
2441        tcg_out_dep(s, ext, a0, REG0(2), args[3], args[4]);
2442        break;
2443
2444    case INDEX_op_extract_i64:
2445    case INDEX_op_extract_i32:
2446        tcg_out_ubfm(s, ext, a0, a1, a2, a2 + args[3] - 1);
2447        break;
2448
2449    case INDEX_op_sextract_i64:
2450    case INDEX_op_sextract_i32:
2451        tcg_out_sbfm(s, ext, a0, a1, a2, a2 + args[3] - 1);
2452        break;
2453
2454    case INDEX_op_extract2_i64:
2455    case INDEX_op_extract2_i32:
2456        tcg_out_extr(s, ext, a0, REG0(2), REG0(1), args[3]);
2457        break;
2458
2459    case INDEX_op_add2_i32:
2460        tcg_out_addsub2(s, TCG_TYPE_I32, a0, a1, REG0(2), REG0(3),
2461                        (int32_t)args[4], args[5], const_args[4],
2462                        const_args[5], false);
2463        break;
2464    case INDEX_op_add2_i64:
2465        tcg_out_addsub2(s, TCG_TYPE_I64, a0, a1, REG0(2), REG0(3), args[4],
2466                        args[5], const_args[4], const_args[5], false);
2467        break;
2468    case INDEX_op_sub2_i32:
2469        tcg_out_addsub2(s, TCG_TYPE_I32, a0, a1, REG0(2), REG0(3),
2470                        (int32_t)args[4], args[5], const_args[4],
2471                        const_args[5], true);
2472        break;
2473    case INDEX_op_sub2_i64:
2474        tcg_out_addsub2(s, TCG_TYPE_I64, a0, a1, REG0(2), REG0(3), args[4],
2475                        args[5], const_args[4], const_args[5], true);
2476        break;
2477
2478    case INDEX_op_muluh_i64:
2479        tcg_out_insn(s, 3508, UMULH, TCG_TYPE_I64, a0, a1, a2);
2480        break;
2481    case INDEX_op_mulsh_i64:
2482        tcg_out_insn(s, 3508, SMULH, TCG_TYPE_I64, a0, a1, a2);
2483        break;
2484
2485    case INDEX_op_mb:
2486        tcg_out_mb(s, a0);
2487        break;
2488
2489    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2490    case INDEX_op_mov_i64:
2491    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2492    case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
2493    case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
2494    case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
2495    case INDEX_op_ext8s_i64:
2496    case INDEX_op_ext8u_i32:
2497    case INDEX_op_ext8u_i64:
2498    case INDEX_op_ext16s_i64:
2499    case INDEX_op_ext16s_i32:
2500    case INDEX_op_ext16u_i64:
2501    case INDEX_op_ext16u_i32:
2502    case INDEX_op_ext32s_i64:
2503    case INDEX_op_ext32u_i64:
2504    case INDEX_op_ext_i32_i64:
2505    case INDEX_op_extu_i32_i64:
2506    case INDEX_op_extrl_i64_i32:
2507    default:
2508        g_assert_not_reached();
2509    }
2510
2511#undef REG0
2512}
2513
2514static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2515                           unsigned vecl, unsigned vece,
2516                           const TCGArg args[TCG_MAX_OP_ARGS],
2517                           const int const_args[TCG_MAX_OP_ARGS])
2518{
2519    static const AArch64Insn cmp_vec_insn[16] = {
2520        [TCG_COND_EQ] = I3616_CMEQ,
2521        [TCG_COND_GT] = I3616_CMGT,
2522        [TCG_COND_GE] = I3616_CMGE,
2523        [TCG_COND_GTU] = I3616_CMHI,
2524        [TCG_COND_GEU] = I3616_CMHS,
2525    };
2526    static const AArch64Insn cmp_scalar_insn[16] = {
2527        [TCG_COND_EQ] = I3611_CMEQ,
2528        [TCG_COND_GT] = I3611_CMGT,
2529        [TCG_COND_GE] = I3611_CMGE,
2530        [TCG_COND_GTU] = I3611_CMHI,
2531        [TCG_COND_GEU] = I3611_CMHS,
2532    };
2533    static const AArch64Insn cmp0_vec_insn[16] = {
2534        [TCG_COND_EQ] = I3617_CMEQ0,
2535        [TCG_COND_GT] = I3617_CMGT0,
2536        [TCG_COND_GE] = I3617_CMGE0,
2537        [TCG_COND_LT] = I3617_CMLT0,
2538        [TCG_COND_LE] = I3617_CMLE0,
2539    };
2540    static const AArch64Insn cmp0_scalar_insn[16] = {
2541        [TCG_COND_EQ] = I3612_CMEQ0,
2542        [TCG_COND_GT] = I3612_CMGT0,
2543        [TCG_COND_GE] = I3612_CMGE0,
2544        [TCG_COND_LT] = I3612_CMLT0,
2545        [TCG_COND_LE] = I3612_CMLE0,
2546    };
2547
2548    TCGType type = vecl + TCG_TYPE_V64;
2549    unsigned is_q = vecl;
2550    bool is_scalar = !is_q && vece == MO_64;
2551    TCGArg a0, a1, a2, a3;
2552    int cmode, imm8;
2553
2554    a0 = args[0];
2555    a1 = args[1];
2556    a2 = args[2];
2557
2558    switch (opc) {
2559    case INDEX_op_ld_vec:
2560        tcg_out_ld(s, type, a0, a1, a2);
2561        break;
2562    case INDEX_op_st_vec:
2563        tcg_out_st(s, type, a0, a1, a2);
2564        break;
2565    case INDEX_op_dupm_vec:
2566        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2567        break;
2568    case INDEX_op_add_vec:
2569        if (is_scalar) {
2570            tcg_out_insn(s, 3611, ADD, vece, a0, a1, a2);
2571        } else {
2572            tcg_out_insn(s, 3616, ADD, is_q, vece, a0, a1, a2);
2573        }
2574        break;
2575    case INDEX_op_sub_vec:
2576        if (is_scalar) {
2577            tcg_out_insn(s, 3611, SUB, vece, a0, a1, a2);
2578        } else {
2579            tcg_out_insn(s, 3616, SUB, is_q, vece, a0, a1, a2);
2580        }
2581        break;
2582    case INDEX_op_mul_vec:
2583        tcg_out_insn(s, 3616, MUL, is_q, vece, a0, a1, a2);
2584        break;
2585    case INDEX_op_neg_vec:
2586        if (is_scalar) {
2587            tcg_out_insn(s, 3612, NEG, vece, a0, a1);
2588        } else {
2589            tcg_out_insn(s, 3617, NEG, is_q, vece, a0, a1);
2590        }
2591        break;
2592    case INDEX_op_abs_vec:
2593        if (is_scalar) {
2594            tcg_out_insn(s, 3612, ABS, vece, a0, a1);
2595        } else {
2596            tcg_out_insn(s, 3617, ABS, is_q, vece, a0, a1);
2597        }
2598        break;
2599    case INDEX_op_and_vec:
2600        if (const_args[2]) {
2601            is_shimm1632(~a2, &cmode, &imm8);
2602            if (a0 == a1) {
2603                tcg_out_insn(s, 3606, BIC, is_q, a0, 0, cmode, imm8);
2604                return;
2605            }
2606            tcg_out_insn(s, 3606, MVNI, is_q, a0, 0, cmode, imm8);
2607            a2 = a0;
2608        }
2609        tcg_out_insn(s, 3616, AND, is_q, 0, a0, a1, a2);
2610        break;
2611    case INDEX_op_or_vec:
2612        if (const_args[2]) {
2613            is_shimm1632(a2, &cmode, &imm8);
2614            if (a0 == a1) {
2615                tcg_out_insn(s, 3606, ORR, is_q, a0, 0, cmode, imm8);
2616                return;
2617            }
2618            tcg_out_insn(s, 3606, MOVI, is_q, a0, 0, cmode, imm8);
2619            a2 = a0;
2620        }
2621        tcg_out_insn(s, 3616, ORR, is_q, 0, a0, a1, a2);
2622        break;
2623    case INDEX_op_andc_vec:
2624        if (const_args[2]) {
2625            is_shimm1632(a2, &cmode, &imm8);
2626            if (a0 == a1) {
2627                tcg_out_insn(s, 3606, BIC, is_q, a0, 0, cmode, imm8);
2628                return;
2629            }
2630            tcg_out_insn(s, 3606, MOVI, is_q, a0, 0, cmode, imm8);
2631            a2 = a0;
2632        }
2633        tcg_out_insn(s, 3616, BIC, is_q, 0, a0, a1, a2);
2634        break;
2635    case INDEX_op_orc_vec:
2636        if (const_args[2]) {
2637            is_shimm1632(~a2, &cmode, &imm8);
2638            if (a0 == a1) {
2639                tcg_out_insn(s, 3606, ORR, is_q, a0, 0, cmode, imm8);
2640                return;
2641            }
2642            tcg_out_insn(s, 3606, MVNI, is_q, a0, 0, cmode, imm8);
2643            a2 = a0;
2644        }
2645        tcg_out_insn(s, 3616, ORN, is_q, 0, a0, a1, a2);
2646        break;
2647    case INDEX_op_xor_vec:
2648        tcg_out_insn(s, 3616, EOR, is_q, 0, a0, a1, a2);
2649        break;
2650    case INDEX_op_ssadd_vec:
2651        if (is_scalar) {
2652            tcg_out_insn(s, 3611, SQADD, vece, a0, a1, a2);
2653        } else {
2654            tcg_out_insn(s, 3616, SQADD, is_q, vece, a0, a1, a2);
2655        }
2656        break;
2657    case INDEX_op_sssub_vec:
2658        if (is_scalar) {
2659            tcg_out_insn(s, 3611, SQSUB, vece, a0, a1, a2);
2660        } else {
2661            tcg_out_insn(s, 3616, SQSUB, is_q, vece, a0, a1, a2);
2662        }
2663        break;
2664    case INDEX_op_usadd_vec:
2665        if (is_scalar) {
2666            tcg_out_insn(s, 3611, UQADD, vece, a0, a1, a2);
2667        } else {
2668            tcg_out_insn(s, 3616, UQADD, is_q, vece, a0, a1, a2);
2669        }
2670        break;
2671    case INDEX_op_ussub_vec:
2672        if (is_scalar) {
2673            tcg_out_insn(s, 3611, UQSUB, vece, a0, a1, a2);
2674        } else {
2675            tcg_out_insn(s, 3616, UQSUB, is_q, vece, a0, a1, a2);
2676        }
2677        break;
2678    case INDEX_op_smax_vec:
2679        tcg_out_insn(s, 3616, SMAX, is_q, vece, a0, a1, a2);
2680        break;
2681    case INDEX_op_smin_vec:
2682        tcg_out_insn(s, 3616, SMIN, is_q, vece, a0, a1, a2);
2683        break;
2684    case INDEX_op_umax_vec:
2685        tcg_out_insn(s, 3616, UMAX, is_q, vece, a0, a1, a2);
2686        break;
2687    case INDEX_op_umin_vec:
2688        tcg_out_insn(s, 3616, UMIN, is_q, vece, a0, a1, a2);
2689        break;
2690    case INDEX_op_not_vec:
2691        tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a1);
2692        break;
2693    case INDEX_op_shli_vec:
2694        if (is_scalar) {
2695            tcg_out_insn(s, 3609, SHL, a0, a1, a2 + (8 << vece));
2696        } else {
2697            tcg_out_insn(s, 3614, SHL, is_q, a0, a1, a2 + (8 << vece));
2698        }
2699        break;
2700    case INDEX_op_shri_vec:
2701        if (is_scalar) {
2702            tcg_out_insn(s, 3609, USHR, a0, a1, (16 << vece) - a2);
2703        } else {
2704            tcg_out_insn(s, 3614, USHR, is_q, a0, a1, (16 << vece) - a2);
2705        }
2706        break;
2707    case INDEX_op_sari_vec:
2708        if (is_scalar) {
2709            tcg_out_insn(s, 3609, SSHR, a0, a1, (16 << vece) - a2);
2710        } else {
2711            tcg_out_insn(s, 3614, SSHR, is_q, a0, a1, (16 << vece) - a2);
2712        }
2713        break;
2714    case INDEX_op_aa64_sli_vec:
2715        if (is_scalar) {
2716            tcg_out_insn(s, 3609, SLI, a0, a2, args[3] + (8 << vece));
2717        } else {
2718            tcg_out_insn(s, 3614, SLI, is_q, a0, a2, args[3] + (8 << vece));
2719        }
2720        break;
2721    case INDEX_op_shlv_vec:
2722        if (is_scalar) {
2723            tcg_out_insn(s, 3611, USHL, vece, a0, a1, a2);
2724        } else {
2725            tcg_out_insn(s, 3616, USHL, is_q, vece, a0, a1, a2);
2726        }
2727        break;
2728    case INDEX_op_aa64_sshl_vec:
2729        if (is_scalar) {
2730            tcg_out_insn(s, 3611, SSHL, vece, a0, a1, a2);
2731        } else {
2732            tcg_out_insn(s, 3616, SSHL, is_q, vece, a0, a1, a2);
2733        }
2734        break;
2735    case INDEX_op_cmp_vec:
2736        {
2737            TCGCond cond = args[3];
2738            AArch64Insn insn;
2739
2740            switch (cond) {
2741            case TCG_COND_NE:
2742                if (const_args[2]) {
2743                    if (is_scalar) {
2744                        tcg_out_insn(s, 3611, CMTST, vece, a0, a1, a1);
2745                    } else {
2746                        tcg_out_insn(s, 3616, CMTST, is_q, vece, a0, a1, a1);
2747                    }
2748                } else {
2749                    if (is_scalar) {
2750                        tcg_out_insn(s, 3611, CMEQ, vece, a0, a1, a2);
2751                    } else {
2752                        tcg_out_insn(s, 3616, CMEQ, is_q, vece, a0, a1, a2);
2753                    }
2754                    tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a0);
2755                }
2756                break;
2757
2758            case TCG_COND_TSTNE:
2759            case TCG_COND_TSTEQ:
2760                if (const_args[2]) {
2761                    /* (x & 0) == 0 */
2762                    tcg_out_dupi_vec(s, type, MO_8, a0,
2763                                     -(cond == TCG_COND_TSTEQ));
2764                    break;
2765                }
2766                if (is_scalar) {
2767                    tcg_out_insn(s, 3611, CMTST, vece, a0, a1, a2);
2768                } else {
2769                    tcg_out_insn(s, 3616, CMTST, is_q, vece, a0, a1, a2);
2770                }
2771                if (cond == TCG_COND_TSTEQ) {
2772                    tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a0);
2773                }
2774                break;
2775
2776            default:
2777                if (const_args[2]) {
2778                    if (is_scalar) {
2779                        insn = cmp0_scalar_insn[cond];
2780                        if (insn) {
2781                            tcg_out_insn_3612(s, insn, vece, a0, a1);
2782                            break;
2783                        }
2784                    } else {
2785                        insn = cmp0_vec_insn[cond];
2786                        if (insn) {
2787                            tcg_out_insn_3617(s, insn, is_q, vece, a0, a1);
2788                            break;
2789                        }
2790                    }
2791                    tcg_out_dupi_vec(s, type, MO_8, TCG_VEC_TMP0, 0);
2792                    a2 = TCG_VEC_TMP0;
2793                }
2794                if (is_scalar) {
2795                    insn = cmp_scalar_insn[cond];
2796                    if (insn == 0) {
2797                        TCGArg t;
2798                        t = a1, a1 = a2, a2 = t;
2799                        cond = tcg_swap_cond(cond);
2800                        insn = cmp_scalar_insn[cond];
2801                        tcg_debug_assert(insn != 0);
2802                    }
2803                    tcg_out_insn_3611(s, insn, vece, a0, a1, a2);
2804                } else {
2805                    insn = cmp_vec_insn[cond];
2806                    if (insn == 0) {
2807                        TCGArg t;
2808                        t = a1, a1 = a2, a2 = t;
2809                        cond = tcg_swap_cond(cond);
2810                        insn = cmp_vec_insn[cond];
2811                        tcg_debug_assert(insn != 0);
2812                    }
2813                    tcg_out_insn_3616(s, insn, is_q, vece, a0, a1, a2);
2814                }
2815                break;
2816            }
2817        }
2818        break;
2819
2820    case INDEX_op_bitsel_vec:
2821        a3 = args[3];
2822        if (a0 == a3) {
2823            tcg_out_insn(s, 3616, BIT, is_q, 0, a0, a2, a1);
2824        } else if (a0 == a2) {
2825            tcg_out_insn(s, 3616, BIF, is_q, 0, a0, a3, a1);
2826        } else {
2827            if (a0 != a1) {
2828                tcg_out_mov(s, type, a0, a1);
2829            }
2830            tcg_out_insn(s, 3616, BSL, is_q, 0, a0, a2, a3);
2831        }
2832        break;
2833
2834    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
2835    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
2836    default:
2837        g_assert_not_reached();
2838    }
2839}
2840
2841int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
2842{
2843    switch (opc) {
2844    case INDEX_op_add_vec:
2845    case INDEX_op_sub_vec:
2846    case INDEX_op_and_vec:
2847    case INDEX_op_or_vec:
2848    case INDEX_op_xor_vec:
2849    case INDEX_op_andc_vec:
2850    case INDEX_op_orc_vec:
2851    case INDEX_op_neg_vec:
2852    case INDEX_op_abs_vec:
2853    case INDEX_op_not_vec:
2854    case INDEX_op_cmp_vec:
2855    case INDEX_op_shli_vec:
2856    case INDEX_op_shri_vec:
2857    case INDEX_op_sari_vec:
2858    case INDEX_op_ssadd_vec:
2859    case INDEX_op_sssub_vec:
2860    case INDEX_op_usadd_vec:
2861    case INDEX_op_ussub_vec:
2862    case INDEX_op_shlv_vec:
2863    case INDEX_op_bitsel_vec:
2864        return 1;
2865    case INDEX_op_rotli_vec:
2866    case INDEX_op_shrv_vec:
2867    case INDEX_op_sarv_vec:
2868    case INDEX_op_rotlv_vec:
2869    case INDEX_op_rotrv_vec:
2870        return -1;
2871    case INDEX_op_mul_vec:
2872    case INDEX_op_smax_vec:
2873    case INDEX_op_smin_vec:
2874    case INDEX_op_umax_vec:
2875    case INDEX_op_umin_vec:
2876        return vece < MO_64;
2877
2878    default:
2879        return 0;
2880    }
2881}
2882
2883void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
2884                       TCGArg a0, ...)
2885{
2886    va_list va;
2887    TCGv_vec v0, v1, v2, t1, t2, c1;
2888    TCGArg a2;
2889
2890    va_start(va, a0);
2891    v0 = temp_tcgv_vec(arg_temp(a0));
2892    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
2893    a2 = va_arg(va, TCGArg);
2894    va_end(va);
2895
2896    switch (opc) {
2897    case INDEX_op_rotli_vec:
2898        t1 = tcg_temp_new_vec(type);
2899        tcg_gen_shri_vec(vece, t1, v1, -a2 & ((8 << vece) - 1));
2900        vec_gen_4(INDEX_op_aa64_sli_vec, type, vece,
2901                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(v1), a2);
2902        tcg_temp_free_vec(t1);
2903        break;
2904
2905    case INDEX_op_shrv_vec:
2906    case INDEX_op_sarv_vec:
2907        /* Right shifts are negative left shifts for AArch64.  */
2908        v2 = temp_tcgv_vec(arg_temp(a2));
2909        t1 = tcg_temp_new_vec(type);
2910        tcg_gen_neg_vec(vece, t1, v2);
2911        opc = (opc == INDEX_op_shrv_vec
2912               ? INDEX_op_shlv_vec : INDEX_op_aa64_sshl_vec);
2913        vec_gen_3(opc, type, vece, tcgv_vec_arg(v0),
2914                  tcgv_vec_arg(v1), tcgv_vec_arg(t1));
2915        tcg_temp_free_vec(t1);
2916        break;
2917
2918    case INDEX_op_rotlv_vec:
2919        v2 = temp_tcgv_vec(arg_temp(a2));
2920        t1 = tcg_temp_new_vec(type);
2921        c1 = tcg_constant_vec(type, vece, 8 << vece);
2922        tcg_gen_sub_vec(vece, t1, v2, c1);
2923        /* Right shifts are negative left shifts for AArch64.  */
2924        vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1),
2925                  tcgv_vec_arg(v1), tcgv_vec_arg(t1));
2926        vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(v0),
2927                  tcgv_vec_arg(v1), tcgv_vec_arg(v2));
2928        tcg_gen_or_vec(vece, v0, v0, t1);
2929        tcg_temp_free_vec(t1);
2930        break;
2931
2932    case INDEX_op_rotrv_vec:
2933        v2 = temp_tcgv_vec(arg_temp(a2));
2934        t1 = tcg_temp_new_vec(type);
2935        t2 = tcg_temp_new_vec(type);
2936        c1 = tcg_constant_vec(type, vece, 8 << vece);
2937        tcg_gen_neg_vec(vece, t1, v2);
2938        tcg_gen_sub_vec(vece, t2, c1, v2);
2939        /* Right shifts are negative left shifts for AArch64.  */
2940        vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1),
2941                  tcgv_vec_arg(v1), tcgv_vec_arg(t1));
2942        vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t2),
2943                  tcgv_vec_arg(v1), tcgv_vec_arg(t2));
2944        tcg_gen_or_vec(vece, v0, t1, t2);
2945        tcg_temp_free_vec(t1);
2946        tcg_temp_free_vec(t2);
2947        break;
2948
2949    default:
2950        g_assert_not_reached();
2951    }
2952}
2953
2954static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
2955{
2956    switch (op) {
2957    case INDEX_op_goto_ptr:
2958        return C_O0_I1(r);
2959
2960    case INDEX_op_ld8u_i32:
2961    case INDEX_op_ld8s_i32:
2962    case INDEX_op_ld16u_i32:
2963    case INDEX_op_ld16s_i32:
2964    case INDEX_op_ld_i32:
2965    case INDEX_op_ld8u_i64:
2966    case INDEX_op_ld8s_i64:
2967    case INDEX_op_ld16u_i64:
2968    case INDEX_op_ld16s_i64:
2969    case INDEX_op_ld32u_i64:
2970    case INDEX_op_ld32s_i64:
2971    case INDEX_op_ld_i64:
2972    case INDEX_op_neg_i32:
2973    case INDEX_op_neg_i64:
2974    case INDEX_op_not_i32:
2975    case INDEX_op_not_i64:
2976    case INDEX_op_bswap16_i32:
2977    case INDEX_op_bswap32_i32:
2978    case INDEX_op_bswap16_i64:
2979    case INDEX_op_bswap32_i64:
2980    case INDEX_op_bswap64_i64:
2981    case INDEX_op_ext8s_i32:
2982    case INDEX_op_ext16s_i32:
2983    case INDEX_op_ext8u_i32:
2984    case INDEX_op_ext16u_i32:
2985    case INDEX_op_ext8s_i64:
2986    case INDEX_op_ext16s_i64:
2987    case INDEX_op_ext32s_i64:
2988    case INDEX_op_ext8u_i64:
2989    case INDEX_op_ext16u_i64:
2990    case INDEX_op_ext32u_i64:
2991    case INDEX_op_ext_i32_i64:
2992    case INDEX_op_extu_i32_i64:
2993    case INDEX_op_extract_i32:
2994    case INDEX_op_extract_i64:
2995    case INDEX_op_sextract_i32:
2996    case INDEX_op_sextract_i64:
2997        return C_O1_I1(r, r);
2998
2999    case INDEX_op_st8_i32:
3000    case INDEX_op_st16_i32:
3001    case INDEX_op_st_i32:
3002    case INDEX_op_st8_i64:
3003    case INDEX_op_st16_i64:
3004    case INDEX_op_st32_i64:
3005    case INDEX_op_st_i64:
3006        return C_O0_I2(rZ, r);
3007
3008    case INDEX_op_add_i32:
3009    case INDEX_op_add_i64:
3010    case INDEX_op_sub_i32:
3011    case INDEX_op_sub_i64:
3012        return C_O1_I2(r, r, rA);
3013
3014    case INDEX_op_setcond_i32:
3015    case INDEX_op_setcond_i64:
3016    case INDEX_op_negsetcond_i32:
3017    case INDEX_op_negsetcond_i64:
3018        return C_O1_I2(r, r, rC);
3019
3020    case INDEX_op_mul_i32:
3021    case INDEX_op_mul_i64:
3022    case INDEX_op_div_i32:
3023    case INDEX_op_div_i64:
3024    case INDEX_op_divu_i32:
3025    case INDEX_op_divu_i64:
3026    case INDEX_op_rem_i32:
3027    case INDEX_op_rem_i64:
3028    case INDEX_op_remu_i32:
3029    case INDEX_op_remu_i64:
3030    case INDEX_op_muluh_i64:
3031    case INDEX_op_mulsh_i64:
3032        return C_O1_I2(r, r, r);
3033
3034    case INDEX_op_and_i32:
3035    case INDEX_op_and_i64:
3036    case INDEX_op_or_i32:
3037    case INDEX_op_or_i64:
3038    case INDEX_op_xor_i32:
3039    case INDEX_op_xor_i64:
3040    case INDEX_op_andc_i32:
3041    case INDEX_op_andc_i64:
3042    case INDEX_op_orc_i32:
3043    case INDEX_op_orc_i64:
3044    case INDEX_op_eqv_i32:
3045    case INDEX_op_eqv_i64:
3046        return C_O1_I2(r, r, rL);
3047
3048    case INDEX_op_shl_i32:
3049    case INDEX_op_shr_i32:
3050    case INDEX_op_sar_i32:
3051    case INDEX_op_rotl_i32:
3052    case INDEX_op_rotr_i32:
3053    case INDEX_op_shl_i64:
3054    case INDEX_op_shr_i64:
3055    case INDEX_op_sar_i64:
3056    case INDEX_op_rotl_i64:
3057    case INDEX_op_rotr_i64:
3058        return C_O1_I2(r, r, ri);
3059
3060    case INDEX_op_clz_i32:
3061    case INDEX_op_ctz_i32:
3062    case INDEX_op_clz_i64:
3063    case INDEX_op_ctz_i64:
3064        return C_O1_I2(r, r, rAL);
3065
3066    case INDEX_op_brcond_i32:
3067    case INDEX_op_brcond_i64:
3068        return C_O0_I2(r, rC);
3069
3070    case INDEX_op_movcond_i32:
3071    case INDEX_op_movcond_i64:
3072        return C_O1_I4(r, r, rC, rZ, rZ);
3073
3074    case INDEX_op_qemu_ld_a32_i32:
3075    case INDEX_op_qemu_ld_a64_i32:
3076    case INDEX_op_qemu_ld_a32_i64:
3077    case INDEX_op_qemu_ld_a64_i64:
3078        return C_O1_I1(r, r);
3079    case INDEX_op_qemu_ld_a32_i128:
3080    case INDEX_op_qemu_ld_a64_i128:
3081        return C_O2_I1(r, r, r);
3082    case INDEX_op_qemu_st_a32_i32:
3083    case INDEX_op_qemu_st_a64_i32:
3084    case INDEX_op_qemu_st_a32_i64:
3085    case INDEX_op_qemu_st_a64_i64:
3086        return C_O0_I2(rZ, r);
3087    case INDEX_op_qemu_st_a32_i128:
3088    case INDEX_op_qemu_st_a64_i128:
3089        return C_O0_I3(rZ, rZ, r);
3090
3091    case INDEX_op_deposit_i32:
3092    case INDEX_op_deposit_i64:
3093        return C_O1_I2(r, 0, rZ);
3094
3095    case INDEX_op_extract2_i32:
3096    case INDEX_op_extract2_i64:
3097        return C_O1_I2(r, rZ, rZ);
3098
3099    case INDEX_op_add2_i32:
3100    case INDEX_op_add2_i64:
3101    case INDEX_op_sub2_i32:
3102    case INDEX_op_sub2_i64:
3103        return C_O2_I4(r, r, rZ, rZ, rA, rMZ);
3104
3105    case INDEX_op_add_vec:
3106    case INDEX_op_sub_vec:
3107    case INDEX_op_mul_vec:
3108    case INDEX_op_xor_vec:
3109    case INDEX_op_ssadd_vec:
3110    case INDEX_op_sssub_vec:
3111    case INDEX_op_usadd_vec:
3112    case INDEX_op_ussub_vec:
3113    case INDEX_op_smax_vec:
3114    case INDEX_op_smin_vec:
3115    case INDEX_op_umax_vec:
3116    case INDEX_op_umin_vec:
3117    case INDEX_op_shlv_vec:
3118    case INDEX_op_shrv_vec:
3119    case INDEX_op_sarv_vec:
3120    case INDEX_op_aa64_sshl_vec:
3121        return C_O1_I2(w, w, w);
3122    case INDEX_op_not_vec:
3123    case INDEX_op_neg_vec:
3124    case INDEX_op_abs_vec:
3125    case INDEX_op_shli_vec:
3126    case INDEX_op_shri_vec:
3127    case INDEX_op_sari_vec:
3128        return C_O1_I1(w, w);
3129    case INDEX_op_ld_vec:
3130    case INDEX_op_dupm_vec:
3131        return C_O1_I1(w, r);
3132    case INDEX_op_st_vec:
3133        return C_O0_I2(w, r);
3134    case INDEX_op_dup_vec:
3135        return C_O1_I1(w, wr);
3136    case INDEX_op_or_vec:
3137    case INDEX_op_andc_vec:
3138        return C_O1_I2(w, w, wO);
3139    case INDEX_op_and_vec:
3140    case INDEX_op_orc_vec:
3141        return C_O1_I2(w, w, wN);
3142    case INDEX_op_cmp_vec:
3143        return C_O1_I2(w, w, wZ);
3144    case INDEX_op_bitsel_vec:
3145        return C_O1_I3(w, w, w, w);
3146    case INDEX_op_aa64_sli_vec:
3147        return C_O1_I2(w, 0, w);
3148
3149    default:
3150        g_assert_not_reached();
3151    }
3152}
3153
3154static void tcg_target_init(TCGContext *s)
3155{
3156    tcg_target_available_regs[TCG_TYPE_I32] = 0xffffffffu;
3157    tcg_target_available_regs[TCG_TYPE_I64] = 0xffffffffu;
3158    tcg_target_available_regs[TCG_TYPE_V64] = 0xffffffff00000000ull;
3159    tcg_target_available_regs[TCG_TYPE_V128] = 0xffffffff00000000ull;
3160
3161    tcg_target_call_clobber_regs = -1ull;
3162    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X19);
3163    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X20);
3164    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X21);
3165    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X22);
3166    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X23);
3167    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X24);
3168    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X25);
3169    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X26);
3170    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X27);
3171    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X28);
3172    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X29);
3173    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V8);
3174    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V9);
3175    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V10);
3176    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V11);
3177    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V12);
3178    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V13);
3179    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V14);
3180    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V15);
3181
3182    s->reserved_regs = 0;
3183    tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP);
3184    tcg_regset_set_reg(s->reserved_regs, TCG_REG_FP);
3185    tcg_regset_set_reg(s->reserved_regs, TCG_REG_X18); /* platform register */
3186    tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP0);
3187    tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP1);
3188    tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP2);
3189    tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP0);
3190}
3191
3192/* Saving pairs: (X19, X20) .. (X27, X28), (X29(fp), X30(lr)).  */
3193#define PUSH_SIZE  ((30 - 19 + 1) * 8)
3194
3195#define FRAME_SIZE \
3196    ((PUSH_SIZE \
3197      + TCG_STATIC_CALL_ARGS_SIZE \
3198      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3199      + TCG_TARGET_STACK_ALIGN - 1) \
3200     & ~(TCG_TARGET_STACK_ALIGN - 1))
3201
3202/* We're expecting a 2 byte uleb128 encoded value.  */
3203QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3204
3205/* We're expecting to use a single ADDI insn.  */
3206QEMU_BUILD_BUG_ON(FRAME_SIZE - PUSH_SIZE > 0xfff);
3207
3208static void tcg_target_qemu_prologue(TCGContext *s)
3209{
3210    TCGReg r;
3211
3212    tcg_out_bti(s, BTI_C);
3213
3214    /* Push (FP, LR) and allocate space for all saved registers.  */
3215    tcg_out_insn(s, 3314, STP, TCG_REG_FP, TCG_REG_LR,
3216                 TCG_REG_SP, -PUSH_SIZE, 1, 1);
3217
3218    /* Set up frame pointer for canonical unwinding.  */
3219    tcg_out_movr_sp(s, TCG_TYPE_I64, TCG_REG_FP, TCG_REG_SP);
3220
3221    /* Store callee-preserved regs x19..x28.  */
3222    for (r = TCG_REG_X19; r <= TCG_REG_X27; r += 2) {
3223        int ofs = (r - TCG_REG_X19 + 2) * 8;
3224        tcg_out_insn(s, 3314, STP, r, r + 1, TCG_REG_SP, ofs, 1, 0);
3225    }
3226
3227    /* Make stack space for TCG locals.  */
3228    tcg_out_insn(s, 3401, SUBI, TCG_TYPE_I64, TCG_REG_SP, TCG_REG_SP,
3229                 FRAME_SIZE - PUSH_SIZE);
3230
3231    /* Inform TCG about how to find TCG locals with register, offset, size.  */
3232    tcg_set_frame(s, TCG_REG_SP, TCG_STATIC_CALL_ARGS_SIZE,
3233                  CPU_TEMP_BUF_NLONGS * sizeof(long));
3234
3235    if (!tcg_use_softmmu) {
3236        /*
3237         * Note that XZR cannot be encoded in the address base register slot,
3238         * as that actually encodes SP.  Depending on the guest, we may need
3239         * to zero-extend the guest address via the address index register slot,
3240         * therefore we need to load even a zero guest base into a register.
3241         */
3242        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_GUEST_BASE, guest_base);
3243        tcg_regset_set_reg(s->reserved_regs, TCG_REG_GUEST_BASE);
3244    }
3245
3246    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3247    tcg_out_insn(s, 3207, BR, tcg_target_call_iarg_regs[1]);
3248
3249    /*
3250     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3251     * and fall through to the rest of the epilogue.
3252     */
3253    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
3254    tcg_out_bti(s, BTI_J);
3255    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_X0, 0);
3256
3257    /* TB epilogue */
3258    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
3259    tcg_out_bti(s, BTI_J);
3260
3261    /* Remove TCG locals stack space.  */
3262    tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_SP, TCG_REG_SP,
3263                 FRAME_SIZE - PUSH_SIZE);
3264
3265    /* Restore registers x19..x28.  */
3266    for (r = TCG_REG_X19; r <= TCG_REG_X27; r += 2) {
3267        int ofs = (r - TCG_REG_X19 + 2) * 8;
3268        tcg_out_insn(s, 3314, LDP, r, r + 1, TCG_REG_SP, ofs, 1, 0);
3269    }
3270
3271    /* Pop (FP, LR), restore SP to previous frame.  */
3272    tcg_out_insn(s, 3314, LDP, TCG_REG_FP, TCG_REG_LR,
3273                 TCG_REG_SP, PUSH_SIZE, 0, 1);
3274    tcg_out_insn(s, 3207, RET, TCG_REG_LR);
3275}
3276
3277static void tcg_out_tb_start(TCGContext *s)
3278{
3279    tcg_out_bti(s, BTI_J);
3280}
3281
3282static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3283{
3284    int i;
3285    for (i = 0; i < count; ++i) {
3286        p[i] = NOP;
3287    }
3288}
3289
3290typedef struct {
3291    DebugFrameHeader h;
3292    uint8_t fde_def_cfa[4];
3293    uint8_t fde_reg_ofs[24];
3294} DebugFrame;
3295
3296#define ELF_HOST_MACHINE EM_AARCH64
3297
3298static const DebugFrame debug_frame = {
3299    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3300    .h.cie.id = -1,
3301    .h.cie.version = 1,
3302    .h.cie.code_align = 1,
3303    .h.cie.data_align = 0x78,             /* sleb128 -8 */
3304    .h.cie.return_column = TCG_REG_LR,
3305
3306    /* Total FDE size does not include the "len" member.  */
3307    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3308
3309    .fde_def_cfa = {
3310        12, TCG_REG_SP,                 /* DW_CFA_def_cfa sp, ... */
3311        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3312        (FRAME_SIZE >> 7)
3313    },
3314    .fde_reg_ofs = {
3315        0x80 + 28, 1,                   /* DW_CFA_offset, x28,  -8 */
3316        0x80 + 27, 2,                   /* DW_CFA_offset, x27, -16 */
3317        0x80 + 26, 3,                   /* DW_CFA_offset, x26, -24 */
3318        0x80 + 25, 4,                   /* DW_CFA_offset, x25, -32 */
3319        0x80 + 24, 5,                   /* DW_CFA_offset, x24, -40 */
3320        0x80 + 23, 6,                   /* DW_CFA_offset, x23, -48 */
3321        0x80 + 22, 7,                   /* DW_CFA_offset, x22, -56 */
3322        0x80 + 21, 8,                   /* DW_CFA_offset, x21, -64 */
3323        0x80 + 20, 9,                   /* DW_CFA_offset, x20, -72 */
3324        0x80 + 19, 10,                  /* DW_CFA_offset, x1p, -80 */
3325        0x80 + 30, 11,                  /* DW_CFA_offset,  lr, -88 */
3326        0x80 + 29, 12,                  /* DW_CFA_offset,  fp, -96 */
3327    }
3328};
3329
3330void tcg_register_jit(const void *buf, size_t buf_size)
3331{
3332    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
3333}
3334