xref: /openbmc/qemu/tcg/i386/tcg-target.c.inc (revision fcc54e7bf56ba627f9b6ac4a32c6b446d2591ccf)
1/*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#include "../tcg-ldst.c.inc"
26#include "../tcg-pool.c.inc"
27
28#ifdef CONFIG_DEBUG_TCG
29static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
30#if TCG_TARGET_REG_BITS == 64
31    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
32#else
33    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
34#endif
35    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
36    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
37#if TCG_TARGET_REG_BITS == 64
38    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
39    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
40#endif
41};
42#endif
43
44static const int tcg_target_reg_alloc_order[] = {
45#if TCG_TARGET_REG_BITS == 64
46    TCG_REG_RBP,
47    TCG_REG_RBX,
48    TCG_REG_R12,
49    TCG_REG_R13,
50    TCG_REG_R14,
51    TCG_REG_R15,
52    TCG_REG_R10,
53    TCG_REG_R11,
54    TCG_REG_R9,
55    TCG_REG_R8,
56    TCG_REG_RCX,
57    TCG_REG_RDX,
58    TCG_REG_RSI,
59    TCG_REG_RDI,
60    TCG_REG_RAX,
61#else
62    TCG_REG_EBX,
63    TCG_REG_ESI,
64    TCG_REG_EDI,
65    TCG_REG_EBP,
66    TCG_REG_ECX,
67    TCG_REG_EDX,
68    TCG_REG_EAX,
69#endif
70    TCG_REG_XMM0,
71    TCG_REG_XMM1,
72    TCG_REG_XMM2,
73    TCG_REG_XMM3,
74    TCG_REG_XMM4,
75    TCG_REG_XMM5,
76#ifndef _WIN64
77    /* The Win64 ABI has xmm6-xmm15 as callee-saved, and we do not save
78       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
79    TCG_REG_XMM6,
80    TCG_REG_XMM7,
81#if TCG_TARGET_REG_BITS == 64
82    TCG_REG_XMM8,
83    TCG_REG_XMM9,
84    TCG_REG_XMM10,
85    TCG_REG_XMM11,
86    TCG_REG_XMM12,
87    TCG_REG_XMM13,
88    TCG_REG_XMM14,
89    TCG_REG_XMM15,
90#endif
91#endif
92};
93
94#define TCG_TMP_VEC  TCG_REG_XMM5
95
96static const int tcg_target_call_iarg_regs[] = {
97#if TCG_TARGET_REG_BITS == 64
98#if defined(_WIN64)
99    TCG_REG_RCX,
100    TCG_REG_RDX,
101#else
102    TCG_REG_RDI,
103    TCG_REG_RSI,
104    TCG_REG_RDX,
105    TCG_REG_RCX,
106#endif
107    TCG_REG_R8,
108    TCG_REG_R9,
109#else
110    /* 32-bit mode uses the stack-based calling convention (GCC default). */
111#endif
112};
113
114static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
115{
116    switch (kind) {
117    case TCG_CALL_RET_NORMAL:
118        tcg_debug_assert(slot >= 0 && slot <= 1);
119        return slot ? TCG_REG_EDX : TCG_REG_EAX;
120#ifdef _WIN64
121    case TCG_CALL_RET_BY_VEC:
122        tcg_debug_assert(slot == 0);
123        return TCG_REG_XMM0;
124#endif
125    default:
126        g_assert_not_reached();
127    }
128}
129
130/* Constants we accept.  */
131#define TCG_CT_CONST_S32 0x100
132#define TCG_CT_CONST_U32 0x200
133#define TCG_CT_CONST_I32 0x400
134#define TCG_CT_CONST_WSZ 0x800
135#define TCG_CT_CONST_TST 0x1000
136
137/* Registers used with the L constraint, which are the first argument
138   registers on x86_64, and two arbitrary call-clobbered registers on
139   i386. */
140#if TCG_TARGET_REG_BITS == 64
141# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
142# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
143#else
144# define TCG_REG_L0 TCG_REG_EAX
145# define TCG_REG_L1 TCG_REG_EDX
146#endif
147
148#if TCG_TARGET_REG_BITS == 64
149# define ALL_GENERAL_REGS      0x0000ffffu
150# define ALL_VECTOR_REGS       0xffff0000u
151# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
152#else
153# define ALL_GENERAL_REGS      0x000000ffu
154# define ALL_VECTOR_REGS       0x00ff0000u
155# define ALL_BYTEL_REGS        0x0000000fu
156#endif
157#define SOFTMMU_RESERVE_REGS \
158    (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)
159
160#define have_bmi2       (cpuinfo & CPUINFO_BMI2)
161#define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)
162
163static const tcg_insn_unit *tb_ret_addr;
164
165static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
166                        intptr_t value, intptr_t addend)
167{
168    value += addend;
169    switch(type) {
170    case R_386_PC32:
171        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
172        if (value != (int32_t)value) {
173            return false;
174        }
175        /* FALLTHRU */
176    case R_386_32:
177        tcg_patch32(code_ptr, value);
178        break;
179    case R_386_PC8:
180        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
181        if (value != (int8_t)value) {
182            return false;
183        }
184        tcg_patch8(code_ptr, value);
185        break;
186    default:
187        g_assert_not_reached();
188    }
189    return true;
190}
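/*
 * Illustrative note (not part of the original source): callers such as
 * tcg_out_jxx pass addend -4 with R_386_PC32 because the 32-bit
 * displacement field occupies the last 4 bytes of the branch and x86
 * displacements are relative to the end of the instruction.  E.g. if the
 * field sits at 0x1000 and the label resolves to 0x1080, patch_reloc
 * computes 0x1080 + (-4) - 0x1000 = 0x7c and stores it with tcg_patch32.
 */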
191
192/* Test whether a constant matches the constraint. */
193static bool tcg_target_const_match(int64_t val, int ct,
194                                   TCGType type, TCGCond cond, int vece)
195{
196    if (ct & TCG_CT_CONST) {
197        return 1;
198    }
199    if (type == TCG_TYPE_I32) {
200        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 |
201                  TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) {
202            return 1;
203        }
204    } else {
205        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
206            return 1;
207        }
208        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
209            return 1;
210        }
211        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
212            return 1;
213        }
214        /*
215         * This will be used in combination with TCG_CT_CONST_S32,
216         * so "normal" TESTQ is already matched.  Also accept:
217         *    TESTQ -> TESTL   (uint32_t)
218         *    TESTQ -> BT      (is_power_of_2)
219         */
220        if ((ct & TCG_CT_CONST_TST)
221            && is_tst_cond(cond)
222            && (val == (uint32_t)val || is_power_of_2(val))) {
223            return 1;
224        }
225    }
226    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
227        return 1;
228    }
229    return 0;
230}
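/*
 * Worked example (illustration only): for a 64-bit operand, the constant
 * 0xffffffff satisfies TCG_CT_CONST_U32 but not TCG_CT_CONST_S32 (its low
 * 32 bits sign-extend to -1), while -1 satisfies TCG_CT_CONST_S32 and
 * TCG_CT_CONST_I32 but not TCG_CT_CONST_U32.
 */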
231
232# define LOWREGMASK(x)	((x) & 7)
233
234#define P_EXT		0x100		/* 0x0f opcode prefix */
235#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
236#define P_DATA16        0x400           /* 0x66 opcode prefix */
237#define P_VEXW          0x1000          /* Set VEX.W = 1 */
238#if TCG_TARGET_REG_BITS == 64
239# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
240# define P_REXB_R       0x2000          /* REG field as byte register */
241# define P_REXB_RM      0x4000          /* R/M field as byte register */
242# define P_GS           0x8000          /* gs segment override */
243#else
244# define P_REXW		0
245# define P_REXB_R	0
246# define P_REXB_RM	0
247# define P_GS           0
248#endif
249#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
250#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
251#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
252#define P_VEXL          0x80000         /* Set VEX.L = 1 */
253#define P_EVEX          0x100000        /* Requires EVEX encoding */
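/*
 * For illustration: each OPC_* constant below is the primary opcode byte
 * OR'ed with the P_* flags above.  E.g. OPC_MOVZBL = 0xb6 | P_EXT is
 * emitted as "0f b6 /r" (movzbl); adding P_DATA16 prepends a 0x66 prefix
 * and P_REXW sets REX.W when the bytes are produced by tcg_out_opc.
 */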
254
255#define OPC_ARITH_EbIb	(0x80)
256#define OPC_ARITH_EvIz	(0x81)
257#define OPC_ARITH_EvIb	(0x83)
258#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
259#define OPC_ANDN        (0xf2 | P_EXT38)
260#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
261#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
262#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
263#define OPC_BSF         (0xbc | P_EXT)
264#define OPC_BSR         (0xbd | P_EXT)
265#define OPC_BSWAP	(0xc8 | P_EXT)
266#define OPC_CALL_Jz	(0xe8)
267#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
268#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
269#define OPC_DEC_r32	(0x48)
270#define OPC_IMUL_GvEv	(0xaf | P_EXT)
271#define OPC_IMUL_GvEvIb	(0x6b)
272#define OPC_IMUL_GvEvIz	(0x69)
273#define OPC_INC_r32	(0x40)
274#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
275#define OPC_JCC_short	(0x70)		/* ... plus condition code */
276#define OPC_JMP_long	(0xe9)
277#define OPC_JMP_short	(0xeb)
278#define OPC_LEA         (0x8d)
279#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
280#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
281#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
282#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
283#define OPC_MOVB_EvIz   (0xc6)
284#define OPC_MOVL_EvIz	(0xc7)
285#define OPC_MOVB_Ib     (0xb0)
286#define OPC_MOVL_Iv     (0xb8)
287#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
288#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
289#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
290#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
291#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
292#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
293#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
294#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
295#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
296#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
297#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
298#define OPC_MOVSBL	(0xbe | P_EXT)
299#define OPC_MOVSWL	(0xbf | P_EXT)
300#define OPC_MOVSLQ	(0x63 | P_REXW)
301#define OPC_MOVZBL	(0xb6 | P_EXT)
302#define OPC_MOVZWL	(0xb7 | P_EXT)
303#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
304#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
305#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
306#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
307#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
308#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
309#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
310#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
311#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
312#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
313#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
314#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
315#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
316#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
317#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
318#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
319#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
320#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
321#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
322#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
323#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
324#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
325#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
326#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
327#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
328#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
329#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
330#define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
331#define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
332#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
333#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
334#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
335#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
336#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
337#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
338#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
339#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
340#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
341#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
342#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
343#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
344#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
345#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
346#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
347#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
348#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
349#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
350#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
351#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
352#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
353#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
354#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
355#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
356#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
357#define OPC_POR         (0xeb | P_EXT | P_DATA16)
358#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
359#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
360#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
361#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
362#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
363#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
364#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
365#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
366#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
367#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
368#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
369#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
370#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
371#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
372#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
373#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
374#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
375#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
376#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
377#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
378#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
379#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
380#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
381#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
382#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
383#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
384#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
385#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
386#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
387#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
388#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
389#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
390#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
391#define OPC_POP_r32	(0x58)
392#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
393#define OPC_PUSH_r32	(0x50)
394#define OPC_PUSH_Iv	(0x68)
395#define OPC_PUSH_Ib	(0x6a)
396#define OPC_RET		(0xc3)
397#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
398#define OPC_SHIFT_1	(0xd1)
399#define OPC_SHIFT_Ib	(0xc1)
400#define OPC_SHIFT_cl	(0xd3)
401#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
402#define OPC_SHUFPS      (0xc6 | P_EXT)
403#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
404#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
405#define OPC_SHRD_Ib     (0xac | P_EXT)
406#define OPC_TESTB	(0x84)
407#define OPC_TESTL	(0x85)
408#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
409#define OPC_UD2         (0x0b | P_EXT)
410#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
411#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
412#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
413#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
414#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
415#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
416#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
417#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
418#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
419#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
420#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
421#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
422#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
423#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
424#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
425#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
426#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
427#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
428#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
429#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
430#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
431#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
432#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
433#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
434#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
435#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
436#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
437#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
438#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
439#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
440#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
441#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
442#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
443#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
444#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
445#define OPC_VZEROUPPER  (0x77 | P_EXT)
446#define OPC_XCHG_ax_r32	(0x90)
447#define OPC_XCHG_EvGv   (0x87)
448
449#define OPC_GRP3_Eb     (0xf6)
450#define OPC_GRP3_Ev     (0xf7)
451#define OPC_GRP5        (0xff)
452#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
453#define OPC_GRPBT       (0xba | P_EXT)
454
455#define OPC_GRPBT_BT    4
456#define OPC_GRPBT_BTS   5
457#define OPC_GRPBT_BTR   6
458#define OPC_GRPBT_BTC   7
459
460/* Group 1 opcode extensions for 0x80-0x83.
461   These are also used as modifiers for OPC_ARITH.  */
462#define ARITH_ADD 0
463#define ARITH_OR  1
464#define ARITH_ADC 2
465#define ARITH_SBB 3
466#define ARITH_AND 4
467#define ARITH_SUB 5
468#define ARITH_XOR 6
469#define ARITH_CMP 7
470
471/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
472#define SHIFT_ROL 0
473#define SHIFT_ROR 1
474#define SHIFT_SHL 4
475#define SHIFT_SHR 5
476#define SHIFT_SAR 7
477
478/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
479#define EXT3_TESTi 0
480#define EXT3_NOT   2
481#define EXT3_NEG   3
482#define EXT3_MUL   4
483#define EXT3_IMUL  5
484#define EXT3_DIV   6
485#define EXT3_IDIV  7
486
487/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
488#define EXT5_INC_Ev	0
489#define EXT5_DEC_Ev	1
490#define EXT5_CALLN_Ev	2
491#define EXT5_JMPN_Ev	4
492
493/* Condition codes to be added to OPC_JCC_{long,short}.  */
494#define JCC_JMP (-1)
495#define JCC_JO  0x0
496#define JCC_JNO 0x1
497#define JCC_JB  0x2
498#define JCC_JAE 0x3
499#define JCC_JE  0x4
500#define JCC_JNE 0x5
501#define JCC_JBE 0x6
502#define JCC_JA  0x7
503#define JCC_JS  0x8
504#define JCC_JNS 0x9
505#define JCC_JP  0xa
506#define JCC_JNP 0xb
507#define JCC_JL  0xc
508#define JCC_JGE 0xd
509#define JCC_JLE 0xe
510#define JCC_JG  0xf
511
512static const uint8_t tcg_cond_to_jcc[] = {
513    [TCG_COND_EQ] = JCC_JE,
514    [TCG_COND_NE] = JCC_JNE,
515    [TCG_COND_LT] = JCC_JL,
516    [TCG_COND_GE] = JCC_JGE,
517    [TCG_COND_LE] = JCC_JLE,
518    [TCG_COND_GT] = JCC_JG,
519    [TCG_COND_LTU] = JCC_JB,
520    [TCG_COND_GEU] = JCC_JAE,
521    [TCG_COND_LEU] = JCC_JBE,
522    [TCG_COND_GTU] = JCC_JA,
523    [TCG_COND_TSTEQ] = JCC_JE,
524    [TCG_COND_TSTNE] = JCC_JNE,
525};
526
527#if TCG_TARGET_REG_BITS == 64
528static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
529{
530    int rex;
531
532    if (opc & P_GS) {
533        tcg_out8(s, 0x65);
534    }
535    if (opc & P_DATA16) {
536        /* We should never be asking for both 16 and 64-bit operation.  */
537        tcg_debug_assert((opc & P_REXW) == 0);
538        tcg_out8(s, 0x66);
539    }
540    if (opc & P_SIMDF3) {
541        tcg_out8(s, 0xf3);
542    } else if (opc & P_SIMDF2) {
543        tcg_out8(s, 0xf2);
544    }
545
546    rex = 0;
547    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
548    rex |= (r & 8) >> 1;                /* REX.R */
549    rex |= (x & 8) >> 2;                /* REX.X */
550    rex |= (rm & 8) >> 3;               /* REX.B */
551
552    /* P_REXB_{R,RM} indicates that the given register is the low byte.
553       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
554       as otherwise the encoding indicates %[abcd]h.  Note that the values
555       that are ORed in merely indicate that the REX byte must be present;
556       those bits get discarded in output.  */
557    rex |= opc & (r >= 4 ? P_REXB_R : 0);
558    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
559
560    if (rex) {
561        tcg_out8(s, (uint8_t)(rex | 0x40));
562    }
563
564    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
565        tcg_out8(s, 0x0f);
566        if (opc & P_EXT38) {
567            tcg_out8(s, 0x38);
568        } else if (opc & P_EXT3A) {
569            tcg_out8(s, 0x3a);
570        }
571    }
572
573    tcg_out8(s, opc);
574}
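/*
 * Worked example (illustration only): tcg_out_modrm(s,
 * OPC_MOVL_GvEv + P_REXW, TCG_REG_R8, TCG_REG_RDI) reaches this function
 * with r = 8, rm = 7, x = 0.  The REX byte becomes 0x40 | W(0x8) | R(0x4)
 * = 0x4c, and the ModRM byte appended by tcg_out_modrm is
 * 0xc0 | (0 << 3) | 7 = 0xc7, giving "4c 8b c7", i.e. movq %rdi, %r8.
 */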
575#else
576static void tcg_out_opc(TCGContext *s, int opc)
577{
578    if (opc & P_DATA16) {
579        tcg_out8(s, 0x66);
580    }
581    if (opc & P_SIMDF3) {
582        tcg_out8(s, 0xf3);
583    } else if (opc & P_SIMDF2) {
584        tcg_out8(s, 0xf2);
585    }
586    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
587        tcg_out8(s, 0x0f);
588        if (opc & P_EXT38) {
589            tcg_out8(s, 0x38);
590        } else if (opc & P_EXT3A) {
591            tcg_out8(s, 0x3a);
592        }
593    }
594    tcg_out8(s, opc);
595}
596/* Discard the register arguments to tcg_out_opc early, so as not to penalize
597   the 32-bit compilation paths.  This method works with all versions of gcc,
598   whereas relying on optimization may not be able to exclude them.  */
599#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
600#endif
601
602static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
603{
604    tcg_out_opc(s, opc, r, rm, 0);
605    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
606}
607
608static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
609                            int rm, int index)
610{
611    int tmp;
612
613    if (opc & P_GS) {
614        tcg_out8(s, 0x65);
615    }
616    /* Use the two byte form if possible, which cannot encode
617       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
618    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
619        && ((rm | index) & 8) == 0) {
620        /* Two byte VEX prefix.  */
621        tcg_out8(s, 0xc5);
622
623        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
624    } else {
625        /* Three byte VEX prefix.  */
626        tcg_out8(s, 0xc4);
627
628        /* VEX.m-mmmm */
629        if (opc & P_EXT3A) {
630            tmp = 3;
631        } else if (opc & P_EXT38) {
632            tmp = 2;
633        } else if (opc & P_EXT) {
634            tmp = 1;
635        } else {
636            g_assert_not_reached();
637        }
638        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
639        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
640        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
641        tcg_out8(s, tmp);
642
643        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
644    }
645
646    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
647    /* VEX.pp */
648    if (opc & P_DATA16) {
649        tmp |= 1;                          /* 0x66 */
650    } else if (opc & P_SIMDF3) {
651        tmp |= 2;                          /* 0xf3 */
652    } else if (opc & P_SIMDF2) {
653        tmp |= 3;                          /* 0xf2 */
654    }
655    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
656    tcg_out8(s, tmp);
657    tcg_out8(s, opc);
658}
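/*
 * Worked example (illustration only): tcg_out_vex_modrm(s, OPC_PXOR,
 * TCG_REG_XMM0, TCG_REG_XMM0, TCG_REG_XMM0) takes the two-byte form:
 * 0xc5, then 0xf9 (inverted R bit set, inverted vvvv = 1111 selecting
 * %xmm0, L = 0, pp = 01 for the 0x66 prefix), then the opcode 0xef and
 * ModRM 0xc0, for "c5 f9 ef c0", i.e. vpxor %xmm0, %xmm0, %xmm0.
 */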
659
660static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
661                             int rm, int index)
662{
663    /* The entire 4-byte EVEX prefix, with R' and V' set. */
664    uint32_t p = 0x08041062;
665    int mm, pp;
666
667    tcg_debug_assert(have_avx512vl);
668
669    /* EVEX.mm */
670    if (opc & P_EXT3A) {
671        mm = 3;
672    } else if (opc & P_EXT38) {
673        mm = 2;
674    } else if (opc & P_EXT) {
675        mm = 1;
676    } else {
677        g_assert_not_reached();
678    }
679
680    /* EVEX.pp */
681    if (opc & P_DATA16) {
682        pp = 1;                          /* 0x66 */
683    } else if (opc & P_SIMDF3) {
684        pp = 2;                          /* 0xf3 */
685    } else if (opc & P_SIMDF2) {
686        pp = 3;                          /* 0xf2 */
687    } else {
688        pp = 0;
689    }
690
691    p = deposit32(p, 8, 2, mm);
692    p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
693    p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
694    p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
695    p = deposit32(p, 16, 2, pp);
696    p = deposit32(p, 19, 4, ~v);
697    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
698    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
699
700    tcg_out32(s, p);
701    tcg_out8(s, opc);
702}
703
704static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
705{
706    if (opc & P_EVEX) {
707        tcg_out_evex_opc(s, opc, r, v, rm, 0);
708    } else {
709        tcg_out_vex_opc(s, opc, r, v, rm, 0);
710    }
711    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
712}
713
714static void tcg_out_vex_modrm_type(TCGContext *s, int opc,
715                                   int r, int v, int rm, TCGType type)
716{
717    if (type == TCG_TYPE_V256) {
718        opc |= P_VEXL;
719    }
720    tcg_out_vex_modrm(s, opc, r, v, rm);
721}
722
723/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
724   Either RM or INDEX may be omitted by passing a negative value.  In 64-bit
725   mode for absolute addresses, ~RM is the size of the immediate operand
726   that will follow the instruction.  */
727
728static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
729                               int shift, intptr_t offset)
730{
731    int mod, len;
732
733    if (index < 0 && rm < 0) {
734        if (TCG_TARGET_REG_BITS == 64) {
735            /* Try for a rip-relative addressing mode.  This has replaced
736               the 32-bit-mode absolute addressing encoding.  */
737            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
738            intptr_t disp = offset - pc;
739            if (disp == (int32_t)disp) {
740                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
741                tcg_out32(s, disp);
742                return;
743            }
744
745            /* Try for an absolute address encoding.  This requires the
746               use of the MODRM+SIB encoding and is therefore larger than
747               rip-relative addressing.  */
748            if (offset == (int32_t)offset) {
749                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
750                tcg_out8(s, (4 << 3) | 5);
751                tcg_out32(s, offset);
752                return;
753            }
754
755            /* ??? The memory isn't directly addressable.  */
756            g_assert_not_reached();
757        } else {
758            /* Absolute address.  */
759            tcg_out8(s, (r << 3) | 5);
760            tcg_out32(s, offset);
761            return;
762        }
763    }
764
765    /* Find the length of the immediate addend.  Note that the encoding
766       that would be used for (%ebp) indicates absolute addressing.  */
767    if (rm < 0) {
768        mod = 0, len = 4, rm = 5;
769    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
770        mod = 0, len = 0;
771    } else if (offset == (int8_t)offset) {
772        mod = 0x40, len = 1;
773    } else {
774        mod = 0x80, len = 4;
775    }
776
777    /* Use a single byte MODRM format if possible.  Note that the encoding
778       that would be used for %esp is the escape to the two byte form.  */
779    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
780        /* Single byte MODRM format.  */
781        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
782    } else {
783        /* Two byte MODRM+SIB format.  */
784
785        /* Note that the encoding that would place %esp into the index
786           field indicates no index register.  In 64-bit mode, the REX.X
787           bit counts, so %r12 can be used as the index.  */
788        if (index < 0) {
789            index = 4;
790        } else {
791            tcg_debug_assert(index != TCG_REG_ESP);
792        }
793
794        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
795        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
796    }
797
798    if (len == 1) {
799        tcg_out8(s, offset);
800    } else if (len == 4) {
801        tcg_out32(s, offset);
802    }
803}
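/*
 * Worked example (illustration only): with r = TCG_REG_EAX,
 * rm = TCG_REG_EBP, index = -1 and offset = 8, the offset fits in a byte
 * so mod = 0x40 (disp8) is chosen, and because the base is not %esp no
 * SIB byte is needed: the function emits "45 08" after the opcode, e.g.
 * "8b 45 08" = movl 8(%ebp), %eax for OPC_MOVL_GvEv.
 */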
804
805static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
806                                     int index, int shift, intptr_t offset)
807{
808    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
809    tcg_out_sib_offset(s, r, rm, index, shift, offset);
810}
811
812static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
813                                         int rm, int index, int shift,
814                                         intptr_t offset)
815{
816    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
817    tcg_out_sib_offset(s, r, rm, index, shift, offset);
818}
819
820/* A simplification of the above with no index or shift.  */
821static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
822                                        int rm, intptr_t offset)
823{
824    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
825}
826
827static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
828                                            int v, int rm, intptr_t offset)
829{
830    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
831}
832
833/* Output an opcode with an expected reference to the constant pool.  */
834static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
835{
836    tcg_out_opc(s, opc, r, 0, 0);
837    /* Absolute for 32-bit, pc-relative for 64-bit.  */
838    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
839    tcg_out32(s, 0);
840}
841
842/* Output an opcode with an expected reference to the constant pool.  */
843static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
844{
845    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
846    /* Absolute for 32-bit, pc-relative for 64-bit.  */
847    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
848    tcg_out32(s, 0);
849}
850
851/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
852static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
853{
854    /* Propagate an opcode prefix, such as P_REXW.  */
855    int ext = subop & ~0x7;
856    subop &= 0x7;
857
858    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
859}
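/*
 * For illustration: tgen_arithr(s, ARITH_CMP + P_REXW, dest, src) forms
 * opcode 0x03 + (ARITH_CMP << 3) = 0x3b with REX.W, i.e. the 64-bit
 * register/register compare of dest against src.
 */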
860
861static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
862{
863    int rexw = 0;
864
865    if (arg == ret) {
866        return true;
867    }
868    switch (type) {
869    case TCG_TYPE_I64:
870        rexw = P_REXW;
871        /* fallthru */
872    case TCG_TYPE_I32:
873        if (ret < 16) {
874            if (arg < 16) {
875                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
876            } else {
877                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
878            }
879        } else {
880            if (arg < 16) {
881                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
882            } else {
883                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
884            }
885        }
886        break;
887
888    case TCG_TYPE_V64:
889        tcg_debug_assert(ret >= 16 && arg >= 16);
890        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
891        break;
892    case TCG_TYPE_V128:
893        tcg_debug_assert(ret >= 16 && arg >= 16);
894        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
895        break;
896    case TCG_TYPE_V256:
897        tcg_debug_assert(ret >= 16 && arg >= 16);
898        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
899        break;
900
901    default:
902        g_assert_not_reached();
903    }
904    return true;
905}
906
907static const int avx2_dup_insn[4] = {
908    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
909    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
910};
911
912static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
913                            TCGReg r, TCGReg a)
914{
915    if (have_avx2) {
916        tcg_out_vex_modrm_type(s, avx2_dup_insn[vece], r, 0, a, type);
917    } else {
918        switch (vece) {
919        case MO_8:
920            /* ??? With zero in a register, use PSHUFB.  */
921            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
922            a = r;
923            /* FALLTHRU */
924        case MO_16:
925            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
926            a = r;
927            /* FALLTHRU */
928        case MO_32:
929            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
930            /* imm8 operand: all output lanes selected from input lane 0.  */
931            tcg_out8(s, 0);
932            break;
933        case MO_64:
934            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
935            break;
936        default:
937            g_assert_not_reached();
938        }
939    }
940    return true;
941}
942
943static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
944                             TCGReg r, TCGReg base, intptr_t offset)
945{
946    if (have_avx2) {
947        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
948        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
949                                 r, 0, base, offset);
950    } else {
951        switch (vece) {
952        case MO_64:
953            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
954            break;
955        case MO_32:
956            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
957            break;
958        case MO_16:
959            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
960            tcg_out8(s, 0); /* imm8 */
961            tcg_out_dup_vec(s, type, vece, r, r);
962            break;
963        case MO_8:
964            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
965            tcg_out8(s, 0); /* imm8 */
966            tcg_out_dup_vec(s, type, vece, r, r);
967            break;
968        default:
969            g_assert_not_reached();
970        }
971    }
972    return true;
973}
974
975static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
976                             TCGReg ret, int64_t arg)
977{
978    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
979
980    if (arg == 0) {
981        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
982        return;
983    }
984    if (arg == -1) {
985        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
986        return;
987    }
988
989    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
990        if (have_avx2) {
991            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
992        } else {
993            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
994        }
995        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
996    } else {
997        if (type == TCG_TYPE_V64) {
998            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
999        } else if (have_avx2) {
1000            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
1001        } else {
1002            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
1003        }
1004        if (TCG_TARGET_REG_BITS == 64) {
1005            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1006        } else {
1007            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
1008        }
1009    }
1010}
1011
1012static void tcg_out_movi_vec(TCGContext *s, TCGType type,
1013                             TCGReg ret, tcg_target_long arg)
1014{
1015    if (arg == 0) {
1016        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
1017        return;
1018    }
1019    if (arg == -1) {
1020        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
1021        return;
1022    }
1023
1024    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
1025    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1026    if (TCG_TARGET_REG_BITS == 64) {
1027        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1028    } else {
1029        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1030    }
1031}
1032
1033static void tcg_out_movi_int(TCGContext *s, TCGType type,
1034                             TCGReg ret, tcg_target_long arg)
1035{
1036    tcg_target_long diff;
1037
1038    if (arg == 0) {
1039        tgen_arithr(s, ARITH_XOR, ret, ret);
1040        return;
1041    }
1042    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1043        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1044        tcg_out32(s, arg);
1045        return;
1046    }
1047    if (arg == (int32_t)arg) {
1048        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1049        tcg_out32(s, arg);
1050        return;
1051    }
1052
1053    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1054    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1055    if (diff == (int32_t)diff) {
1056        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1057        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1058        tcg_out32(s, diff);
1059        return;
1060    }
1061
1062    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1063    tcg_out64(s, arg);
1064}
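/*
 * Summary of the cases above, for illustration: zero uses xor; a value
 * with no high bits set uses the 5-byte "movl $imm, %reg" (e.g.
 * 0x12345678 into %rax is "b8 78 56 34 12", implicitly zero-extended);
 * a sign-extended 32-bit value uses the 7-byte REX.W c7 /0 form;
 * otherwise a pc-relative lea is tried before the 10-byte movabs.
 */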
1065
1066static void tcg_out_movi(TCGContext *s, TCGType type,
1067                         TCGReg ret, tcg_target_long arg)
1068{
1069    switch (type) {
1070    case TCG_TYPE_I32:
1071#if TCG_TARGET_REG_BITS == 64
1072    case TCG_TYPE_I64:
1073#endif
1074        if (ret < 16) {
1075            tcg_out_movi_int(s, type, ret, arg);
1076        } else {
1077            tcg_out_movi_vec(s, type, ret, arg);
1078        }
1079        break;
1080    default:
1081        g_assert_not_reached();
1082    }
1083}
1084
1085static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1086{
1087    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1088    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1089    return true;
1090}
1091
1092static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1093                             tcg_target_long imm)
1094{
1095    /* This function is only used for passing structs by reference. */
1096    tcg_debug_assert(imm == (int32_t)imm);
1097    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1098}
1099
1100static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1101{
1102    if (val == (int8_t)val) {
1103        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1104        tcg_out8(s, val);
1105    } else if (val == (int32_t)val) {
1106        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1107        tcg_out32(s, val);
1108    } else {
1109        g_assert_not_reached();
1110    }
1111}
1112
1113static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1114{
1115    /* Given the strength of x86 memory ordering, we need only care about
1116       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1117       faster than "mfence", so don't bother with the SSE2 insn.  */
1118    if (a0 & TCG_MO_ST_LD) {
1119        tcg_out8(s, 0xf0);
1120        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1121        tcg_out8(s, 0);
1122    }
1123}
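/*
 * For illustration, the bytes emitted for TCG_MO_ST_LD are
 * "f0 83 0c 24 00", i.e. lock orl $0, (%esp) (or (%rsp) on x86_64),
 * a locked read-modify-write that acts as a full memory barrier.
 */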
1124
1125static inline void tcg_out_push(TCGContext *s, int reg)
1126{
1127    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1128}
1129
1130static inline void tcg_out_pop(TCGContext *s, int reg)
1131{
1132    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1133}
1134
1135static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1136                       TCGReg arg1, intptr_t arg2)
1137{
1138    switch (type) {
1139    case TCG_TYPE_I32:
1140        if (ret < 16) {
1141            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1142        } else {
1143            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1144        }
1145        break;
1146    case TCG_TYPE_I64:
1147        if (ret < 16) {
1148            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1149            break;
1150        }
1151        /* FALLTHRU */
1152    case TCG_TYPE_V64:
1153        /* There is no instruction that can validate 8-byte alignment.  */
1154        tcg_debug_assert(ret >= 16);
1155        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1156        break;
1157    case TCG_TYPE_V128:
1158        /*
1159         * The gvec infrastructure asserts that v128 vector loads
1160         * and stores use a 16-byte aligned offset.  Validate that the
1161         * final pointer is aligned by using an insn that will SIGSEGV.
1162         */
1163        tcg_debug_assert(ret >= 16);
1164        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1165        break;
1166    case TCG_TYPE_V256:
1167        /*
1168         * The gvec infrastructure only requires 16-byte alignment,
1169         * so here we must use an unaligned load.
1170         */
1171        tcg_debug_assert(ret >= 16);
1172        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1173                                 ret, 0, arg1, arg2);
1174        break;
1175    default:
1176        g_assert_not_reached();
1177    }
1178}
1179
1180static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1181                       TCGReg arg1, intptr_t arg2)
1182{
1183    switch (type) {
1184    case TCG_TYPE_I32:
1185        if (arg < 16) {
1186            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1187        } else {
1188            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1189        }
1190        break;
1191    case TCG_TYPE_I64:
1192        if (arg < 16) {
1193            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1194            break;
1195        }
1196        /* FALLTHRU */
1197    case TCG_TYPE_V64:
1198        /* There is no instruction that can validate 8-byte alignment.  */
1199        tcg_debug_assert(arg >= 16);
1200        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1201        break;
1202    case TCG_TYPE_V128:
1203        /*
1204         * The gvec infrastructure asserts that v128 vector loads
1205         * and stores use a 16-byte aligned offset.  Validate that the
1206         * final pointer is aligned by using an insn that will SIGSEGV.
1207         *
1208         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1209         * for _WIN64, which must have SSE2 but may not have AVX.
1210         */
1211        tcg_debug_assert(arg >= 16);
1212        if (have_avx1) {
1213            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1214        } else {
1215            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1216        }
1217        break;
1218    case TCG_TYPE_V256:
1219        /*
1220         * The gvec infrastructure only requires 16-byte alignment,
1221         * so here we must use an unaligned store.
1222         */
1223        tcg_debug_assert(arg >= 16);
1224        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1225                                 arg, 0, arg1, arg2);
1226        break;
1227    default:
1228        g_assert_not_reached();
1229    }
1230}
1231
1232static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1233                        TCGReg base, intptr_t ofs)
1234{
1235    int rexw = 0;
1236    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1237        if (val != (int32_t)val) {
1238            return false;
1239        }
1240        rexw = P_REXW;
1241    } else if (type != TCG_TYPE_I32) {
1242        return false;
1243    }
1244    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1245    tcg_out32(s, val);
1246    return true;
1247}
1248
1249static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1250{
1251    /* Propagate an opcode prefix, such as P_DATA16.  */
1252    int ext = subopc & ~0x7;
1253    subopc &= 0x7;
1254
1255    if (count == 1) {
1256        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1257    } else {
1258        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1259        tcg_out8(s, count);
1260    }
1261}
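/*
 * For illustration: tcg_out_shifti(s, SHIFT_SHL + P_REXW, TCG_REG_RAX, 3)
 * emits "48 c1 e0 03" (shlq $3, %rax); a count of 1 would instead use the
 * shorter 0xd1 form, "48 d1 e0".
 */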
1262
1263static inline void tcg_out_bswap32(TCGContext *s, int reg)
1264{
1265    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1266}
1267
1268static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1269{
1270    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1271}
1272
1273static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1274{
1275    /* movzbl */
1276    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1277    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1278}
1279
1280static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1281{
1282    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1283    /* movsbl */
1284    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1285    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1286}
1287
1288static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1289{
1290    /* movzwl */
1291    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1292}
1293
1294static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1295{
1296    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1297    /* movsw[lq] */
1298    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1299}
1300
1301static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1302{
1303    /* 32-bit mov zero extends.  */
1304    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1305}
1306
1307static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1308{
1309    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1310    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1311}
1312
1313static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1314{
1315    tcg_out_ext32s(s, dest, src);
1316}
1317
1318static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1319{
1320    if (dest != src) {
1321        tcg_out_ext32u(s, dest, src);
1322    }
1323}
1324
1325static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1326{
1327    tcg_out_ext32u(s, dest, src);
1328}
1329
1330static inline void tcg_out_bswap64(TCGContext *s, int reg)
1331{
1332    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1333}
1334
1335static void tgen_arithi(TCGContext *s, int c, int r0,
1336                        tcg_target_long val, int cf)
1337{
1338    int rexw = 0;
1339
1340    if (TCG_TARGET_REG_BITS == 64) {
1341        rexw = c & -8;
1342        c &= 7;
1343    }
1344
1345    switch (c) {
1346    case ARITH_ADD:
1347    case ARITH_SUB:
1348        if (!cf) {
1349            /*
1350             * ??? While INC/DEC are 2 bytes shorter than ADDL/SUBL $1, they
1351             * also induce partial-flags-update stalls on Pentium 4 and are
1352             * not recommended by current Intel optimization manuals.
1353             */
1354            if (val == 1 || val == -1) {
1355                int is_inc = (c == ARITH_ADD) ^ (val < 0);
1356                if (TCG_TARGET_REG_BITS == 64) {
1357                    /*
1358                     * The single-byte increment encodings are re-tasked
1359                     * as the REX prefixes.  Use the MODRM encoding.
1360                     */
1361                    tcg_out_modrm(s, OPC_GRP5 + rexw,
1362                                  (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1363                } else {
1364                    tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1365                }
1366                return;
1367            }
1368            if (val == 128) {
1369                /*
1370                 * Facilitate using an 8-bit immediate.  Carry is inverted
1371                 * by this transformation, so do it only if cf == 0.
1372                 */
1373                c ^= ARITH_ADD ^ ARITH_SUB;
1374                val = -128;
1375            }
1376        }
1377        break;
1378
1379    case ARITH_AND:
1380        if (TCG_TARGET_REG_BITS == 64) {
1381            if (val == 0xffffffffu) {
1382                tcg_out_ext32u(s, r0, r0);
1383                return;
1384            }
1385            if (val == (uint32_t)val) {
1386                /* AND with no high bits set can use a 32-bit operation.  */
1387                rexw = 0;
1388            }
1389        }
1390        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1391            tcg_out_ext8u(s, r0, r0);
1392            return;
1393        }
1394        if (val == 0xffffu) {
1395            tcg_out_ext16u(s, r0, r0);
1396            return;
1397        }
1398        break;
1399
1400    case ARITH_OR:
1401    case ARITH_XOR:
1402        if (val >= 0x80 && val <= 0xff
1403            && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1404            tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
1405            tcg_out8(s, val);
1406            return;
1407        }
1408        break;
1409    }
1410
1411    if (val == (int8_t)val) {
1412        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1413        tcg_out8(s, val);
1414        return;
1415    }
1416    if (rexw == 0 || val == (int32_t)val) {
1417        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1418        tcg_out32(s, val);
1419        return;
1420    }
1421
1422    g_assert_not_reached();
1423}
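/*
 * For illustration: tgen_arithi(s, ARITH_ADD + P_REXW, TCG_REG_RSP, -16, 0)
 * fits the 8-bit immediate form and emits "48 83 c4 f0" (addq $-16, %rsp);
 * adding 128 with cf == 0 is rewritten above as subtracting -128 so that
 * the short form still applies.
 */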
1424
1425static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1426{
1427    if (val != 0) {
1428        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1429    }
1430}
1431
1432/* Set SMALL to force a short forward branch.  */
1433static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1434{
1435    int32_t val, val1;
1436
1437    if (l->has_value) {
1438        val = tcg_pcrel_diff(s, l->u.value_ptr);
1439        val1 = val - 2;
1440        if ((int8_t)val1 == val1) {
1441            if (opc == -1) {
1442                tcg_out8(s, OPC_JMP_short);
1443            } else {
1444                tcg_out8(s, OPC_JCC_short + opc);
1445            }
1446            tcg_out8(s, val1);
1447        } else {
1448            tcg_debug_assert(!small);
1449            if (opc == -1) {
1450                tcg_out8(s, OPC_JMP_long);
1451                tcg_out32(s, val - 5);
1452            } else {
1453                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1454                tcg_out32(s, val - 6);
1455            }
1456        }
1457    } else if (small) {
1458        if (opc == -1) {
1459            tcg_out8(s, OPC_JMP_short);
1460        } else {
1461            tcg_out8(s, OPC_JCC_short + opc);
1462        }
1463        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1464        s->code_ptr += 1;
1465    } else {
1466        if (opc == -1) {
1467            tcg_out8(s, OPC_JMP_long);
1468        } else {
1469            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1470        }
1471        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1472        s->code_ptr += 4;
1473    }
1474}
1475
1476static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1,
1477                       TCGArg arg2, int const_arg2, int rexw)
1478{
1479    int jz, js;
1480
1481    if (!is_tst_cond(cond)) {
1482        if (!const_arg2) {
1483            tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1484        } else if (arg2 == 0) {
1485            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1486        } else {
1487            tcg_debug_assert(!rexw || arg2 == (int32_t)arg2);
1488            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1489        }
1490        return tcg_cond_to_jcc[cond];
1491    }
1492
1493    jz = tcg_cond_to_jcc[cond];
1494    js = (cond == TCG_COND_TSTNE ? JCC_JS : JCC_JNS);
1495
1496    if (!const_arg2) {
1497        tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2);
1498        return jz;
1499    }
1500
1501    if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) {
1502        if (arg2 == 0x80) {
1503            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
1504            return js;
1505        }
1506        if (arg2 == 0xff) {
1507            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
1508            return jz;
1509        }
1510        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1);
1511        tcg_out8(s, arg2);
1512        return jz;
1513    }
1514
1515    if ((arg2 & ~0xff00) == 0 && arg1 < 4) {
1516        if (arg2 == 0x8000) {
1517            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
1518            return js;
1519        }
1520        if (arg2 == 0xff00) {
1521            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
1522            return jz;
1523        }
1524        tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4);
1525        tcg_out8(s, arg2 >> 8);
1526        return jz;
1527    }
1528
1529    if (arg2 == 0xffff) {
1530        tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1);
1531        return jz;
1532    }
1533    if (arg2 == 0xffffffffu) {
1534        tcg_out_modrm(s, OPC_TESTL, arg1, arg1);
1535        return jz;
1536    }
1537
1538    if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) {
1539        int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE);
1540        int sh = ctz64(arg2);
1541
1542        rexw = (sh & 32 ? P_REXW : 0);
1543        if ((sh & 31) == 31) {
1544            tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1);
1545            return js;
1546        } else {
1547            tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1);
1548            tcg_out8(s, sh);
1549            return jc;
1550        }
1551    }
1552
1553    if (rexw) {
1554        if (arg2 == (uint32_t)arg2) {
1555            rexw = 0;
1556        } else {
1557            tcg_debug_assert(arg2 == (int32_t)arg2);
1558        }
1559    }
1560    tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1);
1561    tcg_out32(s, arg2);
1562    return jz;
1563}
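/*
 * Worked example (illustration only): for TCG_COND_TSTNE against the
 * constant 1 << 20, the immediate is a power of two, so the code above
 * emits "bt $20, reg" (0f ba /4) and returns JCC_JB, the tested bit
 * having been copied into the carry flag.  Sign-bit tests (0x80, 0x8000,
 * 1 << 31, 1 << 63) are instead reduced to a TEST that leaves the bit in
 * SF, returning JCC_JS or JCC_JNS.
 */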
1564
1565static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
1566                           TCGArg arg1, TCGArg arg2, int const_arg2,
1567                           TCGLabel *label, bool small)
1568{
1569    int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw);
1570    tcg_out_jxx(s, jcc, label, small);
1571}
1572
1573#if TCG_TARGET_REG_BITS == 32
1574static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1575                            const int *const_args, bool small)
1576{
1577    TCGLabel *label_next = gen_new_label();
1578    TCGLabel *label_this = arg_label(args[5]);
1579    TCGCond cond = args[4];
1580
1581    switch (cond) {
1582    case TCG_COND_EQ:
1583    case TCG_COND_TSTEQ:
1584        tcg_out_brcond(s, 0, tcg_invert_cond(cond),
1585                       args[0], args[2], const_args[2], label_next, 1);
1586        tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
1587                       label_this, small);
1588        break;
1589    case TCG_COND_NE:
1590    case TCG_COND_TSTNE:
1591        tcg_out_brcond(s, 0, cond, args[0], args[2], const_args[2],
1592                       label_this, small);
1593        tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
1594                       label_this, small);
1595        break;
1596    case TCG_COND_LT:
1597        tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1598                       label_this, small);
1599        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1600        tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1601                       label_this, small);
1602        break;
1603    case TCG_COND_LE:
1604        tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1605                       label_this, small);
1606        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1607        tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1608                       label_this, small);
1609        break;
1610    case TCG_COND_GT:
1611        tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1612                       label_this, small);
1613        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1614        tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1615                       label_this, small);
1616        break;
1617    case TCG_COND_GE:
1618        tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1619                       label_this, small);
1620        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1621        tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1622                       label_this, small);
1623        break;
1624    case TCG_COND_LTU:
1625        tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1626                       label_this, small);
1627        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1628        tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1629                       label_this, small);
1630        break;
1631    case TCG_COND_LEU:
1632        tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1633                       label_this, small);
1634        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1635        tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1636                       label_this, small);
1637        break;
1638    case TCG_COND_GTU:
1639        tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1640                       label_this, small);
1641        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1642        tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1643                       label_this, small);
1644        break;
1645    case TCG_COND_GEU:
1646        tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1647                       label_this, small);
1648        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1649        tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1650                       label_this, small);
1651        break;
1652    default:
1653        g_assert_not_reached();
1654    }
1655    tcg_out_label(s, label_next);
1656}
1657#endif
1658
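/*
 * Set @dest to @cond applied to (@arg1, @arg2): 0/1 normally, or 0/-1
 * when @neg.  Several conditions are first rewritten so that the result
 * can be formed from the carry flag with SBB, or from the sign bit with
 * a shift, which is smaller than the generic SETCC + extend sequence.
 */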
1659static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
1660                            TCGArg dest, TCGArg arg1, TCGArg arg2,
1661                            int const_arg2, bool neg)
1662{
1663    int cmp_rexw = rexw;
1664    bool inv = false;
1665    bool cleared;
1666    int jcc;
1667
1668    switch (cond) {
1669    case TCG_COND_NE:
1670        inv = true;
1671        /* fall through */
1672    case TCG_COND_EQ:
1673        /* If arg2 is 0, convert to LTU/GEU vs 1. */
1674        if (const_arg2 && arg2 == 0) {
1675            arg2 = 1;
1676            goto do_ltu;
1677        }
1678        break;
1679
1680    case TCG_COND_TSTNE:
1681        inv = true;
1682        /* fall through */
1683    case TCG_COND_TSTEQ:
1684        /* If arg2 is -1, convert to LTU/GEU vs 1. */
1685        if (const_arg2 && arg2 == 0xffffffffu) {
1686            arg2 = 1;
1687            cmp_rexw = 0;
1688            goto do_ltu;
1689        }
1690        break;
1691
1692    case TCG_COND_LEU:
1693        inv = true;
1694        /* fall through */
1695    case TCG_COND_GTU:
1696        /* If arg2 is a register, swap for LTU/GEU. */
1697        if (!const_arg2) {
1698            TCGReg t = arg1;
1699            arg1 = arg2;
1700            arg2 = t;
1701            goto do_ltu;
1702        }
1703        break;
1704
1705    case TCG_COND_GEU:
1706        inv = true;
1707        /* fall through */
1708    case TCG_COND_LTU:
1709    do_ltu:
1710        /*
1711         * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
1712         * We can then use NEG or INC to produce the desired result.
1713         * This is always smaller than the SETCC expansion.
1714         */
1715        tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw);
1716
1717        /* X - X - C = -C = (C ? -1 : 0) */
1718        tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
1719        if (inv && neg) {
1720            /* ~(C ? -1 : 0) = (C ? 0 : -1) */
1721            tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1722        } else if (inv) {
1723            /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
1724            tgen_arithi(s, ARITH_ADD, dest, 1, 0);
1725        } else if (!neg) {
1726            /* -(C ? -1 : 0) = (C ? 1 : 0) */
1727            tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
1728        }
1729        return;
1730
1731    case TCG_COND_GE:
1732        inv = true;
1733        /* fall through */
1734    case TCG_COND_LT:
1735        /* If arg2 is 0, extract the sign bit. */
1736        if (const_arg2 && arg2 == 0) {
1737            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1);
1738            if (inv) {
1739                tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1740            }
1741            tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
1742                           dest, rexw ? 63 : 31);
1743            return;
1744        }
1745        break;
1746
1747    default:
1748        break;
1749    }
1750
1751    /*
1752     * If dest does not overlap the inputs, clearing it first is preferred.
1753     * The XOR breaks any false dependency for the low-byte write to dest,
1754     * and is also one byte smaller than MOVZBL.
1755     */
1756    cleared = false;
1757    if (dest != arg1 && (const_arg2 || dest != arg2)) {
1758        tgen_arithr(s, ARITH_XOR, dest, dest);
1759        cleared = true;
1760    }
1761
1762    jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw);
1763    tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest);
1764
1765    if (!cleared) {
1766        tcg_out_ext8u(s, dest, dest);
1767    }
1768    if (neg) {
1769        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
1770    }
1771}
1772
1773#if TCG_TARGET_REG_BITS == 32
1774static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1775                             const int *const_args)
1776{
1777    TCGArg new_args[6];
1778    TCGLabel *label_true, *label_over;
1779
1780    memcpy(new_args, args+1, 5*sizeof(TCGArg));
1781
1782    if (args[0] == args[1] || args[0] == args[2]
1783        || (!const_args[3] && args[0] == args[3])
1784        || (!const_args[4] && args[0] == args[4])) {
1785        /* When the destination overlaps with one of the argument
1786           registers, don't do anything tricky.  */
1787        label_true = gen_new_label();
1788        label_over = gen_new_label();
1789
1790        new_args[5] = label_arg(label_true);
1791        tcg_out_brcond2(s, new_args, const_args+1, 1);
1792
1793        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1794        tcg_out_jxx(s, JCC_JMP, label_over, 1);
1795        tcg_out_label(s, label_true);
1796
1797        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1798        tcg_out_label(s, label_over);
1799    } else {
1800        /* When the destination does not overlap one of the arguments,
1801           clear the destination first, jump if cond false, and emit an
1802           increment in the true case.  This results in smaller code.  */
1803
1804        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1805
1806        label_over = gen_new_label();
1807        new_args[4] = tcg_invert_cond(new_args[4]);
1808        new_args[5] = label_arg(label_over);
1809        tcg_out_brcond2(s, new_args, const_args+1, 1);
1810
1811        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1812        tcg_out_label(s, label_over);
1813    }
1814}
1815#endif
1816
1817static void tcg_out_cmov(TCGContext *s, int jcc, int rexw,
1818                         TCGReg dest, TCGReg v1)
1819{
1820    tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1);
1821}
1822
1823static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
1824                            TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
1825                            TCGReg v1)
1826{
1827    int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw);
1828    tcg_out_cmov(s, jcc, rexw, dest, v1);
1829}
1830
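/*
 * TZCNT sets CF when the source is zero (result == operand width), so a
 * non-constant fallback value is selected with CMOVB.  Without BMI1 we
 * use BSF, which leaves the destination undefined for a zero source and
 * sets ZF, hence CMOVE.
 */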
1831static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1832                        TCGArg arg2, bool const_a2)
1833{
1834    if (have_bmi1) {
1835        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1836        if (const_a2) {
1837            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1838        } else {
1839            tcg_debug_assert(dest != arg2);
1840            tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
1841        }
1842    } else {
1843        tcg_debug_assert(dest != arg2);
1844        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1845        tcg_out_cmov(s, JCC_JE, rexw, dest, arg2);
1846    }
1847}
1848
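/*
 * As for ctz: LZCNT sets CF for a zero source, so CMOVB selects @arg2.
 * The BSR fallback yields the index of the highest set bit rather than
 * a count, hence the XOR with 31/63, and cannot accept a constant
 * default value at all.
 */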
1849static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1850                        TCGArg arg2, bool const_a2)
1851{
1852    if (have_lzcnt) {
1853        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1854        if (const_a2) {
1855            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1856        } else {
1857            tcg_debug_assert(dest != arg2);
1858            tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
1859        }
1860    } else {
1861        tcg_debug_assert(!const_a2);
1862        tcg_debug_assert(dest != arg1);
1863        tcg_debug_assert(dest != arg2);
1864
1865        /* Recall that the output of BSR is the index, not the count.  */
1866        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1867        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1868
1869        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1870        int jcc = tcg_out_cmp(s, TCG_COND_EQ, arg1, 0, 1, rexw);
1871        tcg_out_cmov(s, jcc, rexw, dest, arg2);
1872    }
1873}
1874
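/*
 * Emit a direct CALL (@call != 0) or JMP to @dest.  The displacement is
 * relative to the end of the 5-byte instruction; when it does not fit
 * in 32 bits, branch indirectly through an 8-byte constant pool entry.
 */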
1875static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1876{
1877    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1878
1879    if (disp == (int32_t)disp) {
1880        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1881        tcg_out32(s, disp);
1882    } else {
1883        /* rip-relative addressing into the constant pool.
1884           This is 6 + 8 = 14 bytes, as compared to using an
1885           immediate load 10 + 6 = 16 bytes, plus we may
1886           be able to re-use the pool constant for more calls.  */
1887        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1888        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1889        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1890        tcg_out32(s, 0);
1891    }
1892}
1893
1894static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1895                         const TCGHelperInfo *info)
1896{
1897    tcg_out_branch(s, 1, dest);
1898
1899#ifndef _WIN32
1900    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1901        /*
1902         * The sysv i386 abi for struct return places a reference as the
1903         * first argument on the stack, and pops that argument with the
1904         * return statement.  Since we want to retain the aligned stack
1905         * pointer for the callee, we do not want to actually push that
1906         * argument before the call but rely on the normal store to the
1907         * stack slot.  But we do need to compensate for the pop in order
1908         * to restore the correct stack pointer value.
1909         * Pushing a garbage value back onto the stack is quickest.
1910         */
1911        tcg_out_push(s, TCG_REG_EAX);
1912    }
1913#endif
1914}
1915
1916static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1917{
1918    tcg_out_branch(s, 0, dest);
1919}
1920
1921static void tcg_out_nopn(TCGContext *s, int n)
1922{
1923    int i;
1924    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1925     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1926     * duplicate prefix, and all of the interesting recent cores can
1927     * decode and discard the duplicates in a single cycle.
1928     */
1929    tcg_debug_assert(n >= 1);
1930    for (i = 1; i < n; ++i) {
1931        tcg_out8(s, 0x66);
1932    }
1933    tcg_out8(s, 0x90);
1934}
1935
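/*
 * Decomposed host address for a guest memory access: base register,
 * optional index register (-1 when absent), constant offset, segment
 * override prefix (e.g. P_GS), and the required atomicity/alignment.
 */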
1936typedef struct {
1937    TCGReg base;
1938    int index;
1939    int ofs;
1940    int seg;
1941    TCGAtomAlign aa;
1942} HostAddress;
1943
1944bool tcg_target_has_memory_bswap(MemOp memop)
1945{
1946    TCGAtomAlign aa;
1947
1948    if (!have_movbe) {
1949        return false;
1950    }
1951    if ((memop & MO_SIZE) < MO_128) {
1952        return true;
1953    }
1954
1955    /*
1956     * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
1957     * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
1958     */
1959    aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
1960    return aa.atom < MO_128;
1961}
1962
1963/*
1964 * Because i686 has no register parameters and because x86_64 has xchg
1965 * to handle addr/data register overlap, we have placed all input arguments
1966 * before we might need a scratch reg.
1967 *
1968 * Even then, a scratch is only needed for l->raddr.  Rather than expose
1969 * a general-purpose scratch when we don't actually know it's available,
1970 * use the ra_gen hook to load into RAX if needed.
1971 */
1972#if TCG_TARGET_REG_BITS == 64
1973static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1974{
1975    if (arg < 0) {
1976        arg = TCG_REG_RAX;
1977    }
1978    tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1979    return arg;
1980}
1981static const TCGLdstHelperParam ldst_helper_param = {
1982    .ra_gen = ldst_ra_gen
1983};
1984#else
1985static const TCGLdstHelperParam ldst_helper_param = { };
1986#endif
1987
1988static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
1989                                TCGReg l, TCGReg h, TCGReg v)
1990{
1991    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1992
1993    /* vpmov{d,q} %v, %l */
1994    tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
1995    /* vpextr{d,q} $1, %v, %h */
1996    tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
1997    tcg_out8(s, 1);
1998}
1999
2000static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
2001                                TCGReg v, TCGReg l, TCGReg h)
2002{
2003    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
2004
2005    /* vmov{d,q} %l, %v */
2006    tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
2007    /* vpinsr{d,q} $1, %h, %v, %v */
2008    tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
2009    tcg_out8(s, 1);
2010}
2011
2012/*
2013 * Generate code for the slow path for a load at the end of block
2014 */
2015static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
2016{
2017    MemOp opc = get_memop(l->oi);
2018    tcg_insn_unit **label_ptr = &l->label_ptr[0];
2019
2020    /* resolve label address */
2021    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
2022    if (label_ptr[1]) {
2023        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
2024    }
2025
2026    tcg_out_ld_helper_args(s, l, &ldst_helper_param);
2027    tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
2028    tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
2029
2030    tcg_out_jmp(s, l->raddr);
2031    return true;
2032}
2033
2034/*
2035 * Generate code for the slow path for a store at the end of block
2036 */
2037static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
2038{
2039    MemOp opc = get_memop(l->oi);
2040    tcg_insn_unit **label_ptr = &l->label_ptr[0];
2041
2042    /* resolve label address */
2043    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
2044    if (label_ptr[1]) {
2045        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
2046    }
2047
2048    tcg_out_st_helper_args(s, l, &ldst_helper_param);
2049    tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
2050
2051    tcg_out_jmp(s, l->raddr);
2052    return true;
2053}
2054
2055#ifdef CONFIG_USER_ONLY
2056static HostAddress x86_guest_base = {
2057    .index = -1
2058};
2059
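/*
 * Where the host kernel allows it, place guest_base in the %gs segment
 * base so that guest accesses need only a segment-override prefix
 * (P_GS) rather than extra address arithmetic.
 */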
2060#if defined(__x86_64__) && defined(__linux__)
2061# include <asm/prctl.h>
2062# include <sys/prctl.h>
2063int arch_prctl(int code, unsigned long addr);
2064static inline int setup_guest_base_seg(void)
2065{
2066    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
2067        return P_GS;
2068    }
2069    return 0;
2070}
2071#define setup_guest_base_seg  setup_guest_base_seg
2072#elif defined(__x86_64__) && \
2073      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
2074# include <machine/sysarch.h>
2075static inline int setup_guest_base_seg(void)
2076{
2077    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
2078        return P_GS;
2079    }
2080    return 0;
2081}
2082#define setup_guest_base_seg  setup_guest_base_seg
2083#endif
2084#else
2085# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; }))
2086#endif /* CONFIG_USER_ONLY */
2087#ifndef setup_guest_base_seg
2088# define setup_guest_base_seg()  0
2089#endif
2090
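/*
 * x86 addressing modes accept a full signed 32-bit displacement, so any
 * (negative) offset of the TLB mask/table pair from env is reachable.
 */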
2091#define MIN_TLB_MASK_TABLE_OFS  INT_MIN
2092
2093/*
2094 * For softmmu, perform the TLB load and compare.
2095 * For useronly, perform any required alignment tests.
2096 * In both cases, return a TCGLabelQemuLdst structure if the slow path
2097 * is required and fill in @h with the host address for the fast path.
2098 */
2099static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
2100                                           TCGReg addrlo, TCGReg addrhi,
2101                                           MemOpIdx oi, bool is_ld)
2102{
2103    TCGLabelQemuLdst *ldst = NULL;
2104    MemOp opc = get_memop(oi);
2105    MemOp s_bits = opc & MO_SIZE;
2106    unsigned a_mask;
2107
2108    if (tcg_use_softmmu) {
2109        h->index = TCG_REG_L0;
2110        h->ofs = 0;
2111        h->seg = 0;
2112    } else {
2113        *h = x86_guest_base;
2114    }
2115    h->base = addrlo;
2116    h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
2117    a_mask = (1 << h->aa.align) - 1;
2118
2119    if (tcg_use_softmmu) {
2120        int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
2121                            : offsetof(CPUTLBEntry, addr_write);
2122        TCGType ttype = TCG_TYPE_I32;
2123        TCGType tlbtype = TCG_TYPE_I32;
2124        int trexw = 0, hrexw = 0, tlbrexw = 0;
2125        unsigned mem_index = get_mmuidx(oi);
2126        unsigned s_mask = (1 << s_bits) - 1;
2127        int fast_ofs = tlb_mask_table_ofs(s, mem_index);
2128        int tlb_mask;
2129
2130        ldst = new_ldst_label(s);
2131        ldst->is_ld = is_ld;
2132        ldst->oi = oi;
2133        ldst->addrlo_reg = addrlo;
2134        ldst->addrhi_reg = addrhi;
2135
2136        if (TCG_TARGET_REG_BITS == 64) {
2137            ttype = s->addr_type;
2138            trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
2139            if (TCG_TYPE_PTR == TCG_TYPE_I64) {
2140                hrexw = P_REXW;
2141                if (s->page_bits + s->tlb_dyn_max_bits > 32) {
2142                    tlbtype = TCG_TYPE_I64;
2143                    tlbrexw = P_REXW;
2144                }
2145            }
2146        }
2147
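        /*
         * L0 = (addr >> (page_bits - CPU_TLB_ENTRY_BITS)) & fast->mask,
         * then + fast->table: L0 points at the CPUTLBEntry to compare.
         */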
2148        tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
2149        tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
2150                       s->page_bits - CPU_TLB_ENTRY_BITS);
2151
2152        tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
2153                             fast_ofs + offsetof(CPUTLBDescFast, mask));
2154
2155        tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
2156                             fast_ofs + offsetof(CPUTLBDescFast, table));
2157
2158        /*
2159         * If the required alignment is at least as large as the access,
2160         * simply copy the address and mask.  For lesser alignments,
2161         * check that we don't cross pages for the complete access.
2162         */
2163        if (a_mask >= s_mask) {
2164            tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
2165        } else {
2166            tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
2167                                 addrlo, s_mask - a_mask);
2168        }
2169        tlb_mask = s->page_mask | a_mask;
2170        tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
2171
2172        /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
2173        tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
2174                             TCG_REG_L1, TCG_REG_L0, cmp_ofs);
2175
2176        /* jne slow_path */
2177        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2178        ldst->label_ptr[0] = s->code_ptr;
2179        s->code_ptr += 4;
2180
2181        if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
2182            /* cmp 4(TCG_REG_L0), addrhi */
2183            tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi,
2184                                 TCG_REG_L0, cmp_ofs + 4);
2185
2186            /* jne slow_path */
2187            tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2188            ldst->label_ptr[1] = s->code_ptr;
2189            s->code_ptr += 4;
2190        }
2191
2192        /* TLB Hit.  */
2193        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
2194                   offsetof(CPUTLBEntry, addend));
2195    } else if (a_mask) {
2196        int jcc;
2197
2198        ldst = new_ldst_label(s);
2199        ldst->is_ld = is_ld;
2200        ldst->oi = oi;
2201        ldst->addrlo_reg = addrlo;
2202        ldst->addrhi_reg = addrhi;
2203
2204        /* jne slow_path */
2205        jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addrlo, a_mask, true, false);
2206        tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0);
2207        ldst->label_ptr[0] = s->code_ptr;
2208        s->code_ptr += 4;
2209    }
2210
2211    return ldst;
2212}
2213
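/*
 * Emit the fast-path load from the host address @h into @datalo (and
 * @datahi when the value is wider than one host register).  Big-endian
 * accesses use MOVBE; the caller guarantees it is available.
 */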
2214static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2215                                   HostAddress h, TCGType type, MemOp memop)
2216{
2217    bool use_movbe = false;
2218    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
2219    int movop = OPC_MOVL_GvEv;
2220
2221    /* Do big-endian loads with movbe.  */
2222    if (memop & MO_BSWAP) {
2223        tcg_debug_assert(have_movbe);
2224        use_movbe = true;
2225        movop = OPC_MOVBE_GyMy;
2226    }
2227
2228    switch (memop & MO_SSIZE) {
2229    case MO_UB:
2230        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
2231                                 h.base, h.index, 0, h.ofs);
2232        break;
2233    case MO_SB:
2234        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
2235                                 h.base, h.index, 0, h.ofs);
2236        break;
2237    case MO_UW:
2238        if (use_movbe) {
2239            /* There is no extending movbe; only the low 16 bits are modified.  */
2240            if (datalo != h.base && datalo != h.index) {
2241                /* XOR breaks dependency chains.  */
2242                tgen_arithr(s, ARITH_XOR, datalo, datalo);
2243                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2244                                         datalo, h.base, h.index, 0, h.ofs);
2245            } else {
2246                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2247                                         datalo, h.base, h.index, 0, h.ofs);
2248                tcg_out_ext16u(s, datalo, datalo);
2249            }
2250        } else {
2251            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
2252                                     h.base, h.index, 0, h.ofs);
2253        }
2254        break;
2255    case MO_SW:
2256        if (use_movbe) {
2257            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2258                                     datalo, h.base, h.index, 0, h.ofs);
2259            tcg_out_ext16s(s, type, datalo, datalo);
2260        } else {
2261            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
2262                                     datalo, h.base, h.index, 0, h.ofs);
2263        }
2264        break;
2265    case MO_UL:
2266        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2267                                 h.base, h.index, 0, h.ofs);
2268        break;
2269#if TCG_TARGET_REG_BITS == 64
2270    case MO_SL:
2271        if (use_movbe) {
2272            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
2273                                     h.base, h.index, 0, h.ofs);
2274            tcg_out_ext32s(s, datalo, datalo);
2275        } else {
2276            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
2277                                     h.base, h.index, 0, h.ofs);
2278        }
2279        break;
2280#endif
2281    case MO_UQ:
2282        if (TCG_TARGET_REG_BITS == 64) {
2283            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2284                                     h.base, h.index, 0, h.ofs);
2285            break;
2286        }
2287        if (use_movbe) {
2288            TCGReg t = datalo;
2289            datalo = datahi;
2290            datahi = t;
2291        }
2292        if (h.base == datalo || h.index == datalo) {
2293            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
2294                                     h.base, h.index, 0, h.ofs);
2295            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
2296            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
2297        } else {
2298            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2299                                     h.base, h.index, 0, h.ofs);
2300            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2301                                     h.base, h.index, 0, h.ofs + 4);
2302        }
2303        break;
2304
2305    case MO_128:
2306        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2307
2308        /*
2309         * Without 16-byte atomicity, use integer regs.
2310         * That is where we want the data, and it allows bswaps.
2311         */
2312        if (h.aa.atom < MO_128) {
2313            if (use_movbe) {
2314                TCGReg t = datalo;
2315                datalo = datahi;
2316                datahi = t;
2317            }
2318            if (h.base == datalo || h.index == datalo) {
2319                tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
2320                                         h.base, h.index, 0, h.ofs);
2321                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2322                                     datalo, datahi, 0);
2323                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2324                                     datahi, datahi, 8);
2325            } else {
2326                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2327                                         h.base, h.index, 0, h.ofs);
2328                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2329                                         h.base, h.index, 0, h.ofs + 8);
2330            }
2331            break;
2332        }
2333
2334        /*
2335         * With 16-byte atomicity, a vector load is required.
2336         * If we already have 16-byte alignment, then VMOVDQA always works.
2337         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2338         * Else we require a runtime test for alignment for VMOVDQA;
2339         * use VMOVDQU on the unaligned nonatomic path for simplicity.
2340         */
2341        if (h.aa.align >= MO_128) {
2342            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2343                                         TCG_TMP_VEC, 0,
2344                                         h.base, h.index, 0, h.ofs);
2345        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2346            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2347                                         TCG_TMP_VEC, 0,
2348                                         h.base, h.index, 0, h.ofs);
2349        } else {
2350            TCGLabel *l1 = gen_new_label();
2351            TCGLabel *l2 = gen_new_label();
2352            int jcc;
2353
2354            jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
2355            tcg_out_jxx(s, jcc, l1, true);
2356
2357            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2358                                         TCG_TMP_VEC, 0,
2359                                         h.base, h.index, 0, h.ofs);
2360            tcg_out_jxx(s, JCC_JMP, l2, true);
2361
2362            tcg_out_label(s, l1);
2363            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2364                                         TCG_TMP_VEC, 0,
2365                                         h.base, h.index, 0, h.ofs);
2366            tcg_out_label(s, l2);
2367        }
2368        tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
2369        break;
2370
2371    default:
2372        g_assert_not_reached();
2373    }
2374}
2375
2376static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2377                            TCGReg addrlo, TCGReg addrhi,
2378                            MemOpIdx oi, TCGType data_type)
2379{
2380    TCGLabelQemuLdst *ldst;
2381    HostAddress h;
2382
2383    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2384    tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2385
2386    if (ldst) {
2387        ldst->type = data_type;
2388        ldst->datalo_reg = datalo;
2389        ldst->datahi_reg = datahi;
2390        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2391    }
2392}
2393
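/*
 * Store counterpart of tcg_out_qemu_ld_direct: emit the fast-path store
 * of @datalo (and @datahi) to the host address in @h.
 */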
2394static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2395                                   HostAddress h, MemOp memop)
2396{
2397    bool use_movbe = false;
2398    int movop = OPC_MOVL_EvGv;
2399
2400    /*
2401     * Do big-endian stores with movbe or system-mode.
2402     * User-only without movbe will have its swapping done generically.
2403     */
2404    if (memop & MO_BSWAP) {
2405        tcg_debug_assert(have_movbe);
2406        use_movbe = true;
2407        movop = OPC_MOVBE_MyGy;
2408    }
2409
2410    switch (memop & MO_SIZE) {
2411    case MO_8:
2412        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2413        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2414        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
2415                                 datalo, h.base, h.index, 0, h.ofs);
2416        break;
2417    case MO_16:
2418        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
2419                                 h.base, h.index, 0, h.ofs);
2420        break;
2421    case MO_32:
2422        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2423                                 h.base, h.index, 0, h.ofs);
2424        break;
2425    case MO_64:
2426        if (TCG_TARGET_REG_BITS == 64) {
2427            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2428                                     h.base, h.index, 0, h.ofs);
2429        } else {
2430            if (use_movbe) {
2431                TCGReg t = datalo;
2432                datalo = datahi;
2433                datahi = t;
2434            }
2435            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2436                                     h.base, h.index, 0, h.ofs);
2437            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2438                                     h.base, h.index, 0, h.ofs + 4);
2439        }
2440        break;
2441
2442    case MO_128:
2443        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2444
2445        /*
2446         * Without 16-byte atomicity, use integer regs.
2447         * That is where we have the data, and it allows bswaps.
2448         */
2449        if (h.aa.atom < MO_128) {
2450            if (use_movbe) {
2451                TCGReg t = datalo;
2452                datalo = datahi;
2453                datahi = t;
2454            }
2455            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2456                                     h.base, h.index, 0, h.ofs);
2457            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2458                                     h.base, h.index, 0, h.ofs + 8);
2459            break;
2460        }
2461
2462        /*
2463         * With 16-byte atomicity, a vector store is required.
2464         * If we already have 16-byte alignment, then VMOVDQA always works.
2465         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2466         * Else we require a runtime test for alignment for VMOVDQA;
2467         * use VMOVDQU on the unaligned nonatomic path for simplicity.
2468         */
2469        tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
2470        if (h.aa.align >= MO_128) {
2471            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2472                                         TCG_TMP_VEC, 0,
2473                                         h.base, h.index, 0, h.ofs);
2474        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2475            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2476                                         TCG_TMP_VEC, 0,
2477                                         h.base, h.index, 0, h.ofs);
2478        } else {
2479            TCGLabel *l1 = gen_new_label();
2480            TCGLabel *l2 = gen_new_label();
2481            int jcc;
2482
2483            jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
2484            tcg_out_jxx(s, jcc, l1, true);
2485
2486            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2487                                         TCG_TMP_VEC, 0,
2488                                         h.base, h.index, 0, h.ofs);
2489            tcg_out_jxx(s, JCC_JMP, l2, true);
2490
2491            tcg_out_label(s, l1);
2492            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2493                                         TCG_TMP_VEC, 0,
2494                                         h.base, h.index, 0, h.ofs);
2495            tcg_out_label(s, l2);
2496        }
2497        break;
2498
2499    default:
2500        g_assert_not_reached();
2501    }
2502}
2503
2504static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2505                            TCGReg addrlo, TCGReg addrhi,
2506                            MemOpIdx oi, TCGType data_type)
2507{
2508    TCGLabelQemuLdst *ldst;
2509    HostAddress h;
2510
2511    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2512    tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2513
2514    if (ldst) {
2515        ldst->type = data_type;
2516        ldst->datalo_reg = datalo;
2517        ldst->datahi_reg = datahi;
2518        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2519    }
2520}
2521
2522static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2523{
2524    /* Reuse the zeroing that exists for goto_ptr.  */
2525    if (a0 == 0) {
2526        tcg_out_jmp(s, tcg_code_gen_epilogue);
2527    } else {
2528        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2529        tcg_out_jmp(s, tb_ret_addr);
2530    }
2531}
2532
2533static void tcg_out_goto_tb(TCGContext *s, int which)
2534{
2535    /*
2536     * Jump displacement must be aligned for atomic patching;
2537     * see if we need to add extra nops before the jump.
2538     */
2539    int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2540    if (gap != 1) {
2541        tcg_out_nopn(s, gap - 1);
2542    }
2543    tcg_out8(s, OPC_JMP_long); /* jmp im */
2544    set_jmp_insn_offset(s, which);
2545    tcg_out32(s, 0);
2546    set_jmp_reset_offset(s, which);
2547}
2548
2549void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2550                              uintptr_t jmp_rx, uintptr_t jmp_rw)
2551{
2552    /* patch the branch destination */
2553    uintptr_t addr = tb->jmp_target_addr[n];
2554    qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2555    /* no need to flush icache explicitly */
2556}
2557
2558static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2559                              const TCGArg args[TCG_MAX_OP_ARGS],
2560                              const int const_args[TCG_MAX_OP_ARGS])
2561{
2562    TCGArg a0, a1, a2;
2563    int c, const_a2, vexop, rexw = 0;
2564
2565#if TCG_TARGET_REG_BITS == 64
2566# define OP_32_64(x) \
2567        case glue(glue(INDEX_op_, x), _i64): \
2568            rexw = P_REXW; /* FALLTHRU */    \
2569        case glue(glue(INDEX_op_, x), _i32)
2570#else
2571# define OP_32_64(x) \
2572        case glue(glue(INDEX_op_, x), _i32)
2573#endif
2574
2575    /* Hoist the loads of the most common arguments.  */
2576    a0 = args[0];
2577    a1 = args[1];
2578    a2 = args[2];
2579    const_a2 = const_args[2];
2580
2581    switch (opc) {
2582    case INDEX_op_goto_ptr:
2583        /* jmp to the given host address (could be epilogue) */
2584        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2585        break;
2586    case INDEX_op_br:
2587        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2588        break;
2589    OP_32_64(ld8u):
2590        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2591        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2592        break;
2593    OP_32_64(ld8s):
2594        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2595        break;
2596    OP_32_64(ld16u):
2597        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2598        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2599        break;
2600    OP_32_64(ld16s):
2601        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2602        break;
2603#if TCG_TARGET_REG_BITS == 64
2604    case INDEX_op_ld32u_i64:
2605#endif
2606    case INDEX_op_ld_i32:
2607        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2608        break;
2609
2610    OP_32_64(st8):
2611        if (const_args[0]) {
2612            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2613            tcg_out8(s, a0);
2614        } else {
2615            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2616        }
2617        break;
2618    OP_32_64(st16):
2619        if (const_args[0]) {
2620            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2621            tcg_out16(s, a0);
2622        } else {
2623            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2624        }
2625        break;
2626#if TCG_TARGET_REG_BITS == 64
2627    case INDEX_op_st32_i64:
2628#endif
2629    case INDEX_op_st_i32:
2630        if (const_args[0]) {
2631            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2632            tcg_out32(s, a0);
2633        } else {
2634            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2635        }
2636        break;
2637
2638    OP_32_64(add):
2639        /* For 3-operand addition, use LEA.  */
2640        if (a0 != a1) {
2641            TCGArg c3 = 0;
2642            if (const_a2) {
2643                c3 = a2, a2 = -1;
2644            } else if (a0 == a2) {
2645                /* Watch out for dest = src + dest, since we've removed
2646                   the matching constraint on the add.  */
2647                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2648                break;
2649            }
2650
2651            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2652            break;
2653        }
2654        c = ARITH_ADD;
2655        goto gen_arith;
2656    OP_32_64(sub):
2657        c = ARITH_SUB;
2658        goto gen_arith;
2659    OP_32_64(and):
2660        c = ARITH_AND;
2661        goto gen_arith;
2662    OP_32_64(or):
2663        c = ARITH_OR;
2664        goto gen_arith;
2665    OP_32_64(xor):
2666        c = ARITH_XOR;
2667        goto gen_arith;
2668    gen_arith:
2669        if (const_a2) {
2670            tgen_arithi(s, c + rexw, a0, a2, 0);
2671        } else {
2672            tgen_arithr(s, c + rexw, a0, a2);
2673        }
2674        break;
2675
2676    OP_32_64(andc):
2677        if (const_a2) {
2678            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2679            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2680        } else {
2681            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2682        }
2683        break;
2684
2685    OP_32_64(mul):
2686        if (const_a2) {
2687            int32_t val;
2688            val = a2;
2689            if (val == (int8_t)val) {
2690                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2691                tcg_out8(s, val);
2692            } else {
2693                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2694                tcg_out32(s, val);
2695            }
2696        } else {
2697            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2698        }
2699        break;
2700
2701    OP_32_64(div2):
2702        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2703        break;
2704    OP_32_64(divu2):
2705        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2706        break;
2707
2708    OP_32_64(shl):
2709        /* For small constant 3-operand shift, use LEA.  */
2710        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2711            if (a2 - 1 == 0) {
2712                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2713                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2714            } else {
2715                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2716                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2717            }
2718            break;
2719        }
2720        c = SHIFT_SHL;
2721        vexop = OPC_SHLX;
2722        goto gen_shift_maybe_vex;
2723    OP_32_64(shr):
2724        c = SHIFT_SHR;
2725        vexop = OPC_SHRX;
2726        goto gen_shift_maybe_vex;
2727    OP_32_64(sar):
2728        c = SHIFT_SAR;
2729        vexop = OPC_SARX;
2730        goto gen_shift_maybe_vex;
2731    OP_32_64(rotl):
2732        c = SHIFT_ROL;
2733        goto gen_shift;
2734    OP_32_64(rotr):
2735        c = SHIFT_ROR;
2736        goto gen_shift;
2737    gen_shift_maybe_vex:
2738        if (have_bmi2) {
2739            if (!const_a2) {
2740                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2741                break;
2742            }
2743            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2744        }
2745        /* FALLTHRU */
2746    gen_shift:
2747        if (const_a2) {
2748            tcg_out_shifti(s, c + rexw, a0, a2);
2749        } else {
2750            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2751        }
2752        break;
2753
2754    OP_32_64(ctz):
2755        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2756        break;
2757    OP_32_64(clz):
2758        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2759        break;
2760    OP_32_64(ctpop):
2761        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2762        break;
2763
2764    OP_32_64(brcond):
2765        tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1],
2766                       arg_label(args[3]), 0);
2767        break;
2768    OP_32_64(setcond):
2769        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false);
2770        break;
2771    OP_32_64(negsetcond):
2772        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true);
2773        break;
2774    OP_32_64(movcond):
2775        tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]);
2776        break;
2777
2778    OP_32_64(bswap16):
2779        if (a2 & TCG_BSWAP_OS) {
2780            /* Output must be sign-extended. */
2781            if (rexw) {
2782                tcg_out_bswap64(s, a0);
2783                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2784            } else {
2785                tcg_out_bswap32(s, a0);
2786                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2787            }
2788        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2789            /* Output must be zero-extended, but input isn't. */
2790            tcg_out_bswap32(s, a0);
2791            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2792        } else {
2793            tcg_out_rolw_8(s, a0);
2794        }
2795        break;
2796    OP_32_64(bswap32):
2797        tcg_out_bswap32(s, a0);
2798        if (rexw && (a2 & TCG_BSWAP_OS)) {
2799            tcg_out_ext32s(s, a0, a0);
2800        }
2801        break;
2802
2803    OP_32_64(neg):
2804        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2805        break;
2806    OP_32_64(not):
2807        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2808        break;
2809
2810    case INDEX_op_qemu_ld_a64_i32:
2811        if (TCG_TARGET_REG_BITS == 32) {
2812            tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2813            break;
2814        }
2815        /* fall through */
2816    case INDEX_op_qemu_ld_a32_i32:
2817        tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2818        break;
2819    case INDEX_op_qemu_ld_a32_i64:
2820        if (TCG_TARGET_REG_BITS == 64) {
2821            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2822        } else {
2823            tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2824        }
2825        break;
2826    case INDEX_op_qemu_ld_a64_i64:
2827        if (TCG_TARGET_REG_BITS == 64) {
2828            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2829        } else {
2830            tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2831        }
2832        break;
2833    case INDEX_op_qemu_ld_a32_i128:
2834    case INDEX_op_qemu_ld_a64_i128:
2835        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2836        tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2837        break;
2838
2839    case INDEX_op_qemu_st_a64_i32:
2840    case INDEX_op_qemu_st8_a64_i32:
2841        if (TCG_TARGET_REG_BITS == 32) {
2842            tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2843            break;
2844        }
2845        /* fall through */
2846    case INDEX_op_qemu_st_a32_i32:
2847    case INDEX_op_qemu_st8_a32_i32:
2848        tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2849        break;
2850    case INDEX_op_qemu_st_a32_i64:
2851        if (TCG_TARGET_REG_BITS == 64) {
2852            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2853        } else {
2854            tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2855        }
2856        break;
2857    case INDEX_op_qemu_st_a64_i64:
2858        if (TCG_TARGET_REG_BITS == 64) {
2859            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2860        } else {
2861            tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2862        }
2863        break;
2864    case INDEX_op_qemu_st_a32_i128:
2865    case INDEX_op_qemu_st_a64_i128:
2866        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2867        tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2868        break;
2869
2870    OP_32_64(mulu2):
2871        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2872        break;
2873    OP_32_64(muls2):
2874        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2875        break;
2876    OP_32_64(add2):
2877        if (const_args[4]) {
2878            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2879        } else {
2880            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2881        }
2882        if (const_args[5]) {
2883            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2884        } else {
2885            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2886        }
2887        break;
2888    OP_32_64(sub2):
2889        if (const_args[4]) {
2890            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2891        } else {
2892            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2893        }
2894        if (const_args[5]) {
2895            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2896        } else {
2897            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2898        }
2899        break;
2900
2901#if TCG_TARGET_REG_BITS == 32
2902    case INDEX_op_brcond2_i32:
2903        tcg_out_brcond2(s, args, const_args, 0);
2904        break;
2905    case INDEX_op_setcond2_i32:
2906        tcg_out_setcond2(s, args, const_args);
2907        break;
2908#else /* TCG_TARGET_REG_BITS == 64 */
2909    case INDEX_op_ld32s_i64:
2910        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2911        break;
2912    case INDEX_op_ld_i64:
2913        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2914        break;
2915    case INDEX_op_st_i64:
2916        if (const_args[0]) {
2917            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2918            tcg_out32(s, a0);
2919        } else {
2920            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2921        }
2922        break;
2923
2924    case INDEX_op_bswap64_i64:
2925        tcg_out_bswap64(s, a0);
2926        break;
2927    case INDEX_op_extrh_i64_i32:
2928        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2929        break;
2930#endif
2931
2932    OP_32_64(deposit):
2933        if (args[3] == 0 && args[4] == 8) {
2934            /* load bits 0..7 */
2935            if (const_a2) {
2936                tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0),
2937                            0, a0, 0);
2938                tcg_out8(s, a2);
2939            } else {
2940                tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2941            }
2942        } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) {
2943            /* load bits 8..15 */
2944            if (const_a2) {
2945                tcg_out8(s, OPC_MOVB_Ib + a0 + 4);
2946                tcg_out8(s, a2);
2947            } else {
2948                tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2949            }
2950        } else if (args[3] == 0 && args[4] == 16) {
2951            /* load bits 0..15 */
2952            if (const_a2) {
2953                tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0),
2954                            0, a0, 0);
2955                tcg_out16(s, a2);
2956            } else {
2957                tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2958            }
2959        } else {
2960            g_assert_not_reached();
2961        }
2962        break;
2963
2964    case INDEX_op_extract_i64:
2965        if (a2 + args[3] == 32) {
2966            /* This is a 32-bit zero-extending right shift.  */
2967            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2968            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2969            break;
2970        }
2971        /* FALLTHRU */
2972    case INDEX_op_extract_i32:
2973        /* On the off-chance that we can use the high-byte registers.
2974           Otherwise we emit the same ext16 + shift pattern that we
2975           would have gotten from the normal tcg-op.c expansion.  */
2976        tcg_debug_assert(a2 == 8 && args[3] == 8);
2977        if (a1 < 4 && a0 < 8) {
2978            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2979        } else {
2980            tcg_out_ext16u(s, a0, a1);
2981            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2982        }
2983        break;
2984
2985    case INDEX_op_sextract_i32:
2986        /* We don't implement sextract_i64, as we cannot sign-extend to
2987           64-bits without using the REX prefix that explicitly excludes
2988           access to the high-byte registers.  */
2989        tcg_debug_assert(a2 == 8 && args[3] == 8);
2990        if (a1 < 4 && a0 < 8) {
2991            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2992        } else {
2993            tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2994            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2995        }
2996        break;
2997
2998    OP_32_64(extract2):
2999        /* Note that SHRD outputs to the r/m operand.  */
3000        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
3001        tcg_out8(s, args[3]);
3002        break;
3003
3004    case INDEX_op_mb:
3005        tcg_out_mb(s, a0);
3006        break;
3007    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
3008    case INDEX_op_mov_i64:
3009    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
3010    case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
3011    case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
3012    case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
3013    case INDEX_op_ext8s_i64:
3014    case INDEX_op_ext8u_i32:
3015    case INDEX_op_ext8u_i64:
3016    case INDEX_op_ext16s_i32:
3017    case INDEX_op_ext16s_i64:
3018    case INDEX_op_ext16u_i32:
3019    case INDEX_op_ext16u_i64:
3020    case INDEX_op_ext32s_i64:
3021    case INDEX_op_ext32u_i64:
3022    case INDEX_op_ext_i32_i64:
3023    case INDEX_op_extu_i32_i64:
3024    case INDEX_op_extrl_i64_i32:
3025    default:
3026        g_assert_not_reached();
3027    }
3028
3029#undef OP_32_64
3030}
3031
3032static int const umin_insn[4] = {
3033    OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
3034};
3035
3036static int const umax_insn[4] = {
3037    OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
3038};
3039
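/*
 * Emit a vector comparison.  SSE/AVX only provide EQ and signed GT for
 * packed integers, so other conditions are synthesized by inverting,
 * swapping the operands, or first applying unsigned min/max so that the
 * result can be tested for equality.  Returns true when the mask that
 * was produced is for the inverted condition and the caller must flip it.
 */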
3040static bool tcg_out_cmp_vec_noinv(TCGContext *s, TCGType type, unsigned vece,
3041                                  TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond)
3042{
3043    static int const cmpeq_insn[4] = {
3044        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
3045    };
3046    static int const cmpgt_insn[4] = {
3047        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
3048    };
3049
3050    enum {
3051        NEED_INV  = 1,
3052        NEED_SWAP = 2,
3053        NEED_UMIN = 4,
3054        NEED_UMAX = 8,
3055        INVALID   = 16,
3056    };
3057    static const uint8_t cond_fixup[16] = {
3058        [0 ... 15] = INVALID,
3059        [TCG_COND_EQ] = 0,
3060        [TCG_COND_GT] = 0,
3061        [TCG_COND_NE] = NEED_INV,
3062        [TCG_COND_LE] = NEED_INV,
3063        [TCG_COND_LT] = NEED_SWAP,
3064        [TCG_COND_GE] = NEED_SWAP | NEED_INV,
3065        [TCG_COND_LEU] = NEED_UMIN,
3066        [TCG_COND_GTU] = NEED_UMIN | NEED_INV,
3067        [TCG_COND_GEU] = NEED_UMAX,
3068        [TCG_COND_LTU] = NEED_UMAX | NEED_INV,
3069    };
3070    int fixup = cond_fixup[cond];
3071
3072    assert(!(fixup & INVALID));
3073
3074    if (fixup & NEED_INV) {
3075        cond = tcg_invert_cond(cond);
3076    }
3077
3078    if (fixup & NEED_SWAP) {
3079        TCGReg swap = v1;
3080        v1 = v2;
3081        v2 = swap;
3082        cond = tcg_swap_cond(cond);
3083    }
3084
3085    if (fixup & (NEED_UMIN | NEED_UMAX)) {
3086        int op = (fixup & NEED_UMIN ? umin_insn[vece] : umax_insn[vece]);
3087
3088        /* avx2 does not have 64-bit min/max; adjusted during expand. */
3089        assert(vece <= MO_32);
3090
3091        tcg_out_vex_modrm_type(s, op, TCG_TMP_VEC, v1, v2, type);
3092        v2 = TCG_TMP_VEC;
3093        cond = TCG_COND_EQ;
3094    }
3095
3096    switch (cond) {
3097    case TCG_COND_EQ:
3098        tcg_out_vex_modrm_type(s, cmpeq_insn[vece], v0, v1, v2, type);
3099        break;
3100    case TCG_COND_GT:
3101        tcg_out_vex_modrm_type(s, cmpgt_insn[vece], v0, v1, v2, type);
3102        break;
3103    default:
3104        g_assert_not_reached();
3105    }
3106    return fixup & NEED_INV;
3107}
3108
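/* As above, but fold a required inversion into an XOR with all-ones. */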
3109static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece,
3110                            TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond)
3111{
3112    if (tcg_out_cmp_vec_noinv(s, type, vece, v0, v1, v2, cond)) {
3113        tcg_out_dupi_vec(s, type, vece, TCG_TMP_VEC, -1);
3114        tcg_out_vex_modrm_type(s, OPC_PXOR, v0, v0, TCG_TMP_VEC, type);
3115    }
3116}
3117
3118static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece,
3119                               TCGReg v0, TCGReg c1, TCGReg c2,
3120                               TCGReg v3, TCGReg v4, TCGCond cond)
3121{
3122    if (tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond)) {
3123        TCGReg swap = v3;
3124        v3 = v4;
3125        v4 = swap;
3126    }
3127    tcg_out_vex_modrm_type(s, OPC_VPBLENDVB, v0, v4, v3, type);
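    /* VPBLENDVB takes its selector from the register encoded in imm8[7:4]. */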
3128    tcg_out8(s, (TCG_TMP_VEC - TCG_REG_XMM0) << 4);
3129}
3130
3131static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
3132                           unsigned vecl, unsigned vece,
3133                           const TCGArg args[TCG_MAX_OP_ARGS],
3134                           const int const_args[TCG_MAX_OP_ARGS])
3135{
3136    static int const add_insn[4] = {
3137        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
3138    };
3139    static int const ssadd_insn[4] = {
3140        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
3141    };
3142    static int const usadd_insn[4] = {
3143        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
3144    };
3145    static int const sub_insn[4] = {
3146        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
3147    };
3148    static int const sssub_insn[4] = {
3149        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
3150    };
3151    static int const ussub_insn[4] = {
3152        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
3153    };
3154    static int const mul_insn[4] = {
3155        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
3156    };
3157    static int const shift_imm_insn[4] = {
3158        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
3159    };
3160    static int const punpckl_insn[4] = {
3161        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
3162    };
3163    static int const punpckh_insn[4] = {
3164        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
3165    };
3166    static int const packss_insn[4] = {
3167        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
3168    };
3169    static int const packus_insn[4] = {
3170        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
3171    };
3172    static int const smin_insn[4] = {
3173        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
3174    };
3175    static int const smax_insn[4] = {
3176        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
3177    };
3178    static int const rotlv_insn[4] = {
3179        OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
3180    };
3181    static int const rotrv_insn[4] = {
3182        OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
3183    };
3184    static int const shlv_insn[4] = {
3185        OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
3186    };
3187    static int const shrv_insn[4] = {
3188        OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
3189    };
3190    static int const sarv_insn[4] = {
3191        OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
3192    };
3193    static int const shls_insn[4] = {
3194        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
3195    };
3196    static int const shrs_insn[4] = {
3197        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
3198    };
3199    static int const sars_insn[4] = {
3200        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
3201    };
3202    static int const vpshldi_insn[4] = {
3203        OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
3204    };
3205    static int const vpshldv_insn[4] = {
3206        OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
3207    };
3208    static int const vpshrdv_insn[4] = {
3209        OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
3210    };
3211    static int const abs_insn[4] = {
3212        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
3213    };
3214
3215    TCGType type = vecl + TCG_TYPE_V64;
3216    int insn, sub;
3217    TCGArg a0, a1, a2, a3;
3218
3219    a0 = args[0];
3220    a1 = args[1];
3221    a2 = args[2];
3222
3223    switch (opc) {
3224    case INDEX_op_add_vec:
3225        insn = add_insn[vece];
3226        goto gen_simd;
3227    case INDEX_op_ssadd_vec:
3228        insn = ssadd_insn[vece];
3229        goto gen_simd;
3230    case INDEX_op_usadd_vec:
3231        insn = usadd_insn[vece];
3232        goto gen_simd;
3233    case INDEX_op_sub_vec:
3234        insn = sub_insn[vece];
3235        goto gen_simd;
3236    case INDEX_op_sssub_vec:
3237        insn = sssub_insn[vece];
3238        goto gen_simd;
3239    case INDEX_op_ussub_vec:
3240        insn = ussub_insn[vece];
3241        goto gen_simd;
3242    case INDEX_op_mul_vec:
3243        insn = mul_insn[vece];
3244        goto gen_simd;
3245    case INDEX_op_and_vec:
3246        insn = OPC_PAND;
3247        goto gen_simd;
3248    case INDEX_op_or_vec:
3249        insn = OPC_POR;
3250        goto gen_simd;
3251    case INDEX_op_xor_vec:
3252        insn = OPC_PXOR;
3253        goto gen_simd;
3254    case INDEX_op_smin_vec:
3255        insn = smin_insn[vece];
3256        goto gen_simd;
3257    case INDEX_op_umin_vec:
3258        insn = umin_insn[vece];
3259        goto gen_simd;
3260    case INDEX_op_smax_vec:
3261        insn = smax_insn[vece];
3262        goto gen_simd;
3263    case INDEX_op_umax_vec:
3264        insn = umax_insn[vece];
3265        goto gen_simd;
3266    case INDEX_op_shlv_vec:
3267        insn = shlv_insn[vece];
3268        goto gen_simd;
3269    case INDEX_op_shrv_vec:
3270        insn = shrv_insn[vece];
3271        goto gen_simd;
3272    case INDEX_op_sarv_vec:
3273        insn = sarv_insn[vece];
3274        goto gen_simd;
3275    case INDEX_op_rotlv_vec:
3276        insn = rotlv_insn[vece];
3277        goto gen_simd;
3278    case INDEX_op_rotrv_vec:
3279        insn = rotrv_insn[vece];
3280        goto gen_simd;
3281    case INDEX_op_shls_vec:
3282        insn = shls_insn[vece];
3283        goto gen_simd;
3284    case INDEX_op_shrs_vec:
3285        insn = shrs_insn[vece];
3286        goto gen_simd;
3287    case INDEX_op_sars_vec:
3288        insn = sars_insn[vece];
3289        goto gen_simd;
3290    case INDEX_op_x86_punpckl_vec:
3291        insn = punpckl_insn[vece];
3292        goto gen_simd;
3293    case INDEX_op_x86_punpckh_vec:
3294        insn = punpckh_insn[vece];
3295        goto gen_simd;
3296    case INDEX_op_x86_packss_vec:
3297        insn = packss_insn[vece];
3298        goto gen_simd;
3299    case INDEX_op_x86_packus_vec:
3300        insn = packus_insn[vece];
3301        goto gen_simd;
3302    case INDEX_op_x86_vpshldv_vec:
3303        insn = vpshldv_insn[vece];
3304        a1 = a2;
3305        a2 = args[3];
3306        goto gen_simd;
3307    case INDEX_op_x86_vpshrdv_vec:
3308        insn = vpshrdv_insn[vece];
3309        a1 = a2;
3310        a2 = args[3];
3311        goto gen_simd;
3312#if TCG_TARGET_REG_BITS == 32
3313    case INDEX_op_dup2_vec:
3314        /* First merge the two 32-bit inputs to a single 64-bit element. */
3315        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3316        /* Then replicate the 64-bit elements across the rest of the vector. */
3317        if (type != TCG_TYPE_V64) {
3318            tcg_out_dup_vec(s, type, MO_64, a0, a0);
3319        }
3320        break;
3321#endif
3322    case INDEX_op_abs_vec:
3323        insn = abs_insn[vece];
3324        a2 = a1;
3325        a1 = 0;
3326        goto gen_simd;
3327    gen_simd:
3328        tcg_debug_assert(insn != OPC_UD2);
3329        tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type);
3330        break;
3331
3332    case INDEX_op_cmp_vec:
3333        tcg_out_cmp_vec(s, type, vece, a0, a1, a2, args[3]);
3334        break;
3335
3336    case INDEX_op_cmpsel_vec:
3337        tcg_out_cmpsel_vec(s, type, vece, a0, a1, a2,
3338                           args[3], args[4], args[5]);
3339        break;
3340
3341    case INDEX_op_andc_vec:
3342        insn = OPC_PANDN;
3343        tcg_out_vex_modrm_type(s, insn, a0, a2, a1, type);
3344        break;
3345
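    /*
     * The immediate shift forms share one opcode per element size; the
     * ModRM reg field selects the operation: /6 = shift left, /2 = shift
     * right logical, /4 = shift right arithmetic, /1 = rotate left
     * (EVEX VPROLD/VPROLQ).
     */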
3346    case INDEX_op_shli_vec:
3347        insn = shift_imm_insn[vece];
3348        sub = 6;
3349        goto gen_shift;
3350    case INDEX_op_shri_vec:
3351        insn = shift_imm_insn[vece];
3352        sub = 2;
3353        goto gen_shift;
3354    case INDEX_op_sari_vec:
3355        if (vece == MO_64) {
3356            insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3357        } else {
3358            insn = shift_imm_insn[vece];
3359        }
3360        sub = 4;
3361        goto gen_shift;
3362    case INDEX_op_rotli_vec:
3363        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
3364        if (vece == MO_64) {
3365            insn |= P_VEXW;
3366        }
3367        sub = 1;
3368        goto gen_shift;
3369    gen_shift:
3370        tcg_debug_assert(vece != MO_8);
3371        tcg_out_vex_modrm_type(s, insn, sub, a0, a1, type);
3372        tcg_out8(s, a2);
3373        break;
3374
3375    case INDEX_op_ld_vec:
3376        tcg_out_ld(s, type, a0, a1, a2);
3377        break;
3378    case INDEX_op_st_vec:
3379        tcg_out_st(s, type, a0, a1, a2);
3380        break;
3381    case INDEX_op_dupm_vec:
3382        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3383        break;
3384
3385    case INDEX_op_x86_shufps_vec:
3386        insn = OPC_SHUFPS;
3387        sub = args[3];
3388        goto gen_simd_imm8;
3389    case INDEX_op_x86_blend_vec:
3390        if (vece == MO_16) {
3391            insn = OPC_PBLENDW;
3392        } else if (vece == MO_32) {
3393            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3394        } else {
3395            g_assert_not_reached();
3396        }
3397        sub = args[3];
3398        goto gen_simd_imm8;
3399    case INDEX_op_x86_vperm2i128_vec:
3400        insn = OPC_VPERM2I128;
3401        sub = args[3];
3402        goto gen_simd_imm8;
3403    case INDEX_op_x86_vpshldi_vec:
3404        insn = vpshldi_insn[vece];
3405        sub = args[3];
3406        goto gen_simd_imm8;
3407
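    /*
     * The VPTERNLOGQ immediate is the truth table of the desired
     * three-input boolean function, evaluated with A = 0xf0, B = 0xcc,
     * C = 0xaa.  For instance, !B is ~0xcc = 0x33 and A ? B : C is
     * (A & B) | (~A & C) = 0xca.
     */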
3408    case INDEX_op_not_vec:
3409        insn = OPC_VPTERNLOGQ;
3410        a2 = a1;
3411        sub = 0x33; /* !B */
3412        goto gen_simd_imm8;
3413    case INDEX_op_nor_vec:
3414        insn = OPC_VPTERNLOGQ;
3415        sub = 0x11; /* norCB */
3416        goto gen_simd_imm8;
3417    case INDEX_op_nand_vec:
3418        insn = OPC_VPTERNLOGQ;
3419        sub = 0x77; /* nandCB */
3420        goto gen_simd_imm8;
3421    case INDEX_op_eqv_vec:
3422        insn = OPC_VPTERNLOGQ;
3423        sub = 0x99; /* xnorCB */
3424        goto gen_simd_imm8;
3425    case INDEX_op_orc_vec:
3426        insn = OPC_VPTERNLOGQ;
3427        sub = 0xdd; /* orB!C */
3428        goto gen_simd_imm8;
3429
3430    case INDEX_op_bitsel_vec:
3431        insn = OPC_VPTERNLOGQ;
3432        a3 = args[3];
3433        if (a0 == a1) {
3434            a1 = a2;
3435            a2 = a3;
3436            sub = 0xca; /* A?B:C */
3437        } else if (a0 == a2) {
3438            a2 = a3;
3439            sub = 0xe2; /* B?A:C */
3440        } else {
3441            tcg_out_mov(s, type, a0, a3);
3442            sub = 0xb8; /* B?C:A */
3443        }
3444        goto gen_simd_imm8;
3445
3446    gen_simd_imm8:
3447        tcg_debug_assert(insn != OPC_UD2);
3448        tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type);
3449        tcg_out8(s, sub);
3450        break;
3451
3452    case INDEX_op_x86_psrldq_vec:
3453        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3454        tcg_out8(s, a2);
3455        break;
3456
3457    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3458    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3459    default:
3460        g_assert_not_reached();
3461    }
3462}
3463
3464static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3465{
3466    switch (op) {
3467    case INDEX_op_goto_ptr:
3468        return C_O0_I1(r);
3469
3470    case INDEX_op_ld8u_i32:
3471    case INDEX_op_ld8u_i64:
3472    case INDEX_op_ld8s_i32:
3473    case INDEX_op_ld8s_i64:
3474    case INDEX_op_ld16u_i32:
3475    case INDEX_op_ld16u_i64:
3476    case INDEX_op_ld16s_i32:
3477    case INDEX_op_ld16s_i64:
3478    case INDEX_op_ld_i32:
3479    case INDEX_op_ld32u_i64:
3480    case INDEX_op_ld32s_i64:
3481    case INDEX_op_ld_i64:
3482        return C_O1_I1(r, r);
3483
3484    case INDEX_op_st8_i32:
3485    case INDEX_op_st8_i64:
3486        return C_O0_I2(qi, r);
3487
3488    case INDEX_op_st16_i32:
3489    case INDEX_op_st16_i64:
3490    case INDEX_op_st_i32:
3491    case INDEX_op_st32_i64:
3492        return C_O0_I2(ri, r);
3493
3494    case INDEX_op_st_i64:
3495        return C_O0_I2(re, r);
3496
3497    case INDEX_op_add_i32:
3498    case INDEX_op_add_i64:
3499        return C_O1_I2(r, r, re);
3500
3501    case INDEX_op_sub_i32:
3502    case INDEX_op_sub_i64:
3503    case INDEX_op_mul_i32:
3504    case INDEX_op_mul_i64:
3505    case INDEX_op_or_i32:
3506    case INDEX_op_or_i64:
3507    case INDEX_op_xor_i32:
3508    case INDEX_op_xor_i64:
3509        return C_O1_I2(r, 0, re);
3510
3511    case INDEX_op_and_i32:
3512    case INDEX_op_and_i64:
3513        return C_O1_I2(r, 0, reZ);
3514
3515    case INDEX_op_andc_i32:
3516    case INDEX_op_andc_i64:
3517        return C_O1_I2(r, r, rI);
3518
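    /*
     * Without BMI2 a variable shift count must live in %cl; SHLX/SHRX/SARX
     * accept the count in any register and a separate destination.
     */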
3519    case INDEX_op_shl_i32:
3520    case INDEX_op_shl_i64:
3521    case INDEX_op_shr_i32:
3522    case INDEX_op_shr_i64:
3523    case INDEX_op_sar_i32:
3524    case INDEX_op_sar_i64:
3525        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3526
3527    case INDEX_op_rotl_i32:
3528    case INDEX_op_rotl_i64:
3529    case INDEX_op_rotr_i32:
3530    case INDEX_op_rotr_i64:
3531        return C_O1_I2(r, 0, ci);
3532
3533    case INDEX_op_brcond_i32:
3534    case INDEX_op_brcond_i64:
3535        return C_O0_I2(r, reT);
3536
3537    case INDEX_op_bswap16_i32:
3538    case INDEX_op_bswap16_i64:
3539    case INDEX_op_bswap32_i32:
3540    case INDEX_op_bswap32_i64:
3541    case INDEX_op_bswap64_i64:
3542    case INDEX_op_neg_i32:
3543    case INDEX_op_neg_i64:
3544    case INDEX_op_not_i32:
3545    case INDEX_op_not_i64:
3546    case INDEX_op_extrh_i64_i32:
3547        return C_O1_I1(r, 0);
3548
3549    case INDEX_op_ext8s_i32:
3550    case INDEX_op_ext8s_i64:
3551    case INDEX_op_ext8u_i32:
3552    case INDEX_op_ext8u_i64:
3553        return C_O1_I1(r, q);
3554
3555    case INDEX_op_ext16s_i32:
3556    case INDEX_op_ext16s_i64:
3557    case INDEX_op_ext16u_i32:
3558    case INDEX_op_ext16u_i64:
3559    case INDEX_op_ext32s_i64:
3560    case INDEX_op_ext32u_i64:
3561    case INDEX_op_ext_i32_i64:
3562    case INDEX_op_extu_i32_i64:
3563    case INDEX_op_extrl_i64_i32:
3564    case INDEX_op_extract_i32:
3565    case INDEX_op_extract_i64:
3566    case INDEX_op_sextract_i32:
3567    case INDEX_op_ctpop_i32:
3568    case INDEX_op_ctpop_i64:
3569        return C_O1_I1(r, r);
3570
3571    case INDEX_op_extract2_i32:
3572    case INDEX_op_extract2_i64:
3573        return C_O1_I2(r, 0, r);
3574
3575    case INDEX_op_deposit_i32:
3576    case INDEX_op_deposit_i64:
3577        return C_O1_I2(q, 0, qi);
3578
3579    case INDEX_op_setcond_i32:
3580    case INDEX_op_setcond_i64:
3581    case INDEX_op_negsetcond_i32:
3582    case INDEX_op_negsetcond_i64:
3583        return C_O1_I2(q, r, reT);
3584
3585    case INDEX_op_movcond_i32:
3586    case INDEX_op_movcond_i64:
3587        return C_O1_I4(r, r, reT, r, 0);
3588
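    /*
     * div/idiv consume the dividend from %edx:%eax and leave the quotient
     * in %eax and the remainder in %edx, hence the fixed-register outputs.
     */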
3589    case INDEX_op_div2_i32:
3590    case INDEX_op_div2_i64:
3591    case INDEX_op_divu2_i32:
3592    case INDEX_op_divu2_i64:
3593        return C_O2_I3(a, d, 0, 1, r);
3594
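    /* Likewise, one-operand mul/imul produce the double-width product
       in %edx:%eax. */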
3595    case INDEX_op_mulu2_i32:
3596    case INDEX_op_mulu2_i64:
3597    case INDEX_op_muls2_i32:
3598    case INDEX_op_muls2_i64:
3599        return C_O2_I2(a, d, a, r);
3600
3601    case INDEX_op_add2_i32:
3602    case INDEX_op_add2_i64:
3603    case INDEX_op_sub2_i32:
3604    case INDEX_op_sub2_i64:
3605        return C_N1_O1_I4(r, r, 0, 1, re, re);
3606
3607    case INDEX_op_ctz_i32:
3608    case INDEX_op_ctz_i64:
3609        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3610
3611    case INDEX_op_clz_i32:
3612    case INDEX_op_clz_i64:
3613        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3614
3615    case INDEX_op_qemu_ld_a32_i32:
3616        return C_O1_I1(r, L);
3617    case INDEX_op_qemu_ld_a64_i32:
3618        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3619
3620    case INDEX_op_qemu_st_a32_i32:
3621        return C_O0_I2(L, L);
3622    case INDEX_op_qemu_st_a64_i32:
3623        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3624    case INDEX_op_qemu_st8_a32_i32:
3625        return C_O0_I2(s, L);
3626    case INDEX_op_qemu_st8_a64_i32:
3627        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3628
3629    case INDEX_op_qemu_ld_a32_i64:
3630        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3631    case INDEX_op_qemu_ld_a64_i64:
3632        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3633
3634    case INDEX_op_qemu_st_a32_i64:
3635        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3636    case INDEX_op_qemu_st_a64_i64:
3637        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3638
3639    case INDEX_op_qemu_ld_a32_i128:
3640    case INDEX_op_qemu_ld_a64_i128:
3641        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3642        return C_O2_I1(r, r, L);
3643    case INDEX_op_qemu_st_a32_i128:
3644    case INDEX_op_qemu_st_a64_i128:
3645        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3646        return C_O0_I3(L, L, L);
3647
3648    case INDEX_op_brcond2_i32:
3649        return C_O0_I4(r, r, ri, ri);
3650
3651    case INDEX_op_setcond2_i32:
3652        return C_O1_I4(r, r, r, ri, ri);
3653
3654    case INDEX_op_ld_vec:
3655    case INDEX_op_dupm_vec:
3656        return C_O1_I1(x, r);
3657
3658    case INDEX_op_st_vec:
3659        return C_O0_I2(x, r);
3660
3661    case INDEX_op_add_vec:
3662    case INDEX_op_sub_vec:
3663    case INDEX_op_mul_vec:
3664    case INDEX_op_and_vec:
3665    case INDEX_op_or_vec:
3666    case INDEX_op_xor_vec:
3667    case INDEX_op_andc_vec:
3668    case INDEX_op_orc_vec:
3669    case INDEX_op_nand_vec:
3670    case INDEX_op_nor_vec:
3671    case INDEX_op_eqv_vec:
3672    case INDEX_op_ssadd_vec:
3673    case INDEX_op_usadd_vec:
3674    case INDEX_op_sssub_vec:
3675    case INDEX_op_ussub_vec:
3676    case INDEX_op_smin_vec:
3677    case INDEX_op_umin_vec:
3678    case INDEX_op_smax_vec:
3679    case INDEX_op_umax_vec:
3680    case INDEX_op_shlv_vec:
3681    case INDEX_op_shrv_vec:
3682    case INDEX_op_sarv_vec:
3683    case INDEX_op_rotlv_vec:
3684    case INDEX_op_rotrv_vec:
3685    case INDEX_op_shls_vec:
3686    case INDEX_op_shrs_vec:
3687    case INDEX_op_sars_vec:
3688    case INDEX_op_cmp_vec:
3689    case INDEX_op_x86_shufps_vec:
3690    case INDEX_op_x86_blend_vec:
3691    case INDEX_op_x86_packss_vec:
3692    case INDEX_op_x86_packus_vec:
3693    case INDEX_op_x86_vperm2i128_vec:
3694    case INDEX_op_x86_punpckl_vec:
3695    case INDEX_op_x86_punpckh_vec:
3696    case INDEX_op_x86_vpshldi_vec:
3697#if TCG_TARGET_REG_BITS == 32
3698    case INDEX_op_dup2_vec:
3699#endif
3700        return C_O1_I2(x, x, x);
3701
3702    case INDEX_op_abs_vec:
3703    case INDEX_op_dup_vec:
3704    case INDEX_op_not_vec:
3705    case INDEX_op_shli_vec:
3706    case INDEX_op_shri_vec:
3707    case INDEX_op_sari_vec:
3708    case INDEX_op_rotli_vec:
3709    case INDEX_op_x86_psrldq_vec:
3710        return C_O1_I1(x, x);
3711
3712    case INDEX_op_x86_vpshldv_vec:
3713    case INDEX_op_x86_vpshrdv_vec:
3714        return C_O1_I3(x, 0, x, x);
3715
3716    case INDEX_op_bitsel_vec:
3717        return C_O1_I3(x, x, x, x);
3718    case INDEX_op_cmpsel_vec:
3719        return C_O1_I4(x, x, x, x, x);
3720
3721    default:
3722        g_assert_not_reached();
3723    }
3724}
3725
3726int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3727{
3728    switch (opc) {
3729    case INDEX_op_add_vec:
3730    case INDEX_op_sub_vec:
3731    case INDEX_op_and_vec:
3732    case INDEX_op_or_vec:
3733    case INDEX_op_xor_vec:
3734    case INDEX_op_andc_vec:
3735    case INDEX_op_orc_vec:
3736    case INDEX_op_nand_vec:
3737    case INDEX_op_nor_vec:
3738    case INDEX_op_eqv_vec:
3739    case INDEX_op_not_vec:
3740    case INDEX_op_bitsel_vec:
3741        return 1;
3742    case INDEX_op_cmp_vec:
3743    case INDEX_op_cmpsel_vec:
3744        return -1;
3745
3746    case INDEX_op_rotli_vec:
3747        return have_avx512vl && vece >= MO_32 ? 1 : -1;
3748
3749    case INDEX_op_shli_vec:
3750    case INDEX_op_shri_vec:
3751        /* We must expand the operation for MO_8.  */
3752        return vece == MO_8 ? -1 : 1;
3753
3754    case INDEX_op_sari_vec:
3755        switch (vece) {
3756        case MO_8:
3757            return -1;
3758        case MO_16:
3759        case MO_32:
3760            return 1;
3761        case MO_64:
3762            if (have_avx512vl) {
3763                return 1;
3764            }
3765            /*
3766             * We can emulate this for MO_64, but it does not pay off
3767             * unless we're producing at least 4 values.
3768             */
3769            return type >= TCG_TYPE_V256 ? -1 : 0;
3770        }
3771        return 0;
3772
3773    case INDEX_op_shls_vec:
3774    case INDEX_op_shrs_vec:
3775        return vece >= MO_16;
3776    case INDEX_op_sars_vec:
3777        switch (vece) {
3778        case MO_16:
3779        case MO_32:
3780            return 1;
3781        case MO_64:
3782            return have_avx512vl;
3783        }
3784        return 0;
3785    case INDEX_op_rotls_vec:
3786        return vece >= MO_16 ? -1 : 0;
3787
3788    case INDEX_op_shlv_vec:
3789    case INDEX_op_shrv_vec:
3790        switch (vece) {
3791        case MO_16:
3792            return have_avx512bw;
3793        case MO_32:
3794        case MO_64:
3795            return have_avx2;
3796        }
3797        return 0;
3798    case INDEX_op_sarv_vec:
3799        switch (vece) {
3800        case MO_16:
3801            return have_avx512bw;
3802        case MO_32:
3803            return have_avx2;
3804        case MO_64:
3805            return have_avx512vl;
3806        }
3807        return 0;
3808    case INDEX_op_rotlv_vec:
3809    case INDEX_op_rotrv_vec:
3810        switch (vece) {
3811        case MO_16:
3812            return have_avx512vbmi2 ? -1 : 0;
3813        case MO_32:
3814        case MO_64:
3815            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3816        }
3817        return 0;
3818
3819    case INDEX_op_mul_vec:
3820        switch (vece) {
3821        case MO_8:
3822            return -1;
3823        case MO_64:
3824            return have_avx512dq;
3825        }
3826        return 1;
3827
3828    case INDEX_op_ssadd_vec:
3829    case INDEX_op_usadd_vec:
3830    case INDEX_op_sssub_vec:
3831    case INDEX_op_ussub_vec:
3832        return vece <= MO_16;
3833    case INDEX_op_smin_vec:
3834    case INDEX_op_smax_vec:
3835    case INDEX_op_umin_vec:
3836    case INDEX_op_umax_vec:
3837    case INDEX_op_abs_vec:
3838        return vece <= MO_32 || have_avx512vl;
3839
3840    default:
3841        return 0;
3842    }
3843}
3844
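/*
 * x86 has no byte-element shifts, so shift the containing 16-bit lanes
 * and mask off the bits that crossed in from the neighbouring byte.
 */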
3845static void expand_vec_shi(TCGType type, unsigned vece, bool right,
3846                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3847{
3848    uint8_t mask;
3849
3850    tcg_debug_assert(vece == MO_8);
3851    if (right) {
3852        mask = 0xff >> imm;
3853        tcg_gen_shri_vec(MO_16, v0, v1, imm);
3854    } else {
3855        mask = 0xff << imm;
3856        tcg_gen_shli_vec(MO_16, v0, v1, imm);
3857    }
3858    tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask));
3859}
3860
3861static void expand_vec_sari(TCGType type, unsigned vece,
3862                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3863{
3864    TCGv_vec t1, t2;
3865
3866    switch (vece) {
3867    case MO_8:
3868        /* Unpack to 16-bit, shift, and repack.  */
3869        t1 = tcg_temp_new_vec(type);
3870        t2 = tcg_temp_new_vec(type);
3871        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3872                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3873        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3874                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3875        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3876        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3877        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3878                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3879        tcg_temp_free_vec(t1);
3880        tcg_temp_free_vec(t2);
3881        break;
3882
3883    case MO_64:
3884        t1 = tcg_temp_new_vec(type);
3885        if (imm <= 32) {
3886            /*
3887             * We can emulate a small sign extend by performing an arithmetic
3888             * 32-bit shift and overwriting the high half of a 64-bit logical
3889             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3890             * does not, so we have to bound the smaller shift -- we get the
3891             * same result in the high half either way.
3892             */
3893            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3894            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3895            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3896                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3897                      tcgv_vec_arg(t1), 0xaa);
3898        } else {
3899            /* Otherwise we will need to use a compare vs 0 to produce
3900             * the sign-extend, shift and merge.
3901             */
3902            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3903                            tcg_constant_vec(type, MO_64, 0), v1);
3904            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3905            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3906            tcg_gen_or_vec(MO_64, v0, v0, t1);
3907        }
3908        tcg_temp_free_vec(t1);
3909        break;
3910
3911    default:
3912        g_assert_not_reached();
3913    }
3914}
3915
3916static void expand_vec_rotli(TCGType type, unsigned vece,
3917                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3918{
3919    TCGv_vec t;
3920
3921    if (vece != MO_8 && have_avx512vbmi2) {
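        /* A funnel shift of v1:v1 by imm is a rotate left of v1 by imm. */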
3922        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3923                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3924        return;
3925    }
3926
3927    t = tcg_temp_new_vec(type);
3928    tcg_gen_shli_vec(vece, t, v1, imm);
3929    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3930    tcg_gen_or_vec(vece, v0, v0, t);
3931    tcg_temp_free_vec(t);
3932}
3933
3934static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3935                            TCGv_vec v1, TCGv_vec sh, bool right)
3936{
3937    TCGv_vec t;
3938
3939    if (have_avx512vbmi2) {
3940        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3941                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3942                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3943        return;
3944    }
3945
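    /*
     * Otherwise express the rotate as two variable shifts combined with
     * OR: rotl(v1, sh) == (v1 << sh) | (v1 >> (width - sh)).  The variable
     * shifts produce 0 for counts >= the element width, so sh == 0 needs
     * no special case.
     */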
3946    t = tcg_temp_new_vec(type);
3947    tcg_gen_dupi_vec(vece, t, 8 << vece);
3948    tcg_gen_sub_vec(vece, t, t, sh);
3949    if (right) {
3950        tcg_gen_shlv_vec(vece, t, v1, t);
3951        tcg_gen_shrv_vec(vece, v0, v1, sh);
3952    } else {
3953        tcg_gen_shrv_vec(vece, t, v1, t);
3954        tcg_gen_shlv_vec(vece, v0, v1, sh);
3955    }
3956    tcg_gen_or_vec(vece, v0, v0, t);
3957    tcg_temp_free_vec(t);
3958}
3959
3960static void expand_vec_rotls(TCGType type, unsigned vece,
3961                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3962{
3963    TCGv_vec t = tcg_temp_new_vec(type);
3964
3965    tcg_debug_assert(vece != MO_8);
3966
3967    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3968        tcg_gen_dup_i32_vec(vece, t, lsh);
3969        if (vece >= MO_32) {
3970            tcg_gen_rotlv_vec(vece, v0, v1, t);
3971        } else {
3972            expand_vec_rotv(type, vece, v0, v1, t, false);
3973        }
3974    } else {
3975        TCGv_i32 rsh = tcg_temp_new_i32();
3976
3977        tcg_gen_neg_i32(rsh, lsh);
3978        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3979        tcg_gen_shls_vec(vece, t, v1, lsh);
3980        tcg_gen_shrs_vec(vece, v0, v1, rsh);
3981        tcg_gen_or_vec(vece, v0, v0, t);
3982
3983        tcg_temp_free_i32(rsh);
3984    }
3985
3986    tcg_temp_free_vec(t);
3987}
3988
3989static void expand_vec_mul(TCGType type, unsigned vece,
3990                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3991{
3992    TCGv_vec t1, t2, t3, t4, zero;
3993
3994    tcg_debug_assert(vece == MO_8);
3995
3996    /*
3997     * Unpack v1 bytes to words, 0 | x.
3998     * Unpack v2 bytes to words, y | 0.
3999     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
4000     * Shift logical right by 8 bits to clear the high 8 bits before
4001     * using an unsigned saturated pack.
4002     *
4003     * The difference between the V64, V128 and V256 cases is merely how
4004     * we distribute the expansion between temporaries.
4005     */
4006    switch (type) {
4007    case TCG_TYPE_V64:
4008        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
4009        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
4010        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
4011        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
4012                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
4013        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
4014                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
4015        tcg_gen_mul_vec(MO_16, t1, t1, t2);
4016        tcg_gen_shri_vec(MO_16, t1, t1, 8);
4017        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
4018                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
4019        tcg_temp_free_vec(t1);
4020        tcg_temp_free_vec(t2);
4021        break;
4022
4023    case TCG_TYPE_V128:
4024    case TCG_TYPE_V256:
4025        t1 = tcg_temp_new_vec(type);
4026        t2 = tcg_temp_new_vec(type);
4027        t3 = tcg_temp_new_vec(type);
4028        t4 = tcg_temp_new_vec(type);
4029        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
4030        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
4031                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
4032        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
4033                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
4034        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
4035                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
4036        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
4037                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
4038        tcg_gen_mul_vec(MO_16, t1, t1, t2);
4039        tcg_gen_mul_vec(MO_16, t3, t3, t4);
4040        tcg_gen_shri_vec(MO_16, t1, t1, 8);
4041        tcg_gen_shri_vec(MO_16, t3, t3, 8);
4042        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
4043                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
4044        tcg_temp_free_vec(t1);
4045        tcg_temp_free_vec(t2);
4046        tcg_temp_free_vec(t3);
4047        tcg_temp_free_vec(t4);
4048        break;
4049
4050    default:
4051        g_assert_not_reached();
4052    }
4053}
4054
4055static TCGCond expand_vec_cond(TCGType type, unsigned vece,
4056                               TCGArg *a1, TCGArg *a2, TCGCond cond)
4057{
4058    /*
4059     * Without AVX512, there are no 64-bit unsigned comparisons.
4060     * We must bias the inputs so that they become signed.
4061     * All other swapping and inversion are handled during code generation.
4062     */
4063    if (vece == MO_64 && is_unsigned_cond(cond)) {
4064        TCGv_vec v1 = temp_tcgv_vec(arg_temp(*a1));
4065        TCGv_vec v2 = temp_tcgv_vec(arg_temp(*a2));
4066        TCGv_vec t1 = tcg_temp_new_vec(type);
4067        TCGv_vec t2 = tcg_temp_new_vec(type);
4068        TCGv_vec t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
4069
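        /*
         * Subtracting the sign bit (mod 2^64) flips the top bit of each
         * element, mapping unsigned ordering onto signed ordering.
         */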
4070        tcg_gen_sub_vec(vece, t1, v1, t3);
4071        tcg_gen_sub_vec(vece, t2, v2, t3);
4072        *a1 = tcgv_vec_arg(t1);
4073        *a2 = tcgv_vec_arg(t2);
4074        cond = tcg_signed_cond(cond);
4075    }
4076    return cond;
4077}
4078
4079static void expand_vec_cmp(TCGType type, unsigned vece, TCGArg a0,
4080                           TCGArg a1, TCGArg a2, TCGCond cond)
4081{
4082    cond = expand_vec_cond(type, vece, &a1, &a2, cond);
4083    /* Expand directly; do not recurse.  */
4084    vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
4085}
4086
4087static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGArg a0,
4088                              TCGArg a1, TCGArg a2,
4089                              TCGArg a3, TCGArg a4, TCGCond cond)
4090{
4091    cond = expand_vec_cond(type, vece, &a1, &a2, cond);
4092    /* Expand directly; do not recurse.  */
4093    vec_gen_6(INDEX_op_cmpsel_vec, type, vece, a0, a1, a2, a3, a4, cond);
4094}
4095
4096void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
4097                       TCGArg a0, ...)
4098{
4099    va_list va;
4100    TCGArg a1, a2, a3, a4, a5;
4101    TCGv_vec v0, v1, v2;
4102
4103    va_start(va, a0);
4104    a1 = va_arg(va, TCGArg);
4105    a2 = va_arg(va, TCGArg);
4106    v0 = temp_tcgv_vec(arg_temp(a0));
4107    v1 = temp_tcgv_vec(arg_temp(a1));
4108
4109    switch (opc) {
4110    case INDEX_op_shli_vec:
4111        expand_vec_shi(type, vece, false, v0, v1, a2);
4112        break;
4113    case INDEX_op_shri_vec:
4114        expand_vec_shi(type, vece, true, v0, v1, a2);
4115        break;
4116    case INDEX_op_sari_vec:
4117        expand_vec_sari(type, vece, v0, v1, a2);
4118        break;
4119
4120    case INDEX_op_rotli_vec:
4121        expand_vec_rotli(type, vece, v0, v1, a2);
4122        break;
4123
4124    case INDEX_op_rotls_vec:
4125        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
4126        break;
4127
4128    case INDEX_op_rotlv_vec:
4129        v2 = temp_tcgv_vec(arg_temp(a2));
4130        expand_vec_rotv(type, vece, v0, v1, v2, false);
4131        break;
4132    case INDEX_op_rotrv_vec:
4133        v2 = temp_tcgv_vec(arg_temp(a2));
4134        expand_vec_rotv(type, vece, v0, v1, v2, true);
4135        break;
4136
4137    case INDEX_op_mul_vec:
4138        v2 = temp_tcgv_vec(arg_temp(a2));
4139        expand_vec_mul(type, vece, v0, v1, v2);
4140        break;
4141
4142    case INDEX_op_cmp_vec:
4143        a3 = va_arg(va, TCGArg);
4144        expand_vec_cmp(type, vece, a0, a1, a2, a3);
4145        break;
4146
4147    case INDEX_op_cmpsel_vec:
4148        a3 = va_arg(va, TCGArg);
4149        a4 = va_arg(va, TCGArg);
4150        a5 = va_arg(va, TCGArg);
4151        expand_vec_cmpsel(type, vece, a0, a1, a2, a3, a4, a5);
4152        break;
4153
4154    default:
4155        break;
4156    }
4157
4158    va_end(va);
4159}
4160
4161static const int tcg_target_callee_save_regs[] = {
4162#if TCG_TARGET_REG_BITS == 64
4163    TCG_REG_RBP,
4164    TCG_REG_RBX,
4165#if defined(_WIN64)
4166    TCG_REG_RDI,
4167    TCG_REG_RSI,
4168#endif
4169    TCG_REG_R12,
4170    TCG_REG_R13,
4171    TCG_REG_R14, /* Currently used for the global env. */
4172    TCG_REG_R15,
4173#else
4174    TCG_REG_EBP, /* Currently used for the global env. */
4175    TCG_REG_EBX,
4176    TCG_REG_ESI,
4177    TCG_REG_EDI,
4178#endif
4179};
4180
4181/* Compute frame size via macros, to share between tcg_target_qemu_prologue
4182   and tcg_register_jit.  */
4183
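/* The '1 +' below accounts for the return address pushed by the call into
   the generated code, on top of the callee-saved registers. */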
4184#define PUSH_SIZE \
4185    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
4186     * (TCG_TARGET_REG_BITS / 8))
4187
4188#define FRAME_SIZE \
4189    ((PUSH_SIZE \
4190      + TCG_STATIC_CALL_ARGS_SIZE \
4191      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
4192      + TCG_TARGET_STACK_ALIGN - 1) \
4193     & ~(TCG_TARGET_STACK_ALIGN - 1))
4194
4195/* Generate global QEMU prologue and epilogue code */
4196static void tcg_target_qemu_prologue(TCGContext *s)
4197{
4198    int i, stack_addend;
4199
4200    /* TB prologue */
4201
4202    /* Reserve some stack space, also for TCG temps.  */
4203    stack_addend = FRAME_SIZE - PUSH_SIZE;
4204    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4205                  CPU_TEMP_BUF_NLONGS * sizeof(long));
4206
4207    /* Save all callee saved registers.  */
4208    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4209        tcg_out_push(s, tcg_target_callee_save_regs[i]);
4210    }
4211
4212    if (!tcg_use_softmmu && guest_base) {
4213        int seg = setup_guest_base_seg();
4214        if (seg != 0) {
4215            x86_guest_base.seg = seg;
4216        } else if (guest_base == (int32_t)guest_base) {
4217            x86_guest_base.ofs = guest_base;
4218        } else {
4219            assert(TCG_TARGET_REG_BITS == 64);
4220            /* Choose R12 because, as a base, it requires a SIB byte. */
4221            x86_guest_base.index = TCG_REG_R12;
4222            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
4223            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4224        }
4225    }
4226
4227    if (TCG_TARGET_REG_BITS == 32) {
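        /*
         * On i386 the arguments (env, tb pointer) arrive on the stack:
         * skip the pushed callee-saved registers and the return address
         * to reach the first argument, and one more slot for the second.
         */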
4228        tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4229                   (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4230        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4231        /* jmp *tb.  */
4232        tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4233                             (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4234                             + stack_addend);
4235    } else {
4236        tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4237        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4238        /* jmp *tb.  */
4239        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4240    }
4241
4242    /*
4243     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4244     * and fall through to the rest of the epilogue.
4245     */
4246    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4247    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4248
4249    /* TB epilogue */
4250    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4251
4252    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4253
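    /* Clear the upper YMM state to avoid AVX/SSE transition penalties
       in the (possibly SSE-only) code we return to. */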
4254    if (have_avx2) {
4255        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4256    }
4257    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4258        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4259    }
4260    tcg_out_opc(s, OPC_RET, 0, 0, 0);
4261}
4262
4263static void tcg_out_tb_start(TCGContext *s)
4264{
4265    /* nothing to do */
4266}
4267
4268static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4269{
4270    memset(p, 0x90, count);
4271}
4272
4273static void tcg_target_init(TCGContext *s)
4274{
4275    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4276    if (TCG_TARGET_REG_BITS == 64) {
4277        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4278    }
4279    if (have_avx1) {
4280        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4281        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4282    }
4283    if (have_avx2) {
4284        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4285    }
4286
4287    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4288    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4289    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4290    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4291    if (TCG_TARGET_REG_BITS == 64) {
4292#if !defined(_WIN64)
4293        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4294        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4295#endif
4296        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4297        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4298        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4299        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4300    }
4301
4302    s->reserved_regs = 0;
4303    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4304    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4305#ifdef _WIN64
4306    /* These are callee-saved, and we don't save them, so don't use them. */
4307    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4308    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4309    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4310    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4311    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4312    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4313    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4314    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4315    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4316    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4317#endif
4318}
4319
4320typedef struct {
4321    DebugFrameHeader h;
4322    uint8_t fde_def_cfa[4];
4323    uint8_t fde_reg_ofs[14];
4324} DebugFrame;
4325
4326/* We're expecting a 2-byte uleb128 encoded value.  */
4327QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4328
4329#if !defined(__ELF__)
4330    /* Host machine without ELF. */
4331#elif TCG_TARGET_REG_BITS == 64
4332#define ELF_HOST_MACHINE EM_X86_64
4333static const DebugFrame debug_frame = {
4334    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4335    .h.cie.id = -1,
4336    .h.cie.version = 1,
4337    .h.cie.code_align = 1,
4338    .h.cie.data_align = 0x78,             /* sleb128 -8 */
4339    .h.cie.return_column = 16,
4340
4341    /* Total FDE size does not include the "len" member.  */
4342    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4343
4344    .fde_def_cfa = {
4345        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
4346        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4347        (FRAME_SIZE >> 7)
4348    },
4349    .fde_reg_ofs = {
4350        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
4351        /* The following ordering must match tcg_target_callee_save_regs.  */
4352        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
4353        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
4354        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
4355        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
4356        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
4357        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
4358    }
4359};
4360#else
4361#define ELF_HOST_MACHINE EM_386
4362static const DebugFrame debug_frame = {
4363    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4364    .h.cie.id = -1,
4365    .h.cie.version = 1,
4366    .h.cie.code_align = 1,
4367    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
4368    .h.cie.return_column = 8,
4369
4370    /* Total FDE size does not include the "len" member.  */
4371    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4372
4373    .fde_def_cfa = {
4374        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
4375        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4376        (FRAME_SIZE >> 7)
4377    },
4378    .fde_reg_ofs = {
4379        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4380        /* The following ordering must match tcg_target_callee_save_regs.  */
4381        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4382        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4383        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4384        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
4385    }
4386};
4387#endif
4388
4389#if defined(ELF_HOST_MACHINE)
4390void tcg_register_jit(const void *buf, size_t buf_size)
4391{
4392    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
4393}
4394#endif
4395