xref: /openbmc/qemu/tcg/i386/tcg-target.c.inc (revision e68e97ce55b3d17af22dd62c3b3dc72f761b0862)
1/*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#include "../tcg-ldst.c.inc"
26#include "../tcg-pool.c.inc"
27
#ifdef CONFIG_DEBUG_TCG
/*
 * Printable names of the host registers; only built when CONFIG_DEBUG_TCG
 * is defined.  The table is sized by TCG_TARGET_NB_REGS and lists, in order:
 * the eight legacy general registers (64-bit or 32-bit spelling depending on
 * TCG_TARGET_REG_BITS), %r8-%r15, %xmm0-%xmm7 and, on 64-bit hosts only,
 * %xmm8-%xmm15.
 * NOTE(review): presumably indexed by TCGReg number -- confirm against the
 * TCGReg enum.
 */
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif
43
/*
 * Host registers listed in allocation-preference order: general registers
 * first (a different list for 64-bit and 32-bit hosts), then the vector
 * registers.  The stack pointer does not appear in either list.
 * NOTE(review): presumably walked front-to-back by the register allocator
 * -- confirm against the common TCG code.
 */
static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saved, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated
       on Win64; the registers below are listed for other hosts only.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};
93
94#define TCG_TMP_VEC  TCG_REG_XMM5
95
/*
 * Registers that carry integer call arguments, in argument order.
 *   64-bit, _WIN64:  rcx, rdx, r8, r9
 *   64-bit, other:   rdi, rsi, rdx, rcx, r8, r9
 *   32-bit:          none (the table is empty; arguments go on the stack)
 */
static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};
113
114static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
115{
116    switch (kind) {
117    case TCG_CALL_RET_NORMAL:
118        tcg_debug_assert(slot >= 0 && slot <= 1);
119        return slot ? TCG_REG_EDX : TCG_REG_EAX;
120#ifdef _WIN64
121    case TCG_CALL_RET_BY_VEC:
122        tcg_debug_assert(slot == 0);
123        return TCG_REG_XMM0;
124#endif
125    default:
126        g_assert_not_reached();
127    }
128}
129
/*
 * Constants we accept.  Each flag selects one class of immediate operand
 * tested by tcg_target_const_match().  For TCG_TYPE_I32 operands the
 * S32/U32/I32/TST classes accept any value; the notes below describe the
 * check applied to other types.
 */
#define TCG_CT_CONST_S32 0x100   /* val == (int32_t)val */
#define TCG_CT_CONST_U32 0x200   /* val == (uint32_t)val */
#define TCG_CT_CONST_I32 0x400   /* ~val == (int32_t)~val */
#define TCG_CT_CONST_WSZ 0x800   /* val == operand width: 32 for I32, else 64 */
#define TCG_CT_CONST_TST 0x1000  /* test condition and (uint32_t or power of 2) */
136
137/* Registers used with L constraint, which are the first argument
138   registers on x86_64, and two random call clobbered registers on
139   i386. */
140#if TCG_TARGET_REG_BITS == 64
141# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
142# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
143#else
144# define TCG_REG_L0 TCG_REG_EAX
145# define TCG_REG_L1 TCG_REG_EDX
146#endif
147
/*
 * Register-set bitmasks, one bit per register number (bit N <=> register N,
 * as used by the "1 << TCG_REG_*" expressions below).
 *   64-bit: bits 0-15 are the general registers, bits 16-31 the vector
 *           registers; every general register is usable as a byte register.
 *   32-bit: bits 0-7 are the general registers, bits 16-23 the vector
 *           registers; only registers 0-3 have a low-byte form.
 */
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
# define ALL_BYTEL_REGS        0x0000000fu
#endif
/* Mask of TCG_REG_L0 and TCG_REG_L1 when tcg_use_softmmu is set, else 0. */
#define SOFTMMU_RESERVE_REGS \
    (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)
159
160#define have_bmi2       (cpuinfo & CPUINFO_BMI2)
161#define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)
162
163static const tcg_insn_unit *tb_ret_addr;
164
165static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
166                        intptr_t value, intptr_t addend)
167{
168    value += addend;
169    switch(type) {
170    case R_386_PC32:
171        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
172        if (value != (int32_t)value) {
173            return false;
174        }
175        /* FALLTHRU */
176    case R_386_32:
177        tcg_patch32(code_ptr, value);
178        break;
179    case R_386_PC8:
180        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
181        if (value != (int8_t)value) {
182            return false;
183        }
184        tcg_patch8(code_ptr, value);
185        break;
186    default:
187        g_assert_not_reached();
188    }
189    return true;
190}
191
192/* test if a constant matches the constraint */
193static bool tcg_target_const_match(int64_t val, int ct,
194                                   TCGType type, TCGCond cond, int vece)
195{
196    if (ct & TCG_CT_CONST) {
197        return 1;
198    }
199    if (type == TCG_TYPE_I32) {
200        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 |
201                  TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) {
202            return 1;
203        }
204    } else {
205        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
206            return 1;
207        }
208        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
209            return 1;
210        }
211        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
212            return 1;
213        }
214        /*
215         * This will be used in combination with TCG_CT_CONST_S32,
216         * so "normal" TESTQ is already matched.  Also accept:
217         *    TESTQ -> TESTL   (uint32_t)
218         *    TESTQ -> BT      (is_power_of_2)
219         */
220        if ((ct & TCG_CT_CONST_TST)
221            && is_tst_cond(cond)
222            && (val == (uint32_t)val || is_power_of_2(val))) {
223            return 1;
224        }
225    }
226    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
227        return 1;
228    }
229    return 0;
230}
231
/* Low three bits of a register number, as placed in a ModRM or SIB field. */
# define LOWREGMASK(x)	((x) & 7)

/*
 * Flag bits OR'ed into an opcode value above its low 8 bits (the low byte
 * is the opcode byte itself).  The opcode emitters test these flags to
 * decide which prefix, escape bytes and VEX/EVEX fields to generate.
 * On 32-bit hosts the REX-related flags and P_GS are defined as 0, so
 * tests against them are always false there.
 */
#define P_EXT		0x100		/* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#define P_VEXW          0x1000          /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_REXW		0
# define P_REXB_R	0
# define P_REXB_RM	0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
#define P_EVEX          0x100000        /* Requires EVEX encoding */
254
255#define OPC_ARITH_EbIb	(0x80)
256#define OPC_ARITH_EvIz	(0x81)
257#define OPC_ARITH_EvIb	(0x83)
258#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
259#define OPC_ANDN        (0xf2 | P_EXT38)
260#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
261#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
262#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
263#define OPC_BSF         (0xbc | P_EXT)
264#define OPC_BSR         (0xbd | P_EXT)
265#define OPC_BSWAP	(0xc8 | P_EXT)
266#define OPC_CALL_Jz	(0xe8)
267#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
268#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
269#define OPC_DEC_r32	(0x48)
270#define OPC_IMUL_GvEv	(0xaf | P_EXT)
271#define OPC_IMUL_GvEvIb	(0x6b)
272#define OPC_IMUL_GvEvIz	(0x69)
273#define OPC_INC_r32	(0x40)
274#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
275#define OPC_JCC_short	(0x70)		/* ... plus condition code */
276#define OPC_JMP_long	(0xe9)
277#define OPC_JMP_short	(0xeb)
278#define OPC_LEA         (0x8d)
279#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
280#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
281#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
282#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
283#define OPC_MOVB_EvIz   (0xc6)
284#define OPC_MOVL_EvIz	(0xc7)
285#define OPC_MOVB_Ib     (0xb0)
286#define OPC_MOVL_Iv     (0xb8)
287#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
288#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
289#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
290#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
291#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
292#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
293#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
294#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
295#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
296#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
297#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
298#define OPC_MOVSBL	(0xbe | P_EXT)
299#define OPC_MOVSWL	(0xbf | P_EXT)
300#define OPC_MOVSLQ	(0x63 | P_REXW)
301#define OPC_MOVZBL	(0xb6 | P_EXT)
302#define OPC_MOVZWL	(0xb7 | P_EXT)
303#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
304#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
305#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
306#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
307#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
308#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
309#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
310#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
311#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
312#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
313#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
314#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
315#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
316#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
317#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
318#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
319#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
320#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
321#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
322#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
323#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
324#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
325#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
326#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
327#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
328#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
329#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
330#define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
331#define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
332#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
333#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
334#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
335#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
336#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
337#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
338#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
339#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
340#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
341#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
342#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
343#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
344#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
345#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
346#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
347#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
348#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
349#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
350#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
351#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
352#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
353#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
354#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
355#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
356#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
357#define OPC_POR         (0xeb | P_EXT | P_DATA16)
358#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
359#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
360#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
361#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
362#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
363#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
364#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
365#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
366#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
367#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
368#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
369#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
370#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
371#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
372#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
373#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
374#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
375#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
376#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
377#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
378#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
379#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
380#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
381#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
382#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
383#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
384#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
385#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
386#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
387#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
388#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
389#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
390#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
391#define OPC_POP_r32	(0x58)
392#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
393#define OPC_PUSH_r32	(0x50)
394#define OPC_PUSH_Iv	(0x68)
395#define OPC_PUSH_Ib	(0x6a)
396#define OPC_RET		(0xc3)
397#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
398#define OPC_SHIFT_1	(0xd1)
399#define OPC_SHIFT_Ib	(0xc1)
400#define OPC_SHIFT_cl	(0xd3)
401#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
402#define OPC_SHUFPS      (0xc6 | P_EXT)
403#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
404#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
405#define OPC_SHRD_Ib     (0xac | P_EXT)
406#define OPC_TESTB	(0x84)
407#define OPC_TESTL	(0x85)
408#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
409#define OPC_UD2         (0x0b | P_EXT)
410#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
411#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
412#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
413#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
414#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
415#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
416#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
417#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
418#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
419#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
420#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
421#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
422#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
423#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
424#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
425#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
426#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
427#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
428#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
429#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
430#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
431#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
432#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
433#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
434#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
435#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
436#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
437#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
438#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
439#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
440#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
441#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
442#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
443#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
444#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
445#define OPC_VZEROUPPER  (0x77 | P_EXT)
446#define OPC_XCHG_ax_r32	(0x90)
447#define OPC_XCHG_EvGv   (0x87)
448
449#define OPC_GRP3_Eb     (0xf6)
450#define OPC_GRP3_Ev     (0xf7)
451#define OPC_GRP5        (0xff)
452#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
453#define OPC_GRPBT       (0xba | P_EXT)
454
455#define OPC_GRPBT_BT    4
456#define OPC_GRPBT_BTS   5
457#define OPC_GRPBT_BTR   6
458#define OPC_GRPBT_BTC   7
459
460/* Group 1 opcode extensions for 0x80-0x83.
461   These are also used as modifiers for OPC_ARITH.  */
462#define ARITH_ADD 0
463#define ARITH_OR  1
464#define ARITH_ADC 2
465#define ARITH_SBB 3
466#define ARITH_AND 4
467#define ARITH_SUB 5
468#define ARITH_XOR 6
469#define ARITH_CMP 7
470
471/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
472#define SHIFT_ROL 0
473#define SHIFT_ROR 1
474#define SHIFT_SHL 4
475#define SHIFT_SHR 5
476#define SHIFT_SAR 7
477
478/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
479#define EXT3_TESTi 0
480#define EXT3_NOT   2
481#define EXT3_NEG   3
482#define EXT3_MUL   4
483#define EXT3_IMUL  5
484#define EXT3_DIV   6
485#define EXT3_IDIV  7
486
487/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
488#define EXT5_INC_Ev	0
489#define EXT5_DEC_Ev	1
490#define EXT5_CALLN_Ev	2
491#define EXT5_JMPN_Ev	4
492
493/* Condition codes to be added to OPC_JCC_{long,short}.  */
494#define JCC_JMP (-1)
495#define JCC_JO  0x0
496#define JCC_JNO 0x1
497#define JCC_JB  0x2
498#define JCC_JAE 0x3
499#define JCC_JE  0x4
500#define JCC_JNE 0x5
501#define JCC_JBE 0x6
502#define JCC_JA  0x7
503#define JCC_JS  0x8
504#define JCC_JNS 0x9
505#define JCC_JP  0xa
506#define JCC_JNP 0xb
507#define JCC_JL  0xc
508#define JCC_JGE 0xd
509#define JCC_JLE 0xe
510#define JCC_JG  0xf
511
/*
 * Map a TCGCond to the x86 condition code (JCC_*) that is added to
 * OPC_JCC_{long,short} and the other "plus condition code" opcodes.
 * Signed comparisons use JL/JGE/JLE/JG, unsigned ones JB/JAE/JBE/JA.
 * TSTEQ/TSTNE share the JE/JNE codes with EQ/NE.
 * Conditions without an initializer here read back as 0 (JCC_JO).
 */
static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
    [TCG_COND_TSTEQ] = JCC_JE,
    [TCG_COND_TSTNE] = JCC_JNE,
};
526
527#if TCG_TARGET_REG_BITS == 64
528static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
529{
530    int rex;
531
532    if (opc & P_GS) {
533        tcg_out8(s, 0x65);
534    }
535    if (opc & P_DATA16) {
536        /* We should never be asking for both 16 and 64-bit operation.  */
537        tcg_debug_assert((opc & P_REXW) == 0);
538        tcg_out8(s, 0x66);
539    }
540    if (opc & P_SIMDF3) {
541        tcg_out8(s, 0xf3);
542    } else if (opc & P_SIMDF2) {
543        tcg_out8(s, 0xf2);
544    }
545
546    rex = 0;
547    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
548    rex |= (r & 8) >> 1;                /* REX.R */
549    rex |= (x & 8) >> 2;                /* REX.X */
550    rex |= (rm & 8) >> 3;               /* REX.B */
551
552    /* P_REXB_{R,RM} indicates that the given register is the low byte.
553       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
554       as otherwise the encoding indicates %[abcd]h.  Note that the values
555       that are ORed in merely indicate that the REX byte must be present;
556       those bits get discarded in output.  */
557    rex |= opc & (r >= 4 ? P_REXB_R : 0);
558    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
559
560    if (rex) {
561        tcg_out8(s, (uint8_t)(rex | 0x40));
562    }
563
564    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
565        tcg_out8(s, 0x0f);
566        if (opc & P_EXT38) {
567            tcg_out8(s, 0x38);
568        } else if (opc & P_EXT3A) {
569            tcg_out8(s, 0x3a);
570        }
571    }
572
573    tcg_out8(s, opc);
574}
575#else
576static void tcg_out_opc(TCGContext *s, int opc)
577{
578    if (opc & P_DATA16) {
579        tcg_out8(s, 0x66);
580    }
581    if (opc & P_SIMDF3) {
582        tcg_out8(s, 0xf3);
583    } else if (opc & P_SIMDF2) {
584        tcg_out8(s, 0xf2);
585    }
586    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
587        tcg_out8(s, 0x0f);
588        if (opc & P_EXT38) {
589            tcg_out8(s, 0x38);
590        } else if (opc & P_EXT3A) {
591            tcg_out8(s, 0x3a);
592        }
593    }
594    tcg_out8(s, opc);
595}
596/* Discard the register arguments to tcg_out_opc early, so as not to penalize
597   the 32-bit compilation paths.  This method works with all versions of gcc,
598   whereas relying on optimization may not be able to exclude them.  */
599#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
600#endif
601
602static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
603{
604    tcg_out_opc(s, opc, r, rm, 0);
605    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
606}
607
608static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
609                            int rm, int index)
610{
611    int tmp;
612
613    if (opc & P_GS) {
614        tcg_out8(s, 0x65);
615    }
616    /* Use the two byte form if possible, which cannot encode
617       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
618    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
619        && ((rm | index) & 8) == 0) {
620        /* Two byte VEX prefix.  */
621        tcg_out8(s, 0xc5);
622
623        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
624    } else {
625        /* Three byte VEX prefix.  */
626        tcg_out8(s, 0xc4);
627
628        /* VEX.m-mmmm */
629        if (opc & P_EXT3A) {
630            tmp = 3;
631        } else if (opc & P_EXT38) {
632            tmp = 2;
633        } else if (opc & P_EXT) {
634            tmp = 1;
635        } else {
636            g_assert_not_reached();
637        }
638        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
639        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
640        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
641        tcg_out8(s, tmp);
642
643        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
644    }
645
646    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
647    /* VEX.pp */
648    if (opc & P_DATA16) {
649        tmp |= 1;                          /* 0x66 */
650    } else if (opc & P_SIMDF3) {
651        tmp |= 2;                          /* 0xf3 */
652    } else if (opc & P_SIMDF2) {
653        tmp |= 3;                          /* 0xf2 */
654    }
655    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
656    tcg_out8(s, tmp);
657    tcg_out8(s, opc);
658}
659
660static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
661                             int rm, int index)
662{
663    /* The entire 4-byte evex prefix; with R' and V' set. */
664    uint32_t p = 0x08041062;
665    int mm, pp;
666
667    tcg_debug_assert(have_avx512vl);
668
669    /* EVEX.mm */
670    if (opc & P_EXT3A) {
671        mm = 3;
672    } else if (opc & P_EXT38) {
673        mm = 2;
674    } else if (opc & P_EXT) {
675        mm = 1;
676    } else {
677        g_assert_not_reached();
678    }
679
680    /* EVEX.pp */
681    if (opc & P_DATA16) {
682        pp = 1;                          /* 0x66 */
683    } else if (opc & P_SIMDF3) {
684        pp = 2;                          /* 0xf3 */
685    } else if (opc & P_SIMDF2) {
686        pp = 3;                          /* 0xf2 */
687    } else {
688        pp = 0;
689    }
690
691    p = deposit32(p, 8, 2, mm);
692    p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
693    p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
694    p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
695    p = deposit32(p, 16, 2, pp);
696    p = deposit32(p, 19, 4, ~v);
697    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
698    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
699
700    tcg_out32(s, p);
701    tcg_out8(s, opc);
702}
703
704static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
705{
706    if (opc & P_EVEX) {
707        tcg_out_evex_opc(s, opc, r, v, rm, 0);
708    } else {
709        tcg_out_vex_opc(s, opc, r, v, rm, 0);
710    }
711    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
712}
713
714/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
715   We handle either RM and INDEX missing with a negative value.  In 64-bit
716   mode for absolute addresses, ~RM is the size of the immediate operand
717   that will follow the instruction.  */
718
719static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
720                               int shift, intptr_t offset)
721{
722    int mod, len;
723
724    if (index < 0 && rm < 0) {
725        if (TCG_TARGET_REG_BITS == 64) {
726            /* Try for a rip-relative addressing mode.  This has replaced
727               the 32-bit-mode absolute addressing encoding.  */
728            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
729            intptr_t disp = offset - pc;
730            if (disp == (int32_t)disp) {
731                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
732                tcg_out32(s, disp);
733                return;
734            }
735
736            /* Try for an absolute address encoding.  This requires the
737               use of the MODRM+SIB encoding and is therefore larger than
738               rip-relative addressing.  */
739            if (offset == (int32_t)offset) {
740                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
741                tcg_out8(s, (4 << 3) | 5);
742                tcg_out32(s, offset);
743                return;
744            }
745
746            /* ??? The memory isn't directly addressable.  */
747            g_assert_not_reached();
748        } else {
749            /* Absolute address.  */
750            tcg_out8(s, (r << 3) | 5);
751            tcg_out32(s, offset);
752            return;
753        }
754    }
755
756    /* Find the length of the immediate addend.  Note that the encoding
757       that would be used for (%ebp) indicates absolute addressing.  */
758    if (rm < 0) {
759        mod = 0, len = 4, rm = 5;
760    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
761        mod = 0, len = 0;
762    } else if (offset == (int8_t)offset) {
763        mod = 0x40, len = 1;
764    } else {
765        mod = 0x80, len = 4;
766    }
767
768    /* Use a single byte MODRM format if possible.  Note that the encoding
769       that would be used for %esp is the escape to the two byte form.  */
770    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
771        /* Single byte MODRM format.  */
772        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
773    } else {
774        /* Two byte MODRM+SIB format.  */
775
776        /* Note that the encoding that would place %esp into the index
777           field indicates no index register.  In 64-bit mode, the REX.X
778           bit counts, so %r12 can be used as the index.  */
779        if (index < 0) {
780            index = 4;
781        } else {
782            tcg_debug_assert(index != TCG_REG_ESP);
783        }
784
785        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
786        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
787    }
788
789    if (len == 1) {
790        tcg_out8(s, offset);
791    } else if (len == 4) {
792        tcg_out32(s, offset);
793    }
794}
795
796static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
797                                     int index, int shift, intptr_t offset)
798{
799    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
800    tcg_out_sib_offset(s, r, rm, index, shift, offset);
801}
802
803static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
804                                         int rm, int index, int shift,
805                                         intptr_t offset)
806{
807    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
808    tcg_out_sib_offset(s, r, rm, index, shift, offset);
809}
810
811/* A simplification of the above with no index or shift.  */
812static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
813                                        int rm, intptr_t offset)
814{
815    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
816}
817
818static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
819                                            int v, int rm, intptr_t offset)
820{
821    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
822}
823
824/* Output an opcode with an expected reference to the constant pool.  */
825static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
826{
827    tcg_out_opc(s, opc, r, 0, 0);
828    /* Absolute for 32-bit, pc-relative for 64-bit.  */
829    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
830    tcg_out32(s, 0);
831}
832
833/* Output an opcode with an expected reference to the constant pool.  */
834static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
835{
836    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
837    /* Absolute for 32-bit, pc-relative for 64-bit.  */
838    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
839    tcg_out32(s, 0);
840}
841
842/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
843static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
844{
845    /* Propagate an opcode prefix, such as P_REXW.  */
846    int ext = subop & ~0x7;
847    subop &= 0x7;
848
849    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
850}
851
852static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
853{
854    int rexw = 0;
855
856    if (arg == ret) {
857        return true;
858    }
859    switch (type) {
860    case TCG_TYPE_I64:
861        rexw = P_REXW;
862        /* fallthru */
863    case TCG_TYPE_I32:
864        if (ret < 16) {
865            if (arg < 16) {
866                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
867            } else {
868                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
869            }
870        } else {
871            if (arg < 16) {
872                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
873            } else {
874                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
875            }
876        }
877        break;
878
879    case TCG_TYPE_V64:
880        tcg_debug_assert(ret >= 16 && arg >= 16);
881        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
882        break;
883    case TCG_TYPE_V128:
884        tcg_debug_assert(ret >= 16 && arg >= 16);
885        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
886        break;
887    case TCG_TYPE_V256:
888        tcg_debug_assert(ret >= 16 && arg >= 16);
889        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
890        break;
891
892    default:
893        g_assert_not_reached();
894    }
895    return true;
896}
897
/*
 * AVX2 broadcast opcodes, indexed by the vector element size code (vece)
 * in the order byte, word, dword, qword.
 */
static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};
902
903static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
904                            TCGReg r, TCGReg a)
905{
906    if (have_avx2) {
907        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
908        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
909    } else {
910        switch (vece) {
911        case MO_8:
912            /* ??? With zero in a register, use PSHUFB.  */
913            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
914            a = r;
915            /* FALLTHRU */
916        case MO_16:
917            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
918            a = r;
919            /* FALLTHRU */
920        case MO_32:
921            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
922            /* imm8 operand: all output lanes selected from input lane 0.  */
923            tcg_out8(s, 0);
924            break;
925        case MO_64:
926            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
927            break;
928        default:
929            g_assert_not_reached();
930        }
931    }
932    return true;
933}
934
935static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
936                             TCGReg r, TCGReg base, intptr_t offset)
937{
938    if (have_avx2) {
939        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
940        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
941                                 r, 0, base, offset);
942    } else {
943        switch (vece) {
944        case MO_64:
945            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
946            break;
947        case MO_32:
948            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
949            break;
950        case MO_16:
951            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
952            tcg_out8(s, 0); /* imm8 */
953            tcg_out_dup_vec(s, type, vece, r, r);
954            break;
955        case MO_8:
956            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
957            tcg_out8(s, 0); /* imm8 */
958            tcg_out_dup_vec(s, type, vece, r, r);
959            break;
960        default:
961            g_assert_not_reached();
962        }
963    }
964    return true;
965}
966
967static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
968                             TCGReg ret, int64_t arg)
969{
970    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
971
972    if (arg == 0) {
973        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
974        return;
975    }
976    if (arg == -1) {
977        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
978        return;
979    }
980
981    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
982        if (have_avx2) {
983            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
984        } else {
985            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
986        }
987        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
988    } else {
989        if (type == TCG_TYPE_V64) {
990            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
991        } else if (have_avx2) {
992            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
993        } else {
994            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
995        }
996        if (TCG_TARGET_REG_BITS == 64) {
997            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
998        } else {
999            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
1000        }
1001    }
1002}
1003
1004static void tcg_out_movi_vec(TCGContext *s, TCGType type,
1005                             TCGReg ret, tcg_target_long arg)
1006{
1007    if (arg == 0) {
1008        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
1009        return;
1010    }
1011    if (arg == -1) {
1012        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
1013        return;
1014    }
1015
1016    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
1017    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1018    if (TCG_TARGET_REG_BITS == 64) {
1019        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1020    } else {
1021        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1022    }
1023}
1024
1025static void tcg_out_movi_int(TCGContext *s, TCGType type,
1026                             TCGReg ret, tcg_target_long arg)
1027{
1028    tcg_target_long diff;
1029
1030    if (arg == 0) {
1031        tgen_arithr(s, ARITH_XOR, ret, ret);
1032        return;
1033    }
1034    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1035        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1036        tcg_out32(s, arg);
1037        return;
1038    }
1039    if (arg == (int32_t)arg) {
1040        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1041        tcg_out32(s, arg);
1042        return;
1043    }
1044
1045    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1046    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1047    if (diff == (int32_t)diff) {
1048        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1049        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1050        tcg_out32(s, diff);
1051        return;
1052    }
1053
1054    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1055    tcg_out64(s, arg);
1056}
1057
1058static void tcg_out_movi(TCGContext *s, TCGType type,
1059                         TCGReg ret, tcg_target_long arg)
1060{
1061    switch (type) {
1062    case TCG_TYPE_I32:
1063#if TCG_TARGET_REG_BITS == 64
1064    case TCG_TYPE_I64:
1065#endif
1066        if (ret < 16) {
1067            tcg_out_movi_int(s, type, ret, arg);
1068        } else {
1069            tcg_out_movi_vec(s, type, ret, arg);
1070        }
1071        break;
1072    default:
1073        g_assert_not_reached();
1074    }
1075}
1076
1077static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1078{
1079    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1080    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1081    return true;
1082}
1083
1084static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1085                             tcg_target_long imm)
1086{
1087    /* This function is only used for passing structs by reference. */
1088    tcg_debug_assert(imm == (int32_t)imm);
1089    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1090}
1091
1092static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1093{
1094    if (val == (int8_t)val) {
1095        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1096        tcg_out8(s, val);
1097    } else if (val == (int32_t)val) {
1098        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1099        tcg_out32(s, val);
1100    } else {
1101        g_assert_not_reached();
1102    }
1103}
1104
1105static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1106{
1107    /* Given the strength of x86 memory ordering, we only need care for
1108       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1109       faster than "mfence", so don't bother with the sse insn.  */
1110    if (a0 & TCG_MO_ST_LD) {
1111        tcg_out8(s, 0xf0);
1112        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1113        tcg_out8(s, 0);
1114    }
1115}
1116
1117static inline void tcg_out_push(TCGContext *s, int reg)
1118{
1119    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1120}
1121
1122static inline void tcg_out_pop(TCGContext *s, int reg)
1123{
1124    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1125}
1126
1127static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1128                       TCGReg arg1, intptr_t arg2)
1129{
1130    switch (type) {
1131    case TCG_TYPE_I32:
1132        if (ret < 16) {
1133            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1134        } else {
1135            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1136        }
1137        break;
1138    case TCG_TYPE_I64:
1139        if (ret < 16) {
1140            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1141            break;
1142        }
1143        /* FALLTHRU */
1144    case TCG_TYPE_V64:
1145        /* There is no instruction that can validate 8-byte alignment.  */
1146        tcg_debug_assert(ret >= 16);
1147        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1148        break;
1149    case TCG_TYPE_V128:
1150        /*
1151         * The gvec infrastructure is asserts that v128 vector loads
1152         * and stores use a 16-byte aligned offset.  Validate that the
1153         * final pointer is aligned by using an insn that will SIGSEGV.
1154         */
1155        tcg_debug_assert(ret >= 16);
1156        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1157        break;
1158    case TCG_TYPE_V256:
1159        /*
1160         * The gvec infrastructure only requires 16-byte alignment,
1161         * so here we must use an unaligned load.
1162         */
1163        tcg_debug_assert(ret >= 16);
1164        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1165                                 ret, 0, arg1, arg2);
1166        break;
1167    default:
1168        g_assert_not_reached();
1169    }
1170}
1171
1172static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1173                       TCGReg arg1, intptr_t arg2)
1174{
1175    switch (type) {
1176    case TCG_TYPE_I32:
1177        if (arg < 16) {
1178            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1179        } else {
1180            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1181        }
1182        break;
1183    case TCG_TYPE_I64:
1184        if (arg < 16) {
1185            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1186            break;
1187        }
1188        /* FALLTHRU */
1189    case TCG_TYPE_V64:
1190        /* There is no instruction that can validate 8-byte alignment.  */
1191        tcg_debug_assert(arg >= 16);
1192        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1193        break;
1194    case TCG_TYPE_V128:
1195        /*
1196         * The gvec infrastructure is asserts that v128 vector loads
1197         * and stores use a 16-byte aligned offset.  Validate that the
1198         * final pointer is aligned by using an insn that will SIGSEGV.
1199         *
1200         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1201         * for _WIN64, which must have SSE2 but may not have AVX.
1202         */
1203        tcg_debug_assert(arg >= 16);
1204        if (have_avx1) {
1205            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1206        } else {
1207            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1208        }
1209        break;
1210    case TCG_TYPE_V256:
1211        /*
1212         * The gvec infrastructure only requires 16-byte alignment,
1213         * so here we must use an unaligned store.
1214         */
1215        tcg_debug_assert(arg >= 16);
1216        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1217                                 arg, 0, arg1, arg2);
1218        break;
1219    default:
1220        g_assert_not_reached();
1221    }
1222}
1223
1224static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1225                        TCGReg base, intptr_t ofs)
1226{
1227    int rexw = 0;
1228    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1229        if (val != (int32_t)val) {
1230            return false;
1231        }
1232        rexw = P_REXW;
1233    } else if (type != TCG_TYPE_I32) {
1234        return false;
1235    }
1236    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1237    tcg_out32(s, val);
1238    return true;
1239}
1240
1241static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1242{
1243    /* Propagate an opcode prefix, such as P_DATA16.  */
1244    int ext = subopc & ~0x7;
1245    subopc &= 0x7;
1246
1247    if (count == 1) {
1248        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1249    } else {
1250        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1251        tcg_out8(s, count);
1252    }
1253}
1254
1255static inline void tcg_out_bswap32(TCGContext *s, int reg)
1256{
1257    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1258}
1259
1260static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1261{
1262    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1263}
1264
1265static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1266{
1267    /* movzbl */
1268    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1269    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1270}
1271
1272static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1273{
1274    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1275    /* movsbl */
1276    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1277    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1278}
1279
1280static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1281{
1282    /* movzwl */
1283    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1284}
1285
1286static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1287{
1288    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1289    /* movsw[lq] */
1290    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1291}
1292
1293static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1294{
1295    /* 32-bit mov zero extends.  */
1296    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1297}
1298
1299static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1300{
1301    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1302    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1303}
1304
1305static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1306{
1307    tcg_out_ext32s(s, dest, src);
1308}
1309
1310static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1311{
1312    if (dest != src) {
1313        tcg_out_ext32u(s, dest, src);
1314    }
1315}
1316
1317static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1318{
1319    tcg_out_ext32u(s, dest, src);
1320}
1321
1322static inline void tcg_out_bswap64(TCGContext *s, int reg)
1323{
1324    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1325}
1326
/*
 * Generate "r0 op= val" for an immediate operand.
 *
 * C is one of the ARITH_* codes.  On 64-bit hosts, any bits above the low
 * three (such as P_REXW) are split off into REXW and re-applied to the
 * opcode that is finally emitted.
 *
 * CF is non-zero when the rewrites guarded by "!cf" below must not be
 * applied (the in-line comment notes that the 128 -> -128 swap inverts
 * the carry).
 *
 * Shorter encodings are selected where possible; otherwise VAL must fit a
 * sign-extended 8-bit immediate, or a 32-bit immediate (always accepted
 * when no REXW prefix is in effect).  Anything else hits the final
 * g_assert_not_reached().
 */
static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    /* Separate the prefix bits from the ARITH_* code.  */
    if (TCG_TARGET_REG_BITS == 64) {
        rexw = c & -8;
        c &= 7;
    }

    switch (c) {
    case ARITH_ADD:
    case ARITH_SUB:
        if (!cf) {
            /*
             * ??? While INC is 2 bytes shorter than ADDL $1, they also induce
             * partial flags update stalls on Pentium4 and are not recommended
             * by current Intel optimization manuals.
             */
            if (val == 1 || val == -1) {
                /* ADD +1 and SUB -1 increment; ADD -1 and SUB +1 decrement. */
                int is_inc = (c == ARITH_ADD) ^ (val < 0);
                if (TCG_TARGET_REG_BITS == 64) {
                    /*
                     * The single-byte increment encodings are re-tasked
                     * as the REX prefixes.  Use the MODRM encoding.
                     */
                    tcg_out_modrm(s, OPC_GRP5 + rexw,
                                  (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
                } else {
                    tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
                }
                return;
            }
            if (val == 128) {
                /*
                 * Facilitate using an 8-bit immediate.  Carry is inverted
                 * by this transformation, so do it only if cf == 0.
                 */
                c ^= ARITH_ADD ^ ARITH_SUB;
                val = -128;
            }
        }
        break;

    case ARITH_AND:
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                /* Masking to 32 bits is a zero-extending self-move.  */
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation.  */
                rexw = 0;
            }
        }
        /* Byte and word masks become zero-extensions.  */
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
        break;

    case ARITH_OR:
    case ARITH_XOR:
        /*
         * Values 0x80..0xff do not fit a sign-extended imm8; use the
         * byte-register form of the operation instead.
         */
        if (val >= 0x80 && val <= 0xff
            && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
            tcg_out8(s, val);
            return;
        }
        break;
    }

    /* Generic forms: sign-extended imm8 first, then imm32.  */
    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    g_assert_not_reached();
}
1416
1417static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1418{
1419    if (val != 0) {
1420        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1421    }
1422}
1423
/*
 * Emit a jump to label L.
 *
 * OPC is a JCC_* condition code, or -1 for an unconditional jump.
 * Set SMALL to force a short forward branch.
 *
 * If the label already has a value the displacement is computed now and
 * the short form is used whenever it fits in 8 bits.  Otherwise a
 * relocation is recorded against the displacement field and space for it
 * is reserved in the output.
 */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
{
    int32_t val, val1;

    if (l->has_value) {
        val = tcg_pcrel_diff(s, l->u.value_ptr);
        /* The short form is two bytes long (opcode + imm8).  */
        val1 = val - 2;
        if ((int8_t)val1 == val1) {
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_short);
            } else {
                tcg_out8(s, OPC_JCC_short + opc);
            }
            tcg_out8(s, val1);
        } else {
            /* The caller demanded a short branch but the target is too far. */
            tcg_debug_assert(!small);
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_long);
                /* One opcode byte plus a 32-bit displacement.  */
                tcg_out32(s, val - 5);
            } else {
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
                /* Presumably a two-byte opcode plus a 32-bit displacement. */
                tcg_out32(s, val - 6);
            }
        }
    } else if (small) {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_short);
        } else {
            tcg_out8(s, OPC_JCC_short + opc);
        }
        /* Record an 8-bit pc-relative fixup and skip over its byte.  */
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
        s->code_ptr += 1;
    } else {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_long);
        } else {
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        }
        /* Record a 32-bit pc-relative fixup and skip over its bytes.  */
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
        s->code_ptr += 4;
    }
}
1467
/*
 * Emit a flag-setting comparison of ARG1 with ARG2 and return the JCC_*
 * condition code that the caller should use to consume the result.
 *
 * ARG2 is a constant when CONST_ARG2 is non-zero, otherwise a register.
 * REXW is 0 or P_REXW and selects the operand width.
 *
 * Ordinary conditions use CMP (or TEST reg,reg against constant zero) and
 * return tcg_cond_to_jcc[cond].  For the test conditions (is_tst_cond) a
 * TEST or BT is emitted, using the narrowest encoding the constant mask
 * permits; depending on the encoding chosen, the returned code is the
 * mapped condition (jz), a sign test (js) or a carry test (jc).
 */
static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1,
                       TCGArg arg2, int const_arg2, int rexw)
{
    int jz, js;

    if (!is_tst_cond(cond)) {
        if (!const_arg2) {
            tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
        } else if (arg2 == 0) {
            /* Comparing against zero: TEST reg,reg. */
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tcg_debug_assert(!rexw || arg2 == (int32_t)arg2);
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
        return tcg_cond_to_jcc[cond];
    }

    jz = tcg_cond_to_jcc[cond];
    /* Used when the mask is exactly the sign bit of the tested width.  */
    js = (cond == TCG_COND_TSTNE ? JCC_JS : JCC_JNS);

    if (!const_arg2) {
        tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2);
        return jz;
    }

    /* Mask confined to the low byte: use a byte-sized test.  */
    if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) {
        if (arg2 == 0x80) {
            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
            return js;
        }
        if (arg2 == 0xff) {
            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
            return jz;
        }
        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1);
        tcg_out8(s, arg2);
        return jz;
    }

    /*
     * Mask confined to bits 8..15 of one of the first four registers:
     * test byte register number arg1 + 4 (presumably the high-byte
     * registers %ah..%bh).
     */
    if ((arg2 & ~0xff00) == 0 && arg1 < 4) {
        if (arg2 == 0x8000) {
            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
            return js;
        }
        if (arg2 == 0xff00) {
            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
            return jz;
        }
        tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4);
        tcg_out8(s, arg2 >> 8);
        return jz;
    }

    /* Full 16-bit and 32-bit masks: TEST reg,reg at that width.  */
    if (arg2 == 0xffff) {
        tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1);
        return jz;
    }
    if (arg2 == 0xffffffffu) {
        tcg_out_modrm(s, OPC_TESTL, arg1, arg1);
        return jz;
    }

    /* Single-bit mask: sign test for bit 31/63, otherwise BT.  */
    if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) {
        int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE);
        int sh = ctz64(arg2);

        /* Only bits 32 and above need the 64-bit form.  */
        rexw = (sh & 32 ? P_REXW : 0);
        if ((sh & 31) == 31) {
            tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1);
            return js;
        } else {
            tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1);
            tcg_out8(s, sh);
            return jc;
        }
    }

    /* General case: TEST with a 32-bit immediate.  */
    if (rexw) {
        if (arg2 == (uint32_t)arg2) {
            /* No high bits in the mask: the 32-bit form is enough.  */
            rexw = 0;
        } else {
            tcg_debug_assert(arg2 == (int32_t)arg2);
        }
    }
    tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1);
    tcg_out32(s, arg2);
    return jz;
}
1556
1557static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
1558                           TCGArg arg1, TCGArg arg2, int const_arg2,
1559                           TCGLabel *label, bool small)
1560{
1561    int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw);
1562    tcg_out_jxx(s, jcc, label, small);
1563}
1564
#if TCG_TARGET_REG_BITS == 32
/*
 * Conditional branch on a comparison of two double-word values, built
 * from 32-bit compare-and-branch sequences (32-bit hosts only).
 *
 * ARGS[4] is the condition and ARGS[5] the target label.  ARGS[0]/ARGS[2]
 * and ARGS[1]/ARGS[3] are the operand pairs; presumably index 0/2 are the
 * low halves and 1/3 the high halves, since ordered comparisons examine
 * the 1/3 pair first and use unsigned conditions on the 0/2 pair.
 * CONST_ARGS flags which of the second operands are constants.
 *
 * A local label, LABEL_NEXT, marks the fall-through point and is always
 * reached with the short branch form.
 */
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, bool small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);
    TCGCond cond = args[4];

    switch (cond) {
    case TCG_COND_EQ:
    case TCG_COND_TSTEQ:
        /* Skip out if the first pair fails; branch if the second holds.  */
        tcg_out_brcond(s, 0, tcg_invert_cond(cond),
                       args[0], args[2], const_args[2], label_next, 1);
        tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
                       label_this, small);
        break;
    case TCG_COND_NE:
    case TCG_COND_TSTNE:
        /* Branch if either pair satisfies the condition.  */
        tcg_out_brcond(s, 0, cond, args[0], args[2], const_args[2],
                       label_this, small);
        tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
                       label_this, small);
        break;
    /*
     * Ordered comparisons share one pattern: branch on the strict form
     * for the 1/3 pair; if that pair is not equal, the result is decided
     * (skip to label_next); otherwise decide on the 0/2 pair using the
     * unsigned form of the condition.
     */
    case TCG_COND_LT:
        tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_LE:
        tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_GT:
        tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_GE:
        tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_LTU:
        tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_LEU:
        tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_GTU:
        tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_GEU:
        tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    default:
        g_assert_not_reached();
    }
    /* Fall-through point for the "condition false" paths.  */
    tcg_out_label(s, label_next);
}
#endif
1650
/*
 * Set DEST to the result of comparing ARG1 with ARG2 under COND:
 * 1 or 0 normally, -1 or 0 when NEG is true.
 *
 * REXW is 0 or P_REXW and selects the operand width.  ARG2 is a constant
 * when CONST_ARG2 is non-zero, otherwise a register.
 *
 * Three strategies are used:
 *  - conditions that can be rewritten as an unsigned "less than" go
 *    through the carry flag and SBB (label do_ltu);
 *  - signed LT/GE against constant zero extract the sign bit by shifting;
 *  - everything else uses SETcc, followed by NEG when NEG is requested.
 * INV records that the rewritten form computes the opposite condition.
 */
static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
                            TCGArg dest, TCGArg arg1, TCGArg arg2,
                            int const_arg2, bool neg)
{
    int cmp_rexw = rexw;
    bool inv = false;
    bool cleared;
    int jcc;

    switch (cond) {
    case TCG_COND_NE:
        inv = true;
        /* fall through */
    case TCG_COND_EQ:
        /* If arg2 is 0, convert to LTU/GEU vs 1. */
        if (const_arg2 && arg2 == 0) {
            arg2 = 1;
            goto do_ltu;
        }
        break;

    case TCG_COND_TSTNE:
        inv = true;
        /* fall through */
    case TCG_COND_TSTEQ:
        /* If arg2 is -1, convert to LTU/GEU vs 1. */
        if (const_arg2 && arg2 == 0xffffffffu) {
            arg2 = 1;
            /* The mask covers only the low 32 bits: compare at 32 bits.  */
            cmp_rexw = 0;
            goto do_ltu;
        }
        break;

    case TCG_COND_LEU:
        inv = true;
        /* fall through */
    case TCG_COND_GTU:
        /* If arg2 is a register, swap for LTU/GEU. */
        if (!const_arg2) {
            TCGReg t = arg1;
            arg1 = arg2;
            arg2 = t;
            goto do_ltu;
        }
        break;

    case TCG_COND_GEU:
        inv = true;
        /* fall through */
    case TCG_COND_LTU:
    do_ltu:
        /*
         * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
         * We can then use NEG or INC to produce the desired result.
         * This is always smaller than the SETCC expansion.
         */
        tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw);

        /* X - X - C = -C = (C ? -1 : 0) */
        tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
        if (inv && neg) {
            /* ~(C ? -1 : 0) = (C ? 0 : -1) */
            tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
        } else if (inv) {
            /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
            tgen_arithi(s, ARITH_ADD, dest, 1, 0);
        } else if (!neg) {
            /* -(C ? -1 : 0) = (C ? 1 : 0) */
            tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
        }
        /* The remaining case (!inv && neg) already holds the result.  */
        return;

    case TCG_COND_GE:
        inv = true;
        /* fall through */
    case TCG_COND_LT:
        /* If arg2 is 0, extract the sign bit. */
        if (const_arg2 && arg2 == 0) {
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1);
            if (inv) {
                /* GE is the sign bit of the complement.  */
                tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
            }
            /* Arithmetic shift yields 0/-1, logical shift yields 0/1.  */
            tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
                           dest, rexw ? 63 : 31);
            return;
        }
        break;

    default:
        break;
    }

    /*
     * If dest does not overlap the inputs, clearing it first is preferred.
     * The XOR breaks any false dependency for the low-byte write to dest,
     * and is also one byte smaller than MOVZBL.
     */
    cleared = false;
    if (dest != arg1 && (const_arg2 || dest != arg2)) {
        tgen_arithr(s, ARITH_XOR, dest, dest);
        cleared = true;
    }

    jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw);
    tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest);

    /* SETcc wrote only the low byte; widen it if dest was not pre-cleared. */
    if (!cleared) {
        tcg_out_ext8u(s, dest, dest);
    }
    if (neg) {
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
    }
}
1764
#if TCG_TARGET_REG_BITS == 32
/*
 * Set ARGS[0] to 1 or 0 according to a double-word comparison, expressed
 * through tcg_out_brcond2 (32-bit hosts only).  ARGS[1..5] are forwarded
 * as the first five brcond2 arguments; the sixth is a local label.
 */
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg br_args[6];
    TCGLabel *l_true, *l_done;
    bool overlap;

    memcpy(br_args, args + 1, 5 * sizeof(TCGArg));

    overlap = args[0] == args[1] || args[0] == args[2]
              || (!const_args[3] && args[0] == args[3])
              || (!const_args[4] && args[0] == args[4]);

    if (!overlap) {
        /*
         * The destination is independent of the inputs: clear it first,
         * branch around on the inverted condition, and increment in the
         * true case.  This results in smaller code.
         */
        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        l_done = gen_new_label();
        br_args[4] = tcg_invert_cond(br_args[4]);
        br_args[5] = label_arg(l_done);
        tcg_out_brcond2(s, br_args, const_args + 1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, l_done);
        return;
    }

    /*
     * The destination overlaps an argument register, so it cannot be
     * written before the comparison: use a plain branch diamond.
     */
    l_true = gen_new_label();
    l_done = gen_new_label();

    br_args[5] = label_arg(l_true);
    tcg_out_brcond2(s, br_args, const_args + 1, 1);

    tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
    tcg_out_jxx(s, JCC_JMP, l_done, 1);
    tcg_out_label(s, l_true);

    tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
    tcg_out_label(s, l_done);
}
#endif
1808
1809static void tcg_out_cmov(TCGContext *s, int jcc, int rexw,
1810                         TCGReg dest, TCGReg v1)
1811{
1812    tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1);
1813}
1814
1815static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
1816                            TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
1817                            TCGReg v1)
1818{
1819    int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw);
1820    tcg_out_cmov(s, jcc, rexw, dest, v1);
1821}
1822
1823static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1824                        TCGArg arg2, bool const_a2)
1825{
1826    if (have_bmi1) {
1827        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1828        if (const_a2) {
1829            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1830        } else {
1831            tcg_debug_assert(dest != arg2);
1832            tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
1833        }
1834    } else {
1835        tcg_debug_assert(dest != arg2);
1836        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1837        tcg_out_cmov(s, JCC_JE, rexw, dest, arg2);
1838    }
1839}
1840
1841static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1842                        TCGArg arg2, bool const_a2)
1843{
1844    if (have_lzcnt) {
1845        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1846        if (const_a2) {
1847            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1848        } else {
1849            tcg_debug_assert(dest != arg2);
1850            tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
1851        }
1852    } else {
1853        tcg_debug_assert(!const_a2);
1854        tcg_debug_assert(dest != arg1);
1855        tcg_debug_assert(dest != arg2);
1856
1857        /* Recall that the output of BSR is the index not the count.  */
1858        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1859        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1860
1861        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1862        int jcc = tcg_out_cmp(s, TCG_COND_EQ, arg1, 0, 1, rexw);
1863        tcg_out_cmov(s, jcc, rexw, dest, arg2);
1864    }
1865}
1866
1867static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1868{
1869    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1870
1871    if (disp == (int32_t)disp) {
1872        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1873        tcg_out32(s, disp);
1874    } else {
1875        /* rip-relative addressing into the constant pool.
1876           This is 6 + 8 = 14 bytes, as compared to using an
1877           immediate load 10 + 6 = 16 bytes, plus we may
1878           be able to re-use the pool constant for more calls.  */
1879        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1880        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1881        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1882        tcg_out32(s, 0);
1883    }
1884}
1885
1886static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1887                         const TCGHelperInfo *info)
1888{
1889    tcg_out_branch(s, 1, dest);
1890
1891#ifndef _WIN32
1892    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1893        /*
1894         * The sysv i386 abi for struct return places a reference as the
1895         * first argument of the stack, and pops that argument with the
1896         * return statement.  Since we want to retain the aligned stack
1897         * pointer for the callee, we do not want to actually push that
1898         * argument before the call but rely on the normal store to the
1899         * stack slot.  But we do need to compensate for the pop in order
1900         * to reset our correct stack pointer value.
1901         * Pushing a garbage value back onto the stack is quickest.
1902         */
1903        tcg_out_push(s, TCG_REG_EAX);
1904    }
1905#endif
1906}
1907
1908static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1909{
1910    tcg_out_branch(s, 0, dest);
1911}
1912
1913static void tcg_out_nopn(TCGContext *s, int n)
1914{
1915    int i;
1916    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1917     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1918     * duplicate prefix, and all of the interesting recent cores can
1919     * decode and discard the duplicates in a single cycle.
1920     */
1921    tcg_debug_assert(n >= 1);
1922    for (i = 1; i < n; ++i) {
1923        tcg_out8(s, 0x66);
1924    }
1925    tcg_out8(s, 0x90);
1926}
1927
1928typedef struct {
1929    TCGReg base;
1930    int index;
1931    int ofs;
1932    int seg;
1933    TCGAtomAlign aa;
1934} HostAddress;
1935
1936bool tcg_target_has_memory_bswap(MemOp memop)
1937{
1938    TCGAtomAlign aa;
1939
1940    if (!have_movbe) {
1941        return false;
1942    }
1943    if ((memop & MO_SIZE) < MO_128) {
1944        return true;
1945    }
1946
1947    /*
1948     * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
1949     * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
1950     */
1951    aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
1952    return aa.atom < MO_128;
1953}
1954
/*
 * Because i686 has no register parameters and because x86_64 has xchg
 * to handle addr/data register overlap, we have placed all input arguments
 * before we might need a scratch reg.
 *
 * Even then, a scratch is only needed for l->raddr.  Rather than expose
 * a general-purpose scratch when we don't actually know it's available,
 * use the ra_gen hook to load into RAX if needed.
 */
1964#if TCG_TARGET_REG_BITS == 64
1965static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1966{
1967    if (arg < 0) {
1968        arg = TCG_REG_RAX;
1969    }
1970    tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1971    return arg;
1972}
1973static const TCGLdstHelperParam ldst_helper_param = {
1974    .ra_gen = ldst_ra_gen
1975};
1976#else
1977static const TCGLdstHelperParam ldst_helper_param = { };
1978#endif
1979
1980static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
1981                                TCGReg l, TCGReg h, TCGReg v)
1982{
1983    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1984
1985    /* vpmov{d,q} %v, %l */
1986    tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
1987    /* vpextr{d,q} $1, %v, %h */
1988    tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
1989    tcg_out8(s, 1);
1990}
1991
1992static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
1993                                TCGReg v, TCGReg l, TCGReg h)
1994{
1995    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1996
1997    /* vmov{d,q} %l, %v */
1998    tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
1999    /* vpinsr{d,q} $1, %h, %v, %v */
2000    tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
2001    tcg_out8(s, 1);
2002}
2003
2004/*
2005 * Generate code for the slow path for a load at the end of block
2006 */
2007static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
2008{
2009    MemOp opc = get_memop(l->oi);
2010    tcg_insn_unit **label_ptr = &l->label_ptr[0];
2011
2012    /* resolve label address */
2013    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
2014    if (label_ptr[1]) {
2015        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
2016    }
2017
2018    tcg_out_ld_helper_args(s, l, &ldst_helper_param);
2019    tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
2020    tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
2021
2022    tcg_out_jmp(s, l->raddr);
2023    return true;
2024}
2025
2026/*
2027 * Generate code for the slow path for a store at the end of block
2028 */
2029static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
2030{
2031    MemOp opc = get_memop(l->oi);
2032    tcg_insn_unit **label_ptr = &l->label_ptr[0];
2033
2034    /* resolve label address */
2035    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
2036    if (label_ptr[1]) {
2037        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
2038    }
2039
2040    tcg_out_st_helper_args(s, l, &ldst_helper_param);
2041    tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
2042
2043    tcg_out_jmp(s, l->raddr);
2044    return true;
2045}
2046
#ifdef CONFIG_USER_ONLY
/*
 * Template host address for user-only accesses; prepare_host_addr()
 * copies it and then fills in the base register.  The index starts out
 * as -1 (presumably "no index register" -- confirm against the setup code).
 */
static HostAddress x86_guest_base = {
    .index = -1
};

#if defined(__x86_64__) && defined(__linux__)
# include <asm/prctl.h>
# include <sys/prctl.h>
int arch_prctl(int code, unsigned long addr);

/*
 * Try to install guest_base as the %gs segment base.
 * Returns P_GS on success, 0 if the request was refused.
 */
static inline int setup_guest_base_seg(void)
{
    return arch_prctl(ARCH_SET_GS, guest_base) == 0 ? P_GS : 0;
}
#define setup_guest_base_seg  setup_guest_base_seg
#elif defined(__x86_64__) && \
      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
# include <machine/sysarch.h>

/*
 * Try to install guest_base as the %gs segment base.
 * Returns P_GS on success, 0 if the request was refused.
 */
static inline int setup_guest_base_seg(void)
{
    return sysarch(AMD64_SET_GSBASE, &guest_base) == 0 ? P_GS : 0;
}
#define setup_guest_base_seg  setup_guest_base_seg
#endif
#else
/* Never referenced outside user-only; any surviving use fails the build. */
# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; }))
#endif /* CONFIG_USER_ONLY */
#ifndef setup_guest_base_seg
/* No segment-base support on this host/configuration. */
# define setup_guest_base_seg()  0
#endif
2082
2083#define MIN_TLB_MASK_TABLE_OFS  INT_MIN
2084
2085/*
2086 * For softmmu, perform the TLB load and compare.
2087 * For useronly, perform any required alignment tests.
2088 * In both cases, return a TCGLabelQemuLdst structure if the slow path
2089 * is required and fill in @h with the host address for the fast path.
2090 */
2091static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
2092                                           TCGReg addrlo, TCGReg addrhi,
2093                                           MemOpIdx oi, bool is_ld)
2094{
2095    TCGLabelQemuLdst *ldst = NULL;
2096    MemOp opc = get_memop(oi);
2097    MemOp s_bits = opc & MO_SIZE;
2098    unsigned a_mask;
2099
2100    if (tcg_use_softmmu) {
2101        h->index = TCG_REG_L0;
2102        h->ofs = 0;
2103        h->seg = 0;
2104    } else {
2105        *h = x86_guest_base;
2106    }
2107    h->base = addrlo;
2108    h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
2109    a_mask = (1 << h->aa.align) - 1;
2110
2111    if (tcg_use_softmmu) {
2112        int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
2113                            : offsetof(CPUTLBEntry, addr_write);
2114        TCGType ttype = TCG_TYPE_I32;
2115        TCGType tlbtype = TCG_TYPE_I32;
2116        int trexw = 0, hrexw = 0, tlbrexw = 0;
2117        unsigned mem_index = get_mmuidx(oi);
2118        unsigned s_mask = (1 << s_bits) - 1;
2119        int fast_ofs = tlb_mask_table_ofs(s, mem_index);
2120        int tlb_mask;
2121
2122        ldst = new_ldst_label(s);
2123        ldst->is_ld = is_ld;
2124        ldst->oi = oi;
2125        ldst->addrlo_reg = addrlo;
2126        ldst->addrhi_reg = addrhi;
2127
2128        if (TCG_TARGET_REG_BITS == 64) {
2129            ttype = s->addr_type;
2130            trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
2131            if (TCG_TYPE_PTR == TCG_TYPE_I64) {
2132                hrexw = P_REXW;
2133                if (s->page_bits + s->tlb_dyn_max_bits > 32) {
2134                    tlbtype = TCG_TYPE_I64;
2135                    tlbrexw = P_REXW;
2136                }
2137            }
2138        }
2139
2140        tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
2141        tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
2142                       s->page_bits - CPU_TLB_ENTRY_BITS);
2143
2144        tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
2145                             fast_ofs + offsetof(CPUTLBDescFast, mask));
2146
2147        tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
2148                             fast_ofs + offsetof(CPUTLBDescFast, table));
2149
2150        /*
2151         * If the required alignment is at least as large as the access,
2152         * simply copy the address and mask.  For lesser alignments,
2153         * check that we don't cross pages for the complete access.
2154         */
2155        if (a_mask >= s_mask) {
2156            tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
2157        } else {
2158            tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
2159                                 addrlo, s_mask - a_mask);
2160        }
2161        tlb_mask = s->page_mask | a_mask;
2162        tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
2163
2164        /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
2165        tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
2166                             TCG_REG_L1, TCG_REG_L0, cmp_ofs);
2167
2168        /* jne slow_path */
2169        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2170        ldst->label_ptr[0] = s->code_ptr;
2171        s->code_ptr += 4;
2172
2173        if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
2174            /* cmp 4(TCG_REG_L0), addrhi */
2175            tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi,
2176                                 TCG_REG_L0, cmp_ofs + 4);
2177
2178            /* jne slow_path */
2179            tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2180            ldst->label_ptr[1] = s->code_ptr;
2181            s->code_ptr += 4;
2182        }
2183
2184        /* TLB Hit.  */
2185        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
2186                   offsetof(CPUTLBEntry, addend));
2187    } else if (a_mask) {
2188        int jcc;
2189
2190        ldst = new_ldst_label(s);
2191        ldst->is_ld = is_ld;
2192        ldst->oi = oi;
2193        ldst->addrlo_reg = addrlo;
2194        ldst->addrhi_reg = addrhi;
2195
2196        /* jne slow_path */
2197        jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addrlo, a_mask, true, false);
2198        tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0);
2199        ldst->label_ptr[0] = s->code_ptr;
2200        s->code_ptr += 4;
2201    }
2202
2203    return ldst;
2204}
2205
2206static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2207                                   HostAddress h, TCGType type, MemOp memop)
2208{
2209    bool use_movbe = false;
2210    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
2211    int movop = OPC_MOVL_GvEv;
2212
2213    /* Do big-endian loads with movbe.  */
2214    if (memop & MO_BSWAP) {
2215        tcg_debug_assert(have_movbe);
2216        use_movbe = true;
2217        movop = OPC_MOVBE_GyMy;
2218    }
2219
2220    switch (memop & MO_SSIZE) {
2221    case MO_UB:
2222        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
2223                                 h.base, h.index, 0, h.ofs);
2224        break;
2225    case MO_SB:
2226        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
2227                                 h.base, h.index, 0, h.ofs);
2228        break;
2229    case MO_UW:
2230        if (use_movbe) {
2231            /* There is no extending movbe; only low 16-bits are modified.  */
2232            if (datalo != h.base && datalo != h.index) {
2233                /* XOR breaks dependency chains.  */
2234                tgen_arithr(s, ARITH_XOR, datalo, datalo);
2235                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2236                                         datalo, h.base, h.index, 0, h.ofs);
2237            } else {
2238                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2239                                         datalo, h.base, h.index, 0, h.ofs);
2240                tcg_out_ext16u(s, datalo, datalo);
2241            }
2242        } else {
2243            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
2244                                     h.base, h.index, 0, h.ofs);
2245        }
2246        break;
2247    case MO_SW:
2248        if (use_movbe) {
2249            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2250                                     datalo, h.base, h.index, 0, h.ofs);
2251            tcg_out_ext16s(s, type, datalo, datalo);
2252        } else {
2253            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
2254                                     datalo, h.base, h.index, 0, h.ofs);
2255        }
2256        break;
2257    case MO_UL:
2258        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2259                                 h.base, h.index, 0, h.ofs);
2260        break;
2261#if TCG_TARGET_REG_BITS == 64
2262    case MO_SL:
2263        if (use_movbe) {
2264            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
2265                                     h.base, h.index, 0, h.ofs);
2266            tcg_out_ext32s(s, datalo, datalo);
2267        } else {
2268            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
2269                                     h.base, h.index, 0, h.ofs);
2270        }
2271        break;
2272#endif
2273    case MO_UQ:
2274        if (TCG_TARGET_REG_BITS == 64) {
2275            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2276                                     h.base, h.index, 0, h.ofs);
2277            break;
2278        }
2279        if (use_movbe) {
2280            TCGReg t = datalo;
2281            datalo = datahi;
2282            datahi = t;
2283        }
2284        if (h.base == datalo || h.index == datalo) {
2285            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
2286                                     h.base, h.index, 0, h.ofs);
2287            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
2288            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
2289        } else {
2290            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2291                                     h.base, h.index, 0, h.ofs);
2292            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2293                                     h.base, h.index, 0, h.ofs + 4);
2294        }
2295        break;
2296
2297    case MO_128:
2298        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2299
2300        /*
2301         * Without 16-byte atomicity, use integer regs.
2302         * That is where we want the data, and it allows bswaps.
2303         */
2304        if (h.aa.atom < MO_128) {
2305            if (use_movbe) {
2306                TCGReg t = datalo;
2307                datalo = datahi;
2308                datahi = t;
2309            }
2310            if (h.base == datalo || h.index == datalo) {
2311                tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
2312                                         h.base, h.index, 0, h.ofs);
2313                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2314                                     datalo, datahi, 0);
2315                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2316                                     datahi, datahi, 8);
2317            } else {
2318                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2319                                         h.base, h.index, 0, h.ofs);
2320                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2321                                         h.base, h.index, 0, h.ofs + 8);
2322            }
2323            break;
2324        }
2325
2326        /*
2327         * With 16-byte atomicity, a vector load is required.
2328         * If we already have 16-byte alignment, then VMOVDQA always works.
2329         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2330         * Else use we require a runtime test for alignment for VMOVDQA;
2331         * use VMOVDQU on the unaligned nonatomic path for simplicity.
2332         */
2333        if (h.aa.align >= MO_128) {
2334            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2335                                         TCG_TMP_VEC, 0,
2336                                         h.base, h.index, 0, h.ofs);
2337        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2338            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2339                                         TCG_TMP_VEC, 0,
2340                                         h.base, h.index, 0, h.ofs);
2341        } else {
2342            TCGLabel *l1 = gen_new_label();
2343            TCGLabel *l2 = gen_new_label();
2344            int jcc;
2345
2346            jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
2347            tcg_out_jxx(s, jcc, l1, true);
2348
2349            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2350                                         TCG_TMP_VEC, 0,
2351                                         h.base, h.index, 0, h.ofs);
2352            tcg_out_jxx(s, JCC_JMP, l2, true);
2353
2354            tcg_out_label(s, l1);
2355            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2356                                         TCG_TMP_VEC, 0,
2357                                         h.base, h.index, 0, h.ofs);
2358            tcg_out_label(s, l2);
2359        }
2360        tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
2361        break;
2362
2363    default:
2364        g_assert_not_reached();
2365    }
2366}
2367
2368static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2369                            TCGReg addrlo, TCGReg addrhi,
2370                            MemOpIdx oi, TCGType data_type)
2371{
2372    TCGLabelQemuLdst *ldst;
2373    HostAddress h;
2374
2375    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2376    tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2377
2378    if (ldst) {
2379        ldst->type = data_type;
2380        ldst->datalo_reg = datalo;
2381        ldst->datahi_reg = datahi;
2382        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2383    }
2384}
2385
2386static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2387                                   HostAddress h, MemOp memop)
2388{
2389    bool use_movbe = false;
2390    int movop = OPC_MOVL_EvGv;
2391
2392    /*
2393     * Do big-endian stores with movbe or system-mode.
2394     * User-only without movbe will have its swapping done generically.
2395     */
2396    if (memop & MO_BSWAP) {
2397        tcg_debug_assert(have_movbe);
2398        use_movbe = true;
2399        movop = OPC_MOVBE_MyGy;
2400    }
2401
2402    switch (memop & MO_SIZE) {
2403    case MO_8:
2404        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2405        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2406        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
2407                                 datalo, h.base, h.index, 0, h.ofs);
2408        break;
2409    case MO_16:
2410        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
2411                                 h.base, h.index, 0, h.ofs);
2412        break;
2413    case MO_32:
2414        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2415                                 h.base, h.index, 0, h.ofs);
2416        break;
2417    case MO_64:
2418        if (TCG_TARGET_REG_BITS == 64) {
2419            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2420                                     h.base, h.index, 0, h.ofs);
2421        } else {
2422            if (use_movbe) {
2423                TCGReg t = datalo;
2424                datalo = datahi;
2425                datahi = t;
2426            }
2427            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2428                                     h.base, h.index, 0, h.ofs);
2429            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2430                                     h.base, h.index, 0, h.ofs + 4);
2431        }
2432        break;
2433
2434    case MO_128:
2435        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2436
2437        /*
2438         * Without 16-byte atomicity, use integer regs.
2439         * That is where we have the data, and it allows bswaps.
2440         */
2441        if (h.aa.atom < MO_128) {
2442            if (use_movbe) {
2443                TCGReg t = datalo;
2444                datalo = datahi;
2445                datahi = t;
2446            }
2447            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2448                                     h.base, h.index, 0, h.ofs);
2449            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2450                                     h.base, h.index, 0, h.ofs + 8);
2451            break;
2452        }
2453
2454        /*
2455         * With 16-byte atomicity, a vector store is required.
2456         * If we already have 16-byte alignment, then VMOVDQA always works.
2457         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2458         * Else use we require a runtime test for alignment for VMOVDQA;
2459         * use VMOVDQU on the unaligned nonatomic path for simplicity.
2460         */
2461        tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
2462        if (h.aa.align >= MO_128) {
2463            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2464                                         TCG_TMP_VEC, 0,
2465                                         h.base, h.index, 0, h.ofs);
2466        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2467            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2468                                         TCG_TMP_VEC, 0,
2469                                         h.base, h.index, 0, h.ofs);
2470        } else {
2471            TCGLabel *l1 = gen_new_label();
2472            TCGLabel *l2 = gen_new_label();
2473            int jcc;
2474
2475            jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
2476            tcg_out_jxx(s, jcc, l1, true);
2477
2478            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2479                                         TCG_TMP_VEC, 0,
2480                                         h.base, h.index, 0, h.ofs);
2481            tcg_out_jxx(s, JCC_JMP, l2, true);
2482
2483            tcg_out_label(s, l1);
2484            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2485                                         TCG_TMP_VEC, 0,
2486                                         h.base, h.index, 0, h.ofs);
2487            tcg_out_label(s, l2);
2488        }
2489        break;
2490
2491    default:
2492        g_assert_not_reached();
2493    }
2494}
2495
2496static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2497                            TCGReg addrlo, TCGReg addrhi,
2498                            MemOpIdx oi, TCGType data_type)
2499{
2500    TCGLabelQemuLdst *ldst;
2501    HostAddress h;
2502
2503    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2504    tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2505
2506    if (ldst) {
2507        ldst->type = data_type;
2508        ldst->datalo_reg = datalo;
2509        ldst->datahi_reg = datahi;
2510        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2511    }
2512}
2513
2514static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2515{
2516    /* Reuse the zeroing that exists for goto_ptr.  */
2517    if (a0 == 0) {
2518        tcg_out_jmp(s, tcg_code_gen_epilogue);
2519    } else {
2520        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2521        tcg_out_jmp(s, tb_ret_addr);
2522    }
2523}
2524
2525static void tcg_out_goto_tb(TCGContext *s, int which)
2526{
2527    /*
2528     * Jump displacement must be aligned for atomic patching;
2529     * see if we need to add extra nops before jump
2530     */
2531    int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2532    if (gap != 1) {
2533        tcg_out_nopn(s, gap - 1);
2534    }
2535    tcg_out8(s, OPC_JMP_long); /* jmp im */
2536    set_jmp_insn_offset(s, which);
2537    tcg_out32(s, 0);
2538    set_jmp_reset_offset(s, which);
2539}
2540
2541void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2542                              uintptr_t jmp_rx, uintptr_t jmp_rw)
2543{
2544    /* patch the branch destination */
2545    uintptr_t addr = tb->jmp_target_addr[n];
2546    qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2547    /* no need to flush icache explicitly */
2548}
2549
2550static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2551                              const TCGArg args[TCG_MAX_OP_ARGS],
2552                              const int const_args[TCG_MAX_OP_ARGS])
2553{
2554    TCGArg a0, a1, a2;
2555    int c, const_a2, vexop, rexw = 0;
2556
2557#if TCG_TARGET_REG_BITS == 64
2558# define OP_32_64(x) \
2559        case glue(glue(INDEX_op_, x), _i64): \
2560            rexw = P_REXW; /* FALLTHRU */    \
2561        case glue(glue(INDEX_op_, x), _i32)
2562#else
2563# define OP_32_64(x) \
2564        case glue(glue(INDEX_op_, x), _i32)
2565#endif
2566
2567    /* Hoist the loads of the most common arguments.  */
2568    a0 = args[0];
2569    a1 = args[1];
2570    a2 = args[2];
2571    const_a2 = const_args[2];
2572
2573    switch (opc) {
2574    case INDEX_op_goto_ptr:
2575        /* jmp to the given host address (could be epilogue) */
2576        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2577        break;
2578    case INDEX_op_br:
2579        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2580        break;
2581    OP_32_64(ld8u):
2582        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2583        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2584        break;
2585    OP_32_64(ld8s):
2586        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2587        break;
2588    OP_32_64(ld16u):
2589        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2590        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2591        break;
2592    OP_32_64(ld16s):
2593        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2594        break;
2595#if TCG_TARGET_REG_BITS == 64
2596    case INDEX_op_ld32u_i64:
2597#endif
2598    case INDEX_op_ld_i32:
2599        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2600        break;
2601
2602    OP_32_64(st8):
2603        if (const_args[0]) {
2604            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2605            tcg_out8(s, a0);
2606        } else {
2607            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2608        }
2609        break;
2610    OP_32_64(st16):
2611        if (const_args[0]) {
2612            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2613            tcg_out16(s, a0);
2614        } else {
2615            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2616        }
2617        break;
2618#if TCG_TARGET_REG_BITS == 64
2619    case INDEX_op_st32_i64:
2620#endif
2621    case INDEX_op_st_i32:
2622        if (const_args[0]) {
2623            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2624            tcg_out32(s, a0);
2625        } else {
2626            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2627        }
2628        break;
2629
2630    OP_32_64(add):
2631        /* For 3-operand addition, use LEA.  */
2632        if (a0 != a1) {
2633            TCGArg c3 = 0;
2634            if (const_a2) {
2635                c3 = a2, a2 = -1;
2636            } else if (a0 == a2) {
2637                /* Watch out for dest = src + dest, since we've removed
2638                   the matching constraint on the add.  */
2639                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2640                break;
2641            }
2642
2643            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2644            break;
2645        }
2646        c = ARITH_ADD;
2647        goto gen_arith;
2648    OP_32_64(sub):
2649        c = ARITH_SUB;
2650        goto gen_arith;
2651    OP_32_64(and):
2652        c = ARITH_AND;
2653        goto gen_arith;
2654    OP_32_64(or):
2655        c = ARITH_OR;
2656        goto gen_arith;
2657    OP_32_64(xor):
2658        c = ARITH_XOR;
2659        goto gen_arith;
2660    gen_arith:
2661        if (const_a2) {
2662            tgen_arithi(s, c + rexw, a0, a2, 0);
2663        } else {
2664            tgen_arithr(s, c + rexw, a0, a2);
2665        }
2666        break;
2667
2668    OP_32_64(andc):
2669        if (const_a2) {
2670            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2671            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2672        } else {
2673            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2674        }
2675        break;
2676
2677    OP_32_64(mul):
2678        if (const_a2) {
2679            int32_t val;
2680            val = a2;
2681            if (val == (int8_t)val) {
2682                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2683                tcg_out8(s, val);
2684            } else {
2685                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2686                tcg_out32(s, val);
2687            }
2688        } else {
2689            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2690        }
2691        break;
2692
2693    OP_32_64(div2):
2694        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2695        break;
2696    OP_32_64(divu2):
2697        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2698        break;
2699
2700    OP_32_64(shl):
2701        /* For small constant 3-operand shift, use LEA.  */
2702        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2703            if (a2 - 1 == 0) {
2704                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2705                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2706            } else {
2707                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2708                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2709            }
2710            break;
2711        }
2712        c = SHIFT_SHL;
2713        vexop = OPC_SHLX;
2714        goto gen_shift_maybe_vex;
2715    OP_32_64(shr):
2716        c = SHIFT_SHR;
2717        vexop = OPC_SHRX;
2718        goto gen_shift_maybe_vex;
2719    OP_32_64(sar):
2720        c = SHIFT_SAR;
2721        vexop = OPC_SARX;
2722        goto gen_shift_maybe_vex;
2723    OP_32_64(rotl):
2724        c = SHIFT_ROL;
2725        goto gen_shift;
2726    OP_32_64(rotr):
2727        c = SHIFT_ROR;
2728        goto gen_shift;
2729    gen_shift_maybe_vex:
2730        if (have_bmi2) {
2731            if (!const_a2) {
2732                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2733                break;
2734            }
2735            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2736        }
2737        /* FALLTHRU */
2738    gen_shift:
2739        if (const_a2) {
2740            tcg_out_shifti(s, c + rexw, a0, a2);
2741        } else {
2742            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2743        }
2744        break;
2745
2746    OP_32_64(ctz):
2747        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2748        break;
2749    OP_32_64(clz):
2750        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2751        break;
2752    OP_32_64(ctpop):
2753        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2754        break;
2755
2756    OP_32_64(brcond):
2757        tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1],
2758                       arg_label(args[3]), 0);
2759        break;
2760    OP_32_64(setcond):
2761        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false);
2762        break;
2763    OP_32_64(negsetcond):
2764        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true);
2765        break;
2766    OP_32_64(movcond):
2767        tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]);
2768        break;
2769
2770    OP_32_64(bswap16):
2771        if (a2 & TCG_BSWAP_OS) {
2772            /* Output must be sign-extended. */
2773            if (rexw) {
2774                tcg_out_bswap64(s, a0);
2775                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2776            } else {
2777                tcg_out_bswap32(s, a0);
2778                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2779            }
2780        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2781            /* Output must be zero-extended, but input isn't. */
2782            tcg_out_bswap32(s, a0);
2783            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2784        } else {
2785            tcg_out_rolw_8(s, a0);
2786        }
2787        break;
2788    OP_32_64(bswap32):
2789        tcg_out_bswap32(s, a0);
2790        if (rexw && (a2 & TCG_BSWAP_OS)) {
2791            tcg_out_ext32s(s, a0, a0);
2792        }
2793        break;
2794
2795    OP_32_64(neg):
2796        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2797        break;
2798    OP_32_64(not):
2799        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2800        break;
2801
2802    case INDEX_op_qemu_ld_a64_i32:
2803        if (TCG_TARGET_REG_BITS == 32) {
2804            tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2805            break;
2806        }
2807        /* fall through */
2808    case INDEX_op_qemu_ld_a32_i32:
2809        tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2810        break;
2811    case INDEX_op_qemu_ld_a32_i64:
2812        if (TCG_TARGET_REG_BITS == 64) {
2813            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2814        } else {
2815            tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2816        }
2817        break;
2818    case INDEX_op_qemu_ld_a64_i64:
2819        if (TCG_TARGET_REG_BITS == 64) {
2820            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2821        } else {
2822            tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2823        }
2824        break;
2825    case INDEX_op_qemu_ld_a32_i128:
2826    case INDEX_op_qemu_ld_a64_i128:
2827        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2828        tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2829        break;
2830
2831    case INDEX_op_qemu_st_a64_i32:
2832    case INDEX_op_qemu_st8_a64_i32:
2833        if (TCG_TARGET_REG_BITS == 32) {
2834            tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2835            break;
2836        }
2837        /* fall through */
2838    case INDEX_op_qemu_st_a32_i32:
2839    case INDEX_op_qemu_st8_a32_i32:
2840        tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2841        break;
2842    case INDEX_op_qemu_st_a32_i64:
2843        if (TCG_TARGET_REG_BITS == 64) {
2844            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2845        } else {
2846            tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2847        }
2848        break;
2849    case INDEX_op_qemu_st_a64_i64:
2850        if (TCG_TARGET_REG_BITS == 64) {
2851            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2852        } else {
2853            tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2854        }
2855        break;
2856    case INDEX_op_qemu_st_a32_i128:
2857    case INDEX_op_qemu_st_a64_i128:
2858        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2859        tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2860        break;
2861
2862    OP_32_64(mulu2):
2863        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2864        break;
2865    OP_32_64(muls2):
2866        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2867        break;
2868    OP_32_64(add2):
2869        if (const_args[4]) {
2870            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2871        } else {
2872            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2873        }
2874        if (const_args[5]) {
2875            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2876        } else {
2877            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2878        }
2879        break;
2880    OP_32_64(sub2):
2881        if (const_args[4]) {
2882            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2883        } else {
2884            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2885        }
2886        if (const_args[5]) {
2887            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2888        } else {
2889            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2890        }
2891        break;
2892
2893#if TCG_TARGET_REG_BITS == 32
2894    case INDEX_op_brcond2_i32:
2895        tcg_out_brcond2(s, args, const_args, 0);
2896        break;
2897    case INDEX_op_setcond2_i32:
2898        tcg_out_setcond2(s, args, const_args);
2899        break;
2900#else /* TCG_TARGET_REG_BITS == 64 */
2901    case INDEX_op_ld32s_i64:
2902        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2903        break;
2904    case INDEX_op_ld_i64:
2905        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2906        break;
2907    case INDEX_op_st_i64:
2908        if (const_args[0]) {
2909            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2910            tcg_out32(s, a0);
2911        } else {
2912            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2913        }
2914        break;
2915
2916    case INDEX_op_bswap64_i64:
2917        tcg_out_bswap64(s, a0);
2918        break;
2919    case INDEX_op_extrh_i64_i32:
2920        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2921        break;
2922#endif
2923
2924    OP_32_64(deposit):
2925        if (args[3] == 0 && args[4] == 8) {
2926            /* load bits 0..7 */
2927            if (const_a2) {
2928                tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0),
2929                            0, a0, 0);
2930                tcg_out8(s, a2);
2931            } else {
2932                tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2933            }
2934        } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) {
2935            /* load bits 8..15 */
2936            if (const_a2) {
2937                tcg_out8(s, OPC_MOVB_Ib + a0 + 4);
2938                tcg_out8(s, a2);
2939            } else {
2940                tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2941            }
2942        } else if (args[3] == 0 && args[4] == 16) {
2943            /* load bits 0..15 */
2944            if (const_a2) {
2945                tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0),
2946                            0, a0, 0);
2947                tcg_out16(s, a2);
2948            } else {
2949                tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2950            }
2951        } else {
2952            g_assert_not_reached();
2953        }
2954        break;
2955
2956    case INDEX_op_extract_i64:
2957        if (a2 + args[3] == 32) {
2958            /* This is a 32-bit zero-extending right shift.  */
2959            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2960            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2961            break;
2962        }
2963        /* FALLTHRU */
2964    case INDEX_op_extract_i32:
2965        /* On the off-chance that we can use the high-byte registers.
2966           Otherwise we emit the same ext16 + shift pattern that we
2967           would have gotten from the normal tcg-op.c expansion.  */
2968        tcg_debug_assert(a2 == 8 && args[3] == 8);
2969        if (a1 < 4 && a0 < 8) {
2970            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2971        } else {
2972            tcg_out_ext16u(s, a0, a1);
2973            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2974        }
2975        break;
2976
2977    case INDEX_op_sextract_i32:
2978        /* We don't implement sextract_i64, as we cannot sign-extend to
2979           64-bits without using the REX prefix that explicitly excludes
2980           access to the high-byte registers.  */
2981        tcg_debug_assert(a2 == 8 && args[3] == 8);
2982        if (a1 < 4 && a0 < 8) {
2983            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2984        } else {
2985            tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2986            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2987        }
2988        break;
2989
2990    OP_32_64(extract2):
2991        /* Note that SHRD outputs to the r/m operand.  */
2992        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2993        tcg_out8(s, args[3]);
2994        break;
2995
2996    case INDEX_op_mb:
2997        tcg_out_mb(s, a0);
2998        break;
2999    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
3000    case INDEX_op_mov_i64:
3001    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
3002    case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
3003    case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
3004    case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
3005    case INDEX_op_ext8s_i64:
3006    case INDEX_op_ext8u_i32:
3007    case INDEX_op_ext8u_i64:
3008    case INDEX_op_ext16s_i32:
3009    case INDEX_op_ext16s_i64:
3010    case INDEX_op_ext16u_i32:
3011    case INDEX_op_ext16u_i64:
3012    case INDEX_op_ext32s_i64:
3013    case INDEX_op_ext32u_i64:
3014    case INDEX_op_ext_i32_i64:
3015    case INDEX_op_extu_i32_i64:
3016    case INDEX_op_extrl_i64_i32:
3017    default:
3018        g_assert_not_reached();
3019    }
3020
3021#undef OP_32_64
3022}
3023
3024static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
3025                           unsigned vecl, unsigned vece,
3026                           const TCGArg args[TCG_MAX_OP_ARGS],
3027                           const int const_args[TCG_MAX_OP_ARGS])
3028{
3029    static int const add_insn[4] = {
3030        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
3031    };
3032    static int const ssadd_insn[4] = {
3033        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
3034    };
3035    static int const usadd_insn[4] = {
3036        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
3037    };
3038    static int const sub_insn[4] = {
3039        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
3040    };
3041    static int const sssub_insn[4] = {
3042        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
3043    };
3044    static int const ussub_insn[4] = {
3045        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
3046    };
3047    static int const mul_insn[4] = {
3048        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
3049    };
3050    static int const shift_imm_insn[4] = {
3051        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
3052    };
3053    static int const cmpeq_insn[4] = {
3054        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
3055    };
3056    static int const cmpgt_insn[4] = {
3057        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
3058    };
3059    static int const punpckl_insn[4] = {
3060        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
3061    };
3062    static int const punpckh_insn[4] = {
3063        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
3064    };
3065    static int const packss_insn[4] = {
3066        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
3067    };
3068    static int const packus_insn[4] = {
3069        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
3070    };
3071    static int const smin_insn[4] = {
3072        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
3073    };
3074    static int const smax_insn[4] = {
3075        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
3076    };
3077    static int const umin_insn[4] = {
3078        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
3079    };
3080    static int const umax_insn[4] = {
3081        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
3082    };
3083    static int const rotlv_insn[4] = {
3084        OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
3085    };
3086    static int const rotrv_insn[4] = {
3087        OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
3088    };
3089    static int const shlv_insn[4] = {
3090        OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
3091    };
3092    static int const shrv_insn[4] = {
3093        OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
3094    };
3095    static int const sarv_insn[4] = {
3096        OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
3097    };
3098    static int const shls_insn[4] = {
3099        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
3100    };
3101    static int const shrs_insn[4] = {
3102        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
3103    };
3104    static int const sars_insn[4] = {
3105        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
3106    };
3107    static int const vpshldi_insn[4] = {
3108        OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
3109    };
3110    static int const vpshldv_insn[4] = {
3111        OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
3112    };
3113    static int const vpshrdv_insn[4] = {
3114        OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
3115    };
3116    static int const abs_insn[4] = {
3117        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
3118    };
3119
3120    TCGType type = vecl + TCG_TYPE_V64;
3121    int insn, sub;
3122    TCGArg a0, a1, a2, a3;
3123
3124    a0 = args[0];
3125    a1 = args[1];
3126    a2 = args[2];
3127
3128    switch (opc) {
3129    case INDEX_op_add_vec:
3130        insn = add_insn[vece];
3131        goto gen_simd;
3132    case INDEX_op_ssadd_vec:
3133        insn = ssadd_insn[vece];
3134        goto gen_simd;
3135    case INDEX_op_usadd_vec:
3136        insn = usadd_insn[vece];
3137        goto gen_simd;
3138    case INDEX_op_sub_vec:
3139        insn = sub_insn[vece];
3140        goto gen_simd;
3141    case INDEX_op_sssub_vec:
3142        insn = sssub_insn[vece];
3143        goto gen_simd;
3144    case INDEX_op_ussub_vec:
3145        insn = ussub_insn[vece];
3146        goto gen_simd;
3147    case INDEX_op_mul_vec:
3148        insn = mul_insn[vece];
3149        goto gen_simd;
3150    case INDEX_op_and_vec:
3151        insn = OPC_PAND;
3152        goto gen_simd;
3153    case INDEX_op_or_vec:
3154        insn = OPC_POR;
3155        goto gen_simd;
3156    case INDEX_op_xor_vec:
3157        insn = OPC_PXOR;
3158        goto gen_simd;
3159    case INDEX_op_smin_vec:
3160        insn = smin_insn[vece];
3161        goto gen_simd;
3162    case INDEX_op_umin_vec:
3163        insn = umin_insn[vece];
3164        goto gen_simd;
3165    case INDEX_op_smax_vec:
3166        insn = smax_insn[vece];
3167        goto gen_simd;
3168    case INDEX_op_umax_vec:
3169        insn = umax_insn[vece];
3170        goto gen_simd;
3171    case INDEX_op_shlv_vec:
3172        insn = shlv_insn[vece];
3173        goto gen_simd;
3174    case INDEX_op_shrv_vec:
3175        insn = shrv_insn[vece];
3176        goto gen_simd;
3177    case INDEX_op_sarv_vec:
3178        insn = sarv_insn[vece];
3179        goto gen_simd;
3180    case INDEX_op_rotlv_vec:
3181        insn = rotlv_insn[vece];
3182        goto gen_simd;
3183    case INDEX_op_rotrv_vec:
3184        insn = rotrv_insn[vece];
3185        goto gen_simd;
3186    case INDEX_op_shls_vec:
3187        insn = shls_insn[vece];
3188        goto gen_simd;
3189    case INDEX_op_shrs_vec:
3190        insn = shrs_insn[vece];
3191        goto gen_simd;
3192    case INDEX_op_sars_vec:
3193        insn = sars_insn[vece];
3194        goto gen_simd;
3195    case INDEX_op_x86_punpckl_vec:
3196        insn = punpckl_insn[vece];
3197        goto gen_simd;
3198    case INDEX_op_x86_punpckh_vec:
3199        insn = punpckh_insn[vece];
3200        goto gen_simd;
3201    case INDEX_op_x86_packss_vec:
3202        insn = packss_insn[vece];
3203        goto gen_simd;
3204    case INDEX_op_x86_packus_vec:
3205        insn = packus_insn[vece];
3206        goto gen_simd;
3207    case INDEX_op_x86_vpshldv_vec:
3208        insn = vpshldv_insn[vece];
3209        a1 = a2;
3210        a2 = args[3];
3211        goto gen_simd;
3212    case INDEX_op_x86_vpshrdv_vec:
3213        insn = vpshrdv_insn[vece];
3214        a1 = a2;
3215        a2 = args[3];
3216        goto gen_simd;
3217#if TCG_TARGET_REG_BITS == 32
3218    case INDEX_op_dup2_vec:
3219        /* First merge the two 32-bit inputs to a single 64-bit element. */
3220        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3221        /* Then replicate the 64-bit elements across the rest of the vector. */
3222        if (type != TCG_TYPE_V64) {
3223            tcg_out_dup_vec(s, type, MO_64, a0, a0);
3224        }
3225        break;
3226#endif
3227    case INDEX_op_abs_vec:
3228        insn = abs_insn[vece];
3229        a2 = a1;
3230        a1 = 0;
3231        goto gen_simd;
3232    gen_simd:
3233        tcg_debug_assert(insn != OPC_UD2);
3234        if (type == TCG_TYPE_V256) {
3235            insn |= P_VEXL;
3236        }
3237        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3238        break;
3239
3240    case INDEX_op_cmp_vec:
3241        sub = args[3];
3242        if (sub == TCG_COND_EQ) {
3243            insn = cmpeq_insn[vece];
3244        } else if (sub == TCG_COND_GT) {
3245            insn = cmpgt_insn[vece];
3246        } else {
3247            g_assert_not_reached();
3248        }
3249        goto gen_simd;
3250
3251    case INDEX_op_andc_vec:
3252        insn = OPC_PANDN;
3253        if (type == TCG_TYPE_V256) {
3254            insn |= P_VEXL;
3255        }
3256        tcg_out_vex_modrm(s, insn, a0, a2, a1);
3257        break;
3258
3259    case INDEX_op_shli_vec:
3260        insn = shift_imm_insn[vece];
3261        sub = 6;
3262        goto gen_shift;
3263    case INDEX_op_shri_vec:
3264        insn = shift_imm_insn[vece];
3265        sub = 2;
3266        goto gen_shift;
3267    case INDEX_op_sari_vec:
3268        if (vece == MO_64) {
3269            insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3270        } else {
3271            insn = shift_imm_insn[vece];
3272        }
3273        sub = 4;
3274        goto gen_shift;
3275    case INDEX_op_rotli_vec:
3276        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
3277        if (vece == MO_64) {
3278            insn |= P_VEXW;
3279        }
3280        sub = 1;
3281        goto gen_shift;
3282    gen_shift:
3283        tcg_debug_assert(vece != MO_8);
3284        if (type == TCG_TYPE_V256) {
3285            insn |= P_VEXL;
3286        }
3287        tcg_out_vex_modrm(s, insn, sub, a0, a1);
3288        tcg_out8(s, a2);
3289        break;
3290
3291    case INDEX_op_ld_vec:
3292        tcg_out_ld(s, type, a0, a1, a2);
3293        break;
3294    case INDEX_op_st_vec:
3295        tcg_out_st(s, type, a0, a1, a2);
3296        break;
3297    case INDEX_op_dupm_vec:
3298        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3299        break;
3300
3301    case INDEX_op_x86_shufps_vec:
3302        insn = OPC_SHUFPS;
3303        sub = args[3];
3304        goto gen_simd_imm8;
3305    case INDEX_op_x86_blend_vec:
3306        if (vece == MO_16) {
3307            insn = OPC_PBLENDW;
3308        } else if (vece == MO_32) {
3309            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3310        } else {
3311            g_assert_not_reached();
3312        }
3313        sub = args[3];
3314        goto gen_simd_imm8;
3315    case INDEX_op_x86_vperm2i128_vec:
3316        insn = OPC_VPERM2I128;
3317        sub = args[3];
3318        goto gen_simd_imm8;
3319    case INDEX_op_x86_vpshldi_vec:
3320        insn = vpshldi_insn[vece];
3321        sub = args[3];
3322        goto gen_simd_imm8;
3323
3324    case INDEX_op_not_vec:
3325        insn = OPC_VPTERNLOGQ;
3326        a2 = a1;
3327        sub = 0x33; /* !B */
3328        goto gen_simd_imm8;
3329    case INDEX_op_nor_vec:
3330        insn = OPC_VPTERNLOGQ;
3331        sub = 0x11; /* norCB */
3332        goto gen_simd_imm8;
3333    case INDEX_op_nand_vec:
3334        insn = OPC_VPTERNLOGQ;
3335        sub = 0x77; /* nandCB */
3336        goto gen_simd_imm8;
3337    case INDEX_op_eqv_vec:
3338        insn = OPC_VPTERNLOGQ;
3339        sub = 0x99; /* xnorCB */
3340        goto gen_simd_imm8;
3341    case INDEX_op_orc_vec:
3342        insn = OPC_VPTERNLOGQ;
3343        sub = 0xdd; /* orB!C */
3344        goto gen_simd_imm8;
3345
3346    case INDEX_op_bitsel_vec:
3347        insn = OPC_VPTERNLOGQ;
3348        a3 = args[3];
3349        if (a0 == a1) {
3350            a1 = a2;
3351            a2 = a3;
3352            sub = 0xca; /* A?B:C */
3353        } else if (a0 == a2) {
3354            a2 = a3;
3355            sub = 0xe2; /* B?A:C */
3356        } else {
3357            tcg_out_mov(s, type, a0, a3);
3358            sub = 0xb8; /* B?C:A */
3359        }
3360        goto gen_simd_imm8;
3361
3362    gen_simd_imm8:
3363        tcg_debug_assert(insn != OPC_UD2);
3364        if (type == TCG_TYPE_V256) {
3365            insn |= P_VEXL;
3366        }
3367        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3368        tcg_out8(s, sub);
3369        break;
3370
3371    case INDEX_op_x86_vpblendvb_vec:
3372        insn = OPC_VPBLENDVB;
3373        if (type == TCG_TYPE_V256) {
3374            insn |= P_VEXL;
3375        }
3376        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3377        tcg_out8(s, args[3] << 4);
3378        break;
3379
3380    case INDEX_op_x86_psrldq_vec:
3381        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3382        tcg_out8(s, a2);
3383        break;
3384
3385    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3386    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3387    default:
3388        g_assert_not_reached();
3389    }
3390}
3391
3392static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3393{
3394    switch (op) {
3395    case INDEX_op_goto_ptr:
3396        return C_O0_I1(r);
3397
3398    case INDEX_op_ld8u_i32:
3399    case INDEX_op_ld8u_i64:
3400    case INDEX_op_ld8s_i32:
3401    case INDEX_op_ld8s_i64:
3402    case INDEX_op_ld16u_i32:
3403    case INDEX_op_ld16u_i64:
3404    case INDEX_op_ld16s_i32:
3405    case INDEX_op_ld16s_i64:
3406    case INDEX_op_ld_i32:
3407    case INDEX_op_ld32u_i64:
3408    case INDEX_op_ld32s_i64:
3409    case INDEX_op_ld_i64:
3410        return C_O1_I1(r, r);
3411
3412    case INDEX_op_st8_i32:
3413    case INDEX_op_st8_i64:
3414        return C_O0_I2(qi, r);
3415
3416    case INDEX_op_st16_i32:
3417    case INDEX_op_st16_i64:
3418    case INDEX_op_st_i32:
3419    case INDEX_op_st32_i64:
3420        return C_O0_I2(ri, r);
3421
3422    case INDEX_op_st_i64:
3423        return C_O0_I2(re, r);
3424
3425    case INDEX_op_add_i32:
3426    case INDEX_op_add_i64:
3427        return C_O1_I2(r, r, re);
3428
3429    case INDEX_op_sub_i32:
3430    case INDEX_op_sub_i64:
3431    case INDEX_op_mul_i32:
3432    case INDEX_op_mul_i64:
3433    case INDEX_op_or_i32:
3434    case INDEX_op_or_i64:
3435    case INDEX_op_xor_i32:
3436    case INDEX_op_xor_i64:
3437        return C_O1_I2(r, 0, re);
3438
3439    case INDEX_op_and_i32:
3440    case INDEX_op_and_i64:
3441        return C_O1_I2(r, 0, reZ);
3442
3443    case INDEX_op_andc_i32:
3444    case INDEX_op_andc_i64:
3445        return C_O1_I2(r, r, rI);
3446
3447    case INDEX_op_shl_i32:
3448    case INDEX_op_shl_i64:
3449    case INDEX_op_shr_i32:
3450    case INDEX_op_shr_i64:
3451    case INDEX_op_sar_i32:
3452    case INDEX_op_sar_i64:
3453        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3454
3455    case INDEX_op_rotl_i32:
3456    case INDEX_op_rotl_i64:
3457    case INDEX_op_rotr_i32:
3458    case INDEX_op_rotr_i64:
3459        return C_O1_I2(r, 0, ci);
3460
3461    case INDEX_op_brcond_i32:
3462    case INDEX_op_brcond_i64:
3463        return C_O0_I2(r, reT);
3464
3465    case INDEX_op_bswap16_i32:
3466    case INDEX_op_bswap16_i64:
3467    case INDEX_op_bswap32_i32:
3468    case INDEX_op_bswap32_i64:
3469    case INDEX_op_bswap64_i64:
3470    case INDEX_op_neg_i32:
3471    case INDEX_op_neg_i64:
3472    case INDEX_op_not_i32:
3473    case INDEX_op_not_i64:
3474    case INDEX_op_extrh_i64_i32:
3475        return C_O1_I1(r, 0);
3476
3477    case INDEX_op_ext8s_i32:
3478    case INDEX_op_ext8s_i64:
3479    case INDEX_op_ext8u_i32:
3480    case INDEX_op_ext8u_i64:
3481        return C_O1_I1(r, q);
3482
3483    case INDEX_op_ext16s_i32:
3484    case INDEX_op_ext16s_i64:
3485    case INDEX_op_ext16u_i32:
3486    case INDEX_op_ext16u_i64:
3487    case INDEX_op_ext32s_i64:
3488    case INDEX_op_ext32u_i64:
3489    case INDEX_op_ext_i32_i64:
3490    case INDEX_op_extu_i32_i64:
3491    case INDEX_op_extrl_i64_i32:
3492    case INDEX_op_extract_i32:
3493    case INDEX_op_extract_i64:
3494    case INDEX_op_sextract_i32:
3495    case INDEX_op_ctpop_i32:
3496    case INDEX_op_ctpop_i64:
3497        return C_O1_I1(r, r);
3498
3499    case INDEX_op_extract2_i32:
3500    case INDEX_op_extract2_i64:
3501        return C_O1_I2(r, 0, r);
3502
3503    case INDEX_op_deposit_i32:
3504    case INDEX_op_deposit_i64:
3505        return C_O1_I2(q, 0, qi);
3506
3507    case INDEX_op_setcond_i32:
3508    case INDEX_op_setcond_i64:
3509    case INDEX_op_negsetcond_i32:
3510    case INDEX_op_negsetcond_i64:
3511        return C_O1_I2(q, r, reT);
3512
3513    case INDEX_op_movcond_i32:
3514    case INDEX_op_movcond_i64:
3515        return C_O1_I4(r, r, reT, r, 0);
3516
3517    case INDEX_op_div2_i32:
3518    case INDEX_op_div2_i64:
3519    case INDEX_op_divu2_i32:
3520    case INDEX_op_divu2_i64:
3521        return C_O2_I3(a, d, 0, 1, r);
3522
3523    case INDEX_op_mulu2_i32:
3524    case INDEX_op_mulu2_i64:
3525    case INDEX_op_muls2_i32:
3526    case INDEX_op_muls2_i64:
3527        return C_O2_I2(a, d, a, r);
3528
3529    case INDEX_op_add2_i32:
3530    case INDEX_op_add2_i64:
3531    case INDEX_op_sub2_i32:
3532    case INDEX_op_sub2_i64:
3533        return C_N1_O1_I4(r, r, 0, 1, re, re);
3534
3535    case INDEX_op_ctz_i32:
3536    case INDEX_op_ctz_i64:
3537        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3538
3539    case INDEX_op_clz_i32:
3540    case INDEX_op_clz_i64:
3541        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3542
3543    case INDEX_op_qemu_ld_a32_i32:
3544        return C_O1_I1(r, L);
3545    case INDEX_op_qemu_ld_a64_i32:
3546        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3547
3548    case INDEX_op_qemu_st_a32_i32:
3549        return C_O0_I2(L, L);
3550    case INDEX_op_qemu_st_a64_i32:
3551        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3552    case INDEX_op_qemu_st8_a32_i32:
3553        return C_O0_I2(s, L);
3554    case INDEX_op_qemu_st8_a64_i32:
3555        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3556
3557    case INDEX_op_qemu_ld_a32_i64:
3558        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3559    case INDEX_op_qemu_ld_a64_i64:
3560        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3561
3562    case INDEX_op_qemu_st_a32_i64:
3563        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3564    case INDEX_op_qemu_st_a64_i64:
3565        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3566
3567    case INDEX_op_qemu_ld_a32_i128:
3568    case INDEX_op_qemu_ld_a64_i128:
3569        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3570        return C_O2_I1(r, r, L);
3571    case INDEX_op_qemu_st_a32_i128:
3572    case INDEX_op_qemu_st_a64_i128:
3573        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3574        return C_O0_I3(L, L, L);
3575
3576    case INDEX_op_brcond2_i32:
3577        return C_O0_I4(r, r, ri, ri);
3578
3579    case INDEX_op_setcond2_i32:
3580        return C_O1_I4(r, r, r, ri, ri);
3581
3582    case INDEX_op_ld_vec:
3583    case INDEX_op_dupm_vec:
3584        return C_O1_I1(x, r);
3585
3586    case INDEX_op_st_vec:
3587        return C_O0_I2(x, r);
3588
3589    case INDEX_op_add_vec:
3590    case INDEX_op_sub_vec:
3591    case INDEX_op_mul_vec:
3592    case INDEX_op_and_vec:
3593    case INDEX_op_or_vec:
3594    case INDEX_op_xor_vec:
3595    case INDEX_op_andc_vec:
3596    case INDEX_op_orc_vec:
3597    case INDEX_op_nand_vec:
3598    case INDEX_op_nor_vec:
3599    case INDEX_op_eqv_vec:
3600    case INDEX_op_ssadd_vec:
3601    case INDEX_op_usadd_vec:
3602    case INDEX_op_sssub_vec:
3603    case INDEX_op_ussub_vec:
3604    case INDEX_op_smin_vec:
3605    case INDEX_op_umin_vec:
3606    case INDEX_op_smax_vec:
3607    case INDEX_op_umax_vec:
3608    case INDEX_op_shlv_vec:
3609    case INDEX_op_shrv_vec:
3610    case INDEX_op_sarv_vec:
3611    case INDEX_op_rotlv_vec:
3612    case INDEX_op_rotrv_vec:
3613    case INDEX_op_shls_vec:
3614    case INDEX_op_shrs_vec:
3615    case INDEX_op_sars_vec:
3616    case INDEX_op_cmp_vec:
3617    case INDEX_op_x86_shufps_vec:
3618    case INDEX_op_x86_blend_vec:
3619    case INDEX_op_x86_packss_vec:
3620    case INDEX_op_x86_packus_vec:
3621    case INDEX_op_x86_vperm2i128_vec:
3622    case INDEX_op_x86_punpckl_vec:
3623    case INDEX_op_x86_punpckh_vec:
3624    case INDEX_op_x86_vpshldi_vec:
3625#if TCG_TARGET_REG_BITS == 32
3626    case INDEX_op_dup2_vec:
3627#endif
3628        return C_O1_I2(x, x, x);
3629
3630    case INDEX_op_abs_vec:
3631    case INDEX_op_dup_vec:
3632    case INDEX_op_not_vec:
3633    case INDEX_op_shli_vec:
3634    case INDEX_op_shri_vec:
3635    case INDEX_op_sari_vec:
3636    case INDEX_op_rotli_vec:
3637    case INDEX_op_x86_psrldq_vec:
3638        return C_O1_I1(x, x);
3639
3640    case INDEX_op_x86_vpshldv_vec:
3641    case INDEX_op_x86_vpshrdv_vec:
3642        return C_O1_I3(x, 0, x, x);
3643
3644    case INDEX_op_bitsel_vec:
3645    case INDEX_op_x86_vpblendvb_vec:
3646        return C_O1_I3(x, x, x, x);
3647
3648    default:
3649        g_assert_not_reached();
3650    }
3651}
3652
3653int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3654{
3655    switch (opc) {
3656    case INDEX_op_add_vec:
3657    case INDEX_op_sub_vec:
3658    case INDEX_op_and_vec:
3659    case INDEX_op_or_vec:
3660    case INDEX_op_xor_vec:
3661    case INDEX_op_andc_vec:
3662    case INDEX_op_orc_vec:
3663    case INDEX_op_nand_vec:
3664    case INDEX_op_nor_vec:
3665    case INDEX_op_eqv_vec:
3666    case INDEX_op_not_vec:
3667    case INDEX_op_bitsel_vec:
3668        return 1;
3669    case INDEX_op_cmp_vec:
3670    case INDEX_op_cmpsel_vec:
3671        return -1;
3672
3673    case INDEX_op_rotli_vec:
3674        return have_avx512vl && vece >= MO_32 ? 1 : -1;
3675
3676    case INDEX_op_shli_vec:
3677    case INDEX_op_shri_vec:
3678        /* We must expand the operation for MO_8.  */
3679        return vece == MO_8 ? -1 : 1;
3680
3681    case INDEX_op_sari_vec:
3682        switch (vece) {
3683        case MO_8:
3684            return -1;
3685        case MO_16:
3686        case MO_32:
3687            return 1;
3688        case MO_64:
3689            if (have_avx512vl) {
3690                return 1;
3691            }
3692            /*
3693             * We can emulate this for MO_64, but it does not pay off
3694             * unless we're producing at least 4 values.
3695             */
3696            return type >= TCG_TYPE_V256 ? -1 : 0;
3697        }
3698        return 0;
3699
3700    case INDEX_op_shls_vec:
3701    case INDEX_op_shrs_vec:
3702        return vece >= MO_16;
3703    case INDEX_op_sars_vec:
3704        switch (vece) {
3705        case MO_16:
3706        case MO_32:
3707            return 1;
3708        case MO_64:
3709            return have_avx512vl;
3710        }
3711        return 0;
3712    case INDEX_op_rotls_vec:
3713        return vece >= MO_16 ? -1 : 0;
3714
3715    case INDEX_op_shlv_vec:
3716    case INDEX_op_shrv_vec:
3717        switch (vece) {
3718        case MO_16:
3719            return have_avx512bw;
3720        case MO_32:
3721        case MO_64:
3722            return have_avx2;
3723        }
3724        return 0;
3725    case INDEX_op_sarv_vec:
3726        switch (vece) {
3727        case MO_16:
3728            return have_avx512bw;
3729        case MO_32:
3730            return have_avx2;
3731        case MO_64:
3732            return have_avx512vl;
3733        }
3734        return 0;
3735    case INDEX_op_rotlv_vec:
3736    case INDEX_op_rotrv_vec:
3737        switch (vece) {
3738        case MO_16:
3739            return have_avx512vbmi2 ? -1 : 0;
3740        case MO_32:
3741        case MO_64:
3742            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3743        }
3744        return 0;
3745
3746    case INDEX_op_mul_vec:
3747        switch (vece) {
3748        case MO_8:
3749            return -1;
3750        case MO_64:
3751            return have_avx512dq;
3752        }
3753        return 1;
3754
3755    case INDEX_op_ssadd_vec:
3756    case INDEX_op_usadd_vec:
3757    case INDEX_op_sssub_vec:
3758    case INDEX_op_ussub_vec:
3759        return vece <= MO_16;
3760    case INDEX_op_smin_vec:
3761    case INDEX_op_smax_vec:
3762    case INDEX_op_umin_vec:
3763    case INDEX_op_umax_vec:
3764    case INDEX_op_abs_vec:
3765        return vece <= MO_32 || have_avx512vl;
3766
3767    default:
3768        return 0;
3769    }
3770}
3771
3772static void expand_vec_shi(TCGType type, unsigned vece, bool right,
3773                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3774{
3775    uint8_t mask;
3776
3777    tcg_debug_assert(vece == MO_8);
3778    if (right) {
3779        mask = 0xff >> imm;
3780        tcg_gen_shri_vec(MO_16, v0, v1, imm);
3781    } else {
3782        mask = 0xff << imm;
3783        tcg_gen_shli_vec(MO_16, v0, v1, imm);
3784    }
3785    tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask));
3786}
3787
3788static void expand_vec_sari(TCGType type, unsigned vece,
3789                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3790{
3791    TCGv_vec t1, t2;
3792
3793    switch (vece) {
3794    case MO_8:
3795        /* Unpack to 16-bit, shift, and repack.  */
3796        t1 = tcg_temp_new_vec(type);
3797        t2 = tcg_temp_new_vec(type);
3798        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3799                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3800        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3801                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3802        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3803        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3804        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3805                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3806        tcg_temp_free_vec(t1);
3807        tcg_temp_free_vec(t2);
3808        break;
3809
3810    case MO_64:
3811        t1 = tcg_temp_new_vec(type);
3812        if (imm <= 32) {
3813            /*
3814             * We can emulate a small sign extend by performing an arithmetic
3815             * 32-bit shift and overwriting the high half of a 64-bit logical
3816             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3817             * does not, so we have to bound the smaller shift -- we get the
3818             * same result in the high half either way.
3819             */
3820            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3821            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3822            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3823                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3824                      tcgv_vec_arg(t1), 0xaa);
3825        } else {
3826            /* Otherwise we will need to use a compare vs 0 to produce
3827             * the sign-extend, shift and merge.
3828             */
3829            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3830                            tcg_constant_vec(type, MO_64, 0), v1);
3831            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3832            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3833            tcg_gen_or_vec(MO_64, v0, v0, t1);
3834        }
3835        tcg_temp_free_vec(t1);
3836        break;
3837
3838    default:
3839        g_assert_not_reached();
3840    }
3841}
3842
3843static void expand_vec_rotli(TCGType type, unsigned vece,
3844                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3845{
3846    TCGv_vec t;
3847
3848    if (vece != MO_8 && have_avx512vbmi2) {
3849        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3850                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3851        return;
3852    }
3853
3854    t = tcg_temp_new_vec(type);
3855    tcg_gen_shli_vec(vece, t, v1, imm);
3856    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3857    tcg_gen_or_vec(vece, v0, v0, t);
3858    tcg_temp_free_vec(t);
3859}
3860
3861static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3862                            TCGv_vec v1, TCGv_vec sh, bool right)
3863{
3864    TCGv_vec t;
3865
3866    if (have_avx512vbmi2) {
3867        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3868                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3869                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3870        return;
3871    }
3872
3873    t = tcg_temp_new_vec(type);
3874    tcg_gen_dupi_vec(vece, t, 8 << vece);
3875    tcg_gen_sub_vec(vece, t, t, sh);
3876    if (right) {
3877        tcg_gen_shlv_vec(vece, t, v1, t);
3878        tcg_gen_shrv_vec(vece, v0, v1, sh);
3879    } else {
3880        tcg_gen_shrv_vec(vece, t, v1, t);
3881        tcg_gen_shlv_vec(vece, v0, v1, sh);
3882    }
3883    tcg_gen_or_vec(vece, v0, v0, t);
3884    tcg_temp_free_vec(t);
3885}
3886
3887static void expand_vec_rotls(TCGType type, unsigned vece,
3888                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3889{
3890    TCGv_vec t = tcg_temp_new_vec(type);
3891
3892    tcg_debug_assert(vece != MO_8);
3893
3894    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3895        tcg_gen_dup_i32_vec(vece, t, lsh);
3896        if (vece >= MO_32) {
3897            tcg_gen_rotlv_vec(vece, v0, v1, t);
3898        } else {
3899            expand_vec_rotv(type, vece, v0, v1, t, false);
3900        }
3901    } else {
3902        TCGv_i32 rsh = tcg_temp_new_i32();
3903
3904        tcg_gen_neg_i32(rsh, lsh);
3905        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3906        tcg_gen_shls_vec(vece, t, v1, lsh);
3907        tcg_gen_shrs_vec(vece, v0, v1, rsh);
3908        tcg_gen_or_vec(vece, v0, v0, t);
3909
3910        tcg_temp_free_i32(rsh);
3911    }
3912
3913    tcg_temp_free_vec(t);
3914}
3915
3916static void expand_vec_mul(TCGType type, unsigned vece,
3917                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3918{
3919    TCGv_vec t1, t2, t3, t4, zero;
3920
3921    tcg_debug_assert(vece == MO_8);
3922
3923    /*
3924     * Unpack v1 bytes to words, 0 | x.
3925     * Unpack v2 bytes to words, y | 0.
3926     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3927     * Shift logical right by 8 bits to clear the high 8 bytes before
3928     * using an unsigned saturated pack.
3929     *
3930     * The difference between the V64, V128 and V256 cases is merely how
3931     * we distribute the expansion between temporaries.
3932     */
3933    switch (type) {
3934    case TCG_TYPE_V64:
3935        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3936        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3937        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3938        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3939                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3940        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3941                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3942        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3943        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3944        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3945                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3946        tcg_temp_free_vec(t1);
3947        tcg_temp_free_vec(t2);
3948        break;
3949
3950    case TCG_TYPE_V128:
3951    case TCG_TYPE_V256:
3952        t1 = tcg_temp_new_vec(type);
3953        t2 = tcg_temp_new_vec(type);
3954        t3 = tcg_temp_new_vec(type);
3955        t4 = tcg_temp_new_vec(type);
3956        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3957        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3958                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3959        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3960                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3961        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3962                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3963        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3964                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3965        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3966        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3967        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3968        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3969        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3970                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3971        tcg_temp_free_vec(t1);
3972        tcg_temp_free_vec(t2);
3973        tcg_temp_free_vec(t3);
3974        tcg_temp_free_vec(t4);
3975        break;
3976
3977    default:
3978        g_assert_not_reached();
3979    }
3980}
3981
3982static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3983                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3984{
3985    enum {
3986        NEED_INV  = 1,
3987        NEED_SWAP = 2,
3988        NEED_BIAS = 4,
3989        NEED_UMIN = 8,
3990        NEED_UMAX = 16,
3991    };
3992    TCGv_vec t1, t2, t3;
3993    uint8_t fixup;
3994
3995    switch (cond) {
3996    case TCG_COND_EQ:
3997    case TCG_COND_GT:
3998        fixup = 0;
3999        break;
4000    case TCG_COND_NE:
4001    case TCG_COND_LE:
4002        fixup = NEED_INV;
4003        break;
4004    case TCG_COND_LT:
4005        fixup = NEED_SWAP;
4006        break;
4007    case TCG_COND_GE:
4008        fixup = NEED_SWAP | NEED_INV;
4009        break;
4010    case TCG_COND_LEU:
4011        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
4012            fixup = NEED_UMIN;
4013        } else {
4014            fixup = NEED_BIAS | NEED_INV;
4015        }
4016        break;
4017    case TCG_COND_GTU:
4018        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
4019            fixup = NEED_UMIN | NEED_INV;
4020        } else {
4021            fixup = NEED_BIAS;
4022        }
4023        break;
4024    case TCG_COND_GEU:
4025        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
4026            fixup = NEED_UMAX;
4027        } else {
4028            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
4029        }
4030        break;
4031    case TCG_COND_LTU:
4032        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
4033            fixup = NEED_UMAX | NEED_INV;
4034        } else {
4035            fixup = NEED_BIAS | NEED_SWAP;
4036        }
4037        break;
4038    default:
4039        g_assert_not_reached();
4040    }
4041
4042    if (fixup & NEED_INV) {
4043        cond = tcg_invert_cond(cond);
4044    }
4045    if (fixup & NEED_SWAP) {
4046        t1 = v1, v1 = v2, v2 = t1;
4047        cond = tcg_swap_cond(cond);
4048    }
4049
4050    t1 = t2 = NULL;
4051    if (fixup & (NEED_UMIN | NEED_UMAX)) {
4052        t1 = tcg_temp_new_vec(type);
4053        if (fixup & NEED_UMIN) {
4054            tcg_gen_umin_vec(vece, t1, v1, v2);
4055        } else {
4056            tcg_gen_umax_vec(vece, t1, v1, v2);
4057        }
4058        v2 = t1;
4059        cond = TCG_COND_EQ;
4060    } else if (fixup & NEED_BIAS) {
4061        t1 = tcg_temp_new_vec(type);
4062        t2 = tcg_temp_new_vec(type);
4063        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
4064        tcg_gen_sub_vec(vece, t1, v1, t3);
4065        tcg_gen_sub_vec(vece, t2, v2, t3);
4066        v1 = t1;
4067        v2 = t2;
4068        cond = tcg_signed_cond(cond);
4069    }
4070
4071    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
4072    /* Expand directly; do not recurse.  */
4073    vec_gen_4(INDEX_op_cmp_vec, type, vece,
4074              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
4075
4076    if (t1) {
4077        tcg_temp_free_vec(t1);
4078        if (t2) {
4079            tcg_temp_free_vec(t2);
4080        }
4081    }
4082    return fixup & NEED_INV;
4083}
4084
4085static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
4086                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
4087{
4088    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
4089        tcg_gen_not_vec(vece, v0, v0);
4090    }
4091}
4092
4093static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
4094                              TCGv_vec c1, TCGv_vec c2,
4095                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
4096{
4097    TCGv_vec t = tcg_temp_new_vec(type);
4098
4099    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
4100        /* Invert the sense of the compare by swapping arguments.  */
4101        TCGv_vec x;
4102        x = v3, v3 = v4, v4 = x;
4103    }
4104    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
4105              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
4106              tcgv_vec_arg(v3), tcgv_vec_arg(t));
4107    tcg_temp_free_vec(t);
4108}
4109
4110void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
4111                       TCGArg a0, ...)
4112{
4113    va_list va;
4114    TCGArg a2;
4115    TCGv_vec v0, v1, v2, v3, v4;
4116
4117    va_start(va, a0);
4118    v0 = temp_tcgv_vec(arg_temp(a0));
4119    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4120    a2 = va_arg(va, TCGArg);
4121
4122    switch (opc) {
4123    case INDEX_op_shli_vec:
4124        expand_vec_shi(type, vece, false, v0, v1, a2);
4125        break;
4126    case INDEX_op_shri_vec:
4127        expand_vec_shi(type, vece, true, v0, v1, a2);
4128        break;
4129    case INDEX_op_sari_vec:
4130        expand_vec_sari(type, vece, v0, v1, a2);
4131        break;
4132
4133    case INDEX_op_rotli_vec:
4134        expand_vec_rotli(type, vece, v0, v1, a2);
4135        break;
4136
4137    case INDEX_op_rotls_vec:
4138        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
4139        break;
4140
4141    case INDEX_op_rotlv_vec:
4142        v2 = temp_tcgv_vec(arg_temp(a2));
4143        expand_vec_rotv(type, vece, v0, v1, v2, false);
4144        break;
4145    case INDEX_op_rotrv_vec:
4146        v2 = temp_tcgv_vec(arg_temp(a2));
4147        expand_vec_rotv(type, vece, v0, v1, v2, true);
4148        break;
4149
4150    case INDEX_op_mul_vec:
4151        v2 = temp_tcgv_vec(arg_temp(a2));
4152        expand_vec_mul(type, vece, v0, v1, v2);
4153        break;
4154
4155    case INDEX_op_cmp_vec:
4156        v2 = temp_tcgv_vec(arg_temp(a2));
4157        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
4158        break;
4159
4160    case INDEX_op_cmpsel_vec:
4161        v2 = temp_tcgv_vec(arg_temp(a2));
4162        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4163        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4164        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
4165        break;
4166
4167    default:
4168        break;
4169    }
4170
4171    va_end(va);
4172}
4173
4174static const int tcg_target_callee_save_regs[] = {
4175#if TCG_TARGET_REG_BITS == 64
4176    TCG_REG_RBP,
4177    TCG_REG_RBX,
4178#if defined(_WIN64)
4179    TCG_REG_RDI,
4180    TCG_REG_RSI,
4181#endif
4182    TCG_REG_R12,
4183    TCG_REG_R13,
4184    TCG_REG_R14, /* Currently used for the global env. */
4185    TCG_REG_R15,
4186#else
4187    TCG_REG_EBP, /* Currently used for the global env. */
4188    TCG_REG_EBX,
4189    TCG_REG_ESI,
4190    TCG_REG_EDI,
4191#endif
4192};
4193
/*
 * The frame size is computed with macros so that the same value can be
 * used by tcg_target_qemu_prologue and by the debug_frame unwind data
 * passed on from tcg_register_jit.
 */

/*
 * Bytes occupied by the pushed callee-saved registers plus one extra
 * register-sized slot (presumably the return address).
 */
#define PUSH_SIZE \
    ((TCG_TARGET_REG_BITS / 8) \
     * (ARRAY_SIZE(tcg_target_callee_save_regs) + 1))

/*
 * PUSH_SIZE plus the static call-argument area and the TCG temp
 * buffer, rounded up to a multiple of TCG_TARGET_STACK_ALIGN.
 */
#define FRAME_SIZE \
    ((PUSH_SIZE + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + (TCG_TARGET_STACK_ALIGN - 1)) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
4207
4208/* Generate global QEMU prologue and epilogue code */
4209static void tcg_target_qemu_prologue(TCGContext *s)
4210{
4211    int i, stack_addend;
4212
4213    /* TB prologue */
4214
4215    /* Reserve some stack space, also for TCG temps.  */
4216    stack_addend = FRAME_SIZE - PUSH_SIZE;
4217    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4218                  CPU_TEMP_BUF_NLONGS * sizeof(long));
4219
4220    /* Save all callee saved registers.  */
4221    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4222        tcg_out_push(s, tcg_target_callee_save_regs[i]);
4223    }
4224
4225    if (!tcg_use_softmmu && guest_base) {
4226        int seg = setup_guest_base_seg();
4227        if (seg != 0) {
4228            x86_guest_base.seg = seg;
4229        } else if (guest_base == (int32_t)guest_base) {
4230            x86_guest_base.ofs = guest_base;
4231        } else {
4232            assert(TCG_TARGET_REG_BITS == 64);
4233            /* Choose R12 because, as a base, it requires a SIB byte. */
4234            x86_guest_base.index = TCG_REG_R12;
4235            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
4236            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4237        }
4238    }
4239
4240    if (TCG_TARGET_REG_BITS == 32) {
4241        tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4242                   (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4243        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4244        /* jmp *tb.  */
4245        tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4246                             (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4247                             + stack_addend);
4248    } else {
4249        tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4250        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4251        /* jmp *tb.  */
4252        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4253    }
4254
4255    /*
4256     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4257     * and fall through to the rest of the epilogue.
4258     */
4259    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4260    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4261
4262    /* TB epilogue */
4263    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4264
4265    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4266
4267    if (have_avx2) {
4268        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4269    }
4270    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4271        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4272    }
4273    tcg_out_opc(s, OPC_RET, 0, 0, 0);
4274}
4275
4276static void tcg_out_tb_start(TCGContext *s)
4277{
4278    /* nothing to do */
4279}
4280
4281static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4282{
4283    memset(p, 0x90, count);
4284}
4285
4286static void tcg_target_init(TCGContext *s)
4287{
4288    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4289    if (TCG_TARGET_REG_BITS == 64) {
4290        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4291    }
4292    if (have_avx1) {
4293        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4294        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4295    }
4296    if (have_avx2) {
4297        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4298    }
4299
4300    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4301    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4302    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4303    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4304    if (TCG_TARGET_REG_BITS == 64) {
4305#if !defined(_WIN64)
4306        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4307        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4308#endif
4309        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4310        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4311        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4312        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4313    }
4314
4315    s->reserved_regs = 0;
4316    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4317    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4318#ifdef _WIN64
4319    /* These are call saved, and we don't save them, so don't use them. */
4320    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4321    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4322    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4323    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4324    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4325    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4326    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4327    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4328    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4329    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4330#endif
4331}
4332
4333typedef struct {
4334    DebugFrameHeader h;
4335    uint8_t fde_def_cfa[4];
4336    uint8_t fde_reg_ofs[14];
4337} DebugFrame;
4338
4339/* We're expecting a 2 byte uleb128 encoded value.  */
4340QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4341
4342#if !defined(__ELF__)
4343    /* Host machine without ELF. */
4344#elif TCG_TARGET_REG_BITS == 64
4345#define ELF_HOST_MACHINE EM_X86_64
4346static const DebugFrame debug_frame = {
4347    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4348    .h.cie.id = -1,
4349    .h.cie.version = 1,
4350    .h.cie.code_align = 1,
4351    .h.cie.data_align = 0x78,             /* sleb128 -8 */
4352    .h.cie.return_column = 16,
4353
4354    /* Total FDE size does not include the "len" member.  */
4355    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4356
4357    .fde_def_cfa = {
4358        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
4359        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4360        (FRAME_SIZE >> 7)
4361    },
4362    .fde_reg_ofs = {
4363        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
4364        /* The following ordering must match tcg_target_callee_save_regs.  */
4365        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
4366        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
4367        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
4368        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
4369        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
4370        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
4371    }
4372};
4373#else
4374#define ELF_HOST_MACHINE EM_386
4375static const DebugFrame debug_frame = {
4376    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4377    .h.cie.id = -1,
4378    .h.cie.version = 1,
4379    .h.cie.code_align = 1,
4380    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
4381    .h.cie.return_column = 8,
4382
4383    /* Total FDE size does not include the "len" member.  */
4384    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4385
4386    .fde_def_cfa = {
4387        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
4388        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4389        (FRAME_SIZE >> 7)
4390    },
4391    .fde_reg_ofs = {
4392        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4393        /* The following ordering must match tcg_target_callee_save_regs.  */
4394        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4395        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4396        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4397        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
4398    }
4399};
4400#endif
4401
#ifdef ELF_HOST_MACHINE
/*
 * Register the code buffer [@buf, @buf + @buf_size) together with this
 * backend's static unwind description.  Only built when an ELF host
 * machine has been selected.
 */
void tcg_register_jit(const void *buf, size_t buf_size)
{
    const DebugFrame *frame = &debug_frame;

    tcg_register_jit_int(buf, buf_size, frame, sizeof(debug_frame));
}
#endif
4408