xref: /openbmc/qemu/tcg/i386/tcg-target.c.inc (revision 72fa42cfca7060fab00c534e71fc850b194a4c6d)
1/*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#include "../tcg-ldst.c.inc"
26#include "../tcg-pool.c.inc"
27
#ifdef CONFIG_DEBUG_TCG
/*
 * Printable names of the host registers, indexed by register number:
 * sixteen general-purpose slots first, followed by the vector registers.
 * Only built when TCG debugging is configured.
 */
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
    /* Slots 0-7: spelled according to the host word size. */
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx",
    "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx",
    "%esp", "%ebp", "%esi", "%edi",
#endif
    /* Slots 8-15. */
    "%r8",  "%r9",  "%r10", "%r11",
    "%r12", "%r13", "%r14", "%r15",
    /* Slots 16 and up: vector registers. */
    "%xmm0", "%xmm1", "%xmm2", "%xmm3",
    "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif
43
/*
 * Registers listed in allocation order: general-purpose registers first,
 * then vector registers.  The stack pointer is deliberately absent from
 * the list.
 */
static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saved (nonvolatile), and we
       do not save any of them.  Therefore only allow xmm0-xmm5 to be
       allocated on that host; everywhere else the remaining vector
       registers are listed here.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};
93
/* NOTE(review): presumably the vector scratch register the backend keeps
   for its own use; its users are outside this section -- confirm.  XMM5
   lies within the xmm0-xmm5 range that is allocatable on every host.  */
#define TCG_TMP_VEC  TCG_REG_XMM5
95
/*
 * Integer argument registers for helper calls, in argument order.
 * The list is empty on 32-bit hosts.
 */
static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
# ifdef _WIN64
    TCG_REG_RCX, TCG_REG_RDX,
# else
    TCG_REG_RDI, TCG_REG_RSI, TCG_REG_RDX, TCG_REG_RCX,
# endif
    /* The trailing two registers are common to both 64-bit conventions. */
    TCG_REG_R8, TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};
113
114static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
115{
116    switch (kind) {
117    case TCG_CALL_RET_NORMAL:
118        tcg_debug_assert(slot >= 0 && slot <= 1);
119        return slot ? TCG_REG_EDX : TCG_REG_EAX;
120#ifdef _WIN64
121    case TCG_CALL_RET_BY_VEC:
122        tcg_debug_assert(slot == 0);
123        return TCG_REG_XMM0;
124#endif
125    default:
126        g_assert_not_reached();
127    }
128}
129
/* Constants we accept.  */
/*
 * Target-specific constraint bits, interpreted by tcg_target_const_match():
 *   S32 - value equals its own sign-extension from 32 bits
 *   U32 - value equals its own zero-extension from 32 bits
 *   I32 - the bitwise inverse of the value fits in a signed 32 bits
 *   WSZ - value equals the operation width (32 or 64)
 *   TST - for test conditions, additionally accept 64-bit values that are
 *         zero-extended 32-bit or a power of two
 * For 32-bit operations S32/U32/I32/TST accept any constant.
 */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800
#define TCG_CT_CONST_TST 0x1000

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

/*
 * Register-set bitmasks, one bit per register number.  General registers
 * occupy the low bits and vector registers start at bit 16.  On 32-bit
 * hosts ALL_BYTEL_REGS covers only registers 0-3; on 64-bit hosts it is
 * the same as ALL_GENERAL_REGS.
 */
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
# define ALL_BYTEL_REGS        0x0000000fu
#endif
/* Mask of TCG_REG_L0 and TCG_REG_L1 when softmmu is in use, else empty. */
#define SOFTMMU_RESERVE_REGS \
    (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)

/* For 64-bit, we always know that CMOV is available.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov      true
#else
# define have_cmov      (cpuinfo & CPUINFO_CMOV)
#endif
/* Host feature tests read from the cpuinfo flag word. */
#define have_bmi2       (cpuinfo & CPUINFO_BMI2)
#define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)
168
/* NOTE(review): presumably the address generated code branches to when
   leaving a translation block; it is assigned outside this section --
   confirm.  */
static const tcg_insn_unit *tb_ret_addr;
170
171static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
172                        intptr_t value, intptr_t addend)
173{
174    value += addend;
175    switch(type) {
176    case R_386_PC32:
177        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
178        if (value != (int32_t)value) {
179            return false;
180        }
181        /* FALLTHRU */
182    case R_386_32:
183        tcg_patch32(code_ptr, value);
184        break;
185    case R_386_PC8:
186        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
187        if (value != (int8_t)value) {
188            return false;
189        }
190        tcg_patch8(code_ptr, value);
191        break;
192    default:
193        g_assert_not_reached();
194    }
195    return true;
196}
197
198/* test if a constant matches the constraint */
199static bool tcg_target_const_match(int64_t val, int ct,
200                                   TCGType type, TCGCond cond, int vece)
201{
202    if (ct & TCG_CT_CONST) {
203        return 1;
204    }
205    if (type == TCG_TYPE_I32) {
206        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 |
207                  TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) {
208            return 1;
209        }
210    } else {
211        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
212            return 1;
213        }
214        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
215            return 1;
216        }
217        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
218            return 1;
219        }
220        /*
221         * This will be used in combination with TCG_CT_CONST_S32,
222         * so "normal" TESTQ is already matched.  Also accept:
223         *    TESTQ -> TESTL   (uint32_t)
224         *    TESTQ -> BT      (is_power_of_2)
225         */
226        if ((ct & TCG_CT_CONST_TST)
227            && is_tst_cond(cond)
228            && (val == (uint32_t)val || is_power_of_2(val))) {
229            return 1;
230        }
231    }
232    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
233        return 1;
234    }
235    return 0;
236}
237
/* Low three bits of a register number, as stored in ModRM/SIB fields. */
# define LOWREGMASK(x)	((x) & 7)

/*
 * Prefix/encoding request flags.  They are ORed into an opcode value above
 * its low eight bits; the tcg_out_*opc emitters test these flags and then
 * output only the low byte as the opcode itself.  On 32-bit hosts the
 * REX-related flags and P_GS are defined as 0 and so never take effect.
 */
#define P_EXT		0x100		/* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#define P_VEXW          0x1000          /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_REXW		0
# define P_REXB_R	0
# define P_REXB_RM	0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
#define P_EVEX          0x100000        /* Requires EVEX encoding */
260
/*
 * Opcode table.  Each value is the final opcode byte ORed with the P_*
 * flags selecting the prefixes, escape bytes and VEX/EVEX bits that must
 * accompany it.  Entries carrying P_EVEX are routed to the EVEX emitter
 * by tcg_out_vex_modrm().
 */
#define OPC_ARITH_EbIb	(0x80)
#define OPC_ARITH_EvIz	(0x81)
#define OPC_ARITH_EvIb	(0x83)
#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP	(0xc8 | P_EXT)
#define OPC_CALL_Jz	(0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32	(0x48)
#define OPC_IMUL_GvEv	(0xaf | P_EXT)
#define OPC_IMUL_GvEvIb	(0x6b)
#define OPC_IMUL_GvEvIz	(0x69)
#define OPC_INC_r32	(0x40)
#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
#define OPC_JCC_short	(0x70)		/* ... plus condition code */
#define OPC_JMP_long	(0xe9)
#define OPC_JMP_short	(0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz	(0xc7)
#define OPC_MOVB_Ib     (0xb0)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL	(0xbe | P_EXT)
#define OPC_MOVSWL	(0xbf | P_EXT)
#define OPC_MOVSLQ	(0x63 | P_REXW)
#define OPC_MOVZBL	(0xb6 | P_EXT)
#define OPC_MOVZWL	(0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
#define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32	(0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32	(0x50)
#define OPC_PUSH_Iv	(0x68)
#define OPC_PUSH_Ib	(0x6a)
#define OPC_RET		(0xc3)
#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1	(0xd1)
#define OPC_SHIFT_Ib	(0xc1)
#define OPC_SHIFT_cl	(0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTB	(0x84)
#define OPC_TESTL	(0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32	(0x90)
#define OPC_XCHG_EvGv   (0x87)

/* Group opcodes: the operation is selected by the ModRM reg field. */
#define OPC_GRP3_Eb     (0xf6)
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
#define OPC_GRPBT       (0xba | P_EXT)

/* Opcode extensions to be used with OPC_GRPBT. */
#define OPC_GRPBT_BT    4
#define OPC_GRPBT_BTS   5
#define OPC_GRPBT_BTR   6
#define OPC_GRPBT_BTC   7
465
/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_TESTi 0
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev	0
#define EXT5_DEC_Ev	1
#define EXT5_CALLN_Ev	2
#define EXT5_JMPN_Ev	4

/* Condition codes to be added to OPC_JCC_{long,short}.  */
/* JCC_JMP is a negative sentinel, not an encodable condition code. */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf
517
518static const uint8_t tcg_cond_to_jcc[] = {
519    [TCG_COND_EQ] = JCC_JE,
520    [TCG_COND_NE] = JCC_JNE,
521    [TCG_COND_LT] = JCC_JL,
522    [TCG_COND_GE] = JCC_JGE,
523    [TCG_COND_LE] = JCC_JLE,
524    [TCG_COND_GT] = JCC_JG,
525    [TCG_COND_LTU] = JCC_JB,
526    [TCG_COND_GEU] = JCC_JAE,
527    [TCG_COND_LEU] = JCC_JBE,
528    [TCG_COND_GTU] = JCC_JA,
529    [TCG_COND_TSTEQ] = JCC_JE,
530    [TCG_COND_TSTNE] = JCC_JNE,
531};
532
533#if TCG_TARGET_REG_BITS == 64
534static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
535{
536    int rex;
537
538    if (opc & P_GS) {
539        tcg_out8(s, 0x65);
540    }
541    if (opc & P_DATA16) {
542        /* We should never be asking for both 16 and 64-bit operation.  */
543        tcg_debug_assert((opc & P_REXW) == 0);
544        tcg_out8(s, 0x66);
545    }
546    if (opc & P_SIMDF3) {
547        tcg_out8(s, 0xf3);
548    } else if (opc & P_SIMDF2) {
549        tcg_out8(s, 0xf2);
550    }
551
552    rex = 0;
553    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
554    rex |= (r & 8) >> 1;                /* REX.R */
555    rex |= (x & 8) >> 2;                /* REX.X */
556    rex |= (rm & 8) >> 3;               /* REX.B */
557
558    /* P_REXB_{R,RM} indicates that the given register is the low byte.
559       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
560       as otherwise the encoding indicates %[abcd]h.  Note that the values
561       that are ORed in merely indicate that the REX byte must be present;
562       those bits get discarded in output.  */
563    rex |= opc & (r >= 4 ? P_REXB_R : 0);
564    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
565
566    if (rex) {
567        tcg_out8(s, (uint8_t)(rex | 0x40));
568    }
569
570    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
571        tcg_out8(s, 0x0f);
572        if (opc & P_EXT38) {
573            tcg_out8(s, 0x38);
574        } else if (opc & P_EXT3A) {
575            tcg_out8(s, 0x3a);
576        }
577    }
578
579    tcg_out8(s, opc);
580}
581#else
582static void tcg_out_opc(TCGContext *s, int opc)
583{
584    if (opc & P_DATA16) {
585        tcg_out8(s, 0x66);
586    }
587    if (opc & P_SIMDF3) {
588        tcg_out8(s, 0xf3);
589    } else if (opc & P_SIMDF2) {
590        tcg_out8(s, 0xf2);
591    }
592    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
593        tcg_out8(s, 0x0f);
594        if (opc & P_EXT38) {
595            tcg_out8(s, 0x38);
596        } else if (opc & P_EXT3A) {
597            tcg_out8(s, 0x3a);
598        }
599    }
600    tcg_out8(s, opc);
601}
602/* Discard the register arguments to tcg_out_opc early, so as not to penalize
603   the 32-bit compilation paths.  This method works with all versions of gcc,
604   whereas relying on optimization may not be able to exclude them.  */
605#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
606#endif
607
608static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
609{
610    tcg_out_opc(s, opc, r, rm, 0);
611    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
612}
613
614static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
615                            int rm, int index)
616{
617    int tmp;
618
619    if (opc & P_GS) {
620        tcg_out8(s, 0x65);
621    }
622    /* Use the two byte form if possible, which cannot encode
623       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
624    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
625        && ((rm | index) & 8) == 0) {
626        /* Two byte VEX prefix.  */
627        tcg_out8(s, 0xc5);
628
629        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
630    } else {
631        /* Three byte VEX prefix.  */
632        tcg_out8(s, 0xc4);
633
634        /* VEX.m-mmmm */
635        if (opc & P_EXT3A) {
636            tmp = 3;
637        } else if (opc & P_EXT38) {
638            tmp = 2;
639        } else if (opc & P_EXT) {
640            tmp = 1;
641        } else {
642            g_assert_not_reached();
643        }
644        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
645        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
646        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
647        tcg_out8(s, tmp);
648
649        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
650    }
651
652    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
653    /* VEX.pp */
654    if (opc & P_DATA16) {
655        tmp |= 1;                          /* 0x66 */
656    } else if (opc & P_SIMDF3) {
657        tmp |= 2;                          /* 0xf3 */
658    } else if (opc & P_SIMDF2) {
659        tmp |= 3;                          /* 0xf2 */
660    }
661    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
662    tcg_out8(s, tmp);
663    tcg_out8(s, opc);
664}
665
666static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
667                             int rm, int index)
668{
669    /* The entire 4-byte evex prefix; with R' and V' set. */
670    uint32_t p = 0x08041062;
671    int mm, pp;
672
673    tcg_debug_assert(have_avx512vl);
674
675    /* EVEX.mm */
676    if (opc & P_EXT3A) {
677        mm = 3;
678    } else if (opc & P_EXT38) {
679        mm = 2;
680    } else if (opc & P_EXT) {
681        mm = 1;
682    } else {
683        g_assert_not_reached();
684    }
685
686    /* EVEX.pp */
687    if (opc & P_DATA16) {
688        pp = 1;                          /* 0x66 */
689    } else if (opc & P_SIMDF3) {
690        pp = 2;                          /* 0xf3 */
691    } else if (opc & P_SIMDF2) {
692        pp = 3;                          /* 0xf2 */
693    } else {
694        pp = 0;
695    }
696
697    p = deposit32(p, 8, 2, mm);
698    p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
699    p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
700    p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
701    p = deposit32(p, 16, 2, pp);
702    p = deposit32(p, 19, 4, ~v);
703    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
704    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
705
706    tcg_out32(s, p);
707    tcg_out8(s, opc);
708}
709
710static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
711{
712    if (opc & P_EVEX) {
713        tcg_out_evex_opc(s, opc, r, v, rm, 0);
714    } else {
715        tcg_out_vex_opc(s, opc, r, v, rm, 0);
716    }
717    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
718}
719
720/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
721   We handle either RM and INDEX missing with a negative value.  In 64-bit
722   mode for absolute addresses, ~RM is the size of the immediate operand
723   that will follow the instruction.  */
724
725static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
726                               int shift, intptr_t offset)
727{
728    int mod, len;
729
730    if (index < 0 && rm < 0) {
731        if (TCG_TARGET_REG_BITS == 64) {
732            /* Try for a rip-relative addressing mode.  This has replaced
733               the 32-bit-mode absolute addressing encoding.  */
734            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
735            intptr_t disp = offset - pc;
736            if (disp == (int32_t)disp) {
737                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
738                tcg_out32(s, disp);
739                return;
740            }
741
742            /* Try for an absolute address encoding.  This requires the
743               use of the MODRM+SIB encoding and is therefore larger than
744               rip-relative addressing.  */
745            if (offset == (int32_t)offset) {
746                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
747                tcg_out8(s, (4 << 3) | 5);
748                tcg_out32(s, offset);
749                return;
750            }
751
752            /* ??? The memory isn't directly addressable.  */
753            g_assert_not_reached();
754        } else {
755            /* Absolute address.  */
756            tcg_out8(s, (r << 3) | 5);
757            tcg_out32(s, offset);
758            return;
759        }
760    }
761
762    /* Find the length of the immediate addend.  Note that the encoding
763       that would be used for (%ebp) indicates absolute addressing.  */
764    if (rm < 0) {
765        mod = 0, len = 4, rm = 5;
766    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
767        mod = 0, len = 0;
768    } else if (offset == (int8_t)offset) {
769        mod = 0x40, len = 1;
770    } else {
771        mod = 0x80, len = 4;
772    }
773
774    /* Use a single byte MODRM format if possible.  Note that the encoding
775       that would be used for %esp is the escape to the two byte form.  */
776    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
777        /* Single byte MODRM format.  */
778        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
779    } else {
780        /* Two byte MODRM+SIB format.  */
781
782        /* Note that the encoding that would place %esp into the index
783           field indicates no index register.  In 64-bit mode, the REX.X
784           bit counts, so %r12 can be used as the index.  */
785        if (index < 0) {
786            index = 4;
787        } else {
788            tcg_debug_assert(index != TCG_REG_ESP);
789        }
790
791        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
792        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
793    }
794
795    if (len == 1) {
796        tcg_out8(s, offset);
797    } else if (len == 4) {
798        tcg_out32(s, offset);
799    }
800}
801
802static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
803                                     int index, int shift, intptr_t offset)
804{
805    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
806    tcg_out_sib_offset(s, r, rm, index, shift, offset);
807}
808
809static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
810                                         int rm, int index, int shift,
811                                         intptr_t offset)
812{
813    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
814    tcg_out_sib_offset(s, r, rm, index, shift, offset);
815}
816
817/* A simplification of the above with no index or shift.  */
818static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
819                                        int rm, intptr_t offset)
820{
821    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
822}
823
824static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
825                                            int v, int rm, intptr_t offset)
826{
827    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
828}
829
830/* Output an opcode with an expected reference to the constant pool.  */
831static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
832{
833    tcg_out_opc(s, opc, r, 0, 0);
834    /* Absolute for 32-bit, pc-relative for 64-bit.  */
835    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
836    tcg_out32(s, 0);
837}
838
839/* Output an opcode with an expected reference to the constant pool.  */
840static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
841{
842    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
843    /* Absolute for 32-bit, pc-relative for 64-bit.  */
844    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
845    tcg_out32(s, 0);
846}
847
848/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
849static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
850{
851    /* Propagate an opcode prefix, such as P_REXW.  */
852    int ext = subop & ~0x7;
853    subop &= 0x7;
854
855    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
856}
857
858static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
859{
860    int rexw = 0;
861
862    if (arg == ret) {
863        return true;
864    }
865    switch (type) {
866    case TCG_TYPE_I64:
867        rexw = P_REXW;
868        /* fallthru */
869    case TCG_TYPE_I32:
870        if (ret < 16) {
871            if (arg < 16) {
872                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
873            } else {
874                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
875            }
876        } else {
877            if (arg < 16) {
878                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
879            } else {
880                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
881            }
882        }
883        break;
884
885    case TCG_TYPE_V64:
886        tcg_debug_assert(ret >= 16 && arg >= 16);
887        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
888        break;
889    case TCG_TYPE_V128:
890        tcg_debug_assert(ret >= 16 && arg >= 16);
891        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
892        break;
893    case TCG_TYPE_V256:
894        tcg_debug_assert(ret >= 16 && arg >= 16);
895        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
896        break;
897
898    default:
899        g_assert_not_reached();
900    }
901    return true;
902}
903
904static const int avx2_dup_insn[4] = {
905    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
906    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
907};
908
909static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
910                            TCGReg r, TCGReg a)
911{
912    if (have_avx2) {
913        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
914        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
915    } else {
916        switch (vece) {
917        case MO_8:
918            /* ??? With zero in a register, use PSHUFB.  */
919            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
920            a = r;
921            /* FALLTHRU */
922        case MO_16:
923            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
924            a = r;
925            /* FALLTHRU */
926        case MO_32:
927            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
928            /* imm8 operand: all output lanes selected from input lane 0.  */
929            tcg_out8(s, 0);
930            break;
931        case MO_64:
932            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
933            break;
934        default:
935            g_assert_not_reached();
936        }
937    }
938    return true;
939}
940
941static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
942                             TCGReg r, TCGReg base, intptr_t offset)
943{
944    if (have_avx2) {
945        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
946        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
947                                 r, 0, base, offset);
948    } else {
949        switch (vece) {
950        case MO_64:
951            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
952            break;
953        case MO_32:
954            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
955            break;
956        case MO_16:
957            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
958            tcg_out8(s, 0); /* imm8 */
959            tcg_out_dup_vec(s, type, vece, r, r);
960            break;
961        case MO_8:
962            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
963            tcg_out8(s, 0); /* imm8 */
964            tcg_out_dup_vec(s, type, vece, r, r);
965            break;
966        default:
967            g_assert_not_reached();
968        }
969    }
970    return true;
971}
972
973static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
974                             TCGReg ret, int64_t arg)
975{
976    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
977
978    if (arg == 0) {
979        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
980        return;
981    }
982    if (arg == -1) {
983        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
984        return;
985    }
986
987    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
988        if (have_avx2) {
989            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
990        } else {
991            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
992        }
993        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
994    } else {
995        if (type == TCG_TYPE_V64) {
996            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
997        } else if (have_avx2) {
998            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
999        } else {
1000            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
1001        }
1002        if (TCG_TARGET_REG_BITS == 64) {
1003            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1004        } else {
1005            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
1006        }
1007    }
1008}
1009
1010static void tcg_out_movi_vec(TCGContext *s, TCGType type,
1011                             TCGReg ret, tcg_target_long arg)
1012{
1013    if (arg == 0) {
1014        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
1015        return;
1016    }
1017    if (arg == -1) {
1018        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
1019        return;
1020    }
1021
1022    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
1023    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1024    if (TCG_TARGET_REG_BITS == 64) {
1025        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1026    } else {
1027        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1028    }
1029}
1030
1031static void tcg_out_movi_int(TCGContext *s, TCGType type,
1032                             TCGReg ret, tcg_target_long arg)
1033{
1034    tcg_target_long diff;
1035
1036    if (arg == 0) {
1037        tgen_arithr(s, ARITH_XOR, ret, ret);
1038        return;
1039    }
1040    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1041        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1042        tcg_out32(s, arg);
1043        return;
1044    }
1045    if (arg == (int32_t)arg) {
1046        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1047        tcg_out32(s, arg);
1048        return;
1049    }
1050
1051    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1052    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1053    if (diff == (int32_t)diff) {
1054        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1055        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1056        tcg_out32(s, diff);
1057        return;
1058    }
1059
1060    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1061    tcg_out64(s, arg);
1062}
1063
1064static void tcg_out_movi(TCGContext *s, TCGType type,
1065                         TCGReg ret, tcg_target_long arg)
1066{
1067    switch (type) {
1068    case TCG_TYPE_I32:
1069#if TCG_TARGET_REG_BITS == 64
1070    case TCG_TYPE_I64:
1071#endif
1072        if (ret < 16) {
1073            tcg_out_movi_int(s, type, ret, arg);
1074        } else {
1075            tcg_out_movi_vec(s, type, ret, arg);
1076        }
1077        break;
1078    default:
1079        g_assert_not_reached();
1080    }
1081}
1082
1083static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1084{
1085    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1086    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1087    return true;
1088}
1089
1090static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1091                             tcg_target_long imm)
1092{
1093    /* This function is only used for passing structs by reference. */
1094    tcg_debug_assert(imm == (int32_t)imm);
1095    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1096}
1097
1098static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1099{
1100    if (val == (int8_t)val) {
1101        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1102        tcg_out8(s, val);
1103    } else if (val == (int32_t)val) {
1104        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1105        tcg_out32(s, val);
1106    } else {
1107        g_assert_not_reached();
1108    }
1109}
1110
1111static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1112{
1113    /* Given the strength of x86 memory ordering, we only need care for
1114       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1115       faster than "mfence", so don't bother with the sse insn.  */
1116    if (a0 & TCG_MO_ST_LD) {
1117        tcg_out8(s, 0xf0);
1118        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1119        tcg_out8(s, 0);
1120    }
1121}
1122
1123static inline void tcg_out_push(TCGContext *s, int reg)
1124{
1125    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1126}
1127
1128static inline void tcg_out_pop(TCGContext *s, int reg)
1129{
1130    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1131}
1132
1133static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1134                       TCGReg arg1, intptr_t arg2)
1135{
1136    switch (type) {
1137    case TCG_TYPE_I32:
1138        if (ret < 16) {
1139            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1140        } else {
1141            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1142        }
1143        break;
1144    case TCG_TYPE_I64:
1145        if (ret < 16) {
1146            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1147            break;
1148        }
1149        /* FALLTHRU */
1150    case TCG_TYPE_V64:
1151        /* There is no instruction that can validate 8-byte alignment.  */
1152        tcg_debug_assert(ret >= 16);
1153        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1154        break;
1155    case TCG_TYPE_V128:
1156        /*
1157         * The gvec infrastructure is asserts that v128 vector loads
1158         * and stores use a 16-byte aligned offset.  Validate that the
1159         * final pointer is aligned by using an insn that will SIGSEGV.
1160         */
1161        tcg_debug_assert(ret >= 16);
1162        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1163        break;
1164    case TCG_TYPE_V256:
1165        /*
1166         * The gvec infrastructure only requires 16-byte alignment,
1167         * so here we must use an unaligned load.
1168         */
1169        tcg_debug_assert(ret >= 16);
1170        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1171                                 ret, 0, arg1, arg2);
1172        break;
1173    default:
1174        g_assert_not_reached();
1175    }
1176}
1177
1178static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1179                       TCGReg arg1, intptr_t arg2)
1180{
1181    switch (type) {
1182    case TCG_TYPE_I32:
1183        if (arg < 16) {
1184            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1185        } else {
1186            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1187        }
1188        break;
1189    case TCG_TYPE_I64:
1190        if (arg < 16) {
1191            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1192            break;
1193        }
1194        /* FALLTHRU */
1195    case TCG_TYPE_V64:
1196        /* There is no instruction that can validate 8-byte alignment.  */
1197        tcg_debug_assert(arg >= 16);
1198        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1199        break;
1200    case TCG_TYPE_V128:
1201        /*
1202         * The gvec infrastructure is asserts that v128 vector loads
1203         * and stores use a 16-byte aligned offset.  Validate that the
1204         * final pointer is aligned by using an insn that will SIGSEGV.
1205         *
1206         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1207         * for _WIN64, which must have SSE2 but may not have AVX.
1208         */
1209        tcg_debug_assert(arg >= 16);
1210        if (have_avx1) {
1211            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1212        } else {
1213            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1214        }
1215        break;
1216    case TCG_TYPE_V256:
1217        /*
1218         * The gvec infrastructure only requires 16-byte alignment,
1219         * so here we must use an unaligned store.
1220         */
1221        tcg_debug_assert(arg >= 16);
1222        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1223                                 arg, 0, arg1, arg2);
1224        break;
1225    default:
1226        g_assert_not_reached();
1227    }
1228}
1229
1230static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1231                        TCGReg base, intptr_t ofs)
1232{
1233    int rexw = 0;
1234    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1235        if (val != (int32_t)val) {
1236            return false;
1237        }
1238        rexw = P_REXW;
1239    } else if (type != TCG_TYPE_I32) {
1240        return false;
1241    }
1242    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1243    tcg_out32(s, val);
1244    return true;
1245}
1246
1247static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1248{
1249    /* Propagate an opcode prefix, such as P_DATA16.  */
1250    int ext = subopc & ~0x7;
1251    subopc &= 0x7;
1252
1253    if (count == 1) {
1254        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1255    } else {
1256        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1257        tcg_out8(s, count);
1258    }
1259}
1260
1261static inline void tcg_out_bswap32(TCGContext *s, int reg)
1262{
1263    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1264}
1265
1266static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1267{
1268    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1269}
1270
1271static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1272{
1273    /* movzbl */
1274    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1275    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1276}
1277
1278static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1279{
1280    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1281    /* movsbl */
1282    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1283    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1284}
1285
1286static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1287{
1288    /* movzwl */
1289    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1290}
1291
1292static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1293{
1294    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1295    /* movsw[lq] */
1296    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1297}
1298
1299static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1300{
1301    /* 32-bit mov zero extends.  */
1302    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1303}
1304
1305static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1306{
1307    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1308    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1309}
1310
1311static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1312{
1313    tcg_out_ext32s(s, dest, src);
1314}
1315
1316static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1317{
1318    if (dest != src) {
1319        tcg_out_ext32u(s, dest, src);
1320    }
1321}
1322
1323static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1324{
1325    tcg_out_ext32u(s, dest, src);
1326}
1327
1328static inline void tcg_out_bswap64(TCGContext *s, int reg)
1329{
1330    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1331}
1332
1333static void tgen_arithi(TCGContext *s, int c, int r0,
1334                        tcg_target_long val, int cf)
1335{
1336    int rexw = 0;
1337
1338    if (TCG_TARGET_REG_BITS == 64) {
1339        rexw = c & -8;
1340        c &= 7;
1341    }
1342
1343    switch (c) {
1344    case ARITH_ADD:
1345    case ARITH_SUB:
1346        if (!cf) {
1347            /*
1348             * ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1349             * partial flags update stalls on Pentium4 and are not recommended
1350             * by current Intel optimization manuals.
1351             */
1352            if (val == 1 || val == -1) {
1353                int is_inc = (c == ARITH_ADD) ^ (val < 0);
1354                if (TCG_TARGET_REG_BITS == 64) {
1355                    /*
1356                     * The single-byte increment encodings are re-tasked
1357                     * as the REX prefixes.  Use the MODRM encoding.
1358                     */
1359                    tcg_out_modrm(s, OPC_GRP5 + rexw,
1360                                  (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1361                } else {
1362                    tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1363                }
1364                return;
1365            }
1366            if (val == 128) {
1367                /*
1368                 * Facilitate using an 8-bit immediate.  Carry is inverted
1369                 * by this transformation, so do it only if cf == 0.
1370                 */
1371                c ^= ARITH_ADD ^ ARITH_SUB;
1372                val = -128;
1373            }
1374        }
1375        break;
1376
1377    case ARITH_AND:
1378        if (TCG_TARGET_REG_BITS == 64) {
1379            if (val == 0xffffffffu) {
1380                tcg_out_ext32u(s, r0, r0);
1381                return;
1382            }
1383            if (val == (uint32_t)val) {
1384                /* AND with no high bits set can use a 32-bit operation.  */
1385                rexw = 0;
1386            }
1387        }
1388        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1389            tcg_out_ext8u(s, r0, r0);
1390            return;
1391        }
1392        if (val == 0xffffu) {
1393            tcg_out_ext16u(s, r0, r0);
1394            return;
1395        }
1396        break;
1397
1398    case ARITH_OR:
1399    case ARITH_XOR:
1400        if (val >= 0x80 && val <= 0xff
1401            && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1402            tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
1403            tcg_out8(s, val);
1404            return;
1405        }
1406        break;
1407    }
1408
1409    if (val == (int8_t)val) {
1410        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1411        tcg_out8(s, val);
1412        return;
1413    }
1414    if (rexw == 0 || val == (int32_t)val) {
1415        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1416        tcg_out32(s, val);
1417        return;
1418    }
1419
1420    g_assert_not_reached();
1421}
1422
1423static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1424{
1425    if (val != 0) {
1426        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1427    }
1428}
1429
1430/* Set SMALL to force a short forward branch.  */
1431static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1432{
1433    int32_t val, val1;
1434
1435    if (l->has_value) {
1436        val = tcg_pcrel_diff(s, l->u.value_ptr);
1437        val1 = val - 2;
1438        if ((int8_t)val1 == val1) {
1439            if (opc == -1) {
1440                tcg_out8(s, OPC_JMP_short);
1441            } else {
1442                tcg_out8(s, OPC_JCC_short + opc);
1443            }
1444            tcg_out8(s, val1);
1445        } else {
1446            tcg_debug_assert(!small);
1447            if (opc == -1) {
1448                tcg_out8(s, OPC_JMP_long);
1449                tcg_out32(s, val - 5);
1450            } else {
1451                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1452                tcg_out32(s, val - 6);
1453            }
1454        }
1455    } else if (small) {
1456        if (opc == -1) {
1457            tcg_out8(s, OPC_JMP_short);
1458        } else {
1459            tcg_out8(s, OPC_JCC_short + opc);
1460        }
1461        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1462        s->code_ptr += 1;
1463    } else {
1464        if (opc == -1) {
1465            tcg_out8(s, OPC_JMP_long);
1466        } else {
1467            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1468        }
1469        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1470        s->code_ptr += 4;
1471    }
1472}
1473
1474static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1,
1475                       TCGArg arg2, int const_arg2, int rexw)
1476{
1477    int jz, js;
1478
1479    if (!is_tst_cond(cond)) {
1480        if (!const_arg2) {
1481            tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1482        } else if (arg2 == 0) {
1483            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1484        } else {
1485            tcg_debug_assert(!rexw || arg2 == (int32_t)arg2);
1486            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1487        }
1488        return tcg_cond_to_jcc[cond];
1489    }
1490
1491    jz = tcg_cond_to_jcc[cond];
1492    js = (cond == TCG_COND_TSTNE ? JCC_JS : JCC_JNS);
1493
1494    if (!const_arg2) {
1495        tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2);
1496        return jz;
1497    }
1498
1499    if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) {
1500        if (arg2 == 0x80) {
1501            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
1502            return js;
1503        }
1504        if (arg2 == 0xff) {
1505            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
1506            return jz;
1507        }
1508        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1);
1509        tcg_out8(s, arg2);
1510        return jz;
1511    }
1512
1513    if ((arg2 & ~0xff00) == 0 && arg1 < 4) {
1514        if (arg2 == 0x8000) {
1515            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
1516            return js;
1517        }
1518        if (arg2 == 0xff00) {
1519            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
1520            return jz;
1521        }
1522        tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4);
1523        tcg_out8(s, arg2 >> 8);
1524        return jz;
1525    }
1526
1527    if (arg2 == 0xffff) {
1528        tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1);
1529        return jz;
1530    }
1531    if (arg2 == 0xffffffffu) {
1532        tcg_out_modrm(s, OPC_TESTL, arg1, arg1);
1533        return jz;
1534    }
1535
1536    if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) {
1537        int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE);
1538        int sh = ctz64(arg2);
1539
1540        rexw = (sh & 32 ? P_REXW : 0);
1541        if ((sh & 31) == 31) {
1542            tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1);
1543            return js;
1544        } else {
1545            tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1);
1546            tcg_out8(s, sh);
1547            return jc;
1548        }
1549    }
1550
1551    if (rexw) {
1552        if (arg2 == (uint32_t)arg2) {
1553            rexw = 0;
1554        } else {
1555            tcg_debug_assert(arg2 == (int32_t)arg2);
1556        }
1557    }
1558    tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1);
1559    tcg_out32(s, arg2);
1560    return jz;
1561}
1562
1563static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
1564                           TCGArg arg1, TCGArg arg2, int const_arg2,
1565                           TCGLabel *label, bool small)
1566{
1567    int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw);
1568    tcg_out_jxx(s, jcc, label, small);
1569}
1570
#if TCG_TARGET_REG_BITS == 32
/*
 * Emit a conditional branch on a two-word comparison.
 *
 * ARGS holds: [0]/[1] the two words of the first operand, [2]/[3] the
 * two words of the second operand, [4] the condition and [5] the target
 * label.  For the ordered conditions the [1]/[3] pair is compared first
 * and the [0]/[2] pair only decides the result when the first pair is
 * equal, with the second comparison always unsigned.
 * NOTE(review): this is consistent with [1]/[3] being the high words;
 * confirm against the opcode definition.
 */
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, bool small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);
    TCGCond cond = args[4];
    TCGCond first_cond, second_cond;

    switch (cond) {
    case TCG_COND_EQ:
    case TCG_COND_TSTEQ:
        /* Skip the branch unless both word pairs satisfy COND.  */
        tcg_out_brcond(s, 0, tcg_invert_cond(cond),
                       args[0], args[2], const_args[2], label_next, 1);
        tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_label(s, label_next);
        return;

    case TCG_COND_NE:
    case TCG_COND_TSTNE:
        /* Branch if either word pair satisfies COND.  */
        tcg_out_brcond(s, 0, cond, args[0], args[2], const_args[2],
                       label_this, small);
        tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_label(s, label_next);
        return;

    case TCG_COND_LT:
        first_cond = TCG_COND_LT;
        second_cond = TCG_COND_LTU;
        break;
    case TCG_COND_LE:
        first_cond = TCG_COND_LT;
        second_cond = TCG_COND_LEU;
        break;
    case TCG_COND_GT:
        first_cond = TCG_COND_GT;
        second_cond = TCG_COND_GTU;
        break;
    case TCG_COND_GE:
        first_cond = TCG_COND_GT;
        second_cond = TCG_COND_GEU;
        break;
    case TCG_COND_LTU:
        first_cond = TCG_COND_LTU;
        second_cond = TCG_COND_LTU;
        break;
    case TCG_COND_LEU:
        first_cond = TCG_COND_LTU;
        second_cond = TCG_COND_LEU;
        break;
    case TCG_COND_GTU:
        first_cond = TCG_COND_GTU;
        second_cond = TCG_COND_GTU;
        break;
    case TCG_COND_GEU:
        first_cond = TCG_COND_GTU;
        second_cond = TCG_COND_GEU;
        break;
    default:
        g_assert_not_reached();
    }

    /*
     * Strict comparison of the first pair decides the result outright.
     * If that pair differs the branch is not taken; only when it is
     * equal does the second pair get compared.
     */
    tcg_out_brcond(s, 0, first_cond, args[1], args[3], const_args[3],
                   label_this, small);
    tcg_out_jxx(s, JCC_JNE, label_next, 1);
    tcg_out_brcond(s, 0, second_cond, args[0], args[2], const_args[2],
                   label_this, small);
    tcg_out_label(s, label_next);
}
#endif
1656
1657static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
1658                            TCGArg dest, TCGArg arg1, TCGArg arg2,
1659                            int const_arg2, bool neg)
1660{
1661    int cmp_rexw = rexw;
1662    bool inv = false;
1663    bool cleared;
1664    int jcc;
1665
1666    switch (cond) {
1667    case TCG_COND_NE:
1668        inv = true;
1669        /* fall through */
1670    case TCG_COND_EQ:
1671        /* If arg2 is 0, convert to LTU/GEU vs 1. */
1672        if (const_arg2 && arg2 == 0) {
1673            arg2 = 1;
1674            goto do_ltu;
1675        }
1676        break;
1677
1678    case TCG_COND_TSTNE:
1679        inv = true;
1680        /* fall through */
1681    case TCG_COND_TSTEQ:
1682        /* If arg2 is -1, convert to LTU/GEU vs 1. */
1683        if (const_arg2 && arg2 == 0xffffffffu) {
1684            arg2 = 1;
1685            cmp_rexw = 0;
1686            goto do_ltu;
1687        }
1688        break;
1689
1690    case TCG_COND_LEU:
1691        inv = true;
1692        /* fall through */
1693    case TCG_COND_GTU:
1694        /* If arg2 is a register, swap for LTU/GEU. */
1695        if (!const_arg2) {
1696            TCGReg t = arg1;
1697            arg1 = arg2;
1698            arg2 = t;
1699            goto do_ltu;
1700        }
1701        break;
1702
1703    case TCG_COND_GEU:
1704        inv = true;
1705        /* fall through */
1706    case TCG_COND_LTU:
1707    do_ltu:
1708        /*
1709         * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
1710         * We can then use NEG or INC to produce the desired result.
1711         * This is always smaller than the SETCC expansion.
1712         */
1713        tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw);
1714
1715        /* X - X - C = -C = (C ? -1 : 0) */
1716        tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
1717        if (inv && neg) {
1718            /* ~(C ? -1 : 0) = (C ? 0 : -1) */
1719            tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1720        } else if (inv) {
1721            /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
1722            tgen_arithi(s, ARITH_ADD, dest, 1, 0);
1723        } else if (!neg) {
1724            /* -(C ? -1 : 0) = (C ? 1 : 0) */
1725            tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
1726        }
1727        return;
1728
1729    case TCG_COND_GE:
1730        inv = true;
1731        /* fall through */
1732    case TCG_COND_LT:
1733        /* If arg2 is 0, extract the sign bit. */
1734        if (const_arg2 && arg2 == 0) {
1735            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1);
1736            if (inv) {
1737                tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1738            }
1739            tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
1740                           dest, rexw ? 63 : 31);
1741            return;
1742        }
1743        break;
1744
1745    default:
1746        break;
1747    }
1748
1749    /*
1750     * If dest does not overlap the inputs, clearing it first is preferred.
1751     * The XOR breaks any false dependency for the low-byte write to dest,
1752     * and is also one byte smaller than MOVZBL.
1753     */
1754    cleared = false;
1755    if (dest != arg1 && (const_arg2 || dest != arg2)) {
1756        tgen_arithr(s, ARITH_XOR, dest, dest);
1757        cleared = true;
1758    }
1759
1760    jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw);
1761    tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest);
1762
1763    if (!cleared) {
1764        tcg_out_ext8u(s, dest, dest);
1765    }
1766    if (neg) {
1767        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
1768    }
1769}
1770
#if TCG_TARGET_REG_BITS == 32
/*
 * Set args[0] to 1 or 0 according to a two-word comparison.
 *
 * ARGS holds the destination in [0] followed by the five operands that
 * tcg_out_brcond2 expects (two word pairs and the condition); CONST_ARGS
 * is indexed the same way.  The result is produced by branching around
 * loads of the destination.
 */
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg br_args[6];
    TCGLabel *label_true, *label_over;
    bool dest_overlaps;

    /* Operands for brcond2 are everything after the destination.  */
    memcpy(br_args, args + 1, 5 * sizeof(TCGArg));

    dest_overlaps = args[0] == args[1]
                    || args[0] == args[2]
                    || (!const_args[3] && args[0] == args[3])
                    || (!const_args[4] && args[0] == args[4]);

    if (dest_overlaps) {
        /* When the destination overlaps with one of the argument
           registers, don't do anything tricky.  */
        label_true = gen_new_label();
        label_over = gen_new_label();

        br_args[5] = label_arg(label_true);
        tcg_out_brcond2(s, br_args, const_args + 1, 1);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
        tcg_out_jxx(s, JCC_JMP, label_over, 1);
        tcg_out_label(s, label_true);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
        tcg_out_label(s, label_over);
        return;
    }

    /* When the destination does not overlap one of the arguments,
       clear the destination first, jump if cond false, and emit an
       increment in the true case.  This results in smaller code.  */
    tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

    label_over = gen_new_label();
    br_args[4] = tcg_invert_cond(br_args[4]);
    br_args[5] = label_arg(label_over);
    tcg_out_brcond2(s, br_args, const_args + 1, 1);

    tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
    tcg_out_label(s, label_over);
}
#endif
1814
1815static void tcg_out_cmov(TCGContext *s, int jcc, int rexw,
1816                         TCGReg dest, TCGReg v1)
1817{
1818    if (have_cmov) {
1819        tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1);
1820    } else {
1821        TCGLabel *over = gen_new_label();
1822        tcg_out_jxx(s, jcc ^ 1, over, 1);
1823        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1824        tcg_out_label(s, over);
1825    }
1826}
1827
1828static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
1829                            TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
1830                            TCGReg v1)
1831{
1832    int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw);
1833    tcg_out_cmov(s, jcc, rexw, dest, v1);
1834}
1835
1836static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1837                        TCGArg arg2, bool const_a2)
1838{
1839    if (have_bmi1) {
1840        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1841        if (const_a2) {
1842            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1843        } else {
1844            tcg_debug_assert(dest != arg2);
1845            tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
1846        }
1847    } else {
1848        tcg_debug_assert(dest != arg2);
1849        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1850        tcg_out_cmov(s, JCC_JE, rexw, dest, arg2);
1851    }
1852}
1853
1854static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1855                        TCGArg arg2, bool const_a2)
1856{
1857    if (have_lzcnt) {
1858        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1859        if (const_a2) {
1860            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1861        } else {
1862            tcg_debug_assert(dest != arg2);
1863            tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
1864        }
1865    } else {
1866        tcg_debug_assert(!const_a2);
1867        tcg_debug_assert(dest != arg1);
1868        tcg_debug_assert(dest != arg2);
1869
1870        /* Recall that the output of BSR is the index not the count.  */
1871        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1872        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1873
1874        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1875        int jcc = tcg_out_cmp(s, TCG_COND_EQ, arg1, 0, 1, rexw);
1876        tcg_out_cmov(s, jcc, rexw, dest, arg2);
1877    }
1878}
1879
1880static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1881{
1882    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1883
1884    if (disp == (int32_t)disp) {
1885        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1886        tcg_out32(s, disp);
1887    } else {
1888        /* rip-relative addressing into the constant pool.
1889           This is 6 + 8 = 14 bytes, as compared to using an
1890           immediate load 10 + 6 = 16 bytes, plus we may
1891           be able to re-use the pool constant for more calls.  */
1892        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1893        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1894        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1895        tcg_out32(s, 0);
1896    }
1897}
1898
1899static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1900                         const TCGHelperInfo *info)
1901{
1902    tcg_out_branch(s, 1, dest);
1903
1904#ifndef _WIN32
1905    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1906        /*
1907         * The sysv i386 abi for struct return places a reference as the
1908         * first argument of the stack, and pops that argument with the
1909         * return statement.  Since we want to retain the aligned stack
1910         * pointer for the callee, we do not want to actually push that
1911         * argument before the call but rely on the normal store to the
1912         * stack slot.  But we do need to compensate for the pop in order
1913         * to reset our correct stack pointer value.
1914         * Pushing a garbage value back onto the stack is quickest.
1915         */
1916        tcg_out_push(s, TCG_REG_EAX);
1917    }
1918#endif
1919}
1920
1921static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1922{
1923    tcg_out_branch(s, 0, dest);
1924}
1925
1926static void tcg_out_nopn(TCGContext *s, int n)
1927{
1928    int i;
1929    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1930     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1931     * duplicate prefix, and all of the interesting recent cores can
1932     * decode and discard the duplicates in a single cycle.
1933     */
1934    tcg_debug_assert(n >= 1);
1935    for (i = 1; i < n; ++i) {
1936        tcg_out8(s, 0x66);
1937    }
1938    tcg_out8(s, 0x90);
1939}
1940
/*
 * Host addressing operands for a guest memory access, as produced by
 * prepare_host_addr() and consumed by the qemu_ld/st direct emitters.
 */
typedef struct {
    TCGReg base;        /* base register; set to the guest address register */
    int index;          /* index register, or -1 in the user-only template */
    int ofs;            /* constant displacement added to base + index */
    int seg;            /* added into the opcode of each access; 0, or the
                           value returned by setup_guest_base_seg() such as
                           P_GS -- NOTE(review): assignment to the user-only
                           template is outside this chunk, confirm */
    TCGAtomAlign aa;    /* atomicity/alignment from atom_and_align_for_opc() */
} HostAddress;
1948
1949bool tcg_target_has_memory_bswap(MemOp memop)
1950{
1951    TCGAtomAlign aa;
1952
1953    if (!have_movbe) {
1954        return false;
1955    }
1956    if ((memop & MO_SIZE) < MO_128) {
1957        return true;
1958    }
1959
1960    /*
1961     * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
1962     * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
1963     */
1964    aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
1965    return aa.atom < MO_128;
1966}
1967
1968/*
1969 * Because i686 has no register parameters and because x86_64 has xchg
1970 * to handle addr/data register overlap, we have placed all input arguments
 * before we might need a scratch reg.
1972 *
1973 * Even then, a scratch is only needed for l->raddr.  Rather than expose
1974 * a general-purpose scratch when we don't actually know it's available,
1975 * use the ra_gen hook to load into RAX if needed.
1976 */
1977#if TCG_TARGET_REG_BITS == 64
1978static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1979{
1980    if (arg < 0) {
1981        arg = TCG_REG_RAX;
1982    }
1983    tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1984    return arg;
1985}
1986static const TCGLdstHelperParam ldst_helper_param = {
1987    .ra_gen = ldst_ra_gen
1988};
1989#else
1990static const TCGLdstHelperParam ldst_helper_param = { };
1991#endif
1992
1993static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
1994                                TCGReg l, TCGReg h, TCGReg v)
1995{
1996    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1997
1998    /* vpmov{d,q} %v, %l */
1999    tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
2000    /* vpextr{d,q} $1, %v, %h */
2001    tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
2002    tcg_out8(s, 1);
2003}
2004
2005static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
2006                                TCGReg v, TCGReg l, TCGReg h)
2007{
2008    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
2009
2010    /* vmov{d,q} %l, %v */
2011    tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
2012    /* vpinsr{d,q} $1, %h, %v, %v */
2013    tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
2014    tcg_out8(s, 1);
2015}
2016
2017/*
2018 * Generate code for the slow path for a load at the end of block
2019 */
2020static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
2021{
2022    MemOp opc = get_memop(l->oi);
2023    tcg_insn_unit **label_ptr = &l->label_ptr[0];
2024
2025    /* resolve label address */
2026    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
2027    if (label_ptr[1]) {
2028        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
2029    }
2030
2031    tcg_out_ld_helper_args(s, l, &ldst_helper_param);
2032    tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
2033    tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
2034
2035    tcg_out_jmp(s, l->raddr);
2036    return true;
2037}
2038
2039/*
2040 * Generate code for the slow path for a store at the end of block
2041 */
2042static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
2043{
2044    MemOp opc = get_memop(l->oi);
2045    tcg_insn_unit **label_ptr = &l->label_ptr[0];
2046
2047    /* resolve label address */
2048    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
2049    if (label_ptr[1]) {
2050        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
2051    }
2052
2053    tcg_out_st_helper_args(s, l, &ldst_helper_param);
2054    tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
2055
2056    tcg_out_jmp(s, l->raddr);
2057    return true;
2058}
2059
2060#ifdef CONFIG_USER_ONLY
/*
 * User-only template HostAddress; prepare_host_addr() copies it into the
 * per-access HostAddress when softmmu is not in use.  Only index is
 * given an explicit initial value (-1); the other fields start as zero.
 * NOTE(review): presumably -1 means "no index register" and the fields
 * are finalised during guest_base setup outside this chunk -- confirm.
 */
static HostAddress x86_guest_base = {
    .index = -1
};
2064
2065#if defined(__x86_64__) && defined(__linux__)
2066# include <asm/prctl.h>
2067# include <sys/prctl.h>
2068int arch_prctl(int code, unsigned long addr);
2069static inline int setup_guest_base_seg(void)
2070{
2071    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
2072        return P_GS;
2073    }
2074    return 0;
2075}
2076#define setup_guest_base_seg  setup_guest_base_seg
2077#elif defined(__x86_64__) && \
2078      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
2079# include <machine/sysarch.h>
2080static inline int setup_guest_base_seg(void)
2081{
2082    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
2083        return P_GS;
2084    }
2085    return 0;
2086}
2087#define setup_guest_base_seg  setup_guest_base_seg
2088#endif
2089#else
2090# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; }))
2091#endif /* CONFIG_USER_ONLY */
2092#ifndef setup_guest_base_seg
2093# define setup_guest_base_seg()  0
2094#endif
2095
/*
 * NOTE(review): presumably the most negative offset that
 * tlb_mask_table_ofs() may return for this backend; INT_MIN would then
 * mean "no restriction".  The consumer is outside this chunk -- confirm.
 */
#define MIN_TLB_MASK_TABLE_OFS  INT_MIN
2097
/*
 * For softmmu, perform the TLB load and compare.
 * For useronly, perform any required alignment tests.
 * In both cases, return a TCGLabelQemuLdst structure if the slow path
 * is required and fill in @h with the host address for the fast path.
 *
 * @addrlo, @addrhi: guest address registers.  @addrhi is only compared
 *                   on a 32-bit host with a 64-bit guest address type.
 * @oi: memory operation and mmu index, combined.
 * @is_ld: true for a load (compares against addr_read), false for a
 *         store (compares against addr_write).
 *
 * Returns NULL when no slow path was emitted, which happens only for
 * user-only accesses with no alignment requirement.  Otherwise the
 * returned label has is_ld, oi, the address registers and label_ptr[]
 * filled in; the callers in this file set the remaining fields.
 *
 * In the softmmu case the emitted code overwrites TCG_REG_L0 and
 * TCG_REG_L1.
 */
static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
                                           TCGReg addrlo, TCGReg addrhi,
                                           MemOpIdx oi, bool is_ld)
{
    TCGLabelQemuLdst *ldst = NULL;
    MemOp opc = get_memop(oi);
    MemOp s_bits = opc & MO_SIZE;
    unsigned a_mask;

    if (tcg_use_softmmu) {
        /* Fast-path address is addrlo + L0; L0 receives the TLB addend. */
        h->index = TCG_REG_L0;
        h->ofs = 0;
        h->seg = 0;
    } else {
        *h = x86_guest_base;
    }
    h->base = addrlo;
    h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
    /* Low address bits that must be clear for the required alignment. */
    a_mask = (1 << h->aa.align) - 1;

    if (tcg_use_softmmu) {
        int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
                            : offsetof(CPUTLBEntry, addr_write);
        TCGType ttype = TCG_TYPE_I32;
        TCGType tlbtype = TCG_TYPE_I32;
        /*
         * Operand-size prefixes: trexw for operations on the guest
         * address, hrexw for host pointer arithmetic, tlbrexw for the
         * shift that forms the TLB index.
         */
        int trexw = 0, hrexw = 0, tlbrexw = 0;
        unsigned mem_index = get_mmuidx(oi);
        /* Mask covering the bytes of the access itself. */
        unsigned s_mask = (1 << s_bits) - 1;
        int fast_ofs = tlb_mask_table_ofs(s, mem_index);
        int tlb_mask;

        ldst = new_ldst_label(s);
        ldst->is_ld = is_ld;
        ldst->oi = oi;
        ldst->addrlo_reg = addrlo;
        ldst->addrhi_reg = addrhi;

        if (TCG_TARGET_REG_BITS == 64) {
            ttype = s->addr_type;
            trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
            if (TCG_TYPE_PTR == TCG_TYPE_I64) {
                hrexw = P_REXW;
                /* A 32-bit shift suffices unless the index needs more bits. */
                if (s->page_bits + s->tlb_dyn_max_bits > 32) {
                    tlbtype = TCG_TYPE_I64;
                    tlbrexw = P_REXW;
                }
            }
        }

        /*
         * L0 = address of the TLB entry:
         *   ((addrlo >> (page_bits - CPU_TLB_ENTRY_BITS)) & mask) + table
         * with mask and table read from the CPUTLBDescFast at fast_ofs
         * relative to TCG_AREG0.
         */
        tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
        tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
                       s->page_bits - CPU_TLB_ENTRY_BITS);

        tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
                             fast_ofs + offsetof(CPUTLBDescFast, mask));

        tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
                             fast_ofs + offsetof(CPUTLBDescFast, table));

        /*
         * If the required alignment is at least as large as the access,
         * simply copy the address and mask.  For lesser alignments,
         * check that we don't cross pages for the complete access.
         */
        if (a_mask >= s_mask) {
            tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
        } else {
            /* L1 = addrlo + (s_mask - a_mask) */
            tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
                                 addrlo, s_mask - a_mask);
        }
        /* Keep the page number plus the alignment bits in L1. */
        tlb_mask = s->page_mask | a_mask;
        tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);

        /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
        tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
                             TCG_REG_L1, TCG_REG_L0, cmp_ofs);

        /* jne slow_path */
        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
        /* Reserve the 4-byte displacement; the slow path patches it. */
        ldst->label_ptr[0] = s->code_ptr;
        s->code_ptr += 4;

        if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
            /* cmp 4(TCG_REG_L0), addrhi */
            tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi,
                                 TCG_REG_L0, cmp_ofs + 4);

            /* jne slow_path */
            tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
            /* Second reserved displacement, also patched by the slow path. */
            ldst->label_ptr[1] = s->code_ptr;
            s->code_ptr += 4;
        }

        /* TLB Hit.  */
        /* L0 = entry->addend, which h->index adds to the guest address. */
        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
                   offsetof(CPUTLBEntry, addend));
    } else if (a_mask) {
        int jcc;

        ldst = new_ldst_label(s);
        ldst->is_ld = is_ld;
        ldst->oi = oi;
        ldst->addrlo_reg = addrlo;
        ldst->addrhi_reg = addrhi;

        /* jne slow_path */
        /* Take the slow path if any of the alignment bits are set. */
        jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addrlo, a_mask, true, false);
        tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0);
        /* Reserve the 4-byte displacement; the slow path patches it. */
        ldst->label_ptr[0] = s->code_ptr;
        s->code_ptr += 4;
    }

    return ldst;
}
2218
/*
 * Emit the fast-path guest load for @memop from host address @h.
 *
 * @datalo: destination register (low half for two-register results).
 * @datahi: destination for the high half; used only for MO_UQ on a
 *          32-bit host and for MO_128.
 * @h: host address operands (base, index, offset, segment, atomicity).
 * @type: result type; selects the 64-bit form of the sign-extending
 *        byte and word loads when it is not TCG_TYPE_I32.
 * @memop: size, sign and byte-swap flags.  MO_BSWAP requires MOVBE,
 *         which is asserted.
 */
static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   HostAddress h, TCGType type, MemOp memop)
{
    bool use_movbe = false;
    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    int movop = OPC_MOVL_GvEv;

    /* Do big-endian loads with movbe.  */
    if (memop & MO_BSWAP) {
        tcg_debug_assert(have_movbe);
        use_movbe = true;
        movop = OPC_MOVBE_GyMy;
    }

    switch (memop & MO_SSIZE) {
    case MO_UB:
        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_SB:
        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_UW:
        if (use_movbe) {
            /* There is no extending movbe; only low 16-bits are modified.  */
            if (datalo != h.base && datalo != h.index) {
                /* XOR breaks dependency chains.  */
                tgen_arithr(s, ARITH_XOR, datalo, datalo);
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                         datalo, h.base, h.index, 0, h.ofs);
            } else {
                /*
                 * datalo is part of the address, so it cannot be zeroed
                 * first; zero-extend after the load instead.
                 */
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                         datalo, h.base, h.index, 0, h.ofs);
                tcg_out_ext16u(s, datalo, datalo);
            }
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        }
        break;
    case MO_SW:
        if (use_movbe) {
            /* 16-bit movbe, then sign-extend in a separate step. */
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                     datalo, h.base, h.index, 0, h.ofs);
            tcg_out_ext16s(s, type, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
                                     datalo, h.base, h.index, 0, h.ofs);
        }
        break;
    case MO_UL:
        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
#if TCG_TARGET_REG_BITS == 64
    case MO_SL:
        if (use_movbe) {
            /* 32-bit movbe, then sign-extend in a separate step. */
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_ext32s(s, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        }
        break;
#endif
    case MO_UQ:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            break;
        }
        /* 32-bit host: two 32-bit loads, at offsets 0 and 4. */
        if (use_movbe) {
            /*
             * Each movbe swaps only within its own 32-bit half, so the
             * halves must also trade destination registers.
             */
            TCGReg t = datalo;
            datalo = datahi;
            datahi = t;
        }
        if (h.base == datalo || h.index == datalo) {
            /*
             * The first load would overwrite part of the address.
             * Form the address in datahi and load both halves from it.
             */
            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
        } else {
            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 4);
        }
        break;

    case MO_128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);

        /*
         * Without 16-byte atomicity, use integer regs.
         * That is where we want the data, and it allows bswaps.
         */
        if (h.aa.atom < MO_128) {
            if (use_movbe) {
                /* As for MO_UQ above: swap the halves' destinations. */
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            if (h.base == datalo || h.index == datalo) {
                /* Address overlaps datalo: form it in datahi first. */
                tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datalo, datahi, 0);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datahi, datahi, 8);
            } else {
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
                                         h.base, h.index, 0, h.ofs + 8);
            }
            break;
        }

        /*
         * With 16-byte atomicity, a vector load is required.
         * If we already have 16-byte alignment, then VMOVDQA always works.
         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
         * Else we require a runtime test for alignment for VMOVDQA;
         * use VMOVDQU on the unaligned nonatomic path for simplicity.
         */
        if (h.aa.align >= MO_128) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else {
            TCGLabel *l1 = gen_new_label();
            TCGLabel *l2 = gen_new_label();
            int jcc;

            /* Branch to l1 (unaligned) if any low 4 bits of h.base are set. */
            jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
            tcg_out_jxx(s, jcc, l1, true);

            /* Aligned path. */
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_jxx(s, JCC_JMP, l2, true);

            /* Unaligned path. */
            tcg_out_label(s, l1);
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_label(s, l2);
        }
        /* Move the two 64-bit halves from the vector temp to datalo/datahi. */
        tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
        break;

    default:
        g_assert_not_reached();
    }
}
2380
2381static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2382                            TCGReg addrlo, TCGReg addrhi,
2383                            MemOpIdx oi, TCGType data_type)
2384{
2385    TCGLabelQemuLdst *ldst;
2386    HostAddress h;
2387
2388    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2389    tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2390
2391    if (ldst) {
2392        ldst->type = data_type;
2393        ldst->datalo_reg = datalo;
2394        ldst->datahi_reg = datahi;
2395        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2396    }
2397}
2398
/*
 * Emit the fast-path guest store for @memop to host address @h.
 *
 * @datalo: source register (low half for two-register values).
 * @datahi: source for the high half; used only for MO_64 on a 32-bit
 *          host and for MO_128.
 * @h: host address operands (base, index, offset, segment, atomicity).
 * @memop: size and byte-swap flags.  MO_BSWAP requires MOVBE, which is
 *         asserted.
 */
static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   HostAddress h, MemOp memop)
{
    bool use_movbe = false;
    int movop = OPC_MOVL_EvGv;

    /*
     * Do big-endian stores with movbe or system-mode.
     * User-only without movbe will have its swapping done generically.
     */
    if (memop & MO_BSWAP) {
        tcg_debug_assert(have_movbe);
        use_movbe = true;
        movop = OPC_MOVBE_MyGy;
    }

    switch (memop & MO_SIZE) {
    case MO_8:
        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
        /*
         * NOTE(review): P_REXB_R presumably makes the encoder treat
         * datalo as a byte register -- confirm against tcg_out_opc.
         */
        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
                                 datalo, h.base, h.index, 0, h.ofs);
        break;
    case MO_16:
        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_32:
        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_64:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        } else {
            /* 32-bit host: two 32-bit stores, at offsets 0 and 4. */
            if (use_movbe) {
                /*
                 * Each movbe swaps only within its own 32-bit half, so
                 * the halves must also trade source registers.
                 */
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 4);
        }
        break;

    case MO_128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);

        /*
         * Without 16-byte atomicity, use integer regs.
         * That is where we have the data, and it allows bswaps.
         */
        if (h.aa.atom < MO_128) {
            if (use_movbe) {
                /* As for MO_64 above: swap the halves' sources. */
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 8);
            break;
        }

        /*
         * With 16-byte atomicity, a vector store is required.
         * If we already have 16-byte alignment, then VMOVDQA always works.
         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
         * Else we require a runtime test for alignment for VMOVDQA;
         * use VMOVDQU on the unaligned nonatomic path for simplicity.
         */
        /* Gather datalo/datahi into the vector temp before storing. */
        tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
        if (h.aa.align >= MO_128) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else {
            TCGLabel *l1 = gen_new_label();
            TCGLabel *l2 = gen_new_label();
            int jcc;

            /* Branch to l1 (unaligned) if any low 4 bits of h.base are set. */
            jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
            tcg_out_jxx(s, jcc, l1, true);

            /* Aligned path. */
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_jxx(s, JCC_JMP, l2, true);

            /* Unaligned path. */
            tcg_out_label(s, l1);
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_label(s, l2);
        }
        break;

    default:
        g_assert_not_reached();
    }
}
2508
2509static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2510                            TCGReg addrlo, TCGReg addrhi,
2511                            MemOpIdx oi, TCGType data_type)
2512{
2513    TCGLabelQemuLdst *ldst;
2514    HostAddress h;
2515
2516    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2517    tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2518
2519    if (ldst) {
2520        ldst->type = data_type;
2521        ldst->datalo_reg = datalo;
2522        ldst->datahi_reg = datahi;
2523        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2524    }
2525}
2526
2527static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2528{
2529    /* Reuse the zeroing that exists for goto_ptr.  */
2530    if (a0 == 0) {
2531        tcg_out_jmp(s, tcg_code_gen_epilogue);
2532    } else {
2533        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2534        tcg_out_jmp(s, tb_ret_addr);
2535    }
2536}
2537
2538static void tcg_out_goto_tb(TCGContext *s, int which)
2539{
2540    /*
2541     * Jump displacement must be aligned for atomic patching;
2542     * see if we need to add extra nops before jump
2543     */
2544    int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2545    if (gap != 1) {
2546        tcg_out_nopn(s, gap - 1);
2547    }
2548    tcg_out8(s, OPC_JMP_long); /* jmp im */
2549    set_jmp_insn_offset(s, which);
2550    tcg_out32(s, 0);
2551    set_jmp_reset_offset(s, which);
2552}
2553
2554void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2555                              uintptr_t jmp_rx, uintptr_t jmp_rw)
2556{
2557    /* patch the branch destination */
2558    uintptr_t addr = tb->jmp_target_addr[n];
2559    qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2560    /* no need to flush icache explicitly */
2561}
2562
/*
 * Emit host code for one integer TCG operation.
 *
 * @s:          code generation context, handed to every emitter helper.
 * @opc:        opcode to emit; selects the case of the switch below.
 * @args:       operands of the operation.  args[0..2] are cached in
 *              a0/a1/a2; higher indexes are read directly where needed.
 * @const_args: per-operand flags.  Where tested below, a nonzero entry means
 *              the matching args[] value is emitted as an immediate instead
 *              of being used as a register.
 *
 * The opcodes grouped at the end of the switch are emitted elsewhere (see
 * their comments) and, like unknown opcodes, hit g_assert_not_reached().
 */
2563static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2564                              const TCGArg args[TCG_MAX_OP_ARGS],
2565                              const int const_args[TCG_MAX_OP_ARGS])
2566{
2567    TCGArg a0, a1, a2;
2568    int c, const_a2, vexop, rexw = 0;
2569
    /*
     * OP_32_64(x) opens the case for opcode x.  It always provides the
     * _i32 label; on 64-bit hosts it also provides the _i64 label, which
     * sets rexw = P_REXW and falls through into the shared body.  Bodies
     * add rexw to the opcode to select the 64-bit form.
     */
2570#if TCG_TARGET_REG_BITS == 64
2571# define OP_32_64(x) \
2572        case glue(glue(INDEX_op_, x), _i64): \
2573            rexw = P_REXW; /* FALLTHRU */    \
2574        case glue(glue(INDEX_op_, x), _i32)
2575#else
2576# define OP_32_64(x) \
2577        case glue(glue(INDEX_op_, x), _i32)
2578#endif
2579
2580    /* Hoist the loads of the most common arguments.  */
2581    a0 = args[0];
2582    a1 = args[1];
2583    a2 = args[2];
2584    const_a2 = const_args[2];
2585
2586    switch (opc) {
2587    case INDEX_op_goto_ptr:
2588        /* jmp to the given host address (could be epilogue) */
2589        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2590        break;
2591    case INDEX_op_br:
2592        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2593        break;
    /* Host loads: a0 = destination, a1 = base register, a2 = offset. */
2594    OP_32_64(ld8u):
2595        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2596        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2597        break;
2598    OP_32_64(ld8s):
2599        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2600        break;
2601    OP_32_64(ld16u):
2602        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2603        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2604        break;
2605    OP_32_64(ld16s):
2606        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2607        break;
2608#if TCG_TARGET_REG_BITS == 64
2609    case INDEX_op_ld32u_i64:
2610#endif
2611    case INDEX_op_ld_i32:
2612        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2613        break;
2614
    /*
     * Host stores: a0 = value, a1 = base register, a2 = offset.  A constant
     * value uses the store-immediate opcode followed by the immediate bytes.
     */
2615    OP_32_64(st8):
2616        if (const_args[0]) {
2617            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2618            tcg_out8(s, a0);
2619        } else {
2620            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2621        }
2622        break;
2623    OP_32_64(st16):
2624        if (const_args[0]) {
2625            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2626            tcg_out16(s, a0);
2627        } else {
2628            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2629        }
2630        break;
2631#if TCG_TARGET_REG_BITS == 64
2632    case INDEX_op_st32_i64:
2633#endif
2634    case INDEX_op_st_i32:
2635        if (const_args[0]) {
2636            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2637            tcg_out32(s, a0);
2638        } else {
2639            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2640        }
2641        break;
2642
2643    OP_32_64(add):
2644        /* For 3-operand addition, use LEA.  */
2645        if (a0 != a1) {
2646            TCGArg c3 = 0;
2647            if (const_a2) {
                /*
                 * Constant addend: pass it as the LEA displacement (c3) and
                 * set the index operand to -1 (presumably "no index
                 * register" -- confirm against tcg_out_modrm_sib_offset).
                 */
2648                c3 = a2, a2 = -1;
2649            } else if (a0 == a2) {
2650                /* Watch out for dest = src + dest, since we've removed
2651                   the matching constraint on the add.  */
2652                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2653                break;
2654            }
2655
2656            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2657            break;
2658        }
        /* a0 == a1: use the ordinary two-operand add below. */
2659        c = ARITH_ADD;
2660        goto gen_arith;
2661    OP_32_64(sub):
2662        c = ARITH_SUB;
2663        goto gen_arith;
2664    OP_32_64(and):
2665        c = ARITH_AND;
2666        goto gen_arith;
2667    OP_32_64(or):
2668        c = ARITH_OR;
2669        goto gen_arith;
2670    OP_32_64(xor):
2671        c = ARITH_XOR;
2672        goto gen_arith;
    /*
     * Shared tail for the two-operand arithmetic ops.  c holds the ARITH_*
     * selector; only a0 and a2 are used, a1 is not referenced.
     */
2673    gen_arith:
2674        if (const_a2) {
2675            tgen_arithi(s, c + rexw, a0, a2, 0);
2676        } else {
2677            tgen_arithr(s, c + rexw, a0, a2);
2678        }
2679        break;
2680
2681    OP_32_64(andc):
2682        if (const_a2) {
            /* Constant operand: copy a1 to a0, then AND with ~a2. */
2683            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2684            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2685        } else {
2686            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2687        }
2688        break;
2689
2690    OP_32_64(mul):
2691        if (const_a2) {
2692            int32_t val;
2693            val = a2;
            /* Use the imm8 form when val survives a round trip via int8_t. */
2694            if (val == (int8_t)val) {
2695                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2696                tcg_out8(s, val);
2697            } else {
2698                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2699                tcg_out32(s, val);
2700            }
2701        } else {
2702            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2703        }
2704        break;
2705
    /* Only args[4] is encoded for the divide ops. */
2706    OP_32_64(div2):
2707        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2708        break;
2709    OP_32_64(divu2):
2710        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2711        break;
2712
2713    OP_32_64(shl):
2714        /* For small constant 3-operand shift, use LEA.  */
        /*
         * NOTE(review): (a2 - 1) < 3 is meant to accept counts 1..3; this
         * relies on TCGArg being unsigned so that a2 == 0 wraps -- confirm.
         */
2715        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2716            if (a2 - 1 == 0) {
2717                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2718                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2719            } else {
2720                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2721                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2722            }
2723            break;
2724        }
2725        c = SHIFT_SHL;
2726        vexop = OPC_SHLX;
2727        goto gen_shift_maybe_vex;
2728    OP_32_64(shr):
2729        c = SHIFT_SHR;
2730        vexop = OPC_SHRX;
2731        goto gen_shift_maybe_vex;
2732    OP_32_64(sar):
2733        c = SHIFT_SAR;
2734        vexop = OPC_SARX;
2735        goto gen_shift_maybe_vex;
    /* Rotates go straight to gen_shift; no vexop is set for them. */
2736    OP_32_64(rotl):
2737        c = SHIFT_ROL;
2738        goto gen_shift;
2739    OP_32_64(rotr):
2740        c = SHIFT_ROR;
2741        goto gen_shift;
    /*
     * With BMI2 and a register count, emit the three-operand VEX shift.
     * With BMI2 and a constant count, copy a1 into a0 first and then use
     * the immediate form in gen_shift.  Without BMI2, go directly to
     * gen_shift, which operates on a0 only.
     */
2742    gen_shift_maybe_vex:
2743        if (have_bmi2) {
2744            if (!const_a2) {
2745                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2746                break;
2747            }
2748            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2749        }
2750        /* FALLTHRU */
2751    gen_shift:
2752        if (const_a2) {
2753            tcg_out_shifti(s, c + rexw, a0, a2);
2754        } else {
            /* a2 is not encoded; presumably the count is implicit in %cl. */
2755            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2756        }
2757        break;
2758
2759    OP_32_64(ctz):
2760        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2761        break;
2762    OP_32_64(clz):
2763        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2764        break;
2765    OP_32_64(ctpop):
2766        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2767        break;
2768
    /* brcond: a0, a1 are the compared values, a2 the condition, args[3] the label. */
2769    OP_32_64(brcond):
2770        tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1],
2771                       arg_label(args[3]), 0);
2772        break;
    /* setcond and negsetcond differ only in the final boolean argument. */
2773    OP_32_64(setcond):
2774        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false);
2775        break;
2776    OP_32_64(negsetcond):
2777        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true);
2778        break;
2779    OP_32_64(movcond):
2780        tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]);
2781        break;
2782
    /* For the byte swaps, a2 carries the TCG_BSWAP_* flags. */
2783    OP_32_64(bswap16):
2784        if (a2 & TCG_BSWAP_OS) {
2785            /* Output must be sign-extended. */
2786            if (rexw) {
2787                tcg_out_bswap64(s, a0);
2788                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2789            } else {
2790                tcg_out_bswap32(s, a0);
2791                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2792            }
2793        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2794            /* Output must be zero-extended, but input isn't. */
2795            tcg_out_bswap32(s, a0);
2796            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2797        } else {
2798            tcg_out_rolw_8(s, a0);
2799        }
2800        break;
2801    OP_32_64(bswap32):
2802        tcg_out_bswap32(s, a0);
        /* Only the 64-bit op with a sign-extended output needs the extra step. */
2803        if (rexw && (a2 & TCG_BSWAP_OS)) {
2804            tcg_out_ext32s(s, a0, a0);
2805        }
2806        break;
2807
2808    OP_32_64(neg):
2809        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2810        break;
2811    OP_32_64(not):
2812        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2813        break;
2814
    /*
     * Guest memory accesses.  The operand layout depends on the host
     * register width: on 32-bit hosts some forms carry extra operands, so
     * the final argument moves from a2 to args[3] or args[4].  Unused
     * register slots are passed as -1.
     */
2815    case INDEX_op_qemu_ld_a64_i32:
2816        if (TCG_TARGET_REG_BITS == 32) {
2817            tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2818            break;
2819        }
2820        /* fall through */
2821    case INDEX_op_qemu_ld_a32_i32:
2822        tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2823        break;
2824    case INDEX_op_qemu_ld_a32_i64:
2825        if (TCG_TARGET_REG_BITS == 64) {
2826            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2827        } else {
2828            tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2829        }
2830        break;
2831    case INDEX_op_qemu_ld_a64_i64:
2832        if (TCG_TARGET_REG_BITS == 64) {
2833            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2834        } else {
2835            tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2836        }
2837        break;
2838    case INDEX_op_qemu_ld_a32_i128:
2839    case INDEX_op_qemu_ld_a64_i128:
2840        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2841        tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2842        break;
2843
    /* The st8 variants share the emitter call with the plain stores. */
2844    case INDEX_op_qemu_st_a64_i32:
2845    case INDEX_op_qemu_st8_a64_i32:
2846        if (TCG_TARGET_REG_BITS == 32) {
2847            tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2848            break;
2849        }
2850        /* fall through */
2851    case INDEX_op_qemu_st_a32_i32:
2852    case INDEX_op_qemu_st8_a32_i32:
2853        tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2854        break;
2855    case INDEX_op_qemu_st_a32_i64:
2856        if (TCG_TARGET_REG_BITS == 64) {
2857            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2858        } else {
2859            tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2860        }
2861        break;
2862    case INDEX_op_qemu_st_a64_i64:
2863        if (TCG_TARGET_REG_BITS == 64) {
2864            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2865        } else {
2866            tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2867        }
2868        break;
2869    case INDEX_op_qemu_st_a32_i128:
2870    case INDEX_op_qemu_st_a64_i128:
2871        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2872        tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2873        break;
2874
    /* Only args[3] is encoded for the widening multiplies. */
2875    OP_32_64(mulu2):
2876        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2877        break;
2878    OP_32_64(muls2):
2879        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2880        break;
    /*
     * Double-word add/sub: the first instruction (ADD/SUB) combines a0 with
     * args[4], the second (ADC/SBB) combines a1 with args[5].
     */
2881    OP_32_64(add2):
2882        if (const_args[4]) {
2883            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2884        } else {
2885            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2886        }
2887        if (const_args[5]) {
2888            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2889        } else {
2890            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2891        }
2892        break;
2893    OP_32_64(sub2):
2894        if (const_args[4]) {
2895            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2896        } else {
2897            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2898        }
2899        if (const_args[5]) {
2900            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2901        } else {
2902            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2903        }
2904        break;
2905
2906#if TCG_TARGET_REG_BITS == 32
2907    case INDEX_op_brcond2_i32:
2908        tcg_out_brcond2(s, args, const_args, 0);
2909        break;
2910    case INDEX_op_setcond2_i32:
2911        tcg_out_setcond2(s, args, const_args);
2912        break;
2913#else /* TCG_TARGET_REG_BITS == 64 */
2914    case INDEX_op_ld32s_i64:
2915        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2916        break;
2917    case INDEX_op_ld_i64:
2918        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2919        break;
2920    case INDEX_op_st_i64:
2921        if (const_args[0]) {
            /* The constant is emitted as a 32-bit immediate after the opcode. */
2922            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2923            tcg_out32(s, a0);
2924        } else {
2925            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2926        }
2927        break;
2928
2929    case INDEX_op_bswap64_i64:
2930        tcg_out_bswap64(s, a0);
2931        break;
2932    case INDEX_op_extrh_i64_i32:
2933        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2934        break;
2935#endif
2936
    /*
     * deposit: args[3] is the bit offset and args[4] the field length, as
     * shown by the "load bits" comments below.  Only the three
     * (offset, length) pairs handled here are supported; anything else
     * aborts.
     */
2937    OP_32_64(deposit):
2938        if (args[3] == 0 && args[4] == 8) {
2939            /* load bits 0..7 */
2940            if (const_a2) {
2941                tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0),
2942                            0, a0, 0);
2943                tcg_out8(s, a2);
2944            } else {
2945                tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2946            }
2947        } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) {
2948            /* load bits 8..15 */
            /* a0 + 4 presumably selects the high-byte register -- confirm. */
2949            if (const_a2) {
2950                tcg_out8(s, OPC_MOVB_Ib + a0 + 4);
2951                tcg_out8(s, a2);
2952            } else {
2953                tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2954            }
2955        } else if (args[3] == 0 && args[4] == 16) {
2956            /* load bits 0..15 */
2957            if (const_a2) {
2958                tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0),
2959                            0, a0, 0);
2960                tcg_out16(s, a2);
2961            } else {
2962                tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2963            }
2964        } else {
2965            g_assert_not_reached();
2966        }
2967        break;
2968
    /* extract: the code below treats a2 as the bit offset and args[3] as the length. */
2969    case INDEX_op_extract_i64:
2970        if (a2 + args[3] == 32) {
2971            /* This is a 32-bit zero-extending right shift.  */
2972            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2973            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2974            break;
2975        }
2976        /* FALLTHRU */
2977    case INDEX_op_extract_i32:
2978        /* On the off-chance that we can use the high-byte registers.
2979           Otherwise we emit the same ext16 + shift pattern that we
2980           would have gotten from the normal tcg-op.c expansion.  */
2981        tcg_debug_assert(a2 == 8 && args[3] == 8);
2982        if (a1 < 4 && a0 < 8) {
2983            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2984        } else {
2985            tcg_out_ext16u(s, a0, a1);
2986            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2987        }
2988        break;
2989
2990    case INDEX_op_sextract_i32:
2991        /* We don't implement sextract_i64, as we cannot sign-extend to
2992           64-bits without using the REX prefix that explicitly excludes
2993           access to the high-byte registers.  */
2994        tcg_debug_assert(a2 == 8 && args[3] == 8);
2995        if (a1 < 4 && a0 < 8) {
2996            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2997        } else {
2998            tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2999            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
3000        }
3001        break;
3002
3003    OP_32_64(extract2):
3004        /* Note that SHRD outputs to the r/m operand.  */
3005        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
        /* args[3] is emitted as the trailing immediate byte. */
3006        tcg_out8(s, args[3]);
3007        break;
3008
3009    case INDEX_op_mb:
3010        tcg_out_mb(s, a0);
3011        break;
    /* Everything below must never be dispatched to this function. */
3012    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
3013    case INDEX_op_mov_i64:
3014    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
3015    case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
3016    case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
3017    case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
3018    case INDEX_op_ext8s_i64:
3019    case INDEX_op_ext8u_i32:
3020    case INDEX_op_ext8u_i64:
3021    case INDEX_op_ext16s_i32:
3022    case INDEX_op_ext16s_i64:
3023    case INDEX_op_ext16u_i32:
3024    case INDEX_op_ext16u_i64:
3025    case INDEX_op_ext32s_i64:
3026    case INDEX_op_ext32u_i64:
3027    case INDEX_op_ext_i32_i64:
3028    case INDEX_op_extu_i32_i64:
3029    case INDEX_op_extrl_i64_i32:
3030    default:
3031        g_assert_not_reached();
3032    }
3033
3034#undef OP_32_64
3035}
3036
/*
 * Emit host code for one vector TCG operation.
 *
 * @s:          code generation context, handed to every emitter helper.
 * @opc:        vector opcode to emit; selects the case of the switch below.
 * @vecl:       vector length selector; the vector type is
 *              vecl + TCG_TYPE_V64.
 * @vece:       element size selector, used to index the four-entry opcode
 *              tables below and compared against the MO_* constants.
 * @args:       operands of the operation.  args[0..2] are cached in
 *              a0/a1/a2; args[3] is read where needed.
 * @const_args: per-operand constant flags; not referenced in this function.
 *
 * Most cases only choose an opcode (and sometimes reorder operands) and
 * then jump to one of the shared tails gen_simd, gen_shift or
 * gen_simd_imm8, which add P_VEXL for 256-bit vectors and emit the
 * instruction.
 */
3037static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
3038                           unsigned vecl, unsigned vece,
3039                           const TCGArg args[TCG_MAX_OP_ARGS],
3040                           const int const_args[TCG_MAX_OP_ARGS])
3041{
    /*
     * Opcode tables indexed by vece.  OPC_UD2 marks an element size for
     * which the table has no instruction; gen_simd and gen_simd_imm8
     * assert that such an entry is never selected.
     */
3042    static int const add_insn[4] = {
3043        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
3044    };
3045    static int const ssadd_insn[4] = {
3046        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
3047    };
3048    static int const usadd_insn[4] = {
3049        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
3050    };
3051    static int const sub_insn[4] = {
3052        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
3053    };
3054    static int const sssub_insn[4] = {
3055        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
3056    };
3057    static int const ussub_insn[4] = {
3058        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
3059    };
3060    static int const mul_insn[4] = {
3061        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
3062    };
3063    static int const shift_imm_insn[4] = {
3064        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
3065    };
3066    static int const cmpeq_insn[4] = {
3067        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
3068    };
3069    static int const cmpgt_insn[4] = {
3070        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
3071    };
3072    static int const punpckl_insn[4] = {
3073        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
3074    };
3075    static int const punpckh_insn[4] = {
3076        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
3077    };
3078    static int const packss_insn[4] = {
3079        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
3080    };
3081    static int const packus_insn[4] = {
3082        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
3083    };
3084    static int const smin_insn[4] = {
3085        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
3086    };
3087    static int const smax_insn[4] = {
3088        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
3089    };
3090    static int const umin_insn[4] = {
3091        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
3092    };
3093    static int const umax_insn[4] = {
3094        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
3095    };
3096    static int const rotlv_insn[4] = {
3097        OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
3098    };
3099    static int const rotrv_insn[4] = {
3100        OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
3101    };
3102    static int const shlv_insn[4] = {
3103        OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
3104    };
3105    static int const shrv_insn[4] = {
3106        OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
3107    };
3108    static int const sarv_insn[4] = {
3109        OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
3110    };
3111    static int const shls_insn[4] = {
3112        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
3113    };
3114    static int const shrs_insn[4] = {
3115        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
3116    };
3117    static int const sars_insn[4] = {
3118        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
3119    };
3120    static int const vpshldi_insn[4] = {
3121        OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
3122    };
3123    static int const vpshldv_insn[4] = {
3124        OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
3125    };
3126    static int const vpshrdv_insn[4] = {
3127        OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
3128    };
3129    static int const abs_insn[4] = {
3130        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
3131    };
3132
3133    TCGType type = vecl + TCG_TYPE_V64;
    /*
     * insn: opcode handed to tcg_out_vex_modrm.
     * sub:  holds the comparison condition (cmp_vec), the value passed as
     *       first register operand (gen_shift), or the trailing immediate
     *       byte (gen_simd_imm8), depending on the case.
     */
3134    int insn, sub;
3135    TCGArg a0, a1, a2, a3;
3136
3137    a0 = args[0];
3138    a1 = args[1];
3139    a2 = args[2];
3140
3141    switch (opc) {
3142    case INDEX_op_add_vec:
3143        insn = add_insn[vece];
3144        goto gen_simd;
3145    case INDEX_op_ssadd_vec:
3146        insn = ssadd_insn[vece];
3147        goto gen_simd;
3148    case INDEX_op_usadd_vec:
3149        insn = usadd_insn[vece];
3150        goto gen_simd;
3151    case INDEX_op_sub_vec:
3152        insn = sub_insn[vece];
3153        goto gen_simd;
3154    case INDEX_op_sssub_vec:
3155        insn = sssub_insn[vece];
3156        goto gen_simd;
3157    case INDEX_op_ussub_vec:
3158        insn = ussub_insn[vece];
3159        goto gen_simd;
3160    case INDEX_op_mul_vec:
3161        insn = mul_insn[vece];
3162        goto gen_simd;
    /* The bitwise ops use one opcode regardless of vece. */
3163    case INDEX_op_and_vec:
3164        insn = OPC_PAND;
3165        goto gen_simd;
3166    case INDEX_op_or_vec:
3167        insn = OPC_POR;
3168        goto gen_simd;
3169    case INDEX_op_xor_vec:
3170        insn = OPC_PXOR;
3171        goto gen_simd;
3172    case INDEX_op_smin_vec:
3173        insn = smin_insn[vece];
3174        goto gen_simd;
3175    case INDEX_op_umin_vec:
3176        insn = umin_insn[vece];
3177        goto gen_simd;
3178    case INDEX_op_smax_vec:
3179        insn = smax_insn[vece];
3180        goto gen_simd;
3181    case INDEX_op_umax_vec:
3182        insn = umax_insn[vece];
3183        goto gen_simd;
3184    case INDEX_op_shlv_vec:
3185        insn = shlv_insn[vece];
3186        goto gen_simd;
3187    case INDEX_op_shrv_vec:
3188        insn = shrv_insn[vece];
3189        goto gen_simd;
3190    case INDEX_op_sarv_vec:
3191        insn = sarv_insn[vece];
3192        goto gen_simd;
3193    case INDEX_op_rotlv_vec:
3194        insn = rotlv_insn[vece];
3195        goto gen_simd;
3196    case INDEX_op_rotrv_vec:
3197        insn = rotrv_insn[vece];
3198        goto gen_simd;
3199    case INDEX_op_shls_vec:
3200        insn = shls_insn[vece];
3201        goto gen_simd;
3202    case INDEX_op_shrs_vec:
3203        insn = shrs_insn[vece];
3204        goto gen_simd;
3205    case INDEX_op_sars_vec:
3206        insn = sars_insn[vece];
3207        goto gen_simd;
3208    case INDEX_op_x86_punpckl_vec:
3209        insn = punpckl_insn[vece];
3210        goto gen_simd;
3211    case INDEX_op_x86_punpckh_vec:
3212        insn = punpckh_insn[vece];
3213        goto gen_simd;
3214    case INDEX_op_x86_packss_vec:
3215        insn = packss_insn[vece];
3216        goto gen_simd;
3217    case INDEX_op_x86_packus_vec:
3218        insn = packus_insn[vece];
3219        goto gen_simd;
    /*
     * For the two variable double-shifts, args[2] and args[3] become the
     * source operands; args[1] is not passed on (presumably it is tied to
     * the output a0 -- confirm against the constraint table).
     */
3220    case INDEX_op_x86_vpshldv_vec:
3221        insn = vpshldv_insn[vece];
3222        a1 = a2;
3223        a2 = args[3];
3224        goto gen_simd;
3225    case INDEX_op_x86_vpshrdv_vec:
3226        insn = vpshrdv_insn[vece];
3227        a1 = a2;
3228        a2 = args[3];
3229        goto gen_simd;
3230#if TCG_TARGET_REG_BITS == 32
3231    case INDEX_op_dup2_vec:
3232        /* First merge the two 32-bit inputs to a single 64-bit element. */
3233        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3234        /* Then replicate the 64-bit elements across the rest of the vector. */
3235        if (type != TCG_TYPE_V64) {
3236            tcg_out_dup_vec(s, type, MO_64, a0, a0);
3237        }
3238        break;
3239#endif
3240    case INDEX_op_abs_vec:
3241        insn = abs_insn[vece];
        /* Unary op: the source moves to the last slot, the middle one is 0. */
3242        a2 = a1;
3243        a1 = 0;
3244        goto gen_simd;
    /* Shared tail: emit insn with operands (a0, a1, a2). */
3245    gen_simd:
3246        tcg_debug_assert(insn != OPC_UD2);
3247        if (type == TCG_TYPE_V256) {
3248            insn |= P_VEXL;
3249        }
3250        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3251        break;
3252
3253    case INDEX_op_cmp_vec:
        /* args[3] is the condition; only EQ and GT are handled here. */
3254        sub = args[3];
3255        if (sub == TCG_COND_EQ) {
3256            insn = cmpeq_insn[vece];
3257        } else if (sub == TCG_COND_GT) {
3258            insn = cmpgt_insn[vece];
3259        } else {
3260            g_assert_not_reached();
3261        }
3262        goto gen_simd;
3263
3264    case INDEX_op_andc_vec:
3265        insn = OPC_PANDN;
3266        if (type == TCG_TYPE_V256) {
3267            insn |= P_VEXL;
3268        }
        /* Source operands are passed in swapped order (a2, then a1). */
3269        tcg_out_vex_modrm(s, insn, a0, a2, a1);
3270        break;
3271
    /*
     * Immediate shifts and rotates.  Each case picks the opcode and a value
     * for sub (6, 2, 4 or 1), which gen_shift passes as the first register
     * operand (presumably the ModRM opcode extension -- confirm); a2 is
     * emitted as the trailing immediate byte.
     */
3272    case INDEX_op_shli_vec:
3273        insn = shift_imm_insn[vece];
3274        sub = 6;
3275        goto gen_shift;
3276    case INDEX_op_shri_vec:
3277        insn = shift_imm_insn[vece];
3278        sub = 2;
3279        goto gen_shift;
3280    case INDEX_op_sari_vec:
        /* 64-bit elements bypass the table and use PSHIFTD with VEXW | EVEX. */
3281        if (vece == MO_64) {
3282            insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3283        } else {
3284            insn = shift_imm_insn[vece];
3285        }
3286        sub = 4;
3287        goto gen_shift;
3288    case INDEX_op_rotli_vec:
3289        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
3290        if (vece == MO_64) {
3291            insn |= P_VEXW;
3292        }
3293        sub = 1;
3294        goto gen_shift;
3295    gen_shift:
3296        tcg_debug_assert(vece != MO_8);
3297        if (type == TCG_TYPE_V256) {
3298            insn |= P_VEXL;
3299        }
3300        tcg_out_vex_modrm(s, insn, sub, a0, a1);
3301        tcg_out8(s, a2);
3302        break;
3303
3304    case INDEX_op_ld_vec:
3305        tcg_out_ld(s, type, a0, a1, a2);
3306        break;
3307    case INDEX_op_st_vec:
3308        tcg_out_st(s, type, a0, a1, a2);
3309        break;
3310    case INDEX_op_dupm_vec:
3311        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3312        break;
3313
    /*
     * Three-operand instructions followed by an immediate byte.  Each case
     * sets insn and sub; gen_simd_imm8 emits (a0, a1, a2) and then sub.
     */
3314    case INDEX_op_x86_shufps_vec:
3315        insn = OPC_SHUFPS;
3316        sub = args[3];
3317        goto gen_simd_imm8;
3318    case INDEX_op_x86_blend_vec:
3319        if (vece == MO_16) {
3320            insn = OPC_PBLENDW;
3321        } else if (vece == MO_32) {
3322            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3323        } else {
3324            g_assert_not_reached();
3325        }
3326        sub = args[3];
3327        goto gen_simd_imm8;
3328    case INDEX_op_x86_vperm2i128_vec:
3329        insn = OPC_VPERM2I128;
3330        sub = args[3];
3331        goto gen_simd_imm8;
3332    case INDEX_op_x86_vpshldi_vec:
3333        insn = vpshldi_insn[vece];
3334        sub = args[3];
3335        goto gen_simd_imm8;
3336
    /*
     * The remaining logic ops all use OPC_VPTERNLOGQ; the immediate in sub
     * selects the boolean function named in the adjacent comment.
     */
3337    case INDEX_op_not_vec:
3338        insn = OPC_VPTERNLOGQ;
        /* Unary op: the single source is passed in both source slots. */
3339        a2 = a1;
3340        sub = 0x33; /* !B */
3341        goto gen_simd_imm8;
3342    case INDEX_op_nor_vec:
3343        insn = OPC_VPTERNLOGQ;
3344        sub = 0x11; /* norCB */
3345        goto gen_simd_imm8;
3346    case INDEX_op_nand_vec:
3347        insn = OPC_VPTERNLOGQ;
3348        sub = 0x77; /* nandCB */
3349        goto gen_simd_imm8;
3350    case INDEX_op_eqv_vec:
3351        insn = OPC_VPTERNLOGQ;
3352        sub = 0x99; /* xnorCB */
3353        goto gen_simd_imm8;
3354    case INDEX_op_orc_vec:
3355        insn = OPC_VPTERNLOGQ;
3356        sub = 0xdd; /* orB!C */
3357        goto gen_simd_imm8;
3358
3359    case INDEX_op_bitsel_vec:
3360        insn = OPC_VPTERNLOGQ;
3361        a3 = args[3];
        /*
         * The immediate and operand order depend on which input, if any,
         * already lives in the output register a0.  When none does, a3 is
         * copied into a0 first.
         */
3362        if (a0 == a1) {
3363            a1 = a2;
3364            a2 = a3;
3365            sub = 0xca; /* A?B:C */
3366        } else if (a0 == a2) {
3367            a2 = a3;
3368            sub = 0xe2; /* B?A:C */
3369        } else {
3370            tcg_out_mov(s, type, a0, a3);
3371            sub = 0xb8; /* B?C:A */
3372        }
3373        goto gen_simd_imm8;
3374
3375    gen_simd_imm8:
3376        tcg_debug_assert(insn != OPC_UD2);
3377        if (type == TCG_TYPE_V256) {
3378            insn |= P_VEXL;
3379        }
3380        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3381        tcg_out8(s, sub);
3382        break;
3383
3384    case INDEX_op_x86_vpblendvb_vec:
3385        insn = OPC_VPBLENDVB;
3386        if (type == TCG_TYPE_V256) {
3387            insn |= P_VEXL;
3388        }
3389        tcg_out_vex_modrm(s, insn, a0, a1, a2);
        /* args[3] goes into the upper four bits of the trailing byte. */
3390        tcg_out8(s, args[3] << 4);
3391        break;
3392
3393    case INDEX_op_x86_psrldq_vec:
        /* NOTE(review): P_VEXL is not applied here, whatever the type. */
3394        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3395        tcg_out8(s, a2);
3396        break;
3397
3398    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3399    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3400    default:
3401        g_assert_not_reached();
3402    }
3403}
3404
3405static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3406{
3407    switch (op) {
3408    case INDEX_op_goto_ptr:
3409        return C_O0_I1(r);
3410
3411    case INDEX_op_ld8u_i32:
3412    case INDEX_op_ld8u_i64:
3413    case INDEX_op_ld8s_i32:
3414    case INDEX_op_ld8s_i64:
3415    case INDEX_op_ld16u_i32:
3416    case INDEX_op_ld16u_i64:
3417    case INDEX_op_ld16s_i32:
3418    case INDEX_op_ld16s_i64:
3419    case INDEX_op_ld_i32:
3420    case INDEX_op_ld32u_i64:
3421    case INDEX_op_ld32s_i64:
3422    case INDEX_op_ld_i64:
3423        return C_O1_I1(r, r);
3424
3425    case INDEX_op_st8_i32:
3426    case INDEX_op_st8_i64:
3427        return C_O0_I2(qi, r);
3428
3429    case INDEX_op_st16_i32:
3430    case INDEX_op_st16_i64:
3431    case INDEX_op_st_i32:
3432    case INDEX_op_st32_i64:
3433        return C_O0_I2(ri, r);
3434
3435    case INDEX_op_st_i64:
3436        return C_O0_I2(re, r);
3437
3438    case INDEX_op_add_i32:
3439    case INDEX_op_add_i64:
3440        return C_O1_I2(r, r, re);
3441
3442    case INDEX_op_sub_i32:
3443    case INDEX_op_sub_i64:
3444    case INDEX_op_mul_i32:
3445    case INDEX_op_mul_i64:
3446    case INDEX_op_or_i32:
3447    case INDEX_op_or_i64:
3448    case INDEX_op_xor_i32:
3449    case INDEX_op_xor_i64:
3450        return C_O1_I2(r, 0, re);
3451
3452    case INDEX_op_and_i32:
3453    case INDEX_op_and_i64:
3454        return C_O1_I2(r, 0, reZ);
3455
3456    case INDEX_op_andc_i32:
3457    case INDEX_op_andc_i64:
3458        return C_O1_I2(r, r, rI);
3459
3460    case INDEX_op_shl_i32:
3461    case INDEX_op_shl_i64:
3462    case INDEX_op_shr_i32:
3463    case INDEX_op_shr_i64:
3464    case INDEX_op_sar_i32:
3465    case INDEX_op_sar_i64:
3466        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3467
3468    case INDEX_op_rotl_i32:
3469    case INDEX_op_rotl_i64:
3470    case INDEX_op_rotr_i32:
3471    case INDEX_op_rotr_i64:
3472        return C_O1_I2(r, 0, ci);
3473
3474    case INDEX_op_brcond_i32:
3475    case INDEX_op_brcond_i64:
3476        return C_O0_I2(r, reT);
3477
3478    case INDEX_op_bswap16_i32:
3479    case INDEX_op_bswap16_i64:
3480    case INDEX_op_bswap32_i32:
3481    case INDEX_op_bswap32_i64:
3482    case INDEX_op_bswap64_i64:
3483    case INDEX_op_neg_i32:
3484    case INDEX_op_neg_i64:
3485    case INDEX_op_not_i32:
3486    case INDEX_op_not_i64:
3487    case INDEX_op_extrh_i64_i32:
3488        return C_O1_I1(r, 0);
3489
3490    case INDEX_op_ext8s_i32:
3491    case INDEX_op_ext8s_i64:
3492    case INDEX_op_ext8u_i32:
3493    case INDEX_op_ext8u_i64:
3494        return C_O1_I1(r, q);
3495
3496    case INDEX_op_ext16s_i32:
3497    case INDEX_op_ext16s_i64:
3498    case INDEX_op_ext16u_i32:
3499    case INDEX_op_ext16u_i64:
3500    case INDEX_op_ext32s_i64:
3501    case INDEX_op_ext32u_i64:
3502    case INDEX_op_ext_i32_i64:
3503    case INDEX_op_extu_i32_i64:
3504    case INDEX_op_extrl_i64_i32:
3505    case INDEX_op_extract_i32:
3506    case INDEX_op_extract_i64:
3507    case INDEX_op_sextract_i32:
3508    case INDEX_op_ctpop_i32:
3509    case INDEX_op_ctpop_i64:
3510        return C_O1_I1(r, r);
3511
3512    case INDEX_op_extract2_i32:
3513    case INDEX_op_extract2_i64:
3514        return C_O1_I2(r, 0, r);
3515
3516    case INDEX_op_deposit_i32:
3517    case INDEX_op_deposit_i64:
3518        return C_O1_I2(q, 0, qi);
3519
3520    case INDEX_op_setcond_i32:
3521    case INDEX_op_setcond_i64:
3522    case INDEX_op_negsetcond_i32:
3523    case INDEX_op_negsetcond_i64:
3524        return C_O1_I2(q, r, reT);
3525
3526    case INDEX_op_movcond_i32:
3527    case INDEX_op_movcond_i64:
3528        return C_O1_I4(r, r, reT, r, 0);
3529
3530    case INDEX_op_div2_i32:
3531    case INDEX_op_div2_i64:
3532    case INDEX_op_divu2_i32:
3533    case INDEX_op_divu2_i64:
3534        return C_O2_I3(a, d, 0, 1, r);
3535
3536    case INDEX_op_mulu2_i32:
3537    case INDEX_op_mulu2_i64:
3538    case INDEX_op_muls2_i32:
3539    case INDEX_op_muls2_i64:
3540        return C_O2_I2(a, d, a, r);
3541
3542    case INDEX_op_add2_i32:
3543    case INDEX_op_add2_i64:
3544    case INDEX_op_sub2_i32:
3545    case INDEX_op_sub2_i64:
3546        return C_N1_O1_I4(r, r, 0, 1, re, re);
3547
3548    case INDEX_op_ctz_i32:
3549    case INDEX_op_ctz_i64:
3550        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3551
3552    case INDEX_op_clz_i32:
3553    case INDEX_op_clz_i64:
3554        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3555
3556    case INDEX_op_qemu_ld_a32_i32:
3557        return C_O1_I1(r, L);
3558    case INDEX_op_qemu_ld_a64_i32:
3559        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3560
3561    case INDEX_op_qemu_st_a32_i32:
3562        return C_O0_I2(L, L);
3563    case INDEX_op_qemu_st_a64_i32:
3564        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3565    case INDEX_op_qemu_st8_a32_i32:
3566        return C_O0_I2(s, L);
3567    case INDEX_op_qemu_st8_a64_i32:
3568        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3569
3570    case INDEX_op_qemu_ld_a32_i64:
3571        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3572    case INDEX_op_qemu_ld_a64_i64:
3573        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3574
3575    case INDEX_op_qemu_st_a32_i64:
3576        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3577    case INDEX_op_qemu_st_a64_i64:
3578        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3579
3580    case INDEX_op_qemu_ld_a32_i128:
3581    case INDEX_op_qemu_ld_a64_i128:
3582        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3583        return C_O2_I1(r, r, L);
3584    case INDEX_op_qemu_st_a32_i128:
3585    case INDEX_op_qemu_st_a64_i128:
3586        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3587        return C_O0_I3(L, L, L);
3588
3589    case INDEX_op_brcond2_i32:
3590        return C_O0_I4(r, r, ri, ri);
3591
3592    case INDEX_op_setcond2_i32:
3593        return C_O1_I4(r, r, r, ri, ri);
3594
3595    case INDEX_op_ld_vec:
3596    case INDEX_op_dupm_vec:
3597        return C_O1_I1(x, r);
3598
3599    case INDEX_op_st_vec:
3600        return C_O0_I2(x, r);
3601
3602    case INDEX_op_add_vec:
3603    case INDEX_op_sub_vec:
3604    case INDEX_op_mul_vec:
3605    case INDEX_op_and_vec:
3606    case INDEX_op_or_vec:
3607    case INDEX_op_xor_vec:
3608    case INDEX_op_andc_vec:
3609    case INDEX_op_orc_vec:
3610    case INDEX_op_nand_vec:
3611    case INDEX_op_nor_vec:
3612    case INDEX_op_eqv_vec:
3613    case INDEX_op_ssadd_vec:
3614    case INDEX_op_usadd_vec:
3615    case INDEX_op_sssub_vec:
3616    case INDEX_op_ussub_vec:
3617    case INDEX_op_smin_vec:
3618    case INDEX_op_umin_vec:
3619    case INDEX_op_smax_vec:
3620    case INDEX_op_umax_vec:
3621    case INDEX_op_shlv_vec:
3622    case INDEX_op_shrv_vec:
3623    case INDEX_op_sarv_vec:
3624    case INDEX_op_rotlv_vec:
3625    case INDEX_op_rotrv_vec:
3626    case INDEX_op_shls_vec:
3627    case INDEX_op_shrs_vec:
3628    case INDEX_op_sars_vec:
3629    case INDEX_op_cmp_vec:
3630    case INDEX_op_x86_shufps_vec:
3631    case INDEX_op_x86_blend_vec:
3632    case INDEX_op_x86_packss_vec:
3633    case INDEX_op_x86_packus_vec:
3634    case INDEX_op_x86_vperm2i128_vec:
3635    case INDEX_op_x86_punpckl_vec:
3636    case INDEX_op_x86_punpckh_vec:
3637    case INDEX_op_x86_vpshldi_vec:
3638#if TCG_TARGET_REG_BITS == 32
3639    case INDEX_op_dup2_vec:
3640#endif
3641        return C_O1_I2(x, x, x);
3642
3643    case INDEX_op_abs_vec:
3644    case INDEX_op_dup_vec:
3645    case INDEX_op_not_vec:
3646    case INDEX_op_shli_vec:
3647    case INDEX_op_shri_vec:
3648    case INDEX_op_sari_vec:
3649    case INDEX_op_rotli_vec:
3650    case INDEX_op_x86_psrldq_vec:
3651        return C_O1_I1(x, x);
3652
3653    case INDEX_op_x86_vpshldv_vec:
3654    case INDEX_op_x86_vpshrdv_vec:
3655        return C_O1_I3(x, 0, x, x);
3656
3657    case INDEX_op_bitsel_vec:
3658    case INDEX_op_x86_vpblendvb_vec:
3659        return C_O1_I3(x, x, x, x);
3660
3661    default:
3662        g_assert_not_reached();
3663    }
3664}
3665
3666int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3667{
3668    switch (opc) {
3669    case INDEX_op_add_vec:
3670    case INDEX_op_sub_vec:
3671    case INDEX_op_and_vec:
3672    case INDEX_op_or_vec:
3673    case INDEX_op_xor_vec:
3674    case INDEX_op_andc_vec:
3675    case INDEX_op_orc_vec:
3676    case INDEX_op_nand_vec:
3677    case INDEX_op_nor_vec:
3678    case INDEX_op_eqv_vec:
3679    case INDEX_op_not_vec:
3680    case INDEX_op_bitsel_vec:
3681        return 1;
3682    case INDEX_op_cmp_vec:
3683    case INDEX_op_cmpsel_vec:
3684        return -1;
3685
3686    case INDEX_op_rotli_vec:
3687        return have_avx512vl && vece >= MO_32 ? 1 : -1;
3688
3689    case INDEX_op_shli_vec:
3690    case INDEX_op_shri_vec:
3691        /* We must expand the operation for MO_8.  */
3692        return vece == MO_8 ? -1 : 1;
3693
3694    case INDEX_op_sari_vec:
3695        switch (vece) {
3696        case MO_8:
3697            return -1;
3698        case MO_16:
3699        case MO_32:
3700            return 1;
3701        case MO_64:
3702            if (have_avx512vl) {
3703                return 1;
3704            }
3705            /*
3706             * We can emulate this for MO_64, but it does not pay off
3707             * unless we're producing at least 4 values.
3708             */
3709            return type >= TCG_TYPE_V256 ? -1 : 0;
3710        }
3711        return 0;
3712
3713    case INDEX_op_shls_vec:
3714    case INDEX_op_shrs_vec:
3715        return vece >= MO_16;
3716    case INDEX_op_sars_vec:
3717        switch (vece) {
3718        case MO_16:
3719        case MO_32:
3720            return 1;
3721        case MO_64:
3722            return have_avx512vl;
3723        }
3724        return 0;
3725    case INDEX_op_rotls_vec:
3726        return vece >= MO_16 ? -1 : 0;
3727
3728    case INDEX_op_shlv_vec:
3729    case INDEX_op_shrv_vec:
3730        switch (vece) {
3731        case MO_16:
3732            return have_avx512bw;
3733        case MO_32:
3734        case MO_64:
3735            return have_avx2;
3736        }
3737        return 0;
3738    case INDEX_op_sarv_vec:
3739        switch (vece) {
3740        case MO_16:
3741            return have_avx512bw;
3742        case MO_32:
3743            return have_avx2;
3744        case MO_64:
3745            return have_avx512vl;
3746        }
3747        return 0;
3748    case INDEX_op_rotlv_vec:
3749    case INDEX_op_rotrv_vec:
3750        switch (vece) {
3751        case MO_16:
3752            return have_avx512vbmi2 ? -1 : 0;
3753        case MO_32:
3754        case MO_64:
3755            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3756        }
3757        return 0;
3758
3759    case INDEX_op_mul_vec:
3760        switch (vece) {
3761        case MO_8:
3762            return -1;
3763        case MO_64:
3764            return have_avx512dq;
3765        }
3766        return 1;
3767
3768    case INDEX_op_ssadd_vec:
3769    case INDEX_op_usadd_vec:
3770    case INDEX_op_sssub_vec:
3771    case INDEX_op_ussub_vec:
3772        return vece <= MO_16;
3773    case INDEX_op_smin_vec:
3774    case INDEX_op_smax_vec:
3775    case INDEX_op_umin_vec:
3776    case INDEX_op_umax_vec:
3777    case INDEX_op_abs_vec:
3778        return vece <= MO_32 || have_avx512vl;
3779
3780    default:
3781        return 0;
3782    }
3783}
3784
3785static void expand_vec_shi(TCGType type, unsigned vece, bool right,
3786                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3787{
3788    uint8_t mask;
3789
3790    tcg_debug_assert(vece == MO_8);
3791    if (right) {
3792        mask = 0xff >> imm;
3793        tcg_gen_shri_vec(MO_16, v0, v1, imm);
3794    } else {
3795        mask = 0xff << imm;
3796        tcg_gen_shli_vec(MO_16, v0, v1, imm);
3797    }
3798    tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask));
3799}
3800
3801static void expand_vec_sari(TCGType type, unsigned vece,
3802                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3803{
3804    TCGv_vec t1, t2;
3805
3806    switch (vece) {
3807    case MO_8:
3808        /* Unpack to 16-bit, shift, and repack.  */
3809        t1 = tcg_temp_new_vec(type);
3810        t2 = tcg_temp_new_vec(type);
3811        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3812                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3813        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3814                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3815        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3816        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3817        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3818                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3819        tcg_temp_free_vec(t1);
3820        tcg_temp_free_vec(t2);
3821        break;
3822
3823    case MO_64:
3824        t1 = tcg_temp_new_vec(type);
3825        if (imm <= 32) {
3826            /*
3827             * We can emulate a small sign extend by performing an arithmetic
3828             * 32-bit shift and overwriting the high half of a 64-bit logical
3829             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3830             * does not, so we have to bound the smaller shift -- we get the
3831             * same result in the high half either way.
3832             */
3833            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3834            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3835            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3836                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3837                      tcgv_vec_arg(t1), 0xaa);
3838        } else {
3839            /* Otherwise we will need to use a compare vs 0 to produce
3840             * the sign-extend, shift and merge.
3841             */
3842            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3843                            tcg_constant_vec(type, MO_64, 0), v1);
3844            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3845            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3846            tcg_gen_or_vec(MO_64, v0, v0, t1);
3847        }
3848        tcg_temp_free_vec(t1);
3849        break;
3850
3851    default:
3852        g_assert_not_reached();
3853    }
3854}
3855
3856static void expand_vec_rotli(TCGType type, unsigned vece,
3857                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3858{
3859    TCGv_vec t;
3860
3861    if (vece != MO_8 && have_avx512vbmi2) {
3862        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3863                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3864        return;
3865    }
3866
3867    t = tcg_temp_new_vec(type);
3868    tcg_gen_shli_vec(vece, t, v1, imm);
3869    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3870    tcg_gen_or_vec(vece, v0, v0, t);
3871    tcg_temp_free_vec(t);
3872}
3873
3874static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3875                            TCGv_vec v1, TCGv_vec sh, bool right)
3876{
3877    TCGv_vec t;
3878
3879    if (have_avx512vbmi2) {
3880        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3881                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3882                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3883        return;
3884    }
3885
3886    t = tcg_temp_new_vec(type);
3887    tcg_gen_dupi_vec(vece, t, 8 << vece);
3888    tcg_gen_sub_vec(vece, t, t, sh);
3889    if (right) {
3890        tcg_gen_shlv_vec(vece, t, v1, t);
3891        tcg_gen_shrv_vec(vece, v0, v1, sh);
3892    } else {
3893        tcg_gen_shrv_vec(vece, t, v1, t);
3894        tcg_gen_shlv_vec(vece, v0, v1, sh);
3895    }
3896    tcg_gen_or_vec(vece, v0, v0, t);
3897    tcg_temp_free_vec(t);
3898}
3899
3900static void expand_vec_rotls(TCGType type, unsigned vece,
3901                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3902{
3903    TCGv_vec t = tcg_temp_new_vec(type);
3904
3905    tcg_debug_assert(vece != MO_8);
3906
3907    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3908        tcg_gen_dup_i32_vec(vece, t, lsh);
3909        if (vece >= MO_32) {
3910            tcg_gen_rotlv_vec(vece, v0, v1, t);
3911        } else {
3912            expand_vec_rotv(type, vece, v0, v1, t, false);
3913        }
3914    } else {
3915        TCGv_i32 rsh = tcg_temp_new_i32();
3916
3917        tcg_gen_neg_i32(rsh, lsh);
3918        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3919        tcg_gen_shls_vec(vece, t, v1, lsh);
3920        tcg_gen_shrs_vec(vece, v0, v1, rsh);
3921        tcg_gen_or_vec(vece, v0, v0, t);
3922
3923        tcg_temp_free_i32(rsh);
3924    }
3925
3926    tcg_temp_free_vec(t);
3927}
3928
3929static void expand_vec_mul(TCGType type, unsigned vece,
3930                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3931{
3932    TCGv_vec t1, t2, t3, t4, zero;
3933
3934    tcg_debug_assert(vece == MO_8);
3935
3936    /*
3937     * Unpack v1 bytes to words, 0 | x.
3938     * Unpack v2 bytes to words, y | 0.
3939     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3940     * Shift logical right by 8 bits to clear the high 8 bytes before
3941     * using an unsigned saturated pack.
3942     *
3943     * The difference between the V64, V128 and V256 cases is merely how
3944     * we distribute the expansion between temporaries.
3945     */
3946    switch (type) {
3947    case TCG_TYPE_V64:
3948        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3949        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3950        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3951        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3952                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3953        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3954                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3955        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3956        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3957        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3958                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3959        tcg_temp_free_vec(t1);
3960        tcg_temp_free_vec(t2);
3961        break;
3962
3963    case TCG_TYPE_V128:
3964    case TCG_TYPE_V256:
3965        t1 = tcg_temp_new_vec(type);
3966        t2 = tcg_temp_new_vec(type);
3967        t3 = tcg_temp_new_vec(type);
3968        t4 = tcg_temp_new_vec(type);
3969        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3970        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3971                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3972        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3973                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3974        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3975                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3976        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3977                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3978        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3979        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3980        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3981        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3982        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3983                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3984        tcg_temp_free_vec(t1);
3985        tcg_temp_free_vec(t2);
3986        tcg_temp_free_vec(t3);
3987        tcg_temp_free_vec(t4);
3988        break;
3989
3990    default:
3991        g_assert_not_reached();
3992    }
3993}
3994
3995static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3996                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3997{
3998    enum {
3999        NEED_INV  = 1,
4000        NEED_SWAP = 2,
4001        NEED_BIAS = 4,
4002        NEED_UMIN = 8,
4003        NEED_UMAX = 16,
4004    };
4005    TCGv_vec t1, t2, t3;
4006    uint8_t fixup;
4007
4008    switch (cond) {
4009    case TCG_COND_EQ:
4010    case TCG_COND_GT:
4011        fixup = 0;
4012        break;
4013    case TCG_COND_NE:
4014    case TCG_COND_LE:
4015        fixup = NEED_INV;
4016        break;
4017    case TCG_COND_LT:
4018        fixup = NEED_SWAP;
4019        break;
4020    case TCG_COND_GE:
4021        fixup = NEED_SWAP | NEED_INV;
4022        break;
4023    case TCG_COND_LEU:
4024        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
4025            fixup = NEED_UMIN;
4026        } else {
4027            fixup = NEED_BIAS | NEED_INV;
4028        }
4029        break;
4030    case TCG_COND_GTU:
4031        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
4032            fixup = NEED_UMIN | NEED_INV;
4033        } else {
4034            fixup = NEED_BIAS;
4035        }
4036        break;
4037    case TCG_COND_GEU:
4038        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
4039            fixup = NEED_UMAX;
4040        } else {
4041            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
4042        }
4043        break;
4044    case TCG_COND_LTU:
4045        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
4046            fixup = NEED_UMAX | NEED_INV;
4047        } else {
4048            fixup = NEED_BIAS | NEED_SWAP;
4049        }
4050        break;
4051    default:
4052        g_assert_not_reached();
4053    }
4054
4055    if (fixup & NEED_INV) {
4056        cond = tcg_invert_cond(cond);
4057    }
4058    if (fixup & NEED_SWAP) {
4059        t1 = v1, v1 = v2, v2 = t1;
4060        cond = tcg_swap_cond(cond);
4061    }
4062
4063    t1 = t2 = NULL;
4064    if (fixup & (NEED_UMIN | NEED_UMAX)) {
4065        t1 = tcg_temp_new_vec(type);
4066        if (fixup & NEED_UMIN) {
4067            tcg_gen_umin_vec(vece, t1, v1, v2);
4068        } else {
4069            tcg_gen_umax_vec(vece, t1, v1, v2);
4070        }
4071        v2 = t1;
4072        cond = TCG_COND_EQ;
4073    } else if (fixup & NEED_BIAS) {
4074        t1 = tcg_temp_new_vec(type);
4075        t2 = tcg_temp_new_vec(type);
4076        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
4077        tcg_gen_sub_vec(vece, t1, v1, t3);
4078        tcg_gen_sub_vec(vece, t2, v2, t3);
4079        v1 = t1;
4080        v2 = t2;
4081        cond = tcg_signed_cond(cond);
4082    }
4083
4084    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
4085    /* Expand directly; do not recurse.  */
4086    vec_gen_4(INDEX_op_cmp_vec, type, vece,
4087              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
4088
4089    if (t1) {
4090        tcg_temp_free_vec(t1);
4091        if (t2) {
4092            tcg_temp_free_vec(t2);
4093        }
4094    }
4095    return fixup & NEED_INV;
4096}
4097
4098static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
4099                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
4100{
4101    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
4102        tcg_gen_not_vec(vece, v0, v0);
4103    }
4104}
4105
4106static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
4107                              TCGv_vec c1, TCGv_vec c2,
4108                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
4109{
4110    TCGv_vec t = tcg_temp_new_vec(type);
4111
4112    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
4113        /* Invert the sense of the compare by swapping arguments.  */
4114        TCGv_vec x;
4115        x = v3, v3 = v4, v4 = x;
4116    }
4117    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
4118              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
4119              tcgv_vec_arg(v3), tcgv_vec_arg(t));
4120    tcg_temp_free_vec(t);
4121}
4122
4123void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
4124                       TCGArg a0, ...)
4125{
4126    va_list va;
4127    TCGArg a2;
4128    TCGv_vec v0, v1, v2, v3, v4;
4129
4130    va_start(va, a0);
4131    v0 = temp_tcgv_vec(arg_temp(a0));
4132    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4133    a2 = va_arg(va, TCGArg);
4134
4135    switch (opc) {
4136    case INDEX_op_shli_vec:
4137        expand_vec_shi(type, vece, false, v0, v1, a2);
4138        break;
4139    case INDEX_op_shri_vec:
4140        expand_vec_shi(type, vece, true, v0, v1, a2);
4141        break;
4142    case INDEX_op_sari_vec:
4143        expand_vec_sari(type, vece, v0, v1, a2);
4144        break;
4145
4146    case INDEX_op_rotli_vec:
4147        expand_vec_rotli(type, vece, v0, v1, a2);
4148        break;
4149
4150    case INDEX_op_rotls_vec:
4151        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
4152        break;
4153
4154    case INDEX_op_rotlv_vec:
4155        v2 = temp_tcgv_vec(arg_temp(a2));
4156        expand_vec_rotv(type, vece, v0, v1, v2, false);
4157        break;
4158    case INDEX_op_rotrv_vec:
4159        v2 = temp_tcgv_vec(arg_temp(a2));
4160        expand_vec_rotv(type, vece, v0, v1, v2, true);
4161        break;
4162
4163    case INDEX_op_mul_vec:
4164        v2 = temp_tcgv_vec(arg_temp(a2));
4165        expand_vec_mul(type, vece, v0, v1, v2);
4166        break;
4167
4168    case INDEX_op_cmp_vec:
4169        v2 = temp_tcgv_vec(arg_temp(a2));
4170        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
4171        break;
4172
4173    case INDEX_op_cmpsel_vec:
4174        v2 = temp_tcgv_vec(arg_temp(a2));
4175        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4176        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4177        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
4178        break;
4179
4180    default:
4181        break;
4182    }
4183
4184    va_end(va);
4185}
4186
4187static const int tcg_target_callee_save_regs[] = {
4188#if TCG_TARGET_REG_BITS == 64
4189    TCG_REG_RBP,
4190    TCG_REG_RBX,
4191#if defined(_WIN64)
4192    TCG_REG_RDI,
4193    TCG_REG_RSI,
4194#endif
4195    TCG_REG_R12,
4196    TCG_REG_R13,
4197    TCG_REG_R14, /* Currently used for the global env. */
4198    TCG_REG_R15,
4199#else
4200    TCG_REG_EBP, /* Currently used for the global env. */
4201    TCG_REG_EBX,
4202    TCG_REG_ESI,
4203    TCG_REG_EDI,
4204#endif
4205};
4206
/*
 * The frame layout is expressed with macros so that
 * tcg_target_qemu_prologue and tcg_register_jit share one definition.
 */

/* One extra slot plus one slot per saved register, in bytes. */
#define PUSH_SIZE \
    ((ARRAY_SIZE(tcg_target_callee_save_regs) + 1) \
     * (TCG_TARGET_REG_BITS / 8))

/*
 * PUSH_SIZE plus the static call-argument area and the TCG temp buffer,
 * rounded up to a multiple of TCG_TARGET_STACK_ALIGN.
 */
#define FRAME_SIZE \
    ((PUSH_SIZE + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + (TCG_TARGET_STACK_ALIGN - 1)) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
4220
4221/* Generate global QEMU prologue and epilogue code */
4222static void tcg_target_qemu_prologue(TCGContext *s)
4223{
4224    int i, stack_addend;
4225
4226    /* TB prologue */
4227
4228    /* Reserve some stack space, also for TCG temps.  */
4229    stack_addend = FRAME_SIZE - PUSH_SIZE;
4230    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4231                  CPU_TEMP_BUF_NLONGS * sizeof(long));
4232
4233    /* Save all callee saved registers.  */
4234    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4235        tcg_out_push(s, tcg_target_callee_save_regs[i]);
4236    }
4237
4238    if (!tcg_use_softmmu && guest_base) {
4239        int seg = setup_guest_base_seg();
4240        if (seg != 0) {
4241            x86_guest_base.seg = seg;
4242        } else if (guest_base == (int32_t)guest_base) {
4243            x86_guest_base.ofs = guest_base;
4244        } else {
4245            assert(TCG_TARGET_REG_BITS == 64);
4246            /* Choose R12 because, as a base, it requires a SIB byte. */
4247            x86_guest_base.index = TCG_REG_R12;
4248            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
4249            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4250        }
4251    }
4252
4253    if (TCG_TARGET_REG_BITS == 32) {
4254        tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4255                   (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4256        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4257        /* jmp *tb.  */
4258        tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4259                             (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4260                             + stack_addend);
4261    } else {
4262        tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4263        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4264        /* jmp *tb.  */
4265        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4266    }
4267
4268    /*
4269     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4270     * and fall through to the rest of the epilogue.
4271     */
4272    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4273    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4274
4275    /* TB epilogue */
4276    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4277
4278    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4279
4280    if (have_avx2) {
4281        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4282    }
4283    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4284        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4285    }
4286    tcg_out_opc(s, OPC_RET, 0, 0, 0);
4287}
4288
4289static void tcg_out_tb_start(TCGContext *s)
4290{
4291    /* nothing to do */
4292}
4293
4294static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4295{
4296    memset(p, 0x90, count);
4297}
4298
4299static void tcg_target_init(TCGContext *s)
4300{
4301    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4302    if (TCG_TARGET_REG_BITS == 64) {
4303        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4304    }
4305    if (have_avx1) {
4306        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4307        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4308    }
4309    if (have_avx2) {
4310        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4311    }
4312
4313    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4314    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4315    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4316    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4317    if (TCG_TARGET_REG_BITS == 64) {
4318#if !defined(_WIN64)
4319        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4320        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4321#endif
4322        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4323        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4324        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4325        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4326    }
4327
4328    s->reserved_regs = 0;
4329    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4330    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4331#ifdef _WIN64
4332    /* These are call saved, and we don't save them, so don't use them. */
4333    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4334    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4335    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4336    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4337    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4338    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4339    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4340    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4341    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4342    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4343#endif
4344}
4345
4346typedef struct {
4347    DebugFrameHeader h;
4348    uint8_t fde_def_cfa[4];
4349    uint8_t fde_reg_ofs[14];
4350} DebugFrame;
4351
4352/* We're expecting a 2 byte uleb128 encoded value.  */
4353QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4354
4355#if !defined(__ELF__)
4356    /* Host machine without ELF. */
4357#elif TCG_TARGET_REG_BITS == 64
4358#define ELF_HOST_MACHINE EM_X86_64
4359static const DebugFrame debug_frame = {
4360    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4361    .h.cie.id = -1,
4362    .h.cie.version = 1,
4363    .h.cie.code_align = 1,
4364    .h.cie.data_align = 0x78,             /* sleb128 -8 */
4365    .h.cie.return_column = 16,
4366
4367    /* Total FDE size does not include the "len" member.  */
4368    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4369
4370    .fde_def_cfa = {
4371        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
4372        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4373        (FRAME_SIZE >> 7)
4374    },
4375    .fde_reg_ofs = {
4376        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
4377        /* The following ordering must match tcg_target_callee_save_regs.  */
4378        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
4379        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
4380        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
4381        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
4382        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
4383        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
4384    }
4385};
4386#else
4387#define ELF_HOST_MACHINE EM_386
4388static const DebugFrame debug_frame = {
4389    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4390    .h.cie.id = -1,
4391    .h.cie.version = 1,
4392    .h.cie.code_align = 1,
4393    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
4394    .h.cie.return_column = 8,
4395
4396    /* Total FDE size does not include the "len" member.  */
4397    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4398
4399    .fde_def_cfa = {
4400        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
4401        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4402        (FRAME_SIZE >> 7)
4403    },
4404    .fde_reg_ofs = {
4405        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4406        /* The following ordering must match tcg_target_callee_save_regs.  */
4407        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4408        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4409        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4410        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
4411    }
4412};
4413#endif
4414
#ifdef ELF_HOST_MACHINE
/*
 * Register the code buffer [@buf, @buf + @buf_size) with the JIT debug
 * interface, passing this backend's unwind description along.
 */
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof debug_frame);
}
#endif
4421