/* /openbmc/qemu/tcg/i386/tcg-target.c.inc (revision 2bfb10df) */
/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "../tcg-ldst.c.inc"
#include "../tcg-pool.c.inc"

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32-bit mode uses a stack-based calling convention (GCC default). */
#endif
};

static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
{
    switch (kind) {
    case TCG_CALL_RET_NORMAL:
        tcg_debug_assert(slot >= 0 && slot <= 1);
        return slot ? TCG_REG_EDX : TCG_REG_EAX;
#ifdef _WIN64
    case TCG_CALL_RET_BY_VEC:
        tcg_debug_assert(slot == 0);
        return TCG_REG_XMM0;
#endif
    default:
        g_assert_not_reached();
    }
}

/* Constants we accept.  */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

#define ALL_BYTEH_REGS         0x0000000fu
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
# define ALL_BYTEL_REGS        ALL_BYTEH_REGS
#endif
#ifdef CONFIG_SOFTMMU
# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
#else
# define SOFTMMU_RESERVE_REGS  0
#endif

/* For 64-bit, we always know that CMOV is available.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov      true
#else
# define have_cmov      (cpuinfo & CPUINFO_CMOV)
#endif
#define have_bmi2       (cpuinfo & CPUINFO_BMI2)
#define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)

static const tcg_insn_unit *tb_ret_addr;

static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
                        intptr_t value, intptr_t addend)
{
    value += addend;
    switch(type) {
    case R_386_PC32:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int32_t)value) {
            return false;
        }
        /* FALLTHRU */
    case R_386_32:
        tcg_patch32(code_ptr, value);
        break;
    case R_386_PC8:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int8_t)value) {
            return false;
        }
        tcg_patch8(code_ptr, value);
        break;
    default:
        g_assert_not_reached();
    }
    return true;
}

/* test if a constant matches the constraint */
static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
{
    if (ct & TCG_CT_CONST) {
        return 1;
    }
    if (type == TCG_TYPE_I32) {
        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
            return 1;
        }
    } else {
        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
            return 1;
        }
    }
    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
        return 1;
    }
    return 0;
}

# define LOWREGMASK(x)	((x) & 7)

#define P_EXT		0x100		/* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#define P_VEXW          0x1000          /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_REXW		0
# define P_REXB_R	0
# define P_REXB_RM	0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
#define P_EVEX          0x100000        /* Requires EVEX encoding */
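
/*
 * The OPC_* values below pack the final opcode byte into the low 8 bits
 * and OR in the P_* prefix flags above; tcg_out_opc() turns those flags
 * back into prefix bytes.  For example, OPC_MOVZWL (0xb7 | P_EXT) is
 * emitted as 0f b7, and adding P_DATA16 would prepend a 66 prefix.
 */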

#define OPC_ARITH_EvIz	(0x81)
#define OPC_ARITH_EvIb	(0x83)
#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP	(0xc8 | P_EXT)
#define OPC_CALL_Jz	(0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32	(0x48)
#define OPC_IMUL_GvEv	(0xaf | P_EXT)
#define OPC_IMUL_GvEvIb	(0x6b)
#define OPC_IMUL_GvEvIz	(0x69)
#define OPC_INC_r32	(0x40)
#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
#define OPC_JCC_short	(0x70)		/* ... plus condition code */
#define OPC_JMP_long	(0xe9)
#define OPC_JMP_short	(0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz	(0xc7)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL	(0xbe | P_EXT)
#define OPC_MOVSWL	(0xbf | P_EXT)
#define OPC_MOVSLQ	(0x63 | P_REXW)
#define OPC_MOVZBL	(0xb6 | P_EXT)
#define OPC_MOVZWL	(0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32	(0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32	(0x50)
#define OPC_PUSH_Iv	(0x68)
#define OPC_PUSH_Ib	(0x6a)
#define OPC_RET		(0xc3)
#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1	(0xd1)
#define OPC_SHIFT_Ib	(0xc1)
#define OPC_SHIFT_cl	(0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL	(0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32	(0x90)
#define OPC_XCHG_EvGv   (0x87)

#define OPC_GRP3_Eb     (0xf6)
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)

/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_TESTi 0
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev	0
#define EXT5_DEC_Ev	1
#define EXT5_CALLN_Ev	2
#define EXT5_JMPN_Ev	4

/* Condition codes to be added to OPC_JCC_{long,short}.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf

static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};
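
/*
 * Signed conditions map onto jl/jge/jle/jg (SF/OF based), unsigned ones
 * onto jb/jae/jbe/ja (CF based).  The table value is added to
 * OPC_JCC_short or OPC_JCC_long; e.g. TCG_COND_LTU yields JCC_JB,
 * encoded as 72 rel8 or 0f 82 rel32.
 */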

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
    int rex;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    if (opc & P_DATA16) {
        /* We should never be asking for both 16 and 64-bit operation.  */
        tcg_debug_assert((opc & P_REXW) == 0);
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    rex = 0;
    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
    rex |= (r & 8) >> 1;                /* REX.R */
    rex |= (x & 8) >> 2;                /* REX.X */
    rex |= (rm & 8) >> 3;               /* REX.B */

    /* P_REXB_{R,RM} indicates that the given register is the low byte.
       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
       as otherwise the encoding indicates %[abcd]h.  Note that the values
       that are ORed in merely indicate that the REX byte must be present;
       those bits get discarded in output.  */
    rex |= opc & (r >= 4 ? P_REXB_R : 0);
    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);

    if (rex) {
        tcg_out8(s, (uint8_t)(rex | 0x40));
    }

    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }

    tcg_out8(s, opc);
}
#else
static void tcg_out_opc(TCGContext *s, int opc)
{
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }
    tcg_out8(s, opc);
}
/* Discard the register arguments to tcg_out_opc early, so as not to penalize
   the 32-bit compilation paths.  This method works with all versions of gcc,
   whereas relying on optimization may not be able to exclude them.  */
#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
#endif

static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
{
    tcg_out_opc(s, opc, r, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
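
/*
 * Worked example: tcg_out_modrm(s, OPC_MOVL_GvEv + P_REXW,
 * TCG_REG_RAX, TCG_REG_R12) emits 49 8b c4, i.e. REX.W+REX.B,
 * opcode 8b, ModRM 11 000 100 -- movq %r12, %rax.
 */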

static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
                            int rm, int index)
{
    int tmp;

    /* Use the two byte form if possible, which cannot encode
       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
        && ((rm | index) & 8) == 0) {
        /* Two byte VEX prefix.  */
        tcg_out8(s, 0xc5);

        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
    } else {
        /* Three byte VEX prefix.  */
        tcg_out8(s, 0xc4);

        /* VEX.m-mmmm */
        if (opc & P_EXT3A) {
            tmp = 3;
        } else if (opc & P_EXT38) {
            tmp = 2;
        } else if (opc & P_EXT) {
            tmp = 1;
        } else {
            g_assert_not_reached();
        }
        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
        tcg_out8(s, tmp);

        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
    }

    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
    /* VEX.pp */
    if (opc & P_DATA16) {
        tmp |= 1;                          /* 0x66 */
    } else if (opc & P_SIMDF3) {
        tmp |= 2;                          /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        tmp |= 3;                          /* 0xf2 */
    }
    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
    tcg_out8(s, tmp);
    tcg_out8(s, opc);
}

static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
                             int rm, int index)
{
    /* The entire 4-byte evex prefix; with R' and V' set. */
    uint32_t p = 0x08041062;
    int mm, pp;

    tcg_debug_assert(have_avx512vl);

    /* EVEX.mm */
    if (opc & P_EXT3A) {
        mm = 3;
    } else if (opc & P_EXT38) {
        mm = 2;
    } else if (opc & P_EXT) {
        mm = 1;
    } else {
        g_assert_not_reached();
    }

    /* EVEX.pp */
    if (opc & P_DATA16) {
        pp = 1;                          /* 0x66 */
    } else if (opc & P_SIMDF3) {
        pp = 2;                          /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        pp = 3;                          /* 0xf2 */
    } else {
        pp = 0;
    }

    p = deposit32(p, 8, 2, mm);
    p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
    p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
    p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
    p = deposit32(p, 16, 2, pp);
    p = deposit32(p, 19, 4, ~v);
    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);

    tcg_out32(s, p);
    tcg_out8(s, opc);
}

static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
{
    if (opc & P_EVEX) {
        tcg_out_evex_opc(s, opc, r, v, rm, 0);
    } else {
        tcg_out_vex_opc(s, opc, r, v, rm, 0);
    }
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
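
/*
 * Worked example: tcg_out_vex_modrm(s, OPC_PXOR, r, r, r) with
 * r = %xmm0 uses the two-byte VEX form and emits c5 f9 ef c0,
 * i.e. vpxor %xmm0, %xmm0, %xmm0 (pp = 01 for 0x66, vvvv = ~0).
 */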

/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
   We handle either RM or INDEX missing with a negative value.  In 64-bit
   mode for absolute addresses, ~RM is the size of the immediate operand
   that will follow the instruction.  */

static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
                               int shift, intptr_t offset)
{
    int mod, len;

    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding.  */
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
            if (disp == (int32_t)disp) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                tcg_out32(s, disp);
                return;
            }

            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing.  */
            if (offset == (int32_t)offset) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);
                return;
            }

            /* ??? The memory isn't directly addressable.  */
            g_assert_not_reached();
        } else {
            /* Absolute address.  */
            tcg_out8(s, (r << 3) | 5);
            tcg_out32(s, offset);
            return;
        }
    }

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing.  */
    if (rm < 0) {
        mod = 0, len = 4, rm = 5;
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
        mod = 0, len = 0;
    } else if (offset == (int8_t)offset) {
        mod = 0x40, len = 1;
    } else {
        mod = 0x80, len = 4;
    }

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form.  */
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
        /* Single byte MODRM format.  */
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    } else {
        /* Two byte MODRM+SIB format.  */

        /* Note that the encoding that would place %esp into the index
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index.  */
        if (index < 0) {
            index = 4;
        } else {
            tcg_debug_assert(index != TCG_REG_ESP);
        }

        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
    }

    if (len == 1) {
        tcg_out8(s, offset);
    } else if (len == 4) {
        tcg_out32(s, offset);
    }
}

static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                                     int index, int shift, intptr_t offset)
{
    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}
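
/*
 * Worked example: tcg_out_modrm_sib_offset(s, OPC_MOVL_GvEv,
 * TCG_REG_EAX, TCG_REG_EBX, -1, 0, 16) emits 8b 43 10, i.e.
 * movl 16(%ebx), %eax -- mod=01 with a single displacement byte
 * and no SIB byte required.
 */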
780
781static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
782                                         int rm, int index, int shift,
783                                         intptr_t offset)
784{
785    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
786    tcg_out_sib_offset(s, r, rm, index, shift, offset);
787}
788
789/* A simplification of the above with no index or shift.  */
790static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
791                                        int rm, intptr_t offset)
792{
793    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
794}
795
796static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
797                                            int v, int rm, intptr_t offset)
798{
799    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
800}
801
802/* Output an opcode with an expected reference to the constant pool.  */
803static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
804{
805    tcg_out_opc(s, opc, r, 0, 0);
806    /* Absolute for 32-bit, pc-relative for 64-bit.  */
807    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
808    tcg_out32(s, 0);
809}
810
811/* Output an opcode with an expected reference to the constant pool.  */
812static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
813{
814    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
815    /* Absolute for 32-bit, pc-relative for 64-bit.  */
816    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
817    tcg_out32(s, 0);
818}
819
820/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
821static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
822{
823    /* Propagate an opcode prefix, such as P_REXW.  */
824    int ext = subop & ~0x7;
825    subop &= 0x7;
826
827    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
828}
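
/* E.g. tgen_arithr(s, ARITH_SUB + P_REXW, TCG_REG_RAX, TCG_REG_RCX)
   emits 48 2b c1, i.e. subq %rcx, %rax.  */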

static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
    int rexw = 0;

    if (arg == ret) {
        return true;
    }
    switch (type) {
    case TCG_TYPE_I64:
        rexw = P_REXW;
        /* fallthru */
    case TCG_TYPE_I32:
        if (ret < 16) {
            if (arg < 16) {
                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
            }
        } else {
            if (arg < 16) {
                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
            }
        }
        break;

    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
        break;

    default:
        g_assert_not_reached();
    }
    return true;
}

static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};

static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg r, TCGReg a)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
    } else {
        switch (vece) {
        case MO_8:
            /* ??? With zero in a register, use PSHUFB.  */
            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_16:
            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_32:
            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
            /* imm8 operand: all output lanes selected from input lane 0.  */
            tcg_out8(s, 0);
            break;
        case MO_64:
            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg r, TCGReg base, intptr_t offset)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
                                 r, 0, base, offset);
    } else {
        switch (vece) {
        case MO_64:
            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
            break;
        case MO_32:
            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
            break;
        case MO_16:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        case MO_8:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg ret, int64_t arg)
{
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);

    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
        return;
    }

    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
        if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
        }
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    } else {
        if (type == TCG_TYPE_V64) {
            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
        } else if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
        }
        if (TCG_TARGET_REG_BITS == 64) {
            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
        } else {
            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
        }
    }
}

static void tcg_out_movi_vec(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
        return;
    }

    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
    if (TCG_TARGET_REG_BITS == 64) {
        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    } else {
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    }
}

static void tcg_out_movi_int(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    tcg_target_long diff;

    if (arg == 0) {
        tgen_arithr(s, ARITH_XOR, ret, ret);
        return;
    }
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
        tcg_out32(s, arg);
        return;
    }
    if (arg == (int32_t)arg) {
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
        tcg_out32(s, arg);
        return;
    }

    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}
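
/*
 * The encodings chosen above, smallest first: xor (2-3 bytes) for zero,
 * b8+rd imm32 (5-6 bytes) for values that zero-extend, REX.W c7 /0 imm32
 * (7 bytes) for values that sign-extend, a 7-byte rip-relative lea when
 * the constant is within +-2GB of the code, else the 10-byte movabs.
 */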

static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
#endif
        if (ret < 16) {
            tcg_out_movi_int(s, type, ret, arg);
        } else {
            tcg_out_movi_vec(s, type, ret, arg);
        }
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
    return true;
}

static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
                             tcg_target_long imm)
{
    /* This function is only used for passing structs by reference. */
    tcg_debug_assert(imm == (int32_t)imm);
    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        g_assert_not_reached();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we only need care for
       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the sse insn.  */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}
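
/* The barrier sequence emitted above is f0 83 0c 24 00,
   i.e. lock orl $0, (%esp).  */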

static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         *
         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
         * for _WIN64, which must have SSE2 but may not have AVX.
         */
        tcg_debug_assert(arg >= 16);
        if (have_avx1) {
            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
        }
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    } else if (type != TCG_TYPE_I32) {
        return false;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}

static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16.  */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}
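
/* E.g. tcg_out_shifti(s, SHIFT_SHL + P_REXW, TCG_REG_RAX, 4) emits
   48 c1 e0 04 (shlq $4, %rax); a count of 1 uses the shorter d1 form.  */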

static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}

static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* movzbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}

static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    /* movsbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
}

static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* movzwl */
    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
}

static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    /* movsw[lq] */
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
}

static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* 32-bit mov zero extends.  */
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
}

static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
}

static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32s(s, dest, src);
}

static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    if (dest != src) {
        tcg_out_ext32u(s, dest, src);
    }
}

static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32u(s, dest, src);
}

static inline void tcg_out_bswap64(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
}

static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64) {
        rexw = c & -8;
        c &= 7;
    }

    /* ??? While INC is 2 bytes shorter than ADDL $1, it also induces
       partial-flags-update stalls on Pentium 4 and is not recommended
       by current Intel optimization manuals.  */
    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
        int is_inc = (c == ARITH_ADD) ^ (val < 0);
        if (TCG_TARGET_REG_BITS == 64) {
            /* The single-byte increment encodings are re-tasked as the
               REX prefixes.  Use the MODRM encoding.  */
            tcg_out_modrm(s, OPC_GRP5 + rexw,
                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
        } else {
            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
        }
        return;
    }

    if (c == ARITH_AND) {
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation.  */
                rexw = 0;
            }
        }
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
    }

    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    g_assert_not_reached();
}
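
/* E.g. tgen_arithi(s, ARITH_ADD + P_REXW, TCG_REG_RSP, 8, 0) takes the
   sign-extended imm8 path and emits 48 83 c4 08, i.e. addq $8, %rsp.  */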

static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
{
    if (val != 0) {
        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
    }
}

/* Set SMALL to force a short forward branch.  */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
{
    int32_t val, val1;

    if (l->has_value) {
        val = tcg_pcrel_diff(s, l->u.value_ptr);
        val1 = val - 2;
        if ((int8_t)val1 == val1) {
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_short);
            } else {
                tcg_out8(s, OPC_JCC_short + opc);
            }
            tcg_out8(s, val1);
        } else {
            tcg_debug_assert(!small);
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_long);
                tcg_out32(s, val - 5);
            } else {
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
                tcg_out32(s, val - 6);
            }
        }
    } else if (small) {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_short);
        } else {
            tcg_out8(s, OPC_JCC_short + opc);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
        s->code_ptr += 1;
    } else {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_long);
        } else {
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
        s->code_ptr += 4;
    }
}
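
/*
 * The displacements above are relative to the end of the instruction:
 * 2 bytes for the short forms (opcode + rel8), 5 for jmp rel32 (e9),
 * 6 for the long jcc (0f 8x rel32) -- hence val - 2, val - 5, val - 6.
 */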

static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
                        int const_arg2, int rexw)
{
    if (const_arg2) {
        if (arg2 == 0) {
            /* test r, r */
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
    } else {
        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
    }
}
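
/* Comparison against constant 0 uses test r,r (e.g. 85 c0 for
   testl %eax, %eax), which sets the same flags as cmp $0 for the
   condition codes used here.  */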

static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}
#else
/* XXX: we implement it at the target level to avoid having to
   handle temporaries that live across basic blocks */
1451static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1452                            const int *const_args, int small)
1453{
1454    TCGLabel *label_next = gen_new_label();
1455    TCGLabel *label_this = arg_label(args[5]);
1456
1457    switch(args[4]) {
1458    case TCG_COND_EQ:
1459        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1460                         label_next, 1);
1461        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1462                         label_this, small);
1463        break;
1464    case TCG_COND_NE:
1465        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1466                         label_this, small);
1467        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1468                         label_this, small);
1469        break;
1470    case TCG_COND_LT:
1471        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1472                         label_this, small);
1473        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1474        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1475                         label_this, small);
1476        break;
1477    case TCG_COND_LE:
1478        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1479                         label_this, small);
1480        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1481        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1482                         label_this, small);
1483        break;
1484    case TCG_COND_GT:
1485        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1486                         label_this, small);
1487        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1488        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1489                         label_this, small);
1490        break;
1491    case TCG_COND_GE:
1492        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1493                         label_this, small);
1494        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1495        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1496                         label_this, small);
1497        break;
1498    case TCG_COND_LTU:
1499        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1500                         label_this, small);
1501        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1502        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1503                         label_this, small);
1504        break;
1505    case TCG_COND_LEU:
1506        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1507                         label_this, small);
1508        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1509        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1510                         label_this, small);
1511        break;
1512    case TCG_COND_GTU:
1513        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1514                         label_this, small);
1515        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1516        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1517                         label_this, small);
1518        break;
1519    case TCG_COND_GEU:
1520        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1521                         label_this, small);
1522        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1523        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1524                         label_this, small);
1525        break;
1526    default:
1527        g_assert_not_reached();
1528    }
1529    tcg_out_label(s, label_next);
1530}
1531#endif
1532
1533static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1534                              TCGArg arg1, TCGArg arg2, int const_arg2)
1535{
1536    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1537    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1538    tcg_out_ext8u(s, dest, dest);
1539}
1540
1541#if TCG_TARGET_REG_BITS == 64
1542static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1543                              TCGArg arg1, TCGArg arg2, int const_arg2)
1544{
1545    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1546    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1547    tcg_out_ext8u(s, dest, dest);
1548}
1549#else
1550static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1551                             const int *const_args)
1552{
1553    TCGArg new_args[6];
1554    TCGLabel *label_true, *label_over;
1555
1556    memcpy(new_args, args+1, 5*sizeof(TCGArg));
1557
1558    if (args[0] == args[1] || args[0] == args[2]
1559        || (!const_args[3] && args[0] == args[3])
1560        || (!const_args[4] && args[0] == args[4])) {
1561        /* When the destination overlaps with one of the argument
1562           registers, don't do anything tricky.  */
1563        label_true = gen_new_label();
1564        label_over = gen_new_label();
1565
1566        new_args[5] = label_arg(label_true);
1567        tcg_out_brcond2(s, new_args, const_args+1, 1);
1568
1569        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1570        tcg_out_jxx(s, JCC_JMP, label_over, 1);
1571        tcg_out_label(s, label_true);
1572
1573        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1574        tcg_out_label(s, label_over);
1575    } else {
1576        /* When the destination does not overlap one of the arguments,
1577           clear the destination first, jump if cond false, and emit an
1578           increment in the true case.  This results in smaller code.  */
1579
1580        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1581
1582        label_over = gen_new_label();
1583        new_args[4] = tcg_invert_cond(new_args[4]);
1584        new_args[5] = label_arg(label_over);
1585        tcg_out_brcond2(s, new_args, const_args+1, 1);
1586
1587        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1588        tcg_out_label(s, label_over);
1589    }
1590}
1591#endif
1592
1593static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1594                         TCGReg dest, TCGReg v1)
1595{
1596    if (have_cmov) {
1597        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1598    } else {
1599        TCGLabel *over = gen_new_label();
1600        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1601        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1602        tcg_out_label(s, over);
1603    }
1604}
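
/*
 * Without CMOV (have_cmov false on very old hosts), the fallback above is
 * a short branch around a plain move; e.g. for TCG_COND_EQ the shape is
 * roughly
 *
 *     jne  1f
 *     mov  %v1, %dest
 * 1:
 */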
1605
1606static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1607                              TCGReg c1, TCGArg c2, int const_c2,
1608                              TCGReg v1)
1609{
1610    tcg_out_cmp(s, c1, c2, const_c2, 0);
1611    tcg_out_cmov(s, cond, 0, dest, v1);
1612}
1613
1614#if TCG_TARGET_REG_BITS == 64
1615static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1616                              TCGReg c1, TCGArg c2, int const_c2,
1617                              TCGReg v1)
1618{
1619    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1620    tcg_out_cmov(s, cond, P_REXW, dest, v1);
1621}
1622#endif
1623
1624static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1625                        TCGArg arg2, bool const_a2)
1626{
1627    if (have_bmi1) {
1628        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1629        if (const_a2) {
1630            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1631        } else {
1632            tcg_debug_assert(dest != arg2);
1633            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1634        }
1635    } else {
1636        tcg_debug_assert(dest != arg2);
1637        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1638        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1639    }
1640}
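
/*
 * Note: TZCNT sets CF when its source is zero, so the CMOV above
 * (TCG_COND_LTU tests CF) substitutes arg2 exactly in the zero-input
 * case; a constant arg2 must already be 32/64, which is what TZCNT
 * produces for a zero source.  The BSF fallback instead sets ZF for a
 * zero source, hence the CMOV on TCG_COND_EQ.
 */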
1641
1642static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1643                        TCGArg arg2, bool const_a2)
1644{
1645    if (have_lzcnt) {
1646        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1647        if (const_a2) {
1648            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1649        } else {
1650            tcg_debug_assert(dest != arg2);
1651            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1652        }
1653    } else {
1654        tcg_debug_assert(!const_a2);
1655        tcg_debug_assert(dest != arg1);
1656        tcg_debug_assert(dest != arg2);
1657
1658        /* Recall that the output of BSR is the index not the count.  */
1659        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1660        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1661
1662        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1663        tcg_out_cmp(s, arg1, 0, 1, rexw);
1664        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1665    }
1666}
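
/*
 * Worked example for the BSR path: if the highest set bit of arg1 is at
 * index k, BSR yields k and the XOR with 31 (or 63) turns that into
 * 31 - k, i.e. the leading-zero count.  BSR leaves its output undefined
 * for a zero source, which is why the value is re-tested and arg2
 * substituted via CMOV.
 */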
1667
1668static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1669{
1670    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1671
1672    if (disp == (int32_t)disp) {
1673        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1674        tcg_out32(s, disp);
1675    } else {
1676        /* rip-relative addressing into the constant pool.
1677           This is 6 + 8 = 14 bytes, as compared to using an
1678           immediate load 10 + 6 = 16 bytes, plus we may
1679           be able to re-use the pool constant for more calls.  */
1680        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1681        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1682        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1683        tcg_out32(s, 0);
1684    }
1685}
1686
1687static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1688                         const TCGHelperInfo *info)
1689{
1690    tcg_out_branch(s, 1, dest);
1691
1692#ifndef _WIN32
1693    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1694        /*
1695         * The sysv i386 abi for struct return places a reference as the
1696         * first argument on the stack, and pops that argument with the
1697         * return statement.  Since we want to retain the aligned stack
1698         * pointer for the callee, we do not want to actually push that
1699         * argument before the call but rely on the normal store to the
1700         * stack slot.  But we do need to compensate for the pop in order
1701         * to reset our correct stack pointer value.
1702         * Pushing a garbage value back onto the stack is quickest.
1703         */
1704        tcg_out_push(s, TCG_REG_EAX);
1705    }
1706#endif
1707}
1708
1709static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1710{
1711    tcg_out_branch(s, 0, dest);
1712}
1713
1714static void tcg_out_nopn(TCGContext *s, int n)
1715{
1716    int i;
1717    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1718     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1719     * duplicate prefix, and all of the interesting recent cores can
1720     * decode and discard the duplicates in a single cycle.
1721     */
1722    tcg_debug_assert(n >= 1);
1723    for (i = 1; i < n; ++i) {
1724        tcg_out8(s, 0x66);
1725    }
1726    tcg_out8(s, 0x90);
1727}
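
/* For example, tcg_out_nopn(s, 3) emits the bytes 66 66 90. */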
1728
1729/* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
1730static void __attribute__((unused))
1731tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
1732{
1733    /*
1734     * This is used for testing alignment, so we can usually use testb.
1735     * For i686, we have to use testl for %esi/%edi.
1736     */
1737    if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
1738        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
1739        tcg_out8(s, i);
1740    } else {
1741        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
1742        tcg_out32(s, i);
1743    }
1744}
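
/*
 * For example (hypothetical operands): an alignment mask of 7 against
 * x86_64 %rdi emits "test $7, %dil" using a REX prefix, while the same
 * mask against i686 %esi must use the 32-bit "test $7, %esi" form,
 * since %sil is not encodable without REX.
 */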
1745
1746typedef struct {
1747    TCGReg base;
1748    int index;
1749    int ofs;
1750    int seg;
1751    TCGAtomAlign aa;
1752} HostAddress;
1753
1754bool tcg_target_has_memory_bswap(MemOp memop)
1755{
1756    return have_movbe;
1757}
1758
1759/*
1760 * Because i686 has no register parameters and because x86_64 has xchg
1761 * to handle addr/data register overlap, we have placed all input arguments
1762 * before we might need a scratch reg.
1763 *
1764 * Even then, a scratch is only needed for l->raddr.  Rather than expose
1765 * a general-purpose scratch when we don't actually know it's available,
1766 * use the ra_gen hook to load into RAX if needed.
1767 */
1768#if TCG_TARGET_REG_BITS == 64
1769static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1770{
1771    if (arg < 0) {
1772        arg = TCG_REG_RAX;
1773    }
1774    tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1775    return arg;
1776}
1777static const TCGLdstHelperParam ldst_helper_param = {
1778    .ra_gen = ldst_ra_gen
1779};
1780#else
1781static const TCGLdstHelperParam ldst_helper_param = { };
1782#endif
1783
1784/*
1785 * Generate code for the slow path for a load at the end of block
1786 */
1787static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1788{
1789    MemOp opc = get_memop(l->oi);
1790    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1791
1792    /* resolve label address */
1793    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1794    if (label_ptr[1]) {
1795        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1796    }
1797
1798    tcg_out_ld_helper_args(s, l, &ldst_helper_param);
1799    tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
1800    tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
1801
1802    tcg_out_jmp(s, l->raddr);
1803    return true;
1804}
1805
1806/*
1807 * Generate code for the slow path for a store at the end of block
1808 */
1809static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1810{
1811    MemOp opc = get_memop(l->oi);
1812    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1813
1814    /* resolve label address */
1815    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1816    if (label_ptr[1]) {
1817        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1818    }
1819
1820    tcg_out_st_helper_args(s, l, &ldst_helper_param);
1821    tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
1822
1823    tcg_out_jmp(s, l->raddr);
1824    return true;
1825}
1826
1827#ifndef CONFIG_SOFTMMU
1828static HostAddress x86_guest_base = {
1829    .index = -1
1830};
1831
1832#if defined(__x86_64__) && defined(__linux__)
1833# include <asm/prctl.h>
1834# include <sys/prctl.h>
1835int arch_prctl(int code, unsigned long addr);
1836static inline int setup_guest_base_seg(void)
1837{
1838    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1839        return P_GS;
1840    }
1841    return 0;
1842}
1843#elif defined(__x86_64__) && \
1844      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
1845# include <machine/sysarch.h>
1846static inline int setup_guest_base_seg(void)
1847{
1848    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1849        return P_GS;
1850    }
1851    return 0;
1852}
1853#else
1854static inline int setup_guest_base_seg(void)
1855{
1856    return 0;
1857}
1858#endif /* setup_guest_base_seg */
1859#endif /* !SOFTMMU */
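
/*
 * When one of the above succeeds, the P_GS flag it returns is recorded as
 * the .seg of x86_guest_base (during prologue setup, not shown here), so
 * user-only guest accesses carry a %gs segment override and guest_base is
 * added by the segmentation hardware rather than with an explicit addend
 * or extra register.
 */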
1860
1861/*
1862 * For softmmu, perform the TLB load and compare.
1863 * For useronly, perform any required alignment tests.
1864 * In both cases, return a TCGLabelQemuLdst structure if the slow path
1865 * is required and fill in @h with the host address for the fast path.
1866 */
1867static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
1868                                           TCGReg addrlo, TCGReg addrhi,
1869                                           MemOpIdx oi, bool is_ld)
1870{
1871    TCGLabelQemuLdst *ldst = NULL;
1872    MemOp opc = get_memop(oi);
1873    unsigned a_mask;
1874
1875#ifdef CONFIG_SOFTMMU
1876    h->index = TCG_REG_L0;
1877    h->ofs = 0;
1878    h->seg = 0;
1879#else
1880    *h = x86_guest_base;
1881#endif
1882    h->base = addrlo;
1883    h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false);
1884    a_mask = (1 << h->aa.align) - 1;
1885
1886#ifdef CONFIG_SOFTMMU
1887    int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
1888                        : offsetof(CPUTLBEntry, addr_write);
1889    TCGType ttype = TCG_TYPE_I32;
1890    TCGType tlbtype = TCG_TYPE_I32;
1891    int trexw = 0, hrexw = 0, tlbrexw = 0;
1892    unsigned mem_index = get_mmuidx(oi);
1893    unsigned s_bits = opc & MO_SIZE;
1894    unsigned s_mask = (1 << s_bits) - 1;
1895    int tlb_mask;
1896
1897    ldst = new_ldst_label(s);
1898    ldst->is_ld = is_ld;
1899    ldst->oi = oi;
1900    ldst->addrlo_reg = addrlo;
1901    ldst->addrhi_reg = addrhi;
1902
1903    if (TCG_TARGET_REG_BITS == 64) {
1904        ttype = s->addr_type;
1905        trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
1906        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1907            hrexw = P_REXW;
1908            if (s->page_bits + s->tlb_dyn_max_bits > 32) {
1909                tlbtype = TCG_TYPE_I64;
1910                tlbrexw = P_REXW;
1911            }
1912        }
1913    }
1914
1915    tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
1916    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
1917                   s->page_bits - CPU_TLB_ENTRY_BITS);
1918
1919    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
1920                         TLB_MASK_TABLE_OFS(mem_index) +
1921                         offsetof(CPUTLBDescFast, mask));
1922
1923    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
1924                         TLB_MASK_TABLE_OFS(mem_index) +
1925                         offsetof(CPUTLBDescFast, table));
1926
1927    /*
1928     * If the required alignment is at least as large as the access, simply
1929     * copy the address and mask.  For lesser alignments, check that we don't
1930     * cross pages for the complete access.
1931     */
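    /*
     * Example: a 4-byte access with no alignment requirement has
     * s_mask = 3 and a_mask = 0, so the LEA below forms addr + 3.
     * If that last byte lies on the following page, the masked value no
     * longer matches the TLB comparator for addr's page and the slow
     * path is taken.
     */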
1932    if (a_mask >= s_mask) {
1933        tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
1934    } else {
1935        tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
1936                             addrlo, s_mask - a_mask);
1937    }
1938    tlb_mask = s->page_mask | a_mask;
1939    tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
1940
1941    /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
1942    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
1943                         TCG_REG_L1, TCG_REG_L0, cmp_ofs);
1944
1945    /* jne slow_path */
1946    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1947    ldst->label_ptr[0] = s->code_ptr;
1948    s->code_ptr += 4;
1949
1950    if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
1951        /* cmp 4(TCG_REG_L0), addrhi */
1952        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, TCG_REG_L0, cmp_ofs + 4);
1953
1954        /* jne slow_path */
1955        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1956        ldst->label_ptr[1] = s->code_ptr;
1957        s->code_ptr += 4;
1958    }
1959
1960    /* TLB Hit.  */
1961    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
1962               offsetof(CPUTLBEntry, addend));
1963#else
1964    if (a_mask) {
1965        ldst = new_ldst_label(s);
1966
1967        ldst->is_ld = is_ld;
1968        ldst->oi = oi;
1969        ldst->addrlo_reg = addrlo;
1970        ldst->addrhi_reg = addrhi;
1971
1972        tcg_out_testi(s, addrlo, a_mask);
1973        /* jne slow_path */
1974        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1975        ldst->label_ptr[0] = s->code_ptr;
1976        s->code_ptr += 4;
1977    }
1978#endif
1979
1980    return ldst;
1981}
1982
1983static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1984                                   HostAddress h, TCGType type, MemOp memop)
1985{
1986    bool use_movbe = false;
1987    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
1988    int movop = OPC_MOVL_GvEv;
1989
1990    /* Do big-endian loads with movbe.  */
1991    if (memop & MO_BSWAP) {
1992        tcg_debug_assert(have_movbe);
1993        use_movbe = true;
1994        movop = OPC_MOVBE_GyMy;
1995    }
1996
1997    switch (memop & MO_SSIZE) {
1998    case MO_UB:
1999        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
2000                                 h.base, h.index, 0, h.ofs);
2001        break;
2002    case MO_SB:
2003        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
2004                                 h.base, h.index, 0, h.ofs);
2005        break;
2006    case MO_UW:
2007        if (use_movbe) {
2008            /* There is no extending movbe; only low 16-bits are modified.  */
2009            if (datalo != h.base && datalo != h.index) {
2010                /* XOR breaks dependency chains.  */
2011                tgen_arithr(s, ARITH_XOR, datalo, datalo);
2012                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2013                                         datalo, h.base, h.index, 0, h.ofs);
2014            } else {
2015                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2016                                         datalo, h.base, h.index, 0, h.ofs);
2017                tcg_out_ext16u(s, datalo, datalo);
2018            }
2019        } else {
2020            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
2021                                     h.base, h.index, 0, h.ofs);
2022        }
2023        break;
2024    case MO_SW:
2025        if (use_movbe) {
2026            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2027                                     datalo, h.base, h.index, 0, h.ofs);
2028            tcg_out_ext16s(s, type, datalo, datalo);
2029        } else {
2030            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
2031                                     datalo, h.base, h.index, 0, h.ofs);
2032        }
2033        break;
2034    case MO_UL:
2035        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2036                                 h.base, h.index, 0, h.ofs);
2037        break;
2038#if TCG_TARGET_REG_BITS == 64
2039    case MO_SL:
2040        if (use_movbe) {
2041            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
2042                                     h.base, h.index, 0, h.ofs);
2043            tcg_out_ext32s(s, datalo, datalo);
2044        } else {
2045            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
2046                                     h.base, h.index, 0, h.ofs);
2047        }
2048        break;
2049#endif
2050    case MO_UQ:
2051        if (TCG_TARGET_REG_BITS == 64) {
2052            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2053                                     h.base, h.index, 0, h.ofs);
2054            break;
2055        }
2056        if (use_movbe) {
2057            TCGReg t = datalo;
2058            datalo = datahi;
2059            datahi = t;
2060        }
2061        if (h.base == datalo || h.index == datalo) {
2062            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
2063                                     h.base, h.index, 0, h.ofs);
2064            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
2065            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
2066        } else {
2067            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2068                                     h.base, h.index, 0, h.ofs);
2069            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2070                                     h.base, h.index, 0, h.ofs + 4);
2071        }
2072        break;
2073    default:
2074        g_assert_not_reached();
2075    }
2076}
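
/*
 * Note on the 32-bit MO_UQ path above: when the host-address base or
 * index register coincides with datalo, the full address is first formed
 * in datahi with LEA, and the two 32-bit halves are then loaded through
 * datahi so the address survives the first load.
 */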
2077
2078static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2079                            TCGReg addrlo, TCGReg addrhi,
2080                            MemOpIdx oi, TCGType data_type)
2081{
2082    TCGLabelQemuLdst *ldst;
2083    HostAddress h;
2084
2085    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2086    tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2087
2088    if (ldst) {
2089        ldst->type = data_type;
2090        ldst->datalo_reg = datalo;
2091        ldst->datahi_reg = datahi;
2092        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2093    }
2094}
2095
2096static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2097                                   HostAddress h, MemOp memop)
2098{
2099    bool use_movbe = false;
2100    int movop = OPC_MOVL_EvGv;
2101
2102    /*
2103     * Do big-endian stores with movbe or softmmu.
2104     * User-only without movbe will have its swapping done generically.
2105     */
2106    if (memop & MO_BSWAP) {
2107        tcg_debug_assert(have_movbe);
2108        use_movbe = true;
2109        movop = OPC_MOVBE_MyGy;
2110    }
2111
2112    switch (memop & MO_SIZE) {
2113    case MO_8:
2114        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2115        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2116        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
2117                                 datalo, h.base, h.index, 0, h.ofs);
2118        break;
2119    case MO_16:
2120        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
2121                                 h.base, h.index, 0, h.ofs);
2122        break;
2123    case MO_32:
2124        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2125                                 h.base, h.index, 0, h.ofs);
2126        break;
2127    case MO_64:
2128        if (TCG_TARGET_REG_BITS == 64) {
2129            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2130                                     h.base, h.index, 0, h.ofs);
2131        } else {
2132            if (use_movbe) {
2133                TCGReg t = datalo;
2134                datalo = datahi;
2135                datahi = t;
2136            }
2137            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2138                                     h.base, h.index, 0, h.ofs);
2139            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2140                                     h.base, h.index, 0, h.ofs + 4);
2141        }
2142        break;
2143    default:
2144        g_assert_not_reached();
2145    }
2146}
2147
2148static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2149                            TCGReg addrlo, TCGReg addrhi,
2150                            MemOpIdx oi, TCGType data_type)
2151{
2152    TCGLabelQemuLdst *ldst;
2153    HostAddress h;
2154
2155    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2156    tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2157
2158    if (ldst) {
2159        ldst->type = data_type;
2160        ldst->datalo_reg = datalo;
2161        ldst->datahi_reg = datahi;
2162        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2163    }
2164}
2165
2166static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2167{
2168    /* Reuse the zeroing that exists for goto_ptr.  */
2169    if (a0 == 0) {
2170        tcg_out_jmp(s, tcg_code_gen_epilogue);
2171    } else {
2172        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2173        tcg_out_jmp(s, tb_ret_addr);
2174    }
2175}
2176
2177static void tcg_out_goto_tb(TCGContext *s, int which)
2178{
2179    /*
2180     * Jump displacement must be aligned for atomic patching;
2181     * see if we need to add extra nops before jump
2182     */
2183    int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2184    if (gap != 1) {
2185        tcg_out_nopn(s, gap - 1);
2186    }
2187    tcg_out8(s, OPC_JMP_long); /* jmp im */
2188    set_jmp_insn_offset(s, which);
2189    tcg_out32(s, 0);
2190    set_jmp_reset_offset(s, which);
2191}
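
/*
 * Worked example: if s->code_ptr is 2 mod 4, the gap computed above is 2,
 * so one nop byte is emitted, the JMP opcode lands at 3 mod 4, and its
 * 32-bit displacement starts on a 4-byte boundary as required for atomic
 * patching.
 */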
2192
2193void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2194                              uintptr_t jmp_rx, uintptr_t jmp_rw)
2195{
2196    /* patch the branch destination */
2197    uintptr_t addr = tb->jmp_target_addr[n];
2198    qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2199    /* no need to flush icache explicitly */
2200}
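
/*
 * The value stored is a rel32: the new target minus the address just past
 * the 4-byte displacement (jmp_rx + 4), matching the "jmp im" emitted by
 * tcg_out_goto_tb above.
 */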
2201
2202static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2203                              const TCGArg args[TCG_MAX_OP_ARGS],
2204                              const int const_args[TCG_MAX_OP_ARGS])
2205{
2206    TCGArg a0, a1, a2;
2207    int c, const_a2, vexop, rexw = 0;
2208
2209#if TCG_TARGET_REG_BITS == 64
2210# define OP_32_64(x) \
2211        case glue(glue(INDEX_op_, x), _i64): \
2212            rexw = P_REXW; /* FALLTHRU */    \
2213        case glue(glue(INDEX_op_, x), _i32)
2214#else
2215# define OP_32_64(x) \
2216        case glue(glue(INDEX_op_, x), _i32)
2217#endif
2218
2219    /* Hoist the loads of the most common arguments.  */
2220    a0 = args[0];
2221    a1 = args[1];
2222    a2 = args[2];
2223    const_a2 = const_args[2];
2224
2225    switch (opc) {
2226    case INDEX_op_goto_ptr:
2227        /* jmp to the given host address (could be epilogue) */
2228        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2229        break;
2230    case INDEX_op_br:
2231        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2232        break;
2233    OP_32_64(ld8u):
2234        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2235        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2236        break;
2237    OP_32_64(ld8s):
2238        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2239        break;
2240    OP_32_64(ld16u):
2241        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2242        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2243        break;
2244    OP_32_64(ld16s):
2245        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2246        break;
2247#if TCG_TARGET_REG_BITS == 64
2248    case INDEX_op_ld32u_i64:
2249#endif
2250    case INDEX_op_ld_i32:
2251        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2252        break;
2253
2254    OP_32_64(st8):
2255        if (const_args[0]) {
2256            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2257            tcg_out8(s, a0);
2258        } else {
2259            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2260        }
2261        break;
2262    OP_32_64(st16):
2263        if (const_args[0]) {
2264            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2265            tcg_out16(s, a0);
2266        } else {
2267            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2268        }
2269        break;
2270#if TCG_TARGET_REG_BITS == 64
2271    case INDEX_op_st32_i64:
2272#endif
2273    case INDEX_op_st_i32:
2274        if (const_args[0]) {
2275            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2276            tcg_out32(s, a0);
2277        } else {
2278            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2279        }
2280        break;
2281
2282    OP_32_64(add):
2283        /* For 3-operand addition, use LEA.  */
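        /* e.g. add d, a, b   ->  lea (%a, %b), %d
           and  add d, a, $i  ->  lea $i(%a), %d    (registers symbolic). */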
2284        if (a0 != a1) {
2285            TCGArg c3 = 0;
2286            if (const_a2) {
2287                c3 = a2, a2 = -1;
2288            } else if (a0 == a2) {
2289                /* Watch out for dest = src + dest, since we've removed
2290                   the matching constraint on the add.  */
2291                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2292                break;
2293            }
2294
2295            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2296            break;
2297        }
2298        c = ARITH_ADD;
2299        goto gen_arith;
2300    OP_32_64(sub):
2301        c = ARITH_SUB;
2302        goto gen_arith;
2303    OP_32_64(and):
2304        c = ARITH_AND;
2305        goto gen_arith;
2306    OP_32_64(or):
2307        c = ARITH_OR;
2308        goto gen_arith;
2309    OP_32_64(xor):
2310        c = ARITH_XOR;
2311        goto gen_arith;
2312    gen_arith:
2313        if (const_a2) {
2314            tgen_arithi(s, c + rexw, a0, a2, 0);
2315        } else {
2316            tgen_arithr(s, c + rexw, a0, a2);
2317        }
2318        break;
2319
2320    OP_32_64(andc):
2321        if (const_a2) {
2322            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2323            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2324        } else {
2325            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2326        }
2327        break;
2328
2329    OP_32_64(mul):
2330        if (const_a2) {
2331            int32_t val;
2332            val = a2;
2333            if (val == (int8_t)val) {
2334                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2335                tcg_out8(s, val);
2336            } else {
2337                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2338                tcg_out32(s, val);
2339            }
2340        } else {
2341            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2342        }
2343        break;
2344
2345    OP_32_64(div2):
2346        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2347        break;
2348    OP_32_64(divu2):
2349        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2350        break;
2351
2352    OP_32_64(shl):
2353        /* For small constant 3-operand shift, use LEA.  */
2354        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2355            if (a2 - 1 == 0) {
2356                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2357                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2358            } else {
2359                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2360                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2361            }
2362            break;
2363        }
2364        c = SHIFT_SHL;
2365        vexop = OPC_SHLX;
2366        goto gen_shift_maybe_vex;
2367    OP_32_64(shr):
2368        c = SHIFT_SHR;
2369        vexop = OPC_SHRX;
2370        goto gen_shift_maybe_vex;
2371    OP_32_64(sar):
2372        c = SHIFT_SAR;
2373        vexop = OPC_SARX;
2374        goto gen_shift_maybe_vex;
2375    OP_32_64(rotl):
2376        c = SHIFT_ROL;
2377        goto gen_shift;
2378    OP_32_64(rotr):
2379        c = SHIFT_ROR;
2380        goto gen_shift;
2381    gen_shift_maybe_vex:
2382        if (have_bmi2) {
2383            if (!const_a2) {
2384                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2385                break;
2386            }
2387            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2388        }
2389        /* FALLTHRU */
2390    gen_shift:
2391        if (const_a2) {
2392            tcg_out_shifti(s, c + rexw, a0, a2);
2393        } else {
2394            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2395        }
2396        break;
2397
2398    OP_32_64(ctz):
2399        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2400        break;
2401    OP_32_64(clz):
2402        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2403        break;
2404    OP_32_64(ctpop):
2405        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2406        break;
2407
2408    case INDEX_op_brcond_i32:
2409        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2410        break;
2411    case INDEX_op_setcond_i32:
2412        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2413        break;
2414    case INDEX_op_movcond_i32:
2415        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2416        break;
2417
2418    OP_32_64(bswap16):
2419        if (a2 & TCG_BSWAP_OS) {
2420            /* Output must be sign-extended. */
2421            if (rexw) {
2422                tcg_out_bswap64(s, a0);
2423                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2424            } else {
2425                tcg_out_bswap32(s, a0);
2426                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2427            }
2428        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2429            /* Output must be zero-extended, but input isn't. */
2430            tcg_out_bswap32(s, a0);
2431            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2432        } else {
2433            tcg_out_rolw_8(s, a0);
2434        }
2435        break;
2436    OP_32_64(bswap32):
2437        tcg_out_bswap32(s, a0);
2438        if (rexw && (a2 & TCG_BSWAP_OS)) {
2439            tcg_out_ext32s(s, a0, a0);
2440        }
2441        break;
2442
2443    OP_32_64(neg):
2444        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2445        break;
2446    OP_32_64(not):
2447        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2448        break;
2449
2450    case INDEX_op_qemu_ld_a64_i32:
2451        if (TCG_TARGET_REG_BITS == 32) {
2452            tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2453            break;
2454        }
2455        /* fall through */
2456    case INDEX_op_qemu_ld_a32_i32:
2457        tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2458        break;
2459    case INDEX_op_qemu_ld_a32_i64:
2460        if (TCG_TARGET_REG_BITS == 64) {
2461            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2462        } else {
2463            tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2464        }
2465        break;
2466    case INDEX_op_qemu_ld_a64_i64:
2467        if (TCG_TARGET_REG_BITS == 64) {
2468            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2469        } else {
2470            tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2471        }
2472        break;
2473
2474    case INDEX_op_qemu_st_a64_i32:
2475    case INDEX_op_qemu_st8_a64_i32:
2476        if (TCG_TARGET_REG_BITS == 32) {
2477            tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2478            break;
2479        }
2480        /* fall through */
2481    case INDEX_op_qemu_st_a32_i32:
2482    case INDEX_op_qemu_st8_a32_i32:
2483        tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2484        break;
2485    case INDEX_op_qemu_st_a32_i64:
2486        if (TCG_TARGET_REG_BITS == 64) {
2487            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2488        } else {
2489            tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2490        }
2491        break;
2492    case INDEX_op_qemu_st_a64_i64:
2493        if (TCG_TARGET_REG_BITS == 64) {
2494            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2495        } else {
2496            tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2497        }
2498        break;
2499
2500    OP_32_64(mulu2):
2501        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2502        break;
2503    OP_32_64(muls2):
2504        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2505        break;
2506    OP_32_64(add2):
2507        if (const_args[4]) {
2508            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2509        } else {
2510            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2511        }
2512        if (const_args[5]) {
2513            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2514        } else {
2515            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2516        }
2517        break;
2518    OP_32_64(sub2):
2519        if (const_args[4]) {
2520            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2521        } else {
2522            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2523        }
2524        if (const_args[5]) {
2525            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2526        } else {
2527            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2528        }
2529        break;
2530
2531#if TCG_TARGET_REG_BITS == 32
2532    case INDEX_op_brcond2_i32:
2533        tcg_out_brcond2(s, args, const_args, 0);
2534        break;
2535    case INDEX_op_setcond2_i32:
2536        tcg_out_setcond2(s, args, const_args);
2537        break;
2538#else /* TCG_TARGET_REG_BITS == 64 */
2539    case INDEX_op_ld32s_i64:
2540        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2541        break;
2542    case INDEX_op_ld_i64:
2543        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2544        break;
2545    case INDEX_op_st_i64:
2546        if (const_args[0]) {
2547            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2548            tcg_out32(s, a0);
2549        } else {
2550            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2551        }
2552        break;
2553
2554    case INDEX_op_brcond_i64:
2555        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2556        break;
2557    case INDEX_op_setcond_i64:
2558        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2559        break;
2560    case INDEX_op_movcond_i64:
2561        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2562        break;
2563
2564    case INDEX_op_bswap64_i64:
2565        tcg_out_bswap64(s, a0);
2566        break;
2567    case INDEX_op_extrh_i64_i32:
2568        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2569        break;
2570#endif
2571
2572    OP_32_64(deposit):
2573        if (args[3] == 0 && args[4] == 8) {
2574            /* load bits 0..7 */
2575            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2576        } else if (args[3] == 8 && args[4] == 8) {
2577            /* load bits 8..15 */
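            /* a0 + 4 selects the matching high-byte register
               (%ah/%bh/%ch/%dh); the Q constraint on deposit keeps a0
               among the four registers that have one. */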
2578            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2579        } else if (args[3] == 0 && args[4] == 16) {
2580            /* load bits 0..15 */
2581            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2582        } else {
2583            g_assert_not_reached();
2584        }
2585        break;
2586
2587    case INDEX_op_extract_i64:
2588        if (a2 + args[3] == 32) {
2589            /* This is a 32-bit zero-extending right shift.  */
2590            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2591            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2592            break;
2593        }
2594        /* FALLTHRU */
2595    case INDEX_op_extract_i32:
2596        /* On the off-chance that we can use the high-byte registers, do so.
2597           Otherwise we emit the same ext16 + shift pattern that we
2598           would have gotten from the normal tcg-op.c expansion.  */
2599        tcg_debug_assert(a2 == 8 && args[3] == 8);
2600        if (a1 < 4 && a0 < 8) {
2601            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2602        } else {
2603            tcg_out_ext16u(s, a0, a1);
2604            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2605        }
2606        break;
2607
2608    case INDEX_op_sextract_i32:
2609        /* We don't implement sextract_i64, as we cannot sign-extend to
2610           64-bits without using the REX prefix that explicitly excludes
2611           access to the high-byte registers.  */
2612        tcg_debug_assert(a2 == 8 && args[3] == 8);
2613        if (a1 < 4 && a0 < 8) {
2614            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2615        } else {
2616            tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2617            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2618        }
2619        break;
2620
2621    OP_32_64(extract2):
2622        /* Note that SHRD outputs to the r/m operand.  */
2623        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2624        tcg_out8(s, args[3]);
2625        break;
2626
2627    case INDEX_op_mb:
2628        tcg_out_mb(s, a0);
2629        break;
2630    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2631    case INDEX_op_mov_i64:
2632    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2633    case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
2634    case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
2635    case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
2636    case INDEX_op_ext8s_i64:
2637    case INDEX_op_ext8u_i32:
2638    case INDEX_op_ext8u_i64:
2639    case INDEX_op_ext16s_i32:
2640    case INDEX_op_ext16s_i64:
2641    case INDEX_op_ext16u_i32:
2642    case INDEX_op_ext16u_i64:
2643    case INDEX_op_ext32s_i64:
2644    case INDEX_op_ext32u_i64:
2645    case INDEX_op_ext_i32_i64:
2646    case INDEX_op_extu_i32_i64:
2647    case INDEX_op_extrl_i64_i32:
2648    default:
2649        g_assert_not_reached();
2650    }
2651
2652#undef OP_32_64
2653}
2654
2655static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2656                           unsigned vecl, unsigned vece,
2657                           const TCGArg args[TCG_MAX_OP_ARGS],
2658                           const int const_args[TCG_MAX_OP_ARGS])
2659{
2660    static int const add_insn[4] = {
2661        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2662    };
2663    static int const ssadd_insn[4] = {
2664        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2665    };
2666    static int const usadd_insn[4] = {
2667        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2668    };
2669    static int const sub_insn[4] = {
2670        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2671    };
2672    static int const sssub_insn[4] = {
2673        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2674    };
2675    static int const ussub_insn[4] = {
2676        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2677    };
2678    static int const mul_insn[4] = {
2679        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
2680    };
2681    static int const shift_imm_insn[4] = {
2682        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2683    };
2684    static int const cmpeq_insn[4] = {
2685        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2686    };
2687    static int const cmpgt_insn[4] = {
2688        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2689    };
2690    static int const punpckl_insn[4] = {
2691        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2692    };
2693    static int const punpckh_insn[4] = {
2694        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2695    };
2696    static int const packss_insn[4] = {
2697        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2698    };
2699    static int const packus_insn[4] = {
2700        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2701    };
2702    static int const smin_insn[4] = {
2703        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
2704    };
2705    static int const smax_insn[4] = {
2706        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
2707    };
2708    static int const umin_insn[4] = {
2709        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
2710    };
2711    static int const umax_insn[4] = {
2712        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
2713    };
2714    static int const rotlv_insn[4] = {
2715        OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
2716    };
2717    static int const rotrv_insn[4] = {
2718        OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
2719    };
2720    static int const shlv_insn[4] = {
2721        OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
2722    };
2723    static int const shrv_insn[4] = {
2724        OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
2725    };
2726    static int const sarv_insn[4] = {
2727        OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
2728    };
2729    static int const shls_insn[4] = {
2730        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2731    };
2732    static int const shrs_insn[4] = {
2733        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2734    };
2735    static int const sars_insn[4] = {
2736        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
2737    };
2738    static int const vpshldi_insn[4] = {
2739        OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
2740    };
2741    static int const vpshldv_insn[4] = {
2742        OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
2743    };
2744    static int const vpshrdv_insn[4] = {
2745        OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
2746    };
2747    static int const abs_insn[4] = {
2748        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
2749    };
2750
2751    TCGType type = vecl + TCG_TYPE_V64;
2752    int insn, sub;
2753    TCGArg a0, a1, a2, a3;
2754
2755    a0 = args[0];
2756    a1 = args[1];
2757    a2 = args[2];
2758
2759    switch (opc) {
2760    case INDEX_op_add_vec:
2761        insn = add_insn[vece];
2762        goto gen_simd;
2763    case INDEX_op_ssadd_vec:
2764        insn = ssadd_insn[vece];
2765        goto gen_simd;
2766    case INDEX_op_usadd_vec:
2767        insn = usadd_insn[vece];
2768        goto gen_simd;
2769    case INDEX_op_sub_vec:
2770        insn = sub_insn[vece];
2771        goto gen_simd;
2772    case INDEX_op_sssub_vec:
2773        insn = sssub_insn[vece];
2774        goto gen_simd;
2775    case INDEX_op_ussub_vec:
2776        insn = ussub_insn[vece];
2777        goto gen_simd;
2778    case INDEX_op_mul_vec:
2779        insn = mul_insn[vece];
2780        goto gen_simd;
2781    case INDEX_op_and_vec:
2782        insn = OPC_PAND;
2783        goto gen_simd;
2784    case INDEX_op_or_vec:
2785        insn = OPC_POR;
2786        goto gen_simd;
2787    case INDEX_op_xor_vec:
2788        insn = OPC_PXOR;
2789        goto gen_simd;
2790    case INDEX_op_smin_vec:
2791        insn = smin_insn[vece];
2792        goto gen_simd;
2793    case INDEX_op_umin_vec:
2794        insn = umin_insn[vece];
2795        goto gen_simd;
2796    case INDEX_op_smax_vec:
2797        insn = smax_insn[vece];
2798        goto gen_simd;
2799    case INDEX_op_umax_vec:
2800        insn = umax_insn[vece];
2801        goto gen_simd;
2802    case INDEX_op_shlv_vec:
2803        insn = shlv_insn[vece];
2804        goto gen_simd;
2805    case INDEX_op_shrv_vec:
2806        insn = shrv_insn[vece];
2807        goto gen_simd;
2808    case INDEX_op_sarv_vec:
2809        insn = sarv_insn[vece];
2810        goto gen_simd;
2811    case INDEX_op_rotlv_vec:
2812        insn = rotlv_insn[vece];
2813        goto gen_simd;
2814    case INDEX_op_rotrv_vec:
2815        insn = rotrv_insn[vece];
2816        goto gen_simd;
2817    case INDEX_op_shls_vec:
2818        insn = shls_insn[vece];
2819        goto gen_simd;
2820    case INDEX_op_shrs_vec:
2821        insn = shrs_insn[vece];
2822        goto gen_simd;
2823    case INDEX_op_sars_vec:
2824        insn = sars_insn[vece];
2825        goto gen_simd;
2826    case INDEX_op_x86_punpckl_vec:
2827        insn = punpckl_insn[vece];
2828        goto gen_simd;
2829    case INDEX_op_x86_punpckh_vec:
2830        insn = punpckh_insn[vece];
2831        goto gen_simd;
2832    case INDEX_op_x86_packss_vec:
2833        insn = packss_insn[vece];
2834        goto gen_simd;
2835    case INDEX_op_x86_packus_vec:
2836        insn = packus_insn[vece];
2837        goto gen_simd;
2838    case INDEX_op_x86_vpshldv_vec:
2839        insn = vpshldv_insn[vece];
2840        a1 = a2;
2841        a2 = args[3];
2842        goto gen_simd;
2843    case INDEX_op_x86_vpshrdv_vec:
2844        insn = vpshrdv_insn[vece];
2845        a1 = a2;
2846        a2 = args[3];
2847        goto gen_simd;
2848#if TCG_TARGET_REG_BITS == 32
2849    case INDEX_op_dup2_vec:
2850        /* First merge the two 32-bit inputs to a single 64-bit element. */
2851        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
2852        /* Then replicate the 64-bit elements across the rest of the vector. */
2853        if (type != TCG_TYPE_V64) {
2854            tcg_out_dup_vec(s, type, MO_64, a0, a0);
2855        }
2856        break;
2857#endif
2858    case INDEX_op_abs_vec:
2859        insn = abs_insn[vece];
2860        a2 = a1;
2861        a1 = 0;
2862        goto gen_simd;
2863    gen_simd:
2864        tcg_debug_assert(insn != OPC_UD2);
2865        if (type == TCG_TYPE_V256) {
2866            insn |= P_VEXL;
2867        }
2868        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2869        break;
2870
2871    case INDEX_op_cmp_vec:
2872        sub = args[3];
2873        if (sub == TCG_COND_EQ) {
2874            insn = cmpeq_insn[vece];
2875        } else if (sub == TCG_COND_GT) {
2876            insn = cmpgt_insn[vece];
2877        } else {
2878            g_assert_not_reached();
2879        }
2880        goto gen_simd;
2881
2882    case INDEX_op_andc_vec:
2883        insn = OPC_PANDN;
2884        if (type == TCG_TYPE_V256) {
2885            insn |= P_VEXL;
2886        }
2887        tcg_out_vex_modrm(s, insn, a0, a2, a1);
2888        break;
2889
2890    case INDEX_op_shli_vec:
2891        insn = shift_imm_insn[vece];
2892        sub = 6;
2893        goto gen_shift;
2894    case INDEX_op_shri_vec:
2895        insn = shift_imm_insn[vece];
2896        sub = 2;
2897        goto gen_shift;
2898    case INDEX_op_sari_vec:
2899        if (vece == MO_64) {
2900            insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
2901        } else {
2902            insn = shift_imm_insn[vece];
2903        }
2904        sub = 4;
2905        goto gen_shift;
2906    case INDEX_op_rotli_vec:
2907        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
2908        if (vece == MO_64) {
2909            insn |= P_VEXW;
2910        }
2911        sub = 1;
2912        goto gen_shift;
2913    gen_shift:
2914        tcg_debug_assert(vece != MO_8);
2915        if (type == TCG_TYPE_V256) {
2916            insn |= P_VEXL;
2917        }
2918        tcg_out_vex_modrm(s, insn, sub, a0, a1);
2919        tcg_out8(s, a2);
2920        break;
2921
2922    case INDEX_op_ld_vec:
2923        tcg_out_ld(s, type, a0, a1, a2);
2924        break;
2925    case INDEX_op_st_vec:
2926        tcg_out_st(s, type, a0, a1, a2);
2927        break;
2928    case INDEX_op_dupm_vec:
2929        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2930        break;
2931
2932    case INDEX_op_x86_shufps_vec:
2933        insn = OPC_SHUFPS;
2934        sub = args[3];
2935        goto gen_simd_imm8;
2936    case INDEX_op_x86_blend_vec:
2937        if (vece == MO_16) {
2938            insn = OPC_PBLENDW;
2939        } else if (vece == MO_32) {
2940            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2941        } else {
2942            g_assert_not_reached();
2943        }
2944        sub = args[3];
2945        goto gen_simd_imm8;
2946    case INDEX_op_x86_vperm2i128_vec:
2947        insn = OPC_VPERM2I128;
2948        sub = args[3];
2949        goto gen_simd_imm8;
2950    case INDEX_op_x86_vpshldi_vec:
2951        insn = vpshldi_insn[vece];
2952        sub = args[3];
2953        goto gen_simd_imm8;
2954
2955    case INDEX_op_not_vec:
2956        insn = OPC_VPTERNLOGQ;
2957        a2 = a1;
2958        sub = 0x33; /* !B */
2959        goto gen_simd_imm8;
2960    case INDEX_op_nor_vec:
2961        insn = OPC_VPTERNLOGQ;
2962        sub = 0x11; /* norCB */
2963        goto gen_simd_imm8;
2964    case INDEX_op_nand_vec:
2965        insn = OPC_VPTERNLOGQ;
2966        sub = 0x77; /* nandCB */
2967        goto gen_simd_imm8;
2968    case INDEX_op_eqv_vec:
2969        insn = OPC_VPTERNLOGQ;
2970        sub = 0x99; /* xnorCB */
2971        goto gen_simd_imm8;
2972    case INDEX_op_orc_vec:
2973        insn = OPC_VPTERNLOGQ;
2974        sub = 0xdd; /* orB!C */
2975        goto gen_simd_imm8;
2976
2977    case INDEX_op_bitsel_vec:
2978        insn = OPC_VPTERNLOGQ;
2979        a3 = args[3];
2980        if (a0 == a1) {
2981            a1 = a2;
2982            a2 = a3;
2983            sub = 0xca; /* A?B:C */
2984        } else if (a0 == a2) {
2985            a2 = a3;
2986            sub = 0xe2; /* B?A:C */
2987        } else {
2988            tcg_out_mov(s, type, a0, a3);
2989            sub = 0xb8; /* B?C:A */
2990        }
2991        goto gen_simd_imm8;
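        /*
         * For the VPTERNLOG cases above, the immediate is simply the
         * 8-bit truth table of the desired three-input function; e.g.
         * 0xca is "A ? B : C" with the first (destination) operand as the
         * selector, which is why the operands are shuffled so that the
         * select condition ends up in the destination slot.
         */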
2992
2993    gen_simd_imm8:
2994        tcg_debug_assert(insn != OPC_UD2);
2995        if (type == TCG_TYPE_V256) {
2996            insn |= P_VEXL;
2997        }
2998        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2999        tcg_out8(s, sub);
3000        break;
3001
3002    case INDEX_op_x86_vpblendvb_vec:
3003        insn = OPC_VPBLENDVB;
3004        if (type == TCG_TYPE_V256) {
3005            insn |= P_VEXL;
3006        }
3007        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3008        tcg_out8(s, args[3] << 4);
3009        break;
3010
3011    case INDEX_op_x86_psrldq_vec:
3012        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3013        tcg_out8(s, a2);
3014        break;
3015
3016    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3017    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3018    default:
3019        g_assert_not_reached();
3020    }
3021}
3022
3023static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3024{
3025    switch (op) {
3026    case INDEX_op_goto_ptr:
3027        return C_O0_I1(r);
3028
3029    case INDEX_op_ld8u_i32:
3030    case INDEX_op_ld8u_i64:
3031    case INDEX_op_ld8s_i32:
3032    case INDEX_op_ld8s_i64:
3033    case INDEX_op_ld16u_i32:
3034    case INDEX_op_ld16u_i64:
3035    case INDEX_op_ld16s_i32:
3036    case INDEX_op_ld16s_i64:
3037    case INDEX_op_ld_i32:
3038    case INDEX_op_ld32u_i64:
3039    case INDEX_op_ld32s_i64:
3040    case INDEX_op_ld_i64:
3041        return C_O1_I1(r, r);
3042
3043    case INDEX_op_st8_i32:
3044    case INDEX_op_st8_i64:
3045        return C_O0_I2(qi, r);
3046
3047    case INDEX_op_st16_i32:
3048    case INDEX_op_st16_i64:
3049    case INDEX_op_st_i32:
3050    case INDEX_op_st32_i64:
3051        return C_O0_I2(ri, r);
3052
3053    case INDEX_op_st_i64:
3054        return C_O0_I2(re, r);
3055
3056    case INDEX_op_add_i32:
3057    case INDEX_op_add_i64:
3058        return C_O1_I2(r, r, re);
3059
3060    case INDEX_op_sub_i32:
3061    case INDEX_op_sub_i64:
3062    case INDEX_op_mul_i32:
3063    case INDEX_op_mul_i64:
3064    case INDEX_op_or_i32:
3065    case INDEX_op_or_i64:
3066    case INDEX_op_xor_i32:
3067    case INDEX_op_xor_i64:
3068        return C_O1_I2(r, 0, re);
3069
3070    case INDEX_op_and_i32:
3071    case INDEX_op_and_i64:
3072        return C_O1_I2(r, 0, reZ);
3073
3074    case INDEX_op_andc_i32:
3075    case INDEX_op_andc_i64:
3076        return C_O1_I2(r, r, rI);
3077
3078    case INDEX_op_shl_i32:
3079    case INDEX_op_shl_i64:
3080    case INDEX_op_shr_i32:
3081    case INDEX_op_shr_i64:
3082    case INDEX_op_sar_i32:
3083    case INDEX_op_sar_i64:
3084        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3085
3086    case INDEX_op_rotl_i32:
3087    case INDEX_op_rotl_i64:
3088    case INDEX_op_rotr_i32:
3089    case INDEX_op_rotr_i64:
3090        return C_O1_I2(r, 0, ci);
3091
3092    case INDEX_op_brcond_i32:
3093    case INDEX_op_brcond_i64:
3094        return C_O0_I2(r, re);
3095
3096    case INDEX_op_bswap16_i32:
3097    case INDEX_op_bswap16_i64:
3098    case INDEX_op_bswap32_i32:
3099    case INDEX_op_bswap32_i64:
3100    case INDEX_op_bswap64_i64:
3101    case INDEX_op_neg_i32:
3102    case INDEX_op_neg_i64:
3103    case INDEX_op_not_i32:
3104    case INDEX_op_not_i64:
3105    case INDEX_op_extrh_i64_i32:
3106        return C_O1_I1(r, 0);
3107
3108    case INDEX_op_ext8s_i32:
3109    case INDEX_op_ext8s_i64:
3110    case INDEX_op_ext8u_i32:
3111    case INDEX_op_ext8u_i64:
3112        return C_O1_I1(r, q);
3113
3114    case INDEX_op_ext16s_i32:
3115    case INDEX_op_ext16s_i64:
3116    case INDEX_op_ext16u_i32:
3117    case INDEX_op_ext16u_i64:
3118    case INDEX_op_ext32s_i64:
3119    case INDEX_op_ext32u_i64:
3120    case INDEX_op_ext_i32_i64:
3121    case INDEX_op_extu_i32_i64:
3122    case INDEX_op_extrl_i64_i32:
3123    case INDEX_op_extract_i32:
3124    case INDEX_op_extract_i64:
3125    case INDEX_op_sextract_i32:
3126    case INDEX_op_ctpop_i32:
3127    case INDEX_op_ctpop_i64:
3128        return C_O1_I1(r, r);
3129
3130    case INDEX_op_extract2_i32:
3131    case INDEX_op_extract2_i64:
3132        return C_O1_I2(r, 0, r);
3133
3134    case INDEX_op_deposit_i32:
3135    case INDEX_op_deposit_i64:
3136        return C_O1_I2(Q, 0, Q);
3137
3138    case INDEX_op_setcond_i32:
3139    case INDEX_op_setcond_i64:
3140        return C_O1_I2(q, r, re);
3141
3142    case INDEX_op_movcond_i32:
3143    case INDEX_op_movcond_i64:
3144        return C_O1_I4(r, r, re, r, 0);
3145
3146    case INDEX_op_div2_i32:
3147    case INDEX_op_div2_i64:
3148    case INDEX_op_divu2_i32:
3149    case INDEX_op_divu2_i64:
3150        return C_O2_I3(a, d, 0, 1, r);
3151
3152    case INDEX_op_mulu2_i32:
3153    case INDEX_op_mulu2_i64:
3154    case INDEX_op_muls2_i32:
3155    case INDEX_op_muls2_i64:
3156        return C_O2_I2(a, d, a, r);
3157
3158    case INDEX_op_add2_i32:
3159    case INDEX_op_add2_i64:
3160    case INDEX_op_sub2_i32:
3161    case INDEX_op_sub2_i64:
3162        return C_O2_I4(r, r, 0, 1, re, re);
3163
3164    case INDEX_op_ctz_i32:
3165    case INDEX_op_ctz_i64:
3166        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3167
3168    case INDEX_op_clz_i32:
3169    case INDEX_op_clz_i64:
3170        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3171
3172    case INDEX_op_qemu_ld_a32_i32:
3173        return C_O1_I1(r, L);
3174    case INDEX_op_qemu_ld_a64_i32:
3175        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3176
3177    case INDEX_op_qemu_st_a32_i32:
3178        return C_O0_I2(L, L);
3179    case INDEX_op_qemu_st_a64_i32:
3180        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3181    case INDEX_op_qemu_st8_a32_i32:
3182        return C_O0_I2(s, L);
3183    case INDEX_op_qemu_st8_a64_i32:
3184        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3185
3186    case INDEX_op_qemu_ld_a32_i64:
3187        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3188    case INDEX_op_qemu_ld_a64_i64:
3189        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3190
3191    case INDEX_op_qemu_st_a32_i64:
3192        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3193    case INDEX_op_qemu_st_a64_i64:
3194        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3195
3196    case INDEX_op_brcond2_i32:
3197        return C_O0_I4(r, r, ri, ri);
3198
3199    case INDEX_op_setcond2_i32:
3200        return C_O1_I4(r, r, r, ri, ri);
3201
3202    case INDEX_op_ld_vec:
3203    case INDEX_op_dupm_vec:
3204        return C_O1_I1(x, r);
3205
3206    case INDEX_op_st_vec:
3207        return C_O0_I2(x, r);
3208
3209    case INDEX_op_add_vec:
3210    case INDEX_op_sub_vec:
3211    case INDEX_op_mul_vec:
3212    case INDEX_op_and_vec:
3213    case INDEX_op_or_vec:
3214    case INDEX_op_xor_vec:
3215    case INDEX_op_andc_vec:
3216    case INDEX_op_orc_vec:
3217    case INDEX_op_nand_vec:
3218    case INDEX_op_nor_vec:
3219    case INDEX_op_eqv_vec:
3220    case INDEX_op_ssadd_vec:
3221    case INDEX_op_usadd_vec:
3222    case INDEX_op_sssub_vec:
3223    case INDEX_op_ussub_vec:
3224    case INDEX_op_smin_vec:
3225    case INDEX_op_umin_vec:
3226    case INDEX_op_smax_vec:
3227    case INDEX_op_umax_vec:
3228    case INDEX_op_shlv_vec:
3229    case INDEX_op_shrv_vec:
3230    case INDEX_op_sarv_vec:
3231    case INDEX_op_rotlv_vec:
3232    case INDEX_op_rotrv_vec:
3233    case INDEX_op_shls_vec:
3234    case INDEX_op_shrs_vec:
3235    case INDEX_op_sars_vec:
3236    case INDEX_op_cmp_vec:
3237    case INDEX_op_x86_shufps_vec:
3238    case INDEX_op_x86_blend_vec:
3239    case INDEX_op_x86_packss_vec:
3240    case INDEX_op_x86_packus_vec:
3241    case INDEX_op_x86_vperm2i128_vec:
3242    case INDEX_op_x86_punpckl_vec:
3243    case INDEX_op_x86_punpckh_vec:
3244    case INDEX_op_x86_vpshldi_vec:
3245#if TCG_TARGET_REG_BITS == 32
3246    case INDEX_op_dup2_vec:
3247#endif
3248        return C_O1_I2(x, x, x);
3249
3250    case INDEX_op_abs_vec:
3251    case INDEX_op_dup_vec:
3252    case INDEX_op_not_vec:
3253    case INDEX_op_shli_vec:
3254    case INDEX_op_shri_vec:
3255    case INDEX_op_sari_vec:
3256    case INDEX_op_rotli_vec:
3257    case INDEX_op_x86_psrldq_vec:
3258        return C_O1_I1(x, x);
3259
3260    case INDEX_op_x86_vpshldv_vec:
3261    case INDEX_op_x86_vpshrdv_vec:
3262        return C_O1_I3(x, 0, x, x);
3263
3264    case INDEX_op_bitsel_vec:
3265    case INDEX_op_x86_vpblendvb_vec:
3266        return C_O1_I3(x, x, x, x);
3267
3268    default:
3269        g_assert_not_reached();
3270    }
3271}
3272
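/*
 * Report how the backend handles a vector opcode for the given type and
 * element size: 1 means it is emitted directly, -1 means it is supported
 * via expansion in tcg_expand_vec_op() below, 0 means unsupported.
 */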
3273int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3274{
3275    switch (opc) {
3276    case INDEX_op_add_vec:
3277    case INDEX_op_sub_vec:
3278    case INDEX_op_and_vec:
3279    case INDEX_op_or_vec:
3280    case INDEX_op_xor_vec:
3281    case INDEX_op_andc_vec:
3282    case INDEX_op_orc_vec:
3283    case INDEX_op_nand_vec:
3284    case INDEX_op_nor_vec:
3285    case INDEX_op_eqv_vec:
3286    case INDEX_op_not_vec:
3287    case INDEX_op_bitsel_vec:
3288        return 1;
3289    case INDEX_op_cmp_vec:
3290    case INDEX_op_cmpsel_vec:
3291        return -1;
3292
3293    case INDEX_op_rotli_vec:
3294        return have_avx512vl && vece >= MO_32 ? 1 : -1;
3295
3296    case INDEX_op_shli_vec:
3297    case INDEX_op_shri_vec:
3298        /* We must expand the operation for MO_8.  */
3299        return vece == MO_8 ? -1 : 1;
3300
3301    case INDEX_op_sari_vec:
3302        switch (vece) {
3303        case MO_8:
3304            return -1;
3305        case MO_16:
3306        case MO_32:
3307            return 1;
3308        case MO_64:
3309            if (have_avx512vl) {
3310                return 1;
3311            }
3312            /*
3313             * We can emulate this for MO_64, but it does not pay off
3314             * unless we're producing at least 4 values.
3315             */
3316            return type >= TCG_TYPE_V256 ? -1 : 0;
3317        }
3318        return 0;
3319
3320    case INDEX_op_shls_vec:
3321    case INDEX_op_shrs_vec:
3322        return vece >= MO_16;
3323    case INDEX_op_sars_vec:
3324        switch (vece) {
3325        case MO_16:
3326        case MO_32:
3327            return 1;
3328        case MO_64:
3329            return have_avx512vl;
3330        }
3331        return 0;
3332    case INDEX_op_rotls_vec:
3333        return vece >= MO_16 ? -1 : 0;
3334
3335    case INDEX_op_shlv_vec:
3336    case INDEX_op_shrv_vec:
3337        switch (vece) {
3338        case MO_16:
3339            return have_avx512bw;
3340        case MO_32:
3341        case MO_64:
3342            return have_avx2;
3343        }
3344        return 0;
3345    case INDEX_op_sarv_vec:
3346        switch (vece) {
3347        case MO_16:
3348            return have_avx512bw;
3349        case MO_32:
3350            return have_avx2;
3351        case MO_64:
3352            return have_avx512vl;
3353        }
3354        return 0;
3355    case INDEX_op_rotlv_vec:
3356    case INDEX_op_rotrv_vec:
3357        switch (vece) {
3358        case MO_16:
3359            return have_avx512vbmi2 ? -1 : 0;
3360        case MO_32:
3361        case MO_64:
3362            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3363        }
3364        return 0;
3365
3366    case INDEX_op_mul_vec:
3367        switch (vece) {
3368        case MO_8:
3369            return -1;
3370        case MO_64:
3371            return have_avx512dq;
3372        }
3373        return 1;
3374
3375    case INDEX_op_ssadd_vec:
3376    case INDEX_op_usadd_vec:
3377    case INDEX_op_sssub_vec:
3378    case INDEX_op_ussub_vec:
3379        return vece <= MO_16;
3380    case INDEX_op_smin_vec:
3381    case INDEX_op_smax_vec:
3382    case INDEX_op_umin_vec:
3383    case INDEX_op_umax_vec:
3384    case INDEX_op_abs_vec:
3385        return vece <= MO_32 || have_avx512vl;
3386
3387    default:
3388        return 0;
3389    }
3390}
3391
3392static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3393                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3394{
3395    TCGv_vec t1, t2;
3396
3397    tcg_debug_assert(vece == MO_8);
3398
3399    t1 = tcg_temp_new_vec(type);
3400    t2 = tcg_temp_new_vec(type);
3401
3402    /*
3403     * Unpack to W, shift, and repack.  Tricky bits:
3404     * (1) Use punpck*bw x,x to produce DDCCBBAA,
3405     *     i.e. duplicate in other half of the 16-bit lane.
3406     * (2) For right-shift, add 8 so that the high half of the lane
3407     *     becomes zero.  For left-shift, and left-rotate, we must
3408     *     shift up and down again.
3409     * (3) Step 2 leaves high half zero such that PACKUSWB
3410     *     (pack with unsigned saturation) does not modify
3411     *     the quantity.
3412     */
3413    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3414              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3415    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3416              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3417
3418    if (opc != INDEX_op_rotli_vec) {
3419        imm += 8;
3420    }
3421    if (opc == INDEX_op_shri_vec) {
3422        tcg_gen_shri_vec(MO_16, t1, t1, imm);
3423        tcg_gen_shri_vec(MO_16, t2, t2, imm);
3424    } else {
3425        tcg_gen_shli_vec(MO_16, t1, t1, imm);
3426        tcg_gen_shli_vec(MO_16, t2, t2, imm);
3427        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3428        tcg_gen_shri_vec(MO_16, t2, t2, 8);
3429    }
3430
3431    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3432              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3433    tcg_temp_free_vec(t1);
3434    tcg_temp_free_vec(t2);
3435}
3436
3437static void expand_vec_sari(TCGType type, unsigned vece,
3438                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3439{
3440    TCGv_vec t1, t2;
3441
3442    switch (vece) {
3443    case MO_8:
3444        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3445        t1 = tcg_temp_new_vec(type);
3446        t2 = tcg_temp_new_vec(type);
3447        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3448                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3449        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3450                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3451        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3452        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3453        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3454                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3455        tcg_temp_free_vec(t1);
3456        tcg_temp_free_vec(t2);
3457        break;
3458
3459    case MO_64:
3460        t1 = tcg_temp_new_vec(type);
3461        if (imm <= 32) {
3462            /*
3463             * We can emulate a small sign extend by performing an arithmetic
3464             * 32-bit shift and overwriting the high half of a 64-bit logical
3465             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3466             * does not, so we have to bound the smaller shift -- we get the
3467             * same result in the high half either way.
3468             */
3469            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3470            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3471            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3472                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3473                      tcgv_vec_arg(t1), 0xaa);
3474        } else {
3475            /* Otherwise we will need to use a compare vs 0 to produce
3476             * the sign bits, then shift and merge them in.
3477             */
3478            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3479                            tcg_constant_vec(type, MO_64, 0), v1);
3480            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3481            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3482            tcg_gen_or_vec(MO_64, v0, v0, t1);
3483        }
3484        tcg_temp_free_vec(t1);
3485        break;
3486
3487    default:
3488        g_assert_not_reached();
3489    }
3490}
3491
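/*
 * Expand a rotate-left-by-immediate.  MO_8 reuses the byte shift
 * expansion above; with AVX512VBMI2, VPSHLDI with both inputs equal
 * rotates directly; otherwise combine a left and a right shift.
 */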
3492static void expand_vec_rotli(TCGType type, unsigned vece,
3493                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3494{
3495    TCGv_vec t;
3496
3497    if (vece == MO_8) {
3498        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3499        return;
3500    }
3501
3502    if (have_avx512vbmi2) {
3503        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3504                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3505        return;
3506    }
3507
3508    t = tcg_temp_new_vec(type);
3509    tcg_gen_shli_vec(vece, t, v1, imm);
3510    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3511    tcg_gen_or_vec(vece, v0, v0, t);
3512    tcg_temp_free_vec(t);
3513}
3514
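/*
 * Expand a rotate by a per-element shift count.  With AVX512VBMI2,
 * VPSHLDV/VPSHRDV with both data inputs equal rotates directly;
 * otherwise combine the two complementary variable shifts.
 */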
3515static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3516                            TCGv_vec v1, TCGv_vec sh, bool right)
3517{
3518    TCGv_vec t;
3519
3520    if (have_avx512vbmi2) {
3521        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3522                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3523                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3524        return;
3525    }
3526
3527    t = tcg_temp_new_vec(type);
3528    tcg_gen_dupi_vec(vece, t, 8 << vece);
3529    tcg_gen_sub_vec(vece, t, t, sh);
3530    if (right) {
3531        tcg_gen_shlv_vec(vece, t, v1, t);
3532        tcg_gen_shrv_vec(vece, v0, v1, sh);
3533    } else {
3534        tcg_gen_shrv_vec(vece, t, v1, t);
3535        tcg_gen_shlv_vec(vece, v0, v1, sh);
3536    }
3537    tcg_gen_or_vec(vece, v0, v0, t);
3538    tcg_temp_free_vec(t);
3539}
3540
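/*
 * Expand a rotate by a scalar count shared by all elements: broadcast
 * the count and rotate by vector when the ISA allows, otherwise
 * combine the scalar left and right shifts.
 */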
3541static void expand_vec_rotls(TCGType type, unsigned vece,
3542                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3543{
3544    TCGv_vec t = tcg_temp_new_vec(type);
3545
3546    tcg_debug_assert(vece != MO_8);
3547
3548    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3549        tcg_gen_dup_i32_vec(vece, t, lsh);
3550        if (vece >= MO_32) {
3551            tcg_gen_rotlv_vec(vece, v0, v1, t);
3552        } else {
3553            expand_vec_rotv(type, vece, v0, v1, t, false);
3554        }
3555    } else {
3556        TCGv_i32 rsh = tcg_temp_new_i32();
3557
3558        tcg_gen_neg_i32(rsh, lsh);
3559        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3560        tcg_gen_shls_vec(vece, t, v1, lsh);
3561        tcg_gen_shrs_vec(vece, v0, v1, rsh);
3562        tcg_gen_or_vec(vece, v0, v0, t);
3563
3564        tcg_temp_free_i32(rsh);
3565    }
3566
3567    tcg_temp_free_vec(t);
3568}
3569
3570static void expand_vec_mul(TCGType type, unsigned vece,
3571                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3572{
3573    TCGv_vec t1, t2, t3, t4, zero;
3574
3575    tcg_debug_assert(vece == MO_8);
3576
3577    /*
3578     * Unpack v1 bytes to words, 0 | x.
3579     * Unpack v2 bytes to words, y | 0.
3580     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3581     * Shift logical right by 8 bits to clear the high 8 bits before
3582     * using an unsigned saturated pack.
3583     *
3584     * The difference between the V64, V128 and V256 cases is merely how
3585     * we distribute the expansion between temporaries.
3586     */
3587    switch (type) {
3588    case TCG_TYPE_V64:
3589        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3590        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3591        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3592        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3593                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3594        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3595                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3596        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3597        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3598        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3599                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3600        tcg_temp_free_vec(t1);
3601        tcg_temp_free_vec(t2);
3602        break;
3603
3604    case TCG_TYPE_V128:
3605    case TCG_TYPE_V256:
3606        t1 = tcg_temp_new_vec(type);
3607        t2 = tcg_temp_new_vec(type);
3608        t3 = tcg_temp_new_vec(type);
3609        t4 = tcg_temp_new_vec(type);
3610        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3611        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3612                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3613        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3614                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3615        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3616                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3617        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3618                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3619        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3620        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3621        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3622        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3623        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3624                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3625        tcg_temp_free_vec(t1);
3626        tcg_temp_free_vec(t2);
3627        tcg_temp_free_vec(t3);
3628        tcg_temp_free_vec(t4);
3629        break;
3630
3631    default:
3632        g_assert_not_reached();
3633    }
3634}
3635
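/*
 * Expand a vector comparison without applying any final inversion.
 * SSE/AVX only provide equality and signed greater-than, so the other
 * conditions are reduced to those by inverting the condition, swapping
 * the operands, biasing both operands by the sign bit (turning an
 * unsigned comparison into a signed one), or folding an unsigned
 * min/max into an equality test.  Returns true if the caller must
 * invert the result to obtain the requested condition.
 */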
3636static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3637                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3638{
3639    enum {
3640        NEED_INV  = 1,
3641        NEED_SWAP = 2,
3642        NEED_BIAS = 4,
3643        NEED_UMIN = 8,
3644        NEED_UMAX = 16,
3645    };
3646    TCGv_vec t1, t2, t3;
3647    uint8_t fixup;
3648
3649    switch (cond) {
3650    case TCG_COND_EQ:
3651    case TCG_COND_GT:
3652        fixup = 0;
3653        break;
3654    case TCG_COND_NE:
3655    case TCG_COND_LE:
3656        fixup = NEED_INV;
3657        break;
3658    case TCG_COND_LT:
3659        fixup = NEED_SWAP;
3660        break;
3661    case TCG_COND_GE:
3662        fixup = NEED_SWAP | NEED_INV;
3663        break;
3664    case TCG_COND_LEU:
3665        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3666            fixup = NEED_UMIN;
3667        } else {
3668            fixup = NEED_BIAS | NEED_INV;
3669        }
3670        break;
3671    case TCG_COND_GTU:
3672        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3673            fixup = NEED_UMIN | NEED_INV;
3674        } else {
3675            fixup = NEED_BIAS;
3676        }
3677        break;
3678    case TCG_COND_GEU:
3679        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3680            fixup = NEED_UMAX;
3681        } else {
3682            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3683        }
3684        break;
3685    case TCG_COND_LTU:
3686        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3687            fixup = NEED_UMAX | NEED_INV;
3688        } else {
3689            fixup = NEED_BIAS | NEED_SWAP;
3690        }
3691        break;
3692    default:
3693        g_assert_not_reached();
3694    }
3695
3696    if (fixup & NEED_INV) {
3697        cond = tcg_invert_cond(cond);
3698    }
3699    if (fixup & NEED_SWAP) {
3700        t1 = v1, v1 = v2, v2 = t1;
3701        cond = tcg_swap_cond(cond);
3702    }
3703
3704    t1 = t2 = NULL;
3705    if (fixup & (NEED_UMIN | NEED_UMAX)) {
3706        t1 = tcg_temp_new_vec(type);
3707        if (fixup & NEED_UMIN) {
3708            tcg_gen_umin_vec(vece, t1, v1, v2);
3709        } else {
3710            tcg_gen_umax_vec(vece, t1, v1, v2);
3711        }
3712        v2 = t1;
3713        cond = TCG_COND_EQ;
3714    } else if (fixup & NEED_BIAS) {
3715        t1 = tcg_temp_new_vec(type);
3716        t2 = tcg_temp_new_vec(type);
3717        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3718        tcg_gen_sub_vec(vece, t1, v1, t3);
3719        tcg_gen_sub_vec(vece, t2, v2, t3);
3720        v1 = t1;
3721        v2 = t2;
3722        cond = tcg_signed_cond(cond);
3723    }
3724
3725    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3726    /* Expand directly; do not recurse.  */
3727    vec_gen_4(INDEX_op_cmp_vec, type, vece,
3728              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3729
3730    if (t1) {
3731        tcg_temp_free_vec(t1);
3732        if (t2) {
3733            tcg_temp_free_vec(t2);
3734        }
3735    }
3736    return fixup & NEED_INV;
3737}
3738
3739static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3740                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3741{
3742    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3743        tcg_gen_not_vec(vece, v0, v0);
3744    }
3745}
3746
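/*
 * Expand cmpsel as a comparison producing an all-ones/all-zeros mask,
 * followed by a VPBLENDVB select between the two data operands.
 */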
3747static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3748                              TCGv_vec c1, TCGv_vec c2,
3749                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3750{
3751    TCGv_vec t = tcg_temp_new_vec(type);
3752
3753    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3754        /* The compare result is inverted; compensate by swapping the data operands.  */
3755        TCGv_vec x;
3756        x = v3, v3 = v4, v4 = x;
3757    }
3758    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3759              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3760              tcgv_vec_arg(v3), tcgv_vec_arg(t));
3761    tcg_temp_free_vec(t);
3762}
3763
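/*
 * Expand the vector operations for which tcg_can_emit_vec_op()
 * returned -1, dispatching to the helpers above.
 */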
3764void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3765                       TCGArg a0, ...)
3766{
3767    va_list va;
3768    TCGArg a2;
3769    TCGv_vec v0, v1, v2, v3, v4;
3770
3771    va_start(va, a0);
3772    v0 = temp_tcgv_vec(arg_temp(a0));
3773    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3774    a2 = va_arg(va, TCGArg);
3775
3776    switch (opc) {
3777    case INDEX_op_shli_vec:
3778    case INDEX_op_shri_vec:
3779        expand_vec_shi(type, vece, opc, v0, v1, a2);
3780        break;
3781
3782    case INDEX_op_sari_vec:
3783        expand_vec_sari(type, vece, v0, v1, a2);
3784        break;
3785
3786    case INDEX_op_rotli_vec:
3787        expand_vec_rotli(type, vece, v0, v1, a2);
3788        break;
3789
3790    case INDEX_op_rotls_vec:
3791        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3792        break;
3793
3794    case INDEX_op_rotlv_vec:
3795        v2 = temp_tcgv_vec(arg_temp(a2));
3796        expand_vec_rotv(type, vece, v0, v1, v2, false);
3797        break;
3798    case INDEX_op_rotrv_vec:
3799        v2 = temp_tcgv_vec(arg_temp(a2));
3800        expand_vec_rotv(type, vece, v0, v1, v2, true);
3801        break;
3802
3803    case INDEX_op_mul_vec:
3804        v2 = temp_tcgv_vec(arg_temp(a2));
3805        expand_vec_mul(type, vece, v0, v1, v2);
3806        break;
3807
3808    case INDEX_op_cmp_vec:
3809        v2 = temp_tcgv_vec(arg_temp(a2));
3810        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3811        break;
3812
3813    case INDEX_op_cmpsel_vec:
3814        v2 = temp_tcgv_vec(arg_temp(a2));
3815        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3816        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3817        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3818        break;
3819
3820    default:
3821        break;
3822    }
3823
3824    va_end(va);
3825}
3826
3827static const int tcg_target_callee_save_regs[] = {
3828#if TCG_TARGET_REG_BITS == 64
3829    TCG_REG_RBP,
3830    TCG_REG_RBX,
3831#if defined(_WIN64)
3832    TCG_REG_RDI,
3833    TCG_REG_RSI,
3834#endif
3835    TCG_REG_R12,
3836    TCG_REG_R13,
3837    TCG_REG_R14, /* Currently used for the global env. */
3838    TCG_REG_R15,
3839#else
3840    TCG_REG_EBP, /* Currently used for the global env. */
3841    TCG_REG_EBX,
3842    TCG_REG_ESI,
3843    TCG_REG_EDI,
3844#endif
3845};
3846
3847/* Compute frame size via macros, to share between tcg_target_qemu_prologue
3848   and tcg_register_jit.  */
3849
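/* Bytes pushed on entry: the return address plus the callee-saved registers. */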
3850#define PUSH_SIZE \
3851    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3852     * (TCG_TARGET_REG_BITS / 8))
3853
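/*
 * Illustrative arithmetic only, assuming the usual values of
 * TCG_STATIC_CALL_ARGS_SIZE = 128, CPU_TEMP_BUF_NLONGS = 128 and
 * TCG_TARGET_STACK_ALIGN = 16 (see tcg.h and tcg-target.h) on a
 * non-Windows x86-64 host with 6 callee-saved registers:
 *   PUSH_SIZE  = (1 + 6) * 8 = 56
 *   FRAME_SIZE = (56 + 128 + 128 * 8 + 15) & ~15 = 1216
 */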
3854#define FRAME_SIZE \
3855    ((PUSH_SIZE \
3856      + TCG_STATIC_CALL_ARGS_SIZE \
3857      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3858      + TCG_TARGET_STACK_ALIGN - 1) \
3859     & ~(TCG_TARGET_STACK_ALIGN - 1))
3860
3861/* Generate global QEMU prologue and epilogue code */
3862static void tcg_target_qemu_prologue(TCGContext *s)
3863{
3864    int i, stack_addend;
3865
3866    /* TB prologue */
3867
3868    /* Reserve some stack space, also for TCG temps.  */
3869    stack_addend = FRAME_SIZE - PUSH_SIZE;
3870    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3871                  CPU_TEMP_BUF_NLONGS * sizeof(long));
3872
3873    /* Save all callee saved registers.  */
3874    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3875        tcg_out_push(s, tcg_target_callee_save_regs[i]);
3876    }
3877
3878#if TCG_TARGET_REG_BITS == 32
3879    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3880               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3881    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3882    /* jmp *tb.  */
3883    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3884                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3885                         + stack_addend);
3886#else
3887# if !defined(CONFIG_SOFTMMU)
3888    if (guest_base) {
3889        int seg = setup_guest_base_seg();
3890        if (seg != 0) {
3891            x86_guest_base.seg = seg;
3892        } else if (guest_base == (int32_t)guest_base) {
3893            x86_guest_base.ofs = guest_base;
3894        } else {
3895            /* Choose R12 because, as a base, it requires a SIB byte. */
3896            x86_guest_base.index = TCG_REG_R12;
3897            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
3898            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
3899        }
3900    }
3901# endif
3902    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3903    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3904    /* jmp *tb.  */
3905    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3906#endif
3907
3908    /*
3909     * Return path for goto_ptr. Set return value to 0, à la exit_tb,
3910     * and fall through to the rest of the epilogue.
3911     */
3912    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
3913    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3914
3915    /* TB epilogue */
3916    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
3917
3918    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3919
3920    if (have_avx2) {
3921        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3922    }
3923    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3924        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3925    }
3926    tcg_out_opc(s, OPC_RET, 0, 0, 0);
3927}
3928
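/* Pad unused code space with single-byte NOPs (opcode 0x90). */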
3929static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3930{
3931    memset(p, 0x90, count);
3932}
3933
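/*
 * Describe the host register file to the register allocator: which
 * registers each TCG type may use, which are clobbered across calls,
 * and which are reserved outright.
 */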
3934static void tcg_target_init(TCGContext *s)
3935{
3936    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3937    if (TCG_TARGET_REG_BITS == 64) {
3938        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3939    }
3940    if (have_avx1) {
3941        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3942        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3943    }
3944    if (have_avx2) {
3945        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3946    }
3947
3948    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3949    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3950    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3951    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3952    if (TCG_TARGET_REG_BITS == 64) {
3953#if !defined(_WIN64)
3954        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3955        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3956#endif
3957        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3958        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3959        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3960        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3961    }
3962
3963    s->reserved_regs = 0;
3964    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3965#ifdef _WIN64
3966    /* These are call saved, and we don't save them, so don't use them. */
3967    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
3968    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
3969    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
3970    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
3971    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
3972    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
3973    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
3974    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
3975    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
3976    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
3977#endif
3978}
3979
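/*
 * DWARF CFI description of the prologue generated above, passed to
 * the debugger via tcg_register_jit() below.
 */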
3980typedef struct {
3981    DebugFrameHeader h;
3982    uint8_t fde_def_cfa[4];
3983    uint8_t fde_reg_ofs[14];
3984} DebugFrame;
3985
3986/* We're expecting a 2-byte uleb128 encoded value.  */
3987QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3988
3989#if !defined(__ELF__)
3990    /* Host machine without ELF. */
3991#elif TCG_TARGET_REG_BITS == 64
3992#define ELF_HOST_MACHINE EM_X86_64
3993static const DebugFrame debug_frame = {
3994    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3995    .h.cie.id = -1,
3996    .h.cie.version = 1,
3997    .h.cie.code_align = 1,
3998    .h.cie.data_align = 0x78,             /* sleb128 -8 */
3999    .h.cie.return_column = 16,
4000
4001    /* Total FDE size does not include the "len" member.  */
4002    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4003
4004    .fde_def_cfa = {
4005        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
4006        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4007        (FRAME_SIZE >> 7)
4008    },
4009    .fde_reg_ofs = {
4010        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
4011        /* The following ordering must match tcg_target_callee_save_regs.  */
4012        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
4013        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
4014        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
4015        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
4016        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
4017        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
4018    }
4019};
4020#else
4021#define ELF_HOST_MACHINE EM_386
4022static const DebugFrame debug_frame = {
4023    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4024    .h.cie.id = -1,
4025    .h.cie.version = 1,
4026    .h.cie.code_align = 1,
4027    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
4028    .h.cie.return_column = 8,
4029
4030    /* Total FDE size does not include the "len" member.  */
4031    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4032
4033    .fde_def_cfa = {
4034        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
4035        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4036        (FRAME_SIZE >> 7)
4037    },
4038    .fde_reg_ofs = {
4039        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4040        /* The following ordering must match tcg_target_callee_save_regs.  */
4041        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4042        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4043        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4044        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
4045    }
4046};
4047#endif
4048
4049#if defined(ELF_HOST_MACHINE)
4050void tcg_register_jit(const void *buf, size_t buf_size)
4051{
4052    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
4053}
4054#endif
4055