/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "../tcg-ldst.c.inc"
#include "../tcg-pool.c.inc"

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32-bit mode uses a stack-based calling convention (the GCC default). */
#endif
};

static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
{
    switch (kind) {
    case TCG_CALL_RET_NORMAL:
        tcg_debug_assert(slot >= 0 && slot <= 1);
        return slot ? TCG_REG_EDX : TCG_REG_EAX;
#ifdef _WIN64
    case TCG_CALL_RET_BY_VEC:
        tcg_debug_assert(slot == 0);
        return TCG_REG_XMM0;
#endif
    default:
        g_assert_not_reached();
    }
}

/* Constants we accept.  */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800

/* Registers used with the L constraint, which are the first argument
   registers on x86_64, and two arbitrarily chosen call-clobbered
   registers on i386. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

#define ALL_BYTEH_REGS         0x0000000fu
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
# define ALL_BYTEL_REGS        ALL_BYTEH_REGS
#endif
#ifdef CONFIG_SOFTMMU
# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
#else
# define SOFTMMU_RESERVE_REGS  0
#endif

/* The host compiler should supply <cpuid.h> to enable runtime feature
   detection, as we're not going to go so far as writing our own inline
   assembly.  If it is not available, default values are assumed.  */
#if defined(CONFIG_CPUID_H)
#include "qemu/cpuid.h"
#endif

/* For 64-bit, we always know that CMOV is available.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
#elif defined(CONFIG_CPUID_H)
static bool have_cmov;
#else
# define have_cmov 0
#endif

/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   them there.  Therefore we always define the variables.  */
bool have_bmi1;
bool have_popcnt;
bool have_avx1;
bool have_avx2;
bool have_avx512bw;
bool have_avx512dq;
bool have_avx512vbmi2;
bool have_avx512vl;
bool have_movbe;
bool have_atomic16;

#ifdef CONFIG_CPUID_H
static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_bmi2 0
# define have_lzcnt 0
#endif

static const tcg_insn_unit *tb_ret_addr;

static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
                        intptr_t value, intptr_t addend)
{
    value += addend;
    switch(type) {
    case R_386_PC32:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int32_t)value) {
            return false;
        }
        /* FALLTHRU */
    case R_386_32:
        tcg_patch32(code_ptr, value);
        break;
    case R_386_PC8:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int8_t)value) {
            return false;
        }
        tcg_patch8(code_ptr, value);
        break;
    default:
        g_assert_not_reached();
    }
    return true;
}
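
/*
 * For the pc-relative cases above, the displacement is computed against
 * the address of the relocation field itself, so callers pass an ADDEND
 * (e.g. -4 for a rel32 field) to make the stored value relative to the
 * end of the instruction, as x86 branch and rip-relative forms require.
 */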

/* Test whether a constant matches the constraint.  */
static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
{
    if (ct & TCG_CT_CONST) {
        return 1;
    }
    if (type == TCG_TYPE_I32) {
        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
            return 1;
        }
    } else {
        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
            return 1;
        }
    }
    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
        return 1;
    }
    return 0;
}
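
/*
 * For example, with TCG_TYPE_I64: val = -1 satisfies S32 (sign-extended
 * 32-bit immediate) and I32 (the inverted value fits in 32 bits), while
 * val = 0xffffffff satisfies only U32 (zero-extended 32-bit immediate).
 */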

# define LOWREGMASK(x)  ((x) & 7)

#define P_EXT           0x100           /* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#define P_VEXW          0x1000          /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_REXW         0
# define P_REXB_R       0
# define P_REXB_RM      0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
#define P_EVEX          0x100000        /* Requires EVEX encoding */
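
/*
 * Each OPC_* constant below packs the raw opcode byte into the low 8 bits
 * and the P_* prefix flags above it.  For example, OPC_POR is
 * (0xeb | P_EXT | P_DATA16), which assembles to 66 0f eb /r, the SSE por.
 */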

#define OPC_ARITH_EvIz  (0x81)
#define OPC_ARITH_EvIb  (0x83)
#define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP       (0xc8 | P_EXT)
#define OPC_CALL_Jz     (0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32     (0x48)
#define OPC_IMUL_GvEv   (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32     (0x40)
#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short   (0x70)          /* ... plus condition code */
#define OPC_JMP_long    (0xe9)
#define OPC_JMP_short   (0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz   (0xc7)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL      (0xbe | P_EXT)
#define OPC_MOVSWL      (0xbf | P_EXT)
#define OPC_MOVSLQ      (0x63 | P_REXW)
#define OPC_MOVZBL      (0xb6 | P_EXT)
#define OPC_MOVZWL      (0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32     (0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32    (0x50)
#define OPC_PUSH_Iv     (0x68)
#define OPC_PUSH_Ib     (0x6a)
#define OPC_RET         (0xc3)
#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1     (0xd1)
#define OPC_SHIFT_Ib    (0xc1)
#define OPC_SHIFT_cl    (0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL       (0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)
#define OPC_XCHG_EvGv   (0x87)

#define OPC_GRP3_Eb     (0xf6)
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)

/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_TESTi 0
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev     0
#define EXT5_DEC_Ev     1
#define EXT5_CALLN_Ev   2
#define EXT5_JMPN_Ev    4

/* Condition codes to be added to OPC_JCC_{long,short}.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf

static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};
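
/*
 * The signed conditions map to the sign/overflow-flag conditions
 * (JL/JGE/JLE/JG) and the unsigned conditions to the carry/zero-flag
 * conditions (JB/JAE/JBE/JA); e.g. TCG_COND_LTU becomes JB because an
 * unsigned borrow in the preceding CMP sets CF.
 */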

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
    int rex;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    if (opc & P_DATA16) {
        /* We should never be asking for both 16-bit and 64-bit operation.  */
        tcg_debug_assert((opc & P_REXW) == 0);
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    rex = 0;
    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
    rex |= (r & 8) >> 1;                /* REX.R */
    rex |= (x & 8) >> 2;                /* REX.X */
    rex |= (rm & 8) >> 3;               /* REX.B */

    /* P_REXB_{R,RM} indicates that the given register is the low byte.
       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
       as otherwise the encoding indicates %[abcd]h.  Note that the values
       that are ORed in merely indicate that the REX byte must be present;
       those bits get discarded in output.  */
    rex |= opc & (r >= 4 ? P_REXB_R : 0);
    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);

    if (rex) {
        tcg_out8(s, (uint8_t)(rex | 0x40));
    }

    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }

    tcg_out8(s, opc);
}
#else
static void tcg_out_opc(TCGContext *s, int opc)
{
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }
    tcg_out8(s, opc);
}
/* Discard the register arguments to tcg_out_opc early, so as not to penalize
   the 32-bit compilation paths.  This macro works with all versions of gcc,
   whereas relying on the optimizer to eliminate the arguments may not.  */
#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
#endif

static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
{
    tcg_out_opc(s, opc, r, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
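
/*
 * For example, tcg_out_modrm(s, OPC_MOVL_GvEv + P_REXW, TCG_REG_RAX,
 * TCG_REG_RCX) emits 48 8b c1, i.e. movq %rcx, %rax: the REX.W prefix,
 * the 0x8b opcode, then ModRM 0xc1 = mod 3, reg %rax, r/m %rcx.
 */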

static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
                            int rm, int index)
{
    int tmp;

    /* Use the two byte form if possible, which cannot encode
       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
        && ((rm | index) & 8) == 0) {
        /* Two byte VEX prefix.  */
        tcg_out8(s, 0xc5);

        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
    } else {
        /* Three byte VEX prefix.  */
        tcg_out8(s, 0xc4);

        /* VEX.m-mmmm */
        if (opc & P_EXT3A) {
            tmp = 3;
        } else if (opc & P_EXT38) {
            tmp = 2;
        } else if (opc & P_EXT) {
            tmp = 1;
        } else {
            g_assert_not_reached();
        }
        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
        tcg_out8(s, tmp);

        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
    }

    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
    /* VEX.pp */
    if (opc & P_DATA16) {
        tmp |= 1;                          /* 0x66 */
    } else if (opc & P_SIMDF3) {
        tmp |= 2;                          /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        tmp |= 3;                          /* 0xf2 */
    }
    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
    tcg_out8(s, tmp);
    tcg_out8(s, opc);
}

static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
                             int rm, int index)
{
    /* The entire 4-byte evex prefix; with R' and V' set. */
    uint32_t p = 0x08041062;
    int mm, pp;

    tcg_debug_assert(have_avx512vl);

    /* EVEX.mm */
    if (opc & P_EXT3A) {
        mm = 3;
    } else if (opc & P_EXT38) {
        mm = 2;
    } else if (opc & P_EXT) {
        mm = 1;
    } else {
        g_assert_not_reached();
    }

    /* EVEX.pp */
    if (opc & P_DATA16) {
        pp = 1;                          /* 0x66 */
    } else if (opc & P_SIMDF3) {
        pp = 2;                          /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        pp = 3;                          /* 0xf2 */
    } else {
        pp = 0;
    }

    p = deposit32(p, 8, 2, mm);
    p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
    p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
    p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
    p = deposit32(p, 16, 2, pp);
    p = deposit32(p, 19, 4, ~v);
    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);

    tcg_out32(s, p);
    tcg_out8(s, opc);
}
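
/*
 * In the constant 0x08041062 above, emitted little-endian, byte 0 is the
 * 0x62 EVEX escape; byte 1 carries the inverted R/X/B/R' bits (R' preset)
 * plus the mm field; byte 2 has the mandatory 1 bit preset and receives W,
 * the inverted vvvv, and pp; byte 3 has V' preset and receives L'L.
 */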

static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
{
    if (opc & P_EVEX) {
        tcg_out_evex_opc(s, opc, r, v, rm, 0);
    } else {
        tcg_out_vex_opc(s, opc, r, v, rm, 0);
    }
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
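
/*
 * For example, tcg_out_vex_modrm(s, OPC_PXOR, 0, 0, 0) takes the two-byte
 * VEX path and emits c5 f9 ef c0, i.e. vpxor %xmm0, %xmm0, %xmm0: prefix
 * byte 0xf9 has the inverted R bit set, inverted vvvv = 1111 selecting
 * %xmm0, VEX.L = 0, and pp = 01 for the 0x66 prefix class.
 */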

/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
   We handle RM or INDEX being absent by passing a negative value.  In
   64-bit mode for absolute addresses, ~RM is the size of the immediate
   operand that will follow the instruction.  */

static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
                               int shift, intptr_t offset)
{
    int mod, len;

    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding.  */
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
            if (disp == (int32_t)disp) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                tcg_out32(s, disp);
                return;
            }

            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing.  */
            if (offset == (int32_t)offset) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);
                return;
            }

            /* ??? The memory isn't directly addressable.  */
            g_assert_not_reached();
        } else {
            /* Absolute address.  */
            tcg_out8(s, (r << 3) | 5);
            tcg_out32(s, offset);
            return;
        }
    }

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing.  */
    if (rm < 0) {
        mod = 0, len = 4, rm = 5;
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
        mod = 0, len = 0;
    } else if (offset == (int8_t)offset) {
        mod = 0x40, len = 1;
    } else {
        mod = 0x80, len = 4;
    }

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form.  */
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
        /* Single byte MODRM format.  */
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    } else {
        /* Two byte MODRM+SIB format.  */

        /* Note that the encoding that would place %esp into the index
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index.  */
        if (index < 0) {
            index = 4;
        } else {
            tcg_debug_assert(index != TCG_REG_ESP);
        }

        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
    }

    if (len == 1) {
        tcg_out8(s, offset);
    } else if (len == 4) {
        tcg_out32(s, offset);
    }
}
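
/*
 * For example, through the tcg_out_modrm_offset wrapper below,
 * tcg_out_modrm_offset(s, OPC_MOVL_GvEv, TCG_REG_EAX, TCG_REG_EBX, 0x10)
 * emits 8b 43 10, i.e. movl 0x10(%ebx), %eax, using the single-byte
 * ModRM form with an 8-bit displacement.
 */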

static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                                     int index, int shift, intptr_t offset)
{
    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
                                         int rm, int index, int shift,
                                         intptr_t offset)
{
    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

/* A simplification of the above with no index or shift.  */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
{
    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}

static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
                                            int v, int rm, intptr_t offset)
{
    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
}

/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_opc(s, opc, r, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}

/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}

/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
    /* Propagate an opcode prefix, such as P_REXW.  */
    int ext = subop & ~0x7;
    subop &= 0x7;

    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
}
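
/*
 * For example, tgen_arithr(s, ARITH_SUB + P_REXW, TCG_REG_RAX, TCG_REG_RCX)
 * emits 48 2b c1, i.e. subq %rcx, %rax: the opcode byte is
 * 0x03 + (ARITH_SUB << 3) = 0x2b.
 */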

static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
    int rexw = 0;

    if (arg == ret) {
        return true;
    }
    switch (type) {
    case TCG_TYPE_I64:
        rexw = P_REXW;
        /* fallthru */
    case TCG_TYPE_I32:
        if (ret < 16) {
            if (arg < 16) {
                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
            }
        } else {
            if (arg < 16) {
                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
            }
        }
        break;

    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
        break;

    default:
        g_assert_not_reached();
    }
    return true;
}

static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};

static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg r, TCGReg a)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
    } else {
        switch (vece) {
        case MO_8:
            /* ??? With zero in a register, use PSHUFB.  */
            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_16:
            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_32:
            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
            /* imm8 operand: all output lanes selected from input lane 0.  */
            tcg_out8(s, 0);
            break;
        case MO_64:
            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg r, TCGReg base, intptr_t offset)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
                                 r, 0, base, offset);
    } else {
        switch (vece) {
        case MO_64:
            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
            break;
        case MO_32:
            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
            break;
        case MO_16:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        case MO_8:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg ret, int64_t arg)
{
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);

    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
        return;
    }

    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
        if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
        }
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    } else {
        if (type == TCG_TYPE_V64) {
            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
        } else if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
        }
        if (TCG_TARGET_REG_BITS == 64) {
            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
        } else {
            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
        }
    }
}

static void tcg_out_movi_vec(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
        return;
    }

    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
    if (TCG_TARGET_REG_BITS == 64) {
        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    } else {
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    }
}

static void tcg_out_movi_int(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    tcg_target_long diff;

    if (arg == 0) {
        tgen_arithr(s, ARITH_XOR, ret, ret);
        return;
    }
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
        tcg_out32(s, arg);
        return;
    }
    if (arg == (int32_t)arg) {
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
        tcg_out32(s, arg);
        return;
    }

    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}
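
/*
 * For example, tcg_out_movi_int(s, TCG_TYPE_I64, TCG_REG_RAX, 0x1234)
 * takes the 32-bit path and emits the 5-byte b8 34 12 00 00 (movl
 * $0x1234, %eax), relying on the implicit zero-extension to 64 bits.
 */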

static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
#endif
        if (ret < 16) {
            tcg_out_movi_int(s, type, ret, arg);
        } else {
            tcg_out_movi_vec(s, type, ret, arg);
        }
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
    return true;
}

static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
                             tcg_target_long imm)
{
    /* This function is only used for passing structs by reference. */
    tcg_debug_assert(imm == (int32_t)imm);
    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        g_assert_not_reached();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we only need to care
       about store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the SSE insn.  */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}
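
/*
 * The barrier above assembles to f0 83 0c 24 00, i.e. lock orl $0, (%esp):
 * a locked read-modify-write on the stack serializes pending stores
 * against later loads.
 */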

static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         *
         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
         * for _WIN64, which must have SSE2 but may not have AVX.
         */
        tcg_debug_assert(arg >= 16);
        if (have_avx1) {
            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
        }
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    } else if (type != TCG_TYPE_I32) {
        return false;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}

static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16.  */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}
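
/*
 * For example, tcg_out_shifti(s, SHIFT_SHL + P_REXW, TCG_REG_RAX, 3)
 * emits 48 c1 e0 03, i.e. shlq $3, %rax; a count of 1 instead uses the
 * shorter 0xd1 form with an implicit count.
 */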

static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}

static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* movzbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}
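
/*
 * For example, tcg_out_ext8u(s, TCG_REG_EAX, TCG_REG_EAX) emits
 * 0f b6 c0, i.e. movzbl %al, %eax.  The assertion above reflects that
 * without a REX prefix, only registers 0-3 (%al, %cl, %dl, %bl) have
 * low-byte encodings.
 */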

static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    /* movsbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
}

static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* movzwl */
    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
}

static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    /* movsw[lq] */
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
}

static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* 32-bit mov zero extends.  */
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
}

static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
}

static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32s(s, dest, src);
}

static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32u(s, dest, src);
}

static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32u(s, dest, src);
}

static inline void tcg_out_bswap64(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
}

static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64) {
        rexw = c & -8;
        c &= 7;
    }

    /* ??? While INC is 2 bytes shorter than ADDL $1, it also induces
       partial-flags-update stalls on Pentium4 and is not recommended
       by current Intel optimization manuals.  */
    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
        int is_inc = (c == ARITH_ADD) ^ (val < 0);
        if (TCG_TARGET_REG_BITS == 64) {
            /* The single-byte increment encodings are re-tasked as the
               REX prefixes.  Use the MODRM encoding.  */
            tcg_out_modrm(s, OPC_GRP5 + rexw,
                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
        } else {
            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
        }
        return;
    }

    if (c == ARITH_AND) {
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation.  */
                rexw = 0;
            }
        }
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
    }

    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    g_assert_not_reached();
}
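
/*
 * For example, tgen_arithi(s, ARITH_ADD + P_REXW, TCG_REG_RAX, 1, 0)
 * emits 48 ff c0 (incq %rax via the GRP5 encoding), while
 * tgen_arithi(s, ARITH_ADD, TCG_REG_EAX, 0x7f, 0) emits 83 c0 7f,
 * the sign-extended 8-bit immediate form of addl.
 */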

static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
{
    if (val != 0) {
        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
    }
}

/* Set SMALL to force a short forward branch.  */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
{
    int32_t val, val1;

    if (l->has_value) {
        val = tcg_pcrel_diff(s, l->u.value_ptr);
        val1 = val - 2;
        if ((int8_t)val1 == val1) {
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_short);
            } else {
                tcg_out8(s, OPC_JCC_short + opc);
            }
            tcg_out8(s, val1);
        } else {
            tcg_debug_assert(!small);
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_long);
                tcg_out32(s, val - 5);
            } else {
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
                tcg_out32(s, val - 6);
            }
        }
    } else if (small) {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_short);
        } else {
            tcg_out8(s, OPC_JCC_short + opc);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
        s->code_ptr += 1;
    } else {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_long);
        } else {
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
        s->code_ptr += 4;
    }
}
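
/*
 * The -2, -5 and -6 adjustments above are the lengths of the branch
 * encodings: short jmp/jcc are 2 bytes (eb/7x rel8), long jmp is 5 bytes
 * (e9 rel32), and long jcc is 6 bytes (0f 8x rel32); the displacements
 * are relative to the end of the instruction.
 */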

static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
                        int const_arg2, int rexw)
{
    if (const_arg2) {
        if (arg2 == 0) {
            /* test r, r */
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
    } else {
        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
    }
}

static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}
#else
/* XXX: we implement it at the target level to avoid having to
   handle temporaries that live across basic blocks.  */
1477static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1478                            const int *const_args, int small)
1479{
1480    TCGLabel *label_next = gen_new_label();
1481    TCGLabel *label_this = arg_label(args[5]);
1482
1483    switch(args[4]) {
1484    case TCG_COND_EQ:
1485        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1486                         label_next, 1);
1487        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1488                         label_this, small);
1489        break;
1490    case TCG_COND_NE:
1491        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1492                         label_this, small);
1493        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1494                         label_this, small);
1495        break;
1496    case TCG_COND_LT:
1497        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1498                         label_this, small);
1499        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1500        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1501                         label_this, small);
1502        break;
1503    case TCG_COND_LE:
1504        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1505                         label_this, small);
1506        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1507        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1508                         label_this, small);
1509        break;
1510    case TCG_COND_GT:
1511        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1512                         label_this, small);
1513        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1514        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1515                         label_this, small);
1516        break;
1517    case TCG_COND_GE:
1518        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1519                         label_this, small);
1520        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1521        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1522                         label_this, small);
1523        break;
1524    case TCG_COND_LTU:
1525        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1526                         label_this, small);
1527        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1528        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1529                         label_this, small);
1530        break;
1531    case TCG_COND_LEU:
1532        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1533                         label_this, small);
1534        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1535        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1536                         label_this, small);
1537        break;
1538    case TCG_COND_GTU:
1539        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1540                         label_this, small);
1541        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1542        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1543                         label_this, small);
1544        break;
1545    case TCG_COND_GEU:
1546        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1547                         label_this, small);
1548        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1549        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1550                         label_this, small);
1551        break;
1552    default:
1553        g_assert_not_reached();
1554    }
1555    tcg_out_label(s, label_next);
1556}
1557#endif
1558
1559static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1560                              TCGArg arg1, TCGArg arg2, int const_arg2)
1561{
1562    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1563    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1564    tcg_out_ext8u(s, dest, dest);
1565}
1566
1567#if TCG_TARGET_REG_BITS == 64
1568static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1569                              TCGArg arg1, TCGArg arg2, int const_arg2)
1570{
1571    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1572    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1573    tcg_out_ext8u(s, dest, dest);
1574}
1575#else
1576static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1577                             const int *const_args)
1578{
1579    TCGArg new_args[6];
1580    TCGLabel *label_true, *label_over;
1581
1582    memcpy(new_args, args+1, 5*sizeof(TCGArg));
1583
1584    if (args[0] == args[1] || args[0] == args[2]
1585        || (!const_args[3] && args[0] == args[3])
1586        || (!const_args[4] && args[0] == args[4])) {
1587        /* When the destination overlaps with one of the argument
1588           registers, don't do anything tricky.  */
1589        label_true = gen_new_label();
1590        label_over = gen_new_label();
1591
1592        new_args[5] = label_arg(label_true);
1593        tcg_out_brcond2(s, new_args, const_args+1, 1);
1594
1595        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1596        tcg_out_jxx(s, JCC_JMP, label_over, 1);
1597        tcg_out_label(s, label_true);
1598
1599        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1600        tcg_out_label(s, label_over);
1601    } else {
1602        /* When the destination does not overlap one of the arguments,
1603           clear the destination first, jump if cond false, and emit an
1604           increment in the true case.  This results in smaller code.  */
1605
1606        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1607
1608        label_over = gen_new_label();
1609        new_args[4] = tcg_invert_cond(new_args[4]);
1610        new_args[5] = label_arg(label_over);
1611        tcg_out_brcond2(s, new_args, const_args+1, 1);
1612
1613        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1614        tcg_out_label(s, label_over);
1615    }
1616}
1617#endif
1618
1619static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1620                         TCGReg dest, TCGReg v1)
1621{
1622    if (have_cmov) {
1623        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1624    } else {
1625        TCGLabel *over = gen_new_label();
1626        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1627        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1628        tcg_out_label(s, over);
1629    }
1630}
1631
1632static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1633                              TCGReg c1, TCGArg c2, int const_c2,
1634                              TCGReg v1)
1635{
1636    tcg_out_cmp(s, c1, c2, const_c2, 0);
1637    tcg_out_cmov(s, cond, 0, dest, v1);
1638}
1639
1640#if TCG_TARGET_REG_BITS == 64
1641static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1642                              TCGReg c1, TCGArg c2, int const_c2,
1643                              TCGReg v1)
1644{
1645    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1646    tcg_out_cmov(s, cond, P_REXW, dest, v1);
1647}
1648#endif
1649
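/*
 * TZCNT/LZCNT return the operand size for a zero input, so when arg2 is
 * the constant 32 (or 64) no fixup is needed; otherwise a CMOVB installs
 * arg2, as both insns set CF for a zero input.  BSF/BSR instead leave
 * the destination undefined for a zero input, hence the CMOVE of arg2.
 */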
1650static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1651                        TCGArg arg2, bool const_a2)
1652{
1653    if (have_bmi1) {
1654        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1655        if (const_a2) {
1656            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1657        } else {
1658            tcg_debug_assert(dest != arg2);
1659            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1660        }
1661    } else {
1662        tcg_debug_assert(dest != arg2);
1663        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1664        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1665    }
1666}
1667
1668static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1669                        TCGArg arg2, bool const_a2)
1670{
1671    if (have_lzcnt) {
1672        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1673        if (const_a2) {
1674            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1675        } else {
1676            tcg_debug_assert(dest != arg2);
1677            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1678        }
1679    } else {
1680        tcg_debug_assert(!const_a2);
1681        tcg_debug_assert(dest != arg1);
1682        tcg_debug_assert(dest != arg2);
1683
1684        /* Recall that the output of BSR is the index, not the count.
               XOR with 31 (or 63) converts the index into the count.  */
1685        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1686        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1687
1688        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1689        tcg_out_cmp(s, arg1, 0, 1, rexw);
1690        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1691    }
1692}
1693
1694static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1695{
1696    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1697
1698    if (disp == (int32_t)disp) {
1699        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1700        tcg_out32(s, disp);
1701    } else {
1702        /* rip-relative addressing into the constant pool.
1703           This is 6 + 8 = 14 bytes, as compared to using an
1704           immediate load 10 + 6 = 16 bytes, plus we may
1705           be able to re-use the pool constant for more calls.  */
1706        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1707        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1708        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1709        tcg_out32(s, 0);
1710    }
1711}
1712
1713static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1714                         const TCGHelperInfo *info)
1715{
1716    tcg_out_branch(s, 1, dest);
1717
1718#ifndef _WIN32
1719    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1720        /*
1721         * The sysv i386 abi for struct return places a reference as the
1722         * first argument on the stack, and pops that argument with the
1723         * return statement.  Since we want to retain the aligned stack
1724         * pointer for the callee, we do not want to actually push that
1725         * argument before the call but rely on the normal store to the
1726         * stack slot.  But we do need to compensate for the pop in order
1727         * to reset our correct stack pointer value.
1728         * Pushing a garbage value back onto the stack is quickest.
1729         */
1730        tcg_out_push(s, TCG_REG_EAX);
1731    }
1732#endif
1733}
1734
1735static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1736{
1737    tcg_out_branch(s, 0, dest);
1738}
1739
1740static void tcg_out_nopn(TCGContext *s, int n)
1741{
1742    int i;
1743    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1744     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1745     * duplicate prefix, and all of the interesting recent cores can
1746     * decode and discard the duplicates in a single cycle.
1747     */
1748    tcg_debug_assert(n >= 1);
1749    for (i = 1; i < n; ++i) {
1750        tcg_out8(s, 0x66);
1751    }
1752    tcg_out8(s, 0x90);
1753}
1754
1755/* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
1756static void __attribute__((unused))
1757tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
1758{
1759    /*
1760     * This is used for testing alignment, so we can usually use testb;
1761     * without REX only %al..%bl are byte-addressable, hence testl for %esi/%edi.
1762     */
1763    if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
1764        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
1765        tcg_out8(s, i);
1766    } else {
1767        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
1768        tcg_out32(s, i);
1769    }
1770}
1771
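/*
 * Decomposed host address for the fast path: base + index + ofs, with an
 * optional segment-override prefix in seg.  An index of -1 means none.
 */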
1772typedef struct {
1773    TCGReg base;
1774    int index;
1775    int ofs;
1776    int seg;
1777} HostAddress;
1778
1779bool tcg_target_has_memory_bswap(MemOp memop)
1780{
1781    return have_movbe;
1782}
1783
1784/*
1785 * Because i686 has no register parameters and because x86_64 has xchg
1786 * to handle addr/data register overlap, we have placed all input arguments
1787 * before we might need a scratch reg.
1788 *
1789 * Even then, a scratch is only needed for l->raddr.  Rather than expose
1790 * a general-purpose scratch when we don't actually know it's available,
1791 * use the ra_gen hook to load into RAX if needed.
1792 */
1793#if TCG_TARGET_REG_BITS == 64
1794static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1795{
1796    if (arg < 0) {
1797        arg = TCG_REG_RAX;
1798    }
1799    tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1800    return arg;
1801}
1802static const TCGLdstHelperParam ldst_helper_param = {
1803    .ra_gen = ldst_ra_gen
1804};
1805#else
1806static const TCGLdstHelperParam ldst_helper_param = { };
1807#endif
1808
1809/*
1810 * Generate code for the slow path for a load at the end of block
1811 */
1812static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1813{
1814    MemOp opc = get_memop(l->oi);
1815    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1816
1817    /* resolve label address */
1818    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1819    if (label_ptr[1]) {
1820        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1821    }
1822
1823    tcg_out_ld_helper_args(s, l, &ldst_helper_param);
1824    tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
1825    tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
1826
1827    tcg_out_jmp(s, l->raddr);
1828    return true;
1829}
1830
1831/*
1832 * Generate code for the slow path for a store at the end of block
1833 */
1834static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1835{
1836    MemOp opc = get_memop(l->oi);
1837    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1838
1839    /* resolve label address */
1840    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1841    if (label_ptr[1]) {
1842        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1843    }
1844
1845    tcg_out_st_helper_args(s, l, &ldst_helper_param);
1846    tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
1847
1848    tcg_out_jmp(s, l->raddr);
1849    return true;
1850}
1851
1852#ifndef CONFIG_SOFTMMU
1853static HostAddress x86_guest_base = {
1854    .index = -1
1855};
1856
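/*
 * Where the host OS allows it, point a segment base register (%gs) at
 * guest_base, so that guest memory can be reached with a single
 * segment-prefixed access instead of adding guest_base each time.
 */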
1857#if defined(__x86_64__) && defined(__linux__)
1858# include <asm/prctl.h>
1859# include <sys/prctl.h>
1860int arch_prctl(int code, unsigned long addr);
1861static inline int setup_guest_base_seg(void)
1862{
1863    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1864        return P_GS;
1865    }
1866    return 0;
1867}
1868#elif defined(__x86_64__) && \
1869      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
1870# include <machine/sysarch.h>
1871static inline int setup_guest_base_seg(void)
1872{
1873    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1874        return P_GS;
1875    }
1876    return 0;
1877}
1878#else
1879static inline int setup_guest_base_seg(void)
1880{
1881    return 0;
1882}
1883#endif /* setup_guest_base_seg */
1884#endif /* !SOFTMMU */
1885
1886/*
1887 * For softmmu, perform the TLB load and compare.
1888 * For useronly, perform any required alignment tests.
1889 * In both cases, return a TCGLabelQemuLdst structure if the slow path
1890 * is required and fill in @h with the host address for the fast path.
1891 */
1892static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
1893                                           TCGReg addrlo, TCGReg addrhi,
1894                                           MemOpIdx oi, bool is_ld)
1895{
1896    TCGLabelQemuLdst *ldst = NULL;
1897    MemOp opc = get_memop(oi);
1898    unsigned a_bits = get_alignment_bits(opc);
1899    unsigned a_mask = (1 << a_bits) - 1;
1900
1901#ifdef CONFIG_SOFTMMU
1902    int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
1903                        : offsetof(CPUTLBEntry, addr_write);
1904    TCGType ttype = TCG_TYPE_I32;
1905    TCGType tlbtype = TCG_TYPE_I32;
1906    int trexw = 0, hrexw = 0, tlbrexw = 0;
1907    unsigned mem_index = get_mmuidx(oi);
1908    unsigned s_bits = opc & MO_SIZE;
1909    unsigned s_mask = (1 << s_bits) - 1;
1910    target_ulong tlb_mask;
1911
1912    ldst = new_ldst_label(s);
1913    ldst->is_ld = is_ld;
1914    ldst->oi = oi;
1915    ldst->addrlo_reg = addrlo;
1916    ldst->addrhi_reg = addrhi;
1917
1918    if (TCG_TARGET_REG_BITS == 64) {
1919        if (TARGET_LONG_BITS == 64) {
1920            ttype = TCG_TYPE_I64;
1921            trexw = P_REXW;
1922        }
1923        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1924            hrexw = P_REXW;
1925            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
1926                tlbtype = TCG_TYPE_I64;
1927                tlbrexw = P_REXW;
1928            }
1929        }
1930    }
1931
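    /*
     * Compute the address of the fast-path TLB entry into L0: shift the
     * page number into place, mask with CPUTLBDescFast.mask (an index
     * mask pre-scaled by the entry size), and add CPUTLBDescFast.table.
     */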
1932    tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
1933    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
1934                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1935
1936    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
1937                         TLB_MASK_TABLE_OFS(mem_index) +
1938                         offsetof(CPUTLBDescFast, mask));
1939
1940    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
1941                         TLB_MASK_TABLE_OFS(mem_index) +
1942                         offsetof(CPUTLBDescFast, table));
1943
1944    /*
1945     * If the required alignment is at least as large as the access, simply
1946     * copy the address and mask.  Otherwise the LEA adds s_mask - a_mask, so
1947     * a page-crossing access lands in the next page and fails the TLB compare.
1948     */
1949    if (a_bits >= s_bits) {
1950        tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
1951    } else {
1952        tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
1953                             addrlo, s_mask - a_mask);
1954    }
1955    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1956    tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
1957
1958    /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
1959    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
1960                         TCG_REG_L1, TCG_REG_L0, cmp_ofs);
1961
1962    /* jne slow_path */
1963    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1964    ldst->label_ptr[0] = s->code_ptr;
1965    s->code_ptr += 4;
1966
1967    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1968        /* cmp 4(TCG_REG_L0), addrhi */
1969        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, TCG_REG_L0, cmp_ofs + 4);
1970
1971        /* jne slow_path */
1972        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1973        ldst->label_ptr[1] = s->code_ptr;
1974        s->code_ptr += 4;
1975    }
1976
1977    /* TLB Hit.  */
1978    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
1979               offsetof(CPUTLBEntry, addend));
1980
1981    *h = (HostAddress) {
1982        .base = addrlo,
1983        .index = TCG_REG_L0,
1984    };
1985#else
1986    if (a_bits) {
1987        ldst = new_ldst_label(s);
1988
1989        ldst->is_ld = is_ld;
1990        ldst->oi = oi;
1991        ldst->addrlo_reg = addrlo;
1992        ldst->addrhi_reg = addrhi;
1993
1994        tcg_out_testi(s, addrlo, a_mask);
1995        /* jne slow_path */
1996        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1997        ldst->label_ptr[0] = s->code_ptr;
1998        s->code_ptr += 4;
1999    }
2000
2001    *h = x86_guest_base;
2002    h->base = addrlo;
2003#endif
2004
2005    return ldst;
2006}
2007
2008static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2009                                   HostAddress h, TCGType type, MemOp memop)
2010{
2011    bool use_movbe = false;
2012    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
2013    int movop = OPC_MOVL_GvEv;
2014
2015    /* Do big-endian loads with movbe.  */
2016    if (memop & MO_BSWAP) {
2017        tcg_debug_assert(have_movbe);
2018        use_movbe = true;
2019        movop = OPC_MOVBE_GyMy;
2020    }
2021
2022    switch (memop & MO_SSIZE) {
2023    case MO_UB:
2024        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
2025                                 h.base, h.index, 0, h.ofs);
2026        break;
2027    case MO_SB:
2028        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
2029                                 h.base, h.index, 0, h.ofs);
2030        break;
2031    case MO_UW:
2032        if (use_movbe) {
2033            /* There is no extending movbe; only the low 16 bits are modified.  */
2034            if (datalo != h.base && datalo != h.index) {
2035                /* XOR breaks dependency chains.  */
2036                tgen_arithr(s, ARITH_XOR, datalo, datalo);
2037                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2038                                         datalo, h.base, h.index, 0, h.ofs);
2039            } else {
2040                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2041                                         datalo, h.base, h.index, 0, h.ofs);
2042                tcg_out_ext16u(s, datalo, datalo);
2043            }
2044        } else {
2045            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
2046                                     h.base, h.index, 0, h.ofs);
2047        }
2048        break;
2049    case MO_SW:
2050        if (use_movbe) {
2051            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2052                                     datalo, h.base, h.index, 0, h.ofs);
2053            tcg_out_ext16s(s, type, datalo, datalo);
2054        } else {
2055            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
2056                                     datalo, h.base, h.index, 0, h.ofs);
2057        }
2058        break;
2059    case MO_UL:
2060        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2061                                 h.base, h.index, 0, h.ofs);
2062        break;
2063#if TCG_TARGET_REG_BITS == 64
2064    case MO_SL:
2065        if (use_movbe) {
2066            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
2067                                     h.base, h.index, 0, h.ofs);
2068            tcg_out_ext32s(s, datalo, datalo);
2069        } else {
2070            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
2071                                     h.base, h.index, 0, h.ofs);
2072        }
2073        break;
2074#endif
2075    case MO_UQ:
2076        if (TCG_TARGET_REG_BITS == 64) {
2077            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2078                                     h.base, h.index, 0, h.ofs);
2079            break;
2080        }
2081        if (use_movbe) {
2082            TCGReg t = datalo;
2083            datalo = datahi;
2084            datahi = t;
2085        }
2086        if (h.base == datalo || h.index == datalo) {
2087            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
2088                                     h.base, h.index, 0, h.ofs);
2089            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
2090            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
2091        } else {
2092            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2093                                     h.base, h.index, 0, h.ofs);
2094            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2095                                     h.base, h.index, 0, h.ofs + 4);
2096        }
2097        break;
2098    default:
2099        g_assert_not_reached();
2100    }
2101}
2102
2103static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2104                            TCGReg addrlo, TCGReg addrhi,
2105                            MemOpIdx oi, TCGType data_type)
2106{
2107    TCGLabelQemuLdst *ldst;
2108    HostAddress h;
2109
2110    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2111    tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2112
2113    if (ldst) {
2114        ldst->type = data_type;
2115        ldst->datalo_reg = datalo;
2116        ldst->datahi_reg = datahi;
2117        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2118    }
2119}
2120
2121static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2122                                   HostAddress h, MemOp memop)
2123{
2124    bool use_movbe = false;
2125    int movop = OPC_MOVL_EvGv;
2126
2127    /*
2128     * Do big-endian stores with movbe or softmmu.
2129     * User-only without movbe will have its swapping done generically.
2130     */
2131    if (memop & MO_BSWAP) {
2132        tcg_debug_assert(have_movbe);
2133        use_movbe = true;
2134        movop = OPC_MOVBE_MyGy;
2135    }
2136
2137    switch (memop & MO_SIZE) {
2138    case MO_8:
2139        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2140        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2141        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
2142                                 datalo, h.base, h.index, 0, h.ofs);
2143        break;
2144    case MO_16:
2145        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
2146                                 h.base, h.index, 0, h.ofs);
2147        break;
2148    case MO_32:
2149        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2150                                 h.base, h.index, 0, h.ofs);
2151        break;
2152    case MO_64:
2153        if (TCG_TARGET_REG_BITS == 64) {
2154            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2155                                     h.base, h.index, 0, h.ofs);
2156        } else {
2157            if (use_movbe) {
2158                TCGReg t = datalo;
2159                datalo = datahi;
2160                datahi = t;
2161            }
2162            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2163                                     h.base, h.index, 0, h.ofs);
2164            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2165                                     h.base, h.index, 0, h.ofs + 4);
2166        }
2167        break;
2168    default:
2169        g_assert_not_reached();
2170    }
2171}
2172
2173static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2174                            TCGReg addrlo, TCGReg addrhi,
2175                            MemOpIdx oi, TCGType data_type)
2176{
2177    TCGLabelQemuLdst *ldst;
2178    HostAddress h;
2179
2180    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2181    tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2182
2183    if (ldst) {
2184        ldst->type = data_type;
2185        ldst->datalo_reg = datalo;
2186        ldst->datahi_reg = datahi;
2187        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2188    }
2189}
2190
2191static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2192{
2193    /* Reuse the zeroing that exists for goto_ptr.  */
2194    if (a0 == 0) {
2195        tcg_out_jmp(s, tcg_code_gen_epilogue);
2196    } else {
2197        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2198        tcg_out_jmp(s, tb_ret_addr);
2199    }
2200}
2201
2202static void tcg_out_goto_tb(TCGContext *s, int which)
2203{
2204    /*
2205     * The 4-byte jump displacement must be aligned for atomic patching;
2206     * it begins one byte past the JMP opcode, hence aligning code_ptr + 1.
2207     */
2208    int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2209    if (gap != 1) {
2210        tcg_out_nopn(s, gap - 1);
2211    }
2212    tcg_out8(s, OPC_JMP_long); /* jmp im */
2213    set_jmp_insn_offset(s, which);
2214    tcg_out32(s, 0);
2215    set_jmp_reset_offset(s, which);
2216}
2217
2218void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2219                              uintptr_t jmp_rx, uintptr_t jmp_rw)
2220{
2221    /* patch the branch destination */
2222    uintptr_t addr = tb->jmp_target_addr[n];
2223    qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2224    /* no need to flush icache explicitly */
2225}
2226
2227static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2228                              const TCGArg args[TCG_MAX_OP_ARGS],
2229                              const int const_args[TCG_MAX_OP_ARGS])
2230{
2231    TCGArg a0, a1, a2;
2232    int c, const_a2, vexop, rexw = 0;
2233
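/*
 * OP_32_64 expands to case labels for both the _i32 and _i64 variants
 * of an opcode, setting REXW for the 64-bit form.
 */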
2234#if TCG_TARGET_REG_BITS == 64
2235# define OP_32_64(x) \
2236        case glue(glue(INDEX_op_, x), _i64): \
2237            rexw = P_REXW; /* FALLTHRU */    \
2238        case glue(glue(INDEX_op_, x), _i32)
2239#else
2240# define OP_32_64(x) \
2241        case glue(glue(INDEX_op_, x), _i32)
2242#endif
2243
2244    /* Hoist the loads of the most common arguments.  */
2245    a0 = args[0];
2246    a1 = args[1];
2247    a2 = args[2];
2248    const_a2 = const_args[2];
2249
2250    switch (opc) {
2251    case INDEX_op_goto_ptr:
2252        /* jmp to the given host address (could be epilogue) */
2253        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2254        break;
2255    case INDEX_op_br:
2256        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2257        break;
2258    OP_32_64(ld8u):
2259        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2260        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2261        break;
2262    OP_32_64(ld8s):
2263        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2264        break;
2265    OP_32_64(ld16u):
2266        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2267        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2268        break;
2269    OP_32_64(ld16s):
2270        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2271        break;
2272#if TCG_TARGET_REG_BITS == 64
2273    case INDEX_op_ld32u_i64:
2274#endif
2275    case INDEX_op_ld_i32:
2276        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2277        break;
2278
2279    OP_32_64(st8):
2280        if (const_args[0]) {
2281            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2282            tcg_out8(s, a0);
2283        } else {
2284            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2285        }
2286        break;
2287    OP_32_64(st16):
2288        if (const_args[0]) {
2289            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2290            tcg_out16(s, a0);
2291        } else {
2292            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2293        }
2294        break;
2295#if TCG_TARGET_REG_BITS == 64
2296    case INDEX_op_st32_i64:
2297#endif
2298    case INDEX_op_st_i32:
2299        if (const_args[0]) {
2300            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2301            tcg_out32(s, a0);
2302        } else {
2303            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2304        }
2305        break;
2306
2307    OP_32_64(add):
2308        /* For 3-operand addition, use LEA, e.g. add $c,a1,a0 -> lea c(a1),a0.  */
2309        if (a0 != a1) {
2310            TCGArg c3 = 0;
2311            if (const_a2) {
2312                c3 = a2, a2 = -1;
2313            } else if (a0 == a2) {
2314                /* Watch out for dest = src + dest, since we've removed
2315                   the matching constraint on the add.  */
2316                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2317                break;
2318            }
2319
2320            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2321            break;
2322        }
2323        c = ARITH_ADD;
2324        goto gen_arith;
2325    OP_32_64(sub):
2326        c = ARITH_SUB;
2327        goto gen_arith;
2328    OP_32_64(and):
2329        c = ARITH_AND;
2330        goto gen_arith;
2331    OP_32_64(or):
2332        c = ARITH_OR;
2333        goto gen_arith;
2334    OP_32_64(xor):
2335        c = ARITH_XOR;
2336        goto gen_arith;
2337    gen_arith:
2338        if (const_a2) {
2339            tgen_arithi(s, c + rexw, a0, a2, 0);
2340        } else {
2341            tgen_arithr(s, c + rexw, a0, a2);
2342        }
2343        break;
2344
2345    OP_32_64(andc):
2346        if (const_a2) {
2347            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2348            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2349        } else {
2350            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2351        }
2352        break;
2353
2354    OP_32_64(mul):
2355        if (const_a2) {
2356            int32_t val;
2357            val = a2;
2358            if (val == (int8_t)val) {
2359                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2360                tcg_out8(s, val);
2361            } else {
2362                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2363                tcg_out32(s, val);
2364            }
2365        } else {
2366            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2367        }
2368        break;
2369
2370    OP_32_64(div2):
2371        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2372        break;
2373    OP_32_64(divu2):
2374        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2375        break;
2376
2377    OP_32_64(shl):
2378        /* For small constant 3-operand shift, use LEA.  */
2379        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2380            if (a2 - 1 == 0) {
2381                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2382                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2383            } else {
2384                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2385                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2386            }
2387            break;
2388        }
2389        c = SHIFT_SHL;
2390        vexop = OPC_SHLX;
2391        goto gen_shift_maybe_vex;
2392    OP_32_64(shr):
2393        c = SHIFT_SHR;
2394        vexop = OPC_SHRX;
2395        goto gen_shift_maybe_vex;
2396    OP_32_64(sar):
2397        c = SHIFT_SAR;
2398        vexop = OPC_SARX;
2399        goto gen_shift_maybe_vex;
2400    OP_32_64(rotl):
2401        c = SHIFT_ROL;
2402        goto gen_shift;
2403    OP_32_64(rotr):
2404        c = SHIFT_ROR;
2405        goto gen_shift;
2406    gen_shift_maybe_vex:
2407        if (have_bmi2) {
2408            if (!const_a2) {
2409                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2410                break;
2411            }
2412            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2413        }
2414        /* FALLTHRU */
2415    gen_shift:
2416        if (const_a2) {
2417            tcg_out_shifti(s, c + rexw, a0, a2);
2418        } else {
2419            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2420        }
2421        break;
2422
2423    OP_32_64(ctz):
2424        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2425        break;
2426    OP_32_64(clz):
2427        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2428        break;
2429    OP_32_64(ctpop):
2430        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2431        break;
2432
2433    case INDEX_op_brcond_i32:
2434        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2435        break;
2436    case INDEX_op_setcond_i32:
2437        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2438        break;
2439    case INDEX_op_movcond_i32:
2440        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2441        break;
2442
2443    OP_32_64(bswap16):
2444        if (a2 & TCG_BSWAP_OS) {
2445            /* Output must be sign-extended. */
2446            if (rexw) {
2447                tcg_out_bswap64(s, a0);
2448                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2449            } else {
2450                tcg_out_bswap32(s, a0);
2451                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2452            }
2453        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2454            /* Output must be zero-extended, but input isn't. */
2455            tcg_out_bswap32(s, a0);
2456            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2457        } else {
2458            tcg_out_rolw_8(s, a0);
2459        }
2460        break;
2461    OP_32_64(bswap32):
2462        tcg_out_bswap32(s, a0);
2463        if (rexw && (a2 & TCG_BSWAP_OS)) {
2464            tcg_out_ext32s(s, a0, a0);
2465        }
2466        break;
2467
2468    OP_32_64(neg):
2469        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2470        break;
2471    OP_32_64(not):
2472        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2473        break;
2474
2475    case INDEX_op_qemu_ld_i32:
2476        if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
2477            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2478        } else {
2479            tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2480        }
2481        break;
2482    case INDEX_op_qemu_ld_i64:
2483        if (TCG_TARGET_REG_BITS == 64) {
2484            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2485        } else if (TARGET_LONG_BITS == 32) {
2486            tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2487        } else {
2488            tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2489        }
2490        break;
2491    case INDEX_op_qemu_st_i32:
2492    case INDEX_op_qemu_st8_i32:
2493        if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
2494            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2495        } else {
2496            tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2497        }
2498        break;
2499    case INDEX_op_qemu_st_i64:
2500        if (TCG_TARGET_REG_BITS == 64) {
2501            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2502        } else if (TARGET_LONG_BITS == 32) {
2503            tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2504        } else {
2505            tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2506        }
2507        break;
2508
2509    OP_32_64(mulu2):
2510        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2511        break;
2512    OP_32_64(muls2):
2513        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2514        break;
2515    OP_32_64(add2):
2516        if (const_args[4]) {
2517            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2518        } else {
2519            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2520        }
2521        if (const_args[5]) {
2522            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2523        } else {
2524            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2525        }
2526        break;
2527    OP_32_64(sub2):
2528        if (const_args[4]) {
2529            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2530        } else {
2531            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2532        }
2533        if (const_args[5]) {
2534            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2535        } else {
2536            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2537        }
2538        break;
2539
2540#if TCG_TARGET_REG_BITS == 32
2541    case INDEX_op_brcond2_i32:
2542        tcg_out_brcond2(s, args, const_args, 0);
2543        break;
2544    case INDEX_op_setcond2_i32:
2545        tcg_out_setcond2(s, args, const_args);
2546        break;
2547#else /* TCG_TARGET_REG_BITS == 64 */
2548    case INDEX_op_ld32s_i64:
2549        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2550        break;
2551    case INDEX_op_ld_i64:
2552        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2553        break;
2554    case INDEX_op_st_i64:
2555        if (const_args[0]) {
2556            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2557            tcg_out32(s, a0);
2558        } else {
2559            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2560        }
2561        break;
2562
2563    case INDEX_op_brcond_i64:
2564        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2565        break;
2566    case INDEX_op_setcond_i64:
2567        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2568        break;
2569    case INDEX_op_movcond_i64:
2570        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2571        break;
2572
2573    case INDEX_op_bswap64_i64:
2574        tcg_out_bswap64(s, a0);
2575        break;
2576    case INDEX_op_extrh_i64_i32:
2577        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2578        break;
2579#endif
2580
2581    OP_32_64(deposit):
2582        if (args[3] == 0 && args[4] == 8) {
2583            /* load bits 0..7 */
2584            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2585        } else if (args[3] == 8 && args[4] == 8) {
2586            /* load bits 8..15; %ah..%bh encode as the low register + 4 */
2587            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2588        } else if (args[3] == 0 && args[4] == 16) {
2589            /* load bits 0..15 */
2590            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2591        } else {
2592            g_assert_not_reached();
2593        }
2594        break;
2595
2596    case INDEX_op_extract_i64:
2597        if (a2 + args[3] == 32) {
2598            /* This is a 32-bit zero-extending right shift.  */
2599            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2600            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2601            break;
2602        }
2603        /* FALLTHRU */
2604    case INDEX_op_extract_i32:
2605        /* On the off-chance that we can use the high-byte registers.
2606           Otherwise we emit the same ext16 + shift pattern that we
2607           would have gotten from the normal tcg-op.c expansion.  */
2608        tcg_debug_assert(a2 == 8 && args[3] == 8);
2609        if (a1 < 4 && a0 < 8) {
2610            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2611        } else {
2612            tcg_out_ext16u(s, a0, a1);
2613            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2614        }
2615        break;
2616
2617    case INDEX_op_sextract_i32:
2618        /* We don't implement sextract_i64, as we cannot sign-extend to
2619           64 bits without using the REX prefix that explicitly excludes
2620           access to the high-byte registers.  */
2621        tcg_debug_assert(a2 == 8 && args[3] == 8);
2622        if (a1 < 4 && a0 < 8) {
2623            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2624        } else {
2625            tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2626            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2627        }
2628        break;
2629
2630    OP_32_64(extract2):
2631        /* Note that SHRD outputs to the r/m operand.  */
2632        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2633        tcg_out8(s, args[3]);
2634        break;
2635
2636    case INDEX_op_mb:
2637        tcg_out_mb(s, a0);
2638        break;
2639    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2640    case INDEX_op_mov_i64:
2641    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2642    case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
2643    case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
2644    case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
2645    case INDEX_op_ext8s_i64:
2646    case INDEX_op_ext8u_i32:
2647    case INDEX_op_ext8u_i64:
2648    case INDEX_op_ext16s_i32:
2649    case INDEX_op_ext16s_i64:
2650    case INDEX_op_ext16u_i32:
2651    case INDEX_op_ext16u_i64:
2652    case INDEX_op_ext32s_i64:
2653    case INDEX_op_ext32u_i64:
2654    case INDEX_op_ext_i32_i64:
2655    case INDEX_op_extu_i32_i64:
2656    case INDEX_op_extrl_i64_i32:
2657    default:
2658        g_assert_not_reached();
2659    }
2660
2661#undef OP_32_64
2662}
2663
2664static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2665                           unsigned vecl, unsigned vece,
2666                           const TCGArg args[TCG_MAX_OP_ARGS],
2667                           const int const_args[TCG_MAX_OP_ARGS])
2668{
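    /*
     * In the tables below, OPC_UD2 marks element sizes for which no
     * encoding exists; gen_simd asserts that it is never selected.
     */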
2669    static int const add_insn[4] = {
2670        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2671    };
2672    static int const ssadd_insn[4] = {
2673        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2674    };
2675    static int const usadd_insn[4] = {
2676        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2677    };
2678    static int const sub_insn[4] = {
2679        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2680    };
2681    static int const sssub_insn[4] = {
2682        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2683    };
2684    static int const ussub_insn[4] = {
2685        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2686    };
2687    static int const mul_insn[4] = {
2688        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
2689    };
2690    static int const shift_imm_insn[4] = {
2691        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2692    };
2693    static int const cmpeq_insn[4] = {
2694        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2695    };
2696    static int const cmpgt_insn[4] = {
2697        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2698    };
2699    static int const punpckl_insn[4] = {
2700        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2701    };
2702    static int const punpckh_insn[4] = {
2703        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2704    };
2705    static int const packss_insn[4] = {
2706        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2707    };
2708    static int const packus_insn[4] = {
2709        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2710    };
2711    static int const smin_insn[4] = {
2712        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
2713    };
2714    static int const smax_insn[4] = {
2715        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
2716    };
2717    static int const umin_insn[4] = {
2718        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
2719    };
2720    static int const umax_insn[4] = {
2721        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
2722    };
2723    static int const rotlv_insn[4] = {
2724        OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
2725    };
2726    static int const rotrv_insn[4] = {
2727        OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
2728    };
2729    static int const shlv_insn[4] = {
2730        OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
2731    };
2732    static int const shrv_insn[4] = {
2733        OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
2734    };
2735    static int const sarv_insn[4] = {
2736        OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
2737    };
2738    static int const shls_insn[4] = {
2739        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2740    };
2741    static int const shrs_insn[4] = {
2742        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2743    };
2744    static int const sars_insn[4] = {
2745        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
2746    };
2747    static int const vpshldi_insn[4] = {
2748        OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
2749    };
2750    static int const vpshldv_insn[4] = {
2751        OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
2752    };
2753    static int const vpshrdv_insn[4] = {
2754        OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
2755    };
2756    static int const abs_insn[4] = {
2757        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
2758    };
2759
2760    TCGType type = vecl + TCG_TYPE_V64;
2761    int insn, sub;
2762    TCGArg a0, a1, a2, a3;
2763
2764    a0 = args[0];
2765    a1 = args[1];
2766    a2 = args[2];
2767
2768    switch (opc) {
2769    case INDEX_op_add_vec:
2770        insn = add_insn[vece];
2771        goto gen_simd;
2772    case INDEX_op_ssadd_vec:
2773        insn = ssadd_insn[vece];
2774        goto gen_simd;
2775    case INDEX_op_usadd_vec:
2776        insn = usadd_insn[vece];
2777        goto gen_simd;
2778    case INDEX_op_sub_vec:
2779        insn = sub_insn[vece];
2780        goto gen_simd;
2781    case INDEX_op_sssub_vec:
2782        insn = sssub_insn[vece];
2783        goto gen_simd;
2784    case INDEX_op_ussub_vec:
2785        insn = ussub_insn[vece];
2786        goto gen_simd;
2787    case INDEX_op_mul_vec:
2788        insn = mul_insn[vece];
2789        goto gen_simd;
2790    case INDEX_op_and_vec:
2791        insn = OPC_PAND;
2792        goto gen_simd;
2793    case INDEX_op_or_vec:
2794        insn = OPC_POR;
2795        goto gen_simd;
2796    case INDEX_op_xor_vec:
2797        insn = OPC_PXOR;
2798        goto gen_simd;
2799    case INDEX_op_smin_vec:
2800        insn = smin_insn[vece];
2801        goto gen_simd;
2802    case INDEX_op_umin_vec:
2803        insn = umin_insn[vece];
2804        goto gen_simd;
2805    case INDEX_op_smax_vec:
2806        insn = smax_insn[vece];
2807        goto gen_simd;
2808    case INDEX_op_umax_vec:
2809        insn = umax_insn[vece];
2810        goto gen_simd;
2811    case INDEX_op_shlv_vec:
2812        insn = shlv_insn[vece];
2813        goto gen_simd;
2814    case INDEX_op_shrv_vec:
2815        insn = shrv_insn[vece];
2816        goto gen_simd;
2817    case INDEX_op_sarv_vec:
2818        insn = sarv_insn[vece];
2819        goto gen_simd;
2820    case INDEX_op_rotlv_vec:
2821        insn = rotlv_insn[vece];
2822        goto gen_simd;
2823    case INDEX_op_rotrv_vec:
2824        insn = rotrv_insn[vece];
2825        goto gen_simd;
2826    case INDEX_op_shls_vec:
2827        insn = shls_insn[vece];
2828        goto gen_simd;
2829    case INDEX_op_shrs_vec:
2830        insn = shrs_insn[vece];
2831        goto gen_simd;
2832    case INDEX_op_sars_vec:
2833        insn = sars_insn[vece];
2834        goto gen_simd;
2835    case INDEX_op_x86_punpckl_vec:
2836        insn = punpckl_insn[vece];
2837        goto gen_simd;
2838    case INDEX_op_x86_punpckh_vec:
2839        insn = punpckh_insn[vece];
2840        goto gen_simd;
2841    case INDEX_op_x86_packss_vec:
2842        insn = packss_insn[vece];
2843        goto gen_simd;
2844    case INDEX_op_x86_packus_vec:
2845        insn = packus_insn[vece];
2846        goto gen_simd;
2847    case INDEX_op_x86_vpshldv_vec:
2848        insn = vpshldv_insn[vece];
2849        a1 = a2;
2850        a2 = args[3];
2851        goto gen_simd;
2852    case INDEX_op_x86_vpshrdv_vec:
2853        insn = vpshrdv_insn[vece];
2854        a1 = a2;
2855        a2 = args[3];
2856        goto gen_simd;
2857#if TCG_TARGET_REG_BITS == 32
2858    case INDEX_op_dup2_vec:
2859        /* First merge the two 32-bit inputs to a single 64-bit element. */
2860        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
2861        /* Then replicate the 64-bit elements across the rest of the vector. */
2862        if (type != TCG_TYPE_V64) {
2863            tcg_out_dup_vec(s, type, MO_64, a0, a0);
2864        }
2865        break;
2866#endif
2867    case INDEX_op_abs_vec:
2868        insn = abs_insn[vece];
2869        a2 = a1;
2870        a1 = 0;
2871        goto gen_simd;
2872    gen_simd:
2873        tcg_debug_assert(insn != OPC_UD2);
2874        if (type == TCG_TYPE_V256) {
2875            insn |= P_VEXL;
2876        }
2877        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2878        break;
2879
2880    case INDEX_op_cmp_vec:
2881        sub = args[3];
2882        if (sub == TCG_COND_EQ) {
2883            insn = cmpeq_insn[vece];
2884        } else if (sub == TCG_COND_GT) {
2885            insn = cmpgt_insn[vece];
2886        } else {
2887            g_assert_not_reached();
2888        }
2889        goto gen_simd;
2890
2891    case INDEX_op_andc_vec:
2892        insn = OPC_PANDN;
2893        if (type == TCG_TYPE_V256) {
2894            insn |= P_VEXL;
2895        }
2896        tcg_out_vex_modrm(s, insn, a0, a2, a1);
2897        break;
2898
2899    case INDEX_op_shli_vec:
2900        insn = shift_imm_insn[vece];
2901        sub = 6;
2902        goto gen_shift;
2903    case INDEX_op_shri_vec:
2904        insn = shift_imm_insn[vece];
2905        sub = 2;
2906        goto gen_shift;
2907    case INDEX_op_sari_vec:
2908        if (vece == MO_64) {
2909            insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
2910        } else {
2911            insn = shift_imm_insn[vece];
2912        }
2913        sub = 4;
2914        goto gen_shift;
2915    case INDEX_op_rotli_vec:
2916        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
2917        if (vece == MO_64) {
2918            insn |= P_VEXW;
2919        }
2920        sub = 1;
2921        goto gen_shift;
2922    gen_shift:
2923        tcg_debug_assert(vece != MO_8);
2924        if (type == TCG_TYPE_V256) {
2925            insn |= P_VEXL;
2926        }
2927        tcg_out_vex_modrm(s, insn, sub, a0, a1);
2928        tcg_out8(s, a2);
2929        break;
2930
2931    case INDEX_op_ld_vec:
2932        tcg_out_ld(s, type, a0, a1, a2);
2933        break;
2934    case INDEX_op_st_vec:
2935        tcg_out_st(s, type, a0, a1, a2);
2936        break;
2937    case INDEX_op_dupm_vec:
2938        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2939        break;
2940
2941    case INDEX_op_x86_shufps_vec:
2942        insn = OPC_SHUFPS;
2943        sub = args[3];
2944        goto gen_simd_imm8;
2945    case INDEX_op_x86_blend_vec:
2946        if (vece == MO_16) {
2947            insn = OPC_PBLENDW;
2948        } else if (vece == MO_32) {
2949            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2950        } else {
2951            g_assert_not_reached();
2952        }
2953        sub = args[3];
2954        goto gen_simd_imm8;
2955    case INDEX_op_x86_vperm2i128_vec:
2956        insn = OPC_VPERM2I128;
2957        sub = args[3];
2958        goto gen_simd_imm8;
2959    case INDEX_op_x86_vpshldi_vec:
2960        insn = vpshldi_insn[vece];
2961        sub = args[3];
2962        goto gen_simd_imm8;
2963
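    /*
     * The VPTERNLOGQ cases below compute an arbitrary 3-input boolean
     * function of A (the destination), B and C: bit (a*4 + b*2 + c) of
     * the imm8 supplies the result for inputs (a, b, c), so e.g.
     * 0x33 = !B and 0xca = A ? B : C.
     */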
2964    case INDEX_op_not_vec:
2965        insn = OPC_VPTERNLOGQ;
2966        a2 = a1;
2967        sub = 0x33; /* !B */
2968        goto gen_simd_imm8;
2969    case INDEX_op_nor_vec:
2970        insn = OPC_VPTERNLOGQ;
2971        sub = 0x11; /* norCB */
2972        goto gen_simd_imm8;
2973    case INDEX_op_nand_vec:
2974        insn = OPC_VPTERNLOGQ;
2975        sub = 0x77; /* nandCB */
2976        goto gen_simd_imm8;
2977    case INDEX_op_eqv_vec:
2978        insn = OPC_VPTERNLOGQ;
2979        sub = 0x99; /* xnorCB */
2980        goto gen_simd_imm8;
2981    case INDEX_op_orc_vec:
2982        insn = OPC_VPTERNLOGQ;
2983        sub = 0xdd; /* orB!C */
2984        goto gen_simd_imm8;
2985
2986    case INDEX_op_bitsel_vec:
2987        insn = OPC_VPTERNLOGQ;
2988        a3 = args[3];
2989        if (a0 == a1) {
2990            a1 = a2;
2991            a2 = a3;
2992            sub = 0xca; /* A?B:C */
2993        } else if (a0 == a2) {
2994            a2 = a3;
2995            sub = 0xe2; /* B?A:C */
2996        } else {
2997            tcg_out_mov(s, type, a0, a3);
2998            sub = 0xb8; /* B?C:A */
2999        }
3000        goto gen_simd_imm8;
3001
3002    gen_simd_imm8:
3003        tcg_debug_assert(insn != OPC_UD2);
3004        if (type == TCG_TYPE_V256) {
3005            insn |= P_VEXL;
3006        }
3007        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3008        tcg_out8(s, sub);
3009        break;
3010
3011    case INDEX_op_x86_vpblendvb_vec:
3012        insn = OPC_VPBLENDVB;
3013        if (type == TCG_TYPE_V256) {
3014            insn |= P_VEXL;
3015        }
3016        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3017        tcg_out8(s, args[3] << 4);
3018        break;
3019
3020    case INDEX_op_x86_psrldq_vec:
3021        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3022        tcg_out8(s, a2);
3023        break;
3024
3025    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3026    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3027    default:
3028        g_assert_not_reached();
3029    }
3030}
3031
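/*
 * Constraint sets are named C_O<nb_outputs>_I<nb_inputs>(...), with one
 * constraint letter per output and then per input operand; the letters
 * are defined in tcg-target-con-str.h and the sets in tcg-target-con-set.h.
 */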
3032static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3033{
3034    switch (op) {
3035    case INDEX_op_goto_ptr:
3036        return C_O0_I1(r);
3037
3038    case INDEX_op_ld8u_i32:
3039    case INDEX_op_ld8u_i64:
3040    case INDEX_op_ld8s_i32:
3041    case INDEX_op_ld8s_i64:
3042    case INDEX_op_ld16u_i32:
3043    case INDEX_op_ld16u_i64:
3044    case INDEX_op_ld16s_i32:
3045    case INDEX_op_ld16s_i64:
3046    case INDEX_op_ld_i32:
3047    case INDEX_op_ld32u_i64:
3048    case INDEX_op_ld32s_i64:
3049    case INDEX_op_ld_i64:
3050        return C_O1_I1(r, r);
3051
3052    case INDEX_op_st8_i32:
3053    case INDEX_op_st8_i64:
3054        return C_O0_I2(qi, r);
3055
3056    case INDEX_op_st16_i32:
3057    case INDEX_op_st16_i64:
3058    case INDEX_op_st_i32:
3059    case INDEX_op_st32_i64:
3060        return C_O0_I2(ri, r);
3061
3062    case INDEX_op_st_i64:
3063        return C_O0_I2(re, r);
3064
3065    case INDEX_op_add_i32:
3066    case INDEX_op_add_i64:
3067        return C_O1_I2(r, r, re);
3068
3069    case INDEX_op_sub_i32:
3070    case INDEX_op_sub_i64:
3071    case INDEX_op_mul_i32:
3072    case INDEX_op_mul_i64:
3073    case INDEX_op_or_i32:
3074    case INDEX_op_or_i64:
3075    case INDEX_op_xor_i32:
3076    case INDEX_op_xor_i64:
3077        return C_O1_I2(r, 0, re);
3078
3079    case INDEX_op_and_i32:
3080    case INDEX_op_and_i64:
3081        return C_O1_I2(r, 0, reZ);
3082
3083    case INDEX_op_andc_i32:
3084    case INDEX_op_andc_i64:
3085        return C_O1_I2(r, r, rI);
3086
3087    case INDEX_op_shl_i32:
3088    case INDEX_op_shl_i64:
3089    case INDEX_op_shr_i32:
3090    case INDEX_op_shr_i64:
3091    case INDEX_op_sar_i32:
3092    case INDEX_op_sar_i64:
3093        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3094
3095    case INDEX_op_rotl_i32:
3096    case INDEX_op_rotl_i64:
3097    case INDEX_op_rotr_i32:
3098    case INDEX_op_rotr_i64:
3099        return C_O1_I2(r, 0, ci);
3100
3101    case INDEX_op_brcond_i32:
3102    case INDEX_op_brcond_i64:
3103        return C_O0_I2(r, re);
3104
3105    case INDEX_op_bswap16_i32:
3106    case INDEX_op_bswap16_i64:
3107    case INDEX_op_bswap32_i32:
3108    case INDEX_op_bswap32_i64:
3109    case INDEX_op_bswap64_i64:
3110    case INDEX_op_neg_i32:
3111    case INDEX_op_neg_i64:
3112    case INDEX_op_not_i32:
3113    case INDEX_op_not_i64:
3114    case INDEX_op_extrh_i64_i32:
3115        return C_O1_I1(r, 0);
3116
3117    case INDEX_op_ext8s_i32:
3118    case INDEX_op_ext8s_i64:
3119    case INDEX_op_ext8u_i32:
3120    case INDEX_op_ext8u_i64:
3121        return C_O1_I1(r, q);
3122
3123    case INDEX_op_ext16s_i32:
3124    case INDEX_op_ext16s_i64:
3125    case INDEX_op_ext16u_i32:
3126    case INDEX_op_ext16u_i64:
3127    case INDEX_op_ext32s_i64:
3128    case INDEX_op_ext32u_i64:
3129    case INDEX_op_ext_i32_i64:
3130    case INDEX_op_extu_i32_i64:
3131    case INDEX_op_extrl_i64_i32:
3132    case INDEX_op_extract_i32:
3133    case INDEX_op_extract_i64:
3134    case INDEX_op_sextract_i32:
3135    case INDEX_op_ctpop_i32:
3136    case INDEX_op_ctpop_i64:
3137        return C_O1_I1(r, r);
3138
3139    case INDEX_op_extract2_i32:
3140    case INDEX_op_extract2_i64:
3141        return C_O1_I2(r, 0, r);
3142
3143    case INDEX_op_deposit_i32:
3144    case INDEX_op_deposit_i64:
3145        return C_O1_I2(Q, 0, Q);
3146
3147    case INDEX_op_setcond_i32:
3148    case INDEX_op_setcond_i64:
3149        return C_O1_I2(q, r, re);
3150
3151    case INDEX_op_movcond_i32:
3152    case INDEX_op_movcond_i64:
3153        return C_O1_I4(r, r, re, r, 0);
3154
3155    case INDEX_op_div2_i32:
3156    case INDEX_op_div2_i64:
3157    case INDEX_op_divu2_i32:
3158    case INDEX_op_divu2_i64:
3159        return C_O2_I3(a, d, 0, 1, r);
3160
3161    case INDEX_op_mulu2_i32:
3162    case INDEX_op_mulu2_i64:
3163    case INDEX_op_muls2_i32:
3164    case INDEX_op_muls2_i64:
3165        return C_O2_I2(a, d, a, r);
3166
3167    case INDEX_op_add2_i32:
3168    case INDEX_op_add2_i64:
3169    case INDEX_op_sub2_i32:
3170    case INDEX_op_sub2_i64:
3171        return C_O2_I4(r, r, 0, 1, re, re);
3172
3173    case INDEX_op_ctz_i32:
3174    case INDEX_op_ctz_i64:
3175        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3176
3177    case INDEX_op_clz_i32:
3178    case INDEX_op_clz_i64:
3179        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3180
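    /* 'L' restricts inputs to registers that survive the softmmu TLB
       lookup, and 's' additionally requires a byte-addressable register;
       see tcg-target-con-str.h.  */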
3181    case INDEX_op_qemu_ld_i32:
3182        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3183                ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
3184
3185    case INDEX_op_qemu_st_i32:
3186        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3187                ? C_O0_I2(L, L) : C_O0_I3(L, L, L));
3188    case INDEX_op_qemu_st8_i32:
3189        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3190                ? C_O0_I2(s, L) : C_O0_I3(s, L, L));
3191
3192    case INDEX_op_qemu_ld_i64:
3193        return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
3194                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
3195                : C_O2_I2(r, r, L, L));
3196
3197    case INDEX_op_qemu_st_i64:
3198        return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L)
3199                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L)
3200                : C_O0_I4(L, L, L, L));
3201
3202    case INDEX_op_brcond2_i32:
3203        return C_O0_I4(r, r, ri, ri);
3204
3205    case INDEX_op_setcond2_i32:
3206        return C_O1_I4(r, r, r, ri, ri);
3207
3208    case INDEX_op_ld_vec:
3209    case INDEX_op_dupm_vec:
3210        return C_O1_I1(x, r);
3211
3212    case INDEX_op_st_vec:
3213        return C_O0_I2(x, r);
3214
3215    case INDEX_op_add_vec:
3216    case INDEX_op_sub_vec:
3217    case INDEX_op_mul_vec:
3218    case INDEX_op_and_vec:
3219    case INDEX_op_or_vec:
3220    case INDEX_op_xor_vec:
3221    case INDEX_op_andc_vec:
3222    case INDEX_op_orc_vec:
3223    case INDEX_op_nand_vec:
3224    case INDEX_op_nor_vec:
3225    case INDEX_op_eqv_vec:
3226    case INDEX_op_ssadd_vec:
3227    case INDEX_op_usadd_vec:
3228    case INDEX_op_sssub_vec:
3229    case INDEX_op_ussub_vec:
3230    case INDEX_op_smin_vec:
3231    case INDEX_op_umin_vec:
3232    case INDEX_op_smax_vec:
3233    case INDEX_op_umax_vec:
3234    case INDEX_op_shlv_vec:
3235    case INDEX_op_shrv_vec:
3236    case INDEX_op_sarv_vec:
3237    case INDEX_op_rotlv_vec:
3238    case INDEX_op_rotrv_vec:
3239    case INDEX_op_shls_vec:
3240    case INDEX_op_shrs_vec:
3241    case INDEX_op_sars_vec:
3242    case INDEX_op_cmp_vec:
3243    case INDEX_op_x86_shufps_vec:
3244    case INDEX_op_x86_blend_vec:
3245    case INDEX_op_x86_packss_vec:
3246    case INDEX_op_x86_packus_vec:
3247    case INDEX_op_x86_vperm2i128_vec:
3248    case INDEX_op_x86_punpckl_vec:
3249    case INDEX_op_x86_punpckh_vec:
3250    case INDEX_op_x86_vpshldi_vec:
3251#if TCG_TARGET_REG_BITS == 32
3252    case INDEX_op_dup2_vec:
3253#endif
3254        return C_O1_I2(x, x, x);
3255
3256    case INDEX_op_abs_vec:
3257    case INDEX_op_dup_vec:
3258    case INDEX_op_not_vec:
3259    case INDEX_op_shli_vec:
3260    case INDEX_op_shri_vec:
3261    case INDEX_op_sari_vec:
3262    case INDEX_op_rotli_vec:
3263    case INDEX_op_x86_psrldq_vec:
3264        return C_O1_I1(x, x);
3265
3266    case INDEX_op_x86_vpshldv_vec:
3267    case INDEX_op_x86_vpshrdv_vec:
3268        return C_O1_I3(x, 0, x, x);
3269
3270    case INDEX_op_bitsel_vec:
3271    case INDEX_op_x86_vpblendvb_vec:
3272        return C_O1_I3(x, x, x, x);
3273
3274    default:
3275        g_assert_not_reached();
3276    }
3277}
3278
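/*
 * Tri-state result: 1 if the opcode is supported directly for this
 * type and element size, 0 if it cannot be implemented at all, and
 * -1 if tcg_expand_vec_op below can expand it into supported ops.
 */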
3279int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3280{
3281    switch (opc) {
3282    case INDEX_op_add_vec:
3283    case INDEX_op_sub_vec:
3284    case INDEX_op_and_vec:
3285    case INDEX_op_or_vec:
3286    case INDEX_op_xor_vec:
3287    case INDEX_op_andc_vec:
3288    case INDEX_op_orc_vec:
3289    case INDEX_op_nand_vec:
3290    case INDEX_op_nor_vec:
3291    case INDEX_op_eqv_vec:
3292    case INDEX_op_not_vec:
3293    case INDEX_op_bitsel_vec:
3294        return 1;
3295    case INDEX_op_cmp_vec:
3296    case INDEX_op_cmpsel_vec:
3297        return -1;
3298
3299    case INDEX_op_rotli_vec:
3300        return have_avx512vl && vece >= MO_32 ? 1 : -1;
3301
3302    case INDEX_op_shli_vec:
3303    case INDEX_op_shri_vec:
3304        /* We must expand the operation for MO_8.  */
3305        return vece == MO_8 ? -1 : 1;
3306
3307    case INDEX_op_sari_vec:
3308        switch (vece) {
3309        case MO_8:
3310            return -1;
3311        case MO_16:
3312        case MO_32:
3313            return 1;
3314        case MO_64:
3315            if (have_avx512vl) {
3316                return 1;
3317            }
3318            /*
3319             * We can emulate this for MO_64, but it does not pay off
3320             * unless we're producing at least 4 values.
3321             */
3322            return type >= TCG_TYPE_V256 ? -1 : 0;
3323        }
3324        return 0;
3325
3326    case INDEX_op_shls_vec:
3327    case INDEX_op_shrs_vec:
3328        return vece >= MO_16;
3329    case INDEX_op_sars_vec:
3330        switch (vece) {
3331        case MO_16:
3332        case MO_32:
3333            return 1;
3334        case MO_64:
3335            return have_avx512vl;
3336        }
3337        return 0;
3338    case INDEX_op_rotls_vec:
3339        return vece >= MO_16 ? -1 : 0;
3340
3341    case INDEX_op_shlv_vec:
3342    case INDEX_op_shrv_vec:
3343        switch (vece) {
3344        case MO_16:
3345            return have_avx512bw;
3346        case MO_32:
3347        case MO_64:
3348            return have_avx2;
3349        }
3350        return 0;
3351    case INDEX_op_sarv_vec:
3352        switch (vece) {
3353        case MO_16:
3354            return have_avx512bw;
3355        case MO_32:
3356            return have_avx2;
3357        case MO_64:
3358            return have_avx512vl;
3359        }
3360        return 0;
3361    case INDEX_op_rotlv_vec:
3362    case INDEX_op_rotrv_vec:
3363        switch (vece) {
3364        case MO_16:
3365            return have_avx512vbmi2 ? -1 : 0;
3366        case MO_32:
3367        case MO_64:
3368            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3369        }
3370        return 0;
3371
3372    case INDEX_op_mul_vec:
3373        switch (vece) {
3374        case MO_8:
3375            return -1;
3376        case MO_64:
3377            return have_avx512dq;
3378        }
3379        return 1;
3380
3381    case INDEX_op_ssadd_vec:
3382    case INDEX_op_usadd_vec:
3383    case INDEX_op_sssub_vec:
3384    case INDEX_op_ussub_vec:
3385        return vece <= MO_16;
3386    case INDEX_op_smin_vec:
3387    case INDEX_op_smax_vec:
3388    case INDEX_op_umin_vec:
3389    case INDEX_op_umax_vec:
3390    case INDEX_op_abs_vec:
3391        return vece <= MO_32 || have_avx512vl;
3392
3393    default:
3394        return 0;
3395    }
3396}
3397
3398static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3399                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3400{
3401    TCGv_vec t1, t2;
3402
3403    tcg_debug_assert(vece == MO_8);
3404
3405    t1 = tcg_temp_new_vec(type);
3406    t2 = tcg_temp_new_vec(type);
3407
3408    /*
3409     * Unpack to W, shift, and repack.  Tricky bits:
3410     * (1) Use punpck*bw x,x to produce DDCCBBAA,
3411     *     i.e. duplicate in other half of the 16-bit lane.
3412     * (2) For right-shift, add 8 so that the high half of the lane
3413     *     becomes zero.  For left-shift and left-rotate, we must
3414     *     shift up and down again.
3415     * (3) Step 2 leaves high half zero such that PACKUSWB
3416     *     (pack with unsigned saturation) does not modify
3417     *     the quantity.
3418     */
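    /*
     * E.g. a left shift by 3 of the byte 0xAB: the unpacked lane is
     * 0xABAB; << (3 + 8) yields 0x5800; >> 8 yields 0x0058, which is
     * (0xAB << 3) & 0xff with the high half clear for the pack.
     */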
3419    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3420              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3421    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3422              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3423
3424    if (opc != INDEX_op_rotli_vec) {
3425        imm += 8;
3426    }
3427    if (opc == INDEX_op_shri_vec) {
3428        tcg_gen_shri_vec(MO_16, t1, t1, imm);
3429        tcg_gen_shri_vec(MO_16, t2, t2, imm);
3430    } else {
3431        tcg_gen_shli_vec(MO_16, t1, t1, imm);
3432        tcg_gen_shli_vec(MO_16, t2, t2, imm);
3433        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3434        tcg_gen_shri_vec(MO_16, t2, t2, 8);
3435    }
3436
3437    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3438              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3439    tcg_temp_free_vec(t1);
3440    tcg_temp_free_vec(t2);
3441}
3442
3443static void expand_vec_sari(TCGType type, unsigned vece,
3444                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3445{
3446    TCGv_vec t1, t2;
3447
3448    switch (vece) {
3449    case MO_8:
3450        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3451        t1 = tcg_temp_new_vec(type);
3452        t2 = tcg_temp_new_vec(type);
3453        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3454                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3455        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3456                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3457        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3458        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3459        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3460                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3461        tcg_temp_free_vec(t1);
3462        tcg_temp_free_vec(t2);
3463        break;
3464
3465    case MO_64:
3466        t1 = tcg_temp_new_vec(type);
3467        if (imm <= 32) {
3468            /*
3469             * We can emulate a small sign extend by performing an arithmetic
3470             * 32-bit shift and overwriting the high half of a 64-bit logical
3471             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3472             * does not, so we have to bound the smaller shift -- we get the
3473             * same result in the high half either way.
3474             */
3475            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3476            tcg_gen_shri_vec(MO_64, v0, v1, imm);
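            /*
             * Blend mask 0xaa selects the odd 32-bit elements, taking
             * the sign-extended high halves from t1 and the logically
             * shifted low halves from v0.
             */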
3477            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3478                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3479                      tcgv_vec_arg(t1), 0xaa);
3480        } else {
3481            /* Otherwise we will need to use a compare vs 0 to produce
3482             * the sign-extend, shift and merge.
3483             */
3484            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3485                            tcg_constant_vec(type, MO_64, 0), v1);
3486            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3487            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3488            tcg_gen_or_vec(MO_64, v0, v0, t1);
3489        }
3490        tcg_temp_free_vec(t1);
3491        break;
3492
3493    default:
3494        g_assert_not_reached();
3495    }
3496}
3497
3498static void expand_vec_rotli(TCGType type, unsigned vece,
3499                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3500{
3501    TCGv_vec t;
3502
3503    if (vece == MO_8) {
3504        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3505        return;
3506    }
3507
3508    if (have_avx512vbmi2) {
3509        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3510                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3511        return;
3512    }
3513
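    /*
     * Fallback: rotl(x, n) = (x << n) | (x >> (width - n)), using a
     * logical right shift.  A rotate by zero has already been folded
     * to a move by the generic expander, so width - imm is in range.
     */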
3514    t = tcg_temp_new_vec(type);
3515    tcg_gen_shli_vec(vece, t, v1, imm);
3516    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3517    tcg_gen_or_vec(vece, v0, v0, t);
3518    tcg_temp_free_vec(t);
3519}
3520
3521static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3522                            TCGv_vec v1, TCGv_vec sh, bool right)
3523{
3524    TCGv_vec t;
3525
3526    if (have_avx512vbmi2) {
3527        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3528                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3529                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3530        return;
3531    }
3532
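    /*
     * Fallback via paired shifts: combine v1 shifted by sh with v1
     * shifted the other way by width - sh.  x86 variable shifts yield
     * zero for counts >= width, so the sh == 0 lanes correctly reduce
     * to the unshifted input.
     */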
3533    t = tcg_temp_new_vec(type);
3534    tcg_gen_dupi_vec(vece, t, 8 << vece);
3535    tcg_gen_sub_vec(vece, t, t, sh);
3536    if (right) {
3537        tcg_gen_shlv_vec(vece, t, v1, t);
3538        tcg_gen_shrv_vec(vece, v0, v1, sh);
3539    } else {
3540        tcg_gen_shrv_vec(vece, t, v1, t);
3541        tcg_gen_shlv_vec(vece, v0, v1, sh);
3542    }
3543    tcg_gen_or_vec(vece, v0, v0, t);
3544    tcg_temp_free_vec(t);
3545}
3546
3547static void expand_vec_rotls(TCGType type, unsigned vece,
3548                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3549{
3550    TCGv_vec t = tcg_temp_new_vec(type);
3551
3552    tcg_debug_assert(vece != MO_8);
3553
3554    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3555        tcg_gen_dup_i32_vec(vece, t, lsh);
3556        if (vece >= MO_32) {
3557            tcg_gen_rotlv_vec(vece, v0, v1, t);
3558        } else {
3559            expand_vec_rotv(type, vece, v0, v1, t, false);
3560        }
3561    } else {
3562        TCGv_i32 rsh = tcg_temp_new_i32();
3563
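        /*
         * rsh = (width - lsh) mod width: both scalar-count shifts
         * then see an in-range count and their OR forms the rotate.
         */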
3564        tcg_gen_neg_i32(rsh, lsh);
3565        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3566        tcg_gen_shls_vec(vece, t, v1, lsh);
3567        tcg_gen_shrs_vec(vece, v0, v1, rsh);
3568        tcg_gen_or_vec(vece, v0, v0, t);
3569
3570        tcg_temp_free_i32(rsh);
3571    }
3572
3573    tcg_temp_free_vec(t);
3574}
3575
3576static void expand_vec_mul(TCGType type, unsigned vece,
3577                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3578{
3579    TCGv_vec t1, t2, t3, t4, zero;
3580
3581    tcg_debug_assert(vece == MO_8);
3582
3583    /*
3584     * Unpack v1 bytes to words, 0 | x.
3585     * Unpack v2 bytes to words, y | 0.
3586     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3587     * Shift logical right by 8 bits to clear the high 8 bits before
3588     * using an unsigned saturated pack.
3589     *
3590     * The difference between the V64, V128 and V256 cases is merely how
3591     * we distribute the expansion between temporaries.
3592     */
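    /*
     * E.g. x = 0x40, y = 0x03: the unpacked lanes multiply as
     * 0x0040 * 0x0300 = 0xC000, and the shift right by 8 leaves
     * 0x00C0 = (0x40 * 0x03) & 0xff, ready for the pack.
     */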
3593    switch (type) {
3594    case TCG_TYPE_V64:
3595        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3596        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3597        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3598        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3599                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3600        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3601                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3602        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3603        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3604        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3605                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3606        tcg_temp_free_vec(t1);
3607        tcg_temp_free_vec(t2);
3608        break;
3609
3610    case TCG_TYPE_V128:
3611    case TCG_TYPE_V256:
3612        t1 = tcg_temp_new_vec(type);
3613        t2 = tcg_temp_new_vec(type);
3614        t3 = tcg_temp_new_vec(type);
3615        t4 = tcg_temp_new_vec(type);
3616        zero = tcg_constant_vec(type, MO_8, 0);
3617        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3618                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3619        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3620                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3621        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3622                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3623        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3624                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3625        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3626        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3627        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3628        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3629        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3630                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3631        tcg_temp_free_vec(t1);
3632        tcg_temp_free_vec(t2);
3633        tcg_temp_free_vec(t3);
3634        tcg_temp_free_vec(t4);
3635        break;
3636
3637    default:
3638        g_assert_not_reached();
3639    }
3640}
3641
3642static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3643                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3644{
3645    enum {
3646        NEED_INV  = 1,
3647        NEED_SWAP = 2,
3648        NEED_BIAS = 4,
3649        NEED_UMIN = 8,
3650        NEED_UMAX = 16,
3651    };
3652    TCGv_vec t1, t2, t3;
3653    uint8_t fixup;
3654
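    /*
     * x86 only compares vectors for EQ and signed GT.  Everything
     * else is reduced to those by inverting the result (NEED_INV),
     * swapping the operands (NEED_SWAP), biasing both operands by the
     * sign bit so unsigned order becomes signed order (NEED_BIAS), or
     * rewriting x <= y as x == umin(x, y) and x >= y as
     * x == umax(x, y) (NEED_UMIN, NEED_UMAX).
     */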
3655    switch (cond) {
3656    case TCG_COND_EQ:
3657    case TCG_COND_GT:
3658        fixup = 0;
3659        break;
3660    case TCG_COND_NE:
3661    case TCG_COND_LE:
3662        fixup = NEED_INV;
3663        break;
3664    case TCG_COND_LT:
3665        fixup = NEED_SWAP;
3666        break;
3667    case TCG_COND_GE:
3668        fixup = NEED_SWAP | NEED_INV;
3669        break;
3670    case TCG_COND_LEU:
3671        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3672            fixup = NEED_UMIN;
3673        } else {
3674            fixup = NEED_BIAS | NEED_INV;
3675        }
3676        break;
3677    case TCG_COND_GTU:
3678        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3679            fixup = NEED_UMIN | NEED_INV;
3680        } else {
3681            fixup = NEED_BIAS;
3682        }
3683        break;
3684    case TCG_COND_GEU:
3685        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3686            fixup = NEED_UMAX;
3687        } else {
3688            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3689        }
3690        break;
3691    case TCG_COND_LTU:
3692        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3693            fixup = NEED_UMAX | NEED_INV;
3694        } else {
3695            fixup = NEED_BIAS | NEED_SWAP;
3696        }
3697        break;
3698    default:
3699        g_assert_not_reached();
3700    }
3701
3702    if (fixup & NEED_INV) {
3703        cond = tcg_invert_cond(cond);
3704    }
3705    if (fixup & NEED_SWAP) {
3706        t1 = v1, v1 = v2, v2 = t1;
3707        cond = tcg_swap_cond(cond);
3708    }
3709
3710    t1 = t2 = NULL;
3711    if (fixup & (NEED_UMIN | NEED_UMAX)) {
3712        t1 = tcg_temp_new_vec(type);
3713        if (fixup & NEED_UMIN) {
3714            tcg_gen_umin_vec(vece, t1, v1, v2);
3715        } else {
3716            tcg_gen_umax_vec(vece, t1, v1, v2);
3717        }
3718        v2 = t1;
3719        cond = TCG_COND_EQ;
3720    } else if (fixup & NEED_BIAS) {
3721        t1 = tcg_temp_new_vec(type);
3722        t2 = tcg_temp_new_vec(type);
3723        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3724        tcg_gen_sub_vec(vece, t1, v1, t3);
3725        tcg_gen_sub_vec(vece, t2, v2, t3);
3726        v1 = t1;
3727        v2 = t2;
3728        cond = tcg_signed_cond(cond);
3729    }
3730
3731    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3732    /* Expand directly; do not recurse.  */
3733    vec_gen_4(INDEX_op_cmp_vec, type, vece,
3734              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3735
3736    if (t1) {
3737        tcg_temp_free_vec(t1);
3738        if (t2) {
3739            tcg_temp_free_vec(t2);
3740        }
3741    }
3742    return fixup & NEED_INV;
3743}
3744
3745static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3746                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3747{
3748    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3749        tcg_gen_not_vec(vece, v0, v0);
3750    }
3751}
3752
3753static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3754                              TCGv_vec c1, TCGv_vec c2,
3755                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3756{
3757    TCGv_vec t = tcg_temp_new_vec(type);
3758
3759    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3760        /* Invert the sense of the compare by swapping arguments.  */
3761        TCGv_vec x;
3762        x = v3, v3 = v4, v4 = x;
3763    }
3764    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3765              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3766              tcgv_vec_arg(v3), tcgv_vec_arg(t));
3767    tcg_temp_free_vec(t);
3768}
3769
3770void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3771                       TCGArg a0, ...)
3772{
3773    va_list va;
3774    TCGArg a2;
3775    TCGv_vec v0, v1, v2, v3, v4;
3776
3777    va_start(va, a0);
3778    v0 = temp_tcgv_vec(arg_temp(a0));
3779    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3780    a2 = va_arg(va, TCGArg);
3781
3782    switch (opc) {
3783    case INDEX_op_shli_vec:
3784    case INDEX_op_shri_vec:
3785        expand_vec_shi(type, vece, opc, v0, v1, a2);
3786        break;
3787
3788    case INDEX_op_sari_vec:
3789        expand_vec_sari(type, vece, v0, v1, a2);
3790        break;
3791
3792    case INDEX_op_rotli_vec:
3793        expand_vec_rotli(type, vece, v0, v1, a2);
3794        break;
3795
3796    case INDEX_op_rotls_vec:
3797        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3798        break;
3799
3800    case INDEX_op_rotlv_vec:
3801        v2 = temp_tcgv_vec(arg_temp(a2));
3802        expand_vec_rotv(type, vece, v0, v1, v2, false);
3803        break;
3804    case INDEX_op_rotrv_vec:
3805        v2 = temp_tcgv_vec(arg_temp(a2));
3806        expand_vec_rotv(type, vece, v0, v1, v2, true);
3807        break;
3808
3809    case INDEX_op_mul_vec:
3810        v2 = temp_tcgv_vec(arg_temp(a2));
3811        expand_vec_mul(type, vece, v0, v1, v2);
3812        break;
3813
3814    case INDEX_op_cmp_vec:
3815        v2 = temp_tcgv_vec(arg_temp(a2));
3816        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3817        break;
3818
3819    case INDEX_op_cmpsel_vec:
3820        v2 = temp_tcgv_vec(arg_temp(a2));
3821        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3822        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3823        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3824        break;
3825
3826    default:
3827        break;
3828    }
3829
3830    va_end(va);
3831}
3832
3833static const int tcg_target_callee_save_regs[] = {
3834#if TCG_TARGET_REG_BITS == 64
3835    TCG_REG_RBP,
3836    TCG_REG_RBX,
3837#if defined(_WIN64)
3838    TCG_REG_RDI,
3839    TCG_REG_RSI,
3840#endif
3841    TCG_REG_R12,
3842    TCG_REG_R13,
3843    TCG_REG_R14, /* Currently used for the global env. */
3844    TCG_REG_R15,
3845#else
3846    TCG_REG_EBP, /* Currently used for the global env. */
3847    TCG_REG_EBX,
3848    TCG_REG_ESI,
3849    TCG_REG_EDI,
3850#endif
3851};
3852
3853/* Compute frame size via macros, to share between tcg_target_qemu_prologue
3854   and tcg_register_jit.  */
3855
3856#define PUSH_SIZE \
3857    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3858     * (TCG_TARGET_REG_BITS / 8))
3859
3860#define FRAME_SIZE \
3861    ((PUSH_SIZE \
3862      + TCG_STATIC_CALL_ARGS_SIZE \
3863      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3864      + TCG_TARGET_STACK_ALIGN - 1) \
3865     & ~(TCG_TARGET_STACK_ALIGN - 1))
3866
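/*
 * E.g. on 64-bit SysV hosts PUSH_SIZE is (1 + 6) * 8 = 56 bytes, the
 * return address plus six saved registers; FRAME_SIZE then adds the
 * static call-argument area and the TCG temp buffer and rounds the
 * total up to TCG_TARGET_STACK_ALIGN.
 */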
3867/* Generate global QEMU prologue and epilogue code */
3868static void tcg_target_qemu_prologue(TCGContext *s)
3869{
3870    int i, stack_addend;
3871
3872    /* TB prologue */
3873
3874    /* Reserve some stack space, also for TCG temps.  */
3875    stack_addend = FRAME_SIZE - PUSH_SIZE;
3876    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3877                  CPU_TEMP_BUF_NLONGS * sizeof(long));
3878
3879    /* Save all callee saved registers.  */
3880    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3881        tcg_out_push(s, tcg_target_callee_save_regs[i]);
3882    }
3883
3884#if TCG_TARGET_REG_BITS == 32
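    /*
     * The cdecl arguments are above the return address and the pushed
     * registers: env at (nregs + 1) * 4 from %esp, tb at
     * (nregs + 2) * 4.
     */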
3885    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3886               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3887    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3888    /* jmp *tb.  */
3889    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3890                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3891                         + stack_addend);
3892#else
3893# if !defined(CONFIG_SOFTMMU)
3894    if (guest_base) {
3895        int seg = setup_guest_base_seg();
3896        if (seg != 0) {
3897            x86_guest_base.seg = seg;
3898        } else if (guest_base == (int32_t)guest_base) {
3899            x86_guest_base.ofs = guest_base;
3900        } else {
3901            /* Choose R12 because, as a base, it requires a SIB byte. */
3902            x86_guest_base.index = TCG_REG_R12;
3903            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
3904            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
3905        }
3906    }
3907# endif
3908    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3909    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3910    /* jmp *tb.  */
3911    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3912#endif
3913
3914    /*
3915     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3916     * and fall through to the rest of the epilogue.
3917     */
3918    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
3919    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3920
3921    /* TB epilogue */
3922    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
3923
3924    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3925
3926    if (have_avx2) {
3927        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3928    }
3929    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3930        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3931    }
3932    tcg_out_opc(s, OPC_RET, 0, 0, 0);
3933}
3934
3935static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3936{
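    /* 0x90 is the one-byte x86 NOP.  */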
3937    memset(p, 0x90, count);
3938}
3939
3940static void tcg_target_init(TCGContext *s)
3941{
3942#ifdef CONFIG_CPUID_H
3943    unsigned a, b, c, d, b7 = 0, c7 = 0;
3944    unsigned max = __get_cpuid_max(0, 0);
3945
3946    if (max >= 7) {
3947        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
3948        __cpuid_count(7, 0, a, b7, c7, d);
3949        have_bmi1 = (b7 & bit_BMI) != 0;
3950        have_bmi2 = (b7 & bit_BMI2) != 0;
3951    }
3952
3953    if (max >= 1) {
3954        __cpuid(1, a, b, c, d);
3955#ifndef have_cmov
3956        /* For 32-bit, 99% certainty that we're running on hardware that
3957           supports cmov, but we still need to check.  In case cmov is not
3958           available, we'll use a small forward branch.  */
3959        have_cmov = (d & bit_CMOV) != 0;
3960#endif
3961
3962        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3963           need to probe for it.  */
3964        have_movbe = (c & bit_MOVBE) != 0;
3965        have_popcnt = (c & bit_POPCNT) != 0;
3966
3967        /* There are a number of things we must check before we can be
3968           sure of not hitting invalid opcode.  */
3969        if (c & bit_OSXSAVE) {
3970            unsigned bv = xgetbv_low(0);
3971
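            /*
             * XCR0 bits 1 and 2 indicate OS support for the SSE and
             * AVX register state, a prerequisite for executing any
             * AVX instruction.
             */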
3972            if ((bv & 6) == 6) {
3973                have_avx1 = (c & bit_AVX) != 0;
3974                have_avx2 = (b7 & bit_AVX2) != 0;
3975
3976                /*
3977                 * There are interesting instructions in AVX512, so long
3978                 * as we have AVX512VL, which indicates support for EVEX
3979                 * on sizes smaller than 512 bits.  We are required to
3980                 * check that OPMASK and all extended ZMM state are enabled
3981                 * even if we're not using them -- the insns will fault.
3982                 */
3983                if ((bv & 0xe0) == 0xe0
3984                    && (b7 & bit_AVX512F)
3985                    && (b7 & bit_AVX512VL)) {
3986                    have_avx512vl = true;
3987                    have_avx512bw = (b7 & bit_AVX512BW) != 0;
3988                    have_avx512dq = (b7 & bit_AVX512DQ) != 0;
3989                    have_avx512vbmi2 = (c7 & bit_AVX512VBMI2) != 0;
3990                }
3991
3992                /*
3993                 * The Intel SDM has added:
3994                 *   Processors that enumerate support for Intel® AVX
3995                 *   (by setting the feature flag CPUID.01H:ECX.AVX[bit 28])
3996                 *   guarantee that the 16-byte memory operations performed
3997                 *   by the following instructions will always be carried
3998                 *   out atomically:
3999                 *   - MOVAPD, MOVAPS, and MOVDQA.
4000                 *   - VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
4001                 *   - VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded
4002                 *     with EVEX.128 and k0 (masking disabled).
4003                 * Note that these instructions require the linear addresses
4004                 * of their memory operands to be 16-byte aligned.
4005                 *
4006                 * AMD has provided an even stronger guarantee that processors
4007                 * with AVX provide 16-byte atomicity for all cacheable,
4008                 * naturally aligned single loads and stores, e.g. MOVDQU.
4009                 *
4010                 * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688
4011                 */
4012                if (have_avx1) {
4013                    __cpuid(0, a, b, c, d);
4014                    have_atomic16 = (c == signature_INTEL_ecx ||
4015                                     c == signature_AMD_ecx);
4016                }
4017            }
4018        }
4019    }
4020
4021    max = __get_cpuid_max(0x80000000, 0);
4022    if (max >= 0x80000001) {
4023        __cpuid(0x80000001, a, b, c, d);
4024        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
4025        have_lzcnt = (c & bit_LZCNT) != 0;
4026    }
4027#endif /* CONFIG_CPUID_H */
4028
4029    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4030    if (TCG_TARGET_REG_BITS == 64) {
4031        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4032    }
4033    if (have_avx1) {
4034        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4035        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4036    }
4037    if (have_avx2) {
4038        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4039    }
4040
4041    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4042    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4043    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4044    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4045    if (TCG_TARGET_REG_BITS == 64) {
4046#if !defined(_WIN64)
4047        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4048        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4049#endif
4050        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4051        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4052        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4053        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4054    }
4055
4056    s->reserved_regs = 0;
4057    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4058#ifdef _WIN64
4059    /* These are call saved, and we don't save them, so don't use them. */
4060    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4061    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4062    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4063    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4064    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4065    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4066    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4067    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4068    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4069    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4070#endif
4071}
4072
4073typedef struct {
4074    DebugFrameHeader h;
4075    uint8_t fde_def_cfa[4];
4076    uint8_t fde_reg_ofs[14];
4077} DebugFrame;
4078
4079/* We're expecting a 2 byte uleb128 encoded value.  */
4080QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
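/*
 * FRAME_SIZE is emitted in fde_def_cfa below as a two-byte uleb128:
 * the low seven bits with the continuation bit set, then the rest,
 * hence the bound of 1 << 14.
 */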
4081
4082#if !defined(__ELF__)
4083    /* Host machine without ELF. */
4084#elif TCG_TARGET_REG_BITS == 64
4085#define ELF_HOST_MACHINE EM_X86_64
4086static const DebugFrame debug_frame = {
4087    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4088    .h.cie.id = -1,
4089    .h.cie.version = 1,
4090    .h.cie.code_align = 1,
4091    .h.cie.data_align = 0x78,             /* sleb128 -8 */
4092    .h.cie.return_column = 16,
4093
4094    /* Total FDE size does not include the "len" member.  */
4095    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4096
4097    .fde_def_cfa = {
4098        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
4099        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4100        (FRAME_SIZE >> 7)
4101    },
4102    .fde_reg_ofs = {
4103        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
4104        /* The following ordering must match tcg_target_callee_save_regs.  */
4105        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
4106        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
4107        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
4108        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
4109        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
4110        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
4111    }
4112};
4113#else
4114#define ELF_HOST_MACHINE EM_386
4115static const DebugFrame debug_frame = {
4116    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4117    .h.cie.id = -1,
4118    .h.cie.version = 1,
4119    .h.cie.code_align = 1,
4120    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
4121    .h.cie.return_column = 8,
4122
4123    /* Total FDE size does not include the "len" member.  */
4124    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4125
4126    .fde_def_cfa = {
4127        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
4128        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4129        (FRAME_SIZE >> 7)
4130    },
4131    .fde_reg_ofs = {
4132        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4133        /* The following ordering must match tcg_target_callee_save_regs.  */
4134        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4135        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4136        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4137        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
4138    }
4139};
4140#endif
4141
4142#if defined(ELF_HOST_MACHINE)
4143void tcg_register_jit(const void *buf, size_t buf_size)
4144{
4145    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
4146}
4147#endif
4148