xref: /openbmc/qemu/tcg/i386/tcg-target.c.inc (revision 7d87775f)
1/*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#include "../tcg-ldst.c.inc"
26#include "../tcg-pool.c.inc"
27
28#ifdef CONFIG_DEBUG_TCG
29static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
30#if TCG_TARGET_REG_BITS == 64
31    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
32#else
33    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
34#endif
35    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
36    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
37#if TCG_TARGET_REG_BITS == 64
38    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
39    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
40#endif
41};
42#endif
43
44static const int tcg_target_reg_alloc_order[] = {
45#if TCG_TARGET_REG_BITS == 64
46    TCG_REG_RBP,
47    TCG_REG_RBX,
48    TCG_REG_R12,
49    TCG_REG_R13,
50    TCG_REG_R14,
51    TCG_REG_R15,
52    TCG_REG_R10,
53    TCG_REG_R11,
54    TCG_REG_R9,
55    TCG_REG_R8,
56    TCG_REG_RCX,
57    TCG_REG_RDX,
58    TCG_REG_RSI,
59    TCG_REG_RDI,
60    TCG_REG_RAX,
61#else
62    TCG_REG_EBX,
63    TCG_REG_ESI,
64    TCG_REG_EDI,
65    TCG_REG_EBP,
66    TCG_REG_ECX,
67    TCG_REG_EDX,
68    TCG_REG_EAX,
69#endif
70    TCG_REG_XMM0,
71    TCG_REG_XMM1,
72    TCG_REG_XMM2,
73    TCG_REG_XMM3,
74    TCG_REG_XMM4,
75    TCG_REG_XMM5,
76#ifndef _WIN64
77    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
78       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
79    TCG_REG_XMM6,
80    TCG_REG_XMM7,
81#if TCG_TARGET_REG_BITS == 64
82    TCG_REG_XMM8,
83    TCG_REG_XMM9,
84    TCG_REG_XMM10,
85    TCG_REG_XMM11,
86    TCG_REG_XMM12,
87    TCG_REG_XMM13,
88    TCG_REG_XMM14,
89    TCG_REG_XMM15,
90#endif
91#endif
92};
93
94#define TCG_TMP_VEC  TCG_REG_XMM5
95
96static const int tcg_target_call_iarg_regs[] = {
97#if TCG_TARGET_REG_BITS == 64
98#if defined(_WIN64)
99    TCG_REG_RCX,
100    TCG_REG_RDX,
101#else
102    TCG_REG_RDI,
103    TCG_REG_RSI,
104    TCG_REG_RDX,
105    TCG_REG_RCX,
106#endif
107    TCG_REG_R8,
108    TCG_REG_R9,
109#else
110    /* 32-bit mode uses a stack-based calling convention (GCC default). */
111#endif
112};
113
114static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
115{
116    switch (kind) {
117    case TCG_CALL_RET_NORMAL:
118        tcg_debug_assert(slot >= 0 && slot <= 1);
119        return slot ? TCG_REG_EDX : TCG_REG_EAX;
120#ifdef _WIN64
121    case TCG_CALL_RET_BY_VEC:
122        tcg_debug_assert(slot == 0);
123        return TCG_REG_XMM0;
124#endif
125    default:
126        g_assert_not_reached();
127    }
128}
129
130/* Constants we accept.  */
131#define TCG_CT_CONST_S32 0x100
132#define TCG_CT_CONST_U32 0x200
133#define TCG_CT_CONST_I32 0x400
134#define TCG_CT_CONST_WSZ 0x800
135#define TCG_CT_CONST_TST 0x1000
136#define TCG_CT_CONST_ZERO 0x2000
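/*
 * These bits are tested in tcg_target_const_match below.  For 64-bit
 * operations, S32 and U32 accept sign- and zero-extended 32-bit immediates,
 * I32 accepts a value whose bitwise inverse fits in 32 bits, and TST accepts
 * immediates usable with TEST or BT; WSZ matches the operation's word size
 * (32 or 64) and ZERO matches only zero.
 */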
137
138/* Registers used with the L constraint, which are the first two argument
139   registers on x86_64, and two arbitrary call-clobbered registers on
140   i386. */
141#if TCG_TARGET_REG_BITS == 64
142# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
143# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
144#else
145# define TCG_REG_L0 TCG_REG_EAX
146# define TCG_REG_L1 TCG_REG_EDX
147#endif
148
149#if TCG_TARGET_REG_BITS == 64
150# define ALL_GENERAL_REGS      0x0000ffffu
151# define ALL_VECTOR_REGS       0xffff0000u
152# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
153#else
154# define ALL_GENERAL_REGS      0x000000ffu
155# define ALL_VECTOR_REGS       0x00ff0000u
156# define ALL_BYTEL_REGS        0x0000000fu
157#endif
158#define SOFTMMU_RESERVE_REGS \
159    (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)
160
161#define have_bmi2       (cpuinfo & CPUINFO_BMI2)
162#define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)
163
164static const tcg_insn_unit *tb_ret_addr;
165
166static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
167                        intptr_t value, intptr_t addend)
168{
169    value += addend;
170    switch(type) {
171    case R_386_PC32:
172        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
173        if (value != (int32_t)value) {
174            return false;
175        }
176        /* FALLTHRU */
177    case R_386_32:
178        tcg_patch32(code_ptr, value);
179        break;
180    case R_386_PC8:
181        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
182        if (value != (int8_t)value) {
183            return false;
184        }
185        tcg_patch8(code_ptr, value);
186        break;
187    default:
188        g_assert_not_reached();
189    }
190    return true;
191}
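/*
 * Three relocation types are used by this backend: R_386_32 patches an
 * absolute 32-bit value, while R_386_PC32 and R_386_PC8 patch 32-bit and
 * 8-bit pc-relative displacements, computed against the executable view of
 * the buffer (tcg_splitwx_to_rx); patch_reloc returns false when the
 * displacement does not fit.
 */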
192
193/* test if a constant matches the constraint */
194static bool tcg_target_const_match(int64_t val, int ct,
195                                   TCGType type, TCGCond cond, int vece)
196{
197    if (ct & TCG_CT_CONST) {
198        return 1;
199    }
200    if (type == TCG_TYPE_I32) {
201        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 |
202                  TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) {
203            return 1;
204        }
205    } else {
206        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
207            return 1;
208        }
209        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
210            return 1;
211        }
212        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
213            return 1;
214        }
215        /*
216         * This will be used in combination with TCG_CT_CONST_S32,
217         * so "normal" TESTQ is already matched.  Also accept:
218         *    TESTQ -> TESTL   (uint32_t)
219         *    TESTQ -> BT      (is_power_of_2)
220         */
221        if ((ct & TCG_CT_CONST_TST)
222            && is_tst_cond(cond)
223            && (val == (uint32_t)val || is_power_of_2(val))) {
224            return 1;
225        }
226    }
227    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
228        return 1;
229    }
230    if ((ct & TCG_CT_CONST_ZERO) && val == 0) {
231        return 1;
232    }
233    return 0;
234}
235
236# define LOWREGMASK(x)	((x) & 7)
237
238#define P_EXT		0x100		/* 0x0f opcode prefix */
239#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
240#define P_DATA16        0x400           /* 0x66 opcode prefix */
241#define P_VEXW          0x1000          /* Set VEX.W = 1 */
242#if TCG_TARGET_REG_BITS == 64
243# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
244# define P_REXB_R       0x2000          /* REG field as byte register */
245# define P_REXB_RM      0x4000          /* R/M field as byte register */
246# define P_GS           0x8000          /* gs segment override */
247#else
248# define P_REXW		0
249# define P_REXB_R	0
250# define P_REXB_RM	0
251# define P_GS           0
252#endif
253#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
254#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
255#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
256#define P_VEXL          0x80000         /* Set VEX.L = 1 */
257#define P_EVEX          0x100000        /* Requires EVEX encoding */
258
259#define OPC_ARITH_EbIb	(0x80)
260#define OPC_ARITH_EvIz	(0x81)
261#define OPC_ARITH_EvIb	(0x83)
262#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
263#define OPC_ANDN        (0xf2 | P_EXT38)
264#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
265#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
266#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
267#define OPC_BSF         (0xbc | P_EXT)
268#define OPC_BSR         (0xbd | P_EXT)
269#define OPC_BSWAP	(0xc8 | P_EXT)
270#define OPC_CALL_Jz	(0xe8)
271#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
272#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
273#define OPC_DEC_r32	(0x48)
274#define OPC_IMUL_GvEv	(0xaf | P_EXT)
275#define OPC_IMUL_GvEvIb	(0x6b)
276#define OPC_IMUL_GvEvIz	(0x69)
277#define OPC_INC_r32	(0x40)
278#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
279#define OPC_JCC_short	(0x70)		/* ... plus condition code */
280#define OPC_JMP_long	(0xe9)
281#define OPC_JMP_short	(0xeb)
282#define OPC_LEA         (0x8d)
283#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
284#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
285#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
286#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
287#define OPC_MOVB_EvIz   (0xc6)
288#define OPC_MOVL_EvIz	(0xc7)
289#define OPC_MOVB_Ib     (0xb0)
290#define OPC_MOVL_Iv     (0xb8)
291#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
292#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
293#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
294#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
295#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
296#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
297#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
298#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
299#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
300#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
301#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
302#define OPC_MOVSBL	(0xbe | P_EXT)
303#define OPC_MOVSWL	(0xbf | P_EXT)
304#define OPC_MOVSLQ	(0x63 | P_REXW)
305#define OPC_MOVZBL	(0xb6 | P_EXT)
306#define OPC_MOVZWL	(0xb7 | P_EXT)
307#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
308#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
309#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
310#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
311#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
312#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
313#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
314#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
315#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
316#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
317#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
318#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
319#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
320#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
321#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
322#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
323#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
324#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
325#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
326#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
327#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
328#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
329#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
330#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
331#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
332#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
333#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
334#define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
335#define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
336#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
337#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
338#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
339#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
340#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
341#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
342#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
343#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
344#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
345#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
346#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
347#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
348#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
349#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
350#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
351#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
352#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
353#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
354#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
355#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
356#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
357#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
358#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
359#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
360#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
361#define OPC_POR         (0xeb | P_EXT | P_DATA16)
362#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
363#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
364#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
365#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
366#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
367#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
368#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
369#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
370#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
371#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
372#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
373#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
374#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
375#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
376#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
377#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
378#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
379#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
380#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
381#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
382#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
383#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
384#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
385#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
386#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
387#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
388#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
389#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
390#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
391#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
392#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
393#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
394#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
395#define OPC_POP_r32	(0x58)
396#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
397#define OPC_PUSH_r32	(0x50)
398#define OPC_PUSH_Iv	(0x68)
399#define OPC_PUSH_Ib	(0x6a)
400#define OPC_RET		(0xc3)
401#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
402#define OPC_SHIFT_1	(0xd1)
403#define OPC_SHIFT_Ib	(0xc1)
404#define OPC_SHIFT_cl	(0xd3)
405#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
406#define OPC_SHUFPS      (0xc6 | P_EXT)
407#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
408#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
409#define OPC_SHRD_Ib     (0xac | P_EXT)
410#define OPC_TESTB	(0x84)
411#define OPC_TESTL	(0x85)
412#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
413#define OPC_UD2         (0x0b | P_EXT)
414#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
415#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
416#define OPC_VPBLENDMB   (0x66 | P_EXT38 | P_DATA16 | P_EVEX)
417#define OPC_VPBLENDMW   (0x66 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
418#define OPC_VPBLENDMD   (0x64 | P_EXT38 | P_DATA16 | P_EVEX)
419#define OPC_VPBLENDMQ   (0x64 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
420#define OPC_VPCMPB      (0x3f | P_EXT3A | P_DATA16 | P_EVEX)
421#define OPC_VPCMPUB     (0x3e | P_EXT3A | P_DATA16 | P_EVEX)
422#define OPC_VPCMPW      (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
423#define OPC_VPCMPUW     (0x3e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
424#define OPC_VPCMPD      (0x1f | P_EXT3A | P_DATA16 | P_EVEX)
425#define OPC_VPCMPUD     (0x1e | P_EXT3A | P_DATA16 | P_EVEX)
426#define OPC_VPCMPQ      (0x1f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
427#define OPC_VPCMPUQ     (0x1e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
428#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
429#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
430#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
431#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
432#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
433#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
434#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
435#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
436#define OPC_VPMOVM2B    (0x28 | P_EXT38 | P_SIMDF3 | P_EVEX)
437#define OPC_VPMOVM2W    (0x28 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
438#define OPC_VPMOVM2D    (0x38 | P_EXT38 | P_SIMDF3 | P_EVEX)
439#define OPC_VPMOVM2Q    (0x38 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
440#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
441#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
442#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
443#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
444#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
445#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
446#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
447#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
448#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
449#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
450#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
451#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
452#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
453#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
454#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
455#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
456#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
457#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
458#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
459#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
460#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
461#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
462#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
463#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
464#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
465#define OPC_VPTESTMB    (0x26 | P_EXT38 | P_DATA16 | P_EVEX)
466#define OPC_VPTESTMW    (0x26 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
467#define OPC_VPTESTMD    (0x27 | P_EXT38 | P_DATA16 | P_EVEX)
468#define OPC_VPTESTMQ    (0x27 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
469#define OPC_VPTESTNMB   (0x26 | P_EXT38 | P_SIMDF3 | P_EVEX)
470#define OPC_VPTESTNMW   (0x26 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
471#define OPC_VPTESTNMD   (0x27 | P_EXT38 | P_SIMDF3 | P_EVEX)
472#define OPC_VPTESTNMQ   (0x27 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
473#define OPC_VZEROUPPER  (0x77 | P_EXT)
474#define OPC_XCHG_ax_r32	(0x90)
475#define OPC_XCHG_EvGv   (0x87)
476
477#define OPC_GRP3_Eb     (0xf6)
478#define OPC_GRP3_Ev     (0xf7)
479#define OPC_GRP5        (0xff)
480#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
481#define OPC_GRPBT       (0xba | P_EXT)
482
483#define OPC_GRPBT_BT    4
484#define OPC_GRPBT_BTS   5
485#define OPC_GRPBT_BTR   6
486#define OPC_GRPBT_BTC   7
487
488/* Group 1 opcode extensions for 0x80-0x83.
489   These are also used as modifiers for OPC_ARITH.  */
490#define ARITH_ADD 0
491#define ARITH_OR  1
492#define ARITH_ADC 2
493#define ARITH_SBB 3
494#define ARITH_AND 4
495#define ARITH_SUB 5
496#define ARITH_XOR 6
497#define ARITH_CMP 7
498
499/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
500#define SHIFT_ROL 0
501#define SHIFT_ROR 1
502#define SHIFT_SHL 4
503#define SHIFT_SHR 5
504#define SHIFT_SAR 7
505
506/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
507#define EXT3_TESTi 0
508#define EXT3_NOT   2
509#define EXT3_NEG   3
510#define EXT3_MUL   4
511#define EXT3_IMUL  5
512#define EXT3_DIV   6
513#define EXT3_IDIV  7
514
515/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
516#define EXT5_INC_Ev	0
517#define EXT5_DEC_Ev	1
518#define EXT5_CALLN_Ev	2
519#define EXT5_JMPN_Ev	4
520
521/* Condition codes to be added to OPC_JCC_{long,short}.  */
522#define JCC_JMP (-1)
523#define JCC_JO  0x0
524#define JCC_JNO 0x1
525#define JCC_JB  0x2
526#define JCC_JAE 0x3
527#define JCC_JE  0x4
528#define JCC_JNE 0x5
529#define JCC_JBE 0x6
530#define JCC_JA  0x7
531#define JCC_JS  0x8
532#define JCC_JNS 0x9
533#define JCC_JP  0xa
534#define JCC_JNP 0xb
535#define JCC_JL  0xc
536#define JCC_JGE 0xd
537#define JCC_JLE 0xe
538#define JCC_JG  0xf
539
540static const uint8_t tcg_cond_to_jcc[] = {
541    [TCG_COND_EQ] = JCC_JE,
542    [TCG_COND_NE] = JCC_JNE,
543    [TCG_COND_LT] = JCC_JL,
544    [TCG_COND_GE] = JCC_JGE,
545    [TCG_COND_LE] = JCC_JLE,
546    [TCG_COND_GT] = JCC_JG,
547    [TCG_COND_LTU] = JCC_JB,
548    [TCG_COND_GEU] = JCC_JAE,
549    [TCG_COND_LEU] = JCC_JBE,
550    [TCG_COND_GTU] = JCC_JA,
551    [TCG_COND_TSTEQ] = JCC_JE,
552    [TCG_COND_TSTNE] = JCC_JNE,
553};
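/*
 * TSTEQ and TSTNE reuse JE/JNE because tcg_out_cmp emits a TEST rather than
 * a CMP for those conditions (and returns a carry or sign jcc instead when
 * it can use BT or a sign test).
 */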
554
555#if TCG_TARGET_REG_BITS == 64
556static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
557{
558    int rex;
559
560    if (opc & P_GS) {
561        tcg_out8(s, 0x65);
562    }
563    if (opc & P_DATA16) {
564        /* We should never be asking for both 16-bit and 64-bit operation.  */
565        tcg_debug_assert((opc & P_REXW) == 0);
566        tcg_out8(s, 0x66);
567    }
568    if (opc & P_SIMDF3) {
569        tcg_out8(s, 0xf3);
570    } else if (opc & P_SIMDF2) {
571        tcg_out8(s, 0xf2);
572    }
573
574    rex = 0;
575    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
576    rex |= (r & 8) >> 1;                /* REX.R */
577    rex |= (x & 8) >> 2;                /* REX.X */
578    rex |= (rm & 8) >> 3;               /* REX.B */
579
580    /* P_REXB_{R,RM} indicates that the given register is the low byte.
581       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
582       as otherwise the encoding indicates %[abcd]h.  Note that the values
583       that are ORed in merely indicate that the REX byte must be present;
584       those bits get discarded in output.  */
585    rex |= opc & (r >= 4 ? P_REXB_R : 0);
586    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
587
588    if (rex) {
589        tcg_out8(s, (uint8_t)(rex | 0x40));
590    }
591
592    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
593        tcg_out8(s, 0x0f);
594        if (opc & P_EXT38) {
595            tcg_out8(s, 0x38);
596        } else if (opc & P_EXT3A) {
597            tcg_out8(s, 0x3a);
598        }
599    }
600
601    tcg_out8(s, opc);
602}
603#else
604static void tcg_out_opc(TCGContext *s, int opc)
605{
606    if (opc & P_DATA16) {
607        tcg_out8(s, 0x66);
608    }
609    if (opc & P_SIMDF3) {
610        tcg_out8(s, 0xf3);
611    } else if (opc & P_SIMDF2) {
612        tcg_out8(s, 0xf2);
613    }
614    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
615        tcg_out8(s, 0x0f);
616        if (opc & P_EXT38) {
617            tcg_out8(s, 0x38);
618        } else if (opc & P_EXT3A) {
619            tcg_out8(s, 0x3a);
620        }
621    }
622    tcg_out8(s, opc);
623}
624/* Discard the register arguments to tcg_out_opc early, so as not to penalize
625   the 32-bit compilation paths.  This method works with all versions of gcc,
626   whereas relying on optimization may not be able to exclude them.  */
627#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
628#endif
629
630static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
631{
632    tcg_out_opc(s, opc, r, rm, 0);
633    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
634}
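/*
 * For example, tcg_out_modrm(s, OPC_ADD_GvEv + P_REXW, TCG_REG_RAX,
 * TCG_REG_R8) emits 0x49 0x03 0xc0: REX.W+B, the 0x03 ADD opcode, and a
 * ModRM byte with mod=3, reg=%rax, r/m=%r8 ("addq %r8, %rax").
 */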
635
636static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
637                            int rm, int index)
638{
639    int tmp;
640
641    if (opc & P_GS) {
642        tcg_out8(s, 0x65);
643    }
644    /* Use the two byte form if possible, which cannot encode
645       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
646    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
647        && ((rm | index) & 8) == 0) {
648        /* Two byte VEX prefix.  */
649        tcg_out8(s, 0xc5);
650
651        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
652    } else {
653        /* Three byte VEX prefix.  */
654        tcg_out8(s, 0xc4);
655
656        /* VEX.m-mmmm */
657        if (opc & P_EXT3A) {
658            tmp = 3;
659        } else if (opc & P_EXT38) {
660            tmp = 2;
661        } else if (opc & P_EXT) {
662            tmp = 1;
663        } else {
664            g_assert_not_reached();
665        }
666        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
667        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
668        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
669        tcg_out8(s, tmp);
670
671        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
672    }
673
674    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
675    /* VEX.pp */
676    if (opc & P_DATA16) {
677        tmp |= 1;                          /* 0x66 */
678    } else if (opc & P_SIMDF3) {
679        tmp |= 2;                          /* 0xf3 */
680    } else if (opc & P_SIMDF2) {
681        tmp |= 3;                          /* 0xf2 */
682    }
683    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
684    tcg_out8(s, tmp);
685    tcg_out8(s, opc);
686}
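/*
 * Prefix layout recap: the two-byte form is 0xc5 {~R, ~vvvv, L, pp}; the
 * three-byte form is 0xc4 {~R, ~X, ~B, m-mmmm} {W, ~vvvv, L, pp}.  The
 * register and vvvv fields are stored inverted, which is why the code above
 * complements them.
 */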
687
688static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
689                             int rm, int index, int aaa, bool z)
690{
691    /* The entire 4-byte EVEX prefix, with R' and V' set. */
692    uint32_t p = 0x08041062;
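    /*
     * tcg_out32 stores this little-endian, so the bytes emitted are 0x62
     * (the EVEX escape), P0 = 0x10 (only the inverted R' bit set),
     * P1 = 0x04 (the fixed must-be-one bit) and P2 = 0x08 (the inverted
     * V' bit set); the deposit32 calls below fill in mm, RXB, pp, vvvv,
     * W, aaa, L'L and z.
     */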
693    int mm, pp;
694
695    tcg_debug_assert(have_avx512vl);
696
697    /* EVEX.mm */
698    if (opc & P_EXT3A) {
699        mm = 3;
700    } else if (opc & P_EXT38) {
701        mm = 2;
702    } else if (opc & P_EXT) {
703        mm = 1;
704    } else {
705        g_assert_not_reached();
706    }
707
708    /* EVEX.pp */
709    if (opc & P_DATA16) {
710        pp = 1;                          /* 0x66 */
711    } else if (opc & P_SIMDF3) {
712        pp = 2;                          /* 0xf3 */
713    } else if (opc & P_SIMDF2) {
714        pp = 3;                          /* 0xf2 */
715    } else {
716        pp = 0;
717    }
718
719    p = deposit32(p, 8, 2, mm);
720    p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
721    p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
722    p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
723    p = deposit32(p, 16, 2, pp);
724    p = deposit32(p, 19, 4, ~v);
725    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
726    p = deposit32(p, 24, 3, aaa);
727    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
728    p = deposit32(p, 31, 1, z);
729
730    tcg_out32(s, p);
731    tcg_out8(s, opc);
732}
733
734static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
735{
736    if (opc & P_EVEX) {
737        tcg_out_evex_opc(s, opc, r, v, rm, 0, 0, false);
738    } else {
739        tcg_out_vex_opc(s, opc, r, v, rm, 0);
740    }
741    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
742}
743
744static void tcg_out_vex_modrm_type(TCGContext *s, int opc,
745                                   int r, int v, int rm, TCGType type)
746{
747    if (type == TCG_TYPE_V256) {
748        opc |= P_VEXL;
749    }
750    tcg_out_vex_modrm(s, opc, r, v, rm);
751}
752
753static void tcg_out_evex_modrm_type(TCGContext *s, int opc, int r, int v,
754                                    int rm, int aaa, bool z, TCGType type)
755{
756    if (type == TCG_TYPE_V256) {
757        opc |= P_VEXL;
758    }
759    tcg_out_evex_opc(s, opc, r, v, rm, 0, aaa, z);
760    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
761}
762
763/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
764   A missing RM or INDEX is indicated by a negative value.  In 64-bit
765   mode for absolute addresses, ~RM is the size of the immediate operand
766   that will follow the instruction.  */
767
768static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
769                               int shift, intptr_t offset)
770{
771    int mod, len;
772
773    if (index < 0 && rm < 0) {
774        if (TCG_TARGET_REG_BITS == 64) {
775            /* Try for a rip-relative addressing mode.  This has replaced
776               the 32-bit-mode absolute addressing encoding.  */
777            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
778            intptr_t disp = offset - pc;
779            if (disp == (int32_t)disp) {
780                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
781                tcg_out32(s, disp);
782                return;
783            }
784
785            /* Try for an absolute address encoding.  This requires the
786               use of the MODRM+SIB encoding and is therefore larger than
787               rip-relative addressing.  */
788            if (offset == (int32_t)offset) {
789                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
790                tcg_out8(s, (4 << 3) | 5);
791                tcg_out32(s, offset);
792                return;
793            }
794
795            /* ??? The memory isn't directly addressable.  */
796            g_assert_not_reached();
797        } else {
798            /* Absolute address.  */
799            tcg_out8(s, (r << 3) | 5);
800            tcg_out32(s, offset);
801            return;
802        }
803    }
804
805    /* Find the length of the immediate addend.  Note that the encoding
806       that would be used for (%ebp) indicates absolute addressing.  */
807    if (rm < 0) {
808        mod = 0, len = 4, rm = 5;
809    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
810        mod = 0, len = 0;
811    } else if (offset == (int8_t)offset) {
812        mod = 0x40, len = 1;
813    } else {
814        mod = 0x80, len = 4;
815    }
816
817    /* Use a single byte MODRM format if possible.  Note that the encoding
818       that would be used for %esp is the escape to the two byte form.  */
819    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
820        /* Single byte MODRM format.  */
821        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
822    } else {
823        /* Two byte MODRM+SIB format.  */
824
825        /* Note that the encoding that would place %esp into the index
826           field indicates no index register.  In 64-bit mode, the REX.X
827           bit counts, so %r12 can be used as the index.  */
828        if (index < 0) {
829            index = 4;
830        } else {
831            tcg_debug_assert(index != TCG_REG_ESP);
832        }
833
834        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
835        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
836    }
837
838    if (len == 1) {
839        tcg_out8(s, offset);
840    } else if (len == 4) {
841        tcg_out32(s, offset);
842    }
843}
844
845static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
846                                     int index, int shift, intptr_t offset)
847{
848    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
849    tcg_out_sib_offset(s, r, rm, index, shift, offset);
850}
851
852static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
853                                         int rm, int index, int shift,
854                                         intptr_t offset)
855{
856    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
857    tcg_out_sib_offset(s, r, rm, index, shift, offset);
858}
859
860/* A simplification of the above with no index or shift.  */
861static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
862                                        int rm, intptr_t offset)
863{
864    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
865}
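/*
 * For example, tcg_out_modrm_offset(s, OPC_MOVL_GvEv, TCG_REG_EAX,
 * TCG_REG_EBX, 0x10) emits 0x8b 0x43 0x10 ("movl 0x10(%ebx), %eax"):
 * a mod=1 (disp8) ModRM byte and no SIB, since the base is not %esp.
 */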
866
867static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
868                                            int v, int rm, intptr_t offset)
869{
870    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
871}
872
873/* Output an opcode with an expected reference to the constant pool.  */
874static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
875{
876    tcg_out_opc(s, opc, r, 0, 0);
877    /* Absolute for 32-bit, pc-relative for 64-bit.  */
878    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
879    tcg_out32(s, 0);
880}
881
882/* Output an opcode with an expected reference to the constant pool.  */
883static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
884{
885    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
886    /* Absolute for 32-bit, pc-relative for 64-bit.  */
887    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
888    tcg_out32(s, 0);
889}
890
891/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
892static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
893{
894    /* Propagate an opcode prefix, such as P_REXW.  */
895    int ext = subop & ~0x7;
896    subop &= 0x7;
897
898    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
899}
900
901static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
902{
903    int rexw = 0;
904
905    if (arg == ret) {
906        return true;
907    }
908    switch (type) {
909    case TCG_TYPE_I64:
910        rexw = P_REXW;
911        /* fallthru */
912    case TCG_TYPE_I32:
913        if (ret < 16) {
914            if (arg < 16) {
915                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
916            } else {
917                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
918            }
919        } else {
920            if (arg < 16) {
921                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
922            } else {
923                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
924            }
925        }
926        break;
927
928    case TCG_TYPE_V64:
929        tcg_debug_assert(ret >= 16 && arg >= 16);
930        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
931        break;
932    case TCG_TYPE_V128:
933        tcg_debug_assert(ret >= 16 && arg >= 16);
934        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
935        break;
936    case TCG_TYPE_V256:
937        tcg_debug_assert(ret >= 16 && arg >= 16);
938        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
939        break;
940
941    default:
942        g_assert_not_reached();
943    }
944    return true;
945}
946
947static const int avx2_dup_insn[4] = {
948    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
949    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
950};
951
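/*
 * Without AVX2, tcg_out_dup_vec broadcasts element 0 by widening it in
 * place: PUNPCKLBW duplicates the byte into 16 bits, PUNPCKLWD widens that
 * to 32 bits, and PSHUFD(0) or PUNPCKLQDQ then replicates the 32- or 64-bit
 * element across the whole vector.
 */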
952static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
953                            TCGReg r, TCGReg a)
954{
955    if (have_avx2) {
956        tcg_out_vex_modrm_type(s, avx2_dup_insn[vece], r, 0, a, type);
957    } else {
958        switch (vece) {
959        case MO_8:
960            /* ??? With zero in a register, use PSHUFB.  */
961            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
962            a = r;
963            /* FALLTHRU */
964        case MO_16:
965            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
966            a = r;
967            /* FALLTHRU */
968        case MO_32:
969            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
970            /* imm8 operand: all output lanes selected from input lane 0.  */
971            tcg_out8(s, 0);
972            break;
973        case MO_64:
974            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
975            break;
976        default:
977            g_assert_not_reached();
978        }
979    }
980    return true;
981}
982
983static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
984                             TCGReg r, TCGReg base, intptr_t offset)
985{
986    if (have_avx2) {
987        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
988        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
989                                 r, 0, base, offset);
990    } else {
991        switch (vece) {
992        case MO_64:
993            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
994            break;
995        case MO_32:
996            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
997            break;
998        case MO_16:
999            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
1000            tcg_out8(s, 0); /* imm8 */
1001            tcg_out_dup_vec(s, type, vece, r, r);
1002            break;
1003        case MO_8:
1004            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
1005            tcg_out8(s, 0); /* imm8 */
1006            tcg_out_dup_vec(s, type, vece, r, r);
1007            break;
1008        default:
1009            g_assert_not_reached();
1010        }
1011    }
1012    return true;
1013}
1014
1015static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
1016                             TCGReg ret, int64_t arg)
1017{
1018    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
1019
1020    if (arg == 0) {
1021        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
1022        return;
1023    }
1024    if (arg == -1) {
1025        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
1026        return;
1027    }
1028
1029    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
1030        if (have_avx2) {
1031            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
1032        } else {
1033            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
1034        }
1035        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1036    } else {
1037        if (type == TCG_TYPE_V64) {
1038            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
1039        } else if (have_avx2) {
1040            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
1041        } else {
1042            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
1043        }
1044        if (TCG_TARGET_REG_BITS == 64) {
1045            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1046        } else {
1047            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
1048        }
1049    }
1050}
1051
1052static void tcg_out_movi_vec(TCGContext *s, TCGType type,
1053                             TCGReg ret, tcg_target_long arg)
1054{
1055    if (arg == 0) {
1056        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
1057        return;
1058    }
1059    if (arg == -1) {
1060        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
1061        return;
1062    }
1063
1064    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
1065    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1066    if (TCG_TARGET_REG_BITS == 64) {
1067        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1068    } else {
1069        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1070    }
1071}
1072
1073static void tcg_out_movi_int(TCGContext *s, TCGType type,
1074                             TCGReg ret, tcg_target_long arg)
1075{
1076    tcg_target_long diff;
1077
1078    if (arg == 0) {
1079        tgen_arithr(s, ARITH_XOR, ret, ret);
1080        return;
1081    }
1082    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1083        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1084        tcg_out32(s, arg);
1085        return;
1086    }
1087    if (arg == (int32_t)arg) {
1088        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1089        tcg_out32(s, arg);
1090        return;
1091    }
1092
1093    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1094    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1095    if (diff == (int32_t)diff) {
1096        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1097        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1098        tcg_out32(s, diff);
1099        return;
1100    }
1101
1102    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1103    tcg_out64(s, arg);
1104}
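/*
 * The cases above produce, e.g. with ret = TCG_REG_RAX, 0xb8 imm32
 * ("movl $imm, %eax", which zero-extends into %rax) when the value fits in
 * 32 bits unsigned, 0x48 0xc7 0xc0 imm32 when it sign-extends from 32 bits,
 * and the full 10-byte 0x48 0xb8 imm64 ("movabsq") only after the 7-byte
 * pc-relative LEA has been ruled out.
 */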
1105
1106static void tcg_out_movi(TCGContext *s, TCGType type,
1107                         TCGReg ret, tcg_target_long arg)
1108{
1109    switch (type) {
1110    case TCG_TYPE_I32:
1111#if TCG_TARGET_REG_BITS == 64
1112    case TCG_TYPE_I64:
1113#endif
1114        if (ret < 16) {
1115            tcg_out_movi_int(s, type, ret, arg);
1116        } else {
1117            tcg_out_movi_vec(s, type, ret, arg);
1118        }
1119        break;
1120    default:
1121        g_assert_not_reached();
1122    }
1123}
1124
1125static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1126{
1127    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1128    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1129    return true;
1130}
1131
1132static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1133                             tcg_target_long imm)
1134{
1135    /* This function is only used for passing structs by reference. */
1136    tcg_debug_assert(imm == (int32_t)imm);
1137    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1138}
1139
1140static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1141{
1142    if (val == (int8_t)val) {
1143        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1144        tcg_out8(s, val);
1145    } else if (val == (int32_t)val) {
1146        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1147        tcg_out32(s, val);
1148    } else {
1149        g_assert_not_reached();
1150    }
1151}
1152
1153static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1154{
1155    /* Given the strength of x86 memory ordering, we only need to care about
1156       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1157       faster than "mfence", so don't bother with the sse insn.  */
1158    if (a0 & TCG_MO_ST_LD) {
1159        tcg_out8(s, 0xf0);
1160        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1161        tcg_out8(s, 0);
1162    }
1163}
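/*
 * The barrier above is the byte sequence 0xf0 0x83 0x0c 0x24 0x00, i.e.
 * "lock orl $0, (%esp)": a locked read-modify-write of the top of stack,
 * which orders prior stores before later loads as TCG_MO_ST_LD requires.
 */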
1164
1165static inline void tcg_out_push(TCGContext *s, int reg)
1166{
1167    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1168}
1169
1170static inline void tcg_out_pop(TCGContext *s, int reg)
1171{
1172    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1173}
1174
1175static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1176                       TCGReg arg1, intptr_t arg2)
1177{
1178    switch (type) {
1179    case TCG_TYPE_I32:
1180        if (ret < 16) {
1181            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1182        } else {
1183            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1184        }
1185        break;
1186    case TCG_TYPE_I64:
1187        if (ret < 16) {
1188            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1189            break;
1190        }
1191        /* FALLTHRU */
1192    case TCG_TYPE_V64:
1193        /* There is no instruction that can validate 8-byte alignment.  */
1194        tcg_debug_assert(ret >= 16);
1195        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1196        break;
1197    case TCG_TYPE_V128:
1198        /*
1199         * The gvec infrastructure asserts that v128 vector loads
1200         * and stores use a 16-byte aligned offset.  Validate that the
1201         * final pointer is aligned by using an insn that will SIGSEGV.
1202         */
1203        tcg_debug_assert(ret >= 16);
1204        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1205        break;
1206    case TCG_TYPE_V256:
1207        /*
1208         * The gvec infrastructure only requires 16-byte alignment,
1209         * so here we must use an unaligned load.
1210         */
1211        tcg_debug_assert(ret >= 16);
1212        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1213                                 ret, 0, arg1, arg2);
1214        break;
1215    default:
1216        g_assert_not_reached();
1217    }
1218}
1219
1220static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1221                       TCGReg arg1, intptr_t arg2)
1222{
1223    switch (type) {
1224    case TCG_TYPE_I32:
1225        if (arg < 16) {
1226            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1227        } else {
1228            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1229        }
1230        break;
1231    case TCG_TYPE_I64:
1232        if (arg < 16) {
1233            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1234            break;
1235        }
1236        /* FALLTHRU */
1237    case TCG_TYPE_V64:
1238        /* There is no instruction that can validate 8-byte alignment.  */
1239        tcg_debug_assert(arg >= 16);
1240        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1241        break;
1242    case TCG_TYPE_V128:
1243        /*
1244         * The gvec infrastructure asserts that v128 vector loads
1245         * and stores use a 16-byte aligned offset.  Validate that the
1246         * final pointer is aligned by using an insn that will SIGSEGV.
1247         *
1248         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1249         * for _WIN64, which must have SSE2 but may not have AVX.
1250         */
1251        tcg_debug_assert(arg >= 16);
1252        if (have_avx1) {
1253            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1254        } else {
1255            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1256        }
1257        break;
1258    case TCG_TYPE_V256:
1259        /*
1260         * The gvec infrastructure only requires 16-byte alignment,
1261         * so here we must use an unaligned store.
1262         */
1263        tcg_debug_assert(arg >= 16);
1264        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1265                                 arg, 0, arg1, arg2);
1266        break;
1267    default:
1268        g_assert_not_reached();
1269    }
1270}
1271
1272static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1273                        TCGReg base, intptr_t ofs)
1274{
1275    int rexw = 0;
1276    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1277        if (val != (int32_t)val) {
1278            return false;
1279        }
1280        rexw = P_REXW;
1281    } else if (type != TCG_TYPE_I32) {
1282        return false;
1283    }
1284    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1285    tcg_out32(s, val);
1286    return true;
1287}
1288
1289static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1290{
1291    /* Propagate an opcode prefix, such as P_DATA16.  */
1292    int ext = subopc & ~0x7;
1293    subopc &= 0x7;
1294
1295    if (count == 1) {
1296        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1297    } else {
1298        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1299        tcg_out8(s, count);
1300    }
1301}
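/*
 * For example, tcg_out_shifti(s, SHIFT_SHL, TCG_REG_EAX, 4) emits
 * 0xc1 0xe0 0x04 ("shll $4, %eax"); a count of 1 uses the shorter 0xd1
 * encoding instead.
 */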
1302
1303static inline void tcg_out_bswap32(TCGContext *s, int reg)
1304{
1305    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1306}
1307
1308static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1309{
1310    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1311}
1312
1313static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1314{
1315    /* movzbl */
1316    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1317    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1318}
1319
1320static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1321{
1322    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1323    /* movsbl */
1324    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1325    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1326}
1327
1328static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1329{
1330    /* movzwl */
1331    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1332}
1333
1334static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1335{
1336    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1337    /* movsw[lq] */
1338    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1339}
1340
1341static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1342{
1343    /* 32-bit mov zero extends.  */
1344    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1345}
1346
1347static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1348{
1349    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1350    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1351}
1352
1353static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1354{
1355    tcg_out_ext32s(s, dest, src);
1356}
1357
1358static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1359{
1360    if (dest != src) {
1361        tcg_out_ext32u(s, dest, src);
1362    }
1363}
1364
1365static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1366{
1367    tcg_out_ext32u(s, dest, src);
1368}
1369
1370static inline void tcg_out_bswap64(TCGContext *s, int reg)
1371{
1372    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1373}
1374
1375static void tgen_arithi(TCGContext *s, int c, int r0,
1376                        tcg_target_long val, int cf)
1377{
1378    int rexw = 0;
1379
1380    if (TCG_TARGET_REG_BITS == 64) {
1381        rexw = c & -8;
1382        c &= 7;
1383    }
1384
1385    switch (c) {
1386    case ARITH_ADD:
1387    case ARITH_SUB:
1388        if (!cf) {
1389            /*
1390             * ??? While INC is 2 bytes shorter than ADDL $1, it also induces
1391             * partial flags update stalls on Pentium 4 and is not recommended
1392             * by current Intel optimization manuals.
1393             */
1394            if (val == 1 || val == -1) {
1395                int is_inc = (c == ARITH_ADD) ^ (val < 0);
1396                if (TCG_TARGET_REG_BITS == 64) {
1397                    /*
1398                     * The single-byte increment encodings are re-tasked
1399                     * as the REX prefixes.  Use the MODRM encoding.
1400                     */
1401                    tcg_out_modrm(s, OPC_GRP5 + rexw,
1402                                  (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1403                } else {
1404                    tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1405                }
1406                return;
1407            }
1408            if (val == 128) {
1409                /*
1410                 * Facilitate using an 8-bit immediate.  Carry is inverted
1411                 * by this transformation, so do it only if cf == 0.
1412                 */
1413                c ^= ARITH_ADD ^ ARITH_SUB;
1414                val = -128;
1415            }
1416        }
1417        break;
1418
1419    case ARITH_AND:
1420        if (TCG_TARGET_REG_BITS == 64) {
1421            if (val == 0xffffffffu) {
1422                tcg_out_ext32u(s, r0, r0);
1423                return;
1424            }
1425            if (val == (uint32_t)val) {
1426                /* AND with no high bits set can use a 32-bit operation.  */
1427                rexw = 0;
1428            }
1429        }
1430        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1431            tcg_out_ext8u(s, r0, r0);
1432            return;
1433        }
1434        if (val == 0xffffu) {
1435            tcg_out_ext16u(s, r0, r0);
1436            return;
1437        }
1438        break;
1439
1440    case ARITH_OR:
1441    case ARITH_XOR:
1442        if (val >= 0x80 && val <= 0xff
1443            && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1444            tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
1445            tcg_out8(s, val);
1446            return;
1447        }
1448        break;
1449    }
1450
1451    if (val == (int8_t)val) {
1452        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1453        tcg_out8(s, val);
1454        return;
1455    }
1456    if (rexw == 0 || val == (int32_t)val) {
1457        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1458        tcg_out32(s, val);
1459        return;
1460    }
1461
1462    g_assert_not_reached();
1463}
1464
1465static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1466{
1467    if (val != 0) {
1468        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1469    }
1470}
1471
1472/* Set SMALL to force a short forward branch.  */
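/*
 * Branch displacements are relative to the end of the branch insn: the
 * short forms (0xeb / 0x7x + rel8) are 2 bytes, JMP rel32 (0xe9) is 5 bytes
 * and Jcc rel32 (0x0f 0x8x) is 6 bytes, hence the -2/-5/-6 adjustments below.
 */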
1473static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1474{
1475    int32_t val, val1;
1476
1477    if (l->has_value) {
1478        val = tcg_pcrel_diff(s, l->u.value_ptr);
1479        val1 = val - 2;
1480        if ((int8_t)val1 == val1) {
1481            if (opc == -1) {
1482                tcg_out8(s, OPC_JMP_short);
1483            } else {
1484                tcg_out8(s, OPC_JCC_short + opc);
1485            }
1486            tcg_out8(s, val1);
1487        } else {
1488            tcg_debug_assert(!small);
1489            if (opc == -1) {
1490                tcg_out8(s, OPC_JMP_long);
1491                tcg_out32(s, val - 5);
1492            } else {
1493                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1494                tcg_out32(s, val - 6);
1495            }
1496        }
1497    } else if (small) {
1498        if (opc == -1) {
1499            tcg_out8(s, OPC_JMP_short);
1500        } else {
1501            tcg_out8(s, OPC_JCC_short + opc);
1502        }
1503        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1504        s->code_ptr += 1;
1505    } else {
1506        if (opc == -1) {
1507            tcg_out8(s, OPC_JMP_long);
1508        } else {
1509            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1510        }
1511        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1512        s->code_ptr += 4;
1513    }
1514}
1515
1516static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1,
1517                       TCGArg arg2, int const_arg2, int rexw)
1518{
1519    int jz, js;
1520
1521    if (!is_tst_cond(cond)) {
1522        if (!const_arg2) {
1523            tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1524        } else if (arg2 == 0) {
1525            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1526        } else {
1527            tcg_debug_assert(!rexw || arg2 == (int32_t)arg2);
1528            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1529        }
1530        return tcg_cond_to_jcc[cond];
1531    }
1532
1533    jz = tcg_cond_to_jcc[cond];
1534    js = (cond == TCG_COND_TSTNE ? JCC_JS : JCC_JNS);
1535
1536    if (!const_arg2) {
1537        tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2);
1538        return jz;
1539    }
1540
1541    if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) {
1542        if (arg2 == 0x80) {
1543            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
1544            return js;
1545        }
1546        if (arg2 == 0xff) {
1547            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
1548            return jz;
1549        }
1550        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1);
1551        tcg_out8(s, arg2);
1552        return jz;
1553    }
1554
1555    if ((arg2 & ~0xff00) == 0 && arg1 < 4) {
1556        if (arg2 == 0x8000) {
1557            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
1558            return js;
1559        }
1560        if (arg2 == 0xff00) {
1561            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
1562            return jz;
1563        }
1564        tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4);
1565        tcg_out8(s, arg2 >> 8);
1566        return jz;
1567    }
1568
1569    if (arg2 == 0xffff) {
1570        tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1);
1571        return jz;
1572    }
1573    if (arg2 == 0xffffffffu) {
1574        tcg_out_modrm(s, OPC_TESTL, arg1, arg1);
1575        return jz;
1576    }
1577
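    /*
     * A single-bit test can use BT, which copies the selected bit into CF;
     * the sign bit only needs a plain TEST and JS/JNS.
     */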
1578    if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) {
1579        int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE);
1580        int sh = ctz64(arg2);
1581
1582        rexw = (sh & 32 ? P_REXW : 0);
1583        if ((sh & 31) == 31) {
1584            tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1);
1585            return js;
1586        } else {
1587            tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1);
1588            tcg_out8(s, sh);
1589            return jc;
1590        }
1591    }
1592
1593    if (rexw) {
1594        if (arg2 == (uint32_t)arg2) {
1595            rexw = 0;
1596        } else {
1597            tcg_debug_assert(arg2 == (int32_t)arg2);
1598        }
1599    }
1600    tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1);
1601    tcg_out32(s, arg2);
1602    return jz;
1603}
1604
1605static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
1606                           TCGArg arg1, TCGArg arg2, int const_arg2,
1607                           TCGLabel *label, bool small)
1608{
1609    int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw);
1610    tcg_out_jxx(s, jcc, label, small);
1611}
1612
1613#if TCG_TARGET_REG_BITS == 32
1614static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1615                            const int *const_args, bool small)
1616{
1617    TCGLabel *label_next = gen_new_label();
1618    TCGLabel *label_this = arg_label(args[5]);
1619    TCGCond cond = args[4];
1620
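    /*
     * Decompose the double-word comparison: compare the high words first,
     * and when they do not decide the result, finish with an unsigned
     * comparison of the low words.
     */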
1621    switch (cond) {
1622    case TCG_COND_EQ:
1623    case TCG_COND_TSTEQ:
1624        tcg_out_brcond(s, 0, tcg_invert_cond(cond),
1625                       args[0], args[2], const_args[2], label_next, 1);
1626        tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
1627                       label_this, small);
1628        break;
1629    case TCG_COND_NE:
1630    case TCG_COND_TSTNE:
1631        tcg_out_brcond(s, 0, cond, args[0], args[2], const_args[2],
1632                       label_this, small);
1633        tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
1634                       label_this, small);
1635        break;
1636    case TCG_COND_LT:
1637        tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1638                       label_this, small);
1639        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1640        tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1641                       label_this, small);
1642        break;
1643    case TCG_COND_LE:
1644        tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1645                       label_this, small);
1646        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1647        tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1648                       label_this, small);
1649        break;
1650    case TCG_COND_GT:
1651        tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1652                       label_this, small);
1653        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1654        tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1655                       label_this, small);
1656        break;
1657    case TCG_COND_GE:
1658        tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1659                       label_this, small);
1660        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1661        tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1662                       label_this, small);
1663        break;
1664    case TCG_COND_LTU:
1665        tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1666                       label_this, small);
1667        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1668        tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1669                       label_this, small);
1670        break;
1671    case TCG_COND_LEU:
1672        tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1673                       label_this, small);
1674        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1675        tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1676                       label_this, small);
1677        break;
1678    case TCG_COND_GTU:
1679        tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1680                       label_this, small);
1681        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1682        tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1683                       label_this, small);
1684        break;
1685    case TCG_COND_GEU:
1686        tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1687                       label_this, small);
1688        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1689        tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1690                       label_this, small);
1691        break;
1692    default:
1693        g_assert_not_reached();
1694    }
1695    tcg_out_label(s, label_next);
1696}
1697#endif
1698
1699static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
1700                            TCGArg dest, TCGArg arg1, TCGArg arg2,
1701                            int const_arg2, bool neg)
1702{
1703    int cmp_rexw = rexw;
1704    bool inv = false;
1705    bool cleared;
1706    int jcc;
1707
1708    switch (cond) {
1709    case TCG_COND_NE:
1710        inv = true;
1711        /* fall through */
1712    case TCG_COND_EQ:
1713        /* If arg2 is 0, convert to LTU/GEU vs 1. */
1714        if (const_arg2 && arg2 == 0) {
1715            arg2 = 1;
1716            goto do_ltu;
1717        }
1718        break;
1719
1720    case TCG_COND_TSTNE:
1721        inv = true;
1722        /* fall through */
1723    case TCG_COND_TSTEQ:
1724        /* If arg2 is -1, convert to LTU/GEU vs 1. */
1725        if (const_arg2 && arg2 == 0xffffffffu) {
1726            arg2 = 1;
1727            cmp_rexw = 0;
1728            goto do_ltu;
1729        }
1730        break;
1731
1732    case TCG_COND_LEU:
1733        inv = true;
1734        /* fall through */
1735    case TCG_COND_GTU:
1736        /* If arg2 is a register, swap for LTU/GEU. */
1737        if (!const_arg2) {
1738            TCGReg t = arg1;
1739            arg1 = arg2;
1740            arg2 = t;
1741            goto do_ltu;
1742        }
1743        break;
1744
1745    case TCG_COND_GEU:
1746        inv = true;
1747        /* fall through */
1748    case TCG_COND_LTU:
1749    do_ltu:
1750        /*
1751         * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
1752         * We can then use NEG or INC to produce the desired result.
1753         * This is always smaller than the SETCC expansion.
1754         */
1755        tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw);
1756
1757        /* X - X - C = -C = (C ? -1 : 0) */
1758        tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
1759        if (inv && neg) {
1760            /* ~(C ? -1 : 0) = (C ? 0 : -1) */
1761            tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1762        } else if (inv) {
1763            /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
1764            tgen_arithi(s, ARITH_ADD, dest, 1, 0);
1765        } else if (!neg) {
1766            /* -(C ? -1 : 0) = (C ? 1 : 0) */
1767            tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
1768        }
1769        return;
1770
1771    case TCG_COND_GE:
1772        inv = true;
1773        /* fall through */
1774    case TCG_COND_LT:
1775        /* If arg2 is 0, extract the sign bit. */
1776        if (const_arg2 && arg2 == 0) {
1777            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1);
1778            if (inv) {
1779                tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1780            }
1781            tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
1782                           dest, rexw ? 63 : 31);
1783            return;
1784        }
1785        break;
1786
1787    default:
1788        break;
1789    }
1790
1791    /*
1792     * If dest does not overlap the inputs, clearing it first is preferred.
1793     * The XOR breaks any false dependency for the low-byte write to dest,
1794     * and is also one byte smaller than MOVZBL.
1795     */
1796    cleared = false;
1797    if (dest != arg1 && (const_arg2 || dest != arg2)) {
1798        tgen_arithr(s, ARITH_XOR, dest, dest);
1799        cleared = true;
1800    }
1801
1802    jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw);
1803    tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest);
1804
1805    if (!cleared) {
1806        tcg_out_ext8u(s, dest, dest);
1807    }
1808    if (neg) {
1809        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
1810    }
1811}
1812
1813#if TCG_TARGET_REG_BITS == 32
1814static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1815                             const int *const_args)
1816{
1817    TCGArg new_args[6];
1818    TCGLabel *label_true, *label_over;
1819
1820    memcpy(new_args, args+1, 5*sizeof(TCGArg));
1821
1822    if (args[0] == args[1] || args[0] == args[2]
1823        || (!const_args[3] && args[0] == args[3])
1824        || (!const_args[4] && args[0] == args[4])) {
1825        /* When the destination overlaps with one of the argument
1826           registers, don't do anything tricky.  */
1827        label_true = gen_new_label();
1828        label_over = gen_new_label();
1829
1830        new_args[5] = label_arg(label_true);
1831        tcg_out_brcond2(s, new_args, const_args+1, 1);
1832
1833        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1834        tcg_out_jxx(s, JCC_JMP, label_over, 1);
1835        tcg_out_label(s, label_true);
1836
1837        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1838        tcg_out_label(s, label_over);
1839    } else {
1840        /* When the destination does not overlap one of the arguments,
1841           clear the destination first, jump if cond false, and emit an
1842           increment in the true case.  This results in smaller code.  */
1843
1844        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1845
1846        label_over = gen_new_label();
1847        new_args[4] = tcg_invert_cond(new_args[4]);
1848        new_args[5] = label_arg(label_over);
1849        tcg_out_brcond2(s, new_args, const_args+1, 1);
1850
1851        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1852        tcg_out_label(s, label_over);
1853    }
1854}
1855#endif
1856
1857static void tcg_out_cmov(TCGContext *s, int jcc, int rexw,
1858                         TCGReg dest, TCGReg v1)
1859{
1860    tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1);
1861}
1862
1863static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
1864                            TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
1865                            TCGReg v1)
1866{
1867    int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw);
1868    tcg_out_cmov(s, jcc, rexw, dest, v1);
1869}
1870
1871static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1872                        TCGArg arg2, bool const_a2)
1873{
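    /*
     * TZCNT sets CF when the input is zero, so CMOVB supplies the fallback
     * value; BSF instead leaves the output undefined and sets ZF, so the
     * non-BMI1 path uses CMOVE.
     */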
1874    if (have_bmi1) {
1875        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1876        if (const_a2) {
1877            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1878        } else {
1879            tcg_debug_assert(dest != arg2);
1880            tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
1881        }
1882    } else {
1883        tcg_debug_assert(dest != arg2);
1884        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1885        tcg_out_cmov(s, JCC_JE, rexw, dest, arg2);
1886    }
1887}
1888
1889static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1890                        TCGArg arg2, bool const_a2)
1891{
1892    if (have_lzcnt) {
1893        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1894        if (const_a2) {
1895            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1896        } else {
1897            tcg_debug_assert(dest != arg2);
1898            tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
1899        }
1900    } else {
1901        tcg_debug_assert(!const_a2);
1902        tcg_debug_assert(dest != arg1);
1903        tcg_debug_assert(dest != arg2);
1904
1905        /* Recall that the output of BSR is the index not the count.  */
1906        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1907        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1908
1909        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1910        int jcc = tcg_out_cmp(s, TCG_COND_EQ, arg1, 0, 1, rexw);
1911        tcg_out_cmov(s, jcc, rexw, dest, arg2);
1912    }
1913}
1914
1915static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1916{
1917    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1918
1919    if (disp == (int32_t)disp) {
1920        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1921        tcg_out32(s, disp);
1922    } else {
1923        /* rip-relative addressing into the constant pool.
1924           This is 6 + 8 = 14 bytes, as compared to using an
1925           immediate load 10 + 6 = 16 bytes, plus we may
1926           be able to re-use the pool constant for more calls.  */
1927        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1928        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1929        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1930        tcg_out32(s, 0);
1931    }
1932}
1933
1934static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1935                         const TCGHelperInfo *info)
1936{
1937    tcg_out_branch(s, 1, dest);
1938
1939#ifndef _WIN32
1940    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1941        /*
1942         * The sysv i386 abi for struct return places a reference as the
1943         * first argument on the stack, and pops that argument with the
1944         * return statement.  Since we want to retain the aligned stack
1945         * pointer for the callee, we do not want to actually push that
1946         * argument before the call but rely on the normal store to the
1947         * stack slot.  But we do need to compensate for the pop in order
1948         * to reset our correct stack pointer value.
1949         * Pushing a garbage value back onto the stack is quickest.
1950         */
1951        tcg_out_push(s, TCG_REG_EAX);
1952    }
1953#endif
1954}
1955
1956static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1957{
1958    tcg_out_branch(s, 0, dest);
1959}
1960
1961static void tcg_out_nopn(TCGContext *s, int n)
1962{
1963    int i;
1964    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1965     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1966     * duplicate prefix, and all of the interesting recent cores can
1967     * decode and discard the duplicates in a single cycle.
1968     */
1969    tcg_debug_assert(n >= 1);
1970    for (i = 1; i < n; ++i) {
1971        tcg_out8(s, 0x66);
1972    }
1973    tcg_out8(s, 0x90);
1974}
1975
1976typedef struct {
1977    TCGReg base;
1978    int index;
1979    int ofs;
1980    int seg;
1981    TCGAtomAlign aa;
1982} HostAddress;
1983
1984bool tcg_target_has_memory_bswap(MemOp memop)
1985{
1986    TCGAtomAlign aa;
1987
1988    if (!have_movbe) {
1989        return false;
1990    }
1991    if ((memop & MO_SIZE) < MO_128) {
1992        return true;
1993    }
1994
1995    /*
1996     * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
1997     * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
1998     */
1999    aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
2000    return aa.atom < MO_128;
2001}
2002
2003/*
2004 * Because i686 has no register parameters and because x86_64 has xchg
2005 * to handle addr/data register overlap, we have placed all input arguments
2006 * before we might need a scratch reg.
2007 *
2008 * Even then, a scratch is only needed for l->raddr.  Rather than expose
2009 * a general-purpose scratch when we don't actually know it's available,
2010 * use the ra_gen hook to load into RAX if needed.
2011 */
2012#if TCG_TARGET_REG_BITS == 64
2013static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
2014{
2015    if (arg < 0) {
2016        arg = TCG_REG_RAX;
2017    }
2018    tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
2019    return arg;
2020}
2021static const TCGLdstHelperParam ldst_helper_param = {
2022    .ra_gen = ldst_ra_gen
2023};
2024#else
2025static const TCGLdstHelperParam ldst_helper_param = { };
2026#endif
2027
2028static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
2029                                TCGReg l, TCGReg h, TCGReg v)
2030{
2031    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
2032
2033    /* vpmov{d,q} %v, %l */
2034    tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
2035    /* vpextr{d,q} $1, %v, %h */
2036    tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
2037    tcg_out8(s, 1);
2038}
2039
2040static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
2041                                TCGReg v, TCGReg l, TCGReg h)
2042{
2043    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
2044
2045    /* vmov{d,q} %l, %v */
2046    tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
2047    /* vpinsr{d,q} $1, %h, %v, %v */
2048    tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
2049    tcg_out8(s, 1);
2050}
2051
2052/*
2053 * Generate code for the slow path for a load at the end of block
2054 */
2055static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
2056{
2057    MemOp opc = get_memop(l->oi);
2058    tcg_insn_unit **label_ptr = &l->label_ptr[0];
2059
2060    /* resolve label address */
2061    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
2062    if (label_ptr[1]) {
2063        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
2064    }
2065
2066    tcg_out_ld_helper_args(s, l, &ldst_helper_param);
2067    tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
2068    tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
2069
2070    tcg_out_jmp(s, l->raddr);
2071    return true;
2072}
2073
2074/*
2075 * Generate code for the slow path for a store at the end of block
2076 */
2077static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
2078{
2079    MemOp opc = get_memop(l->oi);
2080    tcg_insn_unit **label_ptr = &l->label_ptr[0];
2081
2082    /* resolve label address */
2083    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
2084    if (label_ptr[1]) {
2085        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
2086    }
2087
2088    tcg_out_st_helper_args(s, l, &ldst_helper_param);
2089    tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
2090
2091    tcg_out_jmp(s, l->raddr);
2092    return true;
2093}
2094
2095#ifdef CONFIG_USER_ONLY
2096static HostAddress x86_guest_base = {
2097    .index = -1
2098};
2099
2100#if defined(__x86_64__) && defined(__linux__)
2101# include <asm/prctl.h>
2102# include <sys/prctl.h>
2103int arch_prctl(int code, unsigned long addr);
2104static inline int setup_guest_base_seg(void)
2105{
2106    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
2107        return P_GS;
2108    }
2109    return 0;
2110}
2111#define setup_guest_base_seg  setup_guest_base_seg
2112#elif defined(__x86_64__) && \
2113      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
2114# include <machine/sysarch.h>
2115static inline int setup_guest_base_seg(void)
2116{
2117    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
2118        return P_GS;
2119    }
2120    return 0;
2121}
2122#define setup_guest_base_seg  setup_guest_base_seg
2123#endif
2124#else
2125# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; }))
2126#endif /* CONFIG_USER_ONLY */
2127#ifndef setup_guest_base_seg
2128# define setup_guest_base_seg()  0
2129#endif
2130
2131#define MIN_TLB_MASK_TABLE_OFS  INT_MIN
2132
2133/*
2134 * For softmmu, perform the TLB load and compare.
2135 * For useronly, perform any required alignment tests.
2136 * In both cases, return a TCGLabelQemuLdst structure if the slow path
2137 * is required and fill in @h with the host address for the fast path.
2138 */
2139static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
2140                                           TCGReg addrlo, TCGReg addrhi,
2141                                           MemOpIdx oi, bool is_ld)
2142{
2143    TCGLabelQemuLdst *ldst = NULL;
2144    MemOp opc = get_memop(oi);
2145    MemOp s_bits = opc & MO_SIZE;
2146    unsigned a_mask;
2147
2148    if (tcg_use_softmmu) {
2149        h->index = TCG_REG_L0;
2150        h->ofs = 0;
2151        h->seg = 0;
2152    } else {
2153        *h = x86_guest_base;
2154    }
2155    h->base = addrlo;
2156    h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
2157    a_mask = (1 << h->aa.align) - 1;
2158
2159    if (tcg_use_softmmu) {
2160        int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
2161                            : offsetof(CPUTLBEntry, addr_write);
2162        TCGType ttype = TCG_TYPE_I32;
2163        TCGType tlbtype = TCG_TYPE_I32;
2164        int trexw = 0, hrexw = 0, tlbrexw = 0;
2165        unsigned mem_index = get_mmuidx(oi);
2166        unsigned s_mask = (1 << s_bits) - 1;
2167        int fast_ofs = tlb_mask_table_ofs(s, mem_index);
2168        int tlb_mask;
2169
2170        ldst = new_ldst_label(s);
2171        ldst->is_ld = is_ld;
2172        ldst->oi = oi;
2173        ldst->addrlo_reg = addrlo;
2174        ldst->addrhi_reg = addrhi;
2175
2176        if (TCG_TARGET_REG_BITS == 64) {
2177            ttype = s->addr_type;
2178            trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
2179            if (TCG_TYPE_PTR == TCG_TYPE_I64) {
2180                hrexw = P_REXW;
2181                if (s->page_bits + s->tlb_dyn_max_bits > 32) {
2182                    tlbtype = TCG_TYPE_I64;
2183                    tlbrexw = P_REXW;
2184                }
2185            }
2186        }
2187
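        /*
         * Compute the CPUTLBEntry address: shift the page number down,
         * mask with CPUTLBDescFast.mask and add CPUTLBDescFast.table.
         */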
2188        tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
2189        tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
2190                       s->page_bits - CPU_TLB_ENTRY_BITS);
2191
2192        tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
2193                             fast_ofs + offsetof(CPUTLBDescFast, mask));
2194
2195        tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
2196                             fast_ofs + offsetof(CPUTLBDescFast, table));
2197
2198        /*
2199         * If the required alignment is at least as large as the access,
2200         * simply copy the address and mask.  For lesser alignments,
2201         * check that we don't cross pages for the complete access.
2202         */
2203        if (a_mask >= s_mask) {
2204            tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
2205        } else {
2206            tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
2207                                 addrlo, s_mask - a_mask);
2208        }
2209        tlb_mask = s->page_mask | a_mask;
2210        tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
2211
2212        /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
2213        tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
2214                             TCG_REG_L1, TCG_REG_L0, cmp_ofs);
2215
2216        /* jne slow_path */
2217        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2218        ldst->label_ptr[0] = s->code_ptr;
2219        s->code_ptr += 4;
2220
2221        if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
2222            /* cmp 4(TCG_REG_L0), addrhi */
2223            tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi,
2224                                 TCG_REG_L0, cmp_ofs + 4);
2225
2226            /* jne slow_path */
2227            tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2228            ldst->label_ptr[1] = s->code_ptr;
2229            s->code_ptr += 4;
2230        }
2231
2232        /* TLB Hit.  */
2233        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
2234                   offsetof(CPUTLBEntry, addend));
2235    } else if (a_mask) {
2236        int jcc;
2237
2238        ldst = new_ldst_label(s);
2239        ldst->is_ld = is_ld;
2240        ldst->oi = oi;
2241        ldst->addrlo_reg = addrlo;
2242        ldst->addrhi_reg = addrhi;
2243
2244        /* jne slow_path */
2245        jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addrlo, a_mask, true, false);
2246        tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0);
2247        ldst->label_ptr[0] = s->code_ptr;
2248        s->code_ptr += 4;
2249    }
2250
2251    return ldst;
2252}
2253
2254static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2255                                   HostAddress h, TCGType type, MemOp memop)
2256{
2257    bool use_movbe = false;
2258    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
2259    int movop = OPC_MOVL_GvEv;
2260
2261    /* Do big-endian loads with movbe.  */
2262    if (memop & MO_BSWAP) {
2263        tcg_debug_assert(have_movbe);
2264        use_movbe = true;
2265        movop = OPC_MOVBE_GyMy;
2266    }
2267
2268    switch (memop & MO_SSIZE) {
2269    case MO_UB:
2270        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
2271                                 h.base, h.index, 0, h.ofs);
2272        break;
2273    case MO_SB:
2274        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
2275                                 h.base, h.index, 0, h.ofs);
2276        break;
2277    case MO_UW:
2278        if (use_movbe) {
2279            /* There is no extending movbe; only low 16-bits are modified.  */
2280            if (datalo != h.base && datalo != h.index) {
2281                /* XOR breaks dependency chains.  */
2282                tgen_arithr(s, ARITH_XOR, datalo, datalo);
2283                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2284                                         datalo, h.base, h.index, 0, h.ofs);
2285            } else {
2286                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2287                                         datalo, h.base, h.index, 0, h.ofs);
2288                tcg_out_ext16u(s, datalo, datalo);
2289            }
2290        } else {
2291            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
2292                                     h.base, h.index, 0, h.ofs);
2293        }
2294        break;
2295    case MO_SW:
2296        if (use_movbe) {
2297            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2298                                     datalo, h.base, h.index, 0, h.ofs);
2299            tcg_out_ext16s(s, type, datalo, datalo);
2300        } else {
2301            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
2302                                     datalo, h.base, h.index, 0, h.ofs);
2303        }
2304        break;
2305    case MO_UL:
2306        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2307                                 h.base, h.index, 0, h.ofs);
2308        break;
2309#if TCG_TARGET_REG_BITS == 64
2310    case MO_SL:
2311        if (use_movbe) {
2312            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
2313                                     h.base, h.index, 0, h.ofs);
2314            tcg_out_ext32s(s, datalo, datalo);
2315        } else {
2316            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
2317                                     h.base, h.index, 0, h.ofs);
2318        }
2319        break;
2320#endif
2321    case MO_UQ:
2322        if (TCG_TARGET_REG_BITS == 64) {
2323            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2324                                     h.base, h.index, 0, h.ofs);
2325            break;
2326        }
2327        if (use_movbe) {
2328            TCGReg t = datalo;
2329            datalo = datahi;
2330            datahi = t;
2331        }
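        /*
         * If the address overlaps the low-part destination, form the full
         * address with LEA first so the first word load cannot clobber it.
         */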
2332        if (h.base == datalo || h.index == datalo) {
2333            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
2334                                     h.base, h.index, 0, h.ofs);
2335            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
2336            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
2337        } else {
2338            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2339                                     h.base, h.index, 0, h.ofs);
2340            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2341                                     h.base, h.index, 0, h.ofs + 4);
2342        }
2343        break;
2344
2345    case MO_128:
2346        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2347
2348        /*
2349         * Without 16-byte atomicity, use integer regs.
2350         * That is where we want the data, and it allows bswaps.
2351         */
2352        if (h.aa.atom < MO_128) {
2353            if (use_movbe) {
2354                TCGReg t = datalo;
2355                datalo = datahi;
2356                datahi = t;
2357            }
2358            if (h.base == datalo || h.index == datalo) {
2359                tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
2360                                         h.base, h.index, 0, h.ofs);
2361                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2362                                     datalo, datahi, 0);
2363                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2364                                     datahi, datahi, 8);
2365            } else {
2366                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2367                                         h.base, h.index, 0, h.ofs);
2368                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2369                                         h.base, h.index, 0, h.ofs + 8);
2370            }
2371            break;
2372        }
2373
2374        /*
2375         * With 16-byte atomicity, a vector load is required.
2376         * If we already have 16-byte alignment, then VMOVDQA always works.
2377         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2378         * Else we require a runtime test for alignment for VMOVDQA;
2379         * use VMOVDQU on the unaligned nonatomic path for simplicity.
2380         */
2381        if (h.aa.align >= MO_128) {
2382            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2383                                         TCG_TMP_VEC, 0,
2384                                         h.base, h.index, 0, h.ofs);
2385        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2386            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2387                                         TCG_TMP_VEC, 0,
2388                                         h.base, h.index, 0, h.ofs);
2389        } else {
2390            TCGLabel *l1 = gen_new_label();
2391            TCGLabel *l2 = gen_new_label();
2392            int jcc;
2393
2394            jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
2395            tcg_out_jxx(s, jcc, l1, true);
2396
2397            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2398                                         TCG_TMP_VEC, 0,
2399                                         h.base, h.index, 0, h.ofs);
2400            tcg_out_jxx(s, JCC_JMP, l2, true);
2401
2402            tcg_out_label(s, l1);
2403            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2404                                         TCG_TMP_VEC, 0,
2405                                         h.base, h.index, 0, h.ofs);
2406            tcg_out_label(s, l2);
2407        }
2408        tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
2409        break;
2410
2411    default:
2412        g_assert_not_reached();
2413    }
2414}
2415
2416static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2417                            TCGReg addrlo, TCGReg addrhi,
2418                            MemOpIdx oi, TCGType data_type)
2419{
2420    TCGLabelQemuLdst *ldst;
2421    HostAddress h;
2422
2423    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2424    tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2425
2426    if (ldst) {
2427        ldst->type = data_type;
2428        ldst->datalo_reg = datalo;
2429        ldst->datahi_reg = datahi;
2430        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2431    }
2432}
2433
2434static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2435                                   HostAddress h, MemOp memop)
2436{
2437    bool use_movbe = false;
2438    int movop = OPC_MOVL_EvGv;
2439
2440    /*
2441     * Do big-endian stores with movbe or system-mode.
2442     * User-only without movbe will have its swapping done generically.
2443     */
2444    if (memop & MO_BSWAP) {
2445        tcg_debug_assert(have_movbe);
2446        use_movbe = true;
2447        movop = OPC_MOVBE_MyGy;
2448    }
2449
2450    switch (memop & MO_SIZE) {
2451    case MO_8:
2452        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2453        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2454        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
2455                                 datalo, h.base, h.index, 0, h.ofs);
2456        break;
2457    case MO_16:
2458        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
2459                                 h.base, h.index, 0, h.ofs);
2460        break;
2461    case MO_32:
2462        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2463                                 h.base, h.index, 0, h.ofs);
2464        break;
2465    case MO_64:
2466        if (TCG_TARGET_REG_BITS == 64) {
2467            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2468                                     h.base, h.index, 0, h.ofs);
2469        } else {
2470            if (use_movbe) {
2471                TCGReg t = datalo;
2472                datalo = datahi;
2473                datahi = t;
2474            }
2475            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2476                                     h.base, h.index, 0, h.ofs);
2477            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2478                                     h.base, h.index, 0, h.ofs + 4);
2479        }
2480        break;
2481
2482    case MO_128:
2483        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2484
2485        /*
2486         * Without 16-byte atomicity, use integer regs.
2487         * That is where we have the data, and it allows bswaps.
2488         */
2489        if (h.aa.atom < MO_128) {
2490            if (use_movbe) {
2491                TCGReg t = datalo;
2492                datalo = datahi;
2493                datahi = t;
2494            }
2495            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2496                                     h.base, h.index, 0, h.ofs);
2497            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2498                                     h.base, h.index, 0, h.ofs + 8);
2499            break;
2500        }
2501
2502        /*
2503         * With 16-byte atomicity, a vector store is required.
2504         * If we already have 16-byte alignment, then VMOVDQA always works.
2505         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2506         * Else we require a runtime test for alignment for VMOVDQA;
2507         * use VMOVDQU on the unaligned nonatomic path for simplicity.
2508         */
2509        tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
2510        if (h.aa.align >= MO_128) {
2511            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2512                                         TCG_TMP_VEC, 0,
2513                                         h.base, h.index, 0, h.ofs);
2514        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2515            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2516                                         TCG_TMP_VEC, 0,
2517                                         h.base, h.index, 0, h.ofs);
2518        } else {
2519            TCGLabel *l1 = gen_new_label();
2520            TCGLabel *l2 = gen_new_label();
2521            int jcc;
2522
2523            jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
2524            tcg_out_jxx(s, jcc, l1, true);
2525
2526            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2527                                         TCG_TMP_VEC, 0,
2528                                         h.base, h.index, 0, h.ofs);
2529            tcg_out_jxx(s, JCC_JMP, l2, true);
2530
2531            tcg_out_label(s, l1);
2532            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2533                                         TCG_TMP_VEC, 0,
2534                                         h.base, h.index, 0, h.ofs);
2535            tcg_out_label(s, l2);
2536        }
2537        break;
2538
2539    default:
2540        g_assert_not_reached();
2541    }
2542}
2543
2544static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2545                            TCGReg addrlo, TCGReg addrhi,
2546                            MemOpIdx oi, TCGType data_type)
2547{
2548    TCGLabelQemuLdst *ldst;
2549    HostAddress h;
2550
2551    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2552    tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2553
2554    if (ldst) {
2555        ldst->type = data_type;
2556        ldst->datalo_reg = datalo;
2557        ldst->datahi_reg = datahi;
2558        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2559    }
2560}
2561
2562static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2563{
2564    /* Reuse the zeroing that exists for goto_ptr.  */
2565    if (a0 == 0) {
2566        tcg_out_jmp(s, tcg_code_gen_epilogue);
2567    } else {
2568        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2569        tcg_out_jmp(s, tb_ret_addr);
2570    }
2571}
2572
2573static void tcg_out_goto_tb(TCGContext *s, int which)
2574{
2575    /*
2576     * Jump displacement must be aligned for atomic patching;
2577     * see if we need to add extra nops before jump
2578     */
2579    int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2580    if (gap != 1) {
2581        tcg_out_nopn(s, gap - 1);
2582    }
2583    tcg_out8(s, OPC_JMP_long); /* jmp im */
2584    set_jmp_insn_offset(s, which);
2585    tcg_out32(s, 0);
2586    set_jmp_reset_offset(s, which);
2587}
2588
2589void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2590                              uintptr_t jmp_rx, uintptr_t jmp_rw)
2591{
2592    /* patch the branch destination */
2593    uintptr_t addr = tb->jmp_target_addr[n];
2594    qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2595    /* no need to flush icache explicitly */
2596}
2597
2598static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2599                              const TCGArg args[TCG_MAX_OP_ARGS],
2600                              const int const_args[TCG_MAX_OP_ARGS])
2601{
2602    TCGArg a0, a1, a2;
2603    int c, const_a2, vexop, rexw = 0;
2604
2605#if TCG_TARGET_REG_BITS == 64
2606# define OP_32_64(x) \
2607        case glue(glue(INDEX_op_, x), _i64): \
2608            rexw = P_REXW; /* FALLTHRU */    \
2609        case glue(glue(INDEX_op_, x), _i32)
2610#else
2611# define OP_32_64(x) \
2612        case glue(glue(INDEX_op_, x), _i32)
2613#endif
2614
2615    /* Hoist the loads of the most common arguments.  */
2616    a0 = args[0];
2617    a1 = args[1];
2618    a2 = args[2];
2619    const_a2 = const_args[2];
2620
2621    switch (opc) {
2622    case INDEX_op_goto_ptr:
2623        /* jmp to the given host address (could be epilogue) */
2624        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2625        break;
2626    case INDEX_op_br:
2627        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2628        break;
2629    OP_32_64(ld8u):
2630        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2631        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2632        break;
2633    OP_32_64(ld8s):
2634        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2635        break;
2636    OP_32_64(ld16u):
2637        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2638        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2639        break;
2640    OP_32_64(ld16s):
2641        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2642        break;
2643#if TCG_TARGET_REG_BITS == 64
2644    case INDEX_op_ld32u_i64:
2645#endif
2646    case INDEX_op_ld_i32:
2647        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2648        break;
2649
2650    OP_32_64(st8):
2651        if (const_args[0]) {
2652            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2653            tcg_out8(s, a0);
2654        } else {
2655            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2656        }
2657        break;
2658    OP_32_64(st16):
2659        if (const_args[0]) {
2660            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2661            tcg_out16(s, a0);
2662        } else {
2663            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2664        }
2665        break;
2666#if TCG_TARGET_REG_BITS == 64
2667    case INDEX_op_st32_i64:
2668#endif
2669    case INDEX_op_st_i32:
2670        if (const_args[0]) {
2671            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2672            tcg_out32(s, a0);
2673        } else {
2674            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2675        }
2676        break;
2677
2678    OP_32_64(add):
2679        /* For 3-operand addition, use LEA.  */
2680        if (a0 != a1) {
2681            TCGArg c3 = 0;
2682            if (const_a2) {
2683                c3 = a2, a2 = -1;
2684            } else if (a0 == a2) {
2685                /* Watch out for dest = src + dest, since we've removed
2686                   the matching constraint on the add.  */
2687                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2688                break;
2689            }
2690
2691            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2692            break;
2693        }
2694        c = ARITH_ADD;
2695        goto gen_arith;
2696    OP_32_64(sub):
2697        c = ARITH_SUB;
2698        goto gen_arith;
2699    OP_32_64(and):
2700        c = ARITH_AND;
2701        goto gen_arith;
2702    OP_32_64(or):
2703        c = ARITH_OR;
2704        goto gen_arith;
2705    OP_32_64(xor):
2706        c = ARITH_XOR;
2707        goto gen_arith;
2708    gen_arith:
2709        if (const_a2) {
2710            tgen_arithi(s, c + rexw, a0, a2, 0);
2711        } else {
2712            tgen_arithr(s, c + rexw, a0, a2);
2713        }
2714        break;
2715
2716    OP_32_64(andc):
2717        if (const_a2) {
2718            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2719            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2720        } else {
2721            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2722        }
2723        break;
2724
2725    OP_32_64(mul):
2726        if (const_a2) {
2727            int32_t val;
2728            val = a2;
2729            if (val == (int8_t)val) {
2730                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2731                tcg_out8(s, val);
2732            } else {
2733                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2734                tcg_out32(s, val);
2735            }
2736        } else {
2737            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2738        }
2739        break;
2740
2741    OP_32_64(div2):
2742        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2743        break;
2744    OP_32_64(divu2):
2745        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2746        break;
2747
2748    OP_32_64(shl):
2749        /* For small constant 3-operand shift, use LEA.  */
2750        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2751            if (a2 - 1 == 0) {
2752                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2753                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2754            } else {
2755                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2756                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2757            }
2758            break;
2759        }
2760        c = SHIFT_SHL;
2761        vexop = OPC_SHLX;
2762        goto gen_shift_maybe_vex;
2763    OP_32_64(shr):
2764        c = SHIFT_SHR;
2765        vexop = OPC_SHRX;
2766        goto gen_shift_maybe_vex;
2767    OP_32_64(sar):
2768        c = SHIFT_SAR;
2769        vexop = OPC_SARX;
2770        goto gen_shift_maybe_vex;
2771    OP_32_64(rotl):
2772        c = SHIFT_ROL;
2773        goto gen_shift;
2774    OP_32_64(rotr):
2775        c = SHIFT_ROR;
2776        goto gen_shift;
2777    gen_shift_maybe_vex:
2778        if (have_bmi2) {
2779            if (!const_a2) {
2780                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2781                break;
2782            }
2783            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2784        }
2785        /* FALLTHRU */
2786    gen_shift:
2787        if (const_a2) {
2788            tcg_out_shifti(s, c + rexw, a0, a2);
2789        } else {
2790            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2791        }
2792        break;
2793
2794    OP_32_64(ctz):
2795        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2796        break;
2797    OP_32_64(clz):
2798        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2799        break;
2800    OP_32_64(ctpop):
2801        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2802        break;
2803
2804    OP_32_64(brcond):
2805        tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1],
2806                       arg_label(args[3]), 0);
2807        break;
2808    OP_32_64(setcond):
2809        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false);
2810        break;
2811    OP_32_64(negsetcond):
2812        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true);
2813        break;
2814    OP_32_64(movcond):
2815        tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]);
2816        break;
2817
2818    OP_32_64(bswap16):
2819        if (a2 & TCG_BSWAP_OS) {
2820            /* Output must be sign-extended. */
2821            if (rexw) {
2822                tcg_out_bswap64(s, a0);
2823                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2824            } else {
2825                tcg_out_bswap32(s, a0);
2826                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2827            }
2828        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2829            /* Output must be zero-extended, but input isn't. */
2830            tcg_out_bswap32(s, a0);
2831            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2832        } else {
2833            tcg_out_rolw_8(s, a0);
2834        }
2835        break;
2836    OP_32_64(bswap32):
2837        tcg_out_bswap32(s, a0);
2838        if (rexw && (a2 & TCG_BSWAP_OS)) {
2839            tcg_out_ext32s(s, a0, a0);
2840        }
2841        break;
2842
2843    OP_32_64(neg):
2844        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2845        break;
2846    OP_32_64(not):
2847        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2848        break;
2849
2850    case INDEX_op_qemu_ld_a64_i32:
2851        if (TCG_TARGET_REG_BITS == 32) {
2852            tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2853            break;
2854        }
2855        /* fall through */
2856    case INDEX_op_qemu_ld_a32_i32:
2857        tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2858        break;
2859    case INDEX_op_qemu_ld_a32_i64:
2860        if (TCG_TARGET_REG_BITS == 64) {
2861            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2862        } else {
2863            tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2864        }
2865        break;
2866    case INDEX_op_qemu_ld_a64_i64:
2867        if (TCG_TARGET_REG_BITS == 64) {
2868            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2869        } else {
2870            tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2871        }
2872        break;
2873    case INDEX_op_qemu_ld_a32_i128:
2874    case INDEX_op_qemu_ld_a64_i128:
2875        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2876        tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2877        break;
2878
2879    case INDEX_op_qemu_st_a64_i32:
2880    case INDEX_op_qemu_st8_a64_i32:
2881        if (TCG_TARGET_REG_BITS == 32) {
2882            tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2883            break;
2884        }
2885        /* fall through */
2886    case INDEX_op_qemu_st_a32_i32:
2887    case INDEX_op_qemu_st8_a32_i32:
2888        tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2889        break;
2890    case INDEX_op_qemu_st_a32_i64:
2891        if (TCG_TARGET_REG_BITS == 64) {
2892            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2893        } else {
2894            tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2895        }
2896        break;
2897    case INDEX_op_qemu_st_a64_i64:
2898        if (TCG_TARGET_REG_BITS == 64) {
2899            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2900        } else {
2901            tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2902        }
2903        break;
2904    case INDEX_op_qemu_st_a32_i128:
2905    case INDEX_op_qemu_st_a64_i128:
2906        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2907        tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2908        break;
2909
2910    OP_32_64(mulu2):
2911        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2912        break;
2913    OP_32_64(muls2):
2914        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2915        break;
2916    OP_32_64(add2):
2917        if (const_args[4]) {
2918            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2919        } else {
2920            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2921        }
2922        if (const_args[5]) {
2923            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2924        } else {
2925            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2926        }
2927        break;
2928    OP_32_64(sub2):
2929        if (const_args[4]) {
2930            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2931        } else {
2932            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2933        }
2934        if (const_args[5]) {
2935            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2936        } else {
2937            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2938        }
2939        break;
2940
2941#if TCG_TARGET_REG_BITS == 32
2942    case INDEX_op_brcond2_i32:
2943        tcg_out_brcond2(s, args, const_args, 0);
2944        break;
2945    case INDEX_op_setcond2_i32:
2946        tcg_out_setcond2(s, args, const_args);
2947        break;
2948#else /* TCG_TARGET_REG_BITS == 64 */
2949    case INDEX_op_ld32s_i64:
2950        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2951        break;
2952    case INDEX_op_ld_i64:
2953        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2954        break;
2955    case INDEX_op_st_i64:
2956        if (const_args[0]) {
2957            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2958            tcg_out32(s, a0);
2959        } else {
2960            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2961        }
2962        break;
2963
2964    case INDEX_op_bswap64_i64:
2965        tcg_out_bswap64(s, a0);
2966        break;
2967    case INDEX_op_extrh_i64_i32:
2968        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2969        break;
2970#endif
2971
2972    OP_32_64(deposit):
2973        if (args[3] == 0 && args[4] == 8) {
2974            /* load bits 0..7 */
2975            if (const_a2) {
2976                tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0),
2977                            0, a0, 0);
2978                tcg_out8(s, a2);
2979            } else {
2980                tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2981            }
2982        } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) {
2983            /* load bits 8..15 */
2984            if (const_a2) {
2985                tcg_out8(s, OPC_MOVB_Ib + a0 + 4);
2986                tcg_out8(s, a2);
2987            } else {
2988                tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2989            }
2990        } else if (args[3] == 0 && args[4] == 16) {
2991            /* load bits 0..15 */
2992            if (const_a2) {
2993                tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0),
2994                            0, a0, 0);
2995                tcg_out16(s, a2);
2996            } else {
2997                tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2998            }
2999        } else {
3000            g_assert_not_reached();
3001        }
3002        break;
3003
3004    case INDEX_op_extract_i64:
3005        if (a2 + args[3] == 32) {
3006            /* This is a 32-bit zero-extending right shift.  */
3007            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
3008            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
3009            break;
3010        }
3011        /* FALLTHRU */
3012    case INDEX_op_extract_i32:
3013        /* On the off-chance that we can use the high-byte registers.
3014           Otherwise we emit the same ext16 + shift pattern that we
3015           would have gotten from the normal tcg-op.c expansion.  */
3016        tcg_debug_assert(a2 == 8 && args[3] == 8);
3017        if (a1 < 4 && a0 < 8) {
3018            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
3019        } else {
3020            tcg_out_ext16u(s, a0, a1);
3021            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
3022        }
3023        break;
3024
3025    case INDEX_op_sextract_i32:
3026        /* We don't implement sextract_i64, as we cannot sign-extend to
3027           64-bits without using the REX prefix that explicitly excludes
3028           access to the high-byte registers.  */
3029        tcg_debug_assert(a2 == 8 && args[3] == 8);
3030        if (a1 < 4 && a0 < 8) {
3031            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
3032        } else {
3033            tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
3034            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
3035        }
3036        break;
3037
3038    OP_32_64(extract2):
3039        /* Note that SHRD outputs to the r/m operand.  */
3040        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
3041        tcg_out8(s, args[3]);
3042        break;
3043
3044    case INDEX_op_mb:
3045        tcg_out_mb(s, a0);
3046        break;
3047    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
3048    case INDEX_op_mov_i64:
3049    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
3050    case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
3051    case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
3052    case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
3053    case INDEX_op_ext8s_i64:
3054    case INDEX_op_ext8u_i32:
3055    case INDEX_op_ext8u_i64:
3056    case INDEX_op_ext16s_i32:
3057    case INDEX_op_ext16s_i64:
3058    case INDEX_op_ext16u_i32:
3059    case INDEX_op_ext16u_i64:
3060    case INDEX_op_ext32s_i64:
3061    case INDEX_op_ext32u_i64:
3062    case INDEX_op_ext_i32_i64:
3063    case INDEX_op_extu_i32_i64:
3064    case INDEX_op_extrl_i64_i32:
3065    default:
3066        g_assert_not_reached();
3067    }
3068
3069#undef OP_32_64
3070}
3071
3072static int const umin_insn[4] = {
3073    OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
3074};
3075
3076static int const umax_insn[4] = {
3077    OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
3078};
3079
3080static bool tcg_out_cmp_vec_noinv(TCGContext *s, TCGType type, unsigned vece,
3081                                  TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond)
3082{
3083    static int const cmpeq_insn[4] = {
3084        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
3085    };
3086    static int const cmpgt_insn[4] = {
3087        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
3088    };
3089
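    /*
     * SSE/AVX2 only provide equality (PCMPEQ*) and signed greater-than
     * (PCMPGT*) element compares.  Every other condition is reduced to
     * one of those two by inverting the result, swapping the operands,
     * and/or comparing against an unsigned min/max, per the fixup table.
     */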
3090    enum {
3091        NEED_INV  = 1,
3092        NEED_SWAP = 2,
3093        NEED_UMIN = 4,
3094        NEED_UMAX = 8,
3095        INVALID   = 16,
3096    };
3097    static const uint8_t cond_fixup[16] = {
3098        [0 ... 15] = INVALID,
3099        [TCG_COND_EQ] = 0,
3100        [TCG_COND_GT] = 0,
3101        [TCG_COND_NE] = NEED_INV,
3102        [TCG_COND_LE] = NEED_INV,
3103        [TCG_COND_LT] = NEED_SWAP,
3104        [TCG_COND_GE] = NEED_SWAP | NEED_INV,
3105        [TCG_COND_LEU] = NEED_UMIN,
3106        [TCG_COND_GTU] = NEED_UMIN | NEED_INV,
3107        [TCG_COND_GEU] = NEED_UMAX,
3108        [TCG_COND_LTU] = NEED_UMAX | NEED_INV,
3109    };
3110    int fixup = cond_fixup[cond];
3111
3112    assert(!(fixup & INVALID));
3113
3114    if (fixup & NEED_INV) {
3115        cond = tcg_invert_cond(cond);
3116    }
3117
3118    if (fixup & NEED_SWAP) {
3119        TCGReg swap = v1;
3120        v1 = v2;
3121        v2 = swap;
3122        cond = tcg_swap_cond(cond);
3123    }
3124
3125    if (fixup & (NEED_UMIN | NEED_UMAX)) {
3126        int op = (fixup & NEED_UMIN ? umin_insn[vece] : umax_insn[vece]);
3127
3128        /* avx2 does not have 64-bit min/max; adjusted during expand. */
3129        assert(vece <= MO_32);
3130
3131        tcg_out_vex_modrm_type(s, op, TCG_TMP_VEC, v1, v2, type);
3132        v2 = TCG_TMP_VEC;
3133        cond = TCG_COND_EQ;
3134    }
3135
3136    switch (cond) {
3137    case TCG_COND_EQ:
3138        tcg_out_vex_modrm_type(s, cmpeq_insn[vece], v0, v1, v2, type);
3139        break;
3140    case TCG_COND_GT:
3141        tcg_out_vex_modrm_type(s, cmpgt_insn[vece], v0, v1, v2, type);
3142        break;
3143    default:
3144        g_assert_not_reached();
3145    }
3146    return fixup & NEED_INV;
3147}
3148
3149static void tcg_out_cmp_vec_k1(TCGContext *s, TCGType type, unsigned vece,
3150                               TCGReg v1, TCGReg v2, TCGCond cond)
3151{
3152    static const int cmpm_insn[2][4] = {
3153        { OPC_VPCMPB, OPC_VPCMPW, OPC_VPCMPD, OPC_VPCMPQ },
3154        { OPC_VPCMPUB, OPC_VPCMPUW, OPC_VPCMPUD, OPC_VPCMPUQ }
3155    };
3156    static const int testm_insn[4] = {
3157        OPC_VPTESTMB, OPC_VPTESTMW, OPC_VPTESTMD, OPC_VPTESTMQ
3158    };
3159    static const int testnm_insn[4] = {
3160        OPC_VPTESTNMB, OPC_VPTESTNMW, OPC_VPTESTNMD, OPC_VPTESTNMQ
3161    };
3162
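    /*
     * Immediate predicate encoding for VPCMP{B,W,D,Q} and VPCMPU*:
     * 0 = EQ, 1 = LT, 2 = LE, 3 = FALSE, 4 = NE, 5 = NLT, 6 = NLE, 7 = TRUE.
     */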
3163    static const int cond_ext[16] = {
3164        [TCG_COND_EQ] = 0,
3165        [TCG_COND_NE] = 4,
3166        [TCG_COND_LT] = 1,
3167        [TCG_COND_LTU] = 1,
3168        [TCG_COND_LE] = 2,
3169        [TCG_COND_LEU] = 2,
3170        [TCG_COND_NEVER] = 3,
3171        [TCG_COND_GE] = 5,
3172        [TCG_COND_GEU] = 5,
3173        [TCG_COND_GT] = 6,
3174        [TCG_COND_GTU] = 6,
3175        [TCG_COND_ALWAYS] = 7,
3176    };
3177
3178    switch (cond) {
3179    case TCG_COND_TSTNE:
3180        tcg_out_vex_modrm_type(s, testm_insn[vece], /* k1 */ 1, v1, v2, type);
3181        break;
3182    case TCG_COND_TSTEQ:
3183        tcg_out_vex_modrm_type(s, testnm_insn[vece], /* k1 */ 1, v1, v2, type);
3184        break;
3185    default:
3186        tcg_out_vex_modrm_type(s, cmpm_insn[is_unsigned_cond(cond)][vece],
3187                               /* k1 */ 1, v1, v2, type);
3188        tcg_out8(s, cond_ext[cond]);
3189        break;
3190    }
3191}
3192
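/*
 * Expand the comparison result held in mask register k1 into a full
 * vector: VPMOVM2* sets each destination element to all-ones or all-zero
 * according to the corresponding mask bit.
 */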
3193static void tcg_out_k1_to_vec(TCGContext *s, TCGType type,
3194                              unsigned vece, TCGReg dest)
3195{
3196    static const int movm_insn[] = {
3197        OPC_VPMOVM2B, OPC_VPMOVM2W, OPC_VPMOVM2D, OPC_VPMOVM2Q
3198    };
3199    tcg_out_vex_modrm_type(s, movm_insn[vece], dest, 0, /* k1 */ 1, type);
3200}
3201
3202static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece,
3203                            TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond)
3204{
3205    /*
3206     * With avx512, we have a complete set of comparisons into mask.
3207     * Unless there's a single insn expansion for the comparison,
3208     * expand via a mask in k1.
3209     */
3210    if ((vece <= MO_16 ? have_avx512bw : have_avx512dq)
3211        && cond != TCG_COND_EQ
3212        && cond != TCG_COND_LT
3213        && cond != TCG_COND_GT) {
3214        tcg_out_cmp_vec_k1(s, type, vece, v1, v2, cond);
3215        tcg_out_k1_to_vec(s, type, vece, v0);
3216        return;
3217    }
3218
3219    if (tcg_out_cmp_vec_noinv(s, type, vece, v0, v1, v2, cond)) {
3220        tcg_out_dupi_vec(s, type, vece, TCG_TMP_VEC, -1);
3221        tcg_out_vex_modrm_type(s, OPC_PXOR, v0, v0, TCG_TMP_VEC, type);
3222    }
3223}
3224
3225static void tcg_out_cmpsel_vec_k1(TCGContext *s, TCGType type, unsigned vece,
3226                                  TCGReg v0, TCGReg c1, TCGReg c2,
3227                                  TCGReg v3, TCGReg v4, TCGCond cond)
3228{
3229    static const int vpblendm_insn[] = {
3230        OPC_VPBLENDMB, OPC_VPBLENDMW, OPC_VPBLENDMD, OPC_VPBLENDMQ
3231    };
3232    bool z = false;
3233
3234    /* Swap to place constant in V4 to take advantage of zero-masking. */
3235    if (!v3) {
3236        z = true;
3237        v3 = v4;
3238        cond = tcg_invert_cond(cond);
3239    }
3240
3241    tcg_out_cmp_vec_k1(s, type, vece, c1, c2, cond);
3242    tcg_out_evex_modrm_type(s, vpblendm_insn[vece], v0, v4, v3,
3243                            /* k1 */1, z, type);
3244}
3245
3246static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece,
3247                               TCGReg v0, TCGReg c1, TCGReg c2,
3248                               TCGReg v3, TCGReg v4, TCGCond cond)
3249{
3250    bool inv;
3251
3252    if (vece <= MO_16 ? have_avx512bw : have_avx512vl) {
3253        tcg_out_cmpsel_vec_k1(s, type, vece, v0, c1, c2, v3, v4, cond);
3254        return;
3255    }
3256
3257    inv = tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond);
3258
3259    /*
3260     * Since XMM0 is 16, the only way we get 0 into V3
3261     * is via the constant zero constraint.
3262     */
3263    if (!v3) {
3264        if (inv) {
3265            tcg_out_vex_modrm_type(s, OPC_PAND, v0, TCG_TMP_VEC, v4, type);
3266        } else {
3267            tcg_out_vex_modrm_type(s, OPC_PANDN, v0, TCG_TMP_VEC, v4, type);
3268        }
3269    } else {
3270        if (inv) {
3271            TCGReg swap = v3;
3272            v3 = v4;
3273            v4 = swap;
3274        }
3275        tcg_out_vex_modrm_type(s, OPC_VPBLENDVB, v0, v4, v3, type);
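        /* The imm8 of VPBLENDVB selects the mask register in bits 7:4. */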
3276        tcg_out8(s, (TCG_TMP_VEC - TCG_REG_XMM0) << 4);
3277    }
3278}
3279
3280static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
3281                           unsigned vecl, unsigned vece,
3282                           const TCGArg args[TCG_MAX_OP_ARGS],
3283                           const int const_args[TCG_MAX_OP_ARGS])
3284{
3285    static int const add_insn[4] = {
3286        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
3287    };
3288    static int const ssadd_insn[4] = {
3289        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
3290    };
3291    static int const usadd_insn[4] = {
3292        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
3293    };
3294    static int const sub_insn[4] = {
3295        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
3296    };
3297    static int const sssub_insn[4] = {
3298        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
3299    };
3300    static int const ussub_insn[4] = {
3301        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
3302    };
3303    static int const mul_insn[4] = {
3304        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
3305    };
3306    static int const shift_imm_insn[4] = {
3307        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
3308    };
3309    static int const punpckl_insn[4] = {
3310        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
3311    };
3312    static int const punpckh_insn[4] = {
3313        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
3314    };
3315    static int const packss_insn[4] = {
3316        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
3317    };
3318    static int const packus_insn[4] = {
3319        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
3320    };
3321    static int const smin_insn[4] = {
3322        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
3323    };
3324    static int const smax_insn[4] = {
3325        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
3326    };
3327    static int const rotlv_insn[4] = {
3328        OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
3329    };
3330    static int const rotrv_insn[4] = {
3331        OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
3332    };
3333    static int const shlv_insn[4] = {
3334        OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
3335    };
3336    static int const shrv_insn[4] = {
3337        OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
3338    };
3339    static int const sarv_insn[4] = {
3340        OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
3341    };
3342    static int const shls_insn[4] = {
3343        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
3344    };
3345    static int const shrs_insn[4] = {
3346        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
3347    };
3348    static int const sars_insn[4] = {
3349        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
3350    };
3351    static int const vpshldi_insn[4] = {
3352        OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
3353    };
3354    static int const vpshldv_insn[4] = {
3355        OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
3356    };
3357    static int const vpshrdv_insn[4] = {
3358        OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
3359    };
3360    static int const abs_insn[4] = {
3361        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
3362    };
3363
3364    TCGType type = vecl + TCG_TYPE_V64;
3365    int insn, sub;
3366    TCGArg a0, a1, a2, a3;
3367
3368    a0 = args[0];
3369    a1 = args[1];
3370    a2 = args[2];
3371
3372    switch (opc) {
3373    case INDEX_op_add_vec:
3374        insn = add_insn[vece];
3375        goto gen_simd;
3376    case INDEX_op_ssadd_vec:
3377        insn = ssadd_insn[vece];
3378        goto gen_simd;
3379    case INDEX_op_usadd_vec:
3380        insn = usadd_insn[vece];
3381        goto gen_simd;
3382    case INDEX_op_sub_vec:
3383        insn = sub_insn[vece];
3384        goto gen_simd;
3385    case INDEX_op_sssub_vec:
3386        insn = sssub_insn[vece];
3387        goto gen_simd;
3388    case INDEX_op_ussub_vec:
3389        insn = ussub_insn[vece];
3390        goto gen_simd;
3391    case INDEX_op_mul_vec:
3392        insn = mul_insn[vece];
3393        goto gen_simd;
3394    case INDEX_op_and_vec:
3395        insn = OPC_PAND;
3396        goto gen_simd;
3397    case INDEX_op_or_vec:
3398        insn = OPC_POR;
3399        goto gen_simd;
3400    case INDEX_op_xor_vec:
3401        insn = OPC_PXOR;
3402        goto gen_simd;
3403    case INDEX_op_smin_vec:
3404        insn = smin_insn[vece];
3405        goto gen_simd;
3406    case INDEX_op_umin_vec:
3407        insn = umin_insn[vece];
3408        goto gen_simd;
3409    case INDEX_op_smax_vec:
3410        insn = smax_insn[vece];
3411        goto gen_simd;
3412    case INDEX_op_umax_vec:
3413        insn = umax_insn[vece];
3414        goto gen_simd;
3415    case INDEX_op_shlv_vec:
3416        insn = shlv_insn[vece];
3417        goto gen_simd;
3418    case INDEX_op_shrv_vec:
3419        insn = shrv_insn[vece];
3420        goto gen_simd;
3421    case INDEX_op_sarv_vec:
3422        insn = sarv_insn[vece];
3423        goto gen_simd;
3424    case INDEX_op_rotlv_vec:
3425        insn = rotlv_insn[vece];
3426        goto gen_simd;
3427    case INDEX_op_rotrv_vec:
3428        insn = rotrv_insn[vece];
3429        goto gen_simd;
3430    case INDEX_op_shls_vec:
3431        insn = shls_insn[vece];
3432        goto gen_simd;
3433    case INDEX_op_shrs_vec:
3434        insn = shrs_insn[vece];
3435        goto gen_simd;
3436    case INDEX_op_sars_vec:
3437        insn = sars_insn[vece];
3438        goto gen_simd;
3439    case INDEX_op_x86_punpckl_vec:
3440        insn = punpckl_insn[vece];
3441        goto gen_simd;
3442    case INDEX_op_x86_punpckh_vec:
3443        insn = punpckh_insn[vece];
3444        goto gen_simd;
3445    case INDEX_op_x86_packss_vec:
3446        insn = packss_insn[vece];
3447        goto gen_simd;
3448    case INDEX_op_x86_packus_vec:
3449        insn = packus_insn[vece];
3450        goto gen_simd;
3451    case INDEX_op_x86_vpshldv_vec:
3452        insn = vpshldv_insn[vece];
3453        a1 = a2;
3454        a2 = args[3];
3455        goto gen_simd;
3456    case INDEX_op_x86_vpshrdv_vec:
3457        insn = vpshrdv_insn[vece];
3458        a1 = a2;
3459        a2 = args[3];
3460        goto gen_simd;
3461#if TCG_TARGET_REG_BITS == 32
3462    case INDEX_op_dup2_vec:
3463        /* First merge the two 32-bit inputs to a single 64-bit element. */
3464        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3465        /* Then replicate the 64-bit elements across the rest of the vector. */
3466        if (type != TCG_TYPE_V64) {
3467            tcg_out_dup_vec(s, type, MO_64, a0, a0);
3468        }
3469        break;
3470#endif
3471    case INDEX_op_abs_vec:
3472        insn = abs_insn[vece];
3473        a2 = a1;
3474        a1 = 0;
3475        goto gen_simd;
3476    gen_simd:
3477        tcg_debug_assert(insn != OPC_UD2);
3478        tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type);
3479        break;
3480
3481    case INDEX_op_cmp_vec:
3482        tcg_out_cmp_vec(s, type, vece, a0, a1, a2, args[3]);
3483        break;
3484
3485    case INDEX_op_cmpsel_vec:
3486        tcg_out_cmpsel_vec(s, type, vece, a0, a1, a2,
3487                           args[3], args[4], args[5]);
3488        break;
3489
3490    case INDEX_op_andc_vec:
3491        insn = OPC_PANDN;
3492        tcg_out_vex_modrm_type(s, insn, a0, a2, a1, type);
3493        break;
3494
3495    case INDEX_op_shli_vec:
3496        insn = shift_imm_insn[vece];
3497        sub = 6;
3498        goto gen_shift;
3499    case INDEX_op_shri_vec:
3500        insn = shift_imm_insn[vece];
3501        sub = 2;
3502        goto gen_shift;
3503    case INDEX_op_sari_vec:
3504        if (vece == MO_64) {
3505            insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3506        } else {
3507            insn = shift_imm_insn[vece];
3508        }
3509        sub = 4;
3510        goto gen_shift;
3511    case INDEX_op_rotli_vec:
3512        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
3513        if (vece == MO_64) {
3514            insn |= P_VEXW;
3515        }
3516        sub = 1;
3517        goto gen_shift;
3518    gen_shift:
3519        tcg_debug_assert(vece != MO_8);
3520        tcg_out_vex_modrm_type(s, insn, sub, a0, a1, type);
3521        tcg_out8(s, a2);
3522        break;
3523
3524    case INDEX_op_ld_vec:
3525        tcg_out_ld(s, type, a0, a1, a2);
3526        break;
3527    case INDEX_op_st_vec:
3528        tcg_out_st(s, type, a0, a1, a2);
3529        break;
3530    case INDEX_op_dupm_vec:
3531        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3532        break;
3533
3534    case INDEX_op_x86_shufps_vec:
3535        insn = OPC_SHUFPS;
3536        sub = args[3];
3537        goto gen_simd_imm8;
3538    case INDEX_op_x86_blend_vec:
3539        if (vece == MO_16) {
3540            insn = OPC_PBLENDW;
3541        } else if (vece == MO_32) {
3542            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3543        } else {
3544            g_assert_not_reached();
3545        }
3546        sub = args[3];
3547        goto gen_simd_imm8;
3548    case INDEX_op_x86_vperm2i128_vec:
3549        insn = OPC_VPERM2I128;
3550        sub = args[3];
3551        goto gen_simd_imm8;
3552    case INDEX_op_x86_vpshldi_vec:
3553        insn = vpshldi_insn[vece];
3554        sub = args[3];
3555        goto gen_simd_imm8;
3556
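    /*
     * VPTERNLOGQ computes an arbitrary three-input boolean function;
     * the imm8 emitted below is its truth table (see per-case comments).
     */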
3557    case INDEX_op_not_vec:
3558        insn = OPC_VPTERNLOGQ;
3559        a2 = a1;
3560        sub = 0x33; /* !B */
3561        goto gen_simd_imm8;
3562    case INDEX_op_nor_vec:
3563        insn = OPC_VPTERNLOGQ;
3564        sub = 0x11; /* norCB */
3565        goto gen_simd_imm8;
3566    case INDEX_op_nand_vec:
3567        insn = OPC_VPTERNLOGQ;
3568        sub = 0x77; /* nandCB */
3569        goto gen_simd_imm8;
3570    case INDEX_op_eqv_vec:
3571        insn = OPC_VPTERNLOGQ;
3572        sub = 0x99; /* xnorCB */
3573        goto gen_simd_imm8;
3574    case INDEX_op_orc_vec:
3575        insn = OPC_VPTERNLOGQ;
3576        sub = 0xdd; /* orB!C */
3577        goto gen_simd_imm8;
3578
3579    case INDEX_op_bitsel_vec:
3580        insn = OPC_VPTERNLOGQ;
3581        a3 = args[3];
3582        if (a0 == a1) {
3583            a1 = a2;
3584            a2 = a3;
3585            sub = 0xca; /* A?B:C */
3586        } else if (a0 == a2) {
3587            a2 = a3;
3588            sub = 0xe2; /* B?A:C */
3589        } else {
3590            tcg_out_mov(s, type, a0, a3);
3591            sub = 0xb8; /* B?C:A */
3592        }
3593        goto gen_simd_imm8;
3594
3595    gen_simd_imm8:
3596        tcg_debug_assert(insn != OPC_UD2);
3597        tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type);
3598        tcg_out8(s, sub);
3599        break;
3600
3601    case INDEX_op_x86_psrldq_vec:
3602        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3603        tcg_out8(s, a2);
3604        break;
3605
3606    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3607    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3608    default:
3609        g_assert_not_reached();
3610    }
3611}
3612
3613static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3614{
3615    switch (op) {
3616    case INDEX_op_goto_ptr:
3617        return C_O0_I1(r);
3618
3619    case INDEX_op_ld8u_i32:
3620    case INDEX_op_ld8u_i64:
3621    case INDEX_op_ld8s_i32:
3622    case INDEX_op_ld8s_i64:
3623    case INDEX_op_ld16u_i32:
3624    case INDEX_op_ld16u_i64:
3625    case INDEX_op_ld16s_i32:
3626    case INDEX_op_ld16s_i64:
3627    case INDEX_op_ld_i32:
3628    case INDEX_op_ld32u_i64:
3629    case INDEX_op_ld32s_i64:
3630    case INDEX_op_ld_i64:
3631        return C_O1_I1(r, r);
3632
3633    case INDEX_op_st8_i32:
3634    case INDEX_op_st8_i64:
3635        return C_O0_I2(qi, r);
3636
3637    case INDEX_op_st16_i32:
3638    case INDEX_op_st16_i64:
3639    case INDEX_op_st_i32:
3640    case INDEX_op_st32_i64:
3641        return C_O0_I2(ri, r);
3642
3643    case INDEX_op_st_i64:
3644        return C_O0_I2(re, r);
3645
3646    case INDEX_op_add_i32:
3647    case INDEX_op_add_i64:
3648        return C_O1_I2(r, r, re);
3649
3650    case INDEX_op_sub_i32:
3651    case INDEX_op_sub_i64:
3652    case INDEX_op_mul_i32:
3653    case INDEX_op_mul_i64:
3654    case INDEX_op_or_i32:
3655    case INDEX_op_or_i64:
3656    case INDEX_op_xor_i32:
3657    case INDEX_op_xor_i64:
3658        return C_O1_I2(r, 0, re);
3659
3660    case INDEX_op_and_i32:
3661    case INDEX_op_and_i64:
3662        return C_O1_I2(r, 0, reZ);
3663
3664    case INDEX_op_andc_i32:
3665    case INDEX_op_andc_i64:
3666        return C_O1_I2(r, r, rI);
3667
3668    case INDEX_op_shl_i32:
3669    case INDEX_op_shl_i64:
3670    case INDEX_op_shr_i32:
3671    case INDEX_op_shr_i64:
3672    case INDEX_op_sar_i32:
3673    case INDEX_op_sar_i64:
3674        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3675
3676    case INDEX_op_rotl_i32:
3677    case INDEX_op_rotl_i64:
3678    case INDEX_op_rotr_i32:
3679    case INDEX_op_rotr_i64:
3680        return C_O1_I2(r, 0, ci);
3681
3682    case INDEX_op_brcond_i32:
3683    case INDEX_op_brcond_i64:
3684        return C_O0_I2(r, reT);
3685
3686    case INDEX_op_bswap16_i32:
3687    case INDEX_op_bswap16_i64:
3688    case INDEX_op_bswap32_i32:
3689    case INDEX_op_bswap32_i64:
3690    case INDEX_op_bswap64_i64:
3691    case INDEX_op_neg_i32:
3692    case INDEX_op_neg_i64:
3693    case INDEX_op_not_i32:
3694    case INDEX_op_not_i64:
3695    case INDEX_op_extrh_i64_i32:
3696        return C_O1_I1(r, 0);
3697
3698    case INDEX_op_ext8s_i32:
3699    case INDEX_op_ext8s_i64:
3700    case INDEX_op_ext8u_i32:
3701    case INDEX_op_ext8u_i64:
3702        return C_O1_I1(r, q);
3703
3704    case INDEX_op_ext16s_i32:
3705    case INDEX_op_ext16s_i64:
3706    case INDEX_op_ext16u_i32:
3707    case INDEX_op_ext16u_i64:
3708    case INDEX_op_ext32s_i64:
3709    case INDEX_op_ext32u_i64:
3710    case INDEX_op_ext_i32_i64:
3711    case INDEX_op_extu_i32_i64:
3712    case INDEX_op_extrl_i64_i32:
3713    case INDEX_op_extract_i32:
3714    case INDEX_op_extract_i64:
3715    case INDEX_op_sextract_i32:
3716    case INDEX_op_ctpop_i32:
3717    case INDEX_op_ctpop_i64:
3718        return C_O1_I1(r, r);
3719
3720    case INDEX_op_extract2_i32:
3721    case INDEX_op_extract2_i64:
3722        return C_O1_I2(r, 0, r);
3723
3724    case INDEX_op_deposit_i32:
3725    case INDEX_op_deposit_i64:
3726        return C_O1_I2(q, 0, qi);
3727
3728    case INDEX_op_setcond_i32:
3729    case INDEX_op_setcond_i64:
3730    case INDEX_op_negsetcond_i32:
3731    case INDEX_op_negsetcond_i64:
3732        return C_O1_I2(q, r, reT);
3733
3734    case INDEX_op_movcond_i32:
3735    case INDEX_op_movcond_i64:
3736        return C_O1_I4(r, r, reT, r, 0);
3737
3738    case INDEX_op_div2_i32:
3739    case INDEX_op_div2_i64:
3740    case INDEX_op_divu2_i32:
3741    case INDEX_op_divu2_i64:
3742        return C_O2_I3(a, d, 0, 1, r);
3743
3744    case INDEX_op_mulu2_i32:
3745    case INDEX_op_mulu2_i64:
3746    case INDEX_op_muls2_i32:
3747    case INDEX_op_muls2_i64:
3748        return C_O2_I2(a, d, a, r);
3749
3750    case INDEX_op_add2_i32:
3751    case INDEX_op_add2_i64:
3752    case INDEX_op_sub2_i32:
3753    case INDEX_op_sub2_i64:
3754        return C_N1_O1_I4(r, r, 0, 1, re, re);
3755
3756    case INDEX_op_ctz_i32:
3757    case INDEX_op_ctz_i64:
3758        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3759
3760    case INDEX_op_clz_i32:
3761    case INDEX_op_clz_i64:
3762        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3763
3764    case INDEX_op_qemu_ld_a32_i32:
3765        return C_O1_I1(r, L);
3766    case INDEX_op_qemu_ld_a64_i32:
3767        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3768
3769    case INDEX_op_qemu_st_a32_i32:
3770        return C_O0_I2(L, L);
3771    case INDEX_op_qemu_st_a64_i32:
3772        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3773    case INDEX_op_qemu_st8_a32_i32:
3774        return C_O0_I2(s, L);
3775    case INDEX_op_qemu_st8_a64_i32:
3776        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3777
3778    case INDEX_op_qemu_ld_a32_i64:
3779        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3780    case INDEX_op_qemu_ld_a64_i64:
3781        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3782
3783    case INDEX_op_qemu_st_a32_i64:
3784        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3785    case INDEX_op_qemu_st_a64_i64:
3786        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3787
3788    case INDEX_op_qemu_ld_a32_i128:
3789    case INDEX_op_qemu_ld_a64_i128:
3790        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3791        return C_O2_I1(r, r, L);
3792    case INDEX_op_qemu_st_a32_i128:
3793    case INDEX_op_qemu_st_a64_i128:
3794        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3795        return C_O0_I3(L, L, L);
3796
3797    case INDEX_op_brcond2_i32:
3798        return C_O0_I4(r, r, ri, ri);
3799
3800    case INDEX_op_setcond2_i32:
3801        return C_O1_I4(r, r, r, ri, ri);
3802
3803    case INDEX_op_ld_vec:
3804    case INDEX_op_dupm_vec:
3805        return C_O1_I1(x, r);
3806
3807    case INDEX_op_st_vec:
3808        return C_O0_I2(x, r);
3809
3810    case INDEX_op_add_vec:
3811    case INDEX_op_sub_vec:
3812    case INDEX_op_mul_vec:
3813    case INDEX_op_and_vec:
3814    case INDEX_op_or_vec:
3815    case INDEX_op_xor_vec:
3816    case INDEX_op_andc_vec:
3817    case INDEX_op_orc_vec:
3818    case INDEX_op_nand_vec:
3819    case INDEX_op_nor_vec:
3820    case INDEX_op_eqv_vec:
3821    case INDEX_op_ssadd_vec:
3822    case INDEX_op_usadd_vec:
3823    case INDEX_op_sssub_vec:
3824    case INDEX_op_ussub_vec:
3825    case INDEX_op_smin_vec:
3826    case INDEX_op_umin_vec:
3827    case INDEX_op_smax_vec:
3828    case INDEX_op_umax_vec:
3829    case INDEX_op_shlv_vec:
3830    case INDEX_op_shrv_vec:
3831    case INDEX_op_sarv_vec:
3832    case INDEX_op_rotlv_vec:
3833    case INDEX_op_rotrv_vec:
3834    case INDEX_op_shls_vec:
3835    case INDEX_op_shrs_vec:
3836    case INDEX_op_sars_vec:
3837    case INDEX_op_cmp_vec:
3838    case INDEX_op_x86_shufps_vec:
3839    case INDEX_op_x86_blend_vec:
3840    case INDEX_op_x86_packss_vec:
3841    case INDEX_op_x86_packus_vec:
3842    case INDEX_op_x86_vperm2i128_vec:
3843    case INDEX_op_x86_punpckl_vec:
3844    case INDEX_op_x86_punpckh_vec:
3845    case INDEX_op_x86_vpshldi_vec:
3846#if TCG_TARGET_REG_BITS == 32
3847    case INDEX_op_dup2_vec:
3848#endif
3849        return C_O1_I2(x, x, x);
3850
3851    case INDEX_op_abs_vec:
3852    case INDEX_op_dup_vec:
3853    case INDEX_op_not_vec:
3854    case INDEX_op_shli_vec:
3855    case INDEX_op_shri_vec:
3856    case INDEX_op_sari_vec:
3857    case INDEX_op_rotli_vec:
3858    case INDEX_op_x86_psrldq_vec:
3859        return C_O1_I1(x, x);
3860
3861    case INDEX_op_x86_vpshldv_vec:
3862    case INDEX_op_x86_vpshrdv_vec:
3863        return C_O1_I3(x, 0, x, x);
3864
3865    case INDEX_op_bitsel_vec:
3866        return C_O1_I3(x, x, x, x);
3867    case INDEX_op_cmpsel_vec:
3868        return C_O1_I4(x, x, x, xO, x);
3869
3870    default:
3871        g_assert_not_reached();
3872    }
3873}
3874
3875int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3876{
3877    switch (opc) {
3878    case INDEX_op_add_vec:
3879    case INDEX_op_sub_vec:
3880    case INDEX_op_and_vec:
3881    case INDEX_op_or_vec:
3882    case INDEX_op_xor_vec:
3883    case INDEX_op_andc_vec:
3884    case INDEX_op_orc_vec:
3885    case INDEX_op_nand_vec:
3886    case INDEX_op_nor_vec:
3887    case INDEX_op_eqv_vec:
3888    case INDEX_op_not_vec:
3889    case INDEX_op_bitsel_vec:
3890        return 1;
3891    case INDEX_op_cmp_vec:
3892    case INDEX_op_cmpsel_vec:
3893        return -1;
3894
3895    case INDEX_op_rotli_vec:
3896        return have_avx512vl && vece >= MO_32 ? 1 : -1;
3897
3898    case INDEX_op_shli_vec:
3899    case INDEX_op_shri_vec:
3900        /* We must expand the operation for MO_8.  */
3901        return vece == MO_8 ? -1 : 1;
3902
3903    case INDEX_op_sari_vec:
3904        switch (vece) {
3905        case MO_8:
3906            return -1;
3907        case MO_16:
3908        case MO_32:
3909            return 1;
3910        case MO_64:
3911            if (have_avx512vl) {
3912                return 1;
3913            }
3914            /*
3915             * We can emulate this for MO_64, but it does not pay off
3916             * unless we're producing at least 4 values.
3917             */
3918            return type >= TCG_TYPE_V256 ? -1 : 0;
3919        }
3920        return 0;
3921
3922    case INDEX_op_shls_vec:
3923    case INDEX_op_shrs_vec:
3924        return vece >= MO_16;
3925    case INDEX_op_sars_vec:
3926        switch (vece) {
3927        case MO_16:
3928        case MO_32:
3929            return 1;
3930        case MO_64:
3931            return have_avx512vl;
3932        }
3933        return 0;
3934    case INDEX_op_rotls_vec:
3935        return vece >= MO_16 ? -1 : 0;
3936
3937    case INDEX_op_shlv_vec:
3938    case INDEX_op_shrv_vec:
3939        switch (vece) {
3940        case MO_16:
3941            return have_avx512bw;
3942        case MO_32:
3943        case MO_64:
3944            return have_avx2;
3945        }
3946        return 0;
3947    case INDEX_op_sarv_vec:
3948        switch (vece) {
3949        case MO_16:
3950            return have_avx512bw;
3951        case MO_32:
3952            return have_avx2;
3953        case MO_64:
3954            return have_avx512vl;
3955        }
3956        return 0;
3957    case INDEX_op_rotlv_vec:
3958    case INDEX_op_rotrv_vec:
3959        switch (vece) {
3960        case MO_16:
3961            return have_avx512vbmi2 ? -1 : 0;
3962        case MO_32:
3963        case MO_64:
3964            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3965        }
3966        return 0;
3967
3968    case INDEX_op_mul_vec:
3969        switch (vece) {
3970        case MO_8:
3971            return -1;
3972        case MO_64:
3973            return have_avx512dq;
3974        }
3975        return 1;
3976
3977    case INDEX_op_ssadd_vec:
3978    case INDEX_op_usadd_vec:
3979    case INDEX_op_sssub_vec:
3980    case INDEX_op_ussub_vec:
3981        return vece <= MO_16;
3982    case INDEX_op_smin_vec:
3983    case INDEX_op_smax_vec:
3984    case INDEX_op_umin_vec:
3985    case INDEX_op_umax_vec:
3986    case INDEX_op_abs_vec:
3987        return vece <= MO_32 || have_avx512vl;
3988
3989    default:
3990        return 0;
3991    }
3992}
3993
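/*
 * x86 has no byte-element vector shifts: perform the shift on 16-bit
 * elements and mask away the bits that crossed over from the
 * neighbouring byte.
 */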
3994static void expand_vec_shi(TCGType type, unsigned vece, bool right,
3995                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3996{
3997    uint8_t mask;
3998
3999    tcg_debug_assert(vece == MO_8);
4000    if (right) {
4001        mask = 0xff >> imm;
4002        tcg_gen_shri_vec(MO_16, v0, v1, imm);
4003    } else {
4004        mask = 0xff << imm;
4005        tcg_gen_shli_vec(MO_16, v0, v1, imm);
4006    }
4007    tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask));
4008}
4009
4010static void expand_vec_sari(TCGType type, unsigned vece,
4011                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
4012{
4013    TCGv_vec t1, t2;
4014
4015    switch (vece) {
4016    case MO_8:
4017        /* Unpack to 16-bit, shift, and repack.  */
4018        t1 = tcg_temp_new_vec(type);
4019        t2 = tcg_temp_new_vec(type);
4020        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
4021                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
4022        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
4023                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
4024        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
4025        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
4026        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
4027                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
4028        tcg_temp_free_vec(t1);
4029        tcg_temp_free_vec(t2);
4030        break;
4031
4032    case MO_64:
4033        t1 = tcg_temp_new_vec(type);
4034        if (imm <= 32) {
4035            /*
4036             * We can emulate a small sign extend by performing an arithmetic
4037             * 32-bit shift and overwriting the high half of a 64-bit logical
4038             * shift.  Note that the ISA says shift of 32 is valid, but TCG
4039             * does not, so we have to bound the smaller shift -- we get the
4040             * same result in the high half either way.
4041             */
4042            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
4043            tcg_gen_shri_vec(MO_64, v0, v1, imm);
4044            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
4045                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
4046                      tcgv_vec_arg(t1), 0xaa);
4047        } else {
4048            /* Otherwise we will need to use a compare vs 0 to produce
4049             * the sign-extend, shift and merge.
4050             */
4051            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
4052                            tcg_constant_vec(type, MO_64, 0), v1);
4053            tcg_gen_shri_vec(MO_64, v0, v1, imm);
4054            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
4055            tcg_gen_or_vec(MO_64, v0, v0, t1);
4056        }
4057        tcg_temp_free_vec(t1);
4058        break;
4059
4060    default:
4061        g_assert_not_reached();
4062    }
4063}
4064
4065static void expand_vec_rotli(TCGType type, unsigned vece,
4066                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
4067{
4068    TCGv_vec t;
4069
4070    if (vece != MO_8 && have_avx512vbmi2) {
4071        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
4072                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
4073        return;
4074    }
4075
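    /* Without VPSHLD, compose the rotate from shl + shr + or. */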
4076    t = tcg_temp_new_vec(type);
4077    tcg_gen_shli_vec(vece, t, v1, imm);
4078    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
4079    tcg_gen_or_vec(vece, v0, v0, t);
4080    tcg_temp_free_vec(t);
4081}
4082
4083static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
4084                            TCGv_vec v1, TCGv_vec sh, bool right)
4085{
4086    TCGv_vec t;
4087
4088    if (have_avx512vbmi2) {
4089        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
4090                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
4091                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
4092        return;
4093    }
4094
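    /*
     * Fallback: shift by the complementary count (element width - sh)
     * in the opposite direction and OR the two halves together.
     */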
4095    t = tcg_temp_new_vec(type);
4096    tcg_gen_dupi_vec(vece, t, 8 << vece);
4097    tcg_gen_sub_vec(vece, t, t, sh);
4098    if (right) {
4099        tcg_gen_shlv_vec(vece, t, v1, t);
4100        tcg_gen_shrv_vec(vece, v0, v1, sh);
4101    } else {
4102        tcg_gen_shrv_vec(vece, t, v1, t);
4103        tcg_gen_shlv_vec(vece, v0, v1, sh);
4104    }
4105    tcg_gen_or_vec(vece, v0, v0, t);
4106    tcg_temp_free_vec(t);
4107}
4108
4109static void expand_vec_rotls(TCGType type, unsigned vece,
4110                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
4111{
4112    TCGv_vec t = tcg_temp_new_vec(type);
4113
4114    tcg_debug_assert(vece != MO_8);
4115
4116    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
4117        tcg_gen_dup_i32_vec(vece, t, lsh);
4118        if (vece >= MO_32) {
4119            tcg_gen_rotlv_vec(vece, v0, v1, t);
4120        } else {
4121            expand_vec_rotv(type, vece, v0, v1, t, false);
4122        }
4123    } else {
4124        TCGv_i32 rsh = tcg_temp_new_i32();
4125
4126        tcg_gen_neg_i32(rsh, lsh);
4127        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
4128        tcg_gen_shls_vec(vece, t, v1, lsh);
4129        tcg_gen_shrs_vec(vece, v0, v1, rsh);
4130        tcg_gen_or_vec(vece, v0, v0, t);
4131
4132        tcg_temp_free_i32(rsh);
4133    }
4134
4135    tcg_temp_free_vec(t);
4136}
4137
4138static void expand_vec_mul(TCGType type, unsigned vece,
4139                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
4140{
4141    TCGv_vec t1, t2, t3, t4, zero;
4142
4143    tcg_debug_assert(vece == MO_8);
4144
4145    /*
4146     * Unpack v1 bytes to words, 0 | x.
4147     * Unpack v2 bytes to words, y | 0.
4148     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
4149     * Shift logical right by 8 bits to clear the high 8 bits before
4150     * using an unsigned saturated pack.
4151     *
4152     * The difference between the V64, V128 and V256 cases is merely how
4153     * we distribute the expansion between temporaries.
4154     */
4155    switch (type) {
4156    case TCG_TYPE_V64:
4157        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
4158        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
4159        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
4160        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
4161                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
4162        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
4163                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
4164        tcg_gen_mul_vec(MO_16, t1, t1, t2);
4165        tcg_gen_shri_vec(MO_16, t1, t1, 8);
4166        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
4167                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
4168        tcg_temp_free_vec(t1);
4169        tcg_temp_free_vec(t2);
4170        break;
4171
4172    case TCG_TYPE_V128:
4173    case TCG_TYPE_V256:
4174        t1 = tcg_temp_new_vec(type);
4175        t2 = tcg_temp_new_vec(type);
4176        t3 = tcg_temp_new_vec(type);
4177        t4 = tcg_temp_new_vec(type);
4178        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
4179        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
4180                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
4181        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
4182                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
4183        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
4184                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
4185        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
4186                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
4187        tcg_gen_mul_vec(MO_16, t1, t1, t2);
4188        tcg_gen_mul_vec(MO_16, t3, t3, t4);
4189        tcg_gen_shri_vec(MO_16, t1, t1, 8);
4190        tcg_gen_shri_vec(MO_16, t3, t3, 8);
4191        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
4192                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
4193        tcg_temp_free_vec(t1);
4194        tcg_temp_free_vec(t2);
4195        tcg_temp_free_vec(t3);
4196        tcg_temp_free_vec(t4);
4197        break;
4198
4199    default:
4200        g_assert_not_reached();
4201    }
4202}
4203
4204static TCGCond expand_vec_cond(TCGType type, unsigned vece,
4205                               TCGArg *a1, TCGArg *a2, TCGCond cond)
4206{
4207    /*
4208     * Without AVX512, there are no 64-bit unsigned comparisons.
4209     * We must bias the inputs so that they become signed.
4210     * All other swapping and inversion are handled during code generation.
4211     */
4212    if (vece == MO_64 && !have_avx512dq && is_unsigned_cond(cond)) {
4213        TCGv_vec v1 = temp_tcgv_vec(arg_temp(*a1));
4214        TCGv_vec v2 = temp_tcgv_vec(arg_temp(*a2));
4215        TCGv_vec t1 = tcg_temp_new_vec(type);
4216        TCGv_vec t2 = tcg_temp_new_vec(type);
4217        TCGv_vec t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
4218
4219        tcg_gen_sub_vec(vece, t1, v1, t3);
4220        tcg_gen_sub_vec(vece, t2, v2, t3);
4221        *a1 = tcgv_vec_arg(t1);
4222        *a2 = tcgv_vec_arg(t2);
4223        cond = tcg_signed_cond(cond);
4224    }
4225    return cond;
4226}
4227
4228static void expand_vec_cmp(TCGType type, unsigned vece, TCGArg a0,
4229                           TCGArg a1, TCGArg a2, TCGCond cond)
4230{
4231    cond = expand_vec_cond(type, vece, &a1, &a2, cond);
4232    /* Expand directly; do not recurse.  */
4233    vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
4234}
4235
4236static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGArg a0,
4237                              TCGArg a1, TCGArg a2,
4238                              TCGArg a3, TCGArg a4, TCGCond cond)
4239{
4240    cond = expand_vec_cond(type, vece, &a1, &a2, cond);
4241    /* Expand directly; do not recurse.  */
4242    vec_gen_6(INDEX_op_cmpsel_vec, type, vece, a0, a1, a2, a3, a4, cond);
4243}
4244
4245void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
4246                       TCGArg a0, ...)
4247{
4248    va_list va;
4249    TCGArg a1, a2, a3, a4, a5;
4250    TCGv_vec v0, v1, v2;
4251
4252    va_start(va, a0);
4253    a1 = va_arg(va, TCGArg);
4254    a2 = va_arg(va, TCGArg);
4255    v0 = temp_tcgv_vec(arg_temp(a0));
4256    v1 = temp_tcgv_vec(arg_temp(a1));
4257
4258    switch (opc) {
4259    case INDEX_op_shli_vec:
4260        expand_vec_shi(type, vece, false, v0, v1, a2);
4261        break;
4262    case INDEX_op_shri_vec:
4263        expand_vec_shi(type, vece, true, v0, v1, a2);
4264        break;
4265    case INDEX_op_sari_vec:
4266        expand_vec_sari(type, vece, v0, v1, a2);
4267        break;
4268
4269    case INDEX_op_rotli_vec:
4270        expand_vec_rotli(type, vece, v0, v1, a2);
4271        break;
4272
4273    case INDEX_op_rotls_vec:
4274        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
4275        break;
4276
4277    case INDEX_op_rotlv_vec:
4278        v2 = temp_tcgv_vec(arg_temp(a2));
4279        expand_vec_rotv(type, vece, v0, v1, v2, false);
4280        break;
4281    case INDEX_op_rotrv_vec:
4282        v2 = temp_tcgv_vec(arg_temp(a2));
4283        expand_vec_rotv(type, vece, v0, v1, v2, true);
4284        break;
4285
4286    case INDEX_op_mul_vec:
4287        v2 = temp_tcgv_vec(arg_temp(a2));
4288        expand_vec_mul(type, vece, v0, v1, v2);
4289        break;
4290
4291    case INDEX_op_cmp_vec:
4292        a3 = va_arg(va, TCGArg);
4293        expand_vec_cmp(type, vece, a0, a1, a2, a3);
4294        break;
4295
4296    case INDEX_op_cmpsel_vec:
4297        a3 = va_arg(va, TCGArg);
4298        a4 = va_arg(va, TCGArg);
4299        a5 = va_arg(va, TCGArg);
4300        expand_vec_cmpsel(type, vece, a0, a1, a2, a3, a4, a5);
4301        break;
4302
4303    default:
4304        break;
4305    }
4306
4307    va_end(va);
4308}
4309
4310static const int tcg_target_callee_save_regs[] = {
4311#if TCG_TARGET_REG_BITS == 64
4312    TCG_REG_RBP,
4313    TCG_REG_RBX,
4314#if defined(_WIN64)
4315    TCG_REG_RDI,
4316    TCG_REG_RSI,
4317#endif
4318    TCG_REG_R12,
4319    TCG_REG_R13,
4320    TCG_REG_R14, /* Currently used for the global env. */
4321    TCG_REG_R15,
4322#else
4323    TCG_REG_EBP, /* Currently used for the global env. */
4324    TCG_REG_EBX,
4325    TCG_REG_ESI,
4326    TCG_REG_EDI,
4327#endif
4328};
4329
4330/* Compute frame size via macros, to share between tcg_target_qemu_prologue
4331   and tcg_register_jit.  */
4332
4333#define PUSH_SIZE \
4334    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
4335     * (TCG_TARGET_REG_BITS / 8))
4336
4337#define FRAME_SIZE \
4338    ((PUSH_SIZE \
4339      + TCG_STATIC_CALL_ARGS_SIZE \
4340      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
4341      + TCG_TARGET_STACK_ALIGN - 1) \
4342     & ~(TCG_TARGET_STACK_ALIGN - 1))
4343
4344/* Generate global QEMU prologue and epilogue code */
4345static void tcg_target_qemu_prologue(TCGContext *s)
4346{
4347    int i, stack_addend;
4348
4349    /* TB prologue */
4350
4351    /* Reserve some stack space, also for TCG temps.  */
4352    stack_addend = FRAME_SIZE - PUSH_SIZE;
4353    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4354                  CPU_TEMP_BUF_NLONGS * sizeof(long));
4355
4356    /* Save all callee saved registers.  */
4357    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4358        tcg_out_push(s, tcg_target_callee_save_regs[i]);
4359    }
4360
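    /*
     * For user-only emulation, fold guest_base into either a segment
     * override, a 32-bit displacement, or a reserved index register.
     */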
4361    if (!tcg_use_softmmu && guest_base) {
4362        int seg = setup_guest_base_seg();
4363        if (seg != 0) {
4364            x86_guest_base.seg = seg;
4365        } else if (guest_base == (int32_t)guest_base) {
4366            x86_guest_base.ofs = guest_base;
4367        } else {
4368            assert(TCG_TARGET_REG_BITS == 64);
4369            /* Choose R12 because, as a base, it requires a SIB byte. */
4370            x86_guest_base.index = TCG_REG_R12;
4371            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
4372            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4373        }
4374    }
4375
4376    if (TCG_TARGET_REG_BITS == 32) {
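        /* Load the first stack-passed argument, the env pointer, into
           TCG_AREG0; the offset skips the pushed callee-saved registers
           and the return address. */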
4377        tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4378                   (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4379        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4380        /* jmp *tb.  */
4381        tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4382                             (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4383                             + stack_addend);
4384    } else {
4385        tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4386        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4387        /* jmp *tb.  */
4388        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4389    }
4390
4391    /*
4392     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4393     * and fall through to the rest of the epilogue.
4394     */
4395    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4396    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4397
4398    /* TB epilogue */
4399    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4400
4401    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4402
4403    if (have_avx2) {
4404        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4405    }
4406    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4407        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4408    }
4409    tcg_out_opc(s, OPC_RET, 0, 0, 0);
4410}
4411
4412static void tcg_out_tb_start(TCGContext *s)
4413{
4414    /* nothing to do */
4415}
4416
4417static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4418{
4419    memset(p, 0x90, count);
4420}
4421
4422static void tcg_target_init(TCGContext *s)
4423{
4424    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4425    if (TCG_TARGET_REG_BITS == 64) {
4426        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4427    }
4428    if (have_avx1) {
4429        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4430        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4431    }
4432    if (have_avx2) {
4433        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4434    }
4435
4436    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4437    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4438    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4439    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4440    if (TCG_TARGET_REG_BITS == 64) {
4441#if !defined(_WIN64)
4442        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4443        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4444#endif
4445        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4446        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4447        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4448        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4449    }
4450
4451    s->reserved_regs = 0;
4452    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4453    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4454#ifdef _WIN64
4455    /* These are call saved, and we don't save them, so don't use them. */
4456    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4457    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4458    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4459    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4460    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4461    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4462    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4463    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4464    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4465    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4466#endif
4467}
4468
4469typedef struct {
4470    DebugFrameHeader h;
4471    uint8_t fde_def_cfa[4];
4472    uint8_t fde_reg_ofs[14];
4473} DebugFrame;
4474
4475/* We're expecting a 2 byte uleb128 encoded value.  */
4476QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4477
4478#if !defined(__ELF__)
4479    /* Host machine without ELF. */
4480#elif TCG_TARGET_REG_BITS == 64
4481#define ELF_HOST_MACHINE EM_X86_64
4482static const DebugFrame debug_frame = {
4483    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4484    .h.cie.id = -1,
4485    .h.cie.version = 1,
4486    .h.cie.code_align = 1,
4487    .h.cie.data_align = 0x78,             /* sleb128 -8 */
4488    .h.cie.return_column = 16,
4489
4490    /* Total FDE size does not include the "len" member.  */
4491    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4492
4493    .fde_def_cfa = {
4494        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
4495        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4496        (FRAME_SIZE >> 7)
4497    },
4498    .fde_reg_ofs = {
4499        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
4500        /* The following ordering must match tcg_target_callee_save_regs.  */
4501        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
4502        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
4503        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
4504        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
4505        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
4506        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
4507    }
4508};
4509#else
4510#define ELF_HOST_MACHINE EM_386
4511static const DebugFrame debug_frame = {
4512    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4513    .h.cie.id = -1,
4514    .h.cie.version = 1,
4515    .h.cie.code_align = 1,
4516    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
4517    .h.cie.return_column = 8,
4518
4519    /* Total FDE size does not include the "len" member.  */
4520    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4521
4522    .fde_def_cfa = {
4523        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
4524        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4525        (FRAME_SIZE >> 7)
4526    },
4527    .fde_reg_ofs = {
4528        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4529        /* The following ordering must match tcg_target_callee_save_regs.  */
4530        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4531        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4532        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4533        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
4534    }
4535};
4536#endif
4537
4538#if defined(ELF_HOST_MACHINE)
4539void tcg_register_jit(const void *buf, size_t buf_size)
4540{
4541    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
4542}
4543#endif
4544