xref: /openbmc/qemu/tcg/i386/tcg-target.c.inc (revision b17693453ef4d06f732295c2e70128a7cf20bf30)
1/*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#include "../tcg-ldst.c.inc"
26#include "../tcg-pool.c.inc"
27
28#ifdef CONFIG_DEBUG_TCG
29static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
30#if TCG_TARGET_REG_BITS == 64
31    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
32#else
33    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
34#endif
35    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
36    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
37#if TCG_TARGET_REG_BITS == 64
38    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
39    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
40#endif
41};
42#endif
43
44static const int tcg_target_reg_alloc_order[] = {
45#if TCG_TARGET_REG_BITS == 64
46    TCG_REG_RBP,
47    TCG_REG_RBX,
48    TCG_REG_R12,
49    TCG_REG_R13,
50    TCG_REG_R14,
51    TCG_REG_R15,
52    TCG_REG_R10,
53    TCG_REG_R11,
54    TCG_REG_R9,
55    TCG_REG_R8,
56    TCG_REG_RCX,
57    TCG_REG_RDX,
58    TCG_REG_RSI,
59    TCG_REG_RDI,
60    TCG_REG_RAX,
61#else
62    TCG_REG_EBX,
63    TCG_REG_ESI,
64    TCG_REG_EDI,
65    TCG_REG_EBP,
66    TCG_REG_ECX,
67    TCG_REG_EDX,
68    TCG_REG_EAX,
69#endif
70    TCG_REG_XMM0,
71    TCG_REG_XMM1,
72    TCG_REG_XMM2,
73    TCG_REG_XMM3,
74    TCG_REG_XMM4,
75    TCG_REG_XMM5,
76#ifndef _WIN64
77    /* The Win64 ABI has xmm6-xmm15 as caller-saves, and we do not save
78       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
79    TCG_REG_XMM6,
80    TCG_REG_XMM7,
81#if TCG_TARGET_REG_BITS == 64
82    TCG_REG_XMM8,
83    TCG_REG_XMM9,
84    TCG_REG_XMM10,
85    TCG_REG_XMM11,
86    TCG_REG_XMM12,
87    TCG_REG_XMM13,
88    TCG_REG_XMM14,
89    TCG_REG_XMM15,
90#endif
91#endif
92};
93
94#define TCG_TMP_VEC  TCG_REG_XMM5
95
96static const int tcg_target_call_iarg_regs[] = {
97#if TCG_TARGET_REG_BITS == 64
98#if defined(_WIN64)
99    TCG_REG_RCX,
100    TCG_REG_RDX,
101#else
102    TCG_REG_RDI,
103    TCG_REG_RSI,
104    TCG_REG_RDX,
105    TCG_REG_RCX,
106#endif
107    TCG_REG_R8,
108    TCG_REG_R9,
109#else
110    /* 32 bit mode uses stack based calling convention (GCC default). */
111#endif
112};
113
114static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
115{
116    switch (kind) {
117    case TCG_CALL_RET_NORMAL:
118        tcg_debug_assert(slot >= 0 && slot <= 1);
119        return slot ? TCG_REG_EDX : TCG_REG_EAX;
120#ifdef _WIN64
121    case TCG_CALL_RET_BY_VEC:
122        tcg_debug_assert(slot == 0);
123        return TCG_REG_XMM0;
124#endif
125    default:
126        g_assert_not_reached();
127    }
128}
129
130/* Constants we accept.  */
131#define TCG_CT_CONST_S32 0x100
132#define TCG_CT_CONST_U32 0x200
133#define TCG_CT_CONST_I32 0x400
134#define TCG_CT_CONST_WSZ 0x800
135#define TCG_CT_CONST_TST 0x1000
136
137/* Registers used with L constraint, which are the first argument
138   registers on x86_64, and two random call clobbered registers on
139   i386. */
140#if TCG_TARGET_REG_BITS == 64
141# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
142# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
143#else
144# define TCG_REG_L0 TCG_REG_EAX
145# define TCG_REG_L1 TCG_REG_EDX
146#endif
147
148#if TCG_TARGET_REG_BITS == 64
149# define ALL_GENERAL_REGS      0x0000ffffu
150# define ALL_VECTOR_REGS       0xffff0000u
151# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
152#else
153# define ALL_GENERAL_REGS      0x000000ffu
154# define ALL_VECTOR_REGS       0x00ff0000u
155# define ALL_BYTEL_REGS        0x0000000fu
156#endif
157#define SOFTMMU_RESERVE_REGS \
158    (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)
159
160/* For 64-bit, we always know that CMOV is available.  */
161#if TCG_TARGET_REG_BITS == 64
162# define have_cmov      true
163#else
164# define have_cmov      (cpuinfo & CPUINFO_CMOV)
165#endif
166#define have_bmi2       (cpuinfo & CPUINFO_BMI2)
167#define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)
168
169static const tcg_insn_unit *tb_ret_addr;
170
171static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
172                        intptr_t value, intptr_t addend)
173{
174    value += addend;
175    switch(type) {
176    case R_386_PC32:
177        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
178        if (value != (int32_t)value) {
179            return false;
180        }
181        /* FALLTHRU */
182    case R_386_32:
183        tcg_patch32(code_ptr, value);
184        break;
185    case R_386_PC8:
186        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
187        if (value != (int8_t)value) {
188            return false;
189        }
190        tcg_patch8(code_ptr, value);
191        break;
192    default:
193        g_assert_not_reached();
194    }
195    return true;
196}
197
198/* test if a constant matches the constraint */
199static bool tcg_target_const_match(int64_t val, int ct,
200                                   TCGType type, TCGCond cond, int vece)
201{
202    if (ct & TCG_CT_CONST) {
203        return 1;
204    }
205    if (type == TCG_TYPE_I32) {
206        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 |
207                  TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) {
208            return 1;
209        }
210    } else {
211        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
212            return 1;
213        }
214        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
215            return 1;
216        }
217        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
218            return 1;
219        }
220        /*
221         * This will be used in combination with TCG_CT_CONST_S32,
222         * so "normal" TESTQ is already matched.  Also accept:
223         *    TESTQ -> TESTL   (uint32_t)
224         *    TESTQ -> BT      (is_power_of_2)
225         */
226        if ((ct & TCG_CT_CONST_TST)
227            && is_tst_cond(cond)
228            && (val == (uint32_t)val || is_power_of_2(val))) {
229            return 1;
230        }
231    }
232    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
233        return 1;
234    }
235    return 0;
236}
237
238# define LOWREGMASK(x)	((x) & 7)
239
240#define P_EXT		0x100		/* 0x0f opcode prefix */
241#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
242#define P_DATA16        0x400           /* 0x66 opcode prefix */
243#define P_VEXW          0x1000          /* Set VEX.W = 1 */
244#if TCG_TARGET_REG_BITS == 64
245# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
246# define P_REXB_R       0x2000          /* REG field as byte register */
247# define P_REXB_RM      0x4000          /* R/M field as byte register */
248# define P_GS           0x8000          /* gs segment override */
249#else
250# define P_REXW		0
251# define P_REXB_R	0
252# define P_REXB_RM	0
253# define P_GS           0
254#endif
255#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
256#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
257#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
258#define P_VEXL          0x80000         /* Set VEX.L = 1 */
259#define P_EVEX          0x100000        /* Requires EVEX encoding */
260
261#define OPC_ARITH_EbIb	(0x80)
262#define OPC_ARITH_EvIz	(0x81)
263#define OPC_ARITH_EvIb	(0x83)
264#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
265#define OPC_ANDN        (0xf2 | P_EXT38)
266#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
267#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
268#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
269#define OPC_BSF         (0xbc | P_EXT)
270#define OPC_BSR         (0xbd | P_EXT)
271#define OPC_BSWAP	(0xc8 | P_EXT)
272#define OPC_CALL_Jz	(0xe8)
273#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
274#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
275#define OPC_DEC_r32	(0x48)
276#define OPC_IMUL_GvEv	(0xaf | P_EXT)
277#define OPC_IMUL_GvEvIb	(0x6b)
278#define OPC_IMUL_GvEvIz	(0x69)
279#define OPC_INC_r32	(0x40)
280#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
281#define OPC_JCC_short	(0x70)		/* ... plus condition code */
282#define OPC_JMP_long	(0xe9)
283#define OPC_JMP_short	(0xeb)
284#define OPC_LEA         (0x8d)
285#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
286#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
287#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
288#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
289#define OPC_MOVB_EvIz   (0xc6)
290#define OPC_MOVL_EvIz	(0xc7)
291#define OPC_MOVB_Ib     (0xb0)
292#define OPC_MOVL_Iv     (0xb8)
293#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
294#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
295#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
296#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
297#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
298#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
299#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
300#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
301#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
302#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
303#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
304#define OPC_MOVSBL	(0xbe | P_EXT)
305#define OPC_MOVSWL	(0xbf | P_EXT)
306#define OPC_MOVSLQ	(0x63 | P_REXW)
307#define OPC_MOVZBL	(0xb6 | P_EXT)
308#define OPC_MOVZWL	(0xb7 | P_EXT)
309#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
310#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
311#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
312#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
313#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
314#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
315#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
316#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
317#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
318#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
319#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
320#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
321#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
322#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
323#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
324#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
325#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
326#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
327#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
328#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
329#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
330#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
331#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
332#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
333#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
334#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
335#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
336#define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
337#define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
338#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
339#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
340#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
341#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
342#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
343#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
344#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
345#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
346#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
347#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
348#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
349#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
350#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
351#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
352#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
353#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
354#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
355#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
356#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
357#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
358#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
359#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
360#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
361#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
362#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
363#define OPC_POR         (0xeb | P_EXT | P_DATA16)
364#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
365#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
366#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
367#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
368#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
369#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
370#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
371#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
372#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
373#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
374#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
375#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
376#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
377#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
378#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
379#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
380#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
381#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
382#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
383#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
384#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
385#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
386#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
387#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
388#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
389#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
390#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
391#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
392#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
393#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
394#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
395#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
396#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
397#define OPC_POP_r32	(0x58)
398#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
399#define OPC_PUSH_r32	(0x50)
400#define OPC_PUSH_Iv	(0x68)
401#define OPC_PUSH_Ib	(0x6a)
402#define OPC_RET		(0xc3)
403#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
404#define OPC_SHIFT_1	(0xd1)
405#define OPC_SHIFT_Ib	(0xc1)
406#define OPC_SHIFT_cl	(0xd3)
407#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
408#define OPC_SHUFPS      (0xc6 | P_EXT)
409#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
410#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
411#define OPC_SHRD_Ib     (0xac | P_EXT)
412#define OPC_TESTB	(0x84)
413#define OPC_TESTL	(0x85)
414#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
415#define OPC_UD2         (0x0b | P_EXT)
416#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
417#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
418#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
419#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
420#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
421#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
422#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
423#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
424#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
425#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
426#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
427#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
428#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
429#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
430#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
431#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
432#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
433#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
434#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
435#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
436#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
437#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
438#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
439#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
440#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
441#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
442#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
443#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
444#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
445#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
446#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
447#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
448#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
449#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
450#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
451#define OPC_VZEROUPPER  (0x77 | P_EXT)
452#define OPC_XCHG_ax_r32	(0x90)
453#define OPC_XCHG_EvGv   (0x87)
454
455#define OPC_GRP3_Eb     (0xf6)
456#define OPC_GRP3_Ev     (0xf7)
457#define OPC_GRP5        (0xff)
458#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
459#define OPC_GRPBT       (0xba | P_EXT)
460
461#define OPC_GRPBT_BT    4
462#define OPC_GRPBT_BTS   5
463#define OPC_GRPBT_BTR   6
464#define OPC_GRPBT_BTC   7
465
466/* Group 1 opcode extensions for 0x80-0x83.
467   These are also used as modifiers for OPC_ARITH.  */
468#define ARITH_ADD 0
469#define ARITH_OR  1
470#define ARITH_ADC 2
471#define ARITH_SBB 3
472#define ARITH_AND 4
473#define ARITH_SUB 5
474#define ARITH_XOR 6
475#define ARITH_CMP 7
476
477/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
478#define SHIFT_ROL 0
479#define SHIFT_ROR 1
480#define SHIFT_SHL 4
481#define SHIFT_SHR 5
482#define SHIFT_SAR 7
483
484/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
485#define EXT3_TESTi 0
486#define EXT3_NOT   2
487#define EXT3_NEG   3
488#define EXT3_MUL   4
489#define EXT3_IMUL  5
490#define EXT3_DIV   6
491#define EXT3_IDIV  7
492
493/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
494#define EXT5_INC_Ev	0
495#define EXT5_DEC_Ev	1
496#define EXT5_CALLN_Ev	2
497#define EXT5_JMPN_Ev	4
498
499/* Condition codes to be added to OPC_JCC_{long,short}.  */
500#define JCC_JMP (-1)
501#define JCC_JO  0x0
502#define JCC_JNO 0x1
503#define JCC_JB  0x2
504#define JCC_JAE 0x3
505#define JCC_JE  0x4
506#define JCC_JNE 0x5
507#define JCC_JBE 0x6
508#define JCC_JA  0x7
509#define JCC_JS  0x8
510#define JCC_JNS 0x9
511#define JCC_JP  0xa
512#define JCC_JNP 0xb
513#define JCC_JL  0xc
514#define JCC_JGE 0xd
515#define JCC_JLE 0xe
516#define JCC_JG  0xf
517
518static const uint8_t tcg_cond_to_jcc[] = {
519    [TCG_COND_EQ] = JCC_JE,
520    [TCG_COND_NE] = JCC_JNE,
521    [TCG_COND_LT] = JCC_JL,
522    [TCG_COND_GE] = JCC_JGE,
523    [TCG_COND_LE] = JCC_JLE,
524    [TCG_COND_GT] = JCC_JG,
525    [TCG_COND_LTU] = JCC_JB,
526    [TCG_COND_GEU] = JCC_JAE,
527    [TCG_COND_LEU] = JCC_JBE,
528    [TCG_COND_GTU] = JCC_JA,
529    [TCG_COND_TSTEQ] = JCC_JE,
530    [TCG_COND_TSTNE] = JCC_JNE,
531};
532
533#if TCG_TARGET_REG_BITS == 64
534static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
535{
536    int rex;
537
538    if (opc & P_GS) {
539        tcg_out8(s, 0x65);
540    }
541    if (opc & P_DATA16) {
542        /* We should never be asking for both 16 and 64-bit operation.  */
543        tcg_debug_assert((opc & P_REXW) == 0);
544        tcg_out8(s, 0x66);
545    }
546    if (opc & P_SIMDF3) {
547        tcg_out8(s, 0xf3);
548    } else if (opc & P_SIMDF2) {
549        tcg_out8(s, 0xf2);
550    }
551
552    rex = 0;
553    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
554    rex |= (r & 8) >> 1;                /* REX.R */
555    rex |= (x & 8) >> 2;                /* REX.X */
556    rex |= (rm & 8) >> 3;               /* REX.B */
557
558    /* P_REXB_{R,RM} indicates that the given register is the low byte.
559       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
560       as otherwise the encoding indicates %[abcd]h.  Note that the values
561       that are ORed in merely indicate that the REX byte must be present;
562       those bits get discarded in output.  */
563    rex |= opc & (r >= 4 ? P_REXB_R : 0);
564    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
565
566    if (rex) {
567        tcg_out8(s, (uint8_t)(rex | 0x40));
568    }
569
570    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
571        tcg_out8(s, 0x0f);
572        if (opc & P_EXT38) {
573            tcg_out8(s, 0x38);
574        } else if (opc & P_EXT3A) {
575            tcg_out8(s, 0x3a);
576        }
577    }
578
579    tcg_out8(s, opc);
580}
581#else
582static void tcg_out_opc(TCGContext *s, int opc)
583{
584    if (opc & P_DATA16) {
585        tcg_out8(s, 0x66);
586    }
587    if (opc & P_SIMDF3) {
588        tcg_out8(s, 0xf3);
589    } else if (opc & P_SIMDF2) {
590        tcg_out8(s, 0xf2);
591    }
592    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
593        tcg_out8(s, 0x0f);
594        if (opc & P_EXT38) {
595            tcg_out8(s, 0x38);
596        } else if (opc & P_EXT3A) {
597            tcg_out8(s, 0x3a);
598        }
599    }
600    tcg_out8(s, opc);
601}
602/* Discard the register arguments to tcg_out_opc early, so as not to penalize
603   the 32-bit compilation paths.  This method works with all versions of gcc,
604   whereas relying on optimization may not be able to exclude them.  */
605#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
606#endif
607
608static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
609{
610    tcg_out_opc(s, opc, r, rm, 0);
611    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
612}
613
614static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
615                            int rm, int index)
616{
617    int tmp;
618
619    if (opc & P_GS) {
620        tcg_out8(s, 0x65);
621    }
622    /* Use the two byte form if possible, which cannot encode
623       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
624    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
625        && ((rm | index) & 8) == 0) {
626        /* Two byte VEX prefix.  */
627        tcg_out8(s, 0xc5);
628
629        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
630    } else {
631        /* Three byte VEX prefix.  */
632        tcg_out8(s, 0xc4);
633
634        /* VEX.m-mmmm */
635        if (opc & P_EXT3A) {
636            tmp = 3;
637        } else if (opc & P_EXT38) {
638            tmp = 2;
639        } else if (opc & P_EXT) {
640            tmp = 1;
641        } else {
642            g_assert_not_reached();
643        }
644        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
645        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
646        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
647        tcg_out8(s, tmp);
648
649        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
650    }
651
652    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
653    /* VEX.pp */
654    if (opc & P_DATA16) {
655        tmp |= 1;                          /* 0x66 */
656    } else if (opc & P_SIMDF3) {
657        tmp |= 2;                          /* 0xf3 */
658    } else if (opc & P_SIMDF2) {
659        tmp |= 3;                          /* 0xf2 */
660    }
661    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
662    tcg_out8(s, tmp);
663    tcg_out8(s, opc);
664}
665
666static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
667                             int rm, int index)
668{
669    /* The entire 4-byte evex prefix; with R' and V' set. */
670    uint32_t p = 0x08041062;
671    int mm, pp;
672
673    tcg_debug_assert(have_avx512vl);
674
675    /* EVEX.mm */
676    if (opc & P_EXT3A) {
677        mm = 3;
678    } else if (opc & P_EXT38) {
679        mm = 2;
680    } else if (opc & P_EXT) {
681        mm = 1;
682    } else {
683        g_assert_not_reached();
684    }
685
686    /* EVEX.pp */
687    if (opc & P_DATA16) {
688        pp = 1;                          /* 0x66 */
689    } else if (opc & P_SIMDF3) {
690        pp = 2;                          /* 0xf3 */
691    } else if (opc & P_SIMDF2) {
692        pp = 3;                          /* 0xf2 */
693    } else {
694        pp = 0;
695    }
696
697    p = deposit32(p, 8, 2, mm);
698    p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
699    p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
700    p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
701    p = deposit32(p, 16, 2, pp);
702    p = deposit32(p, 19, 4, ~v);
703    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
704    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
705
706    tcg_out32(s, p);
707    tcg_out8(s, opc);
708}
709
710static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
711{
712    if (opc & P_EVEX) {
713        tcg_out_evex_opc(s, opc, r, v, rm, 0);
714    } else {
715        tcg_out_vex_opc(s, opc, r, v, rm, 0);
716    }
717    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
718}
719
720/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
721   We handle either RM and INDEX missing with a negative value.  In 64-bit
722   mode for absolute addresses, ~RM is the size of the immediate operand
723   that will follow the instruction.  */
724
725static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
726                               int shift, intptr_t offset)
727{
728    int mod, len;
729
730    if (index < 0 && rm < 0) {
731        if (TCG_TARGET_REG_BITS == 64) {
732            /* Try for a rip-relative addressing mode.  This has replaced
733               the 32-bit-mode absolute addressing encoding.  */
734            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
735            intptr_t disp = offset - pc;
736            if (disp == (int32_t)disp) {
737                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
738                tcg_out32(s, disp);
739                return;
740            }
741
742            /* Try for an absolute address encoding.  This requires the
743               use of the MODRM+SIB encoding and is therefore larger than
744               rip-relative addressing.  */
745            if (offset == (int32_t)offset) {
746                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
747                tcg_out8(s, (4 << 3) | 5);
748                tcg_out32(s, offset);
749                return;
750            }
751
752            /* ??? The memory isn't directly addressable.  */
753            g_assert_not_reached();
754        } else {
755            /* Absolute address.  */
756            tcg_out8(s, (r << 3) | 5);
757            tcg_out32(s, offset);
758            return;
759        }
760    }
761
762    /* Find the length of the immediate addend.  Note that the encoding
763       that would be used for (%ebp) indicates absolute addressing.  */
764    if (rm < 0) {
765        mod = 0, len = 4, rm = 5;
766    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
767        mod = 0, len = 0;
768    } else if (offset == (int8_t)offset) {
769        mod = 0x40, len = 1;
770    } else {
771        mod = 0x80, len = 4;
772    }
773
774    /* Use a single byte MODRM format if possible.  Note that the encoding
775       that would be used for %esp is the escape to the two byte form.  */
776    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
777        /* Single byte MODRM format.  */
778        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
779    } else {
780        /* Two byte MODRM+SIB format.  */
781
782        /* Note that the encoding that would place %esp into the index
783           field indicates no index register.  In 64-bit mode, the REX.X
784           bit counts, so %r12 can be used as the index.  */
785        if (index < 0) {
786            index = 4;
787        } else {
788            tcg_debug_assert(index != TCG_REG_ESP);
789        }
790
791        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
792        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
793    }
794
795    if (len == 1) {
796        tcg_out8(s, offset);
797    } else if (len == 4) {
798        tcg_out32(s, offset);
799    }
800}
801
802static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
803                                     int index, int shift, intptr_t offset)
804{
805    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
806    tcg_out_sib_offset(s, r, rm, index, shift, offset);
807}
808
809static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
810                                         int rm, int index, int shift,
811                                         intptr_t offset)
812{
813    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
814    tcg_out_sib_offset(s, r, rm, index, shift, offset);
815}
816
817/* A simplification of the above with no index or shift.  */
818static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
819                                        int rm, intptr_t offset)
820{
821    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
822}
823
824static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
825                                            int v, int rm, intptr_t offset)
826{
827    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
828}
829
830/* Output an opcode with an expected reference to the constant pool.  */
831static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
832{
833    tcg_out_opc(s, opc, r, 0, 0);
834    /* Absolute for 32-bit, pc-relative for 64-bit.  */
835    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
836    tcg_out32(s, 0);
837}
838
839/* Output an opcode with an expected reference to the constant pool.  */
840static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
841{
842    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
843    /* Absolute for 32-bit, pc-relative for 64-bit.  */
844    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
845    tcg_out32(s, 0);
846}
847
848/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
849static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
850{
851    /* Propagate an opcode prefix, such as P_REXW.  */
852    int ext = subop & ~0x7;
853    subop &= 0x7;
854
855    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
856}
857
858static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
859{
860    int rexw = 0;
861
862    if (arg == ret) {
863        return true;
864    }
865    switch (type) {
866    case TCG_TYPE_I64:
867        rexw = P_REXW;
868        /* fallthru */
869    case TCG_TYPE_I32:
870        if (ret < 16) {
871            if (arg < 16) {
872                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
873            } else {
874                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
875            }
876        } else {
877            if (arg < 16) {
878                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
879            } else {
880                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
881            }
882        }
883        break;
884
885    case TCG_TYPE_V64:
886        tcg_debug_assert(ret >= 16 && arg >= 16);
887        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
888        break;
889    case TCG_TYPE_V128:
890        tcg_debug_assert(ret >= 16 && arg >= 16);
891        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
892        break;
893    case TCG_TYPE_V256:
894        tcg_debug_assert(ret >= 16 && arg >= 16);
895        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
896        break;
897
898    default:
899        g_assert_not_reached();
900    }
901    return true;
902}
903
904static const int avx2_dup_insn[4] = {
905    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
906    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
907};
908
909static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
910                            TCGReg r, TCGReg a)
911{
912    if (have_avx2) {
913        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
914        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
915    } else {
916        switch (vece) {
917        case MO_8:
918            /* ??? With zero in a register, use PSHUFB.  */
919            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
920            a = r;
921            /* FALLTHRU */
922        case MO_16:
923            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
924            a = r;
925            /* FALLTHRU */
926        case MO_32:
927            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
928            /* imm8 operand: all output lanes selected from input lane 0.  */
929            tcg_out8(s, 0);
930            break;
931        case MO_64:
932            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
933            break;
934        default:
935            g_assert_not_reached();
936        }
937    }
938    return true;
939}
940
941static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
942                             TCGReg r, TCGReg base, intptr_t offset)
943{
944    if (have_avx2) {
945        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
946        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
947                                 r, 0, base, offset);
948    } else {
949        switch (vece) {
950        case MO_64:
951            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
952            break;
953        case MO_32:
954            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
955            break;
956        case MO_16:
957            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
958            tcg_out8(s, 0); /* imm8 */
959            tcg_out_dup_vec(s, type, vece, r, r);
960            break;
961        case MO_8:
962            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
963            tcg_out8(s, 0); /* imm8 */
964            tcg_out_dup_vec(s, type, vece, r, r);
965            break;
966        default:
967            g_assert_not_reached();
968        }
969    }
970    return true;
971}
972
973static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
974                             TCGReg ret, int64_t arg)
975{
976    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
977
978    if (arg == 0) {
979        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
980        return;
981    }
982    if (arg == -1) {
983        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
984        return;
985    }
986
987    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
988        if (have_avx2) {
989            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
990        } else {
991            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
992        }
993        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
994    } else {
995        if (type == TCG_TYPE_V64) {
996            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
997        } else if (have_avx2) {
998            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
999        } else {
1000            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
1001        }
1002        if (TCG_TARGET_REG_BITS == 64) {
1003            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1004        } else {
1005            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
1006        }
1007    }
1008}
1009
1010static void tcg_out_movi_vec(TCGContext *s, TCGType type,
1011                             TCGReg ret, tcg_target_long arg)
1012{
1013    if (arg == 0) {
1014        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
1015        return;
1016    }
1017    if (arg == -1) {
1018        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
1019        return;
1020    }
1021
1022    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
1023    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1024    if (TCG_TARGET_REG_BITS == 64) {
1025        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1026    } else {
1027        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1028    }
1029}
1030
1031static void tcg_out_movi_int(TCGContext *s, TCGType type,
1032                             TCGReg ret, tcg_target_long arg)
1033{
1034    tcg_target_long diff;
1035
1036    if (arg == 0) {
1037        tgen_arithr(s, ARITH_XOR, ret, ret);
1038        return;
1039    }
1040    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1041        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1042        tcg_out32(s, arg);
1043        return;
1044    }
1045    if (arg == (int32_t)arg) {
1046        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1047        tcg_out32(s, arg);
1048        return;
1049    }
1050
1051    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1052    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1053    if (diff == (int32_t)diff) {
1054        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1055        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1056        tcg_out32(s, diff);
1057        return;
1058    }
1059
1060    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1061    tcg_out64(s, arg);
1062}
1063
1064static void tcg_out_movi(TCGContext *s, TCGType type,
1065                         TCGReg ret, tcg_target_long arg)
1066{
1067    switch (type) {
1068    case TCG_TYPE_I32:
1069#if TCG_TARGET_REG_BITS == 64
1070    case TCG_TYPE_I64:
1071#endif
1072        if (ret < 16) {
1073            tcg_out_movi_int(s, type, ret, arg);
1074        } else {
1075            tcg_out_movi_vec(s, type, ret, arg);
1076        }
1077        break;
1078    default:
1079        g_assert_not_reached();
1080    }
1081}
1082
1083static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1084{
1085    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1086    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1087    return true;
1088}
1089
1090static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1091                             tcg_target_long imm)
1092{
1093    /* This function is only used for passing structs by reference. */
1094    tcg_debug_assert(imm == (int32_t)imm);
1095    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1096}
1097
1098static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1099{
1100    if (val == (int8_t)val) {
1101        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1102        tcg_out8(s, val);
1103    } else if (val == (int32_t)val) {
1104        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1105        tcg_out32(s, val);
1106    } else {
1107        g_assert_not_reached();
1108    }
1109}
1110
1111static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1112{
1113    /* Given the strength of x86 memory ordering, we only need care for
1114       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1115       faster than "mfence", so don't bother with the sse insn.  */
1116    if (a0 & TCG_MO_ST_LD) {
1117        tcg_out8(s, 0xf0);
1118        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1119        tcg_out8(s, 0);
1120    }
1121}
1122
1123static inline void tcg_out_push(TCGContext *s, int reg)
1124{
1125    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1126}
1127
1128static inline void tcg_out_pop(TCGContext *s, int reg)
1129{
1130    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1131}
1132
1133static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1134                       TCGReg arg1, intptr_t arg2)
1135{
1136    switch (type) {
1137    case TCG_TYPE_I32:
1138        if (ret < 16) {
1139            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1140        } else {
1141            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1142        }
1143        break;
1144    case TCG_TYPE_I64:
1145        if (ret < 16) {
1146            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1147            break;
1148        }
1149        /* FALLTHRU */
1150    case TCG_TYPE_V64:
1151        /* There is no instruction that can validate 8-byte alignment.  */
1152        tcg_debug_assert(ret >= 16);
1153        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1154        break;
1155    case TCG_TYPE_V128:
1156        /*
1157         * The gvec infrastructure is asserts that v128 vector loads
1158         * and stores use a 16-byte aligned offset.  Validate that the
1159         * final pointer is aligned by using an insn that will SIGSEGV.
1160         */
1161        tcg_debug_assert(ret >= 16);
1162        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1163        break;
1164    case TCG_TYPE_V256:
1165        /*
1166         * The gvec infrastructure only requires 16-byte alignment,
1167         * so here we must use an unaligned load.
1168         */
1169        tcg_debug_assert(ret >= 16);
1170        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1171                                 ret, 0, arg1, arg2);
1172        break;
1173    default:
1174        g_assert_not_reached();
1175    }
1176}
1177
1178static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1179                       TCGReg arg1, intptr_t arg2)
1180{
1181    switch (type) {
1182    case TCG_TYPE_I32:
1183        if (arg < 16) {
1184            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1185        } else {
1186            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1187        }
1188        break;
1189    case TCG_TYPE_I64:
1190        if (arg < 16) {
1191            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1192            break;
1193        }
1194        /* FALLTHRU */
1195    case TCG_TYPE_V64:
1196        /* There is no instruction that can validate 8-byte alignment.  */
1197        tcg_debug_assert(arg >= 16);
1198        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1199        break;
1200    case TCG_TYPE_V128:
1201        /*
1202         * The gvec infrastructure is asserts that v128 vector loads
1203         * and stores use a 16-byte aligned offset.  Validate that the
1204         * final pointer is aligned by using an insn that will SIGSEGV.
1205         *
1206         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1207         * for _WIN64, which must have SSE2 but may not have AVX.
1208         */
1209        tcg_debug_assert(arg >= 16);
1210        if (have_avx1) {
1211            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1212        } else {
1213            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1214        }
1215        break;
1216    case TCG_TYPE_V256:
1217        /*
1218         * The gvec infrastructure only requires 16-byte alignment,
1219         * so here we must use an unaligned store.
1220         */
1221        tcg_debug_assert(arg >= 16);
1222        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1223                                 arg, 0, arg1, arg2);
1224        break;
1225    default:
1226        g_assert_not_reached();
1227    }
1228}
1229
1230static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1231                        TCGReg base, intptr_t ofs)
1232{
1233    int rexw = 0;
1234    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1235        if (val != (int32_t)val) {
1236            return false;
1237        }
1238        rexw = P_REXW;
1239    } else if (type != TCG_TYPE_I32) {
1240        return false;
1241    }
1242    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1243    tcg_out32(s, val);
1244    return true;
1245}
1246
1247static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1248{
1249    /* Propagate an opcode prefix, such as P_DATA16.  */
1250    int ext = subopc & ~0x7;
1251    subopc &= 0x7;
1252
1253    if (count == 1) {
1254        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1255    } else {
1256        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1257        tcg_out8(s, count);
1258    }
1259}
1260
1261static inline void tcg_out_bswap32(TCGContext *s, int reg)
1262{
1263    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1264}
1265
1266static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1267{
1268    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1269}
1270
1271static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1272{
1273    /* movzbl */
1274    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1275    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1276}
1277
1278static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1279{
1280    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1281    /* movsbl */
1282    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1283    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1284}
1285
1286static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1287{
1288    /* movzwl */
1289    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1290}
1291
1292static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1293{
1294    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1295    /* movsw[lq] */
1296    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1297}
1298
1299static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1300{
1301    /* 32-bit mov zero extends.  */
1302    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1303}
1304
1305static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1306{
1307    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1308    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1309}
1310
1311static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1312{
1313    tcg_out_ext32s(s, dest, src);
1314}
1315
1316static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1317{
1318    if (dest != src) {
1319        tcg_out_ext32u(s, dest, src);
1320    }
1321}
1322
1323static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1324{
1325    tcg_out_ext32u(s, dest, src);
1326}
1327
1328static inline void tcg_out_bswap64(TCGContext *s, int reg)
1329{
1330    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1331}
1332
1333static void tgen_arithi(TCGContext *s, int c, int r0,
1334                        tcg_target_long val, int cf)
1335{
1336    int rexw = 0;
1337
1338    if (TCG_TARGET_REG_BITS == 64) {
1339        rexw = c & -8;
1340        c &= 7;
1341    }
1342
1343    switch (c) {
1344    case ARITH_ADD:
1345    case ARITH_SUB:
1346        if (!cf) {
1347            /*
1348             * ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1349             * partial flags update stalls on Pentium4 and are not recommended
1350             * by current Intel optimization manuals.
1351             */
1352            if (val == 1 || val == -1) {
1353                int is_inc = (c == ARITH_ADD) ^ (val < 0);
1354                if (TCG_TARGET_REG_BITS == 64) {
1355                    /*
1356                     * The single-byte increment encodings are re-tasked
1357                     * as the REX prefixes.  Use the MODRM encoding.
1358                     */
1359                    tcg_out_modrm(s, OPC_GRP5 + rexw,
1360                                  (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1361                } else {
1362                    tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1363                }
1364                return;
1365            }
1366            if (val == 128) {
1367                /*
1368                 * Facilitate using an 8-bit immediate.  Carry is inverted
1369                 * by this transformation, so do it only if cf == 0.
1370                 */
1371                c ^= ARITH_ADD ^ ARITH_SUB;
1372                val = -128;
1373            }
1374        }
1375        break;
1376
1377    case ARITH_AND:
1378        if (TCG_TARGET_REG_BITS == 64) {
1379            if (val == 0xffffffffu) {
1380                tcg_out_ext32u(s, r0, r0);
1381                return;
1382            }
1383            if (val == (uint32_t)val) {
1384                /* AND with no high bits set can use a 32-bit operation.  */
1385                rexw = 0;
1386            }
1387        }
1388        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1389            tcg_out_ext8u(s, r0, r0);
1390            return;
1391        }
1392        if (val == 0xffffu) {
1393            tcg_out_ext16u(s, r0, r0);
1394            return;
1395        }
1396        break;
1397
1398    case ARITH_OR:
1399    case ARITH_XOR:
1400        if (val >= 0x80 && val <= 0xff
1401            && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1402            tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
1403            tcg_out8(s, val);
1404            return;
1405        }
1406        break;
1407    }
1408
1409    if (val == (int8_t)val) {
1410        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1411        tcg_out8(s, val);
1412        return;
1413    }
1414    if (rexw == 0 || val == (int32_t)val) {
1415        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1416        tcg_out32(s, val);
1417        return;
1418    }
1419
1420    g_assert_not_reached();
1421}
1422
1423static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1424{
1425    if (val != 0) {
1426        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1427    }
1428}
1429
1430/* Set SMALL to force a short forward branch.  */
1431static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1432{
1433    int32_t val, val1;
1434
1435    if (l->has_value) {
1436        val = tcg_pcrel_diff(s, l->u.value_ptr);
1437        val1 = val - 2;
1438        if ((int8_t)val1 == val1) {
1439            if (opc == -1) {
1440                tcg_out8(s, OPC_JMP_short);
1441            } else {
1442                tcg_out8(s, OPC_JCC_short + opc);
1443            }
1444            tcg_out8(s, val1);
1445        } else {
1446            tcg_debug_assert(!small);
1447            if (opc == -1) {
1448                tcg_out8(s, OPC_JMP_long);
1449                tcg_out32(s, val - 5);
1450            } else {
1451                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1452                tcg_out32(s, val - 6);
1453            }
1454        }
1455    } else if (small) {
1456        if (opc == -1) {
1457            tcg_out8(s, OPC_JMP_short);
1458        } else {
1459            tcg_out8(s, OPC_JCC_short + opc);
1460        }
1461        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1462        s->code_ptr += 1;
1463    } else {
1464        if (opc == -1) {
1465            tcg_out8(s, OPC_JMP_long);
1466        } else {
1467            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1468        }
1469        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1470        s->code_ptr += 4;
1471    }
1472}
1473
1474static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1,
1475                       TCGArg arg2, int const_arg2, int rexw)
1476{
1477    int jz, js;
1478
1479    if (!is_tst_cond(cond)) {
1480        if (!const_arg2) {
1481            tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1482        } else if (arg2 == 0) {
1483            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1484        } else {
1485            tcg_debug_assert(!rexw || arg2 == (int32_t)arg2);
1486            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1487        }
1488        return tcg_cond_to_jcc[cond];
1489    }
1490
1491    jz = tcg_cond_to_jcc[cond];
1492    js = (cond == TCG_COND_TSTNE ? JCC_JS : JCC_JNS);
1493
1494    if (!const_arg2) {
1495        tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2);
1496        return jz;
1497    }
1498
1499    if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) {
1500        if (arg2 == 0x80) {
1501            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
1502            return js;
1503        }
1504        if (arg2 == 0xff) {
1505            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
1506            return jz;
1507        }
1508        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1);
1509        tcg_out8(s, arg2);
1510        return jz;
1511    }
1512
1513    if ((arg2 & ~0xff00) == 0 && arg1 < 4) {
1514        if (arg2 == 0x8000) {
1515            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
1516            return js;
1517        }
1518        if (arg2 == 0xff00) {
1519            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
1520            return jz;
1521        }
1522        tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4);
1523        tcg_out8(s, arg2 >> 8);
1524        return jz;
1525    }
1526
1527    if (arg2 == 0xffff) {
1528        tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1);
1529        return jz;
1530    }
1531    if (arg2 == 0xffffffffu) {
1532        tcg_out_modrm(s, OPC_TESTL, arg1, arg1);
1533        return jz;
1534    }
1535
1536    if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) {
1537        int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE);
1538        int sh = ctz64(arg2);
1539
1540        rexw = (sh & 32 ? P_REXW : 0);
1541        if ((sh & 31) == 31) {
1542            tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1);
1543            return js;
1544        } else {
1545            tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1);
1546            tcg_out8(s, sh);
1547            return jc;
1548        }
1549    }
1550
1551    if (rexw) {
1552        if (arg2 == (uint32_t)arg2) {
1553            rexw = 0;
1554        } else {
1555            tcg_debug_assert(arg2 == (int32_t)arg2);
1556        }
1557    }
1558    tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1);
1559    tcg_out32(s, arg2);
1560    return jz;
1561}
1562
1563static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
1564                           TCGArg arg1, TCGArg arg2, int const_arg2,
1565                           TCGLabel *label, bool small)
1566{
1567    int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw);
1568    tcg_out_jxx(s, jcc, label, small);
1569}
1570
1571#if TCG_TARGET_REG_BITS == 32
1572static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1573                            const int *const_args, bool small)
1574{
1575    TCGLabel *label_next = gen_new_label();
1576    TCGLabel *label_this = arg_label(args[5]);
1577    TCGCond cond = args[4];
1578
1579    switch (cond) {
1580    case TCG_COND_EQ:
1581    case TCG_COND_TSTEQ:
1582        tcg_out_brcond(s, 0, tcg_invert_cond(cond),
1583                       args[0], args[2], const_args[2], label_next, 1);
1584        tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
1585                       label_this, small);
1586        break;
1587    case TCG_COND_NE:
1588    case TCG_COND_TSTNE:
1589        tcg_out_brcond(s, 0, cond, args[0], args[2], const_args[2],
1590                       label_this, small);
1591        tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
1592                       label_this, small);
1593        break;
1594    case TCG_COND_LT:
1595        tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1596                       label_this, small);
1597        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1598        tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1599                       label_this, small);
1600        break;
1601    case TCG_COND_LE:
1602        tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1603                       label_this, small);
1604        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1605        tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1606                       label_this, small);
1607        break;
1608    case TCG_COND_GT:
1609        tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1610                       label_this, small);
1611        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1612        tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1613                       label_this, small);
1614        break;
1615    case TCG_COND_GE:
1616        tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1617                       label_this, small);
1618        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1619        tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1620                       label_this, small);
1621        break;
1622    case TCG_COND_LTU:
1623        tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1624                       label_this, small);
1625        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1626        tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1627                       label_this, small);
1628        break;
1629    case TCG_COND_LEU:
1630        tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1631                       label_this, small);
1632        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1633        tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1634                       label_this, small);
1635        break;
1636    case TCG_COND_GTU:
1637        tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1638                       label_this, small);
1639        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1640        tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1641                       label_this, small);
1642        break;
1643    case TCG_COND_GEU:
1644        tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1645                       label_this, small);
1646        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1647        tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1648                       label_this, small);
1649        break;
1650    default:
1651        g_assert_not_reached();
1652    }
1653    tcg_out_label(s, label_next);
1654}
1655#endif
1656
1657static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
1658                            TCGArg dest, TCGArg arg1, TCGArg arg2,
1659                            int const_arg2, bool neg)
1660{
1661    bool inv = false;
1662    bool cleared;
1663    int jcc;
1664
1665    switch (cond) {
1666    case TCG_COND_NE:
1667        inv = true;
1668        /* fall through */
1669    case TCG_COND_EQ:
1670        /* If arg2 is 0, convert to LTU/GEU vs 1. */
1671        if (const_arg2 && arg2 == 0) {
1672            arg2 = 1;
1673            goto do_ltu;
1674        }
1675        break;
1676
1677    case TCG_COND_LEU:
1678        inv = true;
1679        /* fall through */
1680    case TCG_COND_GTU:
1681        /* If arg2 is a register, swap for LTU/GEU. */
1682        if (!const_arg2) {
1683            TCGReg t = arg1;
1684            arg1 = arg2;
1685            arg2 = t;
1686            goto do_ltu;
1687        }
1688        break;
1689
1690    case TCG_COND_GEU:
1691        inv = true;
1692        /* fall through */
1693    case TCG_COND_LTU:
1694    do_ltu:
1695        /*
1696         * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
1697         * We can then use NEG or INC to produce the desired result.
1698         * This is always smaller than the SETCC expansion.
1699         */
1700        tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, rexw);
1701
1702        /* X - X - C = -C = (C ? -1 : 0) */
1703        tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
1704        if (inv && neg) {
1705            /* ~(C ? -1 : 0) = (C ? 0 : -1) */
1706            tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1707        } else if (inv) {
1708            /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
1709            tgen_arithi(s, ARITH_ADD, dest, 1, 0);
1710        } else if (!neg) {
1711            /* -(C ? -1 : 0) = (C ? 1 : 0) */
1712            tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
1713        }
1714        return;
1715
1716    case TCG_COND_GE:
1717        inv = true;
1718        /* fall through */
1719    case TCG_COND_LT:
1720        /* If arg2 is 0, extract the sign bit. */
1721        if (const_arg2 && arg2 == 0) {
1722            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1);
1723            if (inv) {
1724                tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1725            }
1726            tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
1727                           dest, rexw ? 63 : 31);
1728            return;
1729        }
1730        break;
1731
1732    default:
1733        break;
1734    }
1735
1736    /*
1737     * If dest does not overlap the inputs, clearing it first is preferred.
1738     * The XOR breaks any false dependency for the low-byte write to dest,
1739     * and is also one byte smaller than MOVZBL.
1740     */
1741    cleared = false;
1742    if (dest != arg1 && (const_arg2 || dest != arg2)) {
1743        tgen_arithr(s, ARITH_XOR, dest, dest);
1744        cleared = true;
1745    }
1746
1747    jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw);
1748    tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest);
1749
1750    if (!cleared) {
1751        tcg_out_ext8u(s, dest, dest);
1752    }
1753    if (neg) {
1754        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
1755    }
1756}
1757
1758#if TCG_TARGET_REG_BITS == 32
1759static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1760                             const int *const_args)
1761{
1762    TCGArg new_args[6];
1763    TCGLabel *label_true, *label_over;
1764
1765    memcpy(new_args, args+1, 5*sizeof(TCGArg));
1766
1767    if (args[0] == args[1] || args[0] == args[2]
1768        || (!const_args[3] && args[0] == args[3])
1769        || (!const_args[4] && args[0] == args[4])) {
1770        /* When the destination overlaps with one of the argument
1771           registers, don't do anything tricky.  */
1772        label_true = gen_new_label();
1773        label_over = gen_new_label();
1774
1775        new_args[5] = label_arg(label_true);
1776        tcg_out_brcond2(s, new_args, const_args+1, 1);
1777
1778        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1779        tcg_out_jxx(s, JCC_JMP, label_over, 1);
1780        tcg_out_label(s, label_true);
1781
1782        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1783        tcg_out_label(s, label_over);
1784    } else {
1785        /* When the destination does not overlap one of the arguments,
1786           clear the destination first, jump if cond false, and emit an
1787           increment in the true case.  This results in smaller code.  */
1788
1789        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1790
1791        label_over = gen_new_label();
1792        new_args[4] = tcg_invert_cond(new_args[4]);
1793        new_args[5] = label_arg(label_over);
1794        tcg_out_brcond2(s, new_args, const_args+1, 1);
1795
1796        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1797        tcg_out_label(s, label_over);
1798    }
1799}
1800#endif
1801
1802static void tcg_out_cmov(TCGContext *s, int jcc, int rexw,
1803                         TCGReg dest, TCGReg v1)
1804{
1805    if (have_cmov) {
1806        tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1);
1807    } else {
1808        TCGLabel *over = gen_new_label();
1809        tcg_out_jxx(s, jcc ^ 1, over, 1);
1810        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1811        tcg_out_label(s, over);
1812    }
1813}
1814
1815static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
1816                            TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
1817                            TCGReg v1)
1818{
1819    int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw);
1820    tcg_out_cmov(s, jcc, rexw, dest, v1);
1821}
1822
1823static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1824                        TCGArg arg2, bool const_a2)
1825{
1826    if (have_bmi1) {
1827        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1828        if (const_a2) {
1829            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1830        } else {
1831            tcg_debug_assert(dest != arg2);
1832            tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
1833        }
1834    } else {
1835        tcg_debug_assert(dest != arg2);
1836        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1837        tcg_out_cmov(s, JCC_JE, rexw, dest, arg2);
1838    }
1839}
1840
1841static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1842                        TCGArg arg2, bool const_a2)
1843{
1844    if (have_lzcnt) {
1845        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1846        if (const_a2) {
1847            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1848        } else {
1849            tcg_debug_assert(dest != arg2);
1850            tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
1851        }
1852    } else {
1853        tcg_debug_assert(!const_a2);
1854        tcg_debug_assert(dest != arg1);
1855        tcg_debug_assert(dest != arg2);
1856
1857        /* Recall that the output of BSR is the index not the count.  */
1858        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1859        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1860
1861        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1862        int jcc = tcg_out_cmp(s, TCG_COND_EQ, arg1, 0, 1, rexw);
1863        tcg_out_cmov(s, jcc, rexw, dest, arg2);
1864    }
1865}
1866
1867static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1868{
1869    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1870
1871    if (disp == (int32_t)disp) {
1872        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1873        tcg_out32(s, disp);
1874    } else {
1875        /* rip-relative addressing into the constant pool.
1876           This is 6 + 8 = 14 bytes, as compared to using an
1877           immediate load 10 + 6 = 16 bytes, plus we may
1878           be able to re-use the pool constant for more calls.  */
1879        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1880        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1881        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1882        tcg_out32(s, 0);
1883    }
1884}
1885
1886static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1887                         const TCGHelperInfo *info)
1888{
1889    tcg_out_branch(s, 1, dest);
1890
1891#ifndef _WIN32
1892    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1893        /*
1894         * The sysv i386 abi for struct return places a reference as the
1895         * first argument of the stack, and pops that argument with the
1896         * return statement.  Since we want to retain the aligned stack
1897         * pointer for the callee, we do not want to actually push that
1898         * argument before the call but rely on the normal store to the
1899         * stack slot.  But we do need to compensate for the pop in order
1900         * to reset our correct stack pointer value.
1901         * Pushing a garbage value back onto the stack is quickest.
1902         */
1903        tcg_out_push(s, TCG_REG_EAX);
1904    }
1905#endif
1906}
1907
1908static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1909{
1910    tcg_out_branch(s, 0, dest);
1911}
1912
1913static void tcg_out_nopn(TCGContext *s, int n)
1914{
1915    int i;
1916    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1917     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1918     * duplicate prefix, and all of the interesting recent cores can
1919     * decode and discard the duplicates in a single cycle.
1920     */
1921    tcg_debug_assert(n >= 1);
1922    for (i = 1; i < n; ++i) {
1923        tcg_out8(s, 0x66);
1924    }
1925    tcg_out8(s, 0x90);
1926}
1927
1928typedef struct {
1929    TCGReg base;
1930    int index;
1931    int ofs;
1932    int seg;
1933    TCGAtomAlign aa;
1934} HostAddress;
1935
1936bool tcg_target_has_memory_bswap(MemOp memop)
1937{
1938    TCGAtomAlign aa;
1939
1940    if (!have_movbe) {
1941        return false;
1942    }
1943    if ((memop & MO_SIZE) < MO_128) {
1944        return true;
1945    }
1946
1947    /*
1948     * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
1949     * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
1950     */
1951    aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
1952    return aa.atom < MO_128;
1953}
1954
1955/*
1956 * Because i686 has no register parameters and because x86_64 has xchg
1957 * to handle addr/data register overlap, we have placed all input arguments
1958 * before we need might need a scratch reg.
1959 *
1960 * Even then, a scratch is only needed for l->raddr.  Rather than expose
1961 * a general-purpose scratch when we don't actually know it's available,
1962 * use the ra_gen hook to load into RAX if needed.
1963 */
1964#if TCG_TARGET_REG_BITS == 64
1965static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1966{
1967    if (arg < 0) {
1968        arg = TCG_REG_RAX;
1969    }
1970    tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1971    return arg;
1972}
1973static const TCGLdstHelperParam ldst_helper_param = {
1974    .ra_gen = ldst_ra_gen
1975};
1976#else
1977static const TCGLdstHelperParam ldst_helper_param = { };
1978#endif
1979
1980static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
1981                                TCGReg l, TCGReg h, TCGReg v)
1982{
1983    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1984
1985    /* vpmov{d,q} %v, %l */
1986    tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
1987    /* vpextr{d,q} $1, %v, %h */
1988    tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
1989    tcg_out8(s, 1);
1990}
1991
1992static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
1993                                TCGReg v, TCGReg l, TCGReg h)
1994{
1995    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1996
1997    /* vmov{d,q} %l, %v */
1998    tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
1999    /* vpinsr{d,q} $1, %h, %v, %v */
2000    tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
2001    tcg_out8(s, 1);
2002}
2003
2004/*
2005 * Generate code for the slow path for a load at the end of block
2006 */
2007static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
2008{
2009    MemOp opc = get_memop(l->oi);
2010    tcg_insn_unit **label_ptr = &l->label_ptr[0];
2011
2012    /* resolve label address */
2013    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
2014    if (label_ptr[1]) {
2015        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
2016    }
2017
2018    tcg_out_ld_helper_args(s, l, &ldst_helper_param);
2019    tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
2020    tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
2021
2022    tcg_out_jmp(s, l->raddr);
2023    return true;
2024}
2025
2026/*
2027 * Generate code for the slow path for a store at the end of block
2028 */
2029static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
2030{
2031    MemOp opc = get_memop(l->oi);
2032    tcg_insn_unit **label_ptr = &l->label_ptr[0];
2033
2034    /* resolve label address */
2035    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
2036    if (label_ptr[1]) {
2037        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
2038    }
2039
2040    tcg_out_st_helper_args(s, l, &ldst_helper_param);
2041    tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
2042
2043    tcg_out_jmp(s, l->raddr);
2044    return true;
2045}
2046
2047#ifdef CONFIG_USER_ONLY
2048static HostAddress x86_guest_base = {
2049    .index = -1
2050};
2051
2052#if defined(__x86_64__) && defined(__linux__)
2053# include <asm/prctl.h>
2054# include <sys/prctl.h>
2055int arch_prctl(int code, unsigned long addr);
2056static inline int setup_guest_base_seg(void)
2057{
2058    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
2059        return P_GS;
2060    }
2061    return 0;
2062}
2063#define setup_guest_base_seg  setup_guest_base_seg
2064#elif defined(__x86_64__) && \
2065      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
2066# include <machine/sysarch.h>
2067static inline int setup_guest_base_seg(void)
2068{
2069    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
2070        return P_GS;
2071    }
2072    return 0;
2073}
2074#define setup_guest_base_seg  setup_guest_base_seg
2075#endif
2076#else
2077# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; }))
2078#endif /* CONFIG_USER_ONLY */
2079#ifndef setup_guest_base_seg
2080# define setup_guest_base_seg()  0
2081#endif
2082
2083#define MIN_TLB_MASK_TABLE_OFS  INT_MIN
2084
2085/*
2086 * For softmmu, perform the TLB load and compare.
2087 * For useronly, perform any required alignment tests.
2088 * In both cases, return a TCGLabelQemuLdst structure if the slow path
2089 * is required and fill in @h with the host address for the fast path.
2090 */
2091static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
2092                                           TCGReg addrlo, TCGReg addrhi,
2093                                           MemOpIdx oi, bool is_ld)
2094{
2095    TCGLabelQemuLdst *ldst = NULL;
2096    MemOp opc = get_memop(oi);
2097    MemOp s_bits = opc & MO_SIZE;
2098    unsigned a_mask;
2099
2100    if (tcg_use_softmmu) {
2101        h->index = TCG_REG_L0;
2102        h->ofs = 0;
2103        h->seg = 0;
2104    } else {
2105        *h = x86_guest_base;
2106    }
2107    h->base = addrlo;
2108    h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
2109    a_mask = (1 << h->aa.align) - 1;
2110
2111    if (tcg_use_softmmu) {
2112        int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
2113                            : offsetof(CPUTLBEntry, addr_write);
2114        TCGType ttype = TCG_TYPE_I32;
2115        TCGType tlbtype = TCG_TYPE_I32;
2116        int trexw = 0, hrexw = 0, tlbrexw = 0;
2117        unsigned mem_index = get_mmuidx(oi);
2118        unsigned s_mask = (1 << s_bits) - 1;
2119        int fast_ofs = tlb_mask_table_ofs(s, mem_index);
2120        int tlb_mask;
2121
2122        ldst = new_ldst_label(s);
2123        ldst->is_ld = is_ld;
2124        ldst->oi = oi;
2125        ldst->addrlo_reg = addrlo;
2126        ldst->addrhi_reg = addrhi;
2127
2128        if (TCG_TARGET_REG_BITS == 64) {
2129            ttype = s->addr_type;
2130            trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
2131            if (TCG_TYPE_PTR == TCG_TYPE_I64) {
2132                hrexw = P_REXW;
2133                if (s->page_bits + s->tlb_dyn_max_bits > 32) {
2134                    tlbtype = TCG_TYPE_I64;
2135                    tlbrexw = P_REXW;
2136                }
2137            }
2138        }
2139
2140        tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
2141        tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
2142                       s->page_bits - CPU_TLB_ENTRY_BITS);
2143
2144        tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
2145                             fast_ofs + offsetof(CPUTLBDescFast, mask));
2146
2147        tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
2148                             fast_ofs + offsetof(CPUTLBDescFast, table));
2149
2150        /*
2151         * If the required alignment is at least as large as the access,
2152         * simply copy the address and mask.  For lesser alignments,
2153         * check that we don't cross pages for the complete access.
2154         */
2155        if (a_mask >= s_mask) {
2156            tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
2157        } else {
2158            tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
2159                                 addrlo, s_mask - a_mask);
2160        }
2161        tlb_mask = s->page_mask | a_mask;
2162        tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
2163
2164        /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
2165        tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
2166                             TCG_REG_L1, TCG_REG_L0, cmp_ofs);
2167
2168        /* jne slow_path */
2169        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2170        ldst->label_ptr[0] = s->code_ptr;
2171        s->code_ptr += 4;
2172
2173        if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
2174            /* cmp 4(TCG_REG_L0), addrhi */
2175            tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi,
2176                                 TCG_REG_L0, cmp_ofs + 4);
2177
2178            /* jne slow_path */
2179            tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2180            ldst->label_ptr[1] = s->code_ptr;
2181            s->code_ptr += 4;
2182        }
2183
2184        /* TLB Hit.  */
2185        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
2186                   offsetof(CPUTLBEntry, addend));
2187    } else if (a_mask) {
2188        int jcc;
2189
2190        ldst = new_ldst_label(s);
2191        ldst->is_ld = is_ld;
2192        ldst->oi = oi;
2193        ldst->addrlo_reg = addrlo;
2194        ldst->addrhi_reg = addrhi;
2195
2196        /* jne slow_path */
2197        jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addrlo, a_mask, true, false);
2198        tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0);
2199        ldst->label_ptr[0] = s->code_ptr;
2200        s->code_ptr += 4;
2201    }
2202
2203    return ldst;
2204}
2205
2206static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2207                                   HostAddress h, TCGType type, MemOp memop)
2208{
2209    bool use_movbe = false;
2210    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
2211    int movop = OPC_MOVL_GvEv;
2212
2213    /* Do big-endian loads with movbe.  */
2214    if (memop & MO_BSWAP) {
2215        tcg_debug_assert(have_movbe);
2216        use_movbe = true;
2217        movop = OPC_MOVBE_GyMy;
2218    }
2219
2220    switch (memop & MO_SSIZE) {
2221    case MO_UB:
2222        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
2223                                 h.base, h.index, 0, h.ofs);
2224        break;
2225    case MO_SB:
2226        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
2227                                 h.base, h.index, 0, h.ofs);
2228        break;
2229    case MO_UW:
2230        if (use_movbe) {
2231            /* There is no extending movbe; only low 16-bits are modified.  */
2232            if (datalo != h.base && datalo != h.index) {
2233                /* XOR breaks dependency chains.  */
2234                tgen_arithr(s, ARITH_XOR, datalo, datalo);
2235                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2236                                         datalo, h.base, h.index, 0, h.ofs);
2237            } else {
2238                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2239                                         datalo, h.base, h.index, 0, h.ofs);
2240                tcg_out_ext16u(s, datalo, datalo);
2241            }
2242        } else {
2243            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
2244                                     h.base, h.index, 0, h.ofs);
2245        }
2246        break;
2247    case MO_SW:
2248        if (use_movbe) {
2249            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2250                                     datalo, h.base, h.index, 0, h.ofs);
2251            tcg_out_ext16s(s, type, datalo, datalo);
2252        } else {
2253            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
2254                                     datalo, h.base, h.index, 0, h.ofs);
2255        }
2256        break;
2257    case MO_UL:
2258        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2259                                 h.base, h.index, 0, h.ofs);
2260        break;
2261#if TCG_TARGET_REG_BITS == 64
2262    case MO_SL:
2263        if (use_movbe) {
2264            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
2265                                     h.base, h.index, 0, h.ofs);
2266            tcg_out_ext32s(s, datalo, datalo);
2267        } else {
2268            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
2269                                     h.base, h.index, 0, h.ofs);
2270        }
2271        break;
2272#endif
2273    case MO_UQ:
2274        if (TCG_TARGET_REG_BITS == 64) {
2275            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2276                                     h.base, h.index, 0, h.ofs);
2277            break;
2278        }
2279        if (use_movbe) {
2280            TCGReg t = datalo;
2281            datalo = datahi;
2282            datahi = t;
2283        }
2284        if (h.base == datalo || h.index == datalo) {
2285            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
2286                                     h.base, h.index, 0, h.ofs);
2287            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
2288            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
2289        } else {
2290            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2291                                     h.base, h.index, 0, h.ofs);
2292            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2293                                     h.base, h.index, 0, h.ofs + 4);
2294        }
2295        break;
2296
2297    case MO_128:
2298        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2299
2300        /*
2301         * Without 16-byte atomicity, use integer regs.
2302         * That is where we want the data, and it allows bswaps.
2303         */
2304        if (h.aa.atom < MO_128) {
2305            if (use_movbe) {
2306                TCGReg t = datalo;
2307                datalo = datahi;
2308                datahi = t;
2309            }
2310            if (h.base == datalo || h.index == datalo) {
2311                tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
2312                                         h.base, h.index, 0, h.ofs);
2313                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2314                                     datalo, datahi, 0);
2315                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2316                                     datahi, datahi, 8);
2317            } else {
2318                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2319                                         h.base, h.index, 0, h.ofs);
2320                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2321                                         h.base, h.index, 0, h.ofs + 8);
2322            }
2323            break;
2324        }
2325
2326        /*
2327         * With 16-byte atomicity, a vector load is required.
2328         * If we already have 16-byte alignment, then VMOVDQA always works.
2329         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2330         * Else use we require a runtime test for alignment for VMOVDQA;
2331         * use VMOVDQU on the unaligned nonatomic path for simplicity.
2332         */
2333        if (h.aa.align >= MO_128) {
2334            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2335                                         TCG_TMP_VEC, 0,
2336                                         h.base, h.index, 0, h.ofs);
2337        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2338            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2339                                         TCG_TMP_VEC, 0,
2340                                         h.base, h.index, 0, h.ofs);
2341        } else {
2342            TCGLabel *l1 = gen_new_label();
2343            TCGLabel *l2 = gen_new_label();
2344            int jcc;
2345
2346            jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
2347            tcg_out_jxx(s, jcc, l1, true);
2348
2349            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2350                                         TCG_TMP_VEC, 0,
2351                                         h.base, h.index, 0, h.ofs);
2352            tcg_out_jxx(s, JCC_JMP, l2, true);
2353
2354            tcg_out_label(s, l1);
2355            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2356                                         TCG_TMP_VEC, 0,
2357                                         h.base, h.index, 0, h.ofs);
2358            tcg_out_label(s, l2);
2359        }
2360        tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
2361        break;
2362
2363    default:
2364        g_assert_not_reached();
2365    }
2366}
2367
2368static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2369                            TCGReg addrlo, TCGReg addrhi,
2370                            MemOpIdx oi, TCGType data_type)
2371{
2372    TCGLabelQemuLdst *ldst;
2373    HostAddress h;
2374
2375    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2376    tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2377
2378    if (ldst) {
2379        ldst->type = data_type;
2380        ldst->datalo_reg = datalo;
2381        ldst->datahi_reg = datahi;
2382        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2383    }
2384}
2385
2386static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2387                                   HostAddress h, MemOp memop)
2388{
2389    bool use_movbe = false;
2390    int movop = OPC_MOVL_EvGv;
2391
2392    /*
2393     * Do big-endian stores with movbe or system-mode.
2394     * User-only without movbe will have its swapping done generically.
2395     */
2396    if (memop & MO_BSWAP) {
2397        tcg_debug_assert(have_movbe);
2398        use_movbe = true;
2399        movop = OPC_MOVBE_MyGy;
2400    }
2401
2402    switch (memop & MO_SIZE) {
2403    case MO_8:
2404        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2405        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2406        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
2407                                 datalo, h.base, h.index, 0, h.ofs);
2408        break;
2409    case MO_16:
2410        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
2411                                 h.base, h.index, 0, h.ofs);
2412        break;
2413    case MO_32:
2414        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2415                                 h.base, h.index, 0, h.ofs);
2416        break;
2417    case MO_64:
2418        if (TCG_TARGET_REG_BITS == 64) {
2419            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2420                                     h.base, h.index, 0, h.ofs);
2421        } else {
2422            if (use_movbe) {
2423                TCGReg t = datalo;
2424                datalo = datahi;
2425                datahi = t;
2426            }
2427            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2428                                     h.base, h.index, 0, h.ofs);
2429            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2430                                     h.base, h.index, 0, h.ofs + 4);
2431        }
2432        break;
2433
2434    case MO_128:
2435        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2436
2437        /*
2438         * Without 16-byte atomicity, use integer regs.
2439         * That is where we have the data, and it allows bswaps.
2440         */
2441        if (h.aa.atom < MO_128) {
2442            if (use_movbe) {
2443                TCGReg t = datalo;
2444                datalo = datahi;
2445                datahi = t;
2446            }
2447            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2448                                     h.base, h.index, 0, h.ofs);
2449            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2450                                     h.base, h.index, 0, h.ofs + 8);
2451            break;
2452        }
2453
2454        /*
2455         * With 16-byte atomicity, a vector store is required.
2456         * If we already have 16-byte alignment, then VMOVDQA always works.
2457         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2458         * Else use we require a runtime test for alignment for VMOVDQA;
2459         * use VMOVDQU on the unaligned nonatomic path for simplicity.
2460         */
2461        tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
2462        if (h.aa.align >= MO_128) {
2463            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2464                                         TCG_TMP_VEC, 0,
2465                                         h.base, h.index, 0, h.ofs);
2466        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2467            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2468                                         TCG_TMP_VEC, 0,
2469                                         h.base, h.index, 0, h.ofs);
2470        } else {
2471            TCGLabel *l1 = gen_new_label();
2472            TCGLabel *l2 = gen_new_label();
2473            int jcc;
2474
2475            jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
2476            tcg_out_jxx(s, jcc, l1, true);
2477
2478            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2479                                         TCG_TMP_VEC, 0,
2480                                         h.base, h.index, 0, h.ofs);
2481            tcg_out_jxx(s, JCC_JMP, l2, true);
2482
2483            tcg_out_label(s, l1);
2484            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2485                                         TCG_TMP_VEC, 0,
2486                                         h.base, h.index, 0, h.ofs);
2487            tcg_out_label(s, l2);
2488        }
2489        break;
2490
2491    default:
2492        g_assert_not_reached();
2493    }
2494}
2495
2496static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2497                            TCGReg addrlo, TCGReg addrhi,
2498                            MemOpIdx oi, TCGType data_type)
2499{
2500    TCGLabelQemuLdst *ldst;
2501    HostAddress h;
2502
2503    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2504    tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2505
2506    if (ldst) {
2507        ldst->type = data_type;
2508        ldst->datalo_reg = datalo;
2509        ldst->datahi_reg = datahi;
2510        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2511    }
2512}
2513
2514static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2515{
2516    /* Reuse the zeroing that exists for goto_ptr.  */
2517    if (a0 == 0) {
2518        tcg_out_jmp(s, tcg_code_gen_epilogue);
2519    } else {
2520        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2521        tcg_out_jmp(s, tb_ret_addr);
2522    }
2523}
2524
2525static void tcg_out_goto_tb(TCGContext *s, int which)
2526{
2527    /*
2528     * Jump displacement must be aligned for atomic patching;
2529     * see if we need to add extra nops before jump
2530     */
2531    int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2532    if (gap != 1) {
2533        tcg_out_nopn(s, gap - 1);
2534    }
2535    tcg_out8(s, OPC_JMP_long); /* jmp im */
2536    set_jmp_insn_offset(s, which);
2537    tcg_out32(s, 0);
2538    set_jmp_reset_offset(s, which);
2539}
2540
2541void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2542                              uintptr_t jmp_rx, uintptr_t jmp_rw)
2543{
2544    /* patch the branch destination */
2545    uintptr_t addr = tb->jmp_target_addr[n];
2546    qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2547    /* no need to flush icache explicitly */
2548}
2549
2550static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2551                              const TCGArg args[TCG_MAX_OP_ARGS],
2552                              const int const_args[TCG_MAX_OP_ARGS])
2553{
2554    TCGArg a0, a1, a2;
2555    int c, const_a2, vexop, rexw = 0;
2556
2557#if TCG_TARGET_REG_BITS == 64
2558# define OP_32_64(x) \
2559        case glue(glue(INDEX_op_, x), _i64): \
2560            rexw = P_REXW; /* FALLTHRU */    \
2561        case glue(glue(INDEX_op_, x), _i32)
2562#else
2563# define OP_32_64(x) \
2564        case glue(glue(INDEX_op_, x), _i32)
2565#endif
2566
2567    /* Hoist the loads of the most common arguments.  */
2568    a0 = args[0];
2569    a1 = args[1];
2570    a2 = args[2];
2571    const_a2 = const_args[2];
2572
2573    switch (opc) {
2574    case INDEX_op_goto_ptr:
2575        /* jmp to the given host address (could be epilogue) */
2576        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2577        break;
2578    case INDEX_op_br:
2579        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2580        break;
2581    OP_32_64(ld8u):
2582        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2583        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2584        break;
2585    OP_32_64(ld8s):
2586        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2587        break;
2588    OP_32_64(ld16u):
2589        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2590        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2591        break;
2592    OP_32_64(ld16s):
2593        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2594        break;
2595#if TCG_TARGET_REG_BITS == 64
2596    case INDEX_op_ld32u_i64:
2597#endif
2598    case INDEX_op_ld_i32:
2599        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2600        break;
2601
2602    OP_32_64(st8):
2603        if (const_args[0]) {
2604            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2605            tcg_out8(s, a0);
2606        } else {
2607            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2608        }
2609        break;
2610    OP_32_64(st16):
2611        if (const_args[0]) {
2612            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2613            tcg_out16(s, a0);
2614        } else {
2615            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2616        }
2617        break;
2618#if TCG_TARGET_REG_BITS == 64
2619    case INDEX_op_st32_i64:
2620#endif
2621    case INDEX_op_st_i32:
2622        if (const_args[0]) {
2623            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2624            tcg_out32(s, a0);
2625        } else {
2626            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2627        }
2628        break;
2629
2630    OP_32_64(add):
2631        /* For 3-operand addition, use LEA.  */
2632        if (a0 != a1) {
2633            TCGArg c3 = 0;
2634            if (const_a2) {
2635                c3 = a2, a2 = -1;
2636            } else if (a0 == a2) {
2637                /* Watch out for dest = src + dest, since we've removed
2638                   the matching constraint on the add.  */
2639                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2640                break;
2641            }
2642
2643            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2644            break;
2645        }
2646        c = ARITH_ADD;
2647        goto gen_arith;
2648    OP_32_64(sub):
2649        c = ARITH_SUB;
2650        goto gen_arith;
2651    OP_32_64(and):
2652        c = ARITH_AND;
2653        goto gen_arith;
2654    OP_32_64(or):
2655        c = ARITH_OR;
2656        goto gen_arith;
2657    OP_32_64(xor):
2658        c = ARITH_XOR;
2659        goto gen_arith;
2660    gen_arith:
2661        if (const_a2) {
2662            tgen_arithi(s, c + rexw, a0, a2, 0);
2663        } else {
2664            tgen_arithr(s, c + rexw, a0, a2);
2665        }
2666        break;
2667
2668    OP_32_64(andc):
2669        if (const_a2) {
2670            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2671            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2672        } else {
2673            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2674        }
2675        break;
2676
2677    OP_32_64(mul):
2678        if (const_a2) {
2679            int32_t val;
2680            val = a2;
2681            if (val == (int8_t)val) {
2682                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2683                tcg_out8(s, val);
2684            } else {
2685                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2686                tcg_out32(s, val);
2687            }
2688        } else {
2689            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2690        }
2691        break;
2692
2693    OP_32_64(div2):
2694        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2695        break;
2696    OP_32_64(divu2):
2697        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2698        break;
2699
2700    OP_32_64(shl):
2701        /* For small constant 3-operand shift, use LEA.  */
2702        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2703            if (a2 - 1 == 0) {
2704                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2705                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2706            } else {
2707                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2708                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2709            }
2710            break;
2711        }
2712        c = SHIFT_SHL;
2713        vexop = OPC_SHLX;
2714        goto gen_shift_maybe_vex;
2715    OP_32_64(shr):
2716        c = SHIFT_SHR;
2717        vexop = OPC_SHRX;
2718        goto gen_shift_maybe_vex;
2719    OP_32_64(sar):
2720        c = SHIFT_SAR;
2721        vexop = OPC_SARX;
2722        goto gen_shift_maybe_vex;
2723    OP_32_64(rotl):
2724        c = SHIFT_ROL;
2725        goto gen_shift;
2726    OP_32_64(rotr):
2727        c = SHIFT_ROR;
2728        goto gen_shift;
2729    gen_shift_maybe_vex:
2730        if (have_bmi2) {
2731            if (!const_a2) {
2732                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2733                break;
2734            }
2735            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2736        }
2737        /* FALLTHRU */
2738    gen_shift:
2739        if (const_a2) {
2740            tcg_out_shifti(s, c + rexw, a0, a2);
2741        } else {
2742            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2743        }
2744        break;
2745
2746    OP_32_64(ctz):
2747        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2748        break;
2749    OP_32_64(clz):
2750        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2751        break;
2752    OP_32_64(ctpop):
2753        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2754        break;
2755
2756    OP_32_64(brcond):
2757        tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1],
2758                       arg_label(args[3]), 0);
2759        break;
2760    OP_32_64(setcond):
2761        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false);
2762        break;
2763    OP_32_64(negsetcond):
2764        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true);
2765        break;
2766    OP_32_64(movcond):
2767        tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]);
2768        break;
2769
2770    OP_32_64(bswap16):
2771        if (a2 & TCG_BSWAP_OS) {
2772            /* Output must be sign-extended. */
2773            if (rexw) {
2774                tcg_out_bswap64(s, a0);
2775                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2776            } else {
2777                tcg_out_bswap32(s, a0);
2778                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2779            }
2780        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2781            /* Output must be zero-extended, but input isn't. */
2782            tcg_out_bswap32(s, a0);
2783            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2784        } else {
2785            tcg_out_rolw_8(s, a0);
2786        }
2787        break;
2788    OP_32_64(bswap32):
2789        tcg_out_bswap32(s, a0);
2790        if (rexw && (a2 & TCG_BSWAP_OS)) {
2791            tcg_out_ext32s(s, a0, a0);
2792        }
2793        break;
2794
2795    OP_32_64(neg):
2796        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2797        break;
2798    OP_32_64(not):
2799        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2800        break;
2801
2802    case INDEX_op_qemu_ld_a64_i32:
2803        if (TCG_TARGET_REG_BITS == 32) {
2804            tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2805            break;
2806        }
2807        /* fall through */
2808    case INDEX_op_qemu_ld_a32_i32:
2809        tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2810        break;
2811    case INDEX_op_qemu_ld_a32_i64:
2812        if (TCG_TARGET_REG_BITS == 64) {
2813            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2814        } else {
2815            tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2816        }
2817        break;
2818    case INDEX_op_qemu_ld_a64_i64:
2819        if (TCG_TARGET_REG_BITS == 64) {
2820            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2821        } else {
2822            tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2823        }
2824        break;
2825    case INDEX_op_qemu_ld_a32_i128:
2826    case INDEX_op_qemu_ld_a64_i128:
2827        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2828        tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2829        break;
2830
2831    case INDEX_op_qemu_st_a64_i32:
2832    case INDEX_op_qemu_st8_a64_i32:
2833        if (TCG_TARGET_REG_BITS == 32) {
2834            tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2835            break;
2836        }
2837        /* fall through */
2838    case INDEX_op_qemu_st_a32_i32:
2839    case INDEX_op_qemu_st8_a32_i32:
2840        tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2841        break;
2842    case INDEX_op_qemu_st_a32_i64:
2843        if (TCG_TARGET_REG_BITS == 64) {
2844            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2845        } else {
2846            tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2847        }
2848        break;
2849    case INDEX_op_qemu_st_a64_i64:
2850        if (TCG_TARGET_REG_BITS == 64) {
2851            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2852        } else {
2853            tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2854        }
2855        break;
2856    case INDEX_op_qemu_st_a32_i128:
2857    case INDEX_op_qemu_st_a64_i128:
2858        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2859        tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2860        break;
2861
2862    OP_32_64(mulu2):
2863        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2864        break;
2865    OP_32_64(muls2):
2866        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2867        break;
2868    OP_32_64(add2):
2869        if (const_args[4]) {
2870            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2871        } else {
2872            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2873        }
2874        if (const_args[5]) {
2875            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2876        } else {
2877            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2878        }
2879        break;
2880    OP_32_64(sub2):
2881        if (const_args[4]) {
2882            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2883        } else {
2884            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2885        }
2886        if (const_args[5]) {
2887            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2888        } else {
2889            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2890        }
2891        break;
2892
2893#if TCG_TARGET_REG_BITS == 32
2894    case INDEX_op_brcond2_i32:
2895        tcg_out_brcond2(s, args, const_args, 0);
2896        break;
2897    case INDEX_op_setcond2_i32:
2898        tcg_out_setcond2(s, args, const_args);
2899        break;
2900#else /* TCG_TARGET_REG_BITS == 64 */
2901    case INDEX_op_ld32s_i64:
2902        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2903        break;
2904    case INDEX_op_ld_i64:
2905        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2906        break;
2907    case INDEX_op_st_i64:
2908        if (const_args[0]) {
2909            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2910            tcg_out32(s, a0);
2911        } else {
2912            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2913        }
2914        break;
2915
2916    case INDEX_op_bswap64_i64:
2917        tcg_out_bswap64(s, a0);
2918        break;
2919    case INDEX_op_extrh_i64_i32:
2920        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2921        break;
2922#endif
2923
2924    OP_32_64(deposit):
2925        if (args[3] == 0 && args[4] == 8) {
2926            /* load bits 0..7 */
2927            if (const_a2) {
2928                tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0),
2929                            0, a0, 0);
2930                tcg_out8(s, a2);
2931            } else {
2932                tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2933            }
2934        } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) {
2935            /* load bits 8..15 */
2936            if (const_a2) {
2937                tcg_out8(s, OPC_MOVB_Ib + a0 + 4);
2938                tcg_out8(s, a2);
2939            } else {
2940                tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2941            }
2942        } else if (args[3] == 0 && args[4] == 16) {
2943            /* load bits 0..15 */
2944            if (const_a2) {
2945                tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0),
2946                            0, a0, 0);
2947                tcg_out16(s, a2);
2948            } else {
2949                tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2950            }
2951        } else {
2952            g_assert_not_reached();
2953        }
2954        break;
2955
2956    case INDEX_op_extract_i64:
2957        if (a2 + args[3] == 32) {
2958            /* This is a 32-bit zero-extending right shift.  */
2959            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2960            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2961            break;
2962        }
2963        /* FALLTHRU */
2964    case INDEX_op_extract_i32:
2965        /* On the off-chance that we can use the high-byte registers.
2966           Otherwise we emit the same ext16 + shift pattern that we
2967           would have gotten from the normal tcg-op.c expansion.  */
2968        tcg_debug_assert(a2 == 8 && args[3] == 8);
2969        if (a1 < 4 && a0 < 8) {
2970            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2971        } else {
2972            tcg_out_ext16u(s, a0, a1);
2973            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2974        }
2975        break;
2976
2977    case INDEX_op_sextract_i32:
2978        /* We don't implement sextract_i64, as we cannot sign-extend to
2979           64-bits without using the REX prefix that explicitly excludes
2980           access to the high-byte registers.  */
2981        tcg_debug_assert(a2 == 8 && args[3] == 8);
2982        if (a1 < 4 && a0 < 8) {
2983            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2984        } else {
2985            tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2986            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2987        }
2988        break;
2989
2990    OP_32_64(extract2):
2991        /* Note that SHRD outputs to the r/m operand.  */
2992        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2993        tcg_out8(s, args[3]);
2994        break;
2995
2996    case INDEX_op_mb:
2997        tcg_out_mb(s, a0);
2998        break;
2999    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
3000    case INDEX_op_mov_i64:
3001    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
3002    case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
3003    case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
3004    case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
3005    case INDEX_op_ext8s_i64:
3006    case INDEX_op_ext8u_i32:
3007    case INDEX_op_ext8u_i64:
3008    case INDEX_op_ext16s_i32:
3009    case INDEX_op_ext16s_i64:
3010    case INDEX_op_ext16u_i32:
3011    case INDEX_op_ext16u_i64:
3012    case INDEX_op_ext32s_i64:
3013    case INDEX_op_ext32u_i64:
3014    case INDEX_op_ext_i32_i64:
3015    case INDEX_op_extu_i32_i64:
3016    case INDEX_op_extrl_i64_i32:
3017    default:
3018        g_assert_not_reached();
3019    }
3020
3021#undef OP_32_64
3022}
3023
3024static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
3025                           unsigned vecl, unsigned vece,
3026                           const TCGArg args[TCG_MAX_OP_ARGS],
3027                           const int const_args[TCG_MAX_OP_ARGS])
3028{
3029    static int const add_insn[4] = {
3030        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
3031    };
3032    static int const ssadd_insn[4] = {
3033        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
3034    };
3035    static int const usadd_insn[4] = {
3036        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
3037    };
3038    static int const sub_insn[4] = {
3039        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
3040    };
3041    static int const sssub_insn[4] = {
3042        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
3043    };
3044    static int const ussub_insn[4] = {
3045        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
3046    };
3047    static int const mul_insn[4] = {
3048        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
3049    };
3050    static int const shift_imm_insn[4] = {
3051        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
3052    };
3053    static int const cmpeq_insn[4] = {
3054        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
3055    };
3056    static int const cmpgt_insn[4] = {
3057        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
3058    };
3059    static int const punpckl_insn[4] = {
3060        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
3061    };
3062    static int const punpckh_insn[4] = {
3063        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
3064    };
3065    static int const packss_insn[4] = {
3066        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
3067    };
3068    static int const packus_insn[4] = {
3069        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
3070    };
3071    static int const smin_insn[4] = {
3072        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
3073    };
3074    static int const smax_insn[4] = {
3075        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
3076    };
3077    static int const umin_insn[4] = {
3078        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
3079    };
3080    static int const umax_insn[4] = {
3081        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
3082    };
3083    static int const rotlv_insn[4] = {
3084        OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
3085    };
3086    static int const rotrv_insn[4] = {
3087        OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
3088    };
3089    static int const shlv_insn[4] = {
3090        OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
3091    };
3092    static int const shrv_insn[4] = {
3093        OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
3094    };
3095    static int const sarv_insn[4] = {
3096        OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
3097    };
3098    static int const shls_insn[4] = {
3099        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
3100    };
3101    static int const shrs_insn[4] = {
3102        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
3103    };
3104    static int const sars_insn[4] = {
3105        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
3106    };
3107    static int const vpshldi_insn[4] = {
3108        OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
3109    };
3110    static int const vpshldv_insn[4] = {
3111        OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
3112    };
3113    static int const vpshrdv_insn[4] = {
3114        OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
3115    };
3116    static int const abs_insn[4] = {
3117        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
3118    };
3119
3120    TCGType type = vecl + TCG_TYPE_V64;
3121    int insn, sub;
3122    TCGArg a0, a1, a2, a3;
3123
3124    a0 = args[0];
3125    a1 = args[1];
3126    a2 = args[2];
3127
3128    switch (opc) {
3129    case INDEX_op_add_vec:
3130        insn = add_insn[vece];
3131        goto gen_simd;
3132    case INDEX_op_ssadd_vec:
3133        insn = ssadd_insn[vece];
3134        goto gen_simd;
3135    case INDEX_op_usadd_vec:
3136        insn = usadd_insn[vece];
3137        goto gen_simd;
3138    case INDEX_op_sub_vec:
3139        insn = sub_insn[vece];
3140        goto gen_simd;
3141    case INDEX_op_sssub_vec:
3142        insn = sssub_insn[vece];
3143        goto gen_simd;
3144    case INDEX_op_ussub_vec:
3145        insn = ussub_insn[vece];
3146        goto gen_simd;
3147    case INDEX_op_mul_vec:
3148        insn = mul_insn[vece];
3149        goto gen_simd;
3150    case INDEX_op_and_vec:
3151        insn = OPC_PAND;
3152        goto gen_simd;
3153    case INDEX_op_or_vec:
3154        insn = OPC_POR;
3155        goto gen_simd;
3156    case INDEX_op_xor_vec:
3157        insn = OPC_PXOR;
3158        goto gen_simd;
3159    case INDEX_op_smin_vec:
3160        insn = smin_insn[vece];
3161        goto gen_simd;
3162    case INDEX_op_umin_vec:
3163        insn = umin_insn[vece];
3164        goto gen_simd;
3165    case INDEX_op_smax_vec:
3166        insn = smax_insn[vece];
3167        goto gen_simd;
3168    case INDEX_op_umax_vec:
3169        insn = umax_insn[vece];
3170        goto gen_simd;
3171    case INDEX_op_shlv_vec:
3172        insn = shlv_insn[vece];
3173        goto gen_simd;
3174    case INDEX_op_shrv_vec:
3175        insn = shrv_insn[vece];
3176        goto gen_simd;
3177    case INDEX_op_sarv_vec:
3178        insn = sarv_insn[vece];
3179        goto gen_simd;
3180    case INDEX_op_rotlv_vec:
3181        insn = rotlv_insn[vece];
3182        goto gen_simd;
3183    case INDEX_op_rotrv_vec:
3184        insn = rotrv_insn[vece];
3185        goto gen_simd;
3186    case INDEX_op_shls_vec:
3187        insn = shls_insn[vece];
3188        goto gen_simd;
3189    case INDEX_op_shrs_vec:
3190        insn = shrs_insn[vece];
3191        goto gen_simd;
3192    case INDEX_op_sars_vec:
3193        insn = sars_insn[vece];
3194        goto gen_simd;
3195    case INDEX_op_x86_punpckl_vec:
3196        insn = punpckl_insn[vece];
3197        goto gen_simd;
3198    case INDEX_op_x86_punpckh_vec:
3199        insn = punpckh_insn[vece];
3200        goto gen_simd;
3201    case INDEX_op_x86_packss_vec:
3202        insn = packss_insn[vece];
3203        goto gen_simd;
3204    case INDEX_op_x86_packus_vec:
3205        insn = packus_insn[vece];
3206        goto gen_simd;
3207    case INDEX_op_x86_vpshldv_vec:
3208        insn = vpshldv_insn[vece];
3209        a1 = a2;
3210        a2 = args[3];
3211        goto gen_simd;
3212    case INDEX_op_x86_vpshrdv_vec:
3213        insn = vpshrdv_insn[vece];
3214        a1 = a2;
3215        a2 = args[3];
3216        goto gen_simd;
3217#if TCG_TARGET_REG_BITS == 32
3218    case INDEX_op_dup2_vec:
3219        /* First merge the two 32-bit inputs to a single 64-bit element. */
3220        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3221        /* Then replicate the 64-bit elements across the rest of the vector. */
3222        if (type != TCG_TYPE_V64) {
3223            tcg_out_dup_vec(s, type, MO_64, a0, a0);
3224        }
3225        break;
3226#endif
3227    case INDEX_op_abs_vec:
3228        insn = abs_insn[vece];
3229        a2 = a1;
3230        a1 = 0;
3231        goto gen_simd;
3232    gen_simd:
3233        tcg_debug_assert(insn != OPC_UD2);
3234        if (type == TCG_TYPE_V256) {
3235            insn |= P_VEXL;
3236        }
3237        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3238        break;
3239
3240    case INDEX_op_cmp_vec:
3241        sub = args[3];
3242        if (sub == TCG_COND_EQ) {
3243            insn = cmpeq_insn[vece];
3244        } else if (sub == TCG_COND_GT) {
3245            insn = cmpgt_insn[vece];
3246        } else {
3247            g_assert_not_reached();
3248        }
3249        goto gen_simd;
3250
3251    case INDEX_op_andc_vec:
3252        insn = OPC_PANDN;
3253        if (type == TCG_TYPE_V256) {
3254            insn |= P_VEXL;
3255        }
3256        tcg_out_vex_modrm(s, insn, a0, a2, a1);
3257        break;
3258
3259    case INDEX_op_shli_vec:
3260        insn = shift_imm_insn[vece];
3261        sub = 6;
3262        goto gen_shift;
3263    case INDEX_op_shri_vec:
3264        insn = shift_imm_insn[vece];
3265        sub = 2;
3266        goto gen_shift;
3267    case INDEX_op_sari_vec:
3268        if (vece == MO_64) {
3269            insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3270        } else {
3271            insn = shift_imm_insn[vece];
3272        }
3273        sub = 4;
3274        goto gen_shift;
3275    case INDEX_op_rotli_vec:
3276        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
3277        if (vece == MO_64) {
3278            insn |= P_VEXW;
3279        }
3280        sub = 1;
3281        goto gen_shift;
3282    gen_shift:
3283        tcg_debug_assert(vece != MO_8);
3284        if (type == TCG_TYPE_V256) {
3285            insn |= P_VEXL;
3286        }
3287        tcg_out_vex_modrm(s, insn, sub, a0, a1);
3288        tcg_out8(s, a2);
3289        break;
3290
3291    case INDEX_op_ld_vec:
3292        tcg_out_ld(s, type, a0, a1, a2);
3293        break;
3294    case INDEX_op_st_vec:
3295        tcg_out_st(s, type, a0, a1, a2);
3296        break;
3297    case INDEX_op_dupm_vec:
3298        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3299        break;
3300
3301    case INDEX_op_x86_shufps_vec:
3302        insn = OPC_SHUFPS;
3303        sub = args[3];
3304        goto gen_simd_imm8;
3305    case INDEX_op_x86_blend_vec:
3306        if (vece == MO_16) {
3307            insn = OPC_PBLENDW;
3308        } else if (vece == MO_32) {
3309            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3310        } else {
3311            g_assert_not_reached();
3312        }
3313        sub = args[3];
3314        goto gen_simd_imm8;
3315    case INDEX_op_x86_vperm2i128_vec:
3316        insn = OPC_VPERM2I128;
3317        sub = args[3];
3318        goto gen_simd_imm8;
3319    case INDEX_op_x86_vpshldi_vec:
3320        insn = vpshldi_insn[vece];
3321        sub = args[3];
3322        goto gen_simd_imm8;
3323
3324    case INDEX_op_not_vec:
3325        insn = OPC_VPTERNLOGQ;
3326        a2 = a1;
3327        sub = 0x33; /* !B */
3328        goto gen_simd_imm8;
3329    case INDEX_op_nor_vec:
3330        insn = OPC_VPTERNLOGQ;
3331        sub = 0x11; /* norCB */
3332        goto gen_simd_imm8;
3333    case INDEX_op_nand_vec:
3334        insn = OPC_VPTERNLOGQ;
3335        sub = 0x77; /* nandCB */
3336        goto gen_simd_imm8;
3337    case INDEX_op_eqv_vec:
3338        insn = OPC_VPTERNLOGQ;
3339        sub = 0x99; /* xnorCB */
3340        goto gen_simd_imm8;
3341    case INDEX_op_orc_vec:
3342        insn = OPC_VPTERNLOGQ;
3343        sub = 0xdd; /* orB!C */
3344        goto gen_simd_imm8;
3345
3346    case INDEX_op_bitsel_vec:
3347        insn = OPC_VPTERNLOGQ;
3348        a3 = args[3];
3349        if (a0 == a1) {
3350            a1 = a2;
3351            a2 = a3;
3352            sub = 0xca; /* A?B:C */
3353        } else if (a0 == a2) {
3354            a2 = a3;
3355            sub = 0xe2; /* B?A:C */
3356        } else {
3357            tcg_out_mov(s, type, a0, a3);
3358            sub = 0xb8; /* B?C:A */
3359        }
3360        goto gen_simd_imm8;
3361
3362    gen_simd_imm8:
3363        tcg_debug_assert(insn != OPC_UD2);
3364        if (type == TCG_TYPE_V256) {
3365            insn |= P_VEXL;
3366        }
3367        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3368        tcg_out8(s, sub);
3369        break;
3370
3371    case INDEX_op_x86_vpblendvb_vec:
3372        insn = OPC_VPBLENDVB;
3373        if (type == TCG_TYPE_V256) {
3374            insn |= P_VEXL;
3375        }
3376        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3377        tcg_out8(s, args[3] << 4);
3378        break;
3379
3380    case INDEX_op_x86_psrldq_vec:
3381        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3382        tcg_out8(s, a2);
3383        break;
3384
3385    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3386    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3387    default:
3388        g_assert_not_reached();
3389    }
3390}
3391
3392static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3393{
3394    switch (op) {
3395    case INDEX_op_goto_ptr:
3396        return C_O0_I1(r);
3397
3398    case INDEX_op_ld8u_i32:
3399    case INDEX_op_ld8u_i64:
3400    case INDEX_op_ld8s_i32:
3401    case INDEX_op_ld8s_i64:
3402    case INDEX_op_ld16u_i32:
3403    case INDEX_op_ld16u_i64:
3404    case INDEX_op_ld16s_i32:
3405    case INDEX_op_ld16s_i64:
3406    case INDEX_op_ld_i32:
3407    case INDEX_op_ld32u_i64:
3408    case INDEX_op_ld32s_i64:
3409    case INDEX_op_ld_i64:
3410        return C_O1_I1(r, r);
3411
3412    case INDEX_op_st8_i32:
3413    case INDEX_op_st8_i64:
3414        return C_O0_I2(qi, r);
3415
3416    case INDEX_op_st16_i32:
3417    case INDEX_op_st16_i64:
3418    case INDEX_op_st_i32:
3419    case INDEX_op_st32_i64:
3420        return C_O0_I2(ri, r);
3421
3422    case INDEX_op_st_i64:
3423        return C_O0_I2(re, r);
3424
3425    case INDEX_op_add_i32:
3426    case INDEX_op_add_i64:
3427        return C_O1_I2(r, r, re);
3428
3429    case INDEX_op_sub_i32:
3430    case INDEX_op_sub_i64:
3431    case INDEX_op_mul_i32:
3432    case INDEX_op_mul_i64:
3433    case INDEX_op_or_i32:
3434    case INDEX_op_or_i64:
3435    case INDEX_op_xor_i32:
3436    case INDEX_op_xor_i64:
3437        return C_O1_I2(r, 0, re);
3438
3439    case INDEX_op_and_i32:
3440    case INDEX_op_and_i64:
3441        return C_O1_I2(r, 0, reZ);
3442
3443    case INDEX_op_andc_i32:
3444    case INDEX_op_andc_i64:
3445        return C_O1_I2(r, r, rI);
3446
3447    case INDEX_op_shl_i32:
3448    case INDEX_op_shl_i64:
3449    case INDEX_op_shr_i32:
3450    case INDEX_op_shr_i64:
3451    case INDEX_op_sar_i32:
3452    case INDEX_op_sar_i64:
3453        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3454
3455    case INDEX_op_rotl_i32:
3456    case INDEX_op_rotl_i64:
3457    case INDEX_op_rotr_i32:
3458    case INDEX_op_rotr_i64:
3459        return C_O1_I2(r, 0, ci);
3460
3461    case INDEX_op_brcond_i32:
3462    case INDEX_op_brcond_i64:
3463        return C_O0_I2(r, reT);
3464
3465    case INDEX_op_bswap16_i32:
3466    case INDEX_op_bswap16_i64:
3467    case INDEX_op_bswap32_i32:
3468    case INDEX_op_bswap32_i64:
3469    case INDEX_op_bswap64_i64:
3470    case INDEX_op_neg_i32:
3471    case INDEX_op_neg_i64:
3472    case INDEX_op_not_i32:
3473    case INDEX_op_not_i64:
3474    case INDEX_op_extrh_i64_i32:
3475        return C_O1_I1(r, 0);
3476
3477    case INDEX_op_ext8s_i32:
3478    case INDEX_op_ext8s_i64:
3479    case INDEX_op_ext8u_i32:
3480    case INDEX_op_ext8u_i64:
3481        return C_O1_I1(r, q);
3482
3483    case INDEX_op_ext16s_i32:
3484    case INDEX_op_ext16s_i64:
3485    case INDEX_op_ext16u_i32:
3486    case INDEX_op_ext16u_i64:
3487    case INDEX_op_ext32s_i64:
3488    case INDEX_op_ext32u_i64:
3489    case INDEX_op_ext_i32_i64:
3490    case INDEX_op_extu_i32_i64:
3491    case INDEX_op_extrl_i64_i32:
3492    case INDEX_op_extract_i32:
3493    case INDEX_op_extract_i64:
3494    case INDEX_op_sextract_i32:
3495    case INDEX_op_ctpop_i32:
3496    case INDEX_op_ctpop_i64:
3497        return C_O1_I1(r, r);
3498
3499    case INDEX_op_extract2_i32:
3500    case INDEX_op_extract2_i64:
3501        return C_O1_I2(r, 0, r);
3502
3503    case INDEX_op_deposit_i32:
3504    case INDEX_op_deposit_i64:
3505        return C_O1_I2(q, 0, qi);
3506
3507    case INDEX_op_setcond_i32:
3508    case INDEX_op_setcond_i64:
3509    case INDEX_op_negsetcond_i32:
3510    case INDEX_op_negsetcond_i64:
3511        return C_O1_I2(q, r, reT);
3512
3513    case INDEX_op_movcond_i32:
3514    case INDEX_op_movcond_i64:
3515        return C_O1_I4(r, r, reT, r, 0);
3516
3517    case INDEX_op_div2_i32:
3518    case INDEX_op_div2_i64:
3519    case INDEX_op_divu2_i32:
3520    case INDEX_op_divu2_i64:
3521        return C_O2_I3(a, d, 0, 1, r);
3522
3523    case INDEX_op_mulu2_i32:
3524    case INDEX_op_mulu2_i64:
3525    case INDEX_op_muls2_i32:
3526    case INDEX_op_muls2_i64:
3527        return C_O2_I2(a, d, a, r);
3528
3529    case INDEX_op_add2_i32:
3530    case INDEX_op_add2_i64:
3531    case INDEX_op_sub2_i32:
3532    case INDEX_op_sub2_i64:
3533        return C_N1_O1_I4(r, r, 0, 1, re, re);
3534
3535    case INDEX_op_ctz_i32:
3536    case INDEX_op_ctz_i64:
3537        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3538
3539    case INDEX_op_clz_i32:
3540    case INDEX_op_clz_i64:
3541        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3542
3543    case INDEX_op_qemu_ld_a32_i32:
3544        return C_O1_I1(r, L);
3545    case INDEX_op_qemu_ld_a64_i32:
3546        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3547
3548    case INDEX_op_qemu_st_a32_i32:
3549        return C_O0_I2(L, L);
3550    case INDEX_op_qemu_st_a64_i32:
3551        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3552    case INDEX_op_qemu_st8_a32_i32:
3553        return C_O0_I2(s, L);
3554    case INDEX_op_qemu_st8_a64_i32:
3555        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3556
3557    case INDEX_op_qemu_ld_a32_i64:
3558        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3559    case INDEX_op_qemu_ld_a64_i64:
3560        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3561
3562    case INDEX_op_qemu_st_a32_i64:
3563        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3564    case INDEX_op_qemu_st_a64_i64:
3565        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3566
3567    case INDEX_op_qemu_ld_a32_i128:
3568    case INDEX_op_qemu_ld_a64_i128:
3569        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3570        return C_O2_I1(r, r, L);
3571    case INDEX_op_qemu_st_a32_i128:
3572    case INDEX_op_qemu_st_a64_i128:
3573        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3574        return C_O0_I3(L, L, L);
3575
3576    case INDEX_op_brcond2_i32:
3577        return C_O0_I4(r, r, ri, ri);
3578
3579    case INDEX_op_setcond2_i32:
3580        return C_O1_I4(r, r, r, ri, ri);
3581
3582    case INDEX_op_ld_vec:
3583    case INDEX_op_dupm_vec:
3584        return C_O1_I1(x, r);
3585
3586    case INDEX_op_st_vec:
3587        return C_O0_I2(x, r);
3588
3589    case INDEX_op_add_vec:
3590    case INDEX_op_sub_vec:
3591    case INDEX_op_mul_vec:
3592    case INDEX_op_and_vec:
3593    case INDEX_op_or_vec:
3594    case INDEX_op_xor_vec:
3595    case INDEX_op_andc_vec:
3596    case INDEX_op_orc_vec:
3597    case INDEX_op_nand_vec:
3598    case INDEX_op_nor_vec:
3599    case INDEX_op_eqv_vec:
3600    case INDEX_op_ssadd_vec:
3601    case INDEX_op_usadd_vec:
3602    case INDEX_op_sssub_vec:
3603    case INDEX_op_ussub_vec:
3604    case INDEX_op_smin_vec:
3605    case INDEX_op_umin_vec:
3606    case INDEX_op_smax_vec:
3607    case INDEX_op_umax_vec:
3608    case INDEX_op_shlv_vec:
3609    case INDEX_op_shrv_vec:
3610    case INDEX_op_sarv_vec:
3611    case INDEX_op_rotlv_vec:
3612    case INDEX_op_rotrv_vec:
3613    case INDEX_op_shls_vec:
3614    case INDEX_op_shrs_vec:
3615    case INDEX_op_sars_vec:
3616    case INDEX_op_cmp_vec:
3617    case INDEX_op_x86_shufps_vec:
3618    case INDEX_op_x86_blend_vec:
3619    case INDEX_op_x86_packss_vec:
3620    case INDEX_op_x86_packus_vec:
3621    case INDEX_op_x86_vperm2i128_vec:
3622    case INDEX_op_x86_punpckl_vec:
3623    case INDEX_op_x86_punpckh_vec:
3624    case INDEX_op_x86_vpshldi_vec:
3625#if TCG_TARGET_REG_BITS == 32
3626    case INDEX_op_dup2_vec:
3627#endif
3628        return C_O1_I2(x, x, x);
3629
3630    case INDEX_op_abs_vec:
3631    case INDEX_op_dup_vec:
3632    case INDEX_op_not_vec:
3633    case INDEX_op_shli_vec:
3634    case INDEX_op_shri_vec:
3635    case INDEX_op_sari_vec:
3636    case INDEX_op_rotli_vec:
3637    case INDEX_op_x86_psrldq_vec:
3638        return C_O1_I1(x, x);
3639
3640    case INDEX_op_x86_vpshldv_vec:
3641    case INDEX_op_x86_vpshrdv_vec:
3642        return C_O1_I3(x, 0, x, x);
3643
3644    case INDEX_op_bitsel_vec:
3645    case INDEX_op_x86_vpblendvb_vec:
3646        return C_O1_I3(x, x, x, x);
3647
3648    default:
3649        g_assert_not_reached();
3650    }
3651}
3652
3653int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3654{
3655    switch (opc) {
3656    case INDEX_op_add_vec:
3657    case INDEX_op_sub_vec:
3658    case INDEX_op_and_vec:
3659    case INDEX_op_or_vec:
3660    case INDEX_op_xor_vec:
3661    case INDEX_op_andc_vec:
3662    case INDEX_op_orc_vec:
3663    case INDEX_op_nand_vec:
3664    case INDEX_op_nor_vec:
3665    case INDEX_op_eqv_vec:
3666    case INDEX_op_not_vec:
3667    case INDEX_op_bitsel_vec:
3668        return 1;
3669    case INDEX_op_cmp_vec:
3670    case INDEX_op_cmpsel_vec:
3671        return -1;
3672
3673    case INDEX_op_rotli_vec:
3674        return have_avx512vl && vece >= MO_32 ? 1 : -1;
3675
3676    case INDEX_op_shli_vec:
3677    case INDEX_op_shri_vec:
3678        /* We must expand the operation for MO_8.  */
3679        return vece == MO_8 ? -1 : 1;
3680
3681    case INDEX_op_sari_vec:
3682        switch (vece) {
3683        case MO_8:
3684            return -1;
3685        case MO_16:
3686        case MO_32:
3687            return 1;
3688        case MO_64:
3689            if (have_avx512vl) {
3690                return 1;
3691            }
3692            /*
3693             * We can emulate this for MO_64, but it does not pay off
3694             * unless we're producing at least 4 values.
3695             */
3696            return type >= TCG_TYPE_V256 ? -1 : 0;
3697        }
3698        return 0;
3699
3700    case INDEX_op_shls_vec:
3701    case INDEX_op_shrs_vec:
3702        return vece >= MO_16;
3703    case INDEX_op_sars_vec:
3704        switch (vece) {
3705        case MO_16:
3706        case MO_32:
3707            return 1;
3708        case MO_64:
3709            return have_avx512vl;
3710        }
3711        return 0;
3712    case INDEX_op_rotls_vec:
3713        return vece >= MO_16 ? -1 : 0;
3714
3715    case INDEX_op_shlv_vec:
3716    case INDEX_op_shrv_vec:
3717        switch (vece) {
3718        case MO_16:
3719            return have_avx512bw;
3720        case MO_32:
3721        case MO_64:
3722            return have_avx2;
3723        }
3724        return 0;
3725    case INDEX_op_sarv_vec:
3726        switch (vece) {
3727        case MO_16:
3728            return have_avx512bw;
3729        case MO_32:
3730            return have_avx2;
3731        case MO_64:
3732            return have_avx512vl;
3733        }
3734        return 0;
3735    case INDEX_op_rotlv_vec:
3736    case INDEX_op_rotrv_vec:
3737        switch (vece) {
3738        case MO_16:
3739            return have_avx512vbmi2 ? -1 : 0;
3740        case MO_32:
3741        case MO_64:
3742            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3743        }
3744        return 0;
3745
3746    case INDEX_op_mul_vec:
3747        switch (vece) {
3748        case MO_8:
3749            return -1;
3750        case MO_64:
3751            return have_avx512dq;
3752        }
3753        return 1;
3754
3755    case INDEX_op_ssadd_vec:
3756    case INDEX_op_usadd_vec:
3757    case INDEX_op_sssub_vec:
3758    case INDEX_op_ussub_vec:
3759        return vece <= MO_16;
3760    case INDEX_op_smin_vec:
3761    case INDEX_op_smax_vec:
3762    case INDEX_op_umin_vec:
3763    case INDEX_op_umax_vec:
3764    case INDEX_op_abs_vec:
3765        return vece <= MO_32 || have_avx512vl;
3766
3767    default:
3768        return 0;
3769    }
3770}
3771
3772static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3773                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3774{
3775    TCGv_vec t1, t2;
3776
3777    tcg_debug_assert(vece == MO_8);
3778
3779    t1 = tcg_temp_new_vec(type);
3780    t2 = tcg_temp_new_vec(type);
3781
3782    /*
3783     * Unpack to W, shift, and repack.  Tricky bits:
3784     * (1) Use punpck*bw x,x to produce DDCCBBAA,
3785     *     i.e. duplicate in other half of the 16-bit lane.
3786     * (2) For right-shift, add 8 so that the high half of the lane
3787     *     becomes zero.  For left-shift, and left-rotate, we must
3788     *     shift up and down again.
3789     * (3) Step 2 leaves high half zero such that PACKUSWB
3790     *     (pack with unsigned saturation) does not modify
3791     *     the quantity.
3792     */
3793    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3794              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3795    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3796              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3797
3798    if (opc != INDEX_op_rotli_vec) {
3799        imm += 8;
3800    }
3801    if (opc == INDEX_op_shri_vec) {
3802        tcg_gen_shri_vec(MO_16, t1, t1, imm);
3803        tcg_gen_shri_vec(MO_16, t2, t2, imm);
3804    } else {
3805        tcg_gen_shli_vec(MO_16, t1, t1, imm);
3806        tcg_gen_shli_vec(MO_16, t2, t2, imm);
3807        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3808        tcg_gen_shri_vec(MO_16, t2, t2, 8);
3809    }
3810
3811    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3812              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3813    tcg_temp_free_vec(t1);
3814    tcg_temp_free_vec(t2);
3815}
3816
3817static void expand_vec_sari(TCGType type, unsigned vece,
3818                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3819{
3820    TCGv_vec t1, t2;
3821
3822    switch (vece) {
3823    case MO_8:
3824        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3825        t1 = tcg_temp_new_vec(type);
3826        t2 = tcg_temp_new_vec(type);
3827        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3828                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3829        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3830                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3831        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3832        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3833        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3834                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3835        tcg_temp_free_vec(t1);
3836        tcg_temp_free_vec(t2);
3837        break;
3838
3839    case MO_64:
3840        t1 = tcg_temp_new_vec(type);
3841        if (imm <= 32) {
3842            /*
3843             * We can emulate a small sign extend by performing an arithmetic
3844             * 32-bit shift and overwriting the high half of a 64-bit logical
3845             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3846             * does not, so we have to bound the smaller shift -- we get the
3847             * same result in the high half either way.
3848             */
3849            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3850            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3851            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3852                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3853                      tcgv_vec_arg(t1), 0xaa);
3854        } else {
3855            /* Otherwise we will need to use a compare vs 0 to produce
3856             * the sign-extend, shift and merge.
3857             */
3858            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3859                            tcg_constant_vec(type, MO_64, 0), v1);
3860            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3861            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3862            tcg_gen_or_vec(MO_64, v0, v0, t1);
3863        }
3864        tcg_temp_free_vec(t1);
3865        break;
3866
3867    default:
3868        g_assert_not_reached();
3869    }
3870}
3871
3872static void expand_vec_rotli(TCGType type, unsigned vece,
3873                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3874{
3875    TCGv_vec t;
3876
3877    if (vece == MO_8) {
3878        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3879        return;
3880    }
3881
3882    if (have_avx512vbmi2) {
3883        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3884                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3885        return;
3886    }
3887
3888    t = tcg_temp_new_vec(type);
3889    tcg_gen_shli_vec(vece, t, v1, imm);
3890    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3891    tcg_gen_or_vec(vece, v0, v0, t);
3892    tcg_temp_free_vec(t);
3893}
3894
3895static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3896                            TCGv_vec v1, TCGv_vec sh, bool right)
3897{
3898    TCGv_vec t;
3899
3900    if (have_avx512vbmi2) {
3901        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3902                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3903                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3904        return;
3905    }
3906
3907    t = tcg_temp_new_vec(type);
3908    tcg_gen_dupi_vec(vece, t, 8 << vece);
3909    tcg_gen_sub_vec(vece, t, t, sh);
3910    if (right) {
3911        tcg_gen_shlv_vec(vece, t, v1, t);
3912        tcg_gen_shrv_vec(vece, v0, v1, sh);
3913    } else {
3914        tcg_gen_shrv_vec(vece, t, v1, t);
3915        tcg_gen_shlv_vec(vece, v0, v1, sh);
3916    }
3917    tcg_gen_or_vec(vece, v0, v0, t);
3918    tcg_temp_free_vec(t);
3919}
3920
3921static void expand_vec_rotls(TCGType type, unsigned vece,
3922                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3923{
3924    TCGv_vec t = tcg_temp_new_vec(type);
3925
3926    tcg_debug_assert(vece != MO_8);
3927
3928    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3929        tcg_gen_dup_i32_vec(vece, t, lsh);
3930        if (vece >= MO_32) {
3931            tcg_gen_rotlv_vec(vece, v0, v1, t);
3932        } else {
3933            expand_vec_rotv(type, vece, v0, v1, t, false);
3934        }
3935    } else {
3936        TCGv_i32 rsh = tcg_temp_new_i32();
3937
3938        tcg_gen_neg_i32(rsh, lsh);
3939        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3940        tcg_gen_shls_vec(vece, t, v1, lsh);
3941        tcg_gen_shrs_vec(vece, v0, v1, rsh);
3942        tcg_gen_or_vec(vece, v0, v0, t);
3943
3944        tcg_temp_free_i32(rsh);
3945    }
3946
3947    tcg_temp_free_vec(t);
3948}
3949
3950static void expand_vec_mul(TCGType type, unsigned vece,
3951                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3952{
3953    TCGv_vec t1, t2, t3, t4, zero;
3954
3955    tcg_debug_assert(vece == MO_8);
3956
3957    /*
3958     * Unpack v1 bytes to words, 0 | x.
3959     * Unpack v2 bytes to words, y | 0.
3960     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3961     * Shift logical right by 8 bits to clear the high 8 bytes before
3962     * using an unsigned saturated pack.
3963     *
3964     * The difference between the V64, V128 and V256 cases is merely how
3965     * we distribute the expansion between temporaries.
3966     */
3967    switch (type) {
3968    case TCG_TYPE_V64:
3969        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3970        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3971        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3972        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3973                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3974        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3975                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3976        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3977        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3978        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3979                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3980        tcg_temp_free_vec(t1);
3981        tcg_temp_free_vec(t2);
3982        break;
3983
3984    case TCG_TYPE_V128:
3985    case TCG_TYPE_V256:
3986        t1 = tcg_temp_new_vec(type);
3987        t2 = tcg_temp_new_vec(type);
3988        t3 = tcg_temp_new_vec(type);
3989        t4 = tcg_temp_new_vec(type);
3990        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3991        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3992                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3993        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3994                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3995        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3996                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3997        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3998                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3999        tcg_gen_mul_vec(MO_16, t1, t1, t2);
4000        tcg_gen_mul_vec(MO_16, t3, t3, t4);
4001        tcg_gen_shri_vec(MO_16, t1, t1, 8);
4002        tcg_gen_shri_vec(MO_16, t3, t3, 8);
4003        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
4004                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
4005        tcg_temp_free_vec(t1);
4006        tcg_temp_free_vec(t2);
4007        tcg_temp_free_vec(t3);
4008        tcg_temp_free_vec(t4);
4009        break;
4010
4011    default:
4012        g_assert_not_reached();
4013    }
4014}
4015
4016static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
4017                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
4018{
4019    enum {
4020        NEED_INV  = 1,
4021        NEED_SWAP = 2,
4022        NEED_BIAS = 4,
4023        NEED_UMIN = 8,
4024        NEED_UMAX = 16,
4025    };
4026    TCGv_vec t1, t2, t3;
4027    uint8_t fixup;
4028
4029    switch (cond) {
4030    case TCG_COND_EQ:
4031    case TCG_COND_GT:
4032        fixup = 0;
4033        break;
4034    case TCG_COND_NE:
4035    case TCG_COND_LE:
4036        fixup = NEED_INV;
4037        break;
4038    case TCG_COND_LT:
4039        fixup = NEED_SWAP;
4040        break;
4041    case TCG_COND_GE:
4042        fixup = NEED_SWAP | NEED_INV;
4043        break;
4044    case TCG_COND_LEU:
4045        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
4046            fixup = NEED_UMIN;
4047        } else {
4048            fixup = NEED_BIAS | NEED_INV;
4049        }
4050        break;
4051    case TCG_COND_GTU:
4052        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
4053            fixup = NEED_UMIN | NEED_INV;
4054        } else {
4055            fixup = NEED_BIAS;
4056        }
4057        break;
4058    case TCG_COND_GEU:
4059        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
4060            fixup = NEED_UMAX;
4061        } else {
4062            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
4063        }
4064        break;
4065    case TCG_COND_LTU:
4066        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
4067            fixup = NEED_UMAX | NEED_INV;
4068        } else {
4069            fixup = NEED_BIAS | NEED_SWAP;
4070        }
4071        break;
4072    default:
4073        g_assert_not_reached();
4074    }
4075
4076    if (fixup & NEED_INV) {
4077        cond = tcg_invert_cond(cond);
4078    }
4079    if (fixup & NEED_SWAP) {
4080        t1 = v1, v1 = v2, v2 = t1;
4081        cond = tcg_swap_cond(cond);
4082    }
4083
4084    t1 = t2 = NULL;
4085    if (fixup & (NEED_UMIN | NEED_UMAX)) {
4086        t1 = tcg_temp_new_vec(type);
4087        if (fixup & NEED_UMIN) {
4088            tcg_gen_umin_vec(vece, t1, v1, v2);
4089        } else {
4090            tcg_gen_umax_vec(vece, t1, v1, v2);
4091        }
4092        v2 = t1;
4093        cond = TCG_COND_EQ;
4094    } else if (fixup & NEED_BIAS) {
4095        t1 = tcg_temp_new_vec(type);
4096        t2 = tcg_temp_new_vec(type);
4097        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
4098        tcg_gen_sub_vec(vece, t1, v1, t3);
4099        tcg_gen_sub_vec(vece, t2, v2, t3);
4100        v1 = t1;
4101        v2 = t2;
4102        cond = tcg_signed_cond(cond);
4103    }
4104
4105    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
4106    /* Expand directly; do not recurse.  */
4107    vec_gen_4(INDEX_op_cmp_vec, type, vece,
4108              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
4109
4110    if (t1) {
4111        tcg_temp_free_vec(t1);
4112        if (t2) {
4113            tcg_temp_free_vec(t2);
4114        }
4115    }
4116    return fixup & NEED_INV;
4117}
4118
4119static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
4120                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
4121{
4122    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
4123        tcg_gen_not_vec(vece, v0, v0);
4124    }
4125}
4126
4127static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
4128                              TCGv_vec c1, TCGv_vec c2,
4129                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
4130{
4131    TCGv_vec t = tcg_temp_new_vec(type);
4132
4133    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
4134        /* Invert the sense of the compare by swapping arguments.  */
4135        TCGv_vec x;
4136        x = v3, v3 = v4, v4 = x;
4137    }
4138    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
4139              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
4140              tcgv_vec_arg(v3), tcgv_vec_arg(t));
4141    tcg_temp_free_vec(t);
4142}
4143
4144void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
4145                       TCGArg a0, ...)
4146{
4147    va_list va;
4148    TCGArg a2;
4149    TCGv_vec v0, v1, v2, v3, v4;
4150
4151    va_start(va, a0);
4152    v0 = temp_tcgv_vec(arg_temp(a0));
4153    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4154    a2 = va_arg(va, TCGArg);
4155
4156    switch (opc) {
4157    case INDEX_op_shli_vec:
4158    case INDEX_op_shri_vec:
4159        expand_vec_shi(type, vece, opc, v0, v1, a2);
4160        break;
4161
4162    case INDEX_op_sari_vec:
4163        expand_vec_sari(type, vece, v0, v1, a2);
4164        break;
4165
4166    case INDEX_op_rotli_vec:
4167        expand_vec_rotli(type, vece, v0, v1, a2);
4168        break;
4169
4170    case INDEX_op_rotls_vec:
4171        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
4172        break;
4173
4174    case INDEX_op_rotlv_vec:
4175        v2 = temp_tcgv_vec(arg_temp(a2));
4176        expand_vec_rotv(type, vece, v0, v1, v2, false);
4177        break;
4178    case INDEX_op_rotrv_vec:
4179        v2 = temp_tcgv_vec(arg_temp(a2));
4180        expand_vec_rotv(type, vece, v0, v1, v2, true);
4181        break;
4182
4183    case INDEX_op_mul_vec:
4184        v2 = temp_tcgv_vec(arg_temp(a2));
4185        expand_vec_mul(type, vece, v0, v1, v2);
4186        break;
4187
4188    case INDEX_op_cmp_vec:
4189        v2 = temp_tcgv_vec(arg_temp(a2));
4190        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
4191        break;
4192
4193    case INDEX_op_cmpsel_vec:
4194        v2 = temp_tcgv_vec(arg_temp(a2));
4195        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4196        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4197        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
4198        break;
4199
4200    default:
4201        break;
4202    }
4203
4204    va_end(va);
4205}
4206
4207static const int tcg_target_callee_save_regs[] = {
4208#if TCG_TARGET_REG_BITS == 64
4209    TCG_REG_RBP,
4210    TCG_REG_RBX,
4211#if defined(_WIN64)
4212    TCG_REG_RDI,
4213    TCG_REG_RSI,
4214#endif
4215    TCG_REG_R12,
4216    TCG_REG_R13,
4217    TCG_REG_R14, /* Currently used for the global env. */
4218    TCG_REG_R15,
4219#else
4220    TCG_REG_EBP, /* Currently used for the global env. */
4221    TCG_REG_EBX,
4222    TCG_REG_ESI,
4223    TCG_REG_EDI,
4224#endif
4225};
4226
4227/* Compute frame size via macros, to share between tcg_target_qemu_prologue
4228   and tcg_register_jit.  */
4229
4230#define PUSH_SIZE \
4231    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
4232     * (TCG_TARGET_REG_BITS / 8))
4233
4234#define FRAME_SIZE \
4235    ((PUSH_SIZE \
4236      + TCG_STATIC_CALL_ARGS_SIZE \
4237      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
4238      + TCG_TARGET_STACK_ALIGN - 1) \
4239     & ~(TCG_TARGET_STACK_ALIGN - 1))
4240
4241/* Generate global QEMU prologue and epilogue code */
4242static void tcg_target_qemu_prologue(TCGContext *s)
4243{
4244    int i, stack_addend;
4245
4246    /* TB prologue */
4247
4248    /* Reserve some stack space, also for TCG temps.  */
4249    stack_addend = FRAME_SIZE - PUSH_SIZE;
4250    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4251                  CPU_TEMP_BUF_NLONGS * sizeof(long));
4252
4253    /* Save all callee saved registers.  */
4254    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4255        tcg_out_push(s, tcg_target_callee_save_regs[i]);
4256    }
4257
4258    if (!tcg_use_softmmu && guest_base) {
4259        int seg = setup_guest_base_seg();
4260        if (seg != 0) {
4261            x86_guest_base.seg = seg;
4262        } else if (guest_base == (int32_t)guest_base) {
4263            x86_guest_base.ofs = guest_base;
4264        } else {
4265            assert(TCG_TARGET_REG_BITS == 64);
4266            /* Choose R12 because, as a base, it requires a SIB byte. */
4267            x86_guest_base.index = TCG_REG_R12;
4268            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
4269            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4270        }
4271    }
4272
4273    if (TCG_TARGET_REG_BITS == 32) {
4274        tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4275                   (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4276        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4277        /* jmp *tb.  */
4278        tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4279                             (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4280                             + stack_addend);
4281    } else {
4282        tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4283        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4284        /* jmp *tb.  */
4285        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4286    }
4287
4288    /*
4289     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4290     * and fall through to the rest of the epilogue.
4291     */
4292    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4293    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4294
4295    /* TB epilogue */
4296    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4297
4298    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4299
4300    if (have_avx2) {
4301        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4302    }
4303    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4304        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4305    }
4306    tcg_out_opc(s, OPC_RET, 0, 0, 0);
4307}
4308
4309static void tcg_out_tb_start(TCGContext *s)
4310{
4311    /* nothing to do */
4312}
4313
4314static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4315{
4316    memset(p, 0x90, count);
4317}
4318
4319static void tcg_target_init(TCGContext *s)
4320{
4321    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4322    if (TCG_TARGET_REG_BITS == 64) {
4323        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4324    }
4325    if (have_avx1) {
4326        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4327        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4328    }
4329    if (have_avx2) {
4330        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4331    }
4332
4333    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4334    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4335    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4336    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4337    if (TCG_TARGET_REG_BITS == 64) {
4338#if !defined(_WIN64)
4339        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4340        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4341#endif
4342        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4343        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4344        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4345        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4346    }
4347
4348    s->reserved_regs = 0;
4349    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4350    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4351#ifdef _WIN64
4352    /* These are call saved, and we don't save them, so don't use them. */
4353    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4354    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4355    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4356    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4357    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4358    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4359    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4360    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4361    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4362    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4363#endif
4364}
4365
4366typedef struct {
4367    DebugFrameHeader h;
4368    uint8_t fde_def_cfa[4];
4369    uint8_t fde_reg_ofs[14];
4370} DebugFrame;
4371
4372/* We're expecting a 2 byte uleb128 encoded value.  */
4373QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4374
4375#if !defined(__ELF__)
4376    /* Host machine without ELF. */
4377#elif TCG_TARGET_REG_BITS == 64
4378#define ELF_HOST_MACHINE EM_X86_64
4379static const DebugFrame debug_frame = {
4380    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4381    .h.cie.id = -1,
4382    .h.cie.version = 1,
4383    .h.cie.code_align = 1,
4384    .h.cie.data_align = 0x78,             /* sleb128 -8 */
4385    .h.cie.return_column = 16,
4386
4387    /* Total FDE size does not include the "len" member.  */
4388    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4389
4390    .fde_def_cfa = {
4391        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
4392        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4393        (FRAME_SIZE >> 7)
4394    },
4395    .fde_reg_ofs = {
4396        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
4397        /* The following ordering must match tcg_target_callee_save_regs.  */
4398        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
4399        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
4400        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
4401        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
4402        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
4403        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
4404    }
4405};
4406#else
4407#define ELF_HOST_MACHINE EM_386
4408static const DebugFrame debug_frame = {
4409    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4410    .h.cie.id = -1,
4411    .h.cie.version = 1,
4412    .h.cie.code_align = 1,
4413    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
4414    .h.cie.return_column = 8,
4415
4416    /* Total FDE size does not include the "len" member.  */
4417    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4418
4419    .fde_def_cfa = {
4420        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
4421        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4422        (FRAME_SIZE >> 7)
4423    },
4424    .fde_reg_ofs = {
4425        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4426        /* The following ordering must match tcg_target_callee_save_regs.  */
4427        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4428        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4429        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4430        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
4431    }
4432};
4433#endif
4434
4435#if defined(ELF_HOST_MACHINE)
4436void tcg_register_jit(const void *buf, size_t buf_size)
4437{
4438    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
4439}
4440#endif
4441