xref: /openbmc/qemu/tcg/i386/tcg-target.c.inc (revision 21e9a8aefb0313174c1861df84e5e49bd84026c8)
1/*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#include "../tcg-ldst.c.inc"
26#include "../tcg-pool.c.inc"
27
28#ifdef CONFIG_DEBUG_TCG
29static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
30#if TCG_TARGET_REG_BITS == 64
31    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
32#else
33    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
34#endif
35    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
36    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
37#if TCG_TARGET_REG_BITS == 64
38    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
39    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
40#endif
41};
42#endif
43
44static const int tcg_target_reg_alloc_order[] = {
45#if TCG_TARGET_REG_BITS == 64
46    TCG_REG_RBP,
47    TCG_REG_RBX,
48    TCG_REG_R12,
49    TCG_REG_R13,
50    TCG_REG_R14,
51    TCG_REG_R15,
52    TCG_REG_R10,
53    TCG_REG_R11,
54    TCG_REG_R9,
55    TCG_REG_R8,
56    TCG_REG_RCX,
57    TCG_REG_RDX,
58    TCG_REG_RSI,
59    TCG_REG_RDI,
60    TCG_REG_RAX,
61#else
62    TCG_REG_EBX,
63    TCG_REG_ESI,
64    TCG_REG_EDI,
65    TCG_REG_EBP,
66    TCG_REG_ECX,
67    TCG_REG_EDX,
68    TCG_REG_EAX,
69#endif
70    TCG_REG_XMM0,
71    TCG_REG_XMM1,
72    TCG_REG_XMM2,
73    TCG_REG_XMM3,
74    TCG_REG_XMM4,
75    TCG_REG_XMM5,
76#ifndef _WIN64
77    /* The Win64 ABI has xmm6-xmm15 as callee-saved, and we do not save
78       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
79    TCG_REG_XMM6,
80    TCG_REG_XMM7,
81#if TCG_TARGET_REG_BITS == 64
82    TCG_REG_XMM8,
83    TCG_REG_XMM9,
84    TCG_REG_XMM10,
85    TCG_REG_XMM11,
86    TCG_REG_XMM12,
87    TCG_REG_XMM13,
88    TCG_REG_XMM14,
89    TCG_REG_XMM15,
90#endif
91#endif
92};
93
94#define TCG_TMP_VEC  TCG_REG_XMM5
95
96static const int tcg_target_call_iarg_regs[] = {
97#if TCG_TARGET_REG_BITS == 64
98#if defined(_WIN64)
99    TCG_REG_RCX,
100    TCG_REG_RDX,
101#else
102    TCG_REG_RDI,
103    TCG_REG_RSI,
104    TCG_REG_RDX,
105    TCG_REG_RCX,
106#endif
107    TCG_REG_R8,
108    TCG_REG_R9,
109#else
110    /* 32-bit mode uses a stack-based calling convention (GCC default). */
111#endif
112};
113
114static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
115{
116    switch (kind) {
117    case TCG_CALL_RET_NORMAL:
118        tcg_debug_assert(slot >= 0 && slot <= 1);
119        return slot ? TCG_REG_EDX : TCG_REG_EAX;
120#ifdef _WIN64
121    case TCG_CALL_RET_BY_VEC:
122        tcg_debug_assert(slot == 0);
123        return TCG_REG_XMM0;
124#endif
125    default:
126        g_assert_not_reached();
127    }
128}
129
130/* Constants we accept.  */
131#define TCG_CT_CONST_S32 0x100
132#define TCG_CT_CONST_U32 0x200
133#define TCG_CT_CONST_I32 0x400
134#define TCG_CT_CONST_WSZ 0x800
135
136/* Registers used with the L constraint, which are the first two argument
137   registers on x86_64, and two arbitrary call-clobbered registers on
138   i386. */
139#if TCG_TARGET_REG_BITS == 64
140# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
141# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
142#else
143# define TCG_REG_L0 TCG_REG_EAX
144# define TCG_REG_L1 TCG_REG_EDX
145#endif
146
147#if TCG_TARGET_REG_BITS == 64
148# define ALL_GENERAL_REGS      0x0000ffffu
149# define ALL_VECTOR_REGS       0xffff0000u
150# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
151#else
152# define ALL_GENERAL_REGS      0x000000ffu
153# define ALL_VECTOR_REGS       0x00ff0000u
154# define ALL_BYTEL_REGS        0x0000000fu
155#endif
156#define SOFTMMU_RESERVE_REGS \
157    (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)
158
159/* For 64-bit, we always know that CMOV is available.  */
160#if TCG_TARGET_REG_BITS == 64
161# define have_cmov      true
162#else
163# define have_cmov      (cpuinfo & CPUINFO_CMOV)
164#endif
165#define have_bmi2       (cpuinfo & CPUINFO_BMI2)
166#define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)
167
168static const tcg_insn_unit *tb_ret_addr;
169
170static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
171                        intptr_t value, intptr_t addend)
172{
173    value += addend;
174    switch(type) {
175    case R_386_PC32:
176        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
177        if (value != (int32_t)value) {
178            return false;
179        }
180        /* FALLTHRU */
181    case R_386_32:
182        tcg_patch32(code_ptr, value);
183        break;
184    case R_386_PC8:
185        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
186        if (value != (int8_t)value) {
187            return false;
188        }
189        tcg_patch8(code_ptr, value);
190        break;
191    default:
192        g_assert_not_reached();
193    }
194    return true;
195}
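
/*
 * Worked example (illustrative): tcg_out_jxx() below records the 4-byte
 * displacement field of a long branch with tcg_out_reloc(..., R_386_PC32,
 * l, -4).  Once the label is resolved, value becomes
 * target + (-4) - tcg_splitwx_to_rx(code_ptr), i.e. the displacement
 * relative to the end of the 4-byte field (the next instruction), which
 * is exactly what the jmp/jcc rel32 forms expect before tcg_patch32()
 * stores it.
 */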
196
197/* test if a constant matches the constraint */
198static bool tcg_target_const_match(int64_t val, int ct,
199                                   TCGType type, TCGCond cond, int vece)
200{
201    if (ct & TCG_CT_CONST) {
202        return 1;
203    }
204    if (type == TCG_TYPE_I32) {
205        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
206            return 1;
207        }
208    } else {
209        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
210            return 1;
211        }
212        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
213            return 1;
214        }
215        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
216            return 1;
217        }
218    }
219    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
220        return 1;
221    }
222    return 0;
223}
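
/*
 * For example (illustrative): with type == TCG_TYPE_I64, the value
 * 0xffffffff80000000 satisfies TCG_CT_CONST_S32 (it is the sign
 * extension of a 32-bit value) but not TCG_CT_CONST_U32, while
 * 0x0000000080000000 satisfies TCG_CT_CONST_U32 but not
 * TCG_CT_CONST_S32.  TCG_CT_CONST_I32 accepts values whose bitwise
 * inverse fits in a sign-extended 32-bit immediate.
 */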
224
225# define LOWREGMASK(x)	((x) & 7)
226
227#define P_EXT		0x100		/* 0x0f opcode prefix */
228#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
229#define P_DATA16        0x400           /* 0x66 opcode prefix */
230#define P_VEXW          0x1000          /* Set VEX.W = 1 */
231#if TCG_TARGET_REG_BITS == 64
232# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
233# define P_REXB_R       0x2000          /* REG field as byte register */
234# define P_REXB_RM      0x4000          /* R/M field as byte register */
235# define P_GS           0x8000          /* gs segment override */
236#else
237# define P_REXW		0
238# define P_REXB_R	0
239# define P_REXB_RM	0
240# define P_GS           0
241#endif
242#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
243#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
244#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
245#define P_VEXL          0x80000         /* Set VEX.L = 1 */
246#define P_EVEX          0x100000        /* Requires EVEX encoding */
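
/*
 * Illustrative note: the P_* flags occupy bits above the low opcode byte,
 * so an entry below such as (0x3d | P_EXT38 | P_DATA16) records that the
 * low byte 0x3d must be preceded by a 0x66 prefix and the 0x0f 0x38
 * escape, i.e. the legacy encoding 66 0f 38 3d (pmaxsd).
 * tcg_out_opc()/tcg_out_vex_opc() translate the flags into prefix bytes,
 * and only the low 8 bits are emitted as the opcode proper.
 */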
247
248#define OPC_ARITH_EbIb	(0x80)
249#define OPC_ARITH_EvIz	(0x81)
250#define OPC_ARITH_EvIb	(0x83)
251#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
252#define OPC_ANDN        (0xf2 | P_EXT38)
253#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
254#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
255#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
256#define OPC_BSF         (0xbc | P_EXT)
257#define OPC_BSR         (0xbd | P_EXT)
258#define OPC_BSWAP	(0xc8 | P_EXT)
259#define OPC_CALL_Jz	(0xe8)
260#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
261#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
262#define OPC_DEC_r32	(0x48)
263#define OPC_IMUL_GvEv	(0xaf | P_EXT)
264#define OPC_IMUL_GvEvIb	(0x6b)
265#define OPC_IMUL_GvEvIz	(0x69)
266#define OPC_INC_r32	(0x40)
267#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
268#define OPC_JCC_short	(0x70)		/* ... plus condition code */
269#define OPC_JMP_long	(0xe9)
270#define OPC_JMP_short	(0xeb)
271#define OPC_LEA         (0x8d)
272#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
273#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
274#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
275#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
276#define OPC_MOVB_EvIz   (0xc6)
277#define OPC_MOVL_EvIz	(0xc7)
278#define OPC_MOVB_Ib     (0xb0)
279#define OPC_MOVL_Iv     (0xb8)
280#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
281#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
282#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
283#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
284#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
285#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
286#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
287#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
288#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
289#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
290#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
291#define OPC_MOVSBL	(0xbe | P_EXT)
292#define OPC_MOVSWL	(0xbf | P_EXT)
293#define OPC_MOVSLQ	(0x63 | P_REXW)
294#define OPC_MOVZBL	(0xb6 | P_EXT)
295#define OPC_MOVZWL	(0xb7 | P_EXT)
296#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
297#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
298#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
299#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
300#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
301#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
302#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
303#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
304#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
305#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
306#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
307#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
308#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
309#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
310#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
311#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
312#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
313#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
314#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
315#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
316#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
317#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
318#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
319#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
320#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
321#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
322#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
323#define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
324#define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
325#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
326#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
327#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
328#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
329#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
330#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
331#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
332#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
333#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
334#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
335#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
336#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
337#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
338#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
339#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
340#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
341#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
342#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
343#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
344#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
345#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
346#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
347#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
348#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
349#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
350#define OPC_POR         (0xeb | P_EXT | P_DATA16)
351#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
352#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
353#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
354#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
355#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
356#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
357#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
358#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
359#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
360#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
361#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
362#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
363#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
364#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
365#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
366#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
367#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
368#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
369#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
370#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
371#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
372#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
373#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
374#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
375#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
376#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
377#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
378#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
379#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
380#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
381#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
382#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
383#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
384#define OPC_POP_r32	(0x58)
385#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
386#define OPC_PUSH_r32	(0x50)
387#define OPC_PUSH_Iv	(0x68)
388#define OPC_PUSH_Ib	(0x6a)
389#define OPC_RET		(0xc3)
390#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
391#define OPC_SHIFT_1	(0xd1)
392#define OPC_SHIFT_Ib	(0xc1)
393#define OPC_SHIFT_cl	(0xd3)
394#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
395#define OPC_SHUFPS      (0xc6 | P_EXT)
396#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
397#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
398#define OPC_SHRD_Ib     (0xac | P_EXT)
399#define OPC_TESTL	(0x85)
400#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
401#define OPC_UD2         (0x0b | P_EXT)
402#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
403#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
404#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
405#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
406#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
407#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
408#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
409#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
410#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
411#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
412#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
413#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
414#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
415#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
416#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
417#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
418#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
419#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
420#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
421#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
422#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
423#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
424#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
425#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
426#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
427#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
428#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
429#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
430#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
431#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
432#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
433#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
434#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
435#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
436#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
437#define OPC_VZEROUPPER  (0x77 | P_EXT)
438#define OPC_XCHG_ax_r32	(0x90)
439#define OPC_XCHG_EvGv   (0x87)
440
441#define OPC_GRP3_Eb     (0xf6)
442#define OPC_GRP3_Ev     (0xf7)
443#define OPC_GRP5        (0xff)
444#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
445
446/* Group 1 opcode extensions for 0x80-0x83.
447   These are also used as modifiers for OPC_ARITH.  */
448#define ARITH_ADD 0
449#define ARITH_OR  1
450#define ARITH_ADC 2
451#define ARITH_SBB 3
452#define ARITH_AND 4
453#define ARITH_SUB 5
454#define ARITH_XOR 6
455#define ARITH_CMP 7
456
457/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
458#define SHIFT_ROL 0
459#define SHIFT_ROR 1
460#define SHIFT_SHL 4
461#define SHIFT_SHR 5
462#define SHIFT_SAR 7
463
464/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
465#define EXT3_TESTi 0
466#define EXT3_NOT   2
467#define EXT3_NEG   3
468#define EXT3_MUL   4
469#define EXT3_IMUL  5
470#define EXT3_DIV   6
471#define EXT3_IDIV  7
472
473/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
474#define EXT5_INC_Ev	0
475#define EXT5_DEC_Ev	1
476#define EXT5_CALLN_Ev	2
477#define EXT5_JMPN_Ev	4
478
479/* Condition codes to be added to OPC_JCC_{long,short}.  */
480#define JCC_JMP (-1)
481#define JCC_JO  0x0
482#define JCC_JNO 0x1
483#define JCC_JB  0x2
484#define JCC_JAE 0x3
485#define JCC_JE  0x4
486#define JCC_JNE 0x5
487#define JCC_JBE 0x6
488#define JCC_JA  0x7
489#define JCC_JS  0x8
490#define JCC_JNS 0x9
491#define JCC_JP  0xa
492#define JCC_JNP 0xb
493#define JCC_JL  0xc
494#define JCC_JGE 0xd
495#define JCC_JLE 0xe
496#define JCC_JG  0xf
497
498static const uint8_t tcg_cond_to_jcc[] = {
499    [TCG_COND_EQ] = JCC_JE,
500    [TCG_COND_NE] = JCC_JNE,
501    [TCG_COND_LT] = JCC_JL,
502    [TCG_COND_GE] = JCC_JGE,
503    [TCG_COND_LE] = JCC_JLE,
504    [TCG_COND_GT] = JCC_JG,
505    [TCG_COND_LTU] = JCC_JB,
506    [TCG_COND_GEU] = JCC_JAE,
507    [TCG_COND_LEU] = JCC_JBE,
508    [TCG_COND_GTU] = JCC_JA,
509};
510
511#if TCG_TARGET_REG_BITS == 64
512static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
513{
514    int rex;
515
516    if (opc & P_GS) {
517        tcg_out8(s, 0x65);
518    }
519    if (opc & P_DATA16) {
520        /* We should never be asking for both 16 and 64-bit operation.  */
521        tcg_debug_assert((opc & P_REXW) == 0);
522        tcg_out8(s, 0x66);
523    }
524    if (opc & P_SIMDF3) {
525        tcg_out8(s, 0xf3);
526    } else if (opc & P_SIMDF2) {
527        tcg_out8(s, 0xf2);
528    }
529
530    rex = 0;
531    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
532    rex |= (r & 8) >> 1;                /* REX.R */
533    rex |= (x & 8) >> 2;                /* REX.X */
534    rex |= (rm & 8) >> 3;               /* REX.B */
535
536    /* P_REXB_{R,RM} indicates that the given register is the low byte.
537       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
538       as otherwise the encoding indicates %[abcd]h.  Note that the values
539       that are ORed in merely indicate that the REX byte must be present;
540       those bits get discarded in output.  */
541    rex |= opc & (r >= 4 ? P_REXB_R : 0);
542    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
543
544    if (rex) {
545        tcg_out8(s, (uint8_t)(rex | 0x40));
546    }
547
548    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
549        tcg_out8(s, 0x0f);
550        if (opc & P_EXT38) {
551            tcg_out8(s, 0x38);
552        } else if (opc & P_EXT3A) {
553            tcg_out8(s, 0x3a);
554        }
555    }
556
557    tcg_out8(s, opc);
558}
559#else
560static void tcg_out_opc(TCGContext *s, int opc)
561{
562    if (opc & P_DATA16) {
563        tcg_out8(s, 0x66);
564    }
565    if (opc & P_SIMDF3) {
566        tcg_out8(s, 0xf3);
567    } else if (opc & P_SIMDF2) {
568        tcg_out8(s, 0xf2);
569    }
570    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
571        tcg_out8(s, 0x0f);
572        if (opc & P_EXT38) {
573            tcg_out8(s, 0x38);
574        } else if (opc & P_EXT3A) {
575            tcg_out8(s, 0x3a);
576        }
577    }
578    tcg_out8(s, opc);
579}
580/* Discard the register arguments to tcg_out_opc early, so as not to penalize
581   the 32-bit compilation paths.  This works with all versions of gcc,
582   whereas relying on the optimizer to remove the dead arguments may not.  */
583#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
584#endif
585
586static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
587{
588    tcg_out_opc(s, opc, r, rm, 0);
589    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
590}
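
/*
 * Worked example (illustrative): on x86-64,
 * tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, TCG_REG_EAX, TCG_REG_ESI)
 * emits 40 0f b6 c6, i.e. "movzbl %sil, %eax".  The otherwise-empty REX
 * prefix (0x40) is forced by P_REXB_RM because %sil is only addressable
 * as a byte register when a REX prefix is present; 0x0f comes from
 * P_EXT, 0xb6 is the opcode, and 0xc6 is the ModRM byte
 * (mod=3, reg=%eax, rm=%esi).
 */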
591
592static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
593                            int rm, int index)
594{
595    int tmp;
596
597    if (opc & P_GS) {
598        tcg_out8(s, 0x65);
599    }
600    /* Use the two byte form if possible, which cannot encode
601       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
602    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
603        && ((rm | index) & 8) == 0) {
604        /* Two byte VEX prefix.  */
605        tcg_out8(s, 0xc5);
606
607        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
608    } else {
609        /* Three byte VEX prefix.  */
610        tcg_out8(s, 0xc4);
611
612        /* VEX.m-mmmm */
613        if (opc & P_EXT3A) {
614            tmp = 3;
615        } else if (opc & P_EXT38) {
616            tmp = 2;
617        } else if (opc & P_EXT) {
618            tmp = 1;
619        } else {
620            g_assert_not_reached();
621        }
622        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
623        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
624        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
625        tcg_out8(s, tmp);
626
627        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
628    }
629
630    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
631    /* VEX.pp */
632    if (opc & P_DATA16) {
633        tmp |= 1;                          /* 0x66 */
634    } else if (opc & P_SIMDF3) {
635        tmp |= 2;                          /* 0xf3 */
636    } else if (opc & P_SIMDF2) {
637        tmp |= 3;                          /* 0xf2 */
638    }
639    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
640    tcg_out8(s, tmp);
641    tcg_out8(s, opc);
642}
643
644static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
645                             int rm, int index)
646{
647    /* The entire 4-byte evex prefix; with R' and V' set. */
648    uint32_t p = 0x08041062;
649    int mm, pp;
650
651    tcg_debug_assert(have_avx512vl);
652
653    /* EVEX.mm */
654    if (opc & P_EXT3A) {
655        mm = 3;
656    } else if (opc & P_EXT38) {
657        mm = 2;
658    } else if (opc & P_EXT) {
659        mm = 1;
660    } else {
661        g_assert_not_reached();
662    }
663
664    /* EVEX.pp */
665    if (opc & P_DATA16) {
666        pp = 1;                          /* 0x66 */
667    } else if (opc & P_SIMDF3) {
668        pp = 2;                          /* 0xf3 */
669    } else if (opc & P_SIMDF2) {
670        pp = 3;                          /* 0xf2 */
671    } else {
672        pp = 0;
673    }
674
675    p = deposit32(p, 8, 2, mm);
676    p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
677    p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
678    p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
679    p = deposit32(p, 16, 2, pp);
680    p = deposit32(p, 19, 4, ~v);
681    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
682    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
683
684    tcg_out32(s, p);
685    tcg_out8(s, opc);
686}
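
/*
 * Decoding the magic constant above (illustrative): 0x08041062 is the
 * 4-byte EVEX prefix stored little-endian, i.e. the bytes 62 10 04 08.
 * 0x62 is the EVEX escape byte; 0x10 pre-sets the inverted R' bit in P0;
 * 0x04 is the always-one bit in P1; 0x08 pre-sets the inverted V' bit
 * in P2.  The deposit32() calls then fill in mm, the inverted R/X/B
 * bits, pp, the inverted vvvv field, W and L'L before the four bytes
 * are written with tcg_out32().
 */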
687
688static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
689{
690    if (opc & P_EVEX) {
691        tcg_out_evex_opc(s, opc, r, v, rm, 0);
692    } else {
693        tcg_out_vex_opc(s, opc, r, v, rm, 0);
694    }
695    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
696}
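
/*
 * Worked example (illustrative):
 * tcg_out_vex_modrm(s, OPC_PXOR, TCG_REG_XMM0, TCG_REG_XMM0, TCG_REG_XMM0)
 * emits c5 f9 ef c0, i.e. "vpxor %xmm0, %xmm0, %xmm0".  The two-byte VEX
 * form applies because OPC_PXOR only needs the 0x0f escape (P_EXT) and no
 * high registers are involved: 0xc5, then 0xf9 = ~R | ~vvvv(=xmm0) | pp(66),
 * then the opcode 0xef and the ModRM byte 0xc0.
 */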
697
698/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
699   Either RM or INDEX may be omitted by passing a negative value.  In 64-bit
700   mode for absolute addresses, ~RM is the size of the immediate operand
701   that will follow the instruction.  */
702
703static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
704                               int shift, intptr_t offset)
705{
706    int mod, len;
707
708    if (index < 0 && rm < 0) {
709        if (TCG_TARGET_REG_BITS == 64) {
710            /* Try for a rip-relative addressing mode.  This has replaced
711               the 32-bit-mode absolute addressing encoding.  */
712            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
713            intptr_t disp = offset - pc;
714            if (disp == (int32_t)disp) {
715                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
716                tcg_out32(s, disp);
717                return;
718            }
719
720            /* Try for an absolute address encoding.  This requires the
721               use of the MODRM+SIB encoding and is therefore larger than
722               rip-relative addressing.  */
723            if (offset == (int32_t)offset) {
724                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
725                tcg_out8(s, (4 << 3) | 5);
726                tcg_out32(s, offset);
727                return;
728            }
729
730            /* ??? The memory isn't directly addressable.  */
731            g_assert_not_reached();
732        } else {
733            /* Absolute address.  */
734            tcg_out8(s, (r << 3) | 5);
735            tcg_out32(s, offset);
736            return;
737        }
738    }
739
740    /* Find the length of the immediate addend.  Note that the encoding
741       that would be used for (%ebp) indicates absolute addressing.  */
742    if (rm < 0) {
743        mod = 0, len = 4, rm = 5;
744    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
745        mod = 0, len = 0;
746    } else if (offset == (int8_t)offset) {
747        mod = 0x40, len = 1;
748    } else {
749        mod = 0x80, len = 4;
750    }
751
752    /* Use a single byte MODRM format if possible.  Note that the encoding
753       that would be used for %esp is the escape to the two byte form.  */
754    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
755        /* Single byte MODRM format.  */
756        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
757    } else {
758        /* Two byte MODRM+SIB format.  */
759
760        /* Note that the encoding that would place %esp into the index
761           field indicates no index register.  In 64-bit mode, the REX.X
762           bit counts, so %r12 can be used as the index.  */
763        if (index < 0) {
764            index = 4;
765        } else {
766            tcg_debug_assert(index != TCG_REG_ESP);
767        }
768
769        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
770        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
771    }
772
773    if (len == 1) {
774        tcg_out8(s, offset);
775    } else if (len == 4) {
776        tcg_out32(s, offset);
777    }
778}
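
/*
 * Worked example (illustrative):
 * tcg_out_modrm_offset(s, OPC_MOVL_GvEv, TCG_REG_EAX, TCG_REG_EBP, 8)
 * takes the "mod=1, disp8" path above (no index register, offset fits
 * in a byte) and produces 8b 45 08, i.e. "movl 8(%ebp), %eax" (or
 * 8(%rbp) on x86-64).  With TCG_REG_ESP as the base, the same call
 * would need the two-byte ModRM+SIB form even without an index.
 */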
779
780static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
781                                     int index, int shift, intptr_t offset)
782{
783    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
784    tcg_out_sib_offset(s, r, rm, index, shift, offset);
785}
786
787static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
788                                         int rm, int index, int shift,
789                                         intptr_t offset)
790{
791    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
792    tcg_out_sib_offset(s, r, rm, index, shift, offset);
793}
794
795/* A simplification of the above with no index or shift.  */
796static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
797                                        int rm, intptr_t offset)
798{
799    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
800}
801
802static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
803                                            int v, int rm, intptr_t offset)
804{
805    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
806}
807
808/* Output an opcode with an expected reference to the constant pool.  */
809static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
810{
811    tcg_out_opc(s, opc, r, 0, 0);
812    /* Absolute for 32-bit, pc-relative for 64-bit.  */
813    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
814    tcg_out32(s, 0);
815}
816
817/* Output an opcode with an expected reference to the constant pool.  */
818static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
819{
820    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
821    /* Absolute for 32-bit, pc-relative for 64-bit.  */
822    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
823    tcg_out32(s, 0);
824}
825
826/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
827static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
828{
829    /* Propagate an opcode prefix, such as P_REXW.  */
830    int ext = subop & ~0x7;
831    subop &= 0x7;
832
833    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
834}
835
836static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
837{
838    int rexw = 0;
839
840    if (arg == ret) {
841        return true;
842    }
843    switch (type) {
844    case TCG_TYPE_I64:
845        rexw = P_REXW;
846        /* fallthru */
847    case TCG_TYPE_I32:
848        if (ret < 16) {
849            if (arg < 16) {
850                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
851            } else {
852                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
853            }
854        } else {
855            if (arg < 16) {
856                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
857            } else {
858                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
859            }
860        }
861        break;
862
863    case TCG_TYPE_V64:
864        tcg_debug_assert(ret >= 16 && arg >= 16);
865        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
866        break;
867    case TCG_TYPE_V128:
868        tcg_debug_assert(ret >= 16 && arg >= 16);
869        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
870        break;
871    case TCG_TYPE_V256:
872        tcg_debug_assert(ret >= 16 && arg >= 16);
873        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
874        break;
875
876    default:
877        g_assert_not_reached();
878    }
879    return true;
880}
881
882static const int avx2_dup_insn[4] = {
883    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
884    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
885};
886
887static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
888                            TCGReg r, TCGReg a)
889{
890    if (have_avx2) {
891        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
892        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
893    } else {
894        switch (vece) {
895        case MO_8:
896            /* ??? With zero in a register, use PSHUFB.  */
897            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
898            a = r;
899            /* FALLTHRU */
900        case MO_16:
901            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
902            a = r;
903            /* FALLTHRU */
904        case MO_32:
905            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
906            /* imm8 operand: all output lanes selected from input lane 0.  */
907            tcg_out8(s, 0);
908            break;
909        case MO_64:
910            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
911            break;
912        default:
913            g_assert_not_reached();
914        }
915    }
916    return true;
917}
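
/*
 * Restating the non-AVX2 fallthrough chain above (illustrative): for
 * MO_8 the element is first duplicated into 16-bit lanes (punpcklbw),
 * then into 32-bit lanes (punpcklwd), and finally pshufd with an
 * immediate of 0 replicates dword 0 across the whole vector; MO_64
 * needs only punpcklqdq.
 */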
918
919static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
920                             TCGReg r, TCGReg base, intptr_t offset)
921{
922    if (have_avx2) {
923        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
924        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
925                                 r, 0, base, offset);
926    } else {
927        switch (vece) {
928        case MO_64:
929            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
930            break;
931        case MO_32:
932            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
933            break;
934        case MO_16:
935            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
936            tcg_out8(s, 0); /* imm8 */
937            tcg_out_dup_vec(s, type, vece, r, r);
938            break;
939        case MO_8:
940            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
941            tcg_out8(s, 0); /* imm8 */
942            tcg_out_dup_vec(s, type, vece, r, r);
943            break;
944        default:
945            g_assert_not_reached();
946        }
947    }
948    return true;
949}
950
951static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
952                             TCGReg ret, int64_t arg)
953{
954    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
955
956    if (arg == 0) {
957        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
958        return;
959    }
960    if (arg == -1) {
961        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
962        return;
963    }
964
965    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
966        if (have_avx2) {
967            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
968        } else {
969            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
970        }
971        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
972    } else {
973        if (type == TCG_TYPE_V64) {
974            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
975        } else if (have_avx2) {
976            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
977        } else {
978            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
979        }
980        if (TCG_TARGET_REG_BITS == 64) {
981            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
982        } else {
983            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
984        }
985    }
986}
987
988static void tcg_out_movi_vec(TCGContext *s, TCGType type,
989                             TCGReg ret, tcg_target_long arg)
990{
991    if (arg == 0) {
992        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
993        return;
994    }
995    if (arg == -1) {
996        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
997        return;
998    }
999
1000    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
1001    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1002    if (TCG_TARGET_REG_BITS == 64) {
1003        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1004    } else {
1005        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1006    }
1007}
1008
1009static void tcg_out_movi_int(TCGContext *s, TCGType type,
1010                             TCGReg ret, tcg_target_long arg)
1011{
1012    tcg_target_long diff;
1013
1014    if (arg == 0) {
1015        tgen_arithr(s, ARITH_XOR, ret, ret);
1016        return;
1017    }
1018    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1019        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1020        tcg_out32(s, arg);
1021        return;
1022    }
1023    if (arg == (int32_t)arg) {
1024        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1025        tcg_out32(s, arg);
1026        return;
1027    }
1028
1029    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1030    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1031    if (diff == (int32_t)diff) {
1032        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1033        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1034        tcg_out32(s, diff);
1035        return;
1036    }
1037
1038    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1039    tcg_out64(s, arg);
1040}
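
/*
 * Size comparison for the cases above (illustrative): a value whose
 * upper 32 bits are clear (or any I32 move) uses the 5-byte "b8+r imm32"
 * form; a value that sign-extends from 32 bits uses the 7-byte
 * "48 c7 /0 imm32" form; a value within +/-2GB of the code uses the
 * 7-byte rip-relative LEA; anything else falls back to the 10-byte
 * "48 b8+r imm64" movabs.
 */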
1041
1042static void tcg_out_movi(TCGContext *s, TCGType type,
1043                         TCGReg ret, tcg_target_long arg)
1044{
1045    switch (type) {
1046    case TCG_TYPE_I32:
1047#if TCG_TARGET_REG_BITS == 64
1048    case TCG_TYPE_I64:
1049#endif
1050        if (ret < 16) {
1051            tcg_out_movi_int(s, type, ret, arg);
1052        } else {
1053            tcg_out_movi_vec(s, type, ret, arg);
1054        }
1055        break;
1056    default:
1057        g_assert_not_reached();
1058    }
1059}
1060
1061static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1062{
1063    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1064    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1065    return true;
1066}
1067
1068static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1069                             tcg_target_long imm)
1070{
1071    /* This function is only used for passing structs by reference. */
1072    tcg_debug_assert(imm == (int32_t)imm);
1073    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1074}
1075
1076static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1077{
1078    if (val == (int8_t)val) {
1079        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1080        tcg_out8(s, val);
1081    } else if (val == (int32_t)val) {
1082        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1083        tcg_out32(s, val);
1084    } else {
1085        g_assert_not_reached();
1086    }
1087}
1088
1089static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1090{
1091    /* Given the strength of x86 memory ordering, we only need to care about
1092       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1093       faster than "mfence", so don't bother with the sse insn.  */
1094    if (a0 & TCG_MO_ST_LD) {
1095        tcg_out8(s, 0xf0);
1096        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1097        tcg_out8(s, 0);
1098    }
1099}
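
/*
 * Illustrative decoding: the sequence emitted above for TCG_MO_ST_LD is
 * f0 83 0c 24 00, i.e. "lock orl $0, (%esp)" -- a locked read-modify-write
 * of the top of the stack, which acts as a full barrier like MFENCE but,
 * as noted above, is cheaper in practice.
 */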
1100
1101static inline void tcg_out_push(TCGContext *s, int reg)
1102{
1103    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1104}
1105
1106static inline void tcg_out_pop(TCGContext *s, int reg)
1107{
1108    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1109}
1110
1111static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1112                       TCGReg arg1, intptr_t arg2)
1113{
1114    switch (type) {
1115    case TCG_TYPE_I32:
1116        if (ret < 16) {
1117            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1118        } else {
1119            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1120        }
1121        break;
1122    case TCG_TYPE_I64:
1123        if (ret < 16) {
1124            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1125            break;
1126        }
1127        /* FALLTHRU */
1128    case TCG_TYPE_V64:
1129        /* There is no instruction that can validate 8-byte alignment.  */
1130        tcg_debug_assert(ret >= 16);
1131        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1132        break;
1133    case TCG_TYPE_V128:
1134        /*
1135         * The gvec infrastructure asserts that v128 vector loads
1136         * and stores use a 16-byte aligned offset.  Validate that the
1137         * final pointer is aligned by using an insn that will SIGSEGV.
1138         */
1139        tcg_debug_assert(ret >= 16);
1140        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1141        break;
1142    case TCG_TYPE_V256:
1143        /*
1144         * The gvec infrastructure only requires 16-byte alignment,
1145         * so here we must use an unaligned load.
1146         */
1147        tcg_debug_assert(ret >= 16);
1148        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1149                                 ret, 0, arg1, arg2);
1150        break;
1151    default:
1152        g_assert_not_reached();
1153    }
1154}
1155
1156static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1157                       TCGReg arg1, intptr_t arg2)
1158{
1159    switch (type) {
1160    case TCG_TYPE_I32:
1161        if (arg < 16) {
1162            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1163        } else {
1164            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1165        }
1166        break;
1167    case TCG_TYPE_I64:
1168        if (arg < 16) {
1169            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1170            break;
1171        }
1172        /* FALLTHRU */
1173    case TCG_TYPE_V64:
1174        /* There is no instruction that can validate 8-byte alignment.  */
1175        tcg_debug_assert(arg >= 16);
1176        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1177        break;
1178    case TCG_TYPE_V128:
1179        /*
1180         * The gvec infrastructure asserts that v128 vector loads
1181         * and stores use a 16-byte aligned offset.  Validate that the
1182         * final pointer is aligned by using an insn that will SIGSEGV.
1183         *
1184         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1185         * for _WIN64, which must have SSE2 but may not have AVX.
1186         */
1187        tcg_debug_assert(arg >= 16);
1188        if (have_avx1) {
1189            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1190        } else {
1191            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1192        }
1193        break;
1194    case TCG_TYPE_V256:
1195        /*
1196         * The gvec infrastructure only requires 16-byte alignment,
1197         * so here we must use an unaligned store.
1198         */
1199        tcg_debug_assert(arg >= 16);
1200        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1201                                 arg, 0, arg1, arg2);
1202        break;
1203    default:
1204        g_assert_not_reached();
1205    }
1206}
1207
1208static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1209                        TCGReg base, intptr_t ofs)
1210{
1211    int rexw = 0;
1212    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1213        if (val != (int32_t)val) {
1214            return false;
1215        }
1216        rexw = P_REXW;
1217    } else if (type != TCG_TYPE_I32) {
1218        return false;
1219    }
1220    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1221    tcg_out32(s, val);
1222    return true;
1223}
1224
1225static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1226{
1227    /* Propagate an opcode prefix, such as P_DATA16.  */
1228    int ext = subopc & ~0x7;
1229    subopc &= 0x7;
1230
1231    if (count == 1) {
1232        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1233    } else {
1234        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1235        tcg_out8(s, count);
1236    }
1237}
1238
1239static inline void tcg_out_bswap32(TCGContext *s, int reg)
1240{
1241    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1242}
1243
1244static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1245{
1246    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1247}
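
/*
 * For example (illustrative), tcg_out_rolw_8(s, TCG_REG_EAX) emits
 * 66 c1 c0 08, i.e. "rolw $8, %ax": the P_DATA16 prefix turns the
 * rotate into a 16-bit operation, so the low two bytes of the register
 * are swapped.
 */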
1248
1249static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1250{
1251    /* movzbl */
1252    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1253    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1254}
1255
1256static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1257{
1258    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1259    /* movsbl */
1260    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1261    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1262}
1263
1264static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1265{
1266    /* movzwl */
1267    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1268}
1269
1270static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1271{
1272    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1273    /* movsw[lq] */
1274    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1275}
1276
1277static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1278{
1279    /* 32-bit mov zero extends.  */
1280    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1281}
1282
1283static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1284{
1285    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1286    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1287}
1288
1289static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1290{
1291    tcg_out_ext32s(s, dest, src);
1292}
1293
1294static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1295{
1296    if (dest != src) {
1297        tcg_out_ext32u(s, dest, src);
1298    }
1299}
1300
1301static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1302{
1303    tcg_out_ext32u(s, dest, src);
1304}
1305
1306static inline void tcg_out_bswap64(TCGContext *s, int reg)
1307{
1308    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1309}
1310
1311static void tgen_arithi(TCGContext *s, int c, int r0,
1312                        tcg_target_long val, int cf)
1313{
1314    int rexw = 0;
1315
1316    if (TCG_TARGET_REG_BITS == 64) {
1317        rexw = c & -8;
1318        c &= 7;
1319    }
1320
1321    switch (c) {
1322    case ARITH_ADD:
1323    case ARITH_SUB:
1324        if (!cf) {
1325            /*
1326             * ??? While INC/DEC are 2 bytes shorter than ADDL/SUBL $1, they
1327             * also induce partial flags update stalls on Pentium 4 and are not
1328             * recommended by current Intel optimization manuals.
1329             */
1330            if (val == 1 || val == -1) {
1331                int is_inc = (c == ARITH_ADD) ^ (val < 0);
1332                if (TCG_TARGET_REG_BITS == 64) {
1333                    /*
1334                     * The single-byte increment encodings are re-tasked
1335                     * as the REX prefixes.  Use the MODRM encoding.
1336                     */
1337                    tcg_out_modrm(s, OPC_GRP5 + rexw,
1338                                  (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1339                } else {
1340                    tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1341                }
1342                return;
1343            }
1344            if (val == 128) {
1345                /*
1346                 * Facilitate using an 8-bit immediate.  Carry is inverted
1347                 * by this transformation, so do it only if cf == 0.
1348                 */
1349                c ^= ARITH_ADD ^ ARITH_SUB;
1350                val = -128;
1351            }
1352        }
1353        break;
1354
1355    case ARITH_AND:
1356        if (TCG_TARGET_REG_BITS == 64) {
1357            if (val == 0xffffffffu) {
1358                tcg_out_ext32u(s, r0, r0);
1359                return;
1360            }
1361            if (val == (uint32_t)val) {
1362                /* AND with no high bits set can use a 32-bit operation.  */
1363                rexw = 0;
1364            }
1365        }
1366        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1367            tcg_out_ext8u(s, r0, r0);
1368            return;
1369        }
1370        if (val == 0xffffu) {
1371            tcg_out_ext16u(s, r0, r0);
1372            return;
1373        }
1374        break;
1375
1376    case ARITH_OR:
1377    case ARITH_XOR:
1378        if (val >= 0x80 && val <= 0xff
1379            && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1380            tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
1381            tcg_out8(s, val);
1382            return;
1383        }
1384        break;
1385    }
1386
1387    if (val == (int8_t)val) {
1388        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1389        tcg_out8(s, val);
1390        return;
1391    }
1392    if (rexw == 0 || val == (int32_t)val) {
1393        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1394        tcg_out32(s, val);
1395        return;
1396    }
1397
1398    g_assert_not_reached();
1399}
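
/*
 * Illustrative examples of the special cases above:
 * tgen_arithi(s, ARITH_ADD + P_REXW, TCG_REG_RBX, 1, 0) emits 48 ff c3
 * ("incq %rbx", via the GRP5 path because the one-byte inc/dec opcodes
 * were repurposed as REX prefixes in 64-bit mode); an AND with 0xff or
 * 0xffff becomes movzbl/movzwl; and any other immediate that fits in a
 * signed byte uses the short 0x83 form.
 */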
1400
1401static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1402{
1403    if (val != 0) {
1404        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1405    }
1406}
1407
1408/* Set SMALL to force a short forward branch.  */
1409static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1410{
1411    int32_t val, val1;
1412
1413    if (l->has_value) {
1414        val = tcg_pcrel_diff(s, l->u.value_ptr);
1415        val1 = val - 2;
1416        if ((int8_t)val1 == val1) {
1417            if (opc == -1) {
1418                tcg_out8(s, OPC_JMP_short);
1419            } else {
1420                tcg_out8(s, OPC_JCC_short + opc);
1421            }
1422            tcg_out8(s, val1);
1423        } else {
1424            tcg_debug_assert(!small);
1425            if (opc == -1) {
1426                tcg_out8(s, OPC_JMP_long);
1427                tcg_out32(s, val - 5);
1428            } else {
1429                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1430                tcg_out32(s, val - 6);
1431            }
1432        }
1433    } else if (small) {
1434        if (opc == -1) {
1435            tcg_out8(s, OPC_JMP_short);
1436        } else {
1437            tcg_out8(s, OPC_JCC_short + opc);
1438        }
1439        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1440        s->code_ptr += 1;
1441    } else {
1442        if (opc == -1) {
1443            tcg_out8(s, OPC_JMP_long);
1444        } else {
1445            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1446        }
1447        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1448        s->code_ptr += 4;
1449    }
1450}
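
/*
 * Note on the constants above (illustrative): a branch displacement is
 * relative to the end of the instruction, so the -2, -5 and -6
 * adjustments are the lengths of the short jcc/jmp (2 bytes), the long
 * jmp (e9 + rel32, 5 bytes) and the long jcc (0f 8x + rel32, 6 bytes)
 * respectively.
 */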
1451
1452static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1453                        int const_arg2, int rexw)
1454{
1455    if (const_arg2) {
1456        if (arg2 == 0) {
1457            /* test r, r */
1458            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1459        } else {
1460            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1461        }
1462    } else {
1463        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1464    }
1465}
1466
1467static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
1468                           TCGArg arg1, TCGArg arg2, int const_arg2,
1469                           TCGLabel *label, bool small)
1470{
1471    tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1472    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1473}
1474
1475#if TCG_TARGET_REG_BITS == 32
1476static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1477                            const int *const_args, bool small)
1478{
1479    TCGLabel *label_next = gen_new_label();
1480    TCGLabel *label_this = arg_label(args[5]);
1481
1482    switch(args[4]) {
1483    case TCG_COND_EQ:
1484        tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2],
1485                       label_next, 1);
1486        tcg_out_brcond(s, 0, TCG_COND_EQ, args[1], args[3], const_args[3],
1487                       label_this, small);
1488        break;
1489    case TCG_COND_NE:
1490        tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2],
1491                       label_this, small);
1492        tcg_out_brcond(s, 0, TCG_COND_NE, args[1], args[3], const_args[3],
1493                       label_this, small);
1494        break;
1495    case TCG_COND_LT:
1496        tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1497                       label_this, small);
1498        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1499        tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1500                       label_this, small);
1501        break;
1502    case TCG_COND_LE:
1503        tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1504                       label_this, small);
1505        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1506        tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1507                       label_this, small);
1508        break;
1509    case TCG_COND_GT:
1510        tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1511                       label_this, small);
1512        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1513        tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1514                       label_this, small);
1515        break;
1516    case TCG_COND_GE:
1517        tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1518                       label_this, small);
1519        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1520        tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1521                       label_this, small);
1522        break;
1523    case TCG_COND_LTU:
1524        tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1525                       label_this, small);
1526        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1527        tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1528                       label_this, small);
1529        break;
1530    case TCG_COND_LEU:
1531        tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1532                       label_this, small);
1533        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1534        tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1535                       label_this, small);
1536        break;
1537    case TCG_COND_GTU:
1538        tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1539                       label_this, small);
1540        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1541        tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1542                       label_this, small);
1543        break;
1544    case TCG_COND_GEU:
1545        tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1546                       label_this, small);
1547        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1548        tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1549                       label_this, small);
1550        break;
1551    default:
1552        g_assert_not_reached();
1553    }
1554    tcg_out_label(s, label_next);
1555}
1556#endif
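
/*
 * Summary of the expansion above (illustrative), e.g. for TCG_COND_LT:
 * compare the high halves and branch to the target if (signed) less;
 * otherwise, if the high halves differ (JNE, reusing the same flags),
 * the whole comparison is false and we skip to label_next; only when
 * the high halves are equal is the result decided by an unsigned
 * compare of the low halves.
 */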
1557
1558static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
1559                            TCGArg dest, TCGArg arg1, TCGArg arg2,
1560                            int const_arg2, bool neg)
1561{
1562    bool inv = false;
1563    bool cleared;
1564
1565    switch (cond) {
1566    case TCG_COND_NE:
1567        inv = true;
1568        /* fall through */
1569    case TCG_COND_EQ:
1570        /* If arg2 is 0, convert to LTU/GEU vs 1. */
1571        if (const_arg2 && arg2 == 0) {
1572            arg2 = 1;
1573            goto do_ltu;
1574        }
1575        break;
1576
1577    case TCG_COND_LEU:
1578        inv = true;
1579        /* fall through */
1580    case TCG_COND_GTU:
1581        /* If arg2 is a register, swap for LTU/GEU. */
1582        if (!const_arg2) {
1583            TCGReg t = arg1;
1584            arg1 = arg2;
1585            arg2 = t;
1586            goto do_ltu;
1587        }
1588        break;
1589
1590    case TCG_COND_GEU:
1591        inv = true;
1592        /* fall through */
1593    case TCG_COND_LTU:
1594    do_ltu:
1595        /*
1596         * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
1597         * We can then use NEG or INC to produce the desired result.
1598         * This is always smaller than the SETCC expansion.
1599         */
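        /*
         * A minimal sketch of the sequence emitted below (illustrative only,
         * AT&T operand order as in the other comments in this file):
         *     cmp   arg2, arg1      ; CF = arg1 < arg2 (unsigned)
         *     sbb   dest, dest      ; dest = CF ? -1 : 0
         *     neg   dest            ; dest = CF ? 1 : 0, plain setcond only
         * negsetcond keeps the SBB result as-is; the inverted conditions use
         * NOT (negsetcond) or "add $1" (setcond) in place of the NEG.
         */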
1600        tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1601
1602        /* X - X - C = -C = (C ? -1 : 0) */
1603        tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
1604        if (inv && neg) {
1605            /* ~(C ? -1 : 0) = (C ? 0 : -1) */
1606            tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1607        } else if (inv) {
1608            /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
1609            tgen_arithi(s, ARITH_ADD, dest, 1, 0);
1610        } else if (!neg) {
1611            /* -(C ? -1 : 0) = (C ? 1 : 0) */
1612            tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
1613        }
1614        return;
1615
1616    case TCG_COND_GE:
1617        inv = true;
1618        /* fall through */
1619    case TCG_COND_LT:
1620        /* If arg2 is 0, extract the sign bit. */
1621        if (const_arg2 && arg2 == 0) {
1622            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1);
1623            if (inv) {
1624                tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1625            }
1626            tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
1627                           dest, rexw ? 63 : 31);
1628            return;
1629        }
1630        break;
1631
1632    default:
1633        break;
1634    }
1635
1636    /*
1637     * If dest does not overlap the inputs, clearing it first is preferred.
1638     * The XOR breaks any false dependency for the low-byte write to dest,
1639     * and is also one byte smaller than MOVZBL.
1640     */
1641    cleared = false;
1642    if (dest != arg1 && (const_arg2 || dest != arg2)) {
1643        tgen_arithr(s, ARITH_XOR, dest, dest);
1644        cleared = true;
1645    }
1646
1647    tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1648    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1649
1650    if (!cleared) {
1651        tcg_out_ext8u(s, dest, dest);
1652    }
1653    if (neg) {
1654        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
1655    }
1656}
1657
1658#if TCG_TARGET_REG_BITS == 32
1659static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1660                             const int *const_args)
1661{
1662    TCGArg new_args[6];
1663    TCGLabel *label_true, *label_over;
1664
1665    memcpy(new_args, args+1, 5*sizeof(TCGArg));
1666
1667    if (args[0] == args[1] || args[0] == args[2]
1668        || (!const_args[3] && args[0] == args[3])
1669        || (!const_args[4] && args[0] == args[4])) {
1670        /* When the destination overlaps with one of the argument
1671           registers, don't do anything tricky.  */
1672        label_true = gen_new_label();
1673        label_over = gen_new_label();
1674
1675        new_args[5] = label_arg(label_true);
1676        tcg_out_brcond2(s, new_args, const_args+1, 1);
1677
1678        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1679        tcg_out_jxx(s, JCC_JMP, label_over, 1);
1680        tcg_out_label(s, label_true);
1681
1682        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1683        tcg_out_label(s, label_over);
1684    } else {
1685        /* When the destination does not overlap one of the arguments,
1686           clear the destination first, jump if cond false, and emit an
1687           increment in the true case.  This results in smaller code.  */
1688
1689        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1690
1691        label_over = gen_new_label();
1692        new_args[4] = tcg_invert_cond(new_args[4]);
1693        new_args[5] = label_arg(label_over);
1694        tcg_out_brcond2(s, new_args, const_args+1, 1);
1695
1696        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1697        tcg_out_label(s, label_over);
1698    }
1699}
1700#endif
1701
1702static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1703                         TCGReg dest, TCGReg v1)
1704{
1705    if (have_cmov) {
1706        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1707    } else {
1708        TCGLabel *over = gen_new_label();
1709        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1710        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1711        tcg_out_label(s, over);
1712    }
1713}
1714
1715static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
1716                            TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
1717                            TCGReg v1)
1718{
1719    tcg_out_cmp(s, c1, c2, const_c2, rexw);
1720    tcg_out_cmov(s, cond, rexw, dest, v1);
1721}
1722
1723static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1724                        TCGArg arg2, bool const_a2)
1725{
1726    if (have_bmi1) {
1727        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1728        if (const_a2) {
1729            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1730        } else {
1731            tcg_debug_assert(dest != arg2);
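            /*
             * TZCNT sets CF when its source is zero (the result is then the
             * operand width), so the CMOVB below substitutes arg2 for a zero
             * input; LZCNT in tcg_out_clz relies on the same behaviour.
             */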
1732            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1733        }
1734    } else {
1735        tcg_debug_assert(dest != arg2);
1736        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1737        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1738    }
1739}
1740
1741static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1742                        TCGArg arg2, bool const_a2)
1743{
1744    if (have_lzcnt) {
1745        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1746        if (const_a2) {
1747            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1748        } else {
1749            tcg_debug_assert(dest != arg2);
1750            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1751        }
1752    } else {
1753        tcg_debug_assert(!const_a2);
1754        tcg_debug_assert(dest != arg1);
1755        tcg_debug_assert(dest != arg2);
1756
1757        /* Recall that the output of BSR is the index not the count.  */
1758        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1759        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
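        /*
         * The XOR converts the bit index into a count because, for an index
         * no larger than the mask, 31 - index == 31 ^ index (likewise 63).
         * E.g. a highest set bit at position 28 gives BSR = 28, and
         * 28 ^ 31 = 3 is the leading-zero count.
         */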
1760
1761        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1762        tcg_out_cmp(s, arg1, 0, 1, rexw);
1763        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1764    }
1765}
1766
1767static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1768{
1769    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1770
1771    if (disp == (int32_t)disp) {
1772        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1773        tcg_out32(s, disp);
1774    } else {
1775        /* rip-relative addressing into the constant pool.
1776           This is 6 + 8 = 14 bytes, as compared to using an
1777           immediate load 10 + 6 = 16 bytes, plus we may
1778           be able to re-use the pool constant for more calls.  */
1779        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1780        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1781        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1782        tcg_out32(s, 0);
1783    }
1784}
1785
1786static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1787                         const TCGHelperInfo *info)
1788{
1789    tcg_out_branch(s, 1, dest);
1790
1791#ifndef _WIN32
1792    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1793        /*
1794         * The sysv i386 abi for struct return places a reference as the
1795         * first argument on the stack, and pops that argument with the
1796         * return statement.  Since we want to retain the aligned stack
1797         * pointer for the callee, we do not want to actually push that
1798         * argument before the call but rely on the normal store to the
1799         * stack slot.  But we do need to compensate for the pop in order
1800         * to reset our correct stack pointer value.
1801         * Pushing a garbage value back onto the stack is quickest.
1802         */
1803        tcg_out_push(s, TCG_REG_EAX);
1804    }
1805#endif
1806}
1807
1808static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1809{
1810    tcg_out_branch(s, 0, dest);
1811}
1812
1813static void tcg_out_nopn(TCGContext *s, int n)
1814{
1815    int i;
1816    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1817     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1818     * duplicate prefix, and all of the interesting recent cores can
1819     * decode and discard the duplicates in a single cycle.
1820     */
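    /*
     * For example (illustrative): n=1 emits 0x90 and n=3 emits
     * 0x66 0x66 0x90; every byte beyond the first is a redundant
     * operand-size prefix on the same one-byte xchg.
     */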
1821    tcg_debug_assert(n >= 1);
1822    for (i = 1; i < n; ++i) {
1823        tcg_out8(s, 0x66);
1824    }
1825    tcg_out8(s, 0x90);
1826}
1827
1828/* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
1829static void __attribute__((unused))
1830tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
1831{
1832    /*
1833     * This is used for testing alignment, so we can usually use testb.
1834     * For i686, we have to use testl for %esi/%edi.
1835     */
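    /*
     * E.g. a 16-byte alignment check emits "testb $15, %al"-style code; for
     * %esi/%edi without a REX prefix there is no low byte register (the
     * encodings alias %dh/%bh), hence the testl fallback.
     */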
1836    if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
1837        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
1838        tcg_out8(s, i);
1839    } else {
1840        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
1841        tcg_out32(s, i);
1842    }
1843}
1844
1845typedef struct {
1846    TCGReg base;
1847    int index;
1848    int ofs;
1849    int seg;
1850    TCGAtomAlign aa;
1851} HostAddress;
1852
1853bool tcg_target_has_memory_bswap(MemOp memop)
1854{
1855    TCGAtomAlign aa;
1856
1857    if (!have_movbe) {
1858        return false;
1859    }
1860    if ((memop & MO_SIZE) < MO_128) {
1861        return true;
1862    }
1863
1864    /*
1865     * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
1866     * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
1867     */
1868    aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
1869    return aa.atom < MO_128;
1870}
1871
1872/*
1873 * Because i686 has no register parameters and because x86_64 has xchg
1874 * to handle addr/data register overlap, we have placed all input arguments
1875 * before we might need a scratch reg.
1876 *
1877 * Even then, a scratch is only needed for l->raddr.  Rather than expose
1878 * a general-purpose scratch when we don't actually know it's available,
1879 * use the ra_gen hook to load into RAX if needed.
1880 */
1881#if TCG_TARGET_REG_BITS == 64
1882static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1883{
1884    if (arg < 0) {
1885        arg = TCG_REG_RAX;
1886    }
1887    tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1888    return arg;
1889}
1890static const TCGLdstHelperParam ldst_helper_param = {
1891    .ra_gen = ldst_ra_gen
1892};
1893#else
1894static const TCGLdstHelperParam ldst_helper_param = { };
1895#endif
1896
1897static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
1898                                TCGReg l, TCGReg h, TCGReg v)
1899{
1900    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1901
1902    /* vpmov{d,q} %v, %l */
1903    tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
1904    /* vpextr{d,q} $1, %v, %h */
1905    tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
1906    tcg_out8(s, 1);
1907}
1908
1909static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
1910                                TCGReg v, TCGReg l, TCGReg h)
1911{
1912    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1913
1914    /* vmov{d,q} %l, %v */
1915    tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
1916    /* vpinsr{d,q} $1, %h, %v, %v */
1917    tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
1918    tcg_out8(s, 1);
1919}
1920
1921/*
1922 * Generate code for the slow path for a load at the end of block
1923 */
1924static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1925{
1926    MemOp opc = get_memop(l->oi);
1927    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1928
1929    /* resolve label address */
1930    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1931    if (label_ptr[1]) {
1932        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1933    }
1934
1935    tcg_out_ld_helper_args(s, l, &ldst_helper_param);
1936    tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
1937    tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
1938
1939    tcg_out_jmp(s, l->raddr);
1940    return true;
1941}
1942
1943/*
1944 * Generate code for the slow path for a store at the end of block
1945 */
1946static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1947{
1948    MemOp opc = get_memop(l->oi);
1949    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1950
1951    /* resolve label address */
1952    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1953    if (label_ptr[1]) {
1954        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1955    }
1956
1957    tcg_out_st_helper_args(s, l, &ldst_helper_param);
1958    tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
1959
1960    tcg_out_jmp(s, l->raddr);
1961    return true;
1962}
1963
1964#ifdef CONFIG_USER_ONLY
1965static HostAddress x86_guest_base = {
1966    .index = -1
1967};
1968
1969#if defined(__x86_64__) && defined(__linux__)
1970# include <asm/prctl.h>
1971# include <sys/prctl.h>
1972int arch_prctl(int code, unsigned long addr);
1973static inline int setup_guest_base_seg(void)
1974{
1975    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1976        return P_GS;
1977    }
1978    return 0;
1979}
1980#define setup_guest_base_seg  setup_guest_base_seg
1981#elif defined(__x86_64__) && \
1982      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
1983# include <machine/sysarch.h>
1984static inline int setup_guest_base_seg(void)
1985{
1986    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1987        return P_GS;
1988    }
1989    return 0;
1990}
1991#define setup_guest_base_seg  setup_guest_base_seg
1992#endif
1993#else
1994# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; }))
1995#endif /* CONFIG_USER_ONLY */
1996#ifndef setup_guest_base_seg
1997# define setup_guest_base_seg()  0
1998#endif
1999
2000#define MIN_TLB_MASK_TABLE_OFS  INT_MIN
2001
2002/*
2003 * For softmmu, perform the TLB load and compare.
2004 * For useronly, perform any required alignment tests.
2005 * In both cases, return a TCGLabelQemuLdst structure if the slow path
2006 * is required and fill in @h with the host address for the fast path.
2007 */
2008static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
2009                                           TCGReg addrlo, TCGReg addrhi,
2010                                           MemOpIdx oi, bool is_ld)
2011{
2012    TCGLabelQemuLdst *ldst = NULL;
2013    MemOp opc = get_memop(oi);
2014    MemOp s_bits = opc & MO_SIZE;
2015    unsigned a_mask;
2016
2017    if (tcg_use_softmmu) {
2018        h->index = TCG_REG_L0;
2019        h->ofs = 0;
2020        h->seg = 0;
2021    } else {
2022        *h = x86_guest_base;
2023    }
2024    h->base = addrlo;
2025    h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
2026    a_mask = (1 << h->aa.align) - 1;
2027
2028    if (tcg_use_softmmu) {
2029        int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
2030                            : offsetof(CPUTLBEntry, addr_write);
2031        TCGType ttype = TCG_TYPE_I32;
2032        TCGType tlbtype = TCG_TYPE_I32;
2033        int trexw = 0, hrexw = 0, tlbrexw = 0;
2034        unsigned mem_index = get_mmuidx(oi);
2035        unsigned s_mask = (1 << s_bits) - 1;
2036        int fast_ofs = tlb_mask_table_ofs(s, mem_index);
2037        int tlb_mask;
2038
2039        ldst = new_ldst_label(s);
2040        ldst->is_ld = is_ld;
2041        ldst->oi = oi;
2042        ldst->addrlo_reg = addrlo;
2043        ldst->addrhi_reg = addrhi;
2044
2045        if (TCG_TARGET_REG_BITS == 64) {
2046            ttype = s->addr_type;
2047            trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
2048            if (TCG_TYPE_PTR == TCG_TYPE_I64) {
2049                hrexw = P_REXW;
2050                if (s->page_bits + s->tlb_dyn_max_bits > 32) {
2051                    tlbtype = TCG_TYPE_I64;
2052                    tlbrexw = P_REXW;
2053                }
2054            }
2055        }
2056
2057        tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
2058        tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
2059                       s->page_bits - CPU_TLB_ENTRY_BITS);
2060
2061        tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
2062                             fast_ofs + offsetof(CPUTLBDescFast, mask));
2063
2064        tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
2065                             fast_ofs + offsetof(CPUTLBDescFast, table));
2066
2067        /*
2068         * If the required alignment is at least as large as the access,
2069         * simply copy the address and mask.  For lesser alignments,
2070         * check that we don't cross pages for the complete access.
2071         */
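        /*
         * Worked example (illustrative): an 8-byte load that only requires
         * 4-byte alignment has s_mask = 7 and a_mask = 3, so L1 receives
         * addr + 4 masked with page_mask | 3; a misaligned or page-crossing
         * access then fails the compare below and takes the slow path.
         */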
2072        if (a_mask >= s_mask) {
2073            tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
2074        } else {
2075            tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
2076                                 addrlo, s_mask - a_mask);
2077        }
2078        tlb_mask = s->page_mask | a_mask;
2079        tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
2080
2081        /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
2082        tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
2083                             TCG_REG_L1, TCG_REG_L0, cmp_ofs);
2084
2085        /* jne slow_path */
2086        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2087        ldst->label_ptr[0] = s->code_ptr;
2088        s->code_ptr += 4;
2089
2090        if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
2091            /* cmp 4(TCG_REG_L0), addrhi */
2092            tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi,
2093                                 TCG_REG_L0, cmp_ofs + 4);
2094
2095            /* jne slow_path */
2096            tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2097            ldst->label_ptr[1] = s->code_ptr;
2098            s->code_ptr += 4;
2099        }
2100
2101        /* TLB Hit.  */
2102        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
2103                   offsetof(CPUTLBEntry, addend));
2104    } else if (a_mask) {
2105        ldst = new_ldst_label(s);
2106
2107        ldst->is_ld = is_ld;
2108        ldst->oi = oi;
2109        ldst->addrlo_reg = addrlo;
2110        ldst->addrhi_reg = addrhi;
2111
2112        tcg_out_testi(s, addrlo, a_mask);
2113        /* jne slow_path */
2114        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2115        ldst->label_ptr[0] = s->code_ptr;
2116        s->code_ptr += 4;
2117    }
2118
2119    return ldst;
2120}
2121
2122static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2123                                   HostAddress h, TCGType type, MemOp memop)
2124{
2125    bool use_movbe = false;
2126    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
2127    int movop = OPC_MOVL_GvEv;
2128
2129    /* Do big-endian loads with movbe.  */
2130    if (memop & MO_BSWAP) {
2131        tcg_debug_assert(have_movbe);
2132        use_movbe = true;
2133        movop = OPC_MOVBE_GyMy;
2134    }
2135
2136    switch (memop & MO_SSIZE) {
2137    case MO_UB:
2138        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
2139                                 h.base, h.index, 0, h.ofs);
2140        break;
2141    case MO_SB:
2142        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
2143                                 h.base, h.index, 0, h.ofs);
2144        break;
2145    case MO_UW:
2146        if (use_movbe) {
2147            /* There is no extending movbe; only the low 16 bits are modified.  */
2148            if (datalo != h.base && datalo != h.index) {
2149                /* XOR breaks dependency chains.  */
2150                tgen_arithr(s, ARITH_XOR, datalo, datalo);
2151                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2152                                         datalo, h.base, h.index, 0, h.ofs);
2153            } else {
2154                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2155                                         datalo, h.base, h.index, 0, h.ofs);
2156                tcg_out_ext16u(s, datalo, datalo);
2157            }
2158        } else {
2159            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
2160                                     h.base, h.index, 0, h.ofs);
2161        }
2162        break;
2163    case MO_SW:
2164        if (use_movbe) {
2165            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2166                                     datalo, h.base, h.index, 0, h.ofs);
2167            tcg_out_ext16s(s, type, datalo, datalo);
2168        } else {
2169            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
2170                                     datalo, h.base, h.index, 0, h.ofs);
2171        }
2172        break;
2173    case MO_UL:
2174        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2175                                 h.base, h.index, 0, h.ofs);
2176        break;
2177#if TCG_TARGET_REG_BITS == 64
2178    case MO_SL:
2179        if (use_movbe) {
2180            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
2181                                     h.base, h.index, 0, h.ofs);
2182            tcg_out_ext32s(s, datalo, datalo);
2183        } else {
2184            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
2185                                     h.base, h.index, 0, h.ofs);
2186        }
2187        break;
2188#endif
2189    case MO_UQ:
2190        if (TCG_TARGET_REG_BITS == 64) {
2191            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2192                                     h.base, h.index, 0, h.ofs);
2193            break;
2194        }
2195        if (use_movbe) {
2196            TCGReg t = datalo;
2197            datalo = datahi;
2198            datahi = t;
2199        }
2200        if (h.base == datalo || h.index == datalo) {
2201            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
2202                                     h.base, h.index, 0, h.ofs);
2203            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
2204            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
2205        } else {
2206            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2207                                     h.base, h.index, 0, h.ofs);
2208            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2209                                     h.base, h.index, 0, h.ofs + 4);
2210        }
2211        break;
2212
2213    case MO_128:
2214        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2215
2216        /*
2217         * Without 16-byte atomicity, use integer regs.
2218         * That is where we want the data, and it allows bswaps.
2219         */
2220        if (h.aa.atom < MO_128) {
2221            if (use_movbe) {
2222                TCGReg t = datalo;
2223                datalo = datahi;
2224                datahi = t;
2225            }
2226            if (h.base == datalo || h.index == datalo) {
2227                tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
2228                                         h.base, h.index, 0, h.ofs);
2229                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2230                                     datalo, datahi, 0);
2231                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2232                                     datahi, datahi, 8);
2233            } else {
2234                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2235                                         h.base, h.index, 0, h.ofs);
2236                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2237                                         h.base, h.index, 0, h.ofs + 8);
2238            }
2239            break;
2240        }
2241
2242        /*
2243         * With 16-byte atomicity, a vector load is required.
2244         * If we already have 16-byte alignment, then VMOVDQA always works.
2245         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2246         * Else we require a runtime test for alignment for VMOVDQA;
2247         * use VMOVDQU on the unaligned nonatomic path for simplicity.
2248         */
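        /*
         * The runtime-test path below has roughly this shape (a sketch,
         * not literal output; TCG_TMP_VEC is %xmm5 here):
         *     test    $15, base
         *     jne     1f
         *     vmovdqa (addr), %xmm5
         *     jmp     2f
         *  1: vmovdqu (addr), %xmm5
         *  2: ... then extract %xmm5 into datalo/datahi
         */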
2249        if (h.aa.align >= MO_128) {
2250            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2251                                         TCG_TMP_VEC, 0,
2252                                         h.base, h.index, 0, h.ofs);
2253        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2254            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2255                                         TCG_TMP_VEC, 0,
2256                                         h.base, h.index, 0, h.ofs);
2257        } else {
2258            TCGLabel *l1 = gen_new_label();
2259            TCGLabel *l2 = gen_new_label();
2260
2261            tcg_out_testi(s, h.base, 15);
2262            tcg_out_jxx(s, JCC_JNE, l1, true);
2263
2264            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2265                                         TCG_TMP_VEC, 0,
2266                                         h.base, h.index, 0, h.ofs);
2267            tcg_out_jxx(s, JCC_JMP, l2, true);
2268
2269            tcg_out_label(s, l1);
2270            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2271                                         TCG_TMP_VEC, 0,
2272                                         h.base, h.index, 0, h.ofs);
2273            tcg_out_label(s, l2);
2274        }
2275        tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
2276        break;
2277
2278    default:
2279        g_assert_not_reached();
2280    }
2281}
2282
2283static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2284                            TCGReg addrlo, TCGReg addrhi,
2285                            MemOpIdx oi, TCGType data_type)
2286{
2287    TCGLabelQemuLdst *ldst;
2288    HostAddress h;
2289
2290    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2291    tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2292
2293    if (ldst) {
2294        ldst->type = data_type;
2295        ldst->datalo_reg = datalo;
2296        ldst->datahi_reg = datahi;
2297        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2298    }
2299}
2300
2301static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2302                                   HostAddress h, MemOp memop)
2303{
2304    bool use_movbe = false;
2305    int movop = OPC_MOVL_EvGv;
2306
2307    /*
2308     * Do big-endian stores with movbe or system-mode.
2309     * User-only without movbe will have its swapping done generically.
2310     */
2311    if (memop & MO_BSWAP) {
2312        tcg_debug_assert(have_movbe);
2313        use_movbe = true;
2314        movop = OPC_MOVBE_MyGy;
2315    }
2316
2317    switch (memop & MO_SIZE) {
2318    case MO_8:
2319        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2320        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2321        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
2322                                 datalo, h.base, h.index, 0, h.ofs);
2323        break;
2324    case MO_16:
2325        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
2326                                 h.base, h.index, 0, h.ofs);
2327        break;
2328    case MO_32:
2329        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2330                                 h.base, h.index, 0, h.ofs);
2331        break;
2332    case MO_64:
2333        if (TCG_TARGET_REG_BITS == 64) {
2334            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2335                                     h.base, h.index, 0, h.ofs);
2336        } else {
2337            if (use_movbe) {
2338                TCGReg t = datalo;
2339                datalo = datahi;
2340                datahi = t;
2341            }
2342            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2343                                     h.base, h.index, 0, h.ofs);
2344            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2345                                     h.base, h.index, 0, h.ofs + 4);
2346        }
2347        break;
2348
2349    case MO_128:
2350        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2351
2352        /*
2353         * Without 16-byte atomicity, use integer regs.
2354         * That is where we have the data, and it allows bswaps.
2355         */
2356        if (h.aa.atom < MO_128) {
2357            if (use_movbe) {
2358                TCGReg t = datalo;
2359                datalo = datahi;
2360                datahi = t;
2361            }
2362            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2363                                     h.base, h.index, 0, h.ofs);
2364            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2365                                     h.base, h.index, 0, h.ofs + 8);
2366            break;
2367        }
2368
2369        /*
2370         * With 16-byte atomicity, a vector store is required.
2371         * If we already have 16-byte alignment, then VMOVDQA always works.
2372         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2373         * Else we require a runtime test for alignment for VMOVDQA;
2374         * use VMOVDQU on the unaligned nonatomic path for simplicity.
2375         */
2376        tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
2377        if (h.aa.align >= MO_128) {
2378            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2379                                         TCG_TMP_VEC, 0,
2380                                         h.base, h.index, 0, h.ofs);
2381        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2382            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2383                                         TCG_TMP_VEC, 0,
2384                                         h.base, h.index, 0, h.ofs);
2385        } else {
2386            TCGLabel *l1 = gen_new_label();
2387            TCGLabel *l2 = gen_new_label();
2388
2389            tcg_out_testi(s, h.base, 15);
2390            tcg_out_jxx(s, JCC_JNE, l1, true);
2391
2392            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2393                                         TCG_TMP_VEC, 0,
2394                                         h.base, h.index, 0, h.ofs);
2395            tcg_out_jxx(s, JCC_JMP, l2, true);
2396
2397            tcg_out_label(s, l1);
2398            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2399                                         TCG_TMP_VEC, 0,
2400                                         h.base, h.index, 0, h.ofs);
2401            tcg_out_label(s, l2);
2402        }
2403        break;
2404
2405    default:
2406        g_assert_not_reached();
2407    }
2408}
2409
2410static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2411                            TCGReg addrlo, TCGReg addrhi,
2412                            MemOpIdx oi, TCGType data_type)
2413{
2414    TCGLabelQemuLdst *ldst;
2415    HostAddress h;
2416
2417    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2418    tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2419
2420    if (ldst) {
2421        ldst->type = data_type;
2422        ldst->datalo_reg = datalo;
2423        ldst->datahi_reg = datahi;
2424        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2425    }
2426}
2427
2428static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2429{
2430    /* Reuse the zeroing that exists for goto_ptr.  */
2431    if (a0 == 0) {
2432        tcg_out_jmp(s, tcg_code_gen_epilogue);
2433    } else {
2434        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2435        tcg_out_jmp(s, tb_ret_addr);
2436    }
2437}
2438
2439static void tcg_out_goto_tb(TCGContext *s, int which)
2440{
2441    /*
2442     * Jump displacement must be aligned for atomic patching;
2443     * see if we need to add extra nops before the jump
2444     */
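    /*
     * Example (illustrative): with code_ptr % 4 == 1 the gap below is 3, so
     * two bytes of nop are emitted, the JMP opcode lands at offset 3 (mod 4)
     * and its 32-bit displacement starts 4-byte aligned.
     */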
2445    int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2446    if (gap != 1) {
2447        tcg_out_nopn(s, gap - 1);
2448    }
2449    tcg_out8(s, OPC_JMP_long); /* jmp im */
2450    set_jmp_insn_offset(s, which);
2451    tcg_out32(s, 0);
2452    set_jmp_reset_offset(s, which);
2453}
2454
2455void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2456                              uintptr_t jmp_rx, uintptr_t jmp_rw)
2457{
2458    /* patch the branch destination */
2459    uintptr_t addr = tb->jmp_target_addr[n];
2460    qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2461    /* no need to flush icache explicitly */
2462}
2463
2464static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2465                              const TCGArg args[TCG_MAX_OP_ARGS],
2466                              const int const_args[TCG_MAX_OP_ARGS])
2467{
2468    TCGArg a0, a1, a2;
2469    int c, const_a2, vexop, rexw = 0;
2470
2471#if TCG_TARGET_REG_BITS == 64
2472# define OP_32_64(x) \
2473        case glue(glue(INDEX_op_, x), _i64): \
2474            rexw = P_REXW; /* FALLTHRU */    \
2475        case glue(glue(INDEX_op_, x), _i32)
2476#else
2477# define OP_32_64(x) \
2478        case glue(glue(INDEX_op_, x), _i32)
2479#endif
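    /*
     * E.g. on a 64-bit host, OP_32_64(add) expands (roughly) to
     * "case INDEX_op_add_i64: rexw = P_REXW; case INDEX_op_add_i32:",
     * so one arm handles both widths with REX.W applied only for i64.
     */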
2480
2481    /* Hoist the loads of the most common arguments.  */
2482    a0 = args[0];
2483    a1 = args[1];
2484    a2 = args[2];
2485    const_a2 = const_args[2];
2486
2487    switch (opc) {
2488    case INDEX_op_goto_ptr:
2489        /* jmp to the given host address (could be epilogue) */
2490        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2491        break;
2492    case INDEX_op_br:
2493        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2494        break;
2495    OP_32_64(ld8u):
2496        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2497        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2498        break;
2499    OP_32_64(ld8s):
2500        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2501        break;
2502    OP_32_64(ld16u):
2503        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2504        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2505        break;
2506    OP_32_64(ld16s):
2507        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2508        break;
2509#if TCG_TARGET_REG_BITS == 64
2510    case INDEX_op_ld32u_i64:
2511#endif
2512    case INDEX_op_ld_i32:
2513        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2514        break;
2515
2516    OP_32_64(st8):
2517        if (const_args[0]) {
2518            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2519            tcg_out8(s, a0);
2520        } else {
2521            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2522        }
2523        break;
2524    OP_32_64(st16):
2525        if (const_args[0]) {
2526            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2527            tcg_out16(s, a0);
2528        } else {
2529            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2530        }
2531        break;
2532#if TCG_TARGET_REG_BITS == 64
2533    case INDEX_op_st32_i64:
2534#endif
2535    case INDEX_op_st_i32:
2536        if (const_args[0]) {
2537            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2538            tcg_out32(s, a0);
2539        } else {
2540            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2541        }
2542        break;
2543
2544    OP_32_64(add):
2545        /* For 3-operand addition, use LEA.  */
2546        if (a0 != a1) {
2547            TCGArg c3 = 0;
2548            if (const_a2) {
2549                c3 = a2, a2 = -1;
2550            } else if (a0 == a2) {
2551                /* Watch out for dest = src + dest, since we've removed
2552                   the matching constraint on the add.  */
2553                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2554                break;
2555            }
2556
2557            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2558            break;
2559        }
2560        c = ARITH_ADD;
2561        goto gen_arith;
2562    OP_32_64(sub):
2563        c = ARITH_SUB;
2564        goto gen_arith;
2565    OP_32_64(and):
2566        c = ARITH_AND;
2567        goto gen_arith;
2568    OP_32_64(or):
2569        c = ARITH_OR;
2570        goto gen_arith;
2571    OP_32_64(xor):
2572        c = ARITH_XOR;
2573        goto gen_arith;
2574    gen_arith:
2575        if (const_a2) {
2576            tgen_arithi(s, c + rexw, a0, a2, 0);
2577        } else {
2578            tgen_arithr(s, c + rexw, a0, a2);
2579        }
2580        break;
2581
2582    OP_32_64(andc):
2583        if (const_a2) {
2584            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2585            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2586        } else {
2587            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2588        }
2589        break;
2590
2591    OP_32_64(mul):
2592        if (const_a2) {
2593            int32_t val;
2594            val = a2;
2595            if (val == (int8_t)val) {
2596                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2597                tcg_out8(s, val);
2598            } else {
2599                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2600                tcg_out32(s, val);
2601            }
2602        } else {
2603            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2604        }
2605        break;
2606
2607    OP_32_64(div2):
2608        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2609        break;
2610    OP_32_64(divu2):
2611        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2612        break;
2613
2614    OP_32_64(shl):
2615        /* For small constant 3-operand shift, use LEA.  */
2616        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2617            if (a2 - 1 == 0) {
2618                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2619                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2620            } else {
2621                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2622                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2623            }
2624            break;
2625        }
2626        c = SHIFT_SHL;
2627        vexop = OPC_SHLX;
2628        goto gen_shift_maybe_vex;
2629    OP_32_64(shr):
2630        c = SHIFT_SHR;
2631        vexop = OPC_SHRX;
2632        goto gen_shift_maybe_vex;
2633    OP_32_64(sar):
2634        c = SHIFT_SAR;
2635        vexop = OPC_SARX;
2636        goto gen_shift_maybe_vex;
2637    OP_32_64(rotl):
2638        c = SHIFT_ROL;
2639        goto gen_shift;
2640    OP_32_64(rotr):
2641        c = SHIFT_ROR;
2642        goto gen_shift;
2643    gen_shift_maybe_vex:
2644        if (have_bmi2) {
2645            if (!const_a2) {
2646                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2647                break;
2648            }
2649            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2650        }
2651        /* FALLTHRU */
2652    gen_shift:
2653        if (const_a2) {
2654            tcg_out_shifti(s, c + rexw, a0, a2);
2655        } else {
2656            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2657        }
2658        break;
2659
2660    OP_32_64(ctz):
2661        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2662        break;
2663    OP_32_64(clz):
2664        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2665        break;
2666    OP_32_64(ctpop):
2667        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2668        break;
2669
2670    OP_32_64(brcond):
2671        tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1],
2672                       arg_label(args[3]), 0);
2673        break;
2674    OP_32_64(setcond):
2675        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false);
2676        break;
2677    OP_32_64(negsetcond):
2678        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true);
2679        break;
2680    OP_32_64(movcond):
2681        tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]);
2682        break;
2683
2684    OP_32_64(bswap16):
2685        if (a2 & TCG_BSWAP_OS) {
2686            /* Output must be sign-extended. */
2687            if (rexw) {
2688                tcg_out_bswap64(s, a0);
2689                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2690            } else {
2691                tcg_out_bswap32(s, a0);
2692                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2693            }
2694        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2695            /* Output must be zero-extended, but input isn't. */
2696            tcg_out_bswap32(s, a0);
2697            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2698        } else {
2699            tcg_out_rolw_8(s, a0);
2700        }
2701        break;
2702    OP_32_64(bswap32):
2703        tcg_out_bswap32(s, a0);
2704        if (rexw && (a2 & TCG_BSWAP_OS)) {
2705            tcg_out_ext32s(s, a0, a0);
2706        }
2707        break;
2708
2709    OP_32_64(neg):
2710        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2711        break;
2712    OP_32_64(not):
2713        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2714        break;
2715
2716    case INDEX_op_qemu_ld_a64_i32:
2717        if (TCG_TARGET_REG_BITS == 32) {
2718            tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2719            break;
2720        }
2721        /* fall through */
2722    case INDEX_op_qemu_ld_a32_i32:
2723        tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2724        break;
2725    case INDEX_op_qemu_ld_a32_i64:
2726        if (TCG_TARGET_REG_BITS == 64) {
2727            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2728        } else {
2729            tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2730        }
2731        break;
2732    case INDEX_op_qemu_ld_a64_i64:
2733        if (TCG_TARGET_REG_BITS == 64) {
2734            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2735        } else {
2736            tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2737        }
2738        break;
2739    case INDEX_op_qemu_ld_a32_i128:
2740    case INDEX_op_qemu_ld_a64_i128:
2741        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2742        tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2743        break;
2744
2745    case INDEX_op_qemu_st_a64_i32:
2746    case INDEX_op_qemu_st8_a64_i32:
2747        if (TCG_TARGET_REG_BITS == 32) {
2748            tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2749            break;
2750        }
2751        /* fall through */
2752    case INDEX_op_qemu_st_a32_i32:
2753    case INDEX_op_qemu_st8_a32_i32:
2754        tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2755        break;
2756    case INDEX_op_qemu_st_a32_i64:
2757        if (TCG_TARGET_REG_BITS == 64) {
2758            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2759        } else {
2760            tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2761        }
2762        break;
2763    case INDEX_op_qemu_st_a64_i64:
2764        if (TCG_TARGET_REG_BITS == 64) {
2765            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2766        } else {
2767            tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2768        }
2769        break;
2770    case INDEX_op_qemu_st_a32_i128:
2771    case INDEX_op_qemu_st_a64_i128:
2772        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2773        tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2774        break;
2775
2776    OP_32_64(mulu2):
2777        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2778        break;
2779    OP_32_64(muls2):
2780        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2781        break;
2782    OP_32_64(add2):
2783        if (const_args[4]) {
2784            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2785        } else {
2786            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2787        }
2788        if (const_args[5]) {
2789            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2790        } else {
2791            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2792        }
2793        break;
2794    OP_32_64(sub2):
2795        if (const_args[4]) {
2796            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2797        } else {
2798            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2799        }
2800        if (const_args[5]) {
2801            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2802        } else {
2803            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2804        }
2805        break;
2806
2807#if TCG_TARGET_REG_BITS == 32
2808    case INDEX_op_brcond2_i32:
2809        tcg_out_brcond2(s, args, const_args, 0);
2810        break;
2811    case INDEX_op_setcond2_i32:
2812        tcg_out_setcond2(s, args, const_args);
2813        break;
2814#else /* TCG_TARGET_REG_BITS == 64 */
2815    case INDEX_op_ld32s_i64:
2816        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2817        break;
2818    case INDEX_op_ld_i64:
2819        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2820        break;
2821    case INDEX_op_st_i64:
2822        if (const_args[0]) {
2823            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2824            tcg_out32(s, a0);
2825        } else {
2826            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2827        }
2828        break;
2829
2830    case INDEX_op_bswap64_i64:
2831        tcg_out_bswap64(s, a0);
2832        break;
2833    case INDEX_op_extrh_i64_i32:
2834        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2835        break;
2836#endif
2837
2838    OP_32_64(deposit):
2839        if (args[3] == 0 && args[4] == 8) {
2840            /* load bits 0..7 */
2841            if (const_a2) {
2842                tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0),
2843                            0, a0, 0);
2844                tcg_out8(s, a2);
2845            } else {
2846                tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2847            }
2848        } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) {
2849            /* load bits 8..15 */
2850            if (const_a2) {
2851                tcg_out8(s, OPC_MOVB_Ib + a0 + 4);
2852                tcg_out8(s, a2);
2853            } else {
2854                tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2855            }
2856        } else if (args[3] == 0 && args[4] == 16) {
2857            /* load bits 0..15 */
2858            if (const_a2) {
2859                tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0),
2860                            0, a0, 0);
2861                tcg_out16(s, a2);
2862            } else {
2863                tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2864            }
2865        } else {
2866            g_assert_not_reached();
2867        }
2868        break;
2869
2870    case INDEX_op_extract_i64:
2871        if (a2 + args[3] == 32) {
2872            /* This is a 32-bit zero-extending right shift.  */
2873            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2874            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2875            break;
2876        }
2877        /* FALLTHRU */
2878    case INDEX_op_extract_i32:
2879        /* On the off-chance that we can use the high-byte registers.
2880           Otherwise we emit the same ext16 + shift pattern that we
2881           would have gotten from the normal tcg-op.c expansion.  */
2882        tcg_debug_assert(a2 == 8 && args[3] == 8);
2883        if (a1 < 4 && a0 < 8) {
2884            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2885        } else {
2886            tcg_out_ext16u(s, a0, a1);
2887            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2888        }
2889        break;
2890
2891    case INDEX_op_sextract_i32:
2892        /* We don't implement sextract_i64, as we cannot sign-extend to
2893           64-bits without using the REX prefix that explicitly excludes
2894           access to the high-byte registers.  */
2895        tcg_debug_assert(a2 == 8 && args[3] == 8);
2896        if (a1 < 4 && a0 < 8) {
2897            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2898        } else {
2899            tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2900            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2901        }
2902        break;
2903
2904    OP_32_64(extract2):
2905        /* Note that SHRD outputs to the r/m operand.  */
2906        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2907        tcg_out8(s, args[3]);
2908        break;
2909
2910    case INDEX_op_mb:
2911        tcg_out_mb(s, a0);
2912        break;
2913    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2914    case INDEX_op_mov_i64:
2915    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2916    case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
2917    case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
2918    case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
2919    case INDEX_op_ext8s_i64:
2920    case INDEX_op_ext8u_i32:
2921    case INDEX_op_ext8u_i64:
2922    case INDEX_op_ext16s_i32:
2923    case INDEX_op_ext16s_i64:
2924    case INDEX_op_ext16u_i32:
2925    case INDEX_op_ext16u_i64:
2926    case INDEX_op_ext32s_i64:
2927    case INDEX_op_ext32u_i64:
2928    case INDEX_op_ext_i32_i64:
2929    case INDEX_op_extu_i32_i64:
2930    case INDEX_op_extrl_i64_i32:
2931    default:
2932        g_assert_not_reached();
2933    }
2934
2935#undef OP_32_64
2936}
2937
2938static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2939                           unsigned vecl, unsigned vece,
2940                           const TCGArg args[TCG_MAX_OP_ARGS],
2941                           const int const_args[TCG_MAX_OP_ARGS])
2942{
2943    static int const add_insn[4] = {
2944        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2945    };
2946    static int const ssadd_insn[4] = {
2947        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2948    };
2949    static int const usadd_insn[4] = {
2950        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2951    };
2952    static int const sub_insn[4] = {
2953        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2954    };
2955    static int const sssub_insn[4] = {
2956        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2957    };
2958    static int const ussub_insn[4] = {
2959        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2960    };
2961    static int const mul_insn[4] = {
2962        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
2963    };
2964    static int const shift_imm_insn[4] = {
2965        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2966    };
2967    static int const cmpeq_insn[4] = {
2968        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2969    };
2970    static int const cmpgt_insn[4] = {
2971        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2972    };
2973    static int const punpckl_insn[4] = {
2974        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2975    };
2976    static int const punpckh_insn[4] = {
2977        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2978    };
2979    static int const packss_insn[4] = {
2980        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2981    };
2982    static int const packus_insn[4] = {
2983        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2984    };
2985    static int const smin_insn[4] = {
2986        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
2987    };
2988    static int const smax_insn[4] = {
2989        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
2990    };
2991    static int const umin_insn[4] = {
2992        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
2993    };
2994    static int const umax_insn[4] = {
2995        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
2996    };
2997    static int const rotlv_insn[4] = {
2998        OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
2999    };
3000    static int const rotrv_insn[4] = {
3001        OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
3002    };
3003    static int const shlv_insn[4] = {
3004        OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
3005    };
3006    static int const shrv_insn[4] = {
3007        OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
3008    };
3009    static int const sarv_insn[4] = {
3010        OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
3011    };
3012    static int const shls_insn[4] = {
3013        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
3014    };
3015    static int const shrs_insn[4] = {
3016        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
3017    };
3018    static int const sars_insn[4] = {
3019        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
3020    };
3021    static int const vpshldi_insn[4] = {
3022        OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
3023    };
3024    static int const vpshldv_insn[4] = {
3025        OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
3026    };
3027    static int const vpshrdv_insn[4] = {
3028        OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
3029    };
3030    static int const abs_insn[4] = {
3031        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
3032    };
3033
3034    TCGType type = vecl + TCG_TYPE_V64;
3035    int insn, sub;
3036    TCGArg a0, a1, a2, a3;
3037
3038    a0 = args[0];
3039    a1 = args[1];
3040    a2 = args[2];
3041
3042    switch (opc) {
3043    case INDEX_op_add_vec:
3044        insn = add_insn[vece];
3045        goto gen_simd;
3046    case INDEX_op_ssadd_vec:
3047        insn = ssadd_insn[vece];
3048        goto gen_simd;
3049    case INDEX_op_usadd_vec:
3050        insn = usadd_insn[vece];
3051        goto gen_simd;
3052    case INDEX_op_sub_vec:
3053        insn = sub_insn[vece];
3054        goto gen_simd;
3055    case INDEX_op_sssub_vec:
3056        insn = sssub_insn[vece];
3057        goto gen_simd;
3058    case INDEX_op_ussub_vec:
3059        insn = ussub_insn[vece];
3060        goto gen_simd;
3061    case INDEX_op_mul_vec:
3062        insn = mul_insn[vece];
3063        goto gen_simd;
3064    case INDEX_op_and_vec:
3065        insn = OPC_PAND;
3066        goto gen_simd;
3067    case INDEX_op_or_vec:
3068        insn = OPC_POR;
3069        goto gen_simd;
3070    case INDEX_op_xor_vec:
3071        insn = OPC_PXOR;
3072        goto gen_simd;
3073    case INDEX_op_smin_vec:
3074        insn = smin_insn[vece];
3075        goto gen_simd;
3076    case INDEX_op_umin_vec:
3077        insn = umin_insn[vece];
3078        goto gen_simd;
3079    case INDEX_op_smax_vec:
3080        insn = smax_insn[vece];
3081        goto gen_simd;
3082    case INDEX_op_umax_vec:
3083        insn = umax_insn[vece];
3084        goto gen_simd;
3085    case INDEX_op_shlv_vec:
3086        insn = shlv_insn[vece];
3087        goto gen_simd;
3088    case INDEX_op_shrv_vec:
3089        insn = shrv_insn[vece];
3090        goto gen_simd;
3091    case INDEX_op_sarv_vec:
3092        insn = sarv_insn[vece];
3093        goto gen_simd;
3094    case INDEX_op_rotlv_vec:
3095        insn = rotlv_insn[vece];
3096        goto gen_simd;
3097    case INDEX_op_rotrv_vec:
3098        insn = rotrv_insn[vece];
3099        goto gen_simd;
3100    case INDEX_op_shls_vec:
3101        insn = shls_insn[vece];
3102        goto gen_simd;
3103    case INDEX_op_shrs_vec:
3104        insn = shrs_insn[vece];
3105        goto gen_simd;
3106    case INDEX_op_sars_vec:
3107        insn = sars_insn[vece];
3108        goto gen_simd;
3109    case INDEX_op_x86_punpckl_vec:
3110        insn = punpckl_insn[vece];
3111        goto gen_simd;
3112    case INDEX_op_x86_punpckh_vec:
3113        insn = punpckh_insn[vece];
3114        goto gen_simd;
3115    case INDEX_op_x86_packss_vec:
3116        insn = packss_insn[vece];
3117        goto gen_simd;
3118    case INDEX_op_x86_packus_vec:
3119        insn = packus_insn[vece];
3120        goto gen_simd;
3121    case INDEX_op_x86_vpshldv_vec:
3122        insn = vpshldv_insn[vece];
3123        a1 = a2;
3124        a2 = args[3];
3125        goto gen_simd;
3126    case INDEX_op_x86_vpshrdv_vec:
3127        insn = vpshrdv_insn[vece];
3128        a1 = a2;
3129        a2 = args[3];
3130        goto gen_simd;
3131#if TCG_TARGET_REG_BITS == 32
3132    case INDEX_op_dup2_vec:
3133        /* First merge the two 32-bit inputs to a single 64-bit element. */
3134        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3135        /* Then replicate the 64-bit elements across the rest of the vector. */
3136        if (type != TCG_TYPE_V64) {
3137            tcg_out_dup_vec(s, type, MO_64, a0, a0);
3138        }
3139        break;
3140#endif
3141    case INDEX_op_abs_vec:
3142        insn = abs_insn[vece];
3143        a2 = a1;
3144        a1 = 0;
3145        goto gen_simd;
3146    gen_simd:
3147        tcg_debug_assert(insn != OPC_UD2);
3148        if (type == TCG_TYPE_V256) {
3149            insn |= P_VEXL;
3150        }
3151        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3152        break;
3153
3154    case INDEX_op_cmp_vec:
3155        sub = args[3];
3156        if (sub == TCG_COND_EQ) {
3157            insn = cmpeq_insn[vece];
3158        } else if (sub == TCG_COND_GT) {
3159            insn = cmpgt_insn[vece];
3160        } else {
3161            g_assert_not_reached();
3162        }
3163        goto gen_simd;
3164
3165    case INDEX_op_andc_vec:
3166        insn = OPC_PANDN;
3167        if (type == TCG_TYPE_V256) {
3168            insn |= P_VEXL;
3169        }
3170        tcg_out_vex_modrm(s, insn, a0, a2, a1);
3171        break;
3172
3173    case INDEX_op_shli_vec:
3174        insn = shift_imm_insn[vece];
3175        sub = 6;
3176        goto gen_shift;
3177    case INDEX_op_shri_vec:
3178        insn = shift_imm_insn[vece];
3179        sub = 2;
3180        goto gen_shift;
3181    case INDEX_op_sari_vec:
3182        if (vece == MO_64) {
3183            insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3184        } else {
3185            insn = shift_imm_insn[vece];
3186        }
3187        sub = 4;
3188        goto gen_shift;
3189    case INDEX_op_rotli_vec:
3190        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
3191        if (vece == MO_64) {
3192            insn |= P_VEXW;
3193        }
3194        sub = 1;
3195        goto gen_shift;
3196    gen_shift:
3197        tcg_debug_assert(vece != MO_8);
3198        if (type == TCG_TYPE_V256) {
3199            insn |= P_VEXL;
3200        }
3201        tcg_out_vex_modrm(s, insn, sub, a0, a1);
3202        tcg_out8(s, a2);
3203        break;
3204
3205    case INDEX_op_ld_vec:
3206        tcg_out_ld(s, type, a0, a1, a2);
3207        break;
3208    case INDEX_op_st_vec:
3209        tcg_out_st(s, type, a0, a1, a2);
3210        break;
3211    case INDEX_op_dupm_vec:
3212        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3213        break;
3214
3215    case INDEX_op_x86_shufps_vec:
3216        insn = OPC_SHUFPS;
3217        sub = args[3];
3218        goto gen_simd_imm8;
3219    case INDEX_op_x86_blend_vec:
3220        if (vece == MO_16) {
3221            insn = OPC_PBLENDW;
3222        } else if (vece == MO_32) {
3223            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3224        } else {
3225            g_assert_not_reached();
3226        }
3227        sub = args[3];
3228        goto gen_simd_imm8;
3229    case INDEX_op_x86_vperm2i128_vec:
3230        insn = OPC_VPERM2I128;
3231        sub = args[3];
3232        goto gen_simd_imm8;
3233    case INDEX_op_x86_vpshldi_vec:
3234        insn = vpshldi_insn[vece];
3235        sub = args[3];
3236        goto gen_simd_imm8;
3237
3238    case INDEX_op_not_vec:
3239        insn = OPC_VPTERNLOGQ;
3240        a2 = a1;
3241        sub = 0x33; /* !B */
3242        goto gen_simd_imm8;
3243    case INDEX_op_nor_vec:
3244        insn = OPC_VPTERNLOGQ;
3245        sub = 0x11; /* norCB */
3246        goto gen_simd_imm8;
3247    case INDEX_op_nand_vec:
3248        insn = OPC_VPTERNLOGQ;
3249        sub = 0x77; /* nandCB */
3250        goto gen_simd_imm8;
3251    case INDEX_op_eqv_vec:
3252        insn = OPC_VPTERNLOGQ;
3253        sub = 0x99; /* xnorCB */
3254        goto gen_simd_imm8;
3255    case INDEX_op_orc_vec:
3256        insn = OPC_VPTERNLOGQ;
3257        sub = 0xdd; /* orB!C */
3258        goto gen_simd_imm8;
3259
3260    case INDEX_op_bitsel_vec:
3261        insn = OPC_VPTERNLOGQ;
3262        a3 = args[3];
3263        if (a0 == a1) {
3264            a1 = a2;
3265            a2 = a3;
3266            sub = 0xca; /* A?B:C */
3267        } else if (a0 == a2) {
3268            a2 = a3;
3269            sub = 0xe2; /* B?A:C */
3270        } else {
3271            tcg_out_mov(s, type, a0, a3);
3272            sub = 0xb8; /* B?C:A */
3273        }
3274        goto gen_simd_imm8;
3275
3276    gen_simd_imm8:
3277        tcg_debug_assert(insn != OPC_UD2);
3278        if (type == TCG_TYPE_V256) {
3279            insn |= P_VEXL;
3280        }
3281        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3282        tcg_out8(s, sub);
3283        break;
3284
3285    case INDEX_op_x86_vpblendvb_vec:
3286        insn = OPC_VPBLENDVB;
3287        if (type == TCG_TYPE_V256) {
3288            insn |= P_VEXL;
3289        }
3290        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3291        tcg_out8(s, args[3] << 4);
3292        break;
3293
3294    case INDEX_op_x86_psrldq_vec:
3295        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3296        tcg_out8(s, a2);
3297        break;
3298
3299    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3300    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3301    default:
3302        g_assert_not_reached();
3303    }
3304}
3305
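/*
 * Map each opcode to its operand-constraint set.  The C_O<n>_I<m>()
 * identifiers name sets with <n> output and <m> input operands, spelled
 * with this backend's constraint letters.
 */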
3306static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3307{
3308    switch (op) {
3309    case INDEX_op_goto_ptr:
3310        return C_O0_I1(r);
3311
3312    case INDEX_op_ld8u_i32:
3313    case INDEX_op_ld8u_i64:
3314    case INDEX_op_ld8s_i32:
3315    case INDEX_op_ld8s_i64:
3316    case INDEX_op_ld16u_i32:
3317    case INDEX_op_ld16u_i64:
3318    case INDEX_op_ld16s_i32:
3319    case INDEX_op_ld16s_i64:
3320    case INDEX_op_ld_i32:
3321    case INDEX_op_ld32u_i64:
3322    case INDEX_op_ld32s_i64:
3323    case INDEX_op_ld_i64:
3324        return C_O1_I1(r, r);
3325
3326    case INDEX_op_st8_i32:
3327    case INDEX_op_st8_i64:
3328        return C_O0_I2(qi, r);
3329
3330    case INDEX_op_st16_i32:
3331    case INDEX_op_st16_i64:
3332    case INDEX_op_st_i32:
3333    case INDEX_op_st32_i64:
3334        return C_O0_I2(ri, r);
3335
3336    case INDEX_op_st_i64:
3337        return C_O0_I2(re, r);
3338
3339    case INDEX_op_add_i32:
3340    case INDEX_op_add_i64:
3341        return C_O1_I2(r, r, re);
3342
3343    case INDEX_op_sub_i32:
3344    case INDEX_op_sub_i64:
3345    case INDEX_op_mul_i32:
3346    case INDEX_op_mul_i64:
3347    case INDEX_op_or_i32:
3348    case INDEX_op_or_i64:
3349    case INDEX_op_xor_i32:
3350    case INDEX_op_xor_i64:
3351        return C_O1_I2(r, 0, re);
3352
3353    case INDEX_op_and_i32:
3354    case INDEX_op_and_i64:
3355        return C_O1_I2(r, 0, reZ);
3356
3357    case INDEX_op_andc_i32:
3358    case INDEX_op_andc_i64:
3359        return C_O1_I2(r, r, rI);
3360
3361    case INDEX_op_shl_i32:
3362    case INDEX_op_shl_i64:
3363    case INDEX_op_shr_i32:
3364    case INDEX_op_shr_i64:
3365    case INDEX_op_sar_i32:
3366    case INDEX_op_sar_i64:
3367        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3368
3369    case INDEX_op_rotl_i32:
3370    case INDEX_op_rotl_i64:
3371    case INDEX_op_rotr_i32:
3372    case INDEX_op_rotr_i64:
3373        return C_O1_I2(r, 0, ci);
3374
3375    case INDEX_op_brcond_i32:
3376    case INDEX_op_brcond_i64:
3377        return C_O0_I2(r, re);
3378
3379    case INDEX_op_bswap16_i32:
3380    case INDEX_op_bswap16_i64:
3381    case INDEX_op_bswap32_i32:
3382    case INDEX_op_bswap32_i64:
3383    case INDEX_op_bswap64_i64:
3384    case INDEX_op_neg_i32:
3385    case INDEX_op_neg_i64:
3386    case INDEX_op_not_i32:
3387    case INDEX_op_not_i64:
3388    case INDEX_op_extrh_i64_i32:
3389        return C_O1_I1(r, 0);
3390
3391    case INDEX_op_ext8s_i32:
3392    case INDEX_op_ext8s_i64:
3393    case INDEX_op_ext8u_i32:
3394    case INDEX_op_ext8u_i64:
3395        return C_O1_I1(r, q);
3396
3397    case INDEX_op_ext16s_i32:
3398    case INDEX_op_ext16s_i64:
3399    case INDEX_op_ext16u_i32:
3400    case INDEX_op_ext16u_i64:
3401    case INDEX_op_ext32s_i64:
3402    case INDEX_op_ext32u_i64:
3403    case INDEX_op_ext_i32_i64:
3404    case INDEX_op_extu_i32_i64:
3405    case INDEX_op_extrl_i64_i32:
3406    case INDEX_op_extract_i32:
3407    case INDEX_op_extract_i64:
3408    case INDEX_op_sextract_i32:
3409    case INDEX_op_ctpop_i32:
3410    case INDEX_op_ctpop_i64:
3411        return C_O1_I1(r, r);
3412
3413    case INDEX_op_extract2_i32:
3414    case INDEX_op_extract2_i64:
3415        return C_O1_I2(r, 0, r);
3416
3417    case INDEX_op_deposit_i32:
3418    case INDEX_op_deposit_i64:
3419        return C_O1_I2(q, 0, qi);
3420
3421    case INDEX_op_setcond_i32:
3422    case INDEX_op_setcond_i64:
3423    case INDEX_op_negsetcond_i32:
3424    case INDEX_op_negsetcond_i64:
3425        return C_O1_I2(q, r, re);
3426
3427    case INDEX_op_movcond_i32:
3428    case INDEX_op_movcond_i64:
3429        return C_O1_I4(r, r, re, r, 0);
3430
3431    case INDEX_op_div2_i32:
3432    case INDEX_op_div2_i64:
3433    case INDEX_op_divu2_i32:
3434    case INDEX_op_divu2_i64:
3435        return C_O2_I3(a, d, 0, 1, r);
3436
3437    case INDEX_op_mulu2_i32:
3438    case INDEX_op_mulu2_i64:
3439    case INDEX_op_muls2_i32:
3440    case INDEX_op_muls2_i64:
3441        return C_O2_I2(a, d, a, r);
3442
3443    case INDEX_op_add2_i32:
3444    case INDEX_op_add2_i64:
3445    case INDEX_op_sub2_i32:
3446    case INDEX_op_sub2_i64:
3447        return C_N1_O1_I4(r, r, 0, 1, re, re);
3448
3449    case INDEX_op_ctz_i32:
3450    case INDEX_op_ctz_i64:
3451        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3452
3453    case INDEX_op_clz_i32:
3454    case INDEX_op_clz_i64:
3455        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3456
3457    case INDEX_op_qemu_ld_a32_i32:
3458        return C_O1_I1(r, L);
3459    case INDEX_op_qemu_ld_a64_i32:
3460        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3461
3462    case INDEX_op_qemu_st_a32_i32:
3463        return C_O0_I2(L, L);
3464    case INDEX_op_qemu_st_a64_i32:
3465        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3466    case INDEX_op_qemu_st8_a32_i32:
3467        return C_O0_I2(s, L);
3468    case INDEX_op_qemu_st8_a64_i32:
3469        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3470
3471    case INDEX_op_qemu_ld_a32_i64:
3472        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3473    case INDEX_op_qemu_ld_a64_i64:
3474        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3475
3476    case INDEX_op_qemu_st_a32_i64:
3477        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3478    case INDEX_op_qemu_st_a64_i64:
3479        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3480
3481    case INDEX_op_qemu_ld_a32_i128:
3482    case INDEX_op_qemu_ld_a64_i128:
3483        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3484        return C_O2_I1(r, r, L);
3485    case INDEX_op_qemu_st_a32_i128:
3486    case INDEX_op_qemu_st_a64_i128:
3487        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3488        return C_O0_I3(L, L, L);
3489
3490    case INDEX_op_brcond2_i32:
3491        return C_O0_I4(r, r, ri, ri);
3492
3493    case INDEX_op_setcond2_i32:
3494        return C_O1_I4(r, r, r, ri, ri);
3495
3496    case INDEX_op_ld_vec:
3497    case INDEX_op_dupm_vec:
3498        return C_O1_I1(x, r);
3499
3500    case INDEX_op_st_vec:
3501        return C_O0_I2(x, r);
3502
3503    case INDEX_op_add_vec:
3504    case INDEX_op_sub_vec:
3505    case INDEX_op_mul_vec:
3506    case INDEX_op_and_vec:
3507    case INDEX_op_or_vec:
3508    case INDEX_op_xor_vec:
3509    case INDEX_op_andc_vec:
3510    case INDEX_op_orc_vec:
3511    case INDEX_op_nand_vec:
3512    case INDEX_op_nor_vec:
3513    case INDEX_op_eqv_vec:
3514    case INDEX_op_ssadd_vec:
3515    case INDEX_op_usadd_vec:
3516    case INDEX_op_sssub_vec:
3517    case INDEX_op_ussub_vec:
3518    case INDEX_op_smin_vec:
3519    case INDEX_op_umin_vec:
3520    case INDEX_op_smax_vec:
3521    case INDEX_op_umax_vec:
3522    case INDEX_op_shlv_vec:
3523    case INDEX_op_shrv_vec:
3524    case INDEX_op_sarv_vec:
3525    case INDEX_op_rotlv_vec:
3526    case INDEX_op_rotrv_vec:
3527    case INDEX_op_shls_vec:
3528    case INDEX_op_shrs_vec:
3529    case INDEX_op_sars_vec:
3530    case INDEX_op_cmp_vec:
3531    case INDEX_op_x86_shufps_vec:
3532    case INDEX_op_x86_blend_vec:
3533    case INDEX_op_x86_packss_vec:
3534    case INDEX_op_x86_packus_vec:
3535    case INDEX_op_x86_vperm2i128_vec:
3536    case INDEX_op_x86_punpckl_vec:
3537    case INDEX_op_x86_punpckh_vec:
3538    case INDEX_op_x86_vpshldi_vec:
3539#if TCG_TARGET_REG_BITS == 32
3540    case INDEX_op_dup2_vec:
3541#endif
3542        return C_O1_I2(x, x, x);
3543
3544    case INDEX_op_abs_vec:
3545    case INDEX_op_dup_vec:
3546    case INDEX_op_not_vec:
3547    case INDEX_op_shli_vec:
3548    case INDEX_op_shri_vec:
3549    case INDEX_op_sari_vec:
3550    case INDEX_op_rotli_vec:
3551    case INDEX_op_x86_psrldq_vec:
3552        return C_O1_I1(x, x);
3553
3554    case INDEX_op_x86_vpshldv_vec:
3555    case INDEX_op_x86_vpshrdv_vec:
3556        return C_O1_I3(x, 0, x, x);
3557
3558    case INDEX_op_bitsel_vec:
3559    case INDEX_op_x86_vpblendvb_vec:
3560        return C_O1_I3(x, x, x, x);
3561
3562    default:
3563        g_assert_not_reached();
3564    }
3565}
3566
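/*
 * Report how a vector opcode is handled for the given type and element
 * size: 1 means it is emitted directly, -1 means it is supported via
 * expansion in tcg_expand_vec_op, and 0 means it is not supported.
 */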
3567int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3568{
3569    switch (opc) {
3570    case INDEX_op_add_vec:
3571    case INDEX_op_sub_vec:
3572    case INDEX_op_and_vec:
3573    case INDEX_op_or_vec:
3574    case INDEX_op_xor_vec:
3575    case INDEX_op_andc_vec:
3576    case INDEX_op_orc_vec:
3577    case INDEX_op_nand_vec:
3578    case INDEX_op_nor_vec:
3579    case INDEX_op_eqv_vec:
3580    case INDEX_op_not_vec:
3581    case INDEX_op_bitsel_vec:
3582        return 1;
3583    case INDEX_op_cmp_vec:
3584    case INDEX_op_cmpsel_vec:
3585        return -1;
3586
3587    case INDEX_op_rotli_vec:
3588        return have_avx512vl && vece >= MO_32 ? 1 : -1;
3589
3590    case INDEX_op_shli_vec:
3591    case INDEX_op_shri_vec:
3592        /* We must expand the operation for MO_8.  */
3593        return vece == MO_8 ? -1 : 1;
3594
3595    case INDEX_op_sari_vec:
3596        switch (vece) {
3597        case MO_8:
3598            return -1;
3599        case MO_16:
3600        case MO_32:
3601            return 1;
3602        case MO_64:
3603            if (have_avx512vl) {
3604                return 1;
3605            }
3606            /*
3607             * We can emulate this for MO_64, but it does not pay off
3608             * unless we're producing at least 4 values.
3609             */
3610            return type >= TCG_TYPE_V256 ? -1 : 0;
3611        }
3612        return 0;
3613
3614    case INDEX_op_shls_vec:
3615    case INDEX_op_shrs_vec:
3616        return vece >= MO_16;
3617    case INDEX_op_sars_vec:
3618        switch (vece) {
3619        case MO_16:
3620        case MO_32:
3621            return 1;
3622        case MO_64:
3623            return have_avx512vl;
3624        }
3625        return 0;
3626    case INDEX_op_rotls_vec:
3627        return vece >= MO_16 ? -1 : 0;
3628
3629    case INDEX_op_shlv_vec:
3630    case INDEX_op_shrv_vec:
3631        switch (vece) {
3632        case MO_16:
3633            return have_avx512bw;
3634        case MO_32:
3635        case MO_64:
3636            return have_avx2;
3637        }
3638        return 0;
3639    case INDEX_op_sarv_vec:
3640        switch (vece) {
3641        case MO_16:
3642            return have_avx512bw;
3643        case MO_32:
3644            return have_avx2;
3645        case MO_64:
3646            return have_avx512vl;
3647        }
3648        return 0;
3649    case INDEX_op_rotlv_vec:
3650    case INDEX_op_rotrv_vec:
3651        switch (vece) {
3652        case MO_16:
3653            return have_avx512vbmi2 ? -1 : 0;
3654        case MO_32:
3655        case MO_64:
3656            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3657        }
3658        return 0;
3659
3660    case INDEX_op_mul_vec:
3661        switch (vece) {
3662        case MO_8:
3663            return -1;
3664        case MO_64:
3665            return have_avx512dq;
3666        }
3667        return 1;
3668
3669    case INDEX_op_ssadd_vec:
3670    case INDEX_op_usadd_vec:
3671    case INDEX_op_sssub_vec:
3672    case INDEX_op_ussub_vec:
3673        return vece <= MO_16;
3674    case INDEX_op_smin_vec:
3675    case INDEX_op_smax_vec:
3676    case INDEX_op_umin_vec:
3677    case INDEX_op_umax_vec:
3678    case INDEX_op_abs_vec:
3679        return vece <= MO_32 || have_avx512vl;
3680
3681    default:
3682        return 0;
3683    }
3684}
3685
3686static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3687                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3688{
3689    TCGv_vec t1, t2;
3690
3691    tcg_debug_assert(vece == MO_8);
3692
3693    t1 = tcg_temp_new_vec(type);
3694    t2 = tcg_temp_new_vec(type);
3695
3696    /*
3697     * Unpack to W, shift, and repack.  Tricky bits:
3698     * (1) Use punpck*bw x,x to produce DDCCBBAA,
3699     *     i.e. duplicate into the other half of the 16-bit lane.
3700     * (2) For right-shift, add 8 so that the high half of the lane
3701     *     becomes zero.  For left-shift and left-rotate, we must
3702     *     shift up and down again.
3703     * (3) Step 2 leaves high half zero such that PACKUSWB
3704     *     (pack with unsigned saturation) does not modify
3705     *     the quantity.
3706     */
3707    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3708              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3709    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3710              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3711
3712    if (opc != INDEX_op_rotli_vec) {
3713        imm += 8;
3714    }
3715    if (opc == INDEX_op_shri_vec) {
3716        tcg_gen_shri_vec(MO_16, t1, t1, imm);
3717        tcg_gen_shri_vec(MO_16, t2, t2, imm);
3718    } else {
3719        tcg_gen_shli_vec(MO_16, t1, t1, imm);
3720        tcg_gen_shli_vec(MO_16, t2, t2, imm);
3721        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3722        tcg_gen_shri_vec(MO_16, t2, t2, 8);
3723    }
3724
3725    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3726              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3727    tcg_temp_free_vec(t1);
3728    tcg_temp_free_vec(t2);
3729}
3730
3731static void expand_vec_sari(TCGType type, unsigned vece,
3732                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3733{
3734    TCGv_vec t1, t2;
3735
3736    switch (vece) {
3737    case MO_8:
3738        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3739        t1 = tcg_temp_new_vec(type);
3740        t2 = tcg_temp_new_vec(type);
3741        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3742                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3743        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3744                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3745        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3746        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3747        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3748                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3749        tcg_temp_free_vec(t1);
3750        tcg_temp_free_vec(t2);
3751        break;
3752
3753    case MO_64:
3754        t1 = tcg_temp_new_vec(type);
3755        if (imm <= 32) {
3756            /*
3757             * We can emulate a small sign extend by performing an arithmetic
3758             * 32-bit shift and overwriting the high half of a 64-bit logical
3759             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3760             * does not, so we have to bound the smaller shift -- we get the
3761             * same result in the high half either way.
3762             */
3763            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3764            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3765            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3766                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3767                      tcgv_vec_arg(t1), 0xaa);
3768        } else {
3769            /* Otherwise we will need to use a compare vs 0 to produce
3770             * the sign-extend, shift and merge.
3771             */
3772            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3773                            tcg_constant_vec(type, MO_64, 0), v1);
3774            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3775            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3776            tcg_gen_or_vec(MO_64, v0, v0, t1);
3777        }
3778        tcg_temp_free_vec(t1);
3779        break;
3780
3781    default:
3782        g_assert_not_reached();
3783    }
3784}
3785
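/*
 * Expand a rotate-left by immediate.  MO_8 reuses the byte shift
 * expansion; with AVX-512 VBMI2 a funnel shift of the input with itself
 * does the rotate directly; otherwise combine opposing immediate shifts
 * with an OR.
 */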
3786static void expand_vec_rotli(TCGType type, unsigned vece,
3787                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3788{
3789    TCGv_vec t;
3790
3791    if (vece == MO_8) {
3792        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3793        return;
3794    }
3795
3796    if (have_avx512vbmi2) {
3797        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3798                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3799        return;
3800    }
3801
3802    t = tcg_temp_new_vec(type);
3803    tcg_gen_shli_vec(vece, t, v1, imm);
3804    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3805    tcg_gen_or_vec(vece, v0, v0, t);
3806    tcg_temp_free_vec(t);
3807}
3808
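/*
 * Expand a rotate by per-element (vector) counts.  With AVX-512 VBMI2
 * this is a funnel shift with both data operands equal; otherwise it is
 * composed from the two opposing variable shifts and an OR.
 */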
3809static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3810                            TCGv_vec v1, TCGv_vec sh, bool right)
3811{
3812    TCGv_vec t;
3813
3814    if (have_avx512vbmi2) {
3815        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3816                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3817                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3818        return;
3819    }
3820
3821    t = tcg_temp_new_vec(type);
3822    tcg_gen_dupi_vec(vece, t, 8 << vece);
3823    tcg_gen_sub_vec(vece, t, t, sh);
3824    if (right) {
3825        tcg_gen_shlv_vec(vece, t, v1, t);
3826        tcg_gen_shrv_vec(vece, v0, v1, sh);
3827    } else {
3828        tcg_gen_shrv_vec(vece, t, v1, t);
3829        tcg_gen_shlv_vec(vece, v0, v1, sh);
3830    }
3831    tcg_gen_or_vec(vece, v0, v0, t);
3832    tcg_temp_free_vec(t);
3833}
3834
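/* Expand a rotate-left of vector elements by a scalar i32 count. */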
3835static void expand_vec_rotls(TCGType type, unsigned vece,
3836                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3837{
3838    TCGv_vec t = tcg_temp_new_vec(type);
3839
3840    tcg_debug_assert(vece != MO_8);
3841
3842    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3843        tcg_gen_dup_i32_vec(vece, t, lsh);
3844        if (vece >= MO_32) {
3845            tcg_gen_rotlv_vec(vece, v0, v1, t);
3846        } else {
3847            expand_vec_rotv(type, vece, v0, v1, t, false);
3848        }
3849    } else {
3850        TCGv_i32 rsh = tcg_temp_new_i32();
3851
3852        tcg_gen_neg_i32(rsh, lsh);
3853        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3854        tcg_gen_shls_vec(vece, t, v1, lsh);
3855        tcg_gen_shrs_vec(vece, v0, v1, rsh);
3856        tcg_gen_or_vec(vece, v0, v0, t);
3857
3858        tcg_temp_free_i32(rsh);
3859    }
3860
3861    tcg_temp_free_vec(t);
3862}
3863
3864static void expand_vec_mul(TCGType type, unsigned vece,
3865                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3866{
3867    TCGv_vec t1, t2, t3, t4, zero;
3868
3869    tcg_debug_assert(vece == MO_8);
3870
3871    /*
3872     * Unpack v1 bytes to words, 0 | x.
3873     * Unpack v2 bytes to words, y | 0.
3874     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3875     * Shift logical right by 8 bits to clear the high 8 bits before
3876     * using an unsigned saturated pack.
3877     *
3878     * The difference between the V64, V128 and V256 cases is merely how
3879     * we distribute the expansion between temporaries.
3880     */
3881    switch (type) {
3882    case TCG_TYPE_V64:
3883        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3884        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3885        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3886        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3887                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3888        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3889                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3890        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3891        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3892        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3893                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3894        tcg_temp_free_vec(t1);
3895        tcg_temp_free_vec(t2);
3896        break;
3897
3898    case TCG_TYPE_V128:
3899    case TCG_TYPE_V256:
3900        t1 = tcg_temp_new_vec(type);
3901        t2 = tcg_temp_new_vec(type);
3902        t3 = tcg_temp_new_vec(type);
3903        t4 = tcg_temp_new_vec(type);
3904        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3905        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3906                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3907        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3908                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3909        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3910                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3911        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3912                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3913        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3914        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3915        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3916        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3917        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3918                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3919        tcg_temp_free_vec(t1);
3920        tcg_temp_free_vec(t2);
3921        tcg_temp_free_vec(t3);
3922        tcg_temp_free_vec(t4);
3923        break;
3924
3925    default:
3926        g_assert_not_reached();
3927    }
3928}
3929
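/*
 * Emit a vector comparison using only the EQ/GT forms provided by the
 * ISA, adjusting operands and condition as needed.  Return true if the
 * caller must invert (NOT) the result to realize the requested @cond.
 */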
3930static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3931                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3932{
3933    enum {
3934        NEED_INV  = 1,
3935        NEED_SWAP = 2,
3936        NEED_BIAS = 4,
3937        NEED_UMIN = 8,
3938        NEED_UMAX = 16,
3939    };
3940    TCGv_vec t1, t2, t3;
3941    uint8_t fixup;
3942
3943    switch (cond) {
3944    case TCG_COND_EQ:
3945    case TCG_COND_GT:
3946        fixup = 0;
3947        break;
3948    case TCG_COND_NE:
3949    case TCG_COND_LE:
3950        fixup = NEED_INV;
3951        break;
3952    case TCG_COND_LT:
3953        fixup = NEED_SWAP;
3954        break;
3955    case TCG_COND_GE:
3956        fixup = NEED_SWAP | NEED_INV;
3957        break;
3958    case TCG_COND_LEU:
3959        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3960            fixup = NEED_UMIN;
3961        } else {
3962            fixup = NEED_BIAS | NEED_INV;
3963        }
3964        break;
3965    case TCG_COND_GTU:
3966        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3967            fixup = NEED_UMIN | NEED_INV;
3968        } else {
3969            fixup = NEED_BIAS;
3970        }
3971        break;
3972    case TCG_COND_GEU:
3973        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3974            fixup = NEED_UMAX;
3975        } else {
3976            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3977        }
3978        break;
3979    case TCG_COND_LTU:
3980        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3981            fixup = NEED_UMAX | NEED_INV;
3982        } else {
3983            fixup = NEED_BIAS | NEED_SWAP;
3984        }
3985        break;
3986    default:
3987        g_assert_not_reached();
3988    }
3989
3990    if (fixup & NEED_INV) {
3991        cond = tcg_invert_cond(cond);
3992    }
3993    if (fixup & NEED_SWAP) {
3994        t1 = v1, v1 = v2, v2 = t1;
3995        cond = tcg_swap_cond(cond);
3996    }
3997
3998    t1 = t2 = NULL;
3999    if (fixup & (NEED_UMIN | NEED_UMAX)) {
4000        t1 = tcg_temp_new_vec(type);
4001        if (fixup & NEED_UMIN) {
4002            tcg_gen_umin_vec(vece, t1, v1, v2);
4003        } else {
4004            tcg_gen_umax_vec(vece, t1, v1, v2);
4005        }
4006        v2 = t1;
4007        cond = TCG_COND_EQ;
4008    } else if (fixup & NEED_BIAS) {
4009        t1 = tcg_temp_new_vec(type);
4010        t2 = tcg_temp_new_vec(type);
4011        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
4012        tcg_gen_sub_vec(vece, t1, v1, t3);
4013        tcg_gen_sub_vec(vece, t2, v2, t3);
4014        v1 = t1;
4015        v2 = t2;
4016        cond = tcg_signed_cond(cond);
4017    }
4018
4019    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
4020    /* Expand directly; do not recurse.  */
4021    vec_gen_4(INDEX_op_cmp_vec, type, vece,
4022              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
4023
4024    if (t1) {
4025        tcg_temp_free_vec(t1);
4026        if (t2) {
4027            tcg_temp_free_vec(t2);
4028        }
4029    }
4030    return fixup & NEED_INV;
4031}
4032
4033static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
4034                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
4035{
4036    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
4037        tcg_gen_not_vec(vece, v0, v0);
4038    }
4039}
4040
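/*
 * Expand cmpsel as a compare followed by a variable byte blend.  The
 * compare yields an all-ones or all-zeros mask per element, so the
 * byte-granular VPBLENDVB selects correctly for any element size.
 */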
4041static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
4042                              TCGv_vec c1, TCGv_vec c2,
4043                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
4044{
4045    TCGv_vec t = tcg_temp_new_vec(type);
4046
4047    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
4048        /* Invert the sense of the compare by swapping arguments.  */
4049        TCGv_vec x;
4050        x = v3, v3 = v4, v4 = x;
4051    }
4052    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
4053              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
4054              tcgv_vec_arg(v3), tcgv_vec_arg(t));
4055    tcg_temp_free_vec(t);
4056}
4057
4058void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
4059                       TCGArg a0, ...)
4060{
4061    va_list va;
4062    TCGArg a2;
4063    TCGv_vec v0, v1, v2, v3, v4;
4064
4065    va_start(va, a0);
4066    v0 = temp_tcgv_vec(arg_temp(a0));
4067    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4068    a2 = va_arg(va, TCGArg);
4069
4070    switch (opc) {
4071    case INDEX_op_shli_vec:
4072    case INDEX_op_shri_vec:
4073        expand_vec_shi(type, vece, opc, v0, v1, a2);
4074        break;
4075
4076    case INDEX_op_sari_vec:
4077        expand_vec_sari(type, vece, v0, v1, a2);
4078        break;
4079
4080    case INDEX_op_rotli_vec:
4081        expand_vec_rotli(type, vece, v0, v1, a2);
4082        break;
4083
4084    case INDEX_op_rotls_vec:
4085        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
4086        break;
4087
4088    case INDEX_op_rotlv_vec:
4089        v2 = temp_tcgv_vec(arg_temp(a2));
4090        expand_vec_rotv(type, vece, v0, v1, v2, false);
4091        break;
4092    case INDEX_op_rotrv_vec:
4093        v2 = temp_tcgv_vec(arg_temp(a2));
4094        expand_vec_rotv(type, vece, v0, v1, v2, true);
4095        break;
4096
4097    case INDEX_op_mul_vec:
4098        v2 = temp_tcgv_vec(arg_temp(a2));
4099        expand_vec_mul(type, vece, v0, v1, v2);
4100        break;
4101
4102    case INDEX_op_cmp_vec:
4103        v2 = temp_tcgv_vec(arg_temp(a2));
4104        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
4105        break;
4106
4107    case INDEX_op_cmpsel_vec:
4108        v2 = temp_tcgv_vec(arg_temp(a2));
4109        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4110        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4111        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
4112        break;
4113
4114    default:
4115        break;
4116    }
4117
4118    va_end(va);
4119}
4120
4121static const int tcg_target_callee_save_regs[] = {
4122#if TCG_TARGET_REG_BITS == 64
4123    TCG_REG_RBP,
4124    TCG_REG_RBX,
4125#if defined(_WIN64)
4126    TCG_REG_RDI,
4127    TCG_REG_RSI,
4128#endif
4129    TCG_REG_R12,
4130    TCG_REG_R13,
4131    TCG_REG_R14, /* Currently used for the global env. */
4132    TCG_REG_R15,
4133#else
4134    TCG_REG_EBP, /* Currently used for the global env. */
4135    TCG_REG_EBX,
4136    TCG_REG_ESI,
4137    TCG_REG_EDI,
4138#endif
4139};
4140
4141/* Compute frame size via macros, to share between tcg_target_qemu_prologue
4142   and tcg_register_jit.  */
4143
4144#define PUSH_SIZE \
4145    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
4146     * (TCG_TARGET_REG_BITS / 8))
4147
4148#define FRAME_SIZE \
4149    ((PUSH_SIZE \
4150      + TCG_STATIC_CALL_ARGS_SIZE \
4151      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
4152      + TCG_TARGET_STACK_ALIGN - 1) \
4153     & ~(TCG_TARGET_STACK_ALIGN - 1))
4154
4155/* Generate global QEMU prologue and epilogue code */
4156static void tcg_target_qemu_prologue(TCGContext *s)
4157{
4158    int i, stack_addend;
4159
4160    /* TB prologue */
4161
4162    /* Reserve some stack space, also for TCG temps.  */
4163    stack_addend = FRAME_SIZE - PUSH_SIZE;
4164    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4165                  CPU_TEMP_BUF_NLONGS * sizeof(long));
4166
4167    /* Save all callee saved registers.  */
4168    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4169        tcg_out_push(s, tcg_target_callee_save_regs[i]);
4170    }
4171
4172    if (!tcg_use_softmmu && guest_base) {
4173        int seg = setup_guest_base_seg();
4174        if (seg != 0) {
4175            x86_guest_base.seg = seg;
4176        } else if (guest_base == (int32_t)guest_base) {
4177            x86_guest_base.ofs = guest_base;
4178        } else {
4179            assert(TCG_TARGET_REG_BITS == 64);
4180            /* Choose R12 because, as a base, it requires a SIB byte. */
4181            x86_guest_base.index = TCG_REG_R12;
4182            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
4183            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4184        }
4185    }
4186
4187    if (TCG_TARGET_REG_BITS == 32) {
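        /*
         * The generated code is entered as func(env, tb): fetch env,
         * the first stack argument, into TCG_AREG0; the TB pointer,
         * the second stack argument, is jumped through below.
         */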
4188        tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4189                   (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4190        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4191        /* jmp *tb.  */
4192        tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4193                             (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4194                             + stack_addend);
4195    } else {
4196        tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4197        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4198        /* jmp *tb.  */
4199        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4200    }
4201
4202    /*
4203     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4204     * and fall through to the rest of the epilogue.
4205     */
4206    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4207    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4208
4209    /* TB epilogue */
4210    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4211
4212    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4213
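    /*
     * If AVX was used, clear the upper halves of the YMM registers
     * before returning, to avoid SSE/AVX transition penalties in the
     * caller.
     */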
4214    if (have_avx2) {
4215        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4216    }
4217    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4218        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4219    }
4220    tcg_out_opc(s, OPC_RET, 0, 0, 0);
4221}
4222
4223static void tcg_out_tb_start(TCGContext *s)
4224{
4225    /* nothing to do */
4226}
4227
4228static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4229{
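    /* 0x90 is the one-byte x86 NOP instruction. */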
4230    memset(p, 0x90, count);
4231}
4232
4233static void tcg_target_init(TCGContext *s)
4234{
4235    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4236    if (TCG_TARGET_REG_BITS == 64) {
4237        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4238    }
4239    if (have_avx1) {
4240        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4241        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4242    }
4243    if (have_avx2) {
4244        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4245    }
4246
4247    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4248    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4249    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4250    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4251    if (TCG_TARGET_REG_BITS == 64) {
4252#if !defined(_WIN64)
4253        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4254        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4255#endif
4256        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4257        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4258        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4259        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4260    }
4261
4262    s->reserved_regs = 0;
4263    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4264    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4265#ifdef _WIN64
4266    /* These are call saved, and we don't save them, so don't use them. */
4267    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4268    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4269    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4270    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4271    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4272    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4273    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4274    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4275    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4276    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4277#endif
4278}
4279
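/*
 * DWARF unwind information covering the prologue's frame layout,
 * registered with the debugger via tcg_register_jit below.
 */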
4280typedef struct {
4281    DebugFrameHeader h;
4282    uint8_t fde_def_cfa[4];
4283    uint8_t fde_reg_ofs[14];
4284} DebugFrame;
4285
4286/* We're expecting a 2 byte uleb128 encoded value.  */
4287QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4288
4289#if !defined(__ELF__)
4290    /* Host machine without ELF. */
4291#elif TCG_TARGET_REG_BITS == 64
4292#define ELF_HOST_MACHINE EM_X86_64
4293static const DebugFrame debug_frame = {
4294    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4295    .h.cie.id = -1,
4296    .h.cie.version = 1,
4297    .h.cie.code_align = 1,
4298    .h.cie.data_align = 0x78,             /* sleb128 -8 */
4299    .h.cie.return_column = 16,
4300
4301    /* Total FDE size does not include the "len" member.  */
4302    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4303
4304    .fde_def_cfa = {
4305        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
4306        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4307        (FRAME_SIZE >> 7)
4308    },
4309    .fde_reg_ofs = {
4310        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
4311        /* The following ordering must match tcg_target_callee_save_regs.  */
4312        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
4313        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
4314        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
4315        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
4316        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
4317        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
4318    }
4319};
4320#else
4321#define ELF_HOST_MACHINE EM_386
4322static const DebugFrame debug_frame = {
4323    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4324    .h.cie.id = -1,
4325    .h.cie.version = 1,
4326    .h.cie.code_align = 1,
4327    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
4328    .h.cie.return_column = 8,
4329
4330    /* Total FDE size does not include the "len" member.  */
4331    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4332
4333    .fde_def_cfa = {
4334        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
4335        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4336        (FRAME_SIZE >> 7)
4337    },
4338    .fde_reg_ofs = {
4339        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4340        /* The following ordering must match tcg_target_callee_save_regs.  */
4341        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4342        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4343        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4344        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
4345    }
4346};
4347#endif
4348
4349#if defined(ELF_HOST_MACHINE)
4350void tcg_register_jit(const void *buf, size_t buf_size)
4351{
4352    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
4353}
4354#endif
4355