xref: /openbmc/qemu/tcg/i386/tcg-target.c.inc (revision 40f23e4e)
1/*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#include "../tcg-pool.c.inc"
26
27#ifdef CONFIG_DEBUG_TCG
28static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
29#if TCG_TARGET_REG_BITS == 64
30    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
31#else
32    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
33#endif
34    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
35    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
36#if TCG_TARGET_REG_BITS == 64
37    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
38    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
39#endif
40};
41#endif
42
43static const int tcg_target_reg_alloc_order[] = {
44#if TCG_TARGET_REG_BITS == 64
45    TCG_REG_RBP,
46    TCG_REG_RBX,
47    TCG_REG_R12,
48    TCG_REG_R13,
49    TCG_REG_R14,
50    TCG_REG_R15,
51    TCG_REG_R10,
52    TCG_REG_R11,
53    TCG_REG_R9,
54    TCG_REG_R8,
55    TCG_REG_RCX,
56    TCG_REG_RDX,
57    TCG_REG_RSI,
58    TCG_REG_RDI,
59    TCG_REG_RAX,
60#else
61    TCG_REG_EBX,
62    TCG_REG_ESI,
63    TCG_REG_EDI,
64    TCG_REG_EBP,
65    TCG_REG_ECX,
66    TCG_REG_EDX,
67    TCG_REG_EAX,
68#endif
69    TCG_REG_XMM0,
70    TCG_REG_XMM1,
71    TCG_REG_XMM2,
72    TCG_REG_XMM3,
73    TCG_REG_XMM4,
74    TCG_REG_XMM5,
75#ifndef _WIN64
76    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
77       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
78    TCG_REG_XMM6,
79    TCG_REG_XMM7,
80#if TCG_TARGET_REG_BITS == 64
81    TCG_REG_XMM8,
82    TCG_REG_XMM9,
83    TCG_REG_XMM10,
84    TCG_REG_XMM11,
85    TCG_REG_XMM12,
86    TCG_REG_XMM13,
87    TCG_REG_XMM14,
88    TCG_REG_XMM15,
89#endif
90#endif
91};
92
93static const int tcg_target_call_iarg_regs[] = {
94#if TCG_TARGET_REG_BITS == 64
95#if defined(_WIN64)
96    TCG_REG_RCX,
97    TCG_REG_RDX,
98#else
99    TCG_REG_RDI,
100    TCG_REG_RSI,
101    TCG_REG_RDX,
102    TCG_REG_RCX,
103#endif
104    TCG_REG_R8,
105    TCG_REG_R9,
106#else
107    /* 32-bit mode uses a stack-based calling convention (GCC default). */
108#endif
109};
110
111static const int tcg_target_call_oarg_regs[] = {
112    TCG_REG_EAX,
113#if TCG_TARGET_REG_BITS == 32
114    TCG_REG_EDX
115#endif
116};
117
118/* Constants we accept.  */
119#define TCG_CT_CONST_S32 0x100
120#define TCG_CT_CONST_U32 0x200
121#define TCG_CT_CONST_I32 0x400
122#define TCG_CT_CONST_WSZ 0x800
123
124/* Registers used with the L constraint, which are the first argument
125   registers on x86_64, and two arbitrary call-clobbered registers on
126   i386. */
127#if TCG_TARGET_REG_BITS == 64
128# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
129# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
130#else
131# define TCG_REG_L0 TCG_REG_EAX
132# define TCG_REG_L1 TCG_REG_EDX
133#endif
134
135#define ALL_BYTEH_REGS         0x0000000fu
136#if TCG_TARGET_REG_BITS == 64
137# define ALL_GENERAL_REGS      0x0000ffffu
138# define ALL_VECTOR_REGS       0xffff0000u
139# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
140#else
141# define ALL_GENERAL_REGS      0x000000ffu
142# define ALL_VECTOR_REGS       0x00ff0000u
143# define ALL_BYTEL_REGS        ALL_BYTEH_REGS
144#endif
145#ifdef CONFIG_SOFTMMU
146# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
147#else
148# define SOFTMMU_RESERVE_REGS  0
149#endif
150
151/* The host compiler should supply <cpuid.h> to enable runtime feature
152   detection, as we're not going to go so far as writing our own inline assembly.
153   If it is not available, default values will be assumed.  */
154#if defined(CONFIG_CPUID_H)
155#include "qemu/cpuid.h"
156#endif
157
158/* For 64-bit, we always know that CMOV is available.  */
159#if TCG_TARGET_REG_BITS == 64
160# define have_cmov 1
161#elif defined(CONFIG_CPUID_H)
162static bool have_cmov;
163#else
164# define have_cmov 0
165#endif
166
167/* We need these symbols in tcg-target.h, and we can't properly conditionalize
168   them there.  Therefore we always define the variables.  */
169bool have_bmi1;
170bool have_popcnt;
171bool have_avx1;
172bool have_avx2;
173bool have_movbe;
174
175#ifdef CONFIG_CPUID_H
176static bool have_bmi2;
177static bool have_lzcnt;
178#else
179# define have_bmi2 0
180# define have_lzcnt 0
181#endif
182
183static const tcg_insn_unit *tb_ret_addr;
184
185static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
186                        intptr_t value, intptr_t addend)
187{
188    value += addend;
189    switch(type) {
190    case R_386_PC32:
191        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
192        if (value != (int32_t)value) {
193            return false;
194        }
195        /* FALLTHRU */
196    case R_386_32:
197        tcg_patch32(code_ptr, value);
198        break;
199    case R_386_PC8:
200        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
201        if (value != (int8_t)value) {
202            return false;
203        }
204        tcg_patch8(code_ptr, value);
205        break;
206    default:
207        tcg_abort();
208    }
209    return true;
210}
211
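/*
 * A worked example of the R_386_PC32 case above, with hypothetical
 * addresses: if the execute-side address of the 4-byte field at code_ptr
 * is 0x1000, value = 0x1080 and addend = -4, the field is patched with
 * 0x1080 - 4 - 0x1000 = 0x7c.  The CPU later adds that displacement to
 * the rip past the field (0x1004), reaching 0x1080; the -4 addend used
 * with tcg_out_reloc()/new_pool_label() below exists precisely to account
 * for that "end of field" reference point.
 */
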
212/* test if a constant matches the constraint */
213static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
214{
215    if (ct & TCG_CT_CONST) {
216        return 1;
217    }
218    if (type == TCG_TYPE_I32) {
219        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
220            return 1;
221        }
222    } else {
223        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
224            return 1;
225        }
226        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
227            return 1;
228        }
229        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
230            return 1;
231        }
232    }
233    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
234        return 1;
235    }
236    return 0;
237}
238
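/*
 * For example, with type == TCG_TYPE_I64: val = -1 matches
 * TCG_CT_CONST_S32 (unchanged by sign-extension from 32 bits), while
 * val = 0x80000000 matches TCG_CT_CONST_U32 but not TCG_CT_CONST_S32.
 */
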
239# define LOWREGMASK(x)	((x) & 7)
240
241#define P_EXT		0x100		/* 0x0f opcode prefix */
242#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
243#define P_DATA16        0x400           /* 0x66 opcode prefix */
244#if TCG_TARGET_REG_BITS == 64
245# define P_REXW         0x1000          /* Set REX.W = 1 */
246# define P_REXB_R       0x2000          /* REG field as byte register */
247# define P_REXB_RM      0x4000          /* R/M field as byte register */
248# define P_GS           0x8000          /* gs segment override */
249#else
250# define P_REXW		0
251# define P_REXB_R	0
252# define P_REXB_RM	0
253# define P_GS           0
254#endif
255#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
256#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
257#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
258#define P_VEXL          0x80000         /* Set VEX.L = 1 */
259
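/*
 * How the P_* flags compose with the opcode constants that follow: only
 * the low byte is the final opcode, while the flags select prefix and
 * escape bytes in tcg_out_opc()/tcg_out_vex_opc() below.  For example,
 * OPC_MOVZBL = 0xb6 | P_EXT is emitted as 0x0f 0xb6, and
 * OPC_PADDW = 0xfd | P_EXT | P_DATA16 is emitted as 0x66 0x0f 0xfd.
 */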
260#define OPC_ARITH_EvIz	(0x81)
261#define OPC_ARITH_EvIb	(0x83)
262#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
263#define OPC_ANDN        (0xf2 | P_EXT38)
264#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
265#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
266#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
267#define OPC_BSF         (0xbc | P_EXT)
268#define OPC_BSR         (0xbd | P_EXT)
269#define OPC_BSWAP	(0xc8 | P_EXT)
270#define OPC_CALL_Jz	(0xe8)
271#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
272#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
273#define OPC_DEC_r32	(0x48)
274#define OPC_IMUL_GvEv	(0xaf | P_EXT)
275#define OPC_IMUL_GvEvIb	(0x6b)
276#define OPC_IMUL_GvEvIz	(0x69)
277#define OPC_INC_r32	(0x40)
278#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
279#define OPC_JCC_short	(0x70)		/* ... plus condition code */
280#define OPC_JMP_long	(0xe9)
281#define OPC_JMP_short	(0xeb)
282#define OPC_LEA         (0x8d)
283#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
284#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
285#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
286#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
287#define OPC_MOVB_EvIz   (0xc6)
288#define OPC_MOVL_EvIz	(0xc7)
289#define OPC_MOVL_Iv     (0xb8)
290#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
291#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
292#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
293#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
294#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
295#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
296#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
297#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
298#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
299#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
300#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
301#define OPC_MOVSBL	(0xbe | P_EXT)
302#define OPC_MOVSWL	(0xbf | P_EXT)
303#define OPC_MOVSLQ	(0x63 | P_REXW)
304#define OPC_MOVZBL	(0xb6 | P_EXT)
305#define OPC_MOVZWL	(0xb7 | P_EXT)
306#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
307#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
308#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
309#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
310#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
311#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
312#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
313#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
314#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
315#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
316#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
317#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
318#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
319#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
320#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
321#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
322#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
323#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
324#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
325#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
326#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
327#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
328#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
329#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
330#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
331#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
332#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
333#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
334#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
335#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
336#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
337#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
338#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
339#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
340#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
341#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
342#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
343#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
344#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
345#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
346#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
347#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
348#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
349#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
350#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
351#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
352#define OPC_POR         (0xeb | P_EXT | P_DATA16)
353#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
354#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
355#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
356#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
357#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
358#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
359#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
360#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
361#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
362#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
363#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
364#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
365#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
366#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
367#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
368#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
369#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
370#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
371#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
372#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
373#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
374#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
375#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
376#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
377#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
378#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
379#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
380#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
381#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
382#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
383#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
384#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
385#define OPC_POP_r32	(0x58)
386#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
387#define OPC_PUSH_r32	(0x50)
388#define OPC_PUSH_Iv	(0x68)
389#define OPC_PUSH_Ib	(0x6a)
390#define OPC_RET		(0xc3)
391#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
392#define OPC_SHIFT_1	(0xd1)
393#define OPC_SHIFT_Ib	(0xc1)
394#define OPC_SHIFT_cl	(0xd3)
395#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
396#define OPC_SHUFPS      (0xc6 | P_EXT)
397#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
398#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
399#define OPC_SHRD_Ib     (0xac | P_EXT)
400#define OPC_TESTL	(0x85)
401#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
402#define OPC_UD2         (0x0b | P_EXT)
403#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
404#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
405#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
406#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
407#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
408#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
409#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
410#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
411#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
412#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
413#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_REXW)
414#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
415#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
416#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_REXW)
417#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
418#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
419#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_REXW)
420#define OPC_VZEROUPPER  (0x77 | P_EXT)
421#define OPC_XCHG_ax_r32	(0x90)
422
423#define OPC_GRP3_Ev	(0xf7)
424#define OPC_GRP5	(0xff)
425#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
426
427/* Group 1 opcode extensions for 0x80-0x83.
428   These are also used as modifiers for OPC_ARITH.  */
429#define ARITH_ADD 0
430#define ARITH_OR  1
431#define ARITH_ADC 2
432#define ARITH_SBB 3
433#define ARITH_AND 4
434#define ARITH_SUB 5
435#define ARITH_XOR 6
436#define ARITH_CMP 7
437
438/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
439#define SHIFT_ROL 0
440#define SHIFT_ROR 1
441#define SHIFT_SHL 4
442#define SHIFT_SHR 5
443#define SHIFT_SAR 7
444
445/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
446#define EXT3_NOT   2
447#define EXT3_NEG   3
448#define EXT3_MUL   4
449#define EXT3_IMUL  5
450#define EXT3_DIV   6
451#define EXT3_IDIV  7
452
453/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
454#define EXT5_INC_Ev	0
455#define EXT5_DEC_Ev	1
456#define EXT5_CALLN_Ev	2
457#define EXT5_JMPN_Ev	4
458
459/* Condition codes to be added to OPC_JCC_{long,short}.  */
460#define JCC_JMP (-1)
461#define JCC_JO  0x0
462#define JCC_JNO 0x1
463#define JCC_JB  0x2
464#define JCC_JAE 0x3
465#define JCC_JE  0x4
466#define JCC_JNE 0x5
467#define JCC_JBE 0x6
468#define JCC_JA  0x7
469#define JCC_JS  0x8
470#define JCC_JNS 0x9
471#define JCC_JP  0xa
472#define JCC_JNP 0xb
473#define JCC_JL  0xc
474#define JCC_JGE 0xd
475#define JCC_JLE 0xe
476#define JCC_JG  0xf
477
478static const uint8_t tcg_cond_to_jcc[] = {
479    [TCG_COND_EQ] = JCC_JE,
480    [TCG_COND_NE] = JCC_JNE,
481    [TCG_COND_LT] = JCC_JL,
482    [TCG_COND_GE] = JCC_JGE,
483    [TCG_COND_LE] = JCC_JLE,
484    [TCG_COND_GT] = JCC_JG,
485    [TCG_COND_LTU] = JCC_JB,
486    [TCG_COND_GEU] = JCC_JAE,
487    [TCG_COND_LEU] = JCC_JBE,
488    [TCG_COND_GTU] = JCC_JA,
489};
490
491#if TCG_TARGET_REG_BITS == 64
492static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
493{
494    int rex;
495
496    if (opc & P_GS) {
497        tcg_out8(s, 0x65);
498    }
499    if (opc & P_DATA16) {
500        /* We should never be asking for both 16-bit and 64-bit operation.  */
501        tcg_debug_assert((opc & P_REXW) == 0);
502        tcg_out8(s, 0x66);
503    }
504    if (opc & P_SIMDF3) {
505        tcg_out8(s, 0xf3);
506    } else if (opc & P_SIMDF2) {
507        tcg_out8(s, 0xf2);
508    }
509
510    rex = 0;
511    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
512    rex |= (r & 8) >> 1;                /* REX.R */
513    rex |= (x & 8) >> 2;                /* REX.X */
514    rex |= (rm & 8) >> 3;               /* REX.B */
515
516    /* P_REXB_{R,RM} indicates that the given register is the low byte.
517       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
518       as otherwise the encoding indicates %[abcd]h.  Note that the values
519       that are ORed in merely indicate that the REX byte must be present;
520       those bits get discarded in output.  */
521    rex |= opc & (r >= 4 ? P_REXB_R : 0);
522    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
523
524    if (rex) {
525        tcg_out8(s, (uint8_t)(rex | 0x40));
526    }
527
528    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
529        tcg_out8(s, 0x0f);
530        if (opc & P_EXT38) {
531            tcg_out8(s, 0x38);
532        } else if (opc & P_EXT3A) {
533            tcg_out8(s, 0x3a);
534        }
535    }
536
537    tcg_out8(s, opc);
538}
539#else
540static void tcg_out_opc(TCGContext *s, int opc)
541{
542    if (opc & P_DATA16) {
543        tcg_out8(s, 0x66);
544    }
545    if (opc & P_SIMDF3) {
546        tcg_out8(s, 0xf3);
547    } else if (opc & P_SIMDF2) {
548        tcg_out8(s, 0xf2);
549    }
550    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
551        tcg_out8(s, 0x0f);
552        if (opc & P_EXT38) {
553            tcg_out8(s, 0x38);
554        } else if (opc & P_EXT3A) {
555            tcg_out8(s, 0x3a);
556        }
557    }
558    tcg_out8(s, opc);
559}
560/* Discard the register arguments to tcg_out_opc early, so as not to penalize
561   the 32-bit compilation paths.  This method works with all versions of gcc,
562   whereas relying on optimization may not be able to exclude them.  */
563#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
564#endif
565
566static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
567{
568    tcg_out_opc(s, opc, r, rm, 0);
569    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
570}
571
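/*
 * A worked example of the helpers above, assuming the register enum
 * values match the hardware encoding (TCG_REG_RAX == 0, TCG_REG_R12 == 12),
 * which is what the REX bit extraction in tcg_out_opc() relies on:
 *
 *     tcg_out_modrm(s, OPC_MOVL_GvEv | P_REXW, TCG_REG_RAX, TCG_REG_R12);
 *
 * emits REX.WB (0x49), the opcode byte 0x8b and ModRM 0xc4
 * (mod = 3, reg = 0, rm = 4 plus REX.B), i.e. "movq %r12, %rax".
 */
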
572static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
573                            int rm, int index)
574{
575    int tmp;
576
577    /* Use the two byte form if possible, which cannot encode
578       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
579    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
580        && ((rm | index) & 8) == 0) {
581        /* Two byte VEX prefix.  */
582        tcg_out8(s, 0xc5);
583
584        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
585    } else {
586        /* Three byte VEX prefix.  */
587        tcg_out8(s, 0xc4);
588
589        /* VEX.m-mmmm */
590        if (opc & P_EXT3A) {
591            tmp = 3;
592        } else if (opc & P_EXT38) {
593            tmp = 2;
594        } else if (opc & P_EXT) {
595            tmp = 1;
596        } else {
597            g_assert_not_reached();
598        }
599        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
600        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
601        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
602        tcg_out8(s, tmp);
603
604        tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
605    }
606
607    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
608    /* VEX.pp */
609    if (opc & P_DATA16) {
610        tmp |= 1;                          /* 0x66 */
611    } else if (opc & P_SIMDF3) {
612        tmp |= 2;                          /* 0xf3 */
613    } else if (opc & P_SIMDF2) {
614        tmp |= 3;                          /* 0xf2 */
615    }
616    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
617    tcg_out8(s, tmp);
618    tcg_out8(s, opc);
619}
620
621static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
622{
623    tcg_out_vex_opc(s, opc, r, v, rm, 0);
624    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
625}
626
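/*
 * A similar worked example for the VEX path, assuming TCG_REG_XMM0 == 16
 * so that XMM1/XMM2/XMM3 map to hardware xmm1/xmm2/xmm3 via LOWREGMASK:
 *
 *     tcg_out_vex_modrm(s, OPC_PADDW, TCG_REG_XMM1, TCG_REG_XMM2, TCG_REG_XMM3);
 *
 * selects the two-byte VEX form and emits 0xc5 0xe9 0xfd 0xcb, i.e.
 * "vpaddw %xmm3, %xmm2, %xmm1", with VEX.vvvv holding the inverted
 * second source register.
 */
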
627/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
628   We handle either RM or INDEX missing by passing a negative value.  In 64-bit
629   mode for absolute addresses, ~RM is the size of the immediate operand
630   that will follow the instruction.  */
631
632static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
633                               int shift, intptr_t offset)
634{
635    int mod, len;
636
637    if (index < 0 && rm < 0) {
638        if (TCG_TARGET_REG_BITS == 64) {
639            /* Try for a rip-relative addressing mode.  This has replaced
640               the 32-bit-mode absolute addressing encoding.  */
641            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
642            intptr_t disp = offset - pc;
643            if (disp == (int32_t)disp) {
644                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
645                tcg_out32(s, disp);
646                return;
647            }
648
649            /* Try for an absolute address encoding.  This requires the
650               use of the MODRM+SIB encoding and is therefore larger than
651               rip-relative addressing.  */
652            if (offset == (int32_t)offset) {
653                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
654                tcg_out8(s, (4 << 3) | 5);
655                tcg_out32(s, offset);
656                return;
657            }
658
659            /* ??? The memory isn't directly addressable.  */
660            g_assert_not_reached();
661        } else {
662            /* Absolute address.  */
663            tcg_out8(s, (r << 3) | 5);
664            tcg_out32(s, offset);
665            return;
666        }
667    }
668
669    /* Find the length of the immediate addend.  Note that the encoding
670       that would be used for (%ebp) indicates absolute addressing.  */
671    if (rm < 0) {
672        mod = 0, len = 4, rm = 5;
673    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
674        mod = 0, len = 0;
675    } else if (offset == (int8_t)offset) {
676        mod = 0x40, len = 1;
677    } else {
678        mod = 0x80, len = 4;
679    }
680
681    /* Use a single byte MODRM format if possible.  Note that the encoding
682       that would be used for %esp is the escape to the two byte form.  */
683    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
684        /* Single byte MODRM format.  */
685        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
686    } else {
687        /* Two byte MODRM+SIB format.  */
688
689        /* Note that the encoding that would place %esp into the index
690           field indicates no index register.  In 64-bit mode, the REX.X
691           bit counts, so %r12 can be used as the index.  */
692        if (index < 0) {
693            index = 4;
694        } else {
695            tcg_debug_assert(index != TCG_REG_ESP);
696        }
697
698        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
699        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
700    }
701
702    if (len == 1) {
703        tcg_out8(s, offset);
704    } else if (len == 4) {
705        tcg_out32(s, offset);
706    }
707}
708
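/*
 * Example of the addressing logic above: with r = TCG_REG_EAX,
 * rm = TCG_REG_EBX, no index and offset = 8, the offset fits in a disp8,
 * so mod = 0x40 and the single ModRM byte 0x43 is emitted, followed by
 * the byte 0x08.  Together with an opcode byte of 0x8b from the caller
 * this forms "movl 8(%ebx), %eax".
 */
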
709static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
710                                     int index, int shift, intptr_t offset)
711{
712    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
713    tcg_out_sib_offset(s, r, rm, index, shift, offset);
714}
715
716static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
717                                         int rm, int index, int shift,
718                                         intptr_t offset)
719{
720    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
721    tcg_out_sib_offset(s, r, rm, index, shift, offset);
722}
723
724/* A simplification of the above with no index or shift.  */
725static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
726                                        int rm, intptr_t offset)
727{
728    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
729}
730
731static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
732                                            int v, int rm, intptr_t offset)
733{
734    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
735}
736
737/* Output an opcode with an expected reference to the constant pool.  */
738static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
739{
740    tcg_out_opc(s, opc, r, 0, 0);
741    /* Absolute for 32-bit, pc-relative for 64-bit.  */
742    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
743    tcg_out32(s, 0);
744}
745
746/* Output an opcode with an expected reference to the constant pool.  */
747static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
748{
749    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
750    /* Absolute for 32-bit, pc-relative for 64-bit.  */
751    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
752    tcg_out32(s, 0);
753}
754
755/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
756static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
757{
758    /* Propagate an opcode prefix, such as P_REXW.  */
759    int ext = subop & ~0x7;
760    subop &= 0x7;
761
762    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
763}
764
765static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
766{
767    int rexw = 0;
768
769    if (arg == ret) {
770        return true;
771    }
772    switch (type) {
773    case TCG_TYPE_I64:
774        rexw = P_REXW;
775        /* fallthru */
776    case TCG_TYPE_I32:
777        if (ret < 16) {
778            if (arg < 16) {
779                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
780            } else {
781                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
782            }
783        } else {
784            if (arg < 16) {
785                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
786            } else {
787                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
788            }
789        }
790        break;
791
792    case TCG_TYPE_V64:
793        tcg_debug_assert(ret >= 16 && arg >= 16);
794        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
795        break;
796    case TCG_TYPE_V128:
797        tcg_debug_assert(ret >= 16 && arg >= 16);
798        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
799        break;
800    case TCG_TYPE_V256:
801        tcg_debug_assert(ret >= 16 && arg >= 16);
802        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
803        break;
804
805    default:
806        g_assert_not_reached();
807    }
808    return true;
809}
810
811static const int avx2_dup_insn[4] = {
812    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
813    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
814};
815
816static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
817                            TCGReg r, TCGReg a)
818{
819    if (have_avx2) {
820        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
821        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
822    } else {
823        switch (vece) {
824        case MO_8:
825            /* ??? With zero in a register, use PSHUFB.  */
826            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
827            a = r;
828            /* FALLTHRU */
829        case MO_16:
830            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
831            a = r;
832            /* FALLTHRU */
833        case MO_32:
834            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
835            /* imm8 operand: all output lanes selected from input lane 0.  */
836            tcg_out8(s, 0);
837            break;
838        case MO_64:
839            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
840            break;
841        default:
842            g_assert_not_reached();
843        }
844    }
845    return true;
846}
847
848static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
849                             TCGReg r, TCGReg base, intptr_t offset)
850{
851    if (have_avx2) {
852        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
853        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
854                                 r, 0, base, offset);
855    } else {
856        switch (vece) {
857        case MO_64:
858            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
859            break;
860        case MO_32:
861            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
862            break;
863        case MO_16:
864            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
865            tcg_out8(s, 0); /* imm8 */
866            tcg_out_dup_vec(s, type, vece, r, r);
867            break;
868        case MO_8:
869            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
870            tcg_out8(s, 0); /* imm8 */
871            tcg_out_dup_vec(s, type, vece, r, r);
872            break;
873        default:
874            g_assert_not_reached();
875        }
876    }
877    return true;
878}
879
880static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
881                             TCGReg ret, int64_t arg)
882{
883    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
884
885    if (arg == 0) {
886        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
887        return;
888    }
889    if (arg == -1) {
890        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
891        return;
892    }
893
894    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
895        if (have_avx2) {
896            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
897        } else {
898            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
899        }
900        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
901    } else {
902        if (type == TCG_TYPE_V64) {
903            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
904        } else if (have_avx2) {
905            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
906        } else {
907            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
908        }
909        if (TCG_TARGET_REG_BITS == 64) {
910            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
911        } else {
912            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
913        }
914    }
915}
916
917static void tcg_out_movi_vec(TCGContext *s, TCGType type,
918                             TCGReg ret, tcg_target_long arg)
919{
920    if (arg == 0) {
921        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
922        return;
923    }
924    if (arg == -1) {
925        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
926        return;
927    }
928
929    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
930    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
931    if (TCG_TARGET_REG_BITS == 64) {
932        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
933    } else {
934        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
935    }
936}
937
938static void tcg_out_movi_int(TCGContext *s, TCGType type,
939                             TCGReg ret, tcg_target_long arg)
940{
941    tcg_target_long diff;
942
943    if (arg == 0) {
944        tgen_arithr(s, ARITH_XOR, ret, ret);
945        return;
946    }
947    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
948        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
949        tcg_out32(s, arg);
950        return;
951    }
952    if (arg == (int32_t)arg) {
953        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
954        tcg_out32(s, arg);
955        return;
956    }
957
958    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
959    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
960    if (diff == (int32_t)diff) {
961        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
962        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
963        tcg_out32(s, diff);
964        return;
965    }
966
967    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
968    tcg_out64(s, arg);
969}
970
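/*
 * For example, tcg_out_movi_int(s, TCG_TYPE_I64, TCG_REG_RAX, 0x12345678)
 * takes the unsigned-32-bit branch above and emits the 5-byte
 * "movl $0x12345678, %eax" (0xb8 0x78 0x56 0x34 0x12), relying on the
 * implicit zero-extension of 32-bit writes.  A negative value that fits
 * in 32 signed bits uses the REX.W + 0xc7 sign-extending form instead,
 * and only constants needing the full 64 bits fall through to the
 * pc-relative lea or the 10-byte movabs.
 */
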
971static void tcg_out_movi(TCGContext *s, TCGType type,
972                         TCGReg ret, tcg_target_long arg)
973{
974    switch (type) {
975    case TCG_TYPE_I32:
976#if TCG_TARGET_REG_BITS == 64
977    case TCG_TYPE_I64:
978#endif
979        if (ret < 16) {
980            tcg_out_movi_int(s, type, ret, arg);
981        } else {
982            tcg_out_movi_vec(s, type, ret, arg);
983        }
984        break;
985    default:
986        g_assert_not_reached();
987    }
988}
989
990static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
991{
992    if (val == (int8_t)val) {
993        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
994        tcg_out8(s, val);
995    } else if (val == (int32_t)val) {
996        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
997        tcg_out32(s, val);
998    } else {
999        tcg_abort();
1000    }
1001}
1002
1003static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1004{
1005    /* Given the strength of x86 memory ordering, we only need to care about
1006       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1007       faster than "mfence", so don't bother with the sse insn.  */
1008    if (a0 & TCG_MO_ST_LD) {
1009        tcg_out8(s, 0xf0);
1010        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1011        tcg_out8(s, 0);
1012    }
1013}
1014
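/*
 * With TCG_MO_ST_LD set, the barrier above assembles to
 * 0xf0 0x83 0x0c 0x24 0x00, i.e. "lock orl $0, (%esp)"; the %esp base
 * forces the SIB byte 0x24.  All other barrier types are no-ops on x86.
 */
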
1015static inline void tcg_out_push(TCGContext *s, int reg)
1016{
1017    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1018}
1019
1020static inline void tcg_out_pop(TCGContext *s, int reg)
1021{
1022    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1023}
1024
1025static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1026                       TCGReg arg1, intptr_t arg2)
1027{
1028    switch (type) {
1029    case TCG_TYPE_I32:
1030        if (ret < 16) {
1031            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1032        } else {
1033            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1034        }
1035        break;
1036    case TCG_TYPE_I64:
1037        if (ret < 16) {
1038            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1039            break;
1040        }
1041        /* FALLTHRU */
1042    case TCG_TYPE_V64:
1043        /* There is no instruction that can validate 8-byte alignment.  */
1044        tcg_debug_assert(ret >= 16);
1045        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1046        break;
1047    case TCG_TYPE_V128:
1048        /*
1049         * The gvec infrastructure is asserts that v128 vector loads
1050         * and stores use a 16-byte aligned offset.  Validate that the
1051         * final pointer is aligned by using an insn that will SIGSEGV.
1052         */
1053        tcg_debug_assert(ret >= 16);
1054        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1055        break;
1056    case TCG_TYPE_V256:
1057        /*
1058         * The gvec infrastructure only requires 16-byte alignment,
1059         * so here we must use an unaligned load.
1060         */
1061        tcg_debug_assert(ret >= 16);
1062        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1063                                 ret, 0, arg1, arg2);
1064        break;
1065    default:
1066        g_assert_not_reached();
1067    }
1068}
1069
1070static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1071                       TCGReg arg1, intptr_t arg2)
1072{
1073    switch (type) {
1074    case TCG_TYPE_I32:
1075        if (arg < 16) {
1076            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1077        } else {
1078            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1079        }
1080        break;
1081    case TCG_TYPE_I64:
1082        if (arg < 16) {
1083            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1084            break;
1085        }
1086        /* FALLTHRU */
1087    case TCG_TYPE_V64:
1088        /* There is no instruction that can validate 8-byte alignment.  */
1089        tcg_debug_assert(arg >= 16);
1090        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1091        break;
1092    case TCG_TYPE_V128:
1093        /*
1094         * The gvec infrastructure asserts that v128 vector loads
1095         * and stores use a 16-byte aligned offset.  Validate that the
1096         * final pointer is aligned by using an insn that will SIGSEGV.
1097         */
1098        tcg_debug_assert(arg >= 16);
1099        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1100        break;
1101    case TCG_TYPE_V256:
1102        /*
1103         * The gvec infrastructure only requires 16-byte alignment,
1104         * so here we must use an unaligned store.
1105         */
1106        tcg_debug_assert(arg >= 16);
1107        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1108                                 arg, 0, arg1, arg2);
1109        break;
1110    default:
1111        g_assert_not_reached();
1112    }
1113}
1114
1115static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1116                        TCGReg base, intptr_t ofs)
1117{
1118    int rexw = 0;
1119    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1120        if (val != (int32_t)val) {
1121            return false;
1122        }
1123        rexw = P_REXW;
1124    } else if (type != TCG_TYPE_I32) {
1125        return false;
1126    }
1127    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1128    tcg_out32(s, val);
1129    return true;
1130}
1131
1132static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1133{
1134    /* Propagate an opcode prefix, such as P_DATA16.  */
1135    int ext = subopc & ~0x7;
1136    subopc &= 0x7;
1137
1138    if (count == 1) {
1139        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1140    } else {
1141        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1142        tcg_out8(s, count);
1143    }
1144}
1145
1146static inline void tcg_out_bswap32(TCGContext *s, int reg)
1147{
1148    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1149}
1150
1151static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1152{
1153    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1154}
1155
1156static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1157{
1158    /* movzbl */
1159    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1160    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1161}
1162
1163static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1164{
1165    /* movsbl */
1166    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1167    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1168}
1169
1170static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1171{
1172    /* movzwl */
1173    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1174}
1175
1176static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1177{
1178    /* movsw[lq] */
1179    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1180}
1181
1182static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1183{
1184    /* 32-bit mov zero extends.  */
1185    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1186}
1187
1188static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1189{
1190    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1191}
1192
1193static inline void tcg_out_bswap64(TCGContext *s, int reg)
1194{
1195    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1196}
1197
1198static void tgen_arithi(TCGContext *s, int c, int r0,
1199                        tcg_target_long val, int cf)
1200{
1201    int rexw = 0;
1202
1203    if (TCG_TARGET_REG_BITS == 64) {
1204        rexw = c & -8;
1205        c &= 7;
1206    }
1207
1208    /* ??? While INC is 2 bytes shorter than ADDL $1, it also induces
1209       partial flags update stalls on Pentium 4 and is not recommended
1210       by current Intel optimization manuals.  */
1211    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1212        int is_inc = (c == ARITH_ADD) ^ (val < 0);
1213        if (TCG_TARGET_REG_BITS == 64) {
1214            /* The single-byte increment encodings are re-tasked as the
1215               REX prefixes.  Use the MODRM encoding.  */
1216            tcg_out_modrm(s, OPC_GRP5 + rexw,
1217                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1218        } else {
1219            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1220        }
1221        return;
1222    }
1223
1224    if (c == ARITH_AND) {
1225        if (TCG_TARGET_REG_BITS == 64) {
1226            if (val == 0xffffffffu) {
1227                tcg_out_ext32u(s, r0, r0);
1228                return;
1229            }
1230            if (val == (uint32_t)val) {
1231                /* AND with no high bits set can use a 32-bit operation.  */
1232                rexw = 0;
1233            }
1234        }
1235        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1236            tcg_out_ext8u(s, r0, r0);
1237            return;
1238        }
1239        if (val == 0xffffu) {
1240            tcg_out_ext16u(s, r0, r0);
1241            return;
1242        }
1243    }
1244
1245    if (val == (int8_t)val) {
1246        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1247        tcg_out8(s, val);
1248        return;
1249    }
1250    if (rexw == 0 || val == (int32_t)val) {
1251        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1252        tcg_out32(s, val);
1253        return;
1254    }
1255
1256    tcg_abort();
1257}
1258
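/*
 * For instance, tgen_arithi(s, ARITH_ADD + P_REXW, TCG_REG_RSP, -16, 0)
 * hits none of the INC/DEC or AND special cases; the value fits in an
 * imm8, so the group-1 0x83 form is used and the bytes
 * 0x48 0x83 0xc4 0xf0 ("addq $-16, %rsp") are emitted.
 */
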
1259static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1260{
1261    if (val != 0) {
1262        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1263    }
1264}
1265
1266/* Use SMALL != 0 to force a short forward branch.  */
1267static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1268{
1269    int32_t val, val1;
1270
1271    if (l->has_value) {
1272        val = tcg_pcrel_diff(s, l->u.value_ptr);
1273        val1 = val - 2;
1274        if ((int8_t)val1 == val1) {
1275            if (opc == -1) {
1276                tcg_out8(s, OPC_JMP_short);
1277            } else {
1278                tcg_out8(s, OPC_JCC_short + opc);
1279            }
1280            tcg_out8(s, val1);
1281        } else {
1282            if (small) {
1283                tcg_abort();
1284            }
1285            if (opc == -1) {
1286                tcg_out8(s, OPC_JMP_long);
1287                tcg_out32(s, val - 5);
1288            } else {
1289                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1290                tcg_out32(s, val - 6);
1291            }
1292        }
1293    } else if (small) {
1294        if (opc == -1) {
1295            tcg_out8(s, OPC_JMP_short);
1296        } else {
1297            tcg_out8(s, OPC_JCC_short + opc);
1298        }
1299        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1300        s->code_ptr += 1;
1301    } else {
1302        if (opc == -1) {
1303            tcg_out8(s, OPC_JMP_long);
1304        } else {
1305            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1306        }
1307        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1308        s->code_ptr += 4;
1309    }
1310}
1311
1312static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1313                        int const_arg2, int rexw)
1314{
1315    if (const_arg2) {
1316        if (arg2 == 0) {
1317            /* test r, r */
1318            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1319        } else {
1320            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1321        }
1322    } else {
1323        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1324    }
1325}
1326
1327static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1328                             TCGArg arg1, TCGArg arg2, int const_arg2,
1329                             TCGLabel *label, int small)
1330{
1331    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1332    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1333}
1334
1335#if TCG_TARGET_REG_BITS == 64
1336static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1337                             TCGArg arg1, TCGArg arg2, int const_arg2,
1338                             TCGLabel *label, int small)
1339{
1340    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1341    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1342}
1343#else
1344/* XXX: we implement it at the target level to avoid having to
1345   handle cross-basic-block temporaries */
1346static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1347                            const int *const_args, int small)
1348{
1349    TCGLabel *label_next = gen_new_label();
1350    TCGLabel *label_this = arg_label(args[5]);
1351
1352    switch(args[4]) {
1353    case TCG_COND_EQ:
1354        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1355                         label_next, 1);
1356        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1357                         label_this, small);
1358        break;
1359    case TCG_COND_NE:
1360        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1361                         label_this, small);
1362        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1363                         label_this, small);
1364        break;
1365    case TCG_COND_LT:
1366        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1367                         label_this, small);
1368        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1369        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1370                         label_this, small);
1371        break;
1372    case TCG_COND_LE:
1373        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1374                         label_this, small);
1375        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1376        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1377                         label_this, small);
1378        break;
1379    case TCG_COND_GT:
1380        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1381                         label_this, small);
1382        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1383        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1384                         label_this, small);
1385        break;
1386    case TCG_COND_GE:
1387        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1388                         label_this, small);
1389        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1390        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1391                         label_this, small);
1392        break;
1393    case TCG_COND_LTU:
1394        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1395                         label_this, small);
1396        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1397        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1398                         label_this, small);
1399        break;
1400    case TCG_COND_LEU:
1401        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1402                         label_this, small);
1403        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1404        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1405                         label_this, small);
1406        break;
1407    case TCG_COND_GTU:
1408        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1409                         label_this, small);
1410        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1411        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1412                         label_this, small);
1413        break;
1414    case TCG_COND_GEU:
1415        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1416                         label_this, small);
1417        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1418        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1419                         label_this, small);
1420        break;
1421    default:
1422        tcg_abort();
1423    }
1424    tcg_out_label(s, label_next);
1425}
1426#endif
1427
1428static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1429                              TCGArg arg1, TCGArg arg2, int const_arg2)
1430{
1431    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1432    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1433    tcg_out_ext8u(s, dest, dest);
1434}
1435
1436#if TCG_TARGET_REG_BITS == 64
1437static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1438                              TCGArg arg1, TCGArg arg2, int const_arg2)
1439{
1440    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1441    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1442    tcg_out_ext8u(s, dest, dest);
1443}
1444#else
1445static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1446                             const int *const_args)
1447{
1448    TCGArg new_args[6];
1449    TCGLabel *label_true, *label_over;
1450
1451    memcpy(new_args, args+1, 5*sizeof(TCGArg));
1452
1453    if (args[0] == args[1] || args[0] == args[2]
1454        || (!const_args[3] && args[0] == args[3])
1455        || (!const_args[4] && args[0] == args[4])) {
1456        /* When the destination overlaps with one of the argument
1457           registers, don't do anything tricky.  */
1458        label_true = gen_new_label();
1459        label_over = gen_new_label();
1460
1461        new_args[5] = label_arg(label_true);
1462        tcg_out_brcond2(s, new_args, const_args+1, 1);
1463
1464        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1465        tcg_out_jxx(s, JCC_JMP, label_over, 1);
1466        tcg_out_label(s, label_true);
1467
1468        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1469        tcg_out_label(s, label_over);
1470    } else {
1471        /* When the destination does not overlap one of the arguments,
1472           clear the destination first, jump if cond false, and emit an
1473           increment in the true case.  This results in smaller code.  */
1474
1475        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1476
1477        label_over = gen_new_label();
1478        new_args[4] = tcg_invert_cond(new_args[4]);
1479        new_args[5] = label_arg(label_over);
1480        tcg_out_brcond2(s, new_args, const_args+1, 1);
1481
1482        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1483        tcg_out_label(s, label_over);
1484    }
1485}
1486#endif
1487
1488static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1489                         TCGReg dest, TCGReg v1)
1490{
1491    if (have_cmov) {
1492        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1493    } else {
1494        TCGLabel *over = gen_new_label();
1495        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1496        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1497        tcg_out_label(s, over);
1498    }
1499}
1500
1501static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1502                              TCGReg c1, TCGArg c2, int const_c2,
1503                              TCGReg v1)
1504{
1505    tcg_out_cmp(s, c1, c2, const_c2, 0);
1506    tcg_out_cmov(s, cond, 0, dest, v1);
1507}
1508
1509#if TCG_TARGET_REG_BITS == 64
1510static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1511                              TCGReg c1, TCGArg c2, int const_c2,
1512                              TCGReg v1)
1513{
1514    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1515    tcg_out_cmov(s, cond, P_REXW, dest, v1);
1516}
1517#endif
1518
1519static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1520                        TCGArg arg2, bool const_a2)
1521{
1522    if (have_bmi1) {
1523        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1524        if (const_a2) {
1525            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1526        } else {
1527            tcg_debug_assert(dest != arg2);
1528            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1529        }
1530    } else {
1531        tcg_debug_assert(dest != arg2);
1532        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1533        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1534    }
1535}
1536
1537static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1538                        TCGArg arg2, bool const_a2)
1539{
1540    if (have_lzcnt) {
1541        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1542        if (const_a2) {
1543            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1544        } else {
1545            tcg_debug_assert(dest != arg2);
1546            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1547        }
1548    } else {
1549        tcg_debug_assert(!const_a2);
1550        tcg_debug_assert(dest != arg1);
1551        tcg_debug_assert(dest != arg2);
1552
1553        /* Recall that the output of BSR is the index not the count.  */
1554        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1555        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1556
1557        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1558        tcg_out_cmp(s, arg1, 0, 1, rexw);
1559        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1560    }
1561}
1562
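/*
 * In the BSR-based fallback of tcg_out_clz() above: for a 32-bit input
 * whose highest set bit is bit 28, BSR yields the index 28, and XORing
 * with 31 turns that into the count 3.  The separate re-test of arg1 is
 * needed because BSR leaves the destination undefined for a zero input,
 * in which case arg2 is substituted via CMOV.
 */
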
1563static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1564{
1565    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1566
1567    if (disp == (int32_t)disp) {
1568        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1569        tcg_out32(s, disp);
1570    } else {
1571        /* rip-relative addressing into the constant pool.
1572           This is 6 + 8 = 14 bytes, as compared to using an
1573           immediate load, 10 + 6 = 16 bytes, plus we may
1574           be able to re-use the pool constant for more calls.  */
1575        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1576        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1577        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1578        tcg_out32(s, 0);
1579    }
1580}
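/*
 * Roughly, the two forms emitted above are:
 *
 *     call/jmp  rel32                 # 5 bytes, target within +/- 2GB
 *     call/jmp  *disp32(%rip)         # 6 bytes, plus an 8-byte pool entry
 *
 * where the second form fetches the target address from the constant pool
 * entry created by new_pool_label().
 */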
1581
1582static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
1583{
1584    tcg_out_branch(s, 1, dest);
1585}
1586
1587static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1588{
1589    tcg_out_branch(s, 0, dest);
1590}
1591
1592static void tcg_out_nopn(TCGContext *s, int n)
1593{
1594    int i;
1595    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1596     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1597     * duplicate prefix, and all of the interesting recent cores can
1598     * decode and discard the duplicates in a single cycle.
1599     */
1600    tcg_debug_assert(n >= 1);
1601    for (i = 1; i < n; ++i) {
1602        tcg_out8(s, 0x66);
1603    }
1604    tcg_out8(s, 0x90);
1605}
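/*
 * For example, tcg_out_nopn emits:
 *     n == 1:  90             (nop)
 *     n == 2:  66 90          (xchg %ax,%ax)
 *     n == 3:  66 66 90
 */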
1606
1607#if defined(CONFIG_SOFTMMU)
1608#include "../tcg-ldst.c.inc"
1609
1610/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1611 *                                     TCGMemOpIdx oi, uintptr_t ra)
1612 */
1613static void * const qemu_ld_helpers[16] = {
1614    [MO_UB]   = helper_ret_ldub_mmu,
1615    [MO_LEUW] = helper_le_lduw_mmu,
1616    [MO_LEUL] = helper_le_ldul_mmu,
1617    [MO_LEQ]  = helper_le_ldq_mmu,
1618    [MO_BEUW] = helper_be_lduw_mmu,
1619    [MO_BEUL] = helper_be_ldul_mmu,
1620    [MO_BEQ]  = helper_be_ldq_mmu,
1621};
1622
1623/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1624 *                                     uintxx_t val, TCGMemOpIdx oi, uintptr_t ra)
1625 */
1626static void * const qemu_st_helpers[16] = {
1627    [MO_UB]   = helper_ret_stb_mmu,
1628    [MO_LEUW] = helper_le_stw_mmu,
1629    [MO_LEUL] = helper_le_stl_mmu,
1630    [MO_LEQ]  = helper_le_stq_mmu,
1631    [MO_BEUW] = helper_be_stw_mmu,
1632    [MO_BEUL] = helper_be_stl_mmu,
1633    [MO_BEQ]  = helper_be_stq_mmu,
1634};
1635
1636/* Perform the TLB load and compare.
1637
1638   Inputs:
1639   ADDRLO and ADDRHI contain the low and high part of the address.
1640
1641   MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1642
1643   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1644   This should be offsetof addr_read or addr_write.
1645
1646   Outputs:
1647   LABEL_PTRS is filled with 1 (guest address in one register) or 2 (guest
1648   address split across two registers) positions of the displacements of forward jumps to the TLB miss case.
1649
1650   Second argument register is loaded with the low part of the address.
1651   In the TLB hit case, it has been adjusted as indicated by the TLB
1652   and so is a host address.  In the TLB miss case, it continues to
1653   hold a guest address.
1654
1655   First argument register is clobbered.  */
1656
1657static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1658                                    int mem_index, MemOp opc,
1659                                    tcg_insn_unit **label_ptr, int which)
1660{
1661    const TCGReg r0 = TCG_REG_L0;
1662    const TCGReg r1 = TCG_REG_L1;
1663    TCGType ttype = TCG_TYPE_I32;
1664    TCGType tlbtype = TCG_TYPE_I32;
1665    int trexw = 0, hrexw = 0, tlbrexw = 0;
1666    unsigned a_bits = get_alignment_bits(opc);
1667    unsigned s_bits = opc & MO_SIZE;
1668    unsigned a_mask = (1 << a_bits) - 1;
1669    unsigned s_mask = (1 << s_bits) - 1;
1670    target_ulong tlb_mask;
1671
1672    if (TCG_TARGET_REG_BITS == 64) {
1673        if (TARGET_LONG_BITS == 64) {
1674            ttype = TCG_TYPE_I64;
1675            trexw = P_REXW;
1676        }
1677        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1678            hrexw = P_REXW;
1679            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
1680                tlbtype = TCG_TYPE_I64;
1681                tlbrexw = P_REXW;
1682            }
1683        }
1684    }
1685
1686    tcg_out_mov(s, tlbtype, r0, addrlo);
1687    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1688                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1689
1690    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
1691                         TLB_MASK_TABLE_OFS(mem_index) +
1692                         offsetof(CPUTLBDescFast, mask));
1693
1694    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
1695                         TLB_MASK_TABLE_OFS(mem_index) +
1696                         offsetof(CPUTLBDescFast, table));
1697
1698    /* If the required alignment is at least as large as the access, simply
1699       copy the address and mask.  For lesser alignments, check that we don't
1700       cross pages for the complete access.  */
1701    if (a_bits >= s_bits) {
1702        tcg_out_mov(s, ttype, r1, addrlo);
1703    } else {
1704        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1705    }
1706    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1707    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1708
1709    /* cmp 0(r0), r1 */
1710    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
1711
1712    /* Prepare for both the fast path add of the tlb addend, and the slow
1713       path function argument setup.  */
1714    tcg_out_mov(s, ttype, r1, addrlo);
1715
1716    /* jne slow_path */
1717    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1718    label_ptr[0] = s->code_ptr;
1719    s->code_ptr += 4;
1720
1721    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1722        /* cmp 4(r0), addrhi */
1723        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
1724
1725        /* jne slow_path */
1726        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1727        label_ptr[1] = s->code_ptr;
1728        s->code_ptr += 4;
1729    }
1730
1731    /* TLB Hit.  */
1732
1733    /* add addend(r0), r1 */
1734    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1735                         offsetof(CPUTLBEntry, addend));
1736}
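/*
 * As a rough sketch, for a 64-bit guest on a 64-bit host the fast path
 * emitted above is approximately:
 *
 *     mov   addrlo, r0
 *     shr   $(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS), r0
 *     and   mask(TCG_AREG0), r0
 *     add   table(TCG_AREG0), r0
 *     lea   s_mask-a_mask(addrlo), r1     # plain mov when a_bits >= s_bits
 *     and   $(TARGET_PAGE_MASK | a_mask), r1
 *     cmp   which(r0), r1
 *     mov   addrlo, r1
 *     jne   slow_path
 *     add   addend(r0), r1                # r1 is now the host address
 */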
1737
1738/*
1739 * Record the context of a call to the out-of-line helper code for the slow path
1740 * of a load or store, so that we can later generate the correct helper code.
1741 */
1742static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1743                                TCGMemOpIdx oi,
1744                                TCGReg datalo, TCGReg datahi,
1745                                TCGReg addrlo, TCGReg addrhi,
1746                                tcg_insn_unit *raddr,
1747                                tcg_insn_unit **label_ptr)
1748{
1749    TCGLabelQemuLdst *label = new_ldst_label(s);
1750
1751    label->is_ld = is_ld;
1752    label->oi = oi;
1753    label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1754    label->datalo_reg = datalo;
1755    label->datahi_reg = datahi;
1756    label->addrlo_reg = addrlo;
1757    label->addrhi_reg = addrhi;
1758    label->raddr = tcg_splitwx_to_rx(raddr);
1759    label->label_ptr[0] = label_ptr[0];
1760    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1761        label->label_ptr[1] = label_ptr[1];
1762    }
1763}
1764
1765/*
1766 * Generate code for the slow path for a load at the end of the block
1767 */
1768static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1769{
1770    TCGMemOpIdx oi = l->oi;
1771    MemOp opc = get_memop(oi);
1772    TCGReg data_reg;
1773    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1774    int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1775
1776    /* resolve label address */
1777    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1778    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1779        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1780    }
1781
1782    if (TCG_TARGET_REG_BITS == 32) {
1783        int ofs = 0;
1784
1785        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1786        ofs += 4;
1787
1788        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1789        ofs += 4;
1790
1791        if (TARGET_LONG_BITS == 64) {
1792            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1793            ofs += 4;
1794        }
1795
1796        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1797        ofs += 4;
1798
1799        tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1800    } else {
1801        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1802        /* The second argument is already loaded with addrlo.  */
1803        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1804        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1805                     (uintptr_t)l->raddr);
1806    }
1807
1808    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1809
1810    data_reg = l->datalo_reg;
1811    switch (opc & MO_SSIZE) {
1812    case MO_SB:
1813        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1814        break;
1815    case MO_SW:
1816        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1817        break;
1818#if TCG_TARGET_REG_BITS == 64
1819    case MO_SL:
1820        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1821        break;
1822#endif
1823    case MO_UB:
1824    case MO_UW:
1825        /* Note that the helpers have zero-extended to tcg_target_long.  */
1826    case MO_UL:
1827        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1828        break;
1829    case MO_Q:
1830        if (TCG_TARGET_REG_BITS == 64) {
1831            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1832        } else if (data_reg == TCG_REG_EDX) {
1833            /* xchg %edx, %eax */
1834            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1835            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1836        } else {
1837            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1838            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1839        }
1840        break;
1841    default:
1842        tcg_abort();
1843    }
1844
1845    /* Jump to the code corresponding to the next IR of qemu_ld */
1846    tcg_out_jmp(s, l->raddr);
1847    return true;
1848}
1849
1850/*
1851 * Generate code for the slow path for a store at the end of the block
1852 */
1853static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1854{
1855    TCGMemOpIdx oi = l->oi;
1856    MemOp opc = get_memop(oi);
1857    MemOp s_bits = opc & MO_SIZE;
1858    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1859    TCGReg retaddr;
1860
1861    /* resolve label address */
1862    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1863    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1864        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1865    }
1866
1867    if (TCG_TARGET_REG_BITS == 32) {
1868        int ofs = 0;
1869
1870        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1871        ofs += 4;
1872
1873        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1874        ofs += 4;
1875
1876        if (TARGET_LONG_BITS == 64) {
1877            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1878            ofs += 4;
1879        }
1880
1881        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1882        ofs += 4;
1883
1884        if (s_bits == MO_64) {
1885            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1886            ofs += 4;
1887        }
1888
1889        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1890        ofs += 4;
1891
1892        retaddr = TCG_REG_EAX;
1893        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1894        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1895    } else {
1896        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1897        /* The second argument is already loaded with addrlo.  */
1898        tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1899                    tcg_target_call_iarg_regs[2], l->datalo_reg);
1900        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1901
1902        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1903            retaddr = tcg_target_call_iarg_regs[4];
1904            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1905        } else {
1906            retaddr = TCG_REG_RAX;
1907            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1908            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1909                       TCG_TARGET_CALL_STACK_OFFSET);
1910        }
1911    }
1912
1913    /* "Tail call" to the helper, with the return address back inline.  */
1914    tcg_out_push(s, retaddr);
1915    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1916    return true;
1917}
1918#elif TCG_TARGET_REG_BITS == 32
1919# define x86_guest_base_seg     0
1920# define x86_guest_base_index   -1
1921# define x86_guest_base_offset  guest_base
1922#else
1923static int x86_guest_base_seg;
1924static int x86_guest_base_index = -1;
1925static int32_t x86_guest_base_offset;
1926# if defined(__x86_64__) && defined(__linux__)
1927#  include <asm/prctl.h>
1928#  include <sys/prctl.h>
1929int arch_prctl(int code, unsigned long addr);
1930static inline int setup_guest_base_seg(void)
1931{
1932    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1933        return P_GS;
1934    }
1935    return 0;
1936}
1937# elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
1938#  include <machine/sysarch.h>
1939static inline int setup_guest_base_seg(void)
1940{
1941    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1942        return P_GS;
1943    }
1944    return 0;
1945}
1946# else
1947static inline int setup_guest_base_seg(void)
1948{
1949    return 0;
1950}
1951# endif
1952#endif /* SOFTMMU */
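/*
 * In user-only mode on a 64-bit host, the setup_guest_base_seg() variants
 * above try to install guest_base as the GS segment base and return P_GS
 * on success, so that guest addresses can then be dereferenced with a
 * gs-segment override instead of adding guest_base explicitly.
 */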
1953
1954static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1955                                   TCGReg base, int index, intptr_t ofs,
1956                                   int seg, bool is64, MemOp memop)
1957{
1958    bool use_movbe = false;
1959    int rexw = is64 * P_REXW;
1960    int movop = OPC_MOVL_GvEv;
1961
1962    /* Do big-endian loads with movbe.  */
1963    if (memop & MO_BSWAP) {
1964        tcg_debug_assert(have_movbe);
1965        use_movbe = true;
1966        movop = OPC_MOVBE_GyMy;
1967    }
1968
1969    switch (memop & MO_SSIZE) {
1970    case MO_UB:
1971        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
1972                                 base, index, 0, ofs);
1973        break;
1974    case MO_SB:
1975        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
1976                                 base, index, 0, ofs);
1977        break;
1978    case MO_UW:
1979        if (use_movbe) {
1980            /* There is no extending movbe; only the low 16 bits are modified.  */
1981            if (datalo != base && datalo != index) {
1982                /* XOR breaks dependency chains.  */
1983                tgen_arithr(s, ARITH_XOR, datalo, datalo);
1984                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1985                                         datalo, base, index, 0, ofs);
1986            } else {
1987                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1988                                         datalo, base, index, 0, ofs);
1989                tcg_out_ext16u(s, datalo, datalo);
1990            }
1991        } else {
1992            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1993                                     base, index, 0, ofs);
1994        }
1995        break;
1996    case MO_SW:
1997        if (use_movbe) {
1998            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1999                                     datalo, base, index, 0, ofs);
2000            tcg_out_ext16s(s, datalo, datalo, rexw);
2001        } else {
2002            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
2003                                     datalo, base, index, 0, ofs);
2004        }
2005        break;
2006    case MO_UL:
2007        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2008        break;
2009#if TCG_TARGET_REG_BITS == 64
2010    case MO_SL:
2011        if (use_movbe) {
2012            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo,
2013                                     base, index, 0, ofs);
2014            tcg_out_ext32s(s, datalo, datalo);
2015        } else {
2016            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
2017                                     base, index, 0, ofs);
2018        }
2019        break;
2020#endif
2021    case MO_Q:
2022        if (TCG_TARGET_REG_BITS == 64) {
2023            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2024                                     base, index, 0, ofs);
2025        } else {
2026            if (use_movbe) {
2027                TCGReg t = datalo;
2028                datalo = datahi;
2029                datahi = t;
2030            }
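            /* When the base register overlaps datalo, load the high half
               first so the address is not clobbered before the second load.  */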
2031            if (base != datalo) {
2032                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2033                                         base, index, 0, ofs);
2034                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2035                                         base, index, 0, ofs + 4);
2036            } else {
2037                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2038                                         base, index, 0, ofs + 4);
2039                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2040                                         base, index, 0, ofs);
2041            }
2042        }
2043        break;
2044    default:
2045        g_assert_not_reached();
2046    }
2047}
2048
2049/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2050   EAX. It will be useful once fixed-register globals are less
2051   common. */
2052static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2053{
2054    TCGReg datalo, datahi, addrlo;
2055    TCGReg addrhi __attribute__((unused));
2056    TCGMemOpIdx oi;
2057    MemOp opc;
2058#if defined(CONFIG_SOFTMMU)
2059    int mem_index;
2060    tcg_insn_unit *label_ptr[2];
2061#endif
2062
2063    datalo = *args++;
2064    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2065    addrlo = *args++;
2066    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2067    oi = *args++;
2068    opc = get_memop(oi);
2069
2070#if defined(CONFIG_SOFTMMU)
2071    mem_index = get_mmuidx(oi);
2072
2073    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2074                     label_ptr, offsetof(CPUTLBEntry, addr_read));
2075
2076    /* TLB Hit.  */
2077    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2078
2079    /* Record the current context of a load into ldst label */
2080    add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2081                        s->code_ptr, label_ptr);
2082#else
2083    tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2084                           x86_guest_base_offset, x86_guest_base_seg,
2085                           is64, opc);
2086#endif
2087}
2088
2089static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2090                                   TCGReg base, int index, intptr_t ofs,
2091                                   int seg, MemOp memop)
2092{
2093    bool use_movbe = false;
2094    int movop = OPC_MOVL_EvGv;
2095
2096    /*
2097     * Do big-endian stores with movbe or softmmu.
2098     * User-only without movbe will have its swapping done generically.
2099     */
2100    if (memop & MO_BSWAP) {
2101        tcg_debug_assert(have_movbe);
2102        use_movbe = true;
2103        movop = OPC_MOVBE_MyGy;
2104    }
2105
2106    switch (memop & MO_SIZE) {
2107    case MO_8:
2108        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2109        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2110        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2111                                 datalo, base, index, 0, ofs);
2112        break;
2113    case MO_16:
2114        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2115                                 base, index, 0, ofs);
2116        break;
2117    case MO_32:
2118        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2119        break;
2120    case MO_64:
2121        if (TCG_TARGET_REG_BITS == 64) {
2122            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2123                                     base, index, 0, ofs);
2124        } else {
2125            if (use_movbe) {
2126                TCGReg t = datalo;
2127                datalo = datahi;
2128                datahi = t;
2129            }
2130            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2131                                     base, index, 0, ofs);
2132            tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2133                                     base, index, 0, ofs + 4);
2134        }
2135        break;
2136    default:
2137        g_assert_not_reached();
2138    }
2139}
2140
2141static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2142{
2143    TCGReg datalo, datahi, addrlo;
2144    TCGReg addrhi __attribute__((unused));
2145    TCGMemOpIdx oi;
2146    MemOp opc;
2147#if defined(CONFIG_SOFTMMU)
2148    int mem_index;
2149    tcg_insn_unit *label_ptr[2];
2150#endif
2151
2152    datalo = *args++;
2153    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2154    addrlo = *args++;
2155    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2156    oi = *args++;
2157    opc = get_memop(oi);
2158
2159#if defined(CONFIG_SOFTMMU)
2160    mem_index = get_mmuidx(oi);
2161
2162    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2163                     label_ptr, offsetof(CPUTLBEntry, addr_write));
2164
2165    /* TLB Hit.  */
2166    tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2167
2168    /* Record the current context of a store into ldst label */
2169    add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2170                        s->code_ptr, label_ptr);
2171#else
2172    tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2173                           x86_guest_base_offset, x86_guest_base_seg, opc);
2174#endif
2175}
2176
2177static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2178                              const TCGArg args[TCG_MAX_OP_ARGS],
2179                              const int const_args[TCG_MAX_OP_ARGS])
2180{
2181    TCGArg a0, a1, a2;
2182    int c, const_a2, vexop, rexw = 0;
2183
2184#if TCG_TARGET_REG_BITS == 64
2185# define OP_32_64(x) \
2186        case glue(glue(INDEX_op_, x), _i64): \
2187            rexw = P_REXW; /* FALLTHRU */    \
2188        case glue(glue(INDEX_op_, x), _i32)
2189#else
2190# define OP_32_64(x) \
2191        case glue(glue(INDEX_op_, x), _i32)
2192#endif
2193
2194    /* Hoist the loads of the most common arguments.  */
2195    a0 = args[0];
2196    a1 = args[1];
2197    a2 = args[2];
2198    const_a2 = const_args[2];
2199
2200    switch (opc) {
2201    case INDEX_op_exit_tb:
2202        /* Reuse the zeroing that exists for goto_ptr.  */
2203        if (a0 == 0) {
2204            tcg_out_jmp(s, tcg_code_gen_epilogue);
2205        } else {
2206            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2207            tcg_out_jmp(s, tb_ret_addr);
2208        }
2209        break;
2210    case INDEX_op_goto_tb:
2211        if (s->tb_jmp_insn_offset) {
2212            /* direct jump method */
2213            int gap;
2214            /* jump displacement must be aligned for atomic patching;
2215             * see if we need to add extra nops before the jump
2216             */
2217            gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2218            if (gap != 1) {
2219                tcg_out_nopn(s, gap - 1);
2220            }
2221            tcg_out8(s, OPC_JMP_long); /* jmp im */
2222            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2223            tcg_out32(s, 0);
2224        } else {
2225            /* indirect jump method */
2226            tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2227                                 (intptr_t)(s->tb_jmp_target_addr + a0));
2228        }
2229        set_jmp_reset_offset(s, a0);
2230        break;
2231    case INDEX_op_goto_ptr:
2232        /* jmp to the given host address (could be epilogue) */
2233        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2234        break;
2235    case INDEX_op_br:
2236        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2237        break;
2238    OP_32_64(ld8u):
2239        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2240        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2241        break;
2242    OP_32_64(ld8s):
2243        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2244        break;
2245    OP_32_64(ld16u):
2246        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2247        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2248        break;
2249    OP_32_64(ld16s):
2250        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2251        break;
2252#if TCG_TARGET_REG_BITS == 64
2253    case INDEX_op_ld32u_i64:
2254#endif
2255    case INDEX_op_ld_i32:
2256        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2257        break;
2258
2259    OP_32_64(st8):
2260        if (const_args[0]) {
2261            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2262            tcg_out8(s, a0);
2263        } else {
2264            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2265        }
2266        break;
2267    OP_32_64(st16):
2268        if (const_args[0]) {
2269            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2270            tcg_out16(s, a0);
2271        } else {
2272            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2273        }
2274        break;
2275#if TCG_TARGET_REG_BITS == 64
2276    case INDEX_op_st32_i64:
2277#endif
2278    case INDEX_op_st_i32:
2279        if (const_args[0]) {
2280            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2281            tcg_out32(s, a0);
2282        } else {
2283            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2284        }
2285        break;
2286
2287    OP_32_64(add):
2288        /* For 3-operand addition, use LEA.  */
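        /* e.g. dest = a1 + imm  ->  lea imm(a1), dest
                dest = a1 + a2   ->  lea (a1,a2), dest  */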
2289        if (a0 != a1) {
2290            TCGArg c3 = 0;
2291            if (const_a2) {
2292                c3 = a2, a2 = -1;
2293            } else if (a0 == a2) {
2294                /* Watch out for dest = src + dest, since we've removed
2295                   the matching constraint on the add.  */
2296                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2297                break;
2298            }
2299
2300            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2301            break;
2302        }
2303        c = ARITH_ADD;
2304        goto gen_arith;
2305    OP_32_64(sub):
2306        c = ARITH_SUB;
2307        goto gen_arith;
2308    OP_32_64(and):
2309        c = ARITH_AND;
2310        goto gen_arith;
2311    OP_32_64(or):
2312        c = ARITH_OR;
2313        goto gen_arith;
2314    OP_32_64(xor):
2315        c = ARITH_XOR;
2316        goto gen_arith;
2317    gen_arith:
2318        if (const_a2) {
2319            tgen_arithi(s, c + rexw, a0, a2, 0);
2320        } else {
2321            tgen_arithr(s, c + rexw, a0, a2);
2322        }
2323        break;
2324
2325    OP_32_64(andc):
2326        if (const_a2) {
2327            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2328            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2329        } else {
2330            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2331        }
2332        break;
2333
2334    OP_32_64(mul):
2335        if (const_a2) {
2336            int32_t val;
2337            val = a2;
2338            if (val == (int8_t)val) {
2339                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2340                tcg_out8(s, val);
2341            } else {
2342                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2343                tcg_out32(s, val);
2344            }
2345        } else {
2346            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2347        }
2348        break;
2349
2350    OP_32_64(div2):
2351        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2352        break;
2353    OP_32_64(divu2):
2354        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2355        break;
2356
2357    OP_32_64(shl):
2358        /* For small constant 3-operand shift, use LEA.  */
2359        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2360            if (a2 - 1 == 0) {
2361                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2362                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2363            } else {
2364                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2365                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2366            }
2367            break;
2368        }
2369        c = SHIFT_SHL;
2370        vexop = OPC_SHLX;
2371        goto gen_shift_maybe_vex;
2372    OP_32_64(shr):
2373        c = SHIFT_SHR;
2374        vexop = OPC_SHRX;
2375        goto gen_shift_maybe_vex;
2376    OP_32_64(sar):
2377        c = SHIFT_SAR;
2378        vexop = OPC_SARX;
2379        goto gen_shift_maybe_vex;
2380    OP_32_64(rotl):
2381        c = SHIFT_ROL;
2382        goto gen_shift;
2383    OP_32_64(rotr):
2384        c = SHIFT_ROR;
2385        goto gen_shift;
2386    gen_shift_maybe_vex:
2387        if (have_bmi2) {
2388            if (!const_a2) {
2389                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2390                break;
2391            }
2392            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2393        }
2394        /* FALLTHRU */
2395    gen_shift:
2396        if (const_a2) {
2397            tcg_out_shifti(s, c + rexw, a0, a2);
2398        } else {
2399            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2400        }
2401        break;
2402
2403    OP_32_64(ctz):
2404        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2405        break;
2406    OP_32_64(clz):
2407        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2408        break;
2409    OP_32_64(ctpop):
2410        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2411        break;
2412
2413    case INDEX_op_brcond_i32:
2414        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2415        break;
2416    case INDEX_op_setcond_i32:
2417        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2418        break;
2419    case INDEX_op_movcond_i32:
2420        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2421        break;
2422
2423    OP_32_64(bswap16):
2424        if (a2 & TCG_BSWAP_OS) {
2425            /* Output must be sign-extended. */
2426            if (rexw) {
2427                tcg_out_bswap64(s, a0);
2428                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2429            } else {
2430                tcg_out_bswap32(s, a0);
2431                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2432            }
2433        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2434            /* Output must be zero-extended, but input isn't. */
2435            tcg_out_bswap32(s, a0);
2436            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2437        } else {
2438            tcg_out_rolw_8(s, a0);
2439        }
2440        break;
2441    OP_32_64(bswap32):
2442        tcg_out_bswap32(s, a0);
2443        if (rexw && (a2 & TCG_BSWAP_OS)) {
2444            tcg_out_ext32s(s, a0, a0);
2445        }
2446        break;
2447
2448    OP_32_64(neg):
2449        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2450        break;
2451    OP_32_64(not):
2452        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2453        break;
2454
2455    OP_32_64(ext8s):
2456        tcg_out_ext8s(s, a0, a1, rexw);
2457        break;
2458    OP_32_64(ext16s):
2459        tcg_out_ext16s(s, a0, a1, rexw);
2460        break;
2461    OP_32_64(ext8u):
2462        tcg_out_ext8u(s, a0, a1);
2463        break;
2464    OP_32_64(ext16u):
2465        tcg_out_ext16u(s, a0, a1);
2466        break;
2467
2468    case INDEX_op_qemu_ld_i32:
2469        tcg_out_qemu_ld(s, args, 0);
2470        break;
2471    case INDEX_op_qemu_ld_i64:
2472        tcg_out_qemu_ld(s, args, 1);
2473        break;
2474    case INDEX_op_qemu_st_i32:
2475    case INDEX_op_qemu_st8_i32:
2476        tcg_out_qemu_st(s, args, 0);
2477        break;
2478    case INDEX_op_qemu_st_i64:
2479        tcg_out_qemu_st(s, args, 1);
2480        break;
2481
2482    OP_32_64(mulu2):
2483        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2484        break;
2485    OP_32_64(muls2):
2486        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2487        break;
2488    OP_32_64(add2):
2489        if (const_args[4]) {
2490            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2491        } else {
2492            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2493        }
2494        if (const_args[5]) {
2495            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2496        } else {
2497            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2498        }
2499        break;
2500    OP_32_64(sub2):
2501        if (const_args[4]) {
2502            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2503        } else {
2504            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2505        }
2506        if (const_args[5]) {
2507            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2508        } else {
2509            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2510        }
2511        break;
2512
2513#if TCG_TARGET_REG_BITS == 32
2514    case INDEX_op_brcond2_i32:
2515        tcg_out_brcond2(s, args, const_args, 0);
2516        break;
2517    case INDEX_op_setcond2_i32:
2518        tcg_out_setcond2(s, args, const_args);
2519        break;
2520#else /* TCG_TARGET_REG_BITS == 64 */
2521    case INDEX_op_ld32s_i64:
2522        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2523        break;
2524    case INDEX_op_ld_i64:
2525        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2526        break;
2527    case INDEX_op_st_i64:
2528        if (const_args[0]) {
2529            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2530            tcg_out32(s, a0);
2531        } else {
2532            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2533        }
2534        break;
2535
2536    case INDEX_op_brcond_i64:
2537        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2538        break;
2539    case INDEX_op_setcond_i64:
2540        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2541        break;
2542    case INDEX_op_movcond_i64:
2543        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2544        break;
2545
2546    case INDEX_op_bswap64_i64:
2547        tcg_out_bswap64(s, a0);
2548        break;
2549    case INDEX_op_extu_i32_i64:
2550    case INDEX_op_ext32u_i64:
2551    case INDEX_op_extrl_i64_i32:
2552        tcg_out_ext32u(s, a0, a1);
2553        break;
2554    case INDEX_op_ext_i32_i64:
2555    case INDEX_op_ext32s_i64:
2556        tcg_out_ext32s(s, a0, a1);
2557        break;
2558    case INDEX_op_extrh_i64_i32:
2559        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2560        break;
2561#endif
2562
2563    OP_32_64(deposit):
2564        if (args[3] == 0 && args[4] == 8) {
2565            /* load bits 0..7 */
2566            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2567        } else if (args[3] == 8 && args[4] == 8) {
2568            /* load bits 8..15 */
2569            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2570        } else if (args[3] == 0 && args[4] == 16) {
2571            /* load bits 0..15 */
2572            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2573        } else {
2574            tcg_abort();
2575        }
2576        break;
2577
2578    case INDEX_op_extract_i64:
2579        if (a2 + args[3] == 32) {
2580            /* This is a 32-bit zero-extending right shift.  */
2581            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2582            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2583            break;
2584        }
2585        /* FALLTHRU */
2586    case INDEX_op_extract_i32:
2587        /* On the off-chance that we can use the high-byte registers.
2588           Otherwise we emit the same ext16 + shift pattern that we
2589           would have gotten from the normal tcg-op.c expansion.  */
2590        tcg_debug_assert(a2 == 8 && args[3] == 8);
2591        if (a1 < 4 && a0 < 8) {
2592            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2593        } else {
2594            tcg_out_ext16u(s, a0, a1);
2595            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2596        }
2597        break;
2598
2599    case INDEX_op_sextract_i32:
2600        /* We don't implement sextract_i64, as we cannot sign-extend to
2601           64-bits without using the REX prefix that explicitly excludes
2602           access to the high-byte registers.  */
2603        tcg_debug_assert(a2 == 8 && args[3] == 8);
2604        if (a1 < 4 && a0 < 8) {
2605            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2606        } else {
2607            tcg_out_ext16s(s, a0, a1, 0);
2608            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2609        }
2610        break;
2611
2612    OP_32_64(extract2):
2613        /* Note that SHRD outputs to the r/m operand.  */
2614        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2615        tcg_out8(s, args[3]);
2616        break;
2617
2618    case INDEX_op_mb:
2619        tcg_out_mb(s, a0);
2620        break;
2621    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2622    case INDEX_op_mov_i64:
2623    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2624    default:
2625        tcg_abort();
2626    }
2627
2628#undef OP_32_64
2629}
2630
2631static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2632                           unsigned vecl, unsigned vece,
2633                           const TCGArg args[TCG_MAX_OP_ARGS],
2634                           const int const_args[TCG_MAX_OP_ARGS])
2635{
2636    static int const add_insn[4] = {
2637        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2638    };
2639    static int const ssadd_insn[4] = {
2640        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2641    };
2642    static int const usadd_insn[4] = {
2643        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2644    };
2645    static int const sub_insn[4] = {
2646        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2647    };
2648    static int const sssub_insn[4] = {
2649        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2650    };
2651    static int const ussub_insn[4] = {
2652        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2653    };
2654    static int const mul_insn[4] = {
2655        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2656    };
2657    static int const shift_imm_insn[4] = {
2658        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2659    };
2660    static int const cmpeq_insn[4] = {
2661        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2662    };
2663    static int const cmpgt_insn[4] = {
2664        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2665    };
2666    static int const punpckl_insn[4] = {
2667        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2668    };
2669    static int const punpckh_insn[4] = {
2670        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2671    };
2672    static int const packss_insn[4] = {
2673        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2674    };
2675    static int const packus_insn[4] = {
2676        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2677    };
2678    static int const smin_insn[4] = {
2679        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
2680    };
2681    static int const smax_insn[4] = {
2682        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
2683    };
2684    static int const umin_insn[4] = {
2685        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
2686    };
2687    static int const umax_insn[4] = {
2688        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
2689    };
2690    static int const shlv_insn[4] = {
2691        /* TODO: AVX512 adds support for MO_16.  */
2692        OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
2693    };
2694    static int const shrv_insn[4] = {
2695        /* TODO: AVX512 adds support for MO_16.  */
2696        OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
2697    };
2698    static int const sarv_insn[4] = {
2699        /* TODO: AVX512 adds support for MO_16, MO_64.  */
2700        OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
2701    };
2702    static int const shls_insn[4] = {
2703        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2704    };
2705    static int const shrs_insn[4] = {
2706        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2707    };
2708    static int const sars_insn[4] = {
2709        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
2710    };
2711    static int const abs_insn[4] = {
2712        /* TODO: AVX512 adds support for MO_64.  */
2713        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
2714    };
2715
2716    TCGType type = vecl + TCG_TYPE_V64;
2717    int insn, sub;
2718    TCGArg a0, a1, a2;
2719
2720    a0 = args[0];
2721    a1 = args[1];
2722    a2 = args[2];
2723
2724    switch (opc) {
2725    case INDEX_op_add_vec:
2726        insn = add_insn[vece];
2727        goto gen_simd;
2728    case INDEX_op_ssadd_vec:
2729        insn = ssadd_insn[vece];
2730        goto gen_simd;
2731    case INDEX_op_usadd_vec:
2732        insn = usadd_insn[vece];
2733        goto gen_simd;
2734    case INDEX_op_sub_vec:
2735        insn = sub_insn[vece];
2736        goto gen_simd;
2737    case INDEX_op_sssub_vec:
2738        insn = sssub_insn[vece];
2739        goto gen_simd;
2740    case INDEX_op_ussub_vec:
2741        insn = ussub_insn[vece];
2742        goto gen_simd;
2743    case INDEX_op_mul_vec:
2744        insn = mul_insn[vece];
2745        goto gen_simd;
2746    case INDEX_op_and_vec:
2747        insn = OPC_PAND;
2748        goto gen_simd;
2749    case INDEX_op_or_vec:
2750        insn = OPC_POR;
2751        goto gen_simd;
2752    case INDEX_op_xor_vec:
2753        insn = OPC_PXOR;
2754        goto gen_simd;
2755    case INDEX_op_smin_vec:
2756        insn = smin_insn[vece];
2757        goto gen_simd;
2758    case INDEX_op_umin_vec:
2759        insn = umin_insn[vece];
2760        goto gen_simd;
2761    case INDEX_op_smax_vec:
2762        insn = smax_insn[vece];
2763        goto gen_simd;
2764    case INDEX_op_umax_vec:
2765        insn = umax_insn[vece];
2766        goto gen_simd;
2767    case INDEX_op_shlv_vec:
2768        insn = shlv_insn[vece];
2769        goto gen_simd;
2770    case INDEX_op_shrv_vec:
2771        insn = shrv_insn[vece];
2772        goto gen_simd;
2773    case INDEX_op_sarv_vec:
2774        insn = sarv_insn[vece];
2775        goto gen_simd;
2776    case INDEX_op_shls_vec:
2777        insn = shls_insn[vece];
2778        goto gen_simd;
2779    case INDEX_op_shrs_vec:
2780        insn = shrs_insn[vece];
2781        goto gen_simd;
2782    case INDEX_op_sars_vec:
2783        insn = sars_insn[vece];
2784        goto gen_simd;
2785    case INDEX_op_x86_punpckl_vec:
2786        insn = punpckl_insn[vece];
2787        goto gen_simd;
2788    case INDEX_op_x86_punpckh_vec:
2789        insn = punpckh_insn[vece];
2790        goto gen_simd;
2791    case INDEX_op_x86_packss_vec:
2792        insn = packss_insn[vece];
2793        goto gen_simd;
2794    case INDEX_op_x86_packus_vec:
2795        insn = packus_insn[vece];
2796        goto gen_simd;
2797#if TCG_TARGET_REG_BITS == 32
2798    case INDEX_op_dup2_vec:
2799        /* First merge the two 32-bit inputs to a single 64-bit element. */
2800        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
2801        /* Then replicate the 64-bit elements across the rest of the vector. */
2802        if (type != TCG_TYPE_V64) {
2803            tcg_out_dup_vec(s, type, MO_64, a0, a0);
2804        }
2805        break;
2806#endif
2807    case INDEX_op_abs_vec:
2808        insn = abs_insn[vece];
2809        a2 = a1;
2810        a1 = 0;
2811        goto gen_simd;
2812    gen_simd:
2813        tcg_debug_assert(insn != OPC_UD2);
2814        if (type == TCG_TYPE_V256) {
2815            insn |= P_VEXL;
2816        }
2817        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2818        break;
2819
2820    case INDEX_op_cmp_vec:
2821        sub = args[3];
2822        if (sub == TCG_COND_EQ) {
2823            insn = cmpeq_insn[vece];
2824        } else if (sub == TCG_COND_GT) {
2825            insn = cmpgt_insn[vece];
2826        } else {
2827            g_assert_not_reached();
2828        }
2829        goto gen_simd;
2830
2831    case INDEX_op_andc_vec:
2832        insn = OPC_PANDN;
2833        if (type == TCG_TYPE_V256) {
2834            insn |= P_VEXL;
2835        }
2836        tcg_out_vex_modrm(s, insn, a0, a2, a1);
2837        break;
2838
2839    case INDEX_op_shli_vec:
2840        sub = 6;
2841        goto gen_shift;
2842    case INDEX_op_shri_vec:
2843        sub = 2;
2844        goto gen_shift;
2845    case INDEX_op_sari_vec:
2846        tcg_debug_assert(vece != MO_64);
2847        sub = 4;
2848    gen_shift:
2849        tcg_debug_assert(vece != MO_8);
2850        insn = shift_imm_insn[vece];
2851        if (type == TCG_TYPE_V256) {
2852            insn |= P_VEXL;
2853        }
2854        tcg_out_vex_modrm(s, insn, sub, a0, a1);
2855        tcg_out8(s, a2);
2856        break;
2857
2858    case INDEX_op_ld_vec:
2859        tcg_out_ld(s, type, a0, a1, a2);
2860        break;
2861    case INDEX_op_st_vec:
2862        tcg_out_st(s, type, a0, a1, a2);
2863        break;
2864    case INDEX_op_dupm_vec:
2865        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2866        break;
2867
2868    case INDEX_op_x86_shufps_vec:
2869        insn = OPC_SHUFPS;
2870        sub = args[3];
2871        goto gen_simd_imm8;
2872    case INDEX_op_x86_blend_vec:
2873        if (vece == MO_16) {
2874            insn = OPC_PBLENDW;
2875        } else if (vece == MO_32) {
2876            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2877        } else {
2878            g_assert_not_reached();
2879        }
2880        sub = args[3];
2881        goto gen_simd_imm8;
2882    case INDEX_op_x86_vperm2i128_vec:
2883        insn = OPC_VPERM2I128;
2884        sub = args[3];
2885        goto gen_simd_imm8;
2886    gen_simd_imm8:
2887        if (type == TCG_TYPE_V256) {
2888            insn |= P_VEXL;
2889        }
2890        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2891        tcg_out8(s, sub);
2892        break;
2893
2894    case INDEX_op_x86_vpblendvb_vec:
2895        insn = OPC_VPBLENDVB;
2896        if (type == TCG_TYPE_V256) {
2897            insn |= P_VEXL;
2898        }
2899        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2900        tcg_out8(s, args[3] << 4);
2901        break;
2902
2903    case INDEX_op_x86_psrldq_vec:
2904        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2905        tcg_out8(s, a2);
2906        break;
2907
2908    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
2909    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
2910    default:
2911        g_assert_not_reached();
2912    }
2913}
2914
2915static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
2916{
2917    switch (op) {
2918    case INDEX_op_goto_ptr:
2919        return C_O0_I1(r);
2920
2921    case INDEX_op_ld8u_i32:
2922    case INDEX_op_ld8u_i64:
2923    case INDEX_op_ld8s_i32:
2924    case INDEX_op_ld8s_i64:
2925    case INDEX_op_ld16u_i32:
2926    case INDEX_op_ld16u_i64:
2927    case INDEX_op_ld16s_i32:
2928    case INDEX_op_ld16s_i64:
2929    case INDEX_op_ld_i32:
2930    case INDEX_op_ld32u_i64:
2931    case INDEX_op_ld32s_i64:
2932    case INDEX_op_ld_i64:
2933        return C_O1_I1(r, r);
2934
2935    case INDEX_op_st8_i32:
2936    case INDEX_op_st8_i64:
2937        return C_O0_I2(qi, r);
2938
2939    case INDEX_op_st16_i32:
2940    case INDEX_op_st16_i64:
2941    case INDEX_op_st_i32:
2942    case INDEX_op_st32_i64:
2943        return C_O0_I2(ri, r);
2944
2945    case INDEX_op_st_i64:
2946        return C_O0_I2(re, r);
2947
2948    case INDEX_op_add_i32:
2949    case INDEX_op_add_i64:
2950        return C_O1_I2(r, r, re);
2951
2952    case INDEX_op_sub_i32:
2953    case INDEX_op_sub_i64:
2954    case INDEX_op_mul_i32:
2955    case INDEX_op_mul_i64:
2956    case INDEX_op_or_i32:
2957    case INDEX_op_or_i64:
2958    case INDEX_op_xor_i32:
2959    case INDEX_op_xor_i64:
2960        return C_O1_I2(r, 0, re);
2961
2962    case INDEX_op_and_i32:
2963    case INDEX_op_and_i64:
2964        return C_O1_I2(r, 0, reZ);
2965
2966    case INDEX_op_andc_i32:
2967    case INDEX_op_andc_i64:
2968        return C_O1_I2(r, r, rI);
2969
2970    case INDEX_op_shl_i32:
2971    case INDEX_op_shl_i64:
2972    case INDEX_op_shr_i32:
2973    case INDEX_op_shr_i64:
2974    case INDEX_op_sar_i32:
2975    case INDEX_op_sar_i64:
2976        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
2977
2978    case INDEX_op_rotl_i32:
2979    case INDEX_op_rotl_i64:
2980    case INDEX_op_rotr_i32:
2981    case INDEX_op_rotr_i64:
2982        return C_O1_I2(r, 0, ci);
2983
2984    case INDEX_op_brcond_i32:
2985    case INDEX_op_brcond_i64:
2986        return C_O0_I2(r, re);
2987
2988    case INDEX_op_bswap16_i32:
2989    case INDEX_op_bswap16_i64:
2990    case INDEX_op_bswap32_i32:
2991    case INDEX_op_bswap32_i64:
2992    case INDEX_op_bswap64_i64:
2993    case INDEX_op_neg_i32:
2994    case INDEX_op_neg_i64:
2995    case INDEX_op_not_i32:
2996    case INDEX_op_not_i64:
2997    case INDEX_op_extrh_i64_i32:
2998        return C_O1_I1(r, 0);
2999
3000    case INDEX_op_ext8s_i32:
3001    case INDEX_op_ext8s_i64:
3002    case INDEX_op_ext8u_i32:
3003    case INDEX_op_ext8u_i64:
3004        return C_O1_I1(r, q);
3005
3006    case INDEX_op_ext16s_i32:
3007    case INDEX_op_ext16s_i64:
3008    case INDEX_op_ext16u_i32:
3009    case INDEX_op_ext16u_i64:
3010    case INDEX_op_ext32s_i64:
3011    case INDEX_op_ext32u_i64:
3012    case INDEX_op_ext_i32_i64:
3013    case INDEX_op_extu_i32_i64:
3014    case INDEX_op_extrl_i64_i32:
3015    case INDEX_op_extract_i32:
3016    case INDEX_op_extract_i64:
3017    case INDEX_op_sextract_i32:
3018    case INDEX_op_ctpop_i32:
3019    case INDEX_op_ctpop_i64:
3020        return C_O1_I1(r, r);
3021
3022    case INDEX_op_extract2_i32:
3023    case INDEX_op_extract2_i64:
3024        return C_O1_I2(r, 0, r);
3025
3026    case INDEX_op_deposit_i32:
3027    case INDEX_op_deposit_i64:
3028        return C_O1_I2(Q, 0, Q);
3029
3030    case INDEX_op_setcond_i32:
3031    case INDEX_op_setcond_i64:
3032        return C_O1_I2(q, r, re);
3033
3034    case INDEX_op_movcond_i32:
3035    case INDEX_op_movcond_i64:
3036        return C_O1_I4(r, r, re, r, 0);
3037
3038    case INDEX_op_div2_i32:
3039    case INDEX_op_div2_i64:
3040    case INDEX_op_divu2_i32:
3041    case INDEX_op_divu2_i64:
3042        return C_O2_I3(a, d, 0, 1, r);
3043
3044    case INDEX_op_mulu2_i32:
3045    case INDEX_op_mulu2_i64:
3046    case INDEX_op_muls2_i32:
3047    case INDEX_op_muls2_i64:
3048        return C_O2_I2(a, d, a, r);
3049
3050    case INDEX_op_add2_i32:
3051    case INDEX_op_add2_i64:
3052    case INDEX_op_sub2_i32:
3053    case INDEX_op_sub2_i64:
3054        return C_O2_I4(r, r, 0, 1, re, re);
3055
3056    case INDEX_op_ctz_i32:
3057    case INDEX_op_ctz_i64:
3058        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3059
3060    case INDEX_op_clz_i32:
3061    case INDEX_op_clz_i64:
3062        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3063
3064    case INDEX_op_qemu_ld_i32:
3065        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3066                ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
3067
3068    case INDEX_op_qemu_st_i32:
3069        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3070                ? C_O0_I2(L, L) : C_O0_I3(L, L, L));
3071    case INDEX_op_qemu_st8_i32:
3072        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3073                ? C_O0_I2(s, L) : C_O0_I3(s, L, L));
3074
3075    case INDEX_op_qemu_ld_i64:
3076        return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
3077                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
3078                : C_O2_I2(r, r, L, L));
3079
3080    case INDEX_op_qemu_st_i64:
3081        return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L)
3082                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L)
3083                : C_O0_I4(L, L, L, L));
3084
3085    case INDEX_op_brcond2_i32:
3086        return C_O0_I4(r, r, ri, ri);
3087
3088    case INDEX_op_setcond2_i32:
3089        return C_O1_I4(r, r, r, ri, ri);
3090
3091    case INDEX_op_ld_vec:
3092    case INDEX_op_dupm_vec:
3093        return C_O1_I1(x, r);
3094
3095    case INDEX_op_st_vec:
3096        return C_O0_I2(x, r);
3097
3098    case INDEX_op_add_vec:
3099    case INDEX_op_sub_vec:
3100    case INDEX_op_mul_vec:
3101    case INDEX_op_and_vec:
3102    case INDEX_op_or_vec:
3103    case INDEX_op_xor_vec:
3104    case INDEX_op_andc_vec:
3105    case INDEX_op_ssadd_vec:
3106    case INDEX_op_usadd_vec:
3107    case INDEX_op_sssub_vec:
3108    case INDEX_op_ussub_vec:
3109    case INDEX_op_smin_vec:
3110    case INDEX_op_umin_vec:
3111    case INDEX_op_smax_vec:
3112    case INDEX_op_umax_vec:
3113    case INDEX_op_shlv_vec:
3114    case INDEX_op_shrv_vec:
3115    case INDEX_op_sarv_vec:
3116    case INDEX_op_shls_vec:
3117    case INDEX_op_shrs_vec:
3118    case INDEX_op_sars_vec:
3119    case INDEX_op_rotls_vec:
3120    case INDEX_op_cmp_vec:
3121    case INDEX_op_x86_shufps_vec:
3122    case INDEX_op_x86_blend_vec:
3123    case INDEX_op_x86_packss_vec:
3124    case INDEX_op_x86_packus_vec:
3125    case INDEX_op_x86_vperm2i128_vec:
3126    case INDEX_op_x86_punpckl_vec:
3127    case INDEX_op_x86_punpckh_vec:
3128#if TCG_TARGET_REG_BITS == 32
3129    case INDEX_op_dup2_vec:
3130#endif
3131        return C_O1_I2(x, x, x);
3132
3133    case INDEX_op_abs_vec:
3134    case INDEX_op_dup_vec:
3135    case INDEX_op_shli_vec:
3136    case INDEX_op_shri_vec:
3137    case INDEX_op_sari_vec:
3138    case INDEX_op_x86_psrldq_vec:
3139        return C_O1_I1(x, x);
3140
3141    case INDEX_op_x86_vpblendvb_vec:
3142        return C_O1_I3(x, x, x, x);
3143
3144    default:
3145        g_assert_not_reached();
3146    }
3147}
3148
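/*
 * Per the usual TCG convention, the return value below is 1 if the vector
 * operation is supported directly, 0 if it is not supported at all, and
 * -1 if it can be supported via expansion (see the expand_vec_* helpers).
 */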
3149int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3150{
3151    switch (opc) {
3152    case INDEX_op_add_vec:
3153    case INDEX_op_sub_vec:
3154    case INDEX_op_and_vec:
3155    case INDEX_op_or_vec:
3156    case INDEX_op_xor_vec:
3157    case INDEX_op_andc_vec:
3158        return 1;
3159    case INDEX_op_rotli_vec:
3160    case INDEX_op_cmp_vec:
3161    case INDEX_op_cmpsel_vec:
3162        return -1;
3163
3164    case INDEX_op_shli_vec:
3165    case INDEX_op_shri_vec:
3166        /* We must expand the operation for MO_8.  */
3167        return vece == MO_8 ? -1 : 1;
3168
3169    case INDEX_op_sari_vec:
3170        /* We must expand the operation for MO_8.  */
3171        if (vece == MO_8) {
3172            return -1;
3173        }
3174        /* We can emulate this for MO_64, but it does not pay off
3175           unless we're producing at least 4 values.  */
3176        if (vece == MO_64) {
3177            return type >= TCG_TYPE_V256 ? -1 : 0;
3178        }
3179        return 1;
3180
3181    case INDEX_op_shls_vec:
3182    case INDEX_op_shrs_vec:
3183        return vece >= MO_16;
3184    case INDEX_op_sars_vec:
3185        return vece >= MO_16 && vece <= MO_32;
3186    case INDEX_op_rotls_vec:
3187        return vece >= MO_16 ? -1 : 0;
3188
3189    case INDEX_op_shlv_vec:
3190    case INDEX_op_shrv_vec:
3191        return have_avx2 && vece >= MO_32;
3192    case INDEX_op_sarv_vec:
3193        return have_avx2 && vece == MO_32;
3194    case INDEX_op_rotlv_vec:
3195    case INDEX_op_rotrv_vec:
3196        return have_avx2 && vece >= MO_32 ? -1 : 0;
3197
3198    case INDEX_op_mul_vec:
3199        if (vece == MO_8) {
3200            /* We can expand the operation for MO_8.  */
3201            return -1;
3202        }
3203        if (vece == MO_64) {
3204            return 0;
3205        }
3206        return 1;
3207
3208    case INDEX_op_ssadd_vec:
3209    case INDEX_op_usadd_vec:
3210    case INDEX_op_sssub_vec:
3211    case INDEX_op_ussub_vec:
3212        return vece <= MO_16;
3213    case INDEX_op_smin_vec:
3214    case INDEX_op_smax_vec:
3215    case INDEX_op_umin_vec:
3216    case INDEX_op_umax_vec:
3217    case INDEX_op_abs_vec:
3218        return vece <= MO_32;
3219
3220    default:
3221        return 0;
3222    }
3223}
3224
3225static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3226                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3227{
3228    TCGv_vec t1, t2;
3229
3230    tcg_debug_assert(vece == MO_8);
3231
3232    t1 = tcg_temp_new_vec(type);
3233    t2 = tcg_temp_new_vec(type);
3234
3235    /*
3236     * Unpack to W, shift, and repack.  Tricky bits:
3237     * (1) Use punpck*bw x,x to produce DDCCBBAA,
3238     *     i.e. duplicate the byte into the other half of the 16-bit lane.
3239     * (2) For right-shift, add 8 so that the high half of the lane
3240     *     becomes zero.  For left-shift and left-rotate, we must
3241     *     shift up and down again.
3242     * (3) Step 2 leaves the high half zero, so that PACKUSWB
3243     *     (pack with unsigned saturation) does not modify
3244     *     the quantity.
3245     */
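    /*
     * For illustration, a logical right shift of the byte 0xb3 by 3
     * (opc == INDEX_op_shri_vec, imm == 3) proceeds as:
     *   punpcklbw            -> 16-bit lane 0xb3b3
     *   shift right by 3 + 8 -> 0x0016
     *   packuswb             -> byte 0x16 == 0xb3 >> 3
     * For rotates, the duplicated low copy supplies the wrap-around bits.
     */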
3246    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3247              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3248    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3249              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3250
3251    if (opc != INDEX_op_rotli_vec) {
3252        imm += 8;
3253    }
3254    if (opc == INDEX_op_shri_vec) {
3255        tcg_gen_shri_vec(MO_16, t1, t1, imm);
3256        tcg_gen_shri_vec(MO_16, t2, t2, imm);
3257    } else {
3258        tcg_gen_shli_vec(MO_16, t1, t1, imm);
3259        tcg_gen_shli_vec(MO_16, t2, t2, imm);
3260        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3261        tcg_gen_shri_vec(MO_16, t2, t2, 8);
3262    }
3263
3264    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3265              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3266    tcg_temp_free_vec(t1);
3267    tcg_temp_free_vec(t2);
3268}
3269
3270static void expand_vec_sari(TCGType type, unsigned vece,
3271                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3272{
3273    TCGv_vec t1, t2;
3274
3275    switch (vece) {
3276    case MO_8:
3277        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3278        t1 = tcg_temp_new_vec(type);
3279        t2 = tcg_temp_new_vec(type);
3280        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3281                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3282        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3283                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3284        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3285        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3286        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3287                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3288        tcg_temp_free_vec(t1);
3289        tcg_temp_free_vec(t2);
3290        break;
3291
3292    case MO_64:
3293        if (imm <= 32) {
3294            /*
3295             * We can emulate a small sign extend by performing an arithmetic
3296             * 32-bit shift and overwriting the high half of a 64-bit logical
3297             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3298             * does not, so we have to bound the smaller shift -- we get the
3299             * same result in the high half either way.
3300             */
3301            t1 = tcg_temp_new_vec(type);
3302            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3303            tcg_gen_shri_vec(MO_64, v0, v1, imm);
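            /*
             * The blend mask 0xaa selects the odd 32-bit elements, i.e.
             * the high half of each 64-bit lane, from t1 (the arithmetic
             * shift), keeping the logically shifted low halves in v0.
             */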
3304            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3305                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3306                      tcgv_vec_arg(t1), 0xaa);
3307            tcg_temp_free_vec(t1);
3308        } else {
3309            /* Otherwise, produce the sign bits with a compare against
3310             * zero, then shift and merge.
3311             */
3312            t1 = tcg_const_zeros_vec(type);
3313            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3314            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3315            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3316            tcg_gen_or_vec(MO_64, v0, v0, t1);
3317            tcg_temp_free_vec(t1);
3318        }
3319        break;
3320
3321    default:
3322        g_assert_not_reached();
3323    }
3324}
3325
3326static void expand_vec_rotli(TCGType type, unsigned vece,
3327                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3328{
3329    TCGv_vec t;
3330
3331    if (vece == MO_8) {
3332        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3333        return;
3334    }
3335
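    /* v0 = (v1 << imm) | (v1 >> ((8 << vece) - imm)), i.e. rotate left.  */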
3336    t = tcg_temp_new_vec(type);
3337    tcg_gen_shli_vec(vece, t, v1, imm);
3338    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3339    tcg_gen_or_vec(vece, v0, v0, t);
3340    tcg_temp_free_vec(t);
3341}
3342
3343static void expand_vec_rotls(TCGType type, unsigned vece,
3344                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3345{
3346    TCGv_i32 rsh;
3347    TCGv_vec t;
3348
3349    tcg_debug_assert(vece != MO_8);
3350
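    /* As for rotli, but with a scalar count: the right-shift count is
       (-lsh) & ((8 << vece) - 1), computed in a 32-bit temporary.  */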
3351    t = tcg_temp_new_vec(type);
3352    rsh = tcg_temp_new_i32();
3353
3354    tcg_gen_neg_i32(rsh, lsh);
3355    tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3356    tcg_gen_shls_vec(vece, t, v1, lsh);
3357    tcg_gen_shrs_vec(vece, v0, v1, rsh);
3358    tcg_gen_or_vec(vece, v0, v0, t);
3359    tcg_temp_free_vec(t);
3360    tcg_temp_free_i32(rsh);
3361}
3362
3363static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3364                            TCGv_vec v1, TCGv_vec sh, bool right)
3365{
3366    TCGv_vec t = tcg_temp_new_vec(type);
3367
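    /* Variable rotate: combine a shift by sh with the complementary
       shift by (width - sh) in the opposite direction.  */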
3368    tcg_gen_dupi_vec(vece, t, 8 << vece);
3369    tcg_gen_sub_vec(vece, t, t, sh);
3370    if (right) {
3371        tcg_gen_shlv_vec(vece, t, v1, t);
3372        tcg_gen_shrv_vec(vece, v0, v1, sh);
3373    } else {
3374        tcg_gen_shrv_vec(vece, t, v1, t);
3375        tcg_gen_shlv_vec(vece, v0, v1, sh);
3376    }
3377    tcg_gen_or_vec(vece, v0, v0, t);
3378    tcg_temp_free_vec(t);
3379}
3380
3381static void expand_vec_mul(TCGType type, unsigned vece,
3382                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3383{
3384    TCGv_vec t1, t2, t3, t4, zero;
3385
3386    tcg_debug_assert(vece == MO_8);
3387
3388    /*
3389     * Unpack v1 bytes to words, 0 | x.
3390     * Unpack v2 bytes to words, y | 0.
3391     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3392     * Shift logical right by 8 bits to clear the high 8 bits of each
3393     * lane before using an unsigned saturated pack.
3394     *
3395     * The difference between the V64, V128 and V256 cases is merely how
3396     * we distribute the expansion between temporaries.
3397     */
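    /*
     * For illustration, with x == 0x40 and y == 0x05 in one byte lane:
     *   t1 word = 0x0040, t2 word = 0x0500
     *   16-bit product = 0x4000, shifted right by 8 -> 0x0040
     * which is (0x40 * 0x05) & 0xff, as required for a modular 8-bit multiply.
     */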
3398    switch (type) {
3399    case TCG_TYPE_V64:
3400        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3401        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3402        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3403        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3404                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3405        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3406                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3407        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3408        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3409        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3410                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3411        tcg_temp_free_vec(t1);
3412        tcg_temp_free_vec(t2);
3413        break;
3414
3415    case TCG_TYPE_V128:
3416    case TCG_TYPE_V256:
3417        t1 = tcg_temp_new_vec(type);
3418        t2 = tcg_temp_new_vec(type);
3419        t3 = tcg_temp_new_vec(type);
3420        t4 = tcg_temp_new_vec(type);
3421        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3422        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3423                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3424        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3425                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3426        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3427                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3428        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3429                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3430        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3431        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3432        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3433        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3434        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3435                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3436        tcg_temp_free_vec(t1);
3437        tcg_temp_free_vec(t2);
3438        tcg_temp_free_vec(t3);
3439        tcg_temp_free_vec(t4);
3440        break;
3441
3442    default:
3443        g_assert_not_reached();
3444    }
3445}
3446
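/*
 * x86 vector compares (through AVX2) provide only equality and signed
 * greater-than.  Every other condition is synthesized from those two:
 * invert the result, swap the operands, reduce an unsigned ordering to
 * an equality test against umin/umax (element sizes up to 32 bits), or
 * bias both operands by the sign bit so that a signed compare applies
 * (the only option for 64-bit elements).  Returns true if the caller
 * still has to invert the result.
 */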
3447static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3448                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3449{
3450    enum {
3451        NEED_INV  = 1,
3452        NEED_SWAP = 2,
3453        NEED_BIAS = 4,
3454        NEED_UMIN = 8,
3455        NEED_UMAX = 16,
3456    };
3457    TCGv_vec t1, t2, t3;
3458    uint8_t fixup;
3459
3460    switch (cond) {
3461    case TCG_COND_EQ:
3462    case TCG_COND_GT:
3463        fixup = 0;
3464        break;
3465    case TCG_COND_NE:
3466    case TCG_COND_LE:
3467        fixup = NEED_INV;
3468        break;
3469    case TCG_COND_LT:
3470        fixup = NEED_SWAP;
3471        break;
3472    case TCG_COND_GE:
3473        fixup = NEED_SWAP | NEED_INV;
3474        break;
3475    case TCG_COND_LEU:
3476        if (vece <= MO_32) {
3477            fixup = NEED_UMIN;
3478        } else {
3479            fixup = NEED_BIAS | NEED_INV;
3480        }
3481        break;
3482    case TCG_COND_GTU:
3483        if (vece <= MO_32) {
3484            fixup = NEED_UMIN | NEED_INV;
3485        } else {
3486            fixup = NEED_BIAS;
3487        }
3488        break;
3489    case TCG_COND_GEU:
3490        if (vece <= MO_32) {
3491            fixup = NEED_UMAX;
3492        } else {
3493            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3494        }
3495        break;
3496    case TCG_COND_LTU:
3497        if (vece <= MO_32) {
3498            fixup = NEED_UMAX | NEED_INV;
3499        } else {
3500            fixup = NEED_BIAS | NEED_SWAP;
3501        }
3502        break;
3503    default:
3504        g_assert_not_reached();
3505    }
3506
3507    if (fixup & NEED_INV) {
3508        cond = tcg_invert_cond(cond);
3509    }
3510    if (fixup & NEED_SWAP) {
3511        t1 = v1, v1 = v2, v2 = t1;
3512        cond = tcg_swap_cond(cond);
3513    }
3514
3515    t1 = t2 = NULL;
3516    if (fixup & (NEED_UMIN | NEED_UMAX)) {
3517        t1 = tcg_temp_new_vec(type);
3518        if (fixup & NEED_UMIN) {
3519            tcg_gen_umin_vec(vece, t1, v1, v2);
3520        } else {
3521            tcg_gen_umax_vec(vece, t1, v1, v2);
3522        }
3523        v2 = t1;
3524        cond = TCG_COND_EQ;
3525    } else if (fixup & NEED_BIAS) {
3526        t1 = tcg_temp_new_vec(type);
3527        t2 = tcg_temp_new_vec(type);
3528        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3529        tcg_gen_sub_vec(vece, t1, v1, t3);
3530        tcg_gen_sub_vec(vece, t2, v2, t3);
3531        v1 = t1;
3532        v2 = t2;
3533        cond = tcg_signed_cond(cond);
3534    }
3535
3536    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3537    /* Expand directly; do not recurse.  */
3538    vec_gen_4(INDEX_op_cmp_vec, type, vece,
3539              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3540
3541    if (t1) {
3542        tcg_temp_free_vec(t1);
3543        if (t2) {
3544            tcg_temp_free_vec(t2);
3545        }
3546    }
3547    return fixup & NEED_INV;
3548}
3549
3550static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3551                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3552{
3553    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3554        tcg_gen_not_vec(vece, v0, v0);
3555    }
3556}
3557
3558static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3559                              TCGv_vec c1, TCGv_vec c2,
3560                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3561{
3562    TCGv_vec t = tcg_temp_new_vec(type);
3563
3564    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3565        /* The mask is inverted; compensate by swapping v3/v4.  */
3566        TCGv_vec x;
3567        x = v3, v3 = v4, v4 = x;
3568    }
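    /* vpblendvb selects, byte by byte, from v3 where the top bit of the
       corresponding byte of t is set and from v4 where it is clear;
       the compare above left each byte of t as 0x00 or 0xff.  */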
3569    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3570              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3571              tcgv_vec_arg(v3), tcgv_vec_arg(t));
3572    tcg_temp_free_vec(t);
3573}
3574
3575void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3576                       TCGArg a0, ...)
3577{
3578    va_list va;
3579    TCGArg a2;
3580    TCGv_vec v0, v1, v2, v3, v4;
3581
3582    va_start(va, a0);
3583    v0 = temp_tcgv_vec(arg_temp(a0));
3584    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3585    a2 = va_arg(va, TCGArg);
3586
3587    switch (opc) {
3588    case INDEX_op_shli_vec:
3589    case INDEX_op_shri_vec:
3590        expand_vec_shi(type, vece, opc, v0, v1, a2);
3591        break;
3592
3593    case INDEX_op_sari_vec:
3594        expand_vec_sari(type, vece, v0, v1, a2);
3595        break;
3596
3597    case INDEX_op_rotli_vec:
3598        expand_vec_rotli(type, vece, v0, v1, a2);
3599        break;
3600
3601    case INDEX_op_rotls_vec:
3602        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3603        break;
3604
3605    case INDEX_op_rotlv_vec:
3606        v2 = temp_tcgv_vec(arg_temp(a2));
3607        expand_vec_rotv(type, vece, v0, v1, v2, false);
3608        break;
3609    case INDEX_op_rotrv_vec:
3610        v2 = temp_tcgv_vec(arg_temp(a2));
3611        expand_vec_rotv(type, vece, v0, v1, v2, true);
3612        break;
3613
3614    case INDEX_op_mul_vec:
3615        v2 = temp_tcgv_vec(arg_temp(a2));
3616        expand_vec_mul(type, vece, v0, v1, v2);
3617        break;
3618
3619    case INDEX_op_cmp_vec:
3620        v2 = temp_tcgv_vec(arg_temp(a2));
3621        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3622        break;
3623
3624    case INDEX_op_cmpsel_vec:
3625        v2 = temp_tcgv_vec(arg_temp(a2));
3626        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3627        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3628        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3629        break;
3630
3631    default:
3632        break;
3633    }
3634
3635    va_end(va);
3636}
3637
3638static const int tcg_target_callee_save_regs[] = {
3639#if TCG_TARGET_REG_BITS == 64
3640    TCG_REG_RBP,
3641    TCG_REG_RBX,
3642#if defined(_WIN64)
3643    TCG_REG_RDI,
3644    TCG_REG_RSI,
3645#endif
3646    TCG_REG_R12,
3647    TCG_REG_R13,
3648    TCG_REG_R14, /* Currently used for the global env. */
3649    TCG_REG_R15,
3650#else
3651    TCG_REG_EBP, /* Currently used for the global env. */
3652    TCG_REG_EBX,
3653    TCG_REG_ESI,
3654    TCG_REG_EDI,
3655#endif
3656};
3657
3658/* Compute frame size via macros, to share between tcg_target_qemu_prologue
3659   and tcg_register_jit.  */
3660
3661#define PUSH_SIZE \
3662    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3663     * (TCG_TARGET_REG_BITS / 8))
3664
3665#define FRAME_SIZE \
3666    ((PUSH_SIZE \
3667      + TCG_STATIC_CALL_ARGS_SIZE \
3668      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3669      + TCG_TARGET_STACK_ALIGN - 1) \
3670     & ~(TCG_TARGET_STACK_ALIGN - 1))
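
/*
 * For example, on a non-Windows 64-bit host (six callee-saved registers,
 * and assuming TCG_STATIC_CALL_ARGS_SIZE == 128 and
 * CPU_TEMP_BUF_NLONGS == 128):
 *   PUSH_SIZE  = (1 + 6) * 8 = 56 (return address plus saved registers)
 *   FRAME_SIZE = (56 + 128 + 128 * 8 + 15) & ~15 = 1216
 */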
3671
3672/* Generate global QEMU prologue and epilogue code */
3673static void tcg_target_qemu_prologue(TCGContext *s)
3674{
3675    int i, stack_addend;
3676
3677    /* TB prologue */
3678
3679    /* Reserve some stack space, also for TCG temps.  */
3680    stack_addend = FRAME_SIZE - PUSH_SIZE;
3681    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3682                  CPU_TEMP_BUF_NLONGS * sizeof(long));
3683
3684    /* Save all callee saved registers.  */
3685    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3686        tcg_out_push(s, tcg_target_callee_save_regs[i]);
3687    }
3688
3689#if TCG_TARGET_REG_BITS == 32
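    /*
     * In 32-bit mode both arguments of tcg_qemu_tb_exec (env, tb) are
     * passed on the stack: after pushing the callee-saved registers,
     * env sits (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4 bytes
     * above %esp and tb one word higher.  env is loaded before %esp is
     * lowered by stack_addend; the indirect jump through tb happens
     * afterwards, hence the extra stack_addend in its displacement.
     */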
3690    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3691               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3692    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3693    /* jmp *tb.  */
3694    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3695                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3696                         + stack_addend);
3697#else
3698# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
3699    if (guest_base) {
3700        int seg = setup_guest_base_seg();
3701        if (seg != 0) {
3702            x86_guest_base_seg = seg;
3703        } else if (guest_base == (int32_t)guest_base) {
3704            x86_guest_base_offset = guest_base;
3705        } else {
3706            /* Choose R12 because, as a base, it requires a SIB byte. */
3707            x86_guest_base_index = TCG_REG_R12;
3708            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
3709            tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
3710        }
3711    }
3712# endif
3713    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3714    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3715    /* jmp *tb.  */
3716    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3717#endif
3718
3719    /*
3720     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3721     * and fall through to the rest of the epilogue.
3722     */
3723    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
3724    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3725
3726    /* TB epilogue */
3727    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
3728
3729    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3730
3731    if (have_avx2) {
3732        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3733    }
3734    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3735        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3736    }
3737    tcg_out_opc(s, OPC_RET, 0, 0, 0);
3738}
3739
3740static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3741{
3742    memset(p, 0x90, count);
3743}
3744
3745static void tcg_target_init(TCGContext *s)
3746{
3747#ifdef CONFIG_CPUID_H
3748    unsigned a, b, c, d, b7 = 0;
3749    int max = __get_cpuid_max(0, 0);
3750
3751    if (max >= 7) {
3752        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
3753        __cpuid_count(7, 0, a, b7, c, d);
3754        have_bmi1 = (b7 & bit_BMI) != 0;
3755        have_bmi2 = (b7 & bit_BMI2) != 0;
3756    }
3757
3758    if (max >= 1) {
3759        __cpuid(1, a, b, c, d);
3760#ifndef have_cmov
3761        /* For 32-bit, it is almost certain that the host supports cmov,
3762           but we still need to check.  If cmov is not available, we'll
3763           use a small forward branch instead.  */
3764        have_cmov = (d & bit_CMOV) != 0;
3765#endif
3766
3767        /* MOVBE is only available on some CPUs (e.g. Intel Atom and
3768           Haswell), so we need to probe for it.  */
3769        have_movbe = (c & bit_MOVBE) != 0;
3770        have_popcnt = (c & bit_POPCNT) != 0;
3771
3772        /* There are a number of things we must check before we can be
3773           sure that using AVX will not raise an invalid-opcode fault.  */
3774        if (c & bit_OSXSAVE) {
3775            unsigned xcrl, xcrh;
3776            /* The xgetbv instruction is not known to older assemblers,
3777             * so we encode it manually.
3778             */
3779            asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
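            /* XCR0 bit 1 (SSE state) and bit 2 (AVX state) must both be
               enabled by the OS before AVX instructions may be used.  */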
3780            if ((xcrl & 6) == 6) {
3781                have_avx1 = (c & bit_AVX) != 0;
3782                have_avx2 = (b7 & bit_AVX2) != 0;
3783            }
3784        }
3785    }
3786
3787    max = __get_cpuid_max(0x80000000, 0);
3788    if (max >= 1) {
3789        __cpuid(0x80000001, a, b, c, d);
3790        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
3791        have_lzcnt = (c & bit_LZCNT) != 0;
3792    }
3793#endif /* CONFIG_CPUID_H */
3794
3795    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3796    if (TCG_TARGET_REG_BITS == 64) {
3797        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3798    }
3799    if (have_avx1) {
3800        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3801        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3802    }
3803    if (have_avx2) {
3804        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3805    }
3806
3807    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3808    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3809    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3810    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3811    if (TCG_TARGET_REG_BITS == 64) {
3812#if !defined(_WIN64)
3813        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3814        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3815#endif
3816        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3817        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3818        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3819        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3820    }
3821
3822    s->reserved_regs = 0;
3823    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3824}
3825
3826typedef struct {
3827    DebugFrameHeader h;
3828    uint8_t fde_def_cfa[4];
3829    uint8_t fde_reg_ofs[14];
3830} DebugFrame;
3831
3832/* We're expecting a 2-byte uleb128-encoded value.  */
3833QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
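/*
 * Encoding reminder for the tables below: in fde_def_cfa, 12 is
 * DW_CFA_def_cfa, followed by a uleb128 register number and the uleb128
 * frame size.  In fde_reg_ofs, each pair is DW_CFA_offset (0x80 | DWARF
 * register number) followed by the register's offset from the CFA as a
 * uleb128, factored by data_align (-8 for 64-bit, -4 for 32-bit).
 */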
3834
3835#if !defined(__ELF__)
3836    /* Host machine without ELF. */
3837#elif TCG_TARGET_REG_BITS == 64
3838#define ELF_HOST_MACHINE EM_X86_64
3839static const DebugFrame debug_frame = {
3840    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3841    .h.cie.id = -1,
3842    .h.cie.version = 1,
3843    .h.cie.code_align = 1,
3844    .h.cie.data_align = 0x78,             /* sleb128 -8 */
3845    .h.cie.return_column = 16,
3846
3847    /* Total FDE size does not include the "len" member.  */
3848    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3849
3850    .fde_def_cfa = {
3851        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
3852        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3853        (FRAME_SIZE >> 7)
3854    },
3855    .fde_reg_ofs = {
3856        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
3857        /* The following ordering must match tcg_target_callee_save_regs.  */
3858        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
3859        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
3860        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
3861        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
3862        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
3863        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
3864    }
3865};
3866#else
3867#define ELF_HOST_MACHINE EM_386
3868static const DebugFrame debug_frame = {
3869    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3870    .h.cie.id = -1,
3871    .h.cie.version = 1,
3872    .h.cie.code_align = 1,
3873    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
3874    .h.cie.return_column = 8,
3875
3876    /* Total FDE size does not include the "len" member.  */
3877    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3878
3879    .fde_def_cfa = {
3880        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
3881        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3882        (FRAME_SIZE >> 7)
3883    },
3884    .fde_reg_ofs = {
3885        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
3886        /* The following ordering must match tcg_target_callee_save_regs.  */
3887        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
3888        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
3889        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
3890        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
3891    }
3892};
3893#endif
3894
3895#if defined(ELF_HOST_MACHINE)
3896void tcg_register_jit(const void *buf, size_t buf_size)
3897{
3898    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
3899}
3900#endif
3901