xref: /openbmc/qemu/tcg/i386/tcg-target.c.inc (revision 67abc3dd)
1/*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#include "../tcg-pool.c.inc"
26
#ifdef CONFIG_DEBUG_TCG
/* Printable names of the host registers, listed in register-number order.
   Only built when CONFIG_DEBUG_TCG is defined.  Entries 0-7 depend on the
   host word size, entries 8-15 are the REX-extended general registers, and
   the vector registers follow from index 16.  The names for xmm8-xmm15
   are only present on a 64-bit host.  */
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif
42
/* Preference order in which host registers are offered for allocation
   (as the array name indicates).  General registers come first, then the
   vector registers.  Neither the stack pointer (%rsp/%esp) nor, on a
   64-bit host, any register absent from this list is ever handed out.  */
static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI treats xmm6-xmm15 as callee-saved (nonvolatile), and
       we do not save any of them.  Therefore on Win64 only xmm0-xmm5 may
       be allocated; the registers below are listed for other hosts only.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};
92
/* Registers used to pass integer call arguments, in argument order.
   On a 64-bit host this is rcx, rdx, r8, r9 when _WIN64 is defined and
   rdi, rsi, rdx, rcx, r8, r9 otherwise.  On a 32-bit host the list is
   empty.  */
static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};
110
/* Registers that carry call results (going by the "oarg" name): EAX
   always, with EDX added on a 32-bit host -- presumably so that a 64-bit
   result can be returned as a register pair; confirm against the users
   of this table.  */
static const int tcg_target_call_oarg_regs[] = {
    TCG_REG_EAX,
#if TCG_TARGET_REG_BITS == 32
    TCG_REG_EDX
#endif
};
117
/* Constants we accept.  These are target-specific flag bits stored in
   TCGArgConstraint.ct by target_parse_constraint() and tested by
   tcg_target_const_match().  */
#define TCG_CT_CONST_S32 0x100  /* value equals its sign-extended low 32 bits */
#define TCG_CT_CONST_U32 0x200  /* value equals its zero-extended low 32 bits */
#define TCG_CT_CONST_I32 0x400  /* inverted value fits in a signed 32 bits */
#define TCG_CT_CONST_WSZ 0x800  /* value equals the operand width (32 or 64) */
123
/* Registers used with the L constraint (the qemu_ld/st address
   constraint), which are the first two argument registers on x86_64,
   and two call clobbered registers (EAX and EDX) on i386.  The L
   constraint removes both of them from the set of allowed registers.  */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif
134
135/* The host compiler should supply <cpuid.h> to enable runtime features
136   detection, as we're not going to go so far as our own inline assembly.
137   If not available, default values will be assumed.  */
138#if defined(CONFIG_CPUID_H)
139#include "qemu/cpuid.h"
140#endif
141
/* For 64-bit, we always know that CMOV is available.  On a 32-bit host
   it is a run-time flag when <cpuid.h> is usable and is assumed absent
   otherwise.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
#elif defined(CONFIG_CPUID_H)
static bool have_cmov;
#else
# define have_cmov 0
#endif

/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   them there.  Therefore we always define the variables, with external
   linkage, regardless of whether CONFIG_CPUID_H is defined.  */
bool have_bmi1;
bool have_popcnt;
bool have_avx1;
bool have_avx2;

/* Features used only within this file: run-time flags when <cpuid.h> is
   available, otherwise compile-time zero so that the code depending on
   them can be discarded.  */
#ifdef CONFIG_CPUID_H
static bool have_movbe;
static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_movbe 0
# define have_bmi2 0
# define have_lzcnt 0
#endif

/* NOTE(review): not referenced in the visible part of this file.  Going by
   the name, presumably the address that translated code returns to --
   confirm at the site that assigns it.  */
static tcg_insn_unit *tb_ret_addr;
169
170static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
171                        intptr_t value, intptr_t addend)
172{
173    value += addend;
174    switch(type) {
175    case R_386_PC32:
176        value -= (uintptr_t)code_ptr;
177        if (value != (int32_t)value) {
178            return false;
179        }
180        /* FALLTHRU */
181    case R_386_32:
182        tcg_patch32(code_ptr, value);
183        break;
184    case R_386_PC8:
185        value -= (uintptr_t)code_ptr;
186        if (value != (int8_t)value) {
187            return false;
188        }
189        tcg_patch8(code_ptr, value);
190        break;
191    default:
192        tcg_abort();
193    }
194    return true;
195}
196
/* Register-set bitmasks with one bit per register number: the general
   registers occupy the low bits and the vector registers start at bit 16.
   A 64-bit host has 16 of each, a 32-bit host has 8 of each.  */
#if TCG_TARGET_REG_BITS == 64
#define ALL_GENERAL_REGS   0x0000ffffu
#define ALL_VECTOR_REGS    0xffff0000u
#else
#define ALL_GENERAL_REGS   0x000000ffu
#define ALL_VECTOR_REGS    0x00ff0000u
#endif
204
205/* parse target specific constraints */
206static const char *target_parse_constraint(TCGArgConstraint *ct,
207                                           const char *ct_str, TCGType type)
208{
209    switch(*ct_str++) {
210    case 'a':
211        ct->ct |= TCG_CT_REG;
212        tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
213        break;
214    case 'b':
215        ct->ct |= TCG_CT_REG;
216        tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
217        break;
218    case 'c':
219        ct->ct |= TCG_CT_REG;
220        tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
221        break;
222    case 'd':
223        ct->ct |= TCG_CT_REG;
224        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
225        break;
226    case 'S':
227        ct->ct |= TCG_CT_REG;
228        tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
229        break;
230    case 'D':
231        ct->ct |= TCG_CT_REG;
232        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
233        break;
234    case 'q':
235        /* A register that can be used as a byte operand.  */
236        ct->ct |= TCG_CT_REG;
237        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
238        break;
239    case 'Q':
240        /* A register with an addressable second byte (e.g. %ah).  */
241        ct->ct |= TCG_CT_REG;
242        ct->u.regs = 0xf;
243        break;
244    case 'r':
245        /* A general register.  */
246        ct->ct |= TCG_CT_REG;
247        ct->u.regs |= ALL_GENERAL_REGS;
248        break;
249    case 'W':
250        /* With TZCNT/LZCNT, we can have operand-size as an input.  */
251        ct->ct |= TCG_CT_CONST_WSZ;
252        break;
253    case 'x':
254        /* A vector register.  */
255        ct->ct |= TCG_CT_REG;
256        ct->u.regs |= ALL_VECTOR_REGS;
257        break;
258
259        /* qemu_ld/st address constraint */
260    case 'L':
261        ct->ct |= TCG_CT_REG;
262        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
263        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
264        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
265        break;
266
267    case 'e':
268        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
269        break;
270    case 'Z':
271        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
272        break;
273    case 'I':
274        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
275        break;
276
277    default:
278        return NULL;
279    }
280    return ct_str;
281}
282
283/* test if a constant matches the constraint */
284static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
285                                         const TCGArgConstraint *arg_ct)
286{
287    int ct = arg_ct->ct;
288    if (ct & TCG_CT_CONST) {
289        return 1;
290    }
291    if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
292        return 1;
293    }
294    if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
295        return 1;
296    }
297    if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
298        return 1;
299    }
300    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
301        return 1;
302    }
303    return 0;
304}
305
/* The low three bits of a register number, i.e. the part that is stored
   in a ModRM or SIB field; bit 3 travels in the REX or VEX prefix.  */
# define LOWREGMASK(x)	((x) & 7)

/* Flag bits carried in an opcode value above its low 8 bits.  The low
   byte is the final opcode byte; the flags select which prefix and
   escape bytes tcg_out_opc() and tcg_out_vex_opc() emit ahead of it.
   The REX-related flags and P_GS are defined as 0 on a 32-bit host so
   that they simply drop out of the encoding.  */
#define P_EXT		0x100		/* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         0x1000          /* Set REX.W = 1 */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_REXW		0
# define P_REXB_R	0
# define P_REXB_RM	0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
326
/* Opcode values.  Each is the final opcode byte ORed with the P_* flags
   describing the prefix and escape bytes that must precede it.  Where a
   comment says "plus ...", the caller adds a condition code or an
   ARITH_* sub-operation to the value before emitting it.  */
#define OPC_ARITH_EvIz	(0x81)
#define OPC_ARITH_EvIb	(0x83)
#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP	(0xc8 | P_EXT)
#define OPC_CALL_Jz	(0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32	(0x48)
#define OPC_IMUL_GvEv	(0xaf | P_EXT)
#define OPC_IMUL_GvEvIb	(0x6b)
#define OPC_IMUL_GvEvIz	(0x69)
#define OPC_INC_r32	(0x40)
#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
#define OPC_JCC_short	(0x70)		/* ... plus condition code */
#define OPC_JMP_long	(0xe9)
#define OPC_JMP_short	(0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz	(0xc7)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL	(0xbe | P_EXT)
#define OPC_MOVSWL	(0xbf | P_EXT)
#define OPC_MOVSLQ	(0x63 | P_REXW)
#define OPC_MOVZBL	(0xb6 | P_EXT)
#define OPC_MOVZWL	(0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32	(0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32	(0x50)
#define OPC_PUSH_Iv	(0x68)
#define OPC_PUSH_Ib	(0x6a)
#define OPC_RET		(0xc3)
#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1	(0xd1)
#define OPC_SHIFT_Ib	(0xc1)
#define OPC_SHIFT_cl	(0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL	(0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_REXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_REXW)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_REXW)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32	(0x90)

/* Group opcodes: the operation is selected by an extension code (EXT3_*,
   EXT5_*, ...) rather than by the opcode byte alone.  */
#define OPC_GRP3_Ev	(0xf7)
#define OPC_GRP5	(0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
493
/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH_GvEv, where they are
   shifted left by 3 and combined with the opcode byte.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3_Ev.  */
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev	0
#define EXT5_DEC_Ev	1
#define EXT5_CALLN_Ev	2
#define EXT5_JMPN_Ev	4
525
/* Condition codes to be added to OPC_JCC_{long,short}.  The values 0x0
   through 0xf are the four-bit condition field of the instruction.
   JCC_JMP is not one of them; presumably it requests an unconditional
   jump instead -- confirm against the code that consumes it.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf
544
/* Map a TCG comparison condition to the x86 condition code that tests
   it.  Signed comparisons use the JL/JGE/JLE/JG codes and unsigned ones
   the JB/JAE/JBE/JA codes.  Conditions without an entry here are
   zero-initialised, which is the value of JCC_JO.  */
static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};
557
558#if TCG_TARGET_REG_BITS == 64
559static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
560{
561    int rex;
562
563    if (opc & P_GS) {
564        tcg_out8(s, 0x65);
565    }
566    if (opc & P_DATA16) {
567        /* We should never be asking for both 16 and 64-bit operation.  */
568        tcg_debug_assert((opc & P_REXW) == 0);
569        tcg_out8(s, 0x66);
570    }
571    if (opc & P_SIMDF3) {
572        tcg_out8(s, 0xf3);
573    } else if (opc & P_SIMDF2) {
574        tcg_out8(s, 0xf2);
575    }
576
577    rex = 0;
578    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
579    rex |= (r & 8) >> 1;                /* REX.R */
580    rex |= (x & 8) >> 2;                /* REX.X */
581    rex |= (rm & 8) >> 3;               /* REX.B */
582
583    /* P_REXB_{R,RM} indicates that the given register is the low byte.
584       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
585       as otherwise the encoding indicates %[abcd]h.  Note that the values
586       that are ORed in merely indicate that the REX byte must be present;
587       those bits get discarded in output.  */
588    rex |= opc & (r >= 4 ? P_REXB_R : 0);
589    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
590
591    if (rex) {
592        tcg_out8(s, (uint8_t)(rex | 0x40));
593    }
594
595    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
596        tcg_out8(s, 0x0f);
597        if (opc & P_EXT38) {
598            tcg_out8(s, 0x38);
599        } else if (opc & P_EXT3A) {
600            tcg_out8(s, 0x3a);
601        }
602    }
603
604    tcg_out8(s, opc);
605}
606#else
607static void tcg_out_opc(TCGContext *s, int opc)
608{
609    if (opc & P_DATA16) {
610        tcg_out8(s, 0x66);
611    }
612    if (opc & P_SIMDF3) {
613        tcg_out8(s, 0xf3);
614    } else if (opc & P_SIMDF2) {
615        tcg_out8(s, 0xf2);
616    }
617    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
618        tcg_out8(s, 0x0f);
619        if (opc & P_EXT38) {
620            tcg_out8(s, 0x38);
621        } else if (opc & P_EXT3A) {
622            tcg_out8(s, 0x3a);
623        }
624    }
625    tcg_out8(s, opc);
626}
627/* Discard the register arguments to tcg_out_opc early, so as not to penalize
628   the 32-bit compilation paths.  This method works with all versions of gcc,
629   whereas relying on optimization may not be able to exclude them.  */
630#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
631#endif
632
633static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
634{
635    tcg_out_opc(s, opc, r, rm, 0);
636    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
637}
638
639static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
640                            int rm, int index)
641{
642    int tmp;
643
644    /* Use the two byte form if possible, which cannot encode
645       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
646    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
647        && ((rm | index) & 8) == 0) {
648        /* Two byte VEX prefix.  */
649        tcg_out8(s, 0xc5);
650
651        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
652    } else {
653        /* Three byte VEX prefix.  */
654        tcg_out8(s, 0xc4);
655
656        /* VEX.m-mmmm */
657        if (opc & P_EXT3A) {
658            tmp = 3;
659        } else if (opc & P_EXT38) {
660            tmp = 2;
661        } else if (opc & P_EXT) {
662            tmp = 1;
663        } else {
664            g_assert_not_reached();
665        }
666        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
667        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
668        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
669        tcg_out8(s, tmp);
670
671        tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
672    }
673
674    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
675    /* VEX.pp */
676    if (opc & P_DATA16) {
677        tmp |= 1;                          /* 0x66 */
678    } else if (opc & P_SIMDF3) {
679        tmp |= 2;                          /* 0xf3 */
680    } else if (opc & P_SIMDF2) {
681        tmp |= 3;                          /* 0xf2 */
682    }
683    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
684    tcg_out8(s, tmp);
685    tcg_out8(s, opc);
686}
687
688static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
689{
690    tcg_out_vex_opc(s, opc, r, v, rm, 0);
691    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
692}
693
694/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
695   We handle either RM and INDEX missing with a negative value.  In 64-bit
696   mode for absolute addresses, ~RM is the size of the immediate operand
697   that will follow the instruction.  */
698
699static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
700                               int shift, intptr_t offset)
701{
702    int mod, len;
703
704    if (index < 0 && rm < 0) {
705        if (TCG_TARGET_REG_BITS == 64) {
706            /* Try for a rip-relative addressing mode.  This has replaced
707               the 32-bit-mode absolute addressing encoding.  */
708            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
709            intptr_t disp = offset - pc;
710            if (disp == (int32_t)disp) {
711                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
712                tcg_out32(s, disp);
713                return;
714            }
715
716            /* Try for an absolute address encoding.  This requires the
717               use of the MODRM+SIB encoding and is therefore larger than
718               rip-relative addressing.  */
719            if (offset == (int32_t)offset) {
720                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
721                tcg_out8(s, (4 << 3) | 5);
722                tcg_out32(s, offset);
723                return;
724            }
725
726            /* ??? The memory isn't directly addressable.  */
727            g_assert_not_reached();
728        } else {
729            /* Absolute address.  */
730            tcg_out8(s, (r << 3) | 5);
731            tcg_out32(s, offset);
732            return;
733        }
734    }
735
736    /* Find the length of the immediate addend.  Note that the encoding
737       that would be used for (%ebp) indicates absolute addressing.  */
738    if (rm < 0) {
739        mod = 0, len = 4, rm = 5;
740    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
741        mod = 0, len = 0;
742    } else if (offset == (int8_t)offset) {
743        mod = 0x40, len = 1;
744    } else {
745        mod = 0x80, len = 4;
746    }
747
748    /* Use a single byte MODRM format if possible.  Note that the encoding
749       that would be used for %esp is the escape to the two byte form.  */
750    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
751        /* Single byte MODRM format.  */
752        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
753    } else {
754        /* Two byte MODRM+SIB format.  */
755
756        /* Note that the encoding that would place %esp into the index
757           field indicates no index register.  In 64-bit mode, the REX.X
758           bit counts, so %r12 can be used as the index.  */
759        if (index < 0) {
760            index = 4;
761        } else {
762            tcg_debug_assert(index != TCG_REG_ESP);
763        }
764
765        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
766        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
767    }
768
769    if (len == 1) {
770        tcg_out8(s, offset);
771    } else if (len == 4) {
772        tcg_out32(s, offset);
773    }
774}
775
776static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
777                                     int index, int shift, intptr_t offset)
778{
779    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
780    tcg_out_sib_offset(s, r, rm, index, shift, offset);
781}
782
783static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
784                                         int rm, int index, int shift,
785                                         intptr_t offset)
786{
787    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
788    tcg_out_sib_offset(s, r, rm, index, shift, offset);
789}
790
791/* A simplification of the above with no index or shift.  */
792static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
793                                        int rm, intptr_t offset)
794{
795    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
796}
797
798static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
799                                            int v, int rm, intptr_t offset)
800{
801    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
802}
803
804/* Output an opcode with an expected reference to the constant pool.  */
805static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
806{
807    tcg_out_opc(s, opc, r, 0, 0);
808    /* Absolute for 32-bit, pc-relative for 64-bit.  */
809    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
810    tcg_out32(s, 0);
811}
812
813/* Output an opcode with an expected reference to the constant pool.  */
814static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
815{
816    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
817    /* Absolute for 32-bit, pc-relative for 64-bit.  */
818    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
819    tcg_out32(s, 0);
820}
821
822/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
823static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
824{
825    /* Propagate an opcode prefix, such as P_REXW.  */
826    int ext = subop & ~0x7;
827    subop &= 0x7;
828
829    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
830}
831
832static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
833{
834    int rexw = 0;
835
836    if (arg == ret) {
837        return true;
838    }
839    switch (type) {
840    case TCG_TYPE_I64:
841        rexw = P_REXW;
842        /* fallthru */
843    case TCG_TYPE_I32:
844        if (ret < 16) {
845            if (arg < 16) {
846                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
847            } else {
848                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
849            }
850        } else {
851            if (arg < 16) {
852                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
853            } else {
854                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
855            }
856        }
857        break;
858
859    case TCG_TYPE_V64:
860        tcg_debug_assert(ret >= 16 && arg >= 16);
861        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
862        break;
863    case TCG_TYPE_V128:
864        tcg_debug_assert(ret >= 16 && arg >= 16);
865        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
866        break;
867    case TCG_TYPE_V256:
868        tcg_debug_assert(ret >= 16 && arg >= 16);
869        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
870        break;
871
872    default:
873        g_assert_not_reached();
874    }
875    return true;
876}
877
/* AVX2 broadcast opcode for each element size, indexed by the element
   size code "vece" that the dup emitters receive: byte, word, dword,
   qword in that order.  */
static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};
882
883static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
884                            TCGReg r, TCGReg a)
885{
886    if (have_avx2) {
887        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
888        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
889    } else {
890        switch (vece) {
891        case MO_8:
892            /* ??? With zero in a register, use PSHUFB.  */
893            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
894            a = r;
895            /* FALLTHRU */
896        case MO_16:
897            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
898            a = r;
899            /* FALLTHRU */
900        case MO_32:
901            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
902            /* imm8 operand: all output lanes selected from input lane 0.  */
903            tcg_out8(s, 0);
904            break;
905        case MO_64:
906            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
907            break;
908        default:
909            g_assert_not_reached();
910        }
911    }
912    return true;
913}
914
915static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
916                             TCGReg r, TCGReg base, intptr_t offset)
917{
918    if (have_avx2) {
919        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
920        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
921                                 r, 0, base, offset);
922    } else {
923        switch (vece) {
924        case MO_64:
925            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
926            break;
927        case MO_32:
928            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
929            break;
930        case MO_16:
931            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
932            tcg_out8(s, 0); /* imm8 */
933            tcg_out_dup_vec(s, type, vece, r, r);
934            break;
935        case MO_8:
936            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
937            tcg_out8(s, 0); /* imm8 */
938            tcg_out_dup_vec(s, type, vece, r, r);
939            break;
940        default:
941            g_assert_not_reached();
942        }
943    }
944    return true;
945}
946
947static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
948                             TCGReg ret, tcg_target_long arg)
949{
950    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
951
952    if (arg == 0) {
953        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
954        return;
955    }
956    if (arg == -1) {
957        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
958        return;
959    }
960
961    if (TCG_TARGET_REG_BITS == 64) {
962        if (type == TCG_TYPE_V64) {
963            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
964        } else if (have_avx2) {
965            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
966        } else {
967            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
968        }
969        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
970    } else {
971        if (have_avx2) {
972            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTW + vex_l, ret);
973        } else {
974            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
975        }
976        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
977    }
978}
979
980static void tcg_out_movi(TCGContext *s, TCGType type,
981                         TCGReg ret, tcg_target_long arg)
982{
983    tcg_target_long diff;
984
985    switch (type) {
986    case TCG_TYPE_I32:
987#if TCG_TARGET_REG_BITS == 64
988    case TCG_TYPE_I64:
989#endif
990        if (ret < 16) {
991            break;
992        }
993        /* fallthru */
994    case TCG_TYPE_V64:
995    case TCG_TYPE_V128:
996    case TCG_TYPE_V256:
997        tcg_debug_assert(ret >= 16);
998        tcg_out_dupi_vec(s, type, ret, arg);
999        return;
1000    default:
1001        g_assert_not_reached();
1002    }
1003
1004    if (arg == 0) {
1005        tgen_arithr(s, ARITH_XOR, ret, ret);
1006        return;
1007    }
1008    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1009        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1010        tcg_out32(s, arg);
1011        return;
1012    }
1013    if (arg == (int32_t)arg) {
1014        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1015        tcg_out32(s, arg);
1016        return;
1017    }
1018
1019    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1020    diff = arg - ((uintptr_t)s->code_ptr + 7);
1021    if (diff == (int32_t)diff) {
1022        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1023        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1024        tcg_out32(s, diff);
1025        return;
1026    }
1027
1028    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1029    tcg_out64(s, arg);
1030}
1031
1032static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1033{
1034    if (val == (int8_t)val) {
1035        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1036        tcg_out8(s, val);
1037    } else if (val == (int32_t)val) {
1038        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1039        tcg_out32(s, val);
1040    } else {
1041        tcg_abort();
1042    }
1043}
1044
1045static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1046{
1047    /* Given the strength of x86 memory ordering, we only need care for
1048       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1049       faster than "mfence", so don't bother with the sse insn.  */
1050    if (a0 & TCG_MO_ST_LD) {
1051        tcg_out8(s, 0xf0);
1052        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1053        tcg_out8(s, 0);
1054    }
1055}
1056
1057static inline void tcg_out_push(TCGContext *s, int reg)
1058{
1059    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1060}
1061
1062static inline void tcg_out_pop(TCGContext *s, int reg)
1063{
1064    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1065}
1066
1067static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1068                       TCGReg arg1, intptr_t arg2)
1069{
1070    switch (type) {
1071    case TCG_TYPE_I32:
1072        if (ret < 16) {
1073            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1074        } else {
1075            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1076        }
1077        break;
1078    case TCG_TYPE_I64:
1079        if (ret < 16) {
1080            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1081            break;
1082        }
1083        /* FALLTHRU */
1084    case TCG_TYPE_V64:
1085        /* There is no instruction that can validate 8-byte alignment.  */
1086        tcg_debug_assert(ret >= 16);
1087        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1088        break;
1089    case TCG_TYPE_V128:
1090        /*
1091         * The gvec infrastructure is asserts that v128 vector loads
1092         * and stores use a 16-byte aligned offset.  Validate that the
1093         * final pointer is aligned by using an insn that will SIGSEGV.
1094         */
1095        tcg_debug_assert(ret >= 16);
1096        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1097        break;
1098    case TCG_TYPE_V256:
1099        /*
1100         * The gvec infrastructure only requires 16-byte alignment,
1101         * so here we must use an unaligned load.
1102         */
1103        tcg_debug_assert(ret >= 16);
1104        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1105                                 ret, 0, arg1, arg2);
1106        break;
1107    default:
1108        g_assert_not_reached();
1109    }
1110}
1111
1112static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1113                       TCGReg arg1, intptr_t arg2)
1114{
1115    switch (type) {
1116    case TCG_TYPE_I32:
1117        if (arg < 16) {
1118            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1119        } else {
1120            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1121        }
1122        break;
1123    case TCG_TYPE_I64:
1124        if (arg < 16) {
1125            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1126            break;
1127        }
1128        /* FALLTHRU */
1129    case TCG_TYPE_V64:
1130        /* There is no instruction that can validate 8-byte alignment.  */
1131        tcg_debug_assert(arg >= 16);
1132        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1133        break;
1134    case TCG_TYPE_V128:
1135        /*
1136         * The gvec infrastructure is asserts that v128 vector loads
1137         * and stores use a 16-byte aligned offset.  Validate that the
1138         * final pointer is aligned by using an insn that will SIGSEGV.
1139         */
1140        tcg_debug_assert(arg >= 16);
1141        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1142        break;
1143    case TCG_TYPE_V256:
1144        /*
1145         * The gvec infrastructure only requires 16-byte alignment,
1146         * so here we must use an unaligned store.
1147         */
1148        tcg_debug_assert(arg >= 16);
1149        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1150                                 arg, 0, arg1, arg2);
1151        break;
1152    default:
1153        g_assert_not_reached();
1154    }
1155}
1156
1157static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1158                        TCGReg base, intptr_t ofs)
1159{
1160    int rexw = 0;
1161    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1162        if (val != (int32_t)val) {
1163            return false;
1164        }
1165        rexw = P_REXW;
1166    } else if (type != TCG_TYPE_I32) {
1167        return false;
1168    }
1169    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1170    tcg_out32(s, val);
1171    return true;
1172}
1173
1174static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1175{
1176    /* Propagate an opcode prefix, such as P_DATA16.  */
1177    int ext = subopc & ~0x7;
1178    subopc &= 0x7;
1179
1180    if (count == 1) {
1181        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1182    } else {
1183        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1184        tcg_out8(s, count);
1185    }
1186}
1187
1188static inline void tcg_out_bswap32(TCGContext *s, int reg)
1189{
1190    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1191}
1192
1193static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1194{
1195    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1196}
1197
1198static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1199{
1200    /* movzbl */
1201    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1202    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1203}
1204
1205static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1206{
1207    /* movsbl */
1208    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1209    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1210}
1211
1212static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1213{
1214    /* movzwl */
1215    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1216}
1217
1218static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1219{
1220    /* movsw[lq] */
1221    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1222}
1223
1224static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1225{
1226    /* 32-bit mov zero extends.  */
1227    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1228}
1229
1230static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1231{
1232    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1233}
1234
1235static inline void tcg_out_bswap64(TCGContext *s, int reg)
1236{
1237    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1238}
1239
1240static void tgen_arithi(TCGContext *s, int c, int r0,
1241                        tcg_target_long val, int cf)
1242{
1243    int rexw = 0;
1244
1245    if (TCG_TARGET_REG_BITS == 64) {
1246        rexw = c & -8;
1247        c &= 7;
1248    }
1249
1250    /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1251       partial flags update stalls on Pentium4 and are not recommended
1252       by current Intel optimization manuals.  */
1253    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1254        int is_inc = (c == ARITH_ADD) ^ (val < 0);
1255        if (TCG_TARGET_REG_BITS == 64) {
1256            /* The single-byte increment encodings are re-tasked as the
1257               REX prefixes.  Use the MODRM encoding.  */
1258            tcg_out_modrm(s, OPC_GRP5 + rexw,
1259                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1260        } else {
1261            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1262        }
1263        return;
1264    }
1265
1266    if (c == ARITH_AND) {
1267        if (TCG_TARGET_REG_BITS == 64) {
1268            if (val == 0xffffffffu) {
1269                tcg_out_ext32u(s, r0, r0);
1270                return;
1271            }
1272            if (val == (uint32_t)val) {
1273                /* AND with no high bits set can use a 32-bit operation.  */
1274                rexw = 0;
1275            }
1276        }
1277        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1278            tcg_out_ext8u(s, r0, r0);
1279            return;
1280        }
1281        if (val == 0xffffu) {
1282            tcg_out_ext16u(s, r0, r0);
1283            return;
1284        }
1285    }
1286
1287    if (val == (int8_t)val) {
1288        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1289        tcg_out8(s, val);
1290        return;
1291    }
1292    if (rexw == 0 || val == (int32_t)val) {
1293        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1294        tcg_out32(s, val);
1295        return;
1296    }
1297
1298    tcg_abort();
1299}
1300
1301static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1302{
1303    if (val != 0) {
1304        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1305    }
1306}
1307
1308/* Use SMALL != 0 to force a short forward branch.  */
1309static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1310{
1311    int32_t val, val1;
1312
1313    if (l->has_value) {
1314        val = tcg_pcrel_diff(s, l->u.value_ptr);
1315        val1 = val - 2;
1316        if ((int8_t)val1 == val1) {
1317            if (opc == -1) {
1318                tcg_out8(s, OPC_JMP_short);
1319            } else {
1320                tcg_out8(s, OPC_JCC_short + opc);
1321            }
1322            tcg_out8(s, val1);
1323        } else {
1324            if (small) {
1325                tcg_abort();
1326            }
1327            if (opc == -1) {
1328                tcg_out8(s, OPC_JMP_long);
1329                tcg_out32(s, val - 5);
1330            } else {
1331                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1332                tcg_out32(s, val - 6);
1333            }
1334        }
1335    } else if (small) {
1336        if (opc == -1) {
1337            tcg_out8(s, OPC_JMP_short);
1338        } else {
1339            tcg_out8(s, OPC_JCC_short + opc);
1340        }
1341        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1342        s->code_ptr += 1;
1343    } else {
1344        if (opc == -1) {
1345            tcg_out8(s, OPC_JMP_long);
1346        } else {
1347            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1348        }
1349        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1350        s->code_ptr += 4;
1351    }
1352}
1353
1354static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1355                        int const_arg2, int rexw)
1356{
1357    if (const_arg2) {
1358        if (arg2 == 0) {
1359            /* test r, r */
1360            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1361        } else {
1362            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1363        }
1364    } else {
1365        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1366    }
1367}
1368
1369static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1370                             TCGArg arg1, TCGArg arg2, int const_arg2,
1371                             TCGLabel *label, int small)
1372{
1373    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1374    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1375}
1376
1377#if TCG_TARGET_REG_BITS == 64
1378static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1379                             TCGArg arg1, TCGArg arg2, int const_arg2,
1380                             TCGLabel *label, int small)
1381{
1382    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1383    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1384}
1385#else
1386/* XXX: we implement it at the target level to avoid having to
1387   handle cross basic blocks temporaries */
1388static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1389                            const int *const_args, int small)
1390{
1391    TCGLabel *label_next = gen_new_label();
1392    TCGLabel *label_this = arg_label(args[5]);
1393
1394    switch(args[4]) {
1395    case TCG_COND_EQ:
1396        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1397                         label_next, 1);
1398        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1399                         label_this, small);
1400        break;
1401    case TCG_COND_NE:
1402        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1403                         label_this, small);
1404        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1405                         label_this, small);
1406        break;
1407    case TCG_COND_LT:
1408        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1409                         label_this, small);
1410        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1411        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1412                         label_this, small);
1413        break;
1414    case TCG_COND_LE:
1415        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1416                         label_this, small);
1417        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1418        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1419                         label_this, small);
1420        break;
1421    case TCG_COND_GT:
1422        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1423                         label_this, small);
1424        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1425        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1426                         label_this, small);
1427        break;
1428    case TCG_COND_GE:
1429        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1430                         label_this, small);
1431        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1432        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1433                         label_this, small);
1434        break;
1435    case TCG_COND_LTU:
1436        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1437                         label_this, small);
1438        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1439        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1440                         label_this, small);
1441        break;
1442    case TCG_COND_LEU:
1443        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1444                         label_this, small);
1445        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1446        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1447                         label_this, small);
1448        break;
1449    case TCG_COND_GTU:
1450        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1451                         label_this, small);
1452        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1453        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1454                         label_this, small);
1455        break;
1456    case TCG_COND_GEU:
1457        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1458                         label_this, small);
1459        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1460        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1461                         label_this, small);
1462        break;
1463    default:
1464        tcg_abort();
1465    }
1466    tcg_out_label(s, label_next, s->code_ptr);
1467}
1468#endif
1469
1470static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1471                              TCGArg arg1, TCGArg arg2, int const_arg2)
1472{
1473    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1474    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1475    tcg_out_ext8u(s, dest, dest);
1476}
1477
1478#if TCG_TARGET_REG_BITS == 64
1479static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1480                              TCGArg arg1, TCGArg arg2, int const_arg2)
1481{
1482    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1483    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1484    tcg_out_ext8u(s, dest, dest);
1485}
1486#else
1487static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1488                             const int *const_args)
1489{
1490    TCGArg new_args[6];
1491    TCGLabel *label_true, *label_over;
1492
1493    memcpy(new_args, args+1, 5*sizeof(TCGArg));
1494
1495    if (args[0] == args[1] || args[0] == args[2]
1496        || (!const_args[3] && args[0] == args[3])
1497        || (!const_args[4] && args[0] == args[4])) {
1498        /* When the destination overlaps with one of the argument
1499           registers, don't do anything tricky.  */
1500        label_true = gen_new_label();
1501        label_over = gen_new_label();
1502
1503        new_args[5] = label_arg(label_true);
1504        tcg_out_brcond2(s, new_args, const_args+1, 1);
1505
1506        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1507        tcg_out_jxx(s, JCC_JMP, label_over, 1);
1508        tcg_out_label(s, label_true, s->code_ptr);
1509
1510        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1511        tcg_out_label(s, label_over, s->code_ptr);
1512    } else {
1513        /* When the destination does not overlap one of the arguments,
1514           clear the destination first, jump if cond false, and emit an
1515           increment in the true case.  This results in smaller code.  */
1516
1517        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1518
1519        label_over = gen_new_label();
1520        new_args[4] = tcg_invert_cond(new_args[4]);
1521        new_args[5] = label_arg(label_over);
1522        tcg_out_brcond2(s, new_args, const_args+1, 1);
1523
1524        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1525        tcg_out_label(s, label_over, s->code_ptr);
1526    }
1527}
1528#endif
1529
1530static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1531                         TCGReg dest, TCGReg v1)
1532{
1533    if (have_cmov) {
1534        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1535    } else {
1536        TCGLabel *over = gen_new_label();
1537        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1538        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1539        tcg_out_label(s, over, s->code_ptr);
1540    }
1541}
1542
1543static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1544                              TCGReg c1, TCGArg c2, int const_c2,
1545                              TCGReg v1)
1546{
1547    tcg_out_cmp(s, c1, c2, const_c2, 0);
1548    tcg_out_cmov(s, cond, 0, dest, v1);
1549}
1550
1551#if TCG_TARGET_REG_BITS == 64
1552static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1553                              TCGReg c1, TCGArg c2, int const_c2,
1554                              TCGReg v1)
1555{
1556    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1557    tcg_out_cmov(s, cond, P_REXW, dest, v1);
1558}
1559#endif
1560
1561static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1562                        TCGArg arg2, bool const_a2)
1563{
1564    if (have_bmi1) {
1565        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1566        if (const_a2) {
1567            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1568        } else {
1569            tcg_debug_assert(dest != arg2);
1570            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1571        }
1572    } else {
1573        tcg_debug_assert(dest != arg2);
1574        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1575        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1576    }
1577}
1578
1579static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1580                        TCGArg arg2, bool const_a2)
1581{
1582    if (have_lzcnt) {
1583        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1584        if (const_a2) {
1585            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1586        } else {
1587            tcg_debug_assert(dest != arg2);
1588            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1589        }
1590    } else {
1591        tcg_debug_assert(!const_a2);
1592        tcg_debug_assert(dest != arg1);
1593        tcg_debug_assert(dest != arg2);
1594
1595        /* Recall that the output of BSR is the index not the count.  */
1596        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1597        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1598
1599        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1600        tcg_out_cmp(s, arg1, 0, 1, rexw);
1601        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1602    }
1603}
1604
1605static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
1606{
1607    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1608
1609    if (disp == (int32_t)disp) {
1610        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1611        tcg_out32(s, disp);
1612    } else {
1613        /* rip-relative addressing into the constant pool.
1614           This is 6 + 8 = 14 bytes, as compared to using an
1615           an immediate load 10 + 6 = 16 bytes, plus we may
1616           be able to re-use the pool constant for more calls.  */
1617        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1618        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1619        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1620        tcg_out32(s, 0);
1621    }
1622}
1623
1624static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
1625{
1626    tcg_out_branch(s, 1, dest);
1627}
1628
1629static void tcg_out_jmp(TCGContext *s, tcg_insn_unit *dest)
1630{
1631    tcg_out_branch(s, 0, dest);
1632}
1633
1634static void tcg_out_nopn(TCGContext *s, int n)
1635{
1636    int i;
1637    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1638     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1639     * duplicate prefix, and all of the interesting recent cores can
1640     * decode and discard the duplicates in a single cycle.
1641     */
1642    tcg_debug_assert(n >= 1);
1643    for (i = 1; i < n; ++i) {
1644        tcg_out8(s, 0x66);
1645    }
1646    tcg_out8(s, 0x90);
1647}
1648
1649#if defined(CONFIG_SOFTMMU)
1650#include "../tcg-ldst.c.inc"
1651
1652/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1653 *                                     int mmu_idx, uintptr_t ra)
1654 */
1655static void * const qemu_ld_helpers[16] = {
1656    [MO_UB]   = helper_ret_ldub_mmu,
1657    [MO_LEUW] = helper_le_lduw_mmu,
1658    [MO_LEUL] = helper_le_ldul_mmu,
1659    [MO_LEQ]  = helper_le_ldq_mmu,
1660    [MO_BEUW] = helper_be_lduw_mmu,
1661    [MO_BEUL] = helper_be_ldul_mmu,
1662    [MO_BEQ]  = helper_be_ldq_mmu,
1663};
1664
1665/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1666 *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
1667 */
1668static void * const qemu_st_helpers[16] = {
1669    [MO_UB]   = helper_ret_stb_mmu,
1670    [MO_LEUW] = helper_le_stw_mmu,
1671    [MO_LEUL] = helper_le_stl_mmu,
1672    [MO_LEQ]  = helper_le_stq_mmu,
1673    [MO_BEUW] = helper_be_stw_mmu,
1674    [MO_BEUL] = helper_be_stl_mmu,
1675    [MO_BEQ]  = helper_be_stq_mmu,
1676};
1677
1678/* Perform the TLB load and compare.
1679
1680   Inputs:
1681   ADDRLO and ADDRHI contain the low and high part of the address.
1682
1683   MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1684
1685   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1686   This should be offsetof addr_read or addr_write.
1687
1688   Outputs:
1689   LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1690   positions of the displacements of forward jumps to the TLB miss case.
1691
1692   Second argument register is loaded with the low part of the address.
1693   In the TLB hit case, it has been adjusted as indicated by the TLB
1694   and so is a host address.  In the TLB miss case, it continues to
1695   hold a guest address.
1696
1697   First argument register is clobbered.  */
1698
1699static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1700                                    int mem_index, MemOp opc,
1701                                    tcg_insn_unit **label_ptr, int which)
1702{
1703    const TCGReg r0 = TCG_REG_L0;
1704    const TCGReg r1 = TCG_REG_L1;
1705    TCGType ttype = TCG_TYPE_I32;
1706    TCGType tlbtype = TCG_TYPE_I32;
1707    int trexw = 0, hrexw = 0, tlbrexw = 0;
1708    unsigned a_bits = get_alignment_bits(opc);
1709    unsigned s_bits = opc & MO_SIZE;
1710    unsigned a_mask = (1 << a_bits) - 1;
1711    unsigned s_mask = (1 << s_bits) - 1;
1712    target_ulong tlb_mask;
1713
1714    if (TCG_TARGET_REG_BITS == 64) {
1715        if (TARGET_LONG_BITS == 64) {
1716            ttype = TCG_TYPE_I64;
1717            trexw = P_REXW;
1718        }
1719        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1720            hrexw = P_REXW;
1721            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
1722                tlbtype = TCG_TYPE_I64;
1723                tlbrexw = P_REXW;
1724            }
1725        }
1726    }
1727
1728    tcg_out_mov(s, tlbtype, r0, addrlo);
1729    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1730                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1731
1732    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
1733                         TLB_MASK_TABLE_OFS(mem_index) +
1734                         offsetof(CPUTLBDescFast, mask));
1735
1736    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
1737                         TLB_MASK_TABLE_OFS(mem_index) +
1738                         offsetof(CPUTLBDescFast, table));
1739
1740    /* If the required alignment is at least as large as the access, simply
1741       copy the address and mask.  For lesser alignments, check that we don't
1742       cross pages for the complete access.  */
1743    if (a_bits >= s_bits) {
1744        tcg_out_mov(s, ttype, r1, addrlo);
1745    } else {
1746        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1747    }
1748    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1749    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1750
1751    /* cmp 0(r0), r1 */
1752    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
1753
1754    /* Prepare for both the fast path add of the tlb addend, and the slow
1755       path function argument setup.  */
1756    tcg_out_mov(s, ttype, r1, addrlo);
1757
1758    /* jne slow_path */
1759    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1760    label_ptr[0] = s->code_ptr;
1761    s->code_ptr += 4;
1762
1763    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1764        /* cmp 4(r0), addrhi */
1765        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
1766
1767        /* jne slow_path */
1768        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1769        label_ptr[1] = s->code_ptr;
1770        s->code_ptr += 4;
1771    }
1772
1773    /* TLB Hit.  */
1774
1775    /* add addend(r0), r1 */
1776    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1777                         offsetof(CPUTLBEntry, addend));
1778}
1779
1780/*
1781 * Record the context of a call to the out of line helper code for the slow path
1782 * for a load or store, so that we can later generate the correct helper code
1783 */
1784static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1785                                TCGMemOpIdx oi,
1786                                TCGReg datalo, TCGReg datahi,
1787                                TCGReg addrlo, TCGReg addrhi,
1788                                tcg_insn_unit *raddr,
1789                                tcg_insn_unit **label_ptr)
1790{
1791    TCGLabelQemuLdst *label = new_ldst_label(s);
1792
1793    label->is_ld = is_ld;
1794    label->oi = oi;
1795    label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1796    label->datalo_reg = datalo;
1797    label->datahi_reg = datahi;
1798    label->addrlo_reg = addrlo;
1799    label->addrhi_reg = addrhi;
1800    label->raddr = raddr;
1801    label->label_ptr[0] = label_ptr[0];
1802    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1803        label->label_ptr[1] = label_ptr[1];
1804    }
1805}
1806
1807/*
1808 * Generate code for the slow path for a load at the end of block
1809 */
1810static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1811{
1812    TCGMemOpIdx oi = l->oi;
1813    MemOp opc = get_memop(oi);
1814    TCGReg data_reg;
1815    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1816    int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1817
1818    /* resolve label address */
1819    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1820    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1821        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1822    }
1823
1824    if (TCG_TARGET_REG_BITS == 32) {
1825        int ofs = 0;
1826
1827        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1828        ofs += 4;
1829
1830        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1831        ofs += 4;
1832
1833        if (TARGET_LONG_BITS == 64) {
1834            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1835            ofs += 4;
1836        }
1837
1838        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1839        ofs += 4;
1840
1841        tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1842    } else {
1843        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1844        /* The second argument is already loaded with addrlo.  */
1845        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1846        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1847                     (uintptr_t)l->raddr);
1848    }
1849
1850    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1851
1852    data_reg = l->datalo_reg;
1853    switch (opc & MO_SSIZE) {
1854    case MO_SB:
1855        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1856        break;
1857    case MO_SW:
1858        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1859        break;
1860#if TCG_TARGET_REG_BITS == 64
1861    case MO_SL:
1862        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1863        break;
1864#endif
1865    case MO_UB:
1866    case MO_UW:
1867        /* Note that the helpers have zero-extended to tcg_target_long.  */
1868    case MO_UL:
1869        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1870        break;
1871    case MO_Q:
1872        if (TCG_TARGET_REG_BITS == 64) {
1873            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1874        } else if (data_reg == TCG_REG_EDX) {
1875            /* xchg %edx, %eax */
1876            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1877            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1878        } else {
1879            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1880            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1881        }
1882        break;
1883    default:
1884        tcg_abort();
1885    }
1886
1887    /* Jump to the code corresponding to next IR of qemu_st */
1888    tcg_out_jmp(s, l->raddr);
1889    return true;
1890}
1891
1892/*
1893 * Generate code for the slow path for a store at the end of block
1894 */
1895static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1896{
1897    TCGMemOpIdx oi = l->oi;
1898    MemOp opc = get_memop(oi);
1899    MemOp s_bits = opc & MO_SIZE;
1900    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1901    TCGReg retaddr;
1902
1903    /* resolve label address */
1904    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1905    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1906        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1907    }
1908
1909    if (TCG_TARGET_REG_BITS == 32) {
1910        int ofs = 0;
1911
1912        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1913        ofs += 4;
1914
1915        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1916        ofs += 4;
1917
1918        if (TARGET_LONG_BITS == 64) {
1919            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1920            ofs += 4;
1921        }
1922
1923        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1924        ofs += 4;
1925
1926        if (s_bits == MO_64) {
1927            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1928            ofs += 4;
1929        }
1930
1931        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1932        ofs += 4;
1933
1934        retaddr = TCG_REG_EAX;
1935        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1936        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1937    } else {
1938        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1939        /* The second argument is already loaded with addrlo.  */
1940        tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1941                    tcg_target_call_iarg_regs[2], l->datalo_reg);
1942        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1943
1944        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1945            retaddr = tcg_target_call_iarg_regs[4];
1946            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1947        } else {
1948            retaddr = TCG_REG_RAX;
1949            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1950            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1951                       TCG_TARGET_CALL_STACK_OFFSET);
1952        }
1953    }
1954
1955    /* "Tail call" to the helper, with the return address back inline.  */
1956    tcg_out_push(s, retaddr);
1957    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1958    return true;
1959}
1960#elif TCG_TARGET_REG_BITS == 32
/*
 * The three x86_guest_base_* names are handed to tcg_out_qemu_ld_direct()
 * and tcg_out_qemu_st_direct() as their seg, index and ofs arguments when
 * CONFIG_SOFTMMU is not defined.
 *
 * 32-bit host: compile-time constants -- seg 0, index -1, and guest_base
 * itself as the displacement.
 */
1961# define x86_guest_base_seg     0
1962# define x86_guest_base_index   -1
1963# define x86_guest_base_offset  guest_base
1964#else
/*
 * Other hosts: file-scope variables; index starts out as -1.
 * NOTE(review): the code that assigns these is not visible in this part of
 * the file -- presumably they are set once during backend setup; confirm.
 */
1965static int x86_guest_base_seg;
1966static int x86_guest_base_index = -1;
1967static int32_t x86_guest_base_offset;
1968# if defined(__x86_64__) && defined(__linux__)
1969#  include <asm/prctl.h>
1970#  include <sys/prctl.h>
1971int arch_prctl(int code, unsigned long addr);
1972static inline int setup_guest_base_seg(void)
1973{
1974    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1975        return P_GS;
1976    }
1977    return 0;
1978}
1979# elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
1980#  include <machine/sysarch.h>
1981static inline int setup_guest_base_seg(void)
1982{
1983    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1984        return P_GS;
1985    }
1986    return 0;
1987}
1988# else
1989static inline int setup_guest_base_seg(void)
1990{
1991    return 0;
1992}
1993# endif
1994#endif /* SOFTMMU */
1995
1996static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1997                                   TCGReg base, int index, intptr_t ofs,
1998                                   int seg, bool is64, MemOp memop)
1999{
2000    const MemOp real_bswap = memop & MO_BSWAP;
2001    MemOp bswap = real_bswap;
2002    int rexw = is64 * P_REXW;
2003    int movop = OPC_MOVL_GvEv;
2004
2005    if (have_movbe && real_bswap) {
2006        bswap = 0;
2007        movop = OPC_MOVBE_GyMy;
2008    }
2009
2010    switch (memop & MO_SSIZE) {
2011    case MO_UB:
2012        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
2013                                 base, index, 0, ofs);
2014        break;
2015    case MO_SB:
2016        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
2017                                 base, index, 0, ofs);
2018        break;
2019    case MO_UW:
2020        tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
2021                                 base, index, 0, ofs);
2022        if (real_bswap) {
2023            tcg_out_rolw_8(s, datalo);
2024        }
2025        break;
2026    case MO_SW:
2027        if (real_bswap) {
2028            if (have_movbe) {
2029                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2030                                         datalo, base, index, 0, ofs);
2031            } else {
2032                tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
2033                                         base, index, 0, ofs);
2034                tcg_out_rolw_8(s, datalo);
2035            }
2036            tcg_out_modrm(s, OPC_MOVSWL + rexw, datalo, datalo);
2037        } else {
2038            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
2039                                     datalo, base, index, 0, ofs);
2040        }
2041        break;
2042    case MO_UL:
2043        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2044        if (bswap) {
2045            tcg_out_bswap32(s, datalo);
2046        }
2047        break;
2048#if TCG_TARGET_REG_BITS == 64
2049    case MO_SL:
2050        if (real_bswap) {
2051            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2052                                     base, index, 0, ofs);
2053            if (bswap) {
2054                tcg_out_bswap32(s, datalo);
2055            }
2056            tcg_out_ext32s(s, datalo, datalo);
2057        } else {
2058            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
2059                                     base, index, 0, ofs);
2060        }
2061        break;
2062#endif
2063    case MO_Q:
2064        if (TCG_TARGET_REG_BITS == 64) {
2065            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2066                                     base, index, 0, ofs);
2067            if (bswap) {
2068                tcg_out_bswap64(s, datalo);
2069            }
2070        } else {
2071            if (real_bswap) {
2072                int t = datalo;
2073                datalo = datahi;
2074                datahi = t;
2075            }
2076            if (base != datalo) {
2077                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2078                                         base, index, 0, ofs);
2079                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2080                                         base, index, 0, ofs + 4);
2081            } else {
2082                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2083                                         base, index, 0, ofs + 4);
2084                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2085                                         base, index, 0, ofs);
2086            }
2087            if (bswap) {
2088                tcg_out_bswap32(s, datalo);
2089                tcg_out_bswap32(s, datahi);
2090            }
2091        }
2092        break;
2093    default:
2094        tcg_abort();
2095    }
2096}
2097
2098/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2099   EAX. It will be useful once fixed registers globals are less
2100   common. */
2101static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2102{
2103    TCGReg datalo, datahi, addrlo;
2104    TCGReg addrhi __attribute__((unused));
2105    TCGMemOpIdx oi;
2106    MemOp opc;
2107#if defined(CONFIG_SOFTMMU)
2108    int mem_index;
2109    tcg_insn_unit *label_ptr[2];
2110#endif
2111
2112    datalo = *args++;
2113    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2114    addrlo = *args++;
2115    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2116    oi = *args++;
2117    opc = get_memop(oi);
2118
2119#if defined(CONFIG_SOFTMMU)
2120    mem_index = get_mmuidx(oi);
2121
2122    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2123                     label_ptr, offsetof(CPUTLBEntry, addr_read));
2124
2125    /* TLB Hit.  */
2126    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2127
2128    /* Record the current context of a load into ldst label */
2129    add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2130                        s->code_ptr, label_ptr);
2131#else
2132    tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2133                           x86_guest_base_offset, x86_guest_base_seg,
2134                           is64, opc);
2135#endif
2136}
2137
2138static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2139                                   TCGReg base, int index, intptr_t ofs,
2140                                   int seg, MemOp memop)
2141{
2142    /* ??? Ideally we wouldn't need a scratch register.  For user-only,
2143       we could perform the bswap twice to restore the original value
2144       instead of moving to the scratch.  But as it is, the L constraint
2145       means that TCG_REG_L0 is definitely free here.  */
2146    const TCGReg scratch = TCG_REG_L0;
2147    const MemOp real_bswap = memop & MO_BSWAP;
2148    MemOp bswap = real_bswap;
2149    int movop = OPC_MOVL_EvGv;
2150
2151    if (have_movbe && real_bswap) {
2152        bswap = 0;
2153        movop = OPC_MOVBE_MyGy;
2154    }
2155
2156    switch (memop & MO_SIZE) {
2157    case MO_8:
2158        /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
2159           Use the scratch register if necessary.  */
2160        if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
2161            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2162            datalo = scratch;
2163        }
2164        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2165                                 datalo, base, index, 0, ofs);
2166        break;
2167    case MO_16:
2168        if (bswap) {
2169            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2170            tcg_out_rolw_8(s, scratch);
2171            datalo = scratch;
2172        }
2173        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2174                                 base, index, 0, ofs);
2175        break;
2176    case MO_32:
2177        if (bswap) {
2178            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2179            tcg_out_bswap32(s, scratch);
2180            datalo = scratch;
2181        }
2182        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2183        break;
2184    case MO_64:
2185        if (TCG_TARGET_REG_BITS == 64) {
2186            if (bswap) {
2187                tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
2188                tcg_out_bswap64(s, scratch);
2189                datalo = scratch;
2190            }
2191            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2192                                     base, index, 0, ofs);
2193        } else if (bswap) {
2194            tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
2195            tcg_out_bswap32(s, scratch);
2196            tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
2197                                     base, index, 0, ofs);
2198            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2199            tcg_out_bswap32(s, scratch);
2200            tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
2201                                     base, index, 0, ofs + 4);
2202        } else {
2203            if (real_bswap) {
2204                int t = datalo;
2205                datalo = datahi;
2206                datahi = t;
2207            }
2208            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2209                                     base, index, 0, ofs);
2210            tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2211                                     base, index, 0, ofs + 4);
2212        }
2213        break;
2214    default:
2215        tcg_abort();
2216    }
2217}
2218
2219static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2220{
2221    TCGReg datalo, datahi, addrlo;
2222    TCGReg addrhi __attribute__((unused));
2223    TCGMemOpIdx oi;
2224    MemOp opc;
2225#if defined(CONFIG_SOFTMMU)
2226    int mem_index;
2227    tcg_insn_unit *label_ptr[2];
2228#endif
2229
2230    datalo = *args++;
2231    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2232    addrlo = *args++;
2233    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2234    oi = *args++;
2235    opc = get_memop(oi);
2236
2237#if defined(CONFIG_SOFTMMU)
2238    mem_index = get_mmuidx(oi);
2239
2240    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2241                     label_ptr, offsetof(CPUTLBEntry, addr_write));
2242
2243    /* TLB Hit.  */
2244    tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2245
2246    /* Record the current context of a store into ldst label */
2247    add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2248                        s->code_ptr, label_ptr);
2249#else
2250    tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2251                           x86_guest_base_offset, x86_guest_base_seg, opc);
2252#endif
2253}
2254
/*
 * Emit host code for a single integer (non-vector) TCG opcode.
 *
 * @s:          code generation context the instruction bytes are appended to
 * @opc:        opcode to translate; selects one of the cases below
 * @args:       operand array for the opcode (registers, constants, offsets,
 *              labels), interpreted per opcode
 * @const_args: parallel array; a non-zero entry means the matching args[]
 *              slot holds a constant rather than a register number
 *
 * Opcodes that are always emitted through tcg_out_mov, tcg_out_movi or
 * tcg_out_call, and any opcode without a case here, end in tcg_abort().
 */
2255static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2256                              const TCGArg *args, const int *const_args)
2257{
2258    TCGArg a0, a1, a2;
2259    int c, const_a2, vexop, rexw = 0;
2260
    /*
     * OP_32_64(x) expands to the case label(s) for opcode x.  On a 64-bit
     * host both the _i64 and the _i32 label are produced; the _i64 label
     * sets rexw = P_REXW and falls through, so a single body serves both
     * widths by adding rexw to the opcode it emits.  On a 32-bit host only
     * the _i32 label exists and rexw stays 0.
     */
2261#if TCG_TARGET_REG_BITS == 64
2262# define OP_32_64(x) \
2263        case glue(glue(INDEX_op_, x), _i64): \
2264            rexw = P_REXW; /* FALLTHRU */    \
2265        case glue(glue(INDEX_op_, x), _i32)
2266#else
2267# define OP_32_64(x) \
2268        case glue(glue(INDEX_op_, x), _i32)
2269#endif
2270
2271    /* Hoist the loads of the most common arguments.  */
2272    a0 = args[0];
2273    a1 = args[1];
2274    a2 = args[2];
2275    const_a2 = const_args[2];
2276
2277    switch (opc) {
2278    case INDEX_op_exit_tb:
2279        /* Reuse the zeroing that exists for goto_ptr.  */
2280        if (a0 == 0) {
2281            tcg_out_jmp(s, s->code_gen_epilogue);
2282        } else {
2283            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2284            tcg_out_jmp(s, tb_ret_addr);
2285        }
2286        break;
2287    case INDEX_op_goto_tb:
2288        if (s->tb_jmp_insn_offset) {
2289            /* direct jump method */
2290            int gap;
2291            /* jump displacement must be aligned for atomic patching;
2292             * see if we need to add extra nops before jump
2293             */
2294            gap = tcg_pcrel_diff(s, QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4));
2295            if (gap != 1) {
2296                tcg_out_nopn(s, gap - 1);
2297            }
2298            tcg_out8(s, OPC_JMP_long); /* jmp im */
            /* Record where the 32-bit displacement lives, then emit a
               zero placeholder for it.  */
2299            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2300            tcg_out32(s, 0);
2301        } else {
2302            /* indirect jump method */
2303            tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2304                                 (intptr_t)(s->tb_jmp_target_addr + a0));
2305        }
2306        set_jmp_reset_offset(s, a0);
2307        break;
2308    case INDEX_op_goto_ptr:
2309        /* jmp to the given host address (could be epilogue) */
2310        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2311        break;
2312    case INDEX_op_br:
2313        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2314        break;
2315    OP_32_64(ld8u):
2316        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2317        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2318        break;
2319    OP_32_64(ld8s):
2320        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2321        break;
2322    OP_32_64(ld16u):
2323        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2324        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2325        break;
2326    OP_32_64(ld16s):
2327        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2328        break;
2329#if TCG_TARGET_REG_BITS == 64
2330    case INDEX_op_ld32u_i64:
2331#endif
2332    case INDEX_op_ld_i32:
2333        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2334        break;
2335
    /* Stores: a0 is the value (register or, if const_args[0], an
       immediate), a1 the base register and a2 the offset.  */
2336    OP_32_64(st8):
2337        if (const_args[0]) {
2338            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2339            tcg_out8(s, a0);
2340        } else {
2341            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2342        }
2343        break;
2344    OP_32_64(st16):
2345        if (const_args[0]) {
2346            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2347            tcg_out16(s, a0);
2348        } else {
2349            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2350        }
2351        break;
2352#if TCG_TARGET_REG_BITS == 64
2353    case INDEX_op_st32_i64:
2354#endif
2355    case INDEX_op_st_i32:
2356        if (const_args[0]) {
2357            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2358            tcg_out32(s, a0);
2359        } else {
2360            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2361        }
2362        break;
2363
2364    OP_32_64(add):
2365        /* For 3-operand addition, use LEA.  */
2366        if (a0 != a1) {
2367            TCGArg c3 = 0;
2368            if (const_a2) {
                /* Constant addend becomes the LEA displacement; -1 means
                   no index register.  */
2369                c3 = a2, a2 = -1;
2370            } else if (a0 == a2) {
2371                /* Watch out for dest = src + dest, since we've removed
2372                   the matching constraint on the add.  */
2373                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2374                break;
2375            }
2376
2377            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2378            break;
2379        }
2380        c = ARITH_ADD;
2381        goto gen_arith;
2382    OP_32_64(sub):
2383        c = ARITH_SUB;
2384        goto gen_arith;
2385    OP_32_64(and):
2386        c = ARITH_AND;
2387        goto gen_arith;
2388    OP_32_64(or):
2389        c = ARITH_OR;
2390        goto gen_arith;
2391    OP_32_64(xor):
2392        c = ARITH_XOR;
2393        goto gen_arith;
        /*
         * Shared tail for add/sub/and/or/xor, with the ARITH_* selector in
         * c.  Only a0 and a2 are encoded; a1 is not referenced.  For add
         * this point is reached only when a0 == a1; for the other opcodes
         * this presumably relies on the operand constraints making a1 the
         * same register as a0 -- confirm.
         */
2394    gen_arith:
2395        if (const_a2) {
2396            tgen_arithi(s, c + rexw, a0, a2, 0);
2397        } else {
2398            tgen_arithr(s, c + rexw, a0, a2);
2399        }
2400        break;
2401
2402    OP_32_64(andc):
2403        if (const_a2) {
            /* a0 = a1 & ~constant, done as a copy plus AND immediate.  */
2404            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2405            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2406        } else {
2407            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2408        }
2409        break;
2410
2411    OP_32_64(mul):
2412        if (const_a2) {
2413            int32_t val;
2414            val = a2;
            /* Use the short imm8 form when the constant fits in a signed
               byte, the imm32 form otherwise.  */
2415            if (val == (int8_t)val) {
2416                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2417                tcg_out8(s, val);
2418            } else {
2419                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2420                tcg_out32(s, val);
2421            }
2422        } else {
2423            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2424        }
2425        break;
2426
    /* Only args[4] is encoded for the divisions; the remaining operands
       are presumably pinned to fixed registers by the operand
       constraints -- confirm.  */
2427    OP_32_64(div2):
2428        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2429        break;
2430    OP_32_64(divu2):
2431        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2432        break;
2433
2434    OP_32_64(shl):
2435        /* For small constant 3-operand shift, use LEA.  */
2436        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2437            if (a2 - 1 == 0) {
2438                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2439                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2440            } else {
2441                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2442                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2443            }
2444            break;
2445        }
2446        c = SHIFT_SHL;
2447        vexop = OPC_SHLX;
2448        goto gen_shift_maybe_vex;
2449    OP_32_64(shr):
2450        c = SHIFT_SHR;
2451        vexop = OPC_SHRX;
2452        goto gen_shift_maybe_vex;
2453    OP_32_64(sar):
2454        c = SHIFT_SAR;
2455        vexop = OPC_SARX;
2456        goto gen_shift_maybe_vex;
2457    OP_32_64(rotl):
2458        c = SHIFT_ROL;
2459        goto gen_shift;
2460    OP_32_64(rotr):
2461        c = SHIFT_ROR;
2462        goto gen_shift;
        /*
         * shl/shr/sar land here with the SHIFT_* selector in c and the VEX
         * opcode in vexop.  With BMI2 and a register count the VEX
         * three-operand form is emitted; with BMI2 and a constant count a1
         * is first copied to a0 and the immediate form below is used.
         */
2463    gen_shift_maybe_vex:
2464        if (have_bmi2) {
2465            if (!const_a2) {
2466                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2467                break;
2468            }
2469            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2470        }
2471        /* FALLTHRU */
        /* In-place shift/rotate of a0.  For a register count a2 is not
           encoded; OPC_SHIFT_cl presumably takes the count from %cl --
           confirm against the operand constraints.  */
2472    gen_shift:
2473        if (const_a2) {
2474            tcg_out_shifti(s, c + rexw, a0, a2);
2475        } else {
2476            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2477        }
2478        break;
2479
2480    OP_32_64(ctz):
2481        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2482        break;
2483    OP_32_64(clz):
2484        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2485        break;
2486    OP_32_64(ctpop):
2487        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2488        break;
2489
2490    case INDEX_op_brcond_i32:
2491        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2492        break;
2493    case INDEX_op_setcond_i32:
2494        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2495        break;
2496    case INDEX_op_movcond_i32:
2497        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2498        break;
2499
2500    OP_32_64(bswap16):
2501        tcg_out_rolw_8(s, a0);
2502        break;
2503    OP_32_64(bswap32):
2504        tcg_out_bswap32(s, a0);
2505        break;
2506
2507    OP_32_64(neg):
2508        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2509        break;
2510    OP_32_64(not):
2511        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2512        break;
2513
2514    OP_32_64(ext8s):
2515        tcg_out_ext8s(s, a0, a1, rexw);
2516        break;
2517    OP_32_64(ext16s):
2518        tcg_out_ext16s(s, a0, a1, rexw);
2519        break;
2520    OP_32_64(ext8u):
2521        tcg_out_ext8u(s, a0, a1);
2522        break;
2523    OP_32_64(ext16u):
2524        tcg_out_ext16u(s, a0, a1);
2525        break;
2526
    /* Guest memory accesses; the last argument selects a 64-bit data
       value.  */
2527    case INDEX_op_qemu_ld_i32:
2528        tcg_out_qemu_ld(s, args, 0);
2529        break;
2530    case INDEX_op_qemu_ld_i64:
2531        tcg_out_qemu_ld(s, args, 1);
2532        break;
2533    case INDEX_op_qemu_st_i32:
2534        tcg_out_qemu_st(s, args, 0);
2535        break;
2536    case INDEX_op_qemu_st_i64:
2537        tcg_out_qemu_st(s, args, 1);
2538        break;
2539
    /* Only args[3] is encoded for the widening multiplies; the remaining
       operands are presumably fixed by the operand constraints --
       confirm.  */
2540    OP_32_64(mulu2):
2541        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2542        break;
2543    OP_32_64(muls2):
2544        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2545        break;
    /* Double-word add/sub: the low halves are combined first (a0 with
       args[4]), then the high halves (a1 with args[5]) with ADC/SBB.  */
2546    OP_32_64(add2):
2547        if (const_args[4]) {
2548            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2549        } else {
2550            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2551        }
2552        if (const_args[5]) {
2553            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2554        } else {
2555            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2556        }
2557        break;
2558    OP_32_64(sub2):
2559        if (const_args[4]) {
2560            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2561        } else {
2562            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2563        }
2564        if (const_args[5]) {
2565            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2566        } else {
2567            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2568        }
2569        break;
2570
2571#if TCG_TARGET_REG_BITS == 32
2572    case INDEX_op_brcond2_i32:
2573        tcg_out_brcond2(s, args, const_args, 0);
2574        break;
2575    case INDEX_op_setcond2_i32:
2576        tcg_out_setcond2(s, args, const_args);
2577        break;
2578#else /* TCG_TARGET_REG_BITS == 64 */
2579    case INDEX_op_ld32s_i64:
2580        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2581        break;
2582    case INDEX_op_ld_i64:
2583        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2584        break;
2585    case INDEX_op_st_i64:
2586        if (const_args[0]) {
2587            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2588            tcg_out32(s, a0);
2589        } else {
2590            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2591        }
2592        break;
2593
2594    case INDEX_op_brcond_i64:
2595        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2596        break;
2597    case INDEX_op_setcond_i64:
2598        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2599        break;
2600    case INDEX_op_movcond_i64:
2601        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2602        break;
2603
2604    case INDEX_op_bswap64_i64:
2605        tcg_out_bswap64(s, a0);
2606        break;
2607    case INDEX_op_extu_i32_i64:
2608    case INDEX_op_ext32u_i64:
2609    case INDEX_op_extrl_i64_i32:
2610        tcg_out_ext32u(s, a0, a1);
2611        break;
2612    case INDEX_op_ext_i32_i64:
2613    case INDEX_op_ext32s_i64:
2614        tcg_out_ext32s(s, a0, a1);
2615        break;
2616    case INDEX_op_extrh_i64_i32:
2617        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2618        break;
2619#endif
2620
    /* Only three (offset, length) combinations in args[3]/args[4] are
       handled; anything else aborts.  */
2621    OP_32_64(deposit):
2622        if (args[3] == 0 && args[4] == 8) {
2623            /* load bits 0..7 */
2624            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2625        } else if (args[3] == 8 && args[4] == 8) {
2626            /* load bits 8..15 */
            /* a0 + 4 addresses the high-byte register of a0.  */
2627            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2628        } else if (args[3] == 0 && args[4] == 16) {
2629            /* load bits 0..15 */
2630            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2631        } else {
2632            tcg_abort();
2633        }
2634        break;
2635
2636    case INDEX_op_extract_i64:
2637        if (a2 + args[3] == 32) {
2638            /* This is a 32-bit zero-extending right shift.  */
2639            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2640            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2641            break;
2642        }
2643        /* FALLTHRU */
2644    case INDEX_op_extract_i32:
2645        /* On the off-chance that we can use the high-byte registers.
2646           Otherwise we emit the same ext16 + shift pattern that we
2647           would have gotten from the normal tcg-op.c expansion.  */
2648        tcg_debug_assert(a2 == 8 && args[3] == 8);
2649        if (a1 < 4 && a0 < 8) {
2650            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2651        } else {
2652            tcg_out_ext16u(s, a0, a1);
2653            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2654        }
2655        break;
2656
2657    case INDEX_op_sextract_i32:
2658        /* We don't implement sextract_i64, as we cannot sign-extend to
2659           64-bits without using the REX prefix that explicitly excludes
2660           access to the high-byte registers.  */
2661        tcg_debug_assert(a2 == 8 && args[3] == 8);
2662        if (a1 < 4 && a0 < 8) {
2663            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2664        } else {
2665            tcg_out_ext16s(s, a0, a1, 0);
2666            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2667        }
2668        break;
2669
2670    OP_32_64(extract2):
2671        /* Note that SHRD outputs to the r/m operand.  */
2672        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2673        tcg_out8(s, args[3]);
2674        break;
2675
2676    case INDEX_op_mb:
2677        tcg_out_mb(s, a0);
2678        break;
2679    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2680    case INDEX_op_mov_i64:
2681    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
2682    case INDEX_op_movi_i64:
2683    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2684    default:
2685        tcg_abort();
2686    }
2687
2688#undef OP_32_64
2689}
2690
/*
 * Emit host code for one vector TCG opcode.
 *
 * @s:          code generation context, handed to every emitter call
 * @opc:        the vector opcode to emit
 * @vecl:       vector length selector; added to TCG_TYPE_V64 to form the type
 * @vece:       element size selector; indexes the 4-entry opcode tables
 * @args:       operand array; args[0..2] are cached in a0..a2 and args[3]
 *              is read by the cases that take a fourth operand
 * @const_args: not referenced by this function
 *
 * In the tables below OPC_UD2 marks an element size for which no
 * instruction is listed; picking such an entry trips the debug assertion
 * at the gen_simd label.
 */
static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
                           unsigned vecl, unsigned vece,
                           const TCGArg *args, const int *const_args)
{
    static int const add_insn[4] = {
        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
    };
    static int const ssadd_insn[4] = {
        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
    };
    static int const usadd_insn[4] = {
        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
    };
    static int const sub_insn[4] = {
        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
    };
    static int const sssub_insn[4] = {
        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
    };
    static int const ussub_insn[4] = {
        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
    };
    static int const mul_insn[4] = {
        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
    };
    static int const shift_imm_insn[4] = {
        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
    };
    static int const cmpeq_insn[4] = {
        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
    };
    static int const cmpgt_insn[4] = {
        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
    };
    static int const punpckl_insn[4] = {
        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
    };
    static int const punpckh_insn[4] = {
        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
    };
    static int const packss_insn[4] = {
        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
    };
    static int const packus_insn[4] = {
        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
    };
    static int const smin_insn[4] = {
        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
    };
    static int const smax_insn[4] = {
        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
    };
    static int const umin_insn[4] = {
        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
    };
    static int const umax_insn[4] = {
        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
    };
    static int const shlv_insn[4] = {
        /* TODO: AVX512 adds support for MO_16.  */
        OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
    };
    static int const shrv_insn[4] = {
        /* TODO: AVX512 adds support for MO_16.  */
        OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
    };
    static int const sarv_insn[4] = {
        /* TODO: AVX512 adds support for MO_16, MO_64.  */
        OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
    };
    static int const shls_insn[4] = {
        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
    };
    static int const shrs_insn[4] = {
        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
    };
    static int const sars_insn[4] = {
        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
    };
    static int const abs_insn[4] = {
        /* TODO: AVX512 adds support for MO_64.  */
        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
    };

    TCGType type = vecl + TCG_TYPE_V64;
    int insn, sub;
    TCGArg a0, a1, a2;

    a0 = args[0];
    a1 = args[1];
    a2 = args[2];

    switch (opc) {
    /*
     * The cases up to gen_simd only pick an opcode (usually from the
     * table entry for this element size) and share the emission tail.
     */
    case INDEX_op_add_vec:
        insn = add_insn[vece];
        goto gen_simd;
    case INDEX_op_ssadd_vec:
        insn = ssadd_insn[vece];
        goto gen_simd;
    case INDEX_op_usadd_vec:
        insn = usadd_insn[vece];
        goto gen_simd;
    case INDEX_op_sub_vec:
        insn = sub_insn[vece];
        goto gen_simd;
    case INDEX_op_sssub_vec:
        insn = sssub_insn[vece];
        goto gen_simd;
    case INDEX_op_ussub_vec:
        insn = ussub_insn[vece];
        goto gen_simd;
    case INDEX_op_mul_vec:
        insn = mul_insn[vece];
        goto gen_simd;
    case INDEX_op_and_vec:
        insn = OPC_PAND;
        goto gen_simd;
    case INDEX_op_or_vec:
        insn = OPC_POR;
        goto gen_simd;
    case INDEX_op_xor_vec:
        insn = OPC_PXOR;
        goto gen_simd;
    case INDEX_op_smin_vec:
        insn = smin_insn[vece];
        goto gen_simd;
    case INDEX_op_umin_vec:
        insn = umin_insn[vece];
        goto gen_simd;
    case INDEX_op_smax_vec:
        insn = smax_insn[vece];
        goto gen_simd;
    case INDEX_op_umax_vec:
        insn = umax_insn[vece];
        goto gen_simd;
    case INDEX_op_shlv_vec:
        insn = shlv_insn[vece];
        goto gen_simd;
    case INDEX_op_shrv_vec:
        insn = shrv_insn[vece];
        goto gen_simd;
    case INDEX_op_sarv_vec:
        insn = sarv_insn[vece];
        goto gen_simd;
    case INDEX_op_shls_vec:
        insn = shls_insn[vece];
        goto gen_simd;
    case INDEX_op_shrs_vec:
        insn = shrs_insn[vece];
        goto gen_simd;
    case INDEX_op_sars_vec:
        insn = sars_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_punpckl_vec:
        insn = punpckl_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_punpckh_vec:
        insn = punpckh_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_packss_vec:
        insn = packss_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_packus_vec:
        insn = packus_insn[vece];
        goto gen_simd;
#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_dup2_vec:
        /* First merge the two 32-bit inputs to a single 64-bit element. */
        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
        /* Then replicate the 64-bit elements across the rest of the vector. */
        if (type != TCG_TYPE_V64) {
            tcg_out_dup_vec(s, type, MO_64, a0, a0);
        }
        break;
#endif
    case INDEX_op_abs_vec:
        insn = abs_insn[vece];
        /*
         * Single-source operation: hand the source over as the last
         * operand and pass 0 in the middle operand slot.
         */
        a2 = a1;
        a1 = 0;
        goto gen_simd;
    gen_simd:
        /*
         * Shared tail: reject element sizes without an instruction,
         * add P_VEXL for TCG_TYPE_V256, then emit.
         */
        tcg_debug_assert(insn != OPC_UD2);
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, a0, a1, a2);
        break;

    case INDEX_op_cmp_vec:
        /* Only EQ and GT are accepted here; anything else asserts. */
        sub = args[3];
        if (sub == TCG_COND_EQ) {
            insn = cmpeq_insn[vece];
        } else if (sub == TCG_COND_GT) {
            insn = cmpgt_insn[vece];
        } else {
            g_assert_not_reached();
        }
        goto gen_simd;

    case INDEX_op_andc_vec:
        insn = OPC_PANDN;
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        /*
         * The two sources are passed swapped (a2, a1) relative to gen_simd.
         * NOTE(review): presumably because PANDN complements its first
         * source operand -- confirm against the ISA reference.
         */
        tcg_out_vex_modrm(s, insn, a0, a2, a1);
        break;

    /*
     * Immediate shifts: sub picks the kind of shift and is passed in the
     * position where the other emitters pass the destination register;
     * the shift count a2 follows via tcg_out8.
     */
    case INDEX_op_shli_vec:
        sub = 6;
        goto gen_shift;
    case INDEX_op_shri_vec:
        sub = 2;
        goto gen_shift;
    case INDEX_op_sari_vec:
        tcg_debug_assert(vece != MO_64);
        sub = 4;
    gen_shift:
        tcg_debug_assert(vece != MO_8);
        insn = shift_imm_insn[vece];
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, sub, a0, a1);
        tcg_out8(s, a2);
        break;

    case INDEX_op_ld_vec:
        tcg_out_ld(s, type, a0, a1, a2);
        break;
    case INDEX_op_st_vec:
        tcg_out_st(s, type, a0, a1, a2);
        break;
    case INDEX_op_dupm_vec:
        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
        break;

    /* Three-operand instructions followed by args[3] via tcg_out8. */
    case INDEX_op_x86_shufps_vec:
        insn = OPC_SHUFPS;
        sub = args[3];
        goto gen_simd_imm8;
    case INDEX_op_x86_blend_vec:
        if (vece == MO_16) {
            insn = OPC_PBLENDW;
        } else if (vece == MO_32) {
            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
        } else {
            g_assert_not_reached();
        }
        sub = args[3];
        goto gen_simd_imm8;
    case INDEX_op_x86_vperm2i128_vec:
        insn = OPC_VPERM2I128;
        sub = args[3];
        goto gen_simd_imm8;
    gen_simd_imm8:
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, a0, a1, a2);
        tcg_out8(s, sub);
        break;

    case INDEX_op_x86_vpblendvb_vec:
        insn = OPC_VPBLENDVB;
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, a0, a1, a2);
        /* The fourth operand is shifted into bits 7:4 of the final value. */
        tcg_out8(s, args[3] << 4);
        break;

    case INDEX_op_x86_psrldq_vec:
        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
        tcg_out8(s, a2);
        break;

    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
    case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi.  */
    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
    default:
        g_assert_not_reached();
    }
}
2974
/*
 * Return the operand-constraint definition for @op, or NULL when @op has
 * no entry in the switch below.
 *
 * Each TCGTargetOpDef carries one constraint string per operand in
 * .args_ct_str.  The shared definitions declared first are named after
 * their strings (e.g. r_0_re is { "r", "0", "re" }); definitions used by
 * a single group of opcodes are declared inside the case that returns
 * them.  All of them have static storage, so the returned pointer stays
 * valid after the call.
 */
static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
{
    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
    static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
    static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
    static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
    static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
    static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
    static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
    static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
    static const TCGTargetOpDef r_0_r = { .args_ct_str = { "r", "0", "r" } };
    static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
    static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
    static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
    static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
    static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
    static const TCGTargetOpDef r_r_L_L
        = { .args_ct_str = { "r", "r", "L", "L" } };
    static const TCGTargetOpDef L_L_L_L
        = { .args_ct_str = { "L", "L", "L", "L" } };
    static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
    static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
    static const TCGTargetOpDef x_x_x_x
        = { .args_ct_str = { "x", "x", "x", "x" } };
    static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };

    switch (op) {
    case INDEX_op_goto_ptr:
        return &r;

    case INDEX_op_ld8u_i32:
    case INDEX_op_ld8u_i64:
    case INDEX_op_ld8s_i32:
    case INDEX_op_ld8s_i64:
    case INDEX_op_ld16u_i32:
    case INDEX_op_ld16u_i64:
    case INDEX_op_ld16s_i32:
    case INDEX_op_ld16s_i64:
    case INDEX_op_ld_i32:
    case INDEX_op_ld32u_i64:
    case INDEX_op_ld32s_i64:
    case INDEX_op_ld_i64:
        return &r_r;

    /* Stores get a different first-operand constraint per access width. */
    case INDEX_op_st8_i32:
    case INDEX_op_st8_i64:
        return &qi_r;
    case INDEX_op_st16_i32:
    case INDEX_op_st16_i64:
    case INDEX_op_st_i32:
    case INDEX_op_st32_i64:
        return &ri_r;
    case INDEX_op_st_i64:
        return &re_r;

    /* add is the only one of these whose second operand is not "0". */
    case INDEX_op_add_i32:
    case INDEX_op_add_i64:
        return &r_r_re;
    case INDEX_op_sub_i32:
    case INDEX_op_sub_i64:
    case INDEX_op_mul_i32:
    case INDEX_op_mul_i64:
    case INDEX_op_or_i32:
    case INDEX_op_or_i64:
    case INDEX_op_xor_i32:
    case INDEX_op_xor_i64:
        return &r_0_re;

    case INDEX_op_and_i32:
    case INDEX_op_and_i64:
        {
            static const TCGTargetOpDef and
                = { .args_ct_str = { "r", "0", "reZ" } };
            return &and;
        }
        /* Not reached: the block above always returns. */
        break;
    case INDEX_op_andc_i32:
    case INDEX_op_andc_i64:
        {
            static const TCGTargetOpDef andc
                = { .args_ct_str = { "r", "r", "rI" } };
            return &andc;
        }
        /* Not reached: the block above always returns. */
        break;

    case INDEX_op_shl_i32:
    case INDEX_op_shl_i64:
    case INDEX_op_shr_i32:
    case INDEX_op_shr_i64:
    case INDEX_op_sar_i32:
    case INDEX_op_sar_i64:
        /* With have_bmi2 set the r_r_ri definition applies, else r_0_ci. */
        return have_bmi2 ? &r_r_ri : &r_0_ci;
    case INDEX_op_rotl_i32:
    case INDEX_op_rotl_i64:
    case INDEX_op_rotr_i32:
    case INDEX_op_rotr_i64:
        return &r_0_ci;

    case INDEX_op_brcond_i32:
    case INDEX_op_brcond_i64:
        return &r_re;

    case INDEX_op_bswap16_i32:
    case INDEX_op_bswap16_i64:
    case INDEX_op_bswap32_i32:
    case INDEX_op_bswap32_i64:
    case INDEX_op_bswap64_i64:
    case INDEX_op_neg_i32:
    case INDEX_op_neg_i64:
    case INDEX_op_not_i32:
    case INDEX_op_not_i64:
    case INDEX_op_extrh_i64_i32:
        return &r_0;

    case INDEX_op_ext8s_i32:
    case INDEX_op_ext8s_i64:
    case INDEX_op_ext8u_i32:
    case INDEX_op_ext8u_i64:
        return &r_q;
    case INDEX_op_ext16s_i32:
    case INDEX_op_ext16s_i64:
    case INDEX_op_ext16u_i32:
    case INDEX_op_ext16u_i64:
    case INDEX_op_ext32s_i64:
    case INDEX_op_ext32u_i64:
    case INDEX_op_ext_i32_i64:
    case INDEX_op_extu_i32_i64:
    case INDEX_op_extrl_i64_i32:
    case INDEX_op_extract_i32:
    case INDEX_op_extract_i64:
    case INDEX_op_sextract_i32:
    case INDEX_op_ctpop_i32:
    case INDEX_op_ctpop_i64:
        return &r_r;
    case INDEX_op_extract2_i32:
    case INDEX_op_extract2_i64:
        return &r_0_r;

    case INDEX_op_deposit_i32:
    case INDEX_op_deposit_i64:
        {
            static const TCGTargetOpDef dep
                = { .args_ct_str = { "Q", "0", "Q" } };
            return &dep;
        }
    case INDEX_op_setcond_i32:
    case INDEX_op_setcond_i64:
        {
            static const TCGTargetOpDef setc
                = { .args_ct_str = { "q", "r", "re" } };
            return &setc;
        }
    case INDEX_op_movcond_i32:
    case INDEX_op_movcond_i64:
        {
            static const TCGTargetOpDef movc
                = { .args_ct_str = { "r", "r", "re", "r", "0" } };
            return &movc;
        }
    case INDEX_op_div2_i32:
    case INDEX_op_div2_i64:
    case INDEX_op_divu2_i32:
    case INDEX_op_divu2_i64:
        {
            static const TCGTargetOpDef div2
                = { .args_ct_str = { "a", "d", "0", "1", "r" } };
            return &div2;
        }
    case INDEX_op_mulu2_i32:
    case INDEX_op_mulu2_i64:
    case INDEX_op_muls2_i32:
    case INDEX_op_muls2_i64:
        {
            static const TCGTargetOpDef mul2
                = { .args_ct_str = { "a", "d", "a", "r" } };
            return &mul2;
        }
    case INDEX_op_add2_i32:
    case INDEX_op_add2_i64:
    case INDEX_op_sub2_i32:
    case INDEX_op_sub2_i64:
        {
            static const TCGTargetOpDef arith2
                = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
            return &arith2;
        }
    case INDEX_op_ctz_i32:
    case INDEX_op_ctz_i64:
        {
            /*
             * Indexed by have_bmi1: entry 1 additionally lists "W" for
             * the last operand.
             */
            static const TCGTargetOpDef ctz[2] = {
                { .args_ct_str = { "&r", "r", "r" } },
                { .args_ct_str = { "&r", "r", "rW" } },
            };
            return &ctz[have_bmi1];
        }
    case INDEX_op_clz_i32:
    case INDEX_op_clz_i64:
        {
            /* Same scheme as ctz above, but indexed by have_lzcnt. */
            static const TCGTargetOpDef clz[2] = {
                { .args_ct_str = { "&r", "r", "r" } },
                { .args_ct_str = { "&r", "r", "rW" } },
            };
            return &clz[have_lzcnt];
        }

    /*
     * Guest memory accesses: the number of operands depends on how
     * TARGET_LONG_BITS and TCG_TARGET_REG_BITS compare, and for the
     * 64-bit data forms on whether TCG_TARGET_REG_BITS is 64.
     */
    case INDEX_op_qemu_ld_i32:
        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
    case INDEX_op_qemu_st_i32:
        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
    case INDEX_op_qemu_ld_i64:
        return (TCG_TARGET_REG_BITS == 64 ? &r_L
                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
                : &r_r_L_L);
    case INDEX_op_qemu_st_i64:
        return (TCG_TARGET_REG_BITS == 64 ? &L_L
                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
                : &L_L_L_L);

    case INDEX_op_brcond2_i32:
        {
            static const TCGTargetOpDef b2
                = { .args_ct_str = { "r", "r", "ri", "ri" } };
            return &b2;
        }
    case INDEX_op_setcond2_i32:
        {
            static const TCGTargetOpDef s2
                = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
            return &s2;
        }

    /* Vector opcodes use the "x" constraint for their vector operands. */
    case INDEX_op_ld_vec:
    case INDEX_op_st_vec:
    case INDEX_op_dupm_vec:
        return &x_r;

    case INDEX_op_add_vec:
    case INDEX_op_sub_vec:
    case INDEX_op_mul_vec:
    case INDEX_op_and_vec:
    case INDEX_op_or_vec:
    case INDEX_op_xor_vec:
    case INDEX_op_andc_vec:
    case INDEX_op_ssadd_vec:
    case INDEX_op_usadd_vec:
    case INDEX_op_sssub_vec:
    case INDEX_op_ussub_vec:
    case INDEX_op_smin_vec:
    case INDEX_op_umin_vec:
    case INDEX_op_smax_vec:
    case INDEX_op_umax_vec:
    case INDEX_op_shlv_vec:
    case INDEX_op_shrv_vec:
    case INDEX_op_sarv_vec:
    case INDEX_op_shls_vec:
    case INDEX_op_shrs_vec:
    case INDEX_op_sars_vec:
    case INDEX_op_rotls_vec:
    case INDEX_op_cmp_vec:
    case INDEX_op_x86_shufps_vec:
    case INDEX_op_x86_blend_vec:
    case INDEX_op_x86_packss_vec:
    case INDEX_op_x86_packus_vec:
    case INDEX_op_x86_vperm2i128_vec:
    case INDEX_op_x86_punpckl_vec:
    case INDEX_op_x86_punpckh_vec:
#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_dup2_vec:
#endif
        return &x_x_x;
    case INDEX_op_abs_vec:
    case INDEX_op_dup_vec:
    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
    case INDEX_op_sari_vec:
    case INDEX_op_x86_psrldq_vec:
        return &x_x;
    case INDEX_op_x86_vpblendvb_vec:
        return &x_x_x_x;

    default:
        break;
    }
    /* No definition for this opcode. */
    return NULL;
}
3264
3265int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3266{
3267    switch (opc) {
3268    case INDEX_op_add_vec:
3269    case INDEX_op_sub_vec:
3270    case INDEX_op_and_vec:
3271    case INDEX_op_or_vec:
3272    case INDEX_op_xor_vec:
3273    case INDEX_op_andc_vec:
3274        return 1;
3275    case INDEX_op_rotli_vec:
3276    case INDEX_op_cmp_vec:
3277    case INDEX_op_cmpsel_vec:
3278        return -1;
3279
3280    case INDEX_op_shli_vec:
3281    case INDEX_op_shri_vec:
3282        /* We must expand the operation for MO_8.  */
3283        return vece == MO_8 ? -1 : 1;
3284
3285    case INDEX_op_sari_vec:
3286        /* We must expand the operation for MO_8.  */
3287        if (vece == MO_8) {
3288            return -1;
3289        }
3290        /* We can emulate this for MO_64, but it does not pay off
3291           unless we're producing at least 4 values.  */
3292        if (vece == MO_64) {
3293            return type >= TCG_TYPE_V256 ? -1 : 0;
3294        }
3295        return 1;
3296
3297    case INDEX_op_shls_vec:
3298    case INDEX_op_shrs_vec:
3299        return vece >= MO_16;
3300    case INDEX_op_sars_vec:
3301        return vece >= MO_16 && vece <= MO_32;
3302    case INDEX_op_rotls_vec:
3303        return vece >= MO_16 ? -1 : 0;
3304
3305    case INDEX_op_shlv_vec:
3306    case INDEX_op_shrv_vec:
3307        return have_avx2 && vece >= MO_32;
3308    case INDEX_op_sarv_vec:
3309        return have_avx2 && vece == MO_32;
3310    case INDEX_op_rotlv_vec:
3311    case INDEX_op_rotrv_vec:
3312        return have_avx2 && vece >= MO_32 ? -1 : 0;
3313
3314    case INDEX_op_mul_vec:
3315        if (vece == MO_8) {
3316            /* We can expand the operation for MO_8.  */
3317            return -1;
3318        }
3319        if (vece == MO_64) {
3320            return 0;
3321        }
3322        return 1;
3323
3324    case INDEX_op_ssadd_vec:
3325    case INDEX_op_usadd_vec:
3326    case INDEX_op_sssub_vec:
3327    case INDEX_op_ussub_vec:
3328        return vece <= MO_16;
3329    case INDEX_op_smin_vec:
3330    case INDEX_op_smax_vec:
3331    case INDEX_op_umin_vec:
3332    case INDEX_op_umax_vec:
3333    case INDEX_op_abs_vec:
3334        return vece <= MO_32;
3335
3336    default:
3337        return 0;
3338    }
3339}
3340
3341static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3342                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3343{
3344    TCGv_vec t1, t2;
3345
3346    tcg_debug_assert(vece == MO_8);
3347
3348    t1 = tcg_temp_new_vec(type);
3349    t2 = tcg_temp_new_vec(type);
3350
3351    /*
3352     * Unpack to W, shift, and repack.  Tricky bits:
3353     * (1) Use punpck*bw x,x to produce DDCCBBAA,
3354     *     i.e. duplicate in other half of the 16-bit lane.
3355     * (2) For right-shift, add 8 so that the high half of the lane
3356     *     becomes zero.  For left-shift, and left-rotate, we must
3357     *     shift up and down again.
3358     * (3) Step 2 leaves high half zero such that PACKUSWB
3359     *     (pack with unsigned saturation) does not modify
3360     *     the quantity.
3361     */
3362    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3363              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3364    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3365              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3366
3367    if (opc != INDEX_op_rotli_vec) {
3368        imm += 8;
3369    }
3370    if (opc == INDEX_op_shri_vec) {
3371        tcg_gen_shri_vec(MO_16, t1, t1, imm);
3372        tcg_gen_shri_vec(MO_16, t2, t2, imm);
3373    } else {
3374        tcg_gen_shli_vec(MO_16, t1, t1, imm);
3375        tcg_gen_shli_vec(MO_16, t2, t2, imm);
3376        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3377        tcg_gen_shri_vec(MO_16, t2, t2, 8);
3378    }
3379
3380    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3381              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3382    tcg_temp_free_vec(t1);
3383    tcg_temp_free_vec(t2);
3384}
3385
3386static void expand_vec_sari(TCGType type, unsigned vece,
3387                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3388{
3389    TCGv_vec t1, t2;
3390
3391    switch (vece) {
3392    case MO_8:
3393        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3394        t1 = tcg_temp_new_vec(type);
3395        t2 = tcg_temp_new_vec(type);
3396        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3397                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3398        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3399                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3400        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3401        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3402        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3403                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3404        tcg_temp_free_vec(t1);
3405        tcg_temp_free_vec(t2);
3406        break;
3407
3408    case MO_64:
3409        if (imm <= 32) {
3410            /*
3411             * We can emulate a small sign extend by performing an arithmetic
3412             * 32-bit shift and overwriting the high half of a 64-bit logical
3413             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3414             * does not, so we have to bound the smaller shift -- we get the
3415             * same result in the high half either way.
3416             */
3417            t1 = tcg_temp_new_vec(type);
3418            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3419            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3420            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3421                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3422                      tcgv_vec_arg(t1), 0xaa);
3423            tcg_temp_free_vec(t1);
3424        } else {
3425            /* Otherwise we will need to use a compare vs 0 to produce
3426             * the sign-extend, shift and merge.
3427             */
3428            t1 = tcg_const_zeros_vec(type);
3429            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3430            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3431            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3432            tcg_gen_or_vec(MO_64, v0, v0, t1);
3433            tcg_temp_free_vec(t1);
3434        }
3435        break;
3436
3437    default:
3438        g_assert_not_reached();
3439    }
3440}
3441
3442static void expand_vec_rotli(TCGType type, unsigned vece,
3443                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3444{
3445    TCGv_vec t;
3446
3447    if (vece == MO_8) {
3448        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3449        return;
3450    }
3451
3452    t = tcg_temp_new_vec(type);
3453    tcg_gen_shli_vec(vece, t, v1, imm);
3454    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3455    tcg_gen_or_vec(vece, v0, v0, t);
3456    tcg_temp_free_vec(t);
3457}
3458
3459static void expand_vec_rotls(TCGType type, unsigned vece,
3460                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3461{
3462    TCGv_i32 rsh;
3463    TCGv_vec t;
3464
3465    tcg_debug_assert(vece != MO_8);
3466
3467    t = tcg_temp_new_vec(type);
3468    rsh = tcg_temp_new_i32();
3469
3470    tcg_gen_neg_i32(rsh, lsh);
3471    tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3472    tcg_gen_shls_vec(vece, t, v1, lsh);
3473    tcg_gen_shrs_vec(vece, v0, v1, rsh);
3474    tcg_gen_or_vec(vece, v0, v0, t);
3475    tcg_temp_free_vec(t);
3476    tcg_temp_free_i32(rsh);
3477}
3478
3479static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3480                            TCGv_vec v1, TCGv_vec sh, bool right)
3481{
3482    TCGv_vec t = tcg_temp_new_vec(type);
3483
3484    tcg_gen_dupi_vec(vece, t, 8 << vece);
3485    tcg_gen_sub_vec(vece, t, t, sh);
3486    if (right) {
3487        tcg_gen_shlv_vec(vece, t, v1, t);
3488        tcg_gen_shrv_vec(vece, v0, v1, sh);
3489    } else {
3490        tcg_gen_shrv_vec(vece, t, v1, t);
3491        tcg_gen_shlv_vec(vece, v0, v1, sh);
3492    }
3493    tcg_gen_or_vec(vece, v0, v0, t);
3494    tcg_temp_free_vec(t);
3495}
3496
3497static void expand_vec_mul(TCGType type, unsigned vece,
3498                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3499{
3500    TCGv_vec t1, t2, t3, t4;
3501
3502    tcg_debug_assert(vece == MO_8);
3503
3504    /*
3505     * Unpack v1 bytes to words, 0 | x.
3506     * Unpack v2 bytes to words, y | 0.
3507     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3508     * Shift logical right by 8 bits to clear the high 8 bytes before
3509     * using an unsigned saturated pack.
3510     *
3511     * The difference between the V64, V128 and V256 cases is merely how
3512     * we distribute the expansion between temporaries.
3513     */
3514    switch (type) {
3515    case TCG_TYPE_V64:
3516        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3517        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3518        tcg_gen_dup16i_vec(t2, 0);
3519        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3520                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2));
3521        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3522                  tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2));
3523        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3524        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3525        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3526                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3527        tcg_temp_free_vec(t1);
3528        tcg_temp_free_vec(t2);
3529        break;
3530
3531    case TCG_TYPE_V128:
3532    case TCG_TYPE_V256:
3533        t1 = tcg_temp_new_vec(type);
3534        t2 = tcg_temp_new_vec(type);
3535        t3 = tcg_temp_new_vec(type);
3536        t4 = tcg_temp_new_vec(type);
3537        tcg_gen_dup16i_vec(t4, 0);
3538        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3539                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
3540        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3541                  tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
3542        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3543                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
3544        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3545                  tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
3546        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3547        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3548        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3549        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3550        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3551                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3552        tcg_temp_free_vec(t1);
3553        tcg_temp_free_vec(t2);
3554        tcg_temp_free_vec(t3);
3555        tcg_temp_free_vec(t4);
3556        break;
3557
3558    default:
3559        g_assert_not_reached();
3560    }
3561}
3562
3563static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3564                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3565{
3566    enum {
3567        NEED_INV  = 1,
3568        NEED_SWAP = 2,
3569        NEED_BIAS = 4,
3570        NEED_UMIN = 8,
3571        NEED_UMAX = 16,
3572    };
3573    TCGv_vec t1, t2;
3574    uint8_t fixup;
3575
3576    switch (cond) {
3577    case TCG_COND_EQ:
3578    case TCG_COND_GT:
3579        fixup = 0;
3580        break;
3581    case TCG_COND_NE:
3582    case TCG_COND_LE:
3583        fixup = NEED_INV;
3584        break;
3585    case TCG_COND_LT:
3586        fixup = NEED_SWAP;
3587        break;
3588    case TCG_COND_GE:
3589        fixup = NEED_SWAP | NEED_INV;
3590        break;
3591    case TCG_COND_LEU:
3592        if (vece <= MO_32) {
3593            fixup = NEED_UMIN;
3594        } else {
3595            fixup = NEED_BIAS | NEED_INV;
3596        }
3597        break;
3598    case TCG_COND_GTU:
3599        if (vece <= MO_32) {
3600            fixup = NEED_UMIN | NEED_INV;
3601        } else {
3602            fixup = NEED_BIAS;
3603        }
3604        break;
3605    case TCG_COND_GEU:
3606        if (vece <= MO_32) {
3607            fixup = NEED_UMAX;
3608        } else {
3609            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3610        }
3611        break;
3612    case TCG_COND_LTU:
3613        if (vece <= MO_32) {
3614            fixup = NEED_UMAX | NEED_INV;
3615        } else {
3616            fixup = NEED_BIAS | NEED_SWAP;
3617        }
3618        break;
3619    default:
3620        g_assert_not_reached();
3621    }
3622
3623    if (fixup & NEED_INV) {
3624        cond = tcg_invert_cond(cond);
3625    }
3626    if (fixup & NEED_SWAP) {
3627        t1 = v1, v1 = v2, v2 = t1;
3628        cond = tcg_swap_cond(cond);
3629    }
3630
3631    t1 = t2 = NULL;
3632    if (fixup & (NEED_UMIN | NEED_UMAX)) {
3633        t1 = tcg_temp_new_vec(type);
3634        if (fixup & NEED_UMIN) {
3635            tcg_gen_umin_vec(vece, t1, v1, v2);
3636        } else {
3637            tcg_gen_umax_vec(vece, t1, v1, v2);
3638        }
3639        v2 = t1;
3640        cond = TCG_COND_EQ;
3641    } else if (fixup & NEED_BIAS) {
3642        t1 = tcg_temp_new_vec(type);
3643        t2 = tcg_temp_new_vec(type);
3644        tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
3645        tcg_gen_sub_vec(vece, t1, v1, t2);
3646        tcg_gen_sub_vec(vece, t2, v2, t2);
3647        v1 = t1;
3648        v2 = t2;
3649        cond = tcg_signed_cond(cond);
3650    }
3651
3652    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3653    /* Expand directly; do not recurse.  */
3654    vec_gen_4(INDEX_op_cmp_vec, type, vece,
3655              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3656
3657    if (t1) {
3658        tcg_temp_free_vec(t1);
3659        if (t2) {
3660            tcg_temp_free_vec(t2);
3661        }
3662    }
3663    return fixup & NEED_INV;
3664}
3665
3666static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3667                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3668{
3669    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3670        tcg_gen_not_vec(vece, v0, v0);
3671    }
3672}
3673
3674static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3675                              TCGv_vec c1, TCGv_vec c2,
3676                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3677{
3678    TCGv_vec t = tcg_temp_new_vec(type);
3679
3680    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3681        /* Invert the sense of the compare by swapping arguments.  */
3682        TCGv_vec x;
3683        x = v3, v3 = v4, v4 = x;
3684    }
3685    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3686              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3687              tcgv_vec_arg(v3), tcgv_vec_arg(t));
3688    tcg_temp_free_vec(t);
3689}
3690
3691void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3692                       TCGArg a0, ...)
3693{
3694    va_list va;
3695    TCGArg a2;
3696    TCGv_vec v0, v1, v2, v3, v4;
3697
3698    va_start(va, a0);
3699    v0 = temp_tcgv_vec(arg_temp(a0));
3700    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3701    a2 = va_arg(va, TCGArg);
3702
3703    switch (opc) {
3704    case INDEX_op_shli_vec:
3705    case INDEX_op_shri_vec:
3706        expand_vec_shi(type, vece, opc, v0, v1, a2);
3707        break;
3708
3709    case INDEX_op_sari_vec:
3710        expand_vec_sari(type, vece, v0, v1, a2);
3711        break;
3712
3713    case INDEX_op_rotli_vec:
3714        expand_vec_rotli(type, vece, v0, v1, a2);
3715        break;
3716
3717    case INDEX_op_rotls_vec:
3718        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3719        break;
3720
3721    case INDEX_op_rotlv_vec:
3722        v2 = temp_tcgv_vec(arg_temp(a2));
3723        expand_vec_rotv(type, vece, v0, v1, v2, false);
3724        break;
3725    case INDEX_op_rotrv_vec:
3726        v2 = temp_tcgv_vec(arg_temp(a2));
3727        expand_vec_rotv(type, vece, v0, v1, v2, true);
3728        break;
3729
3730    case INDEX_op_mul_vec:
3731        v2 = temp_tcgv_vec(arg_temp(a2));
3732        expand_vec_mul(type, vece, v0, v1, v2);
3733        break;
3734
3735    case INDEX_op_cmp_vec:
3736        v2 = temp_tcgv_vec(arg_temp(a2));
3737        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3738        break;
3739
3740    case INDEX_op_cmpsel_vec:
3741        v2 = temp_tcgv_vec(arg_temp(a2));
3742        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3743        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3744        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3745        break;
3746
3747    default:
3748        break;
3749    }
3750
3751    va_end(va);
3752}
3753
3754static const int tcg_target_callee_save_regs[] = {
3755#if TCG_TARGET_REG_BITS == 64
3756    TCG_REG_RBP,
3757    TCG_REG_RBX,
3758#if defined(_WIN64)
3759    TCG_REG_RDI,
3760    TCG_REG_RSI,
3761#endif
3762    TCG_REG_R12,
3763    TCG_REG_R13,
3764    TCG_REG_R14, /* Currently used for the global env. */
3765    TCG_REG_R15,
3766#else
3767    TCG_REG_EBP, /* Currently used for the global env. */
3768    TCG_REG_EBX,
3769    TCG_REG_ESI,
3770    TCG_REG_EDI,
3771#endif
3772};
3773
/*
 * The frame size is computed with macros so that the same values are
 * shared between tcg_target_qemu_prologue and tcg_register_jit.
 */

/* One register-sized slot per callee-saved register, plus one more.  */
#define PUSH_SIZE \
    ((ARRAY_SIZE(tcg_target_callee_save_regs) + 1) \
     * (TCG_TARGET_REG_BITS / 8))

/*
 * Pushed registers, static call argument area and TCG temp buffer,
 * rounded up to a multiple of TCG_TARGET_STACK_ALIGN.
 */
#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + (TCG_TARGET_STACK_ALIGN - 1)) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
3787
3788/* Generate global QEMU prologue and epilogue code */
3789static void tcg_target_qemu_prologue(TCGContext *s)
3790{
3791    int i, stack_addend;
3792
3793    /* TB prologue */
3794
3795    /* Reserve some stack space, also for TCG temps.  */
3796    stack_addend = FRAME_SIZE - PUSH_SIZE;
3797    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3798                  CPU_TEMP_BUF_NLONGS * sizeof(long));
3799
3800    /* Save all callee saved registers.  */
3801    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3802        tcg_out_push(s, tcg_target_callee_save_regs[i]);
3803    }
3804
3805#if TCG_TARGET_REG_BITS == 32
3806    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3807               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3808    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3809    /* jmp *tb.  */
3810    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3811                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3812                         + stack_addend);
3813#else
3814# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
3815    if (guest_base) {
3816        int seg = setup_guest_base_seg();
3817        if (seg != 0) {
3818            x86_guest_base_seg = seg;
3819        } else if (guest_base == (int32_t)guest_base) {
3820            x86_guest_base_offset = guest_base;
3821        } else {
3822            /* Choose R12 because, as a base, it requires a SIB byte. */
3823            x86_guest_base_index = TCG_REG_R12;
3824            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
3825            tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
3826        }
3827    }
3828# endif
3829    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3830    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3831    /* jmp *tb.  */
3832    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3833#endif
3834
3835    /*
3836     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3837     * and fall through to the rest of the epilogue.
3838     */
3839    s->code_gen_epilogue = s->code_ptr;
3840    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3841
3842    /* TB epilogue */
3843    tb_ret_addr = s->code_ptr;
3844
3845    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3846
3847    if (have_avx2) {
3848        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3849    }
3850    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3851        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3852    }
3853    tcg_out_opc(s, OPC_RET, 0, 0, 0);
3854}
3855
3856static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3857{
3858    memset(p, 0x90, count);
3859}
3860
3861static void tcg_target_init(TCGContext *s)
3862{
3863#ifdef CONFIG_CPUID_H
3864    unsigned a, b, c, d, b7 = 0;
3865    int max = __get_cpuid_max(0, 0);
3866
3867    if (max >= 7) {
3868        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
3869        __cpuid_count(7, 0, a, b7, c, d);
3870        have_bmi1 = (b7 & bit_BMI) != 0;
3871        have_bmi2 = (b7 & bit_BMI2) != 0;
3872    }
3873
3874    if (max >= 1) {
3875        __cpuid(1, a, b, c, d);
3876#ifndef have_cmov
3877        /* For 32-bit, 99% certainty that we're running on hardware that
3878           supports cmov, but we still need to check.  In case cmov is not
3879           available, we'll use a small forward branch.  */
3880        have_cmov = (d & bit_CMOV) != 0;
3881#endif
3882
3883        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3884           need to probe for it.  */
3885        have_movbe = (c & bit_MOVBE) != 0;
3886        have_popcnt = (c & bit_POPCNT) != 0;
3887
3888        /* There are a number of things we must check before we can be
3889           sure of not hitting invalid opcode.  */
3890        if (c & bit_OSXSAVE) {
3891            unsigned xcrl, xcrh;
3892            /* The xgetbv instruction is not available to older versions of
3893             * the assembler, so we encode the instruction manually.
3894             */
3895            asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
3896            if ((xcrl & 6) == 6) {
3897                have_avx1 = (c & bit_AVX) != 0;
3898                have_avx2 = (b7 & bit_AVX2) != 0;
3899            }
3900        }
3901    }
3902
3903    max = __get_cpuid_max(0x8000000, 0);
3904    if (max >= 1) {
3905        __cpuid(0x80000001, a, b, c, d);
3906        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
3907        have_lzcnt = (c & bit_LZCNT) != 0;
3908    }
3909#endif /* CONFIG_CPUID_H */
3910
3911    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3912    if (TCG_TARGET_REG_BITS == 64) {
3913        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3914    }
3915    if (have_avx1) {
3916        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3917        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3918    }
3919    if (have_avx2) {
3920        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3921    }
3922
3923    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3924    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3925    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3926    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3927    if (TCG_TARGET_REG_BITS == 64) {
3928#if !defined(_WIN64)
3929        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3930        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3931#endif
3932        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3933        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3934        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3935        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3936    }
3937
3938    s->reserved_regs = 0;
3939    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3940}
3941
3942typedef struct {
3943    DebugFrameHeader h;
3944    uint8_t fde_def_cfa[4];
3945    uint8_t fde_reg_ofs[14];
3946} DebugFrame;
3947
3948/* We're expecting a 2 byte uleb128 encoded value.  */
3949QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3950
3951#if !defined(__ELF__)
3952    /* Host machine without ELF. */
3953#elif TCG_TARGET_REG_BITS == 64
3954#define ELF_HOST_MACHINE EM_X86_64
3955static const DebugFrame debug_frame = {
3956    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3957    .h.cie.id = -1,
3958    .h.cie.version = 1,
3959    .h.cie.code_align = 1,
3960    .h.cie.data_align = 0x78,             /* sleb128 -8 */
3961    .h.cie.return_column = 16,
3962
3963    /* Total FDE size does not include the "len" member.  */
3964    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3965
3966    .fde_def_cfa = {
3967        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
3968        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3969        (FRAME_SIZE >> 7)
3970    },
3971    .fde_reg_ofs = {
3972        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
3973        /* The following ordering must match tcg_target_callee_save_regs.  */
3974        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
3975        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
3976        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
3977        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
3978        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
3979        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
3980    }
3981};
3982#else
3983#define ELF_HOST_MACHINE EM_386
3984static const DebugFrame debug_frame = {
3985    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3986    .h.cie.id = -1,
3987    .h.cie.version = 1,
3988    .h.cie.code_align = 1,
3989    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
3990    .h.cie.return_column = 8,
3991
3992    /* Total FDE size does not include the "len" member.  */
3993    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3994
3995    .fde_def_cfa = {
3996        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
3997        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3998        (FRAME_SIZE >> 7)
3999    },
4000    .fde_reg_ofs = {
4001        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4002        /* The following ordering must match tcg_target_callee_save_regs.  */
4003        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4004        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4005        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4006        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
4007    }
4008};
4009#endif
4010
#ifdef ELF_HOST_MACHINE
/*
 * Pass the code buffer BUF of BUF_SIZE bytes to tcg_register_jit_int
 * together with this backend's debug_frame description.
 */
void tcg_register_jit(void *buf, size_t buf_size)
{
    const DebugFrame *frame = &debug_frame;

    tcg_register_jit_int(buf, buf_size, frame, sizeof(*frame));
}
#endif
4017