xref: /openbmc/qemu/tcg/i386/tcg-target.c.inc (revision 77182df1)
1/*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#include "../tcg-pool.c.inc"
26
27#ifdef CONFIG_DEBUG_TCG
28static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
29#if TCG_TARGET_REG_BITS == 64
30    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
31#else
32    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
33#endif
34    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
35    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
36#if TCG_TARGET_REG_BITS == 64
37    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
38    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
39#endif
40};
41#endif
42
43static const int tcg_target_reg_alloc_order[] = {
44#if TCG_TARGET_REG_BITS == 64
45    TCG_REG_RBP,
46    TCG_REG_RBX,
47    TCG_REG_R12,
48    TCG_REG_R13,
49    TCG_REG_R14,
50    TCG_REG_R15,
51    TCG_REG_R10,
52    TCG_REG_R11,
53    TCG_REG_R9,
54    TCG_REG_R8,
55    TCG_REG_RCX,
56    TCG_REG_RDX,
57    TCG_REG_RSI,
58    TCG_REG_RDI,
59    TCG_REG_RAX,
60#else
61    TCG_REG_EBX,
62    TCG_REG_ESI,
63    TCG_REG_EDI,
64    TCG_REG_EBP,
65    TCG_REG_ECX,
66    TCG_REG_EDX,
67    TCG_REG_EAX,
68#endif
69    TCG_REG_XMM0,
70    TCG_REG_XMM1,
71    TCG_REG_XMM2,
72    TCG_REG_XMM3,
73    TCG_REG_XMM4,
74    TCG_REG_XMM5,
75#ifndef _WIN64
76    /* The Win64 ABI has xmm6-xmm15 as callee-saved, and we do not save
77       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
78    TCG_REG_XMM6,
79    TCG_REG_XMM7,
80#if TCG_TARGET_REG_BITS == 64
81    TCG_REG_XMM8,
82    TCG_REG_XMM9,
83    TCG_REG_XMM10,
84    TCG_REG_XMM11,
85    TCG_REG_XMM12,
86    TCG_REG_XMM13,
87    TCG_REG_XMM14,
88    TCG_REG_XMM15,
89#endif
90#endif
91};
92
93static const int tcg_target_call_iarg_regs[] = {
94#if TCG_TARGET_REG_BITS == 64
95#if defined(_WIN64)
96    TCG_REG_RCX,
97    TCG_REG_RDX,
98#else
99    TCG_REG_RDI,
100    TCG_REG_RSI,
101    TCG_REG_RDX,
102    TCG_REG_RCX,
103#endif
104    TCG_REG_R8,
105    TCG_REG_R9,
106#else
107    /* 32-bit mode uses a stack-based calling convention (GCC default). */
108#endif
109};
110
111static const int tcg_target_call_oarg_regs[] = {
112    TCG_REG_EAX,
113#if TCG_TARGET_REG_BITS == 32
114    TCG_REG_EDX
115#endif
116};
117
118/* Constants we accept.  */
119#define TCG_CT_CONST_S32 0x100
120#define TCG_CT_CONST_U32 0x200
121#define TCG_CT_CONST_I32 0x400
122#define TCG_CT_CONST_WSZ 0x800
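/* These extend the generic TCG_CT_CONST and are checked in
   tcg_target_const_match() below: S32 accepts values equal to their
   32-bit sign-extension, U32 values equal to their 32-bit zero-extension,
   I32 values whose bitwise inverse sign-extends from 32 bits, and WSZ
   exactly the operand width (32 or 64). */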
123
124/* Registers used with the L constraint, which are the first two argument
125   registers on x86_64, and two random call-clobbered registers on
126   i386. */
127#if TCG_TARGET_REG_BITS == 64
128# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
129# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
130#else
131# define TCG_REG_L0 TCG_REG_EAX
132# define TCG_REG_L1 TCG_REG_EDX
133#endif
134
135/* The host compiler should supply <cpuid.h> to enable runtime feature
136   detection, as we're not going to go so far as writing our own inline
137   assembly.  If it is not available, default values will be assumed.  */
138#if defined(CONFIG_CPUID_H)
139#include "qemu/cpuid.h"
140#endif
141
142/* For 64-bit, we always know that CMOV is available.  */
143#if TCG_TARGET_REG_BITS == 64
144# define have_cmov 1
145#elif defined(CONFIG_CPUID_H)
146static bool have_cmov;
147#else
148# define have_cmov 0
149#endif
150
151/* We need these symbols in tcg-target.h, and we can't properly conditionalize
152   them there.  Therefore we always define the variables.  */
153bool have_bmi1;
154bool have_popcnt;
155bool have_avx1;
156bool have_avx2;
157bool have_movbe;
158
159#ifdef CONFIG_CPUID_H
160static bool have_bmi2;
161static bool have_lzcnt;
162#else
163# define have_bmi2 0
164# define have_lzcnt 0
165#endif
166
167static const tcg_insn_unit *tb_ret_addr;
168
169static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
170                        intptr_t value, intptr_t addend)
171{
172    value += addend;
173    switch(type) {
174    case R_386_PC32:
175        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
176        if (value != (int32_t)value) {
177            return false;
178        }
179        /* FALLTHRU */
180    case R_386_32:
181        tcg_patch32(code_ptr, value);
182        break;
183    case R_386_PC8:
184        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
185        if (value != (int8_t)value) {
186            return false;
187        }
188        tcg_patch8(code_ptr, value);
189        break;
190    default:
191        tcg_abort();
192    }
193    return true;
194}
195
196#if TCG_TARGET_REG_BITS == 64
197#define ALL_GENERAL_REGS   0x0000ffffu
198#define ALL_VECTOR_REGS    0xffff0000u
199#else
200#define ALL_GENERAL_REGS   0x000000ffu
201#define ALL_VECTOR_REGS    0x00ff0000u
202#endif
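/* Note that host registers numbered below 16 are the general registers
   and those from 16 up are the xmm registers, which is why the code
   below distinguishes the two classes with tests such as "ret < 16". */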
203
204/* parse target-specific constraints */
205static const char *target_parse_constraint(TCGArgConstraint *ct,
206                                           const char *ct_str, TCGType type)
207{
208    switch(*ct_str++) {
209    case 'a':
210        tcg_regset_set_reg(ct->regs, TCG_REG_EAX);
211        break;
212    case 'b':
213        tcg_regset_set_reg(ct->regs, TCG_REG_EBX);
214        break;
215    case 'c':
216        tcg_regset_set_reg(ct->regs, TCG_REG_ECX);
217        break;
218    case 'd':
219        tcg_regset_set_reg(ct->regs, TCG_REG_EDX);
220        break;
221    case 'S':
222        tcg_regset_set_reg(ct->regs, TCG_REG_ESI);
223        break;
224    case 'D':
225        tcg_regset_set_reg(ct->regs, TCG_REG_EDI);
226        break;
227    case 'q':
228        /* A register that can be used as a byte operand.  */
229        ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
230        break;
231    case 'Q':
232        /* A register with an addressable second byte (e.g. %ah).  */
233        ct->regs = 0xf;
234        break;
235    case 'r':
236        /* A general register.  */
237        ct->regs |= ALL_GENERAL_REGS;
238        break;
239    case 'W':
240        /* With TZCNT/LZCNT, we can have operand-size as an input.  */
241        ct->ct |= TCG_CT_CONST_WSZ;
242        break;
243    case 'x':
244        /* A vector register.  */
245        ct->regs |= ALL_VECTOR_REGS;
246        break;
247
248    case 'L':
249        /* qemu_ld/st data+address constraint */
250        ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
251#ifdef CONFIG_SOFTMMU
252        tcg_regset_reset_reg(ct->regs, TCG_REG_L0);
253        tcg_regset_reset_reg(ct->regs, TCG_REG_L1);
254#endif
255        break;
256    case 's':
257        /* qemu_st8_i32 data constraint */
258        ct->regs = 0xf;
259#ifdef CONFIG_SOFTMMU
260        tcg_regset_reset_reg(ct->regs, TCG_REG_L0);
261        tcg_regset_reset_reg(ct->regs, TCG_REG_L1);
262#endif
263        break;
264
265    case 'e':
266        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
267        break;
268    case 'Z':
269        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
270        break;
271    case 'I':
272        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
273        break;
274
275    default:
276        return NULL;
277    }
278    return ct_str;
279}
280
281/* test if a constant matches the constraint */
282static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
283                                         const TCGArgConstraint *arg_ct)
284{
285    int ct = arg_ct->ct;
286    if (ct & TCG_CT_CONST) {
287        return 1;
288    }
289    if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
290        return 1;
291    }
292    if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
293        return 1;
294    }
295    if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
296        return 1;
297    }
298    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
299        return 1;
300    }
301    return 0;
302}
303
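/* Only the low three bits of a register number are encoded in the
   ModRM/SIB fields; the fourth bit of an extended register (r8-r15,
   xmm8-xmm15) is carried in the REX or VEX prefix, as computed in
   tcg_out_opc() and tcg_out_vex_opc() below. */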
304# define LOWREGMASK(x)	((x) & 7)
305
306#define P_EXT		0x100		/* 0x0f opcode prefix */
307#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
308#define P_DATA16        0x400           /* 0x66 opcode prefix */
309#if TCG_TARGET_REG_BITS == 64
310# define P_REXW         0x1000          /* Set REX.W = 1 */
311# define P_REXB_R       0x2000          /* REG field as byte register */
312# define P_REXB_RM      0x4000          /* R/M field as byte register */
313# define P_GS           0x8000          /* gs segment override */
314#else
315# define P_REXW		0
316# define P_REXB_R	0
317# define P_REXB_RM	0
318# define P_GS           0
319#endif
320#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
321#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
322#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
323#define P_VEXL          0x80000         /* Set VEX.L = 1 */
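/* The P_* values above are flag bits OR'ed into the OPC_* opcode values
   below.  tcg_out_opc() decodes them into the corresponding legacy
   prefixes (0x66, 0xf3, 0xf2), the REX byte and the 0x0f / 0x0f 0x38 /
   0x0f 0x3a escape sequences, while tcg_out_vex_opc() folds the same
   information into the VEX prefix fields. */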
324
325#define OPC_ARITH_EvIz	(0x81)
326#define OPC_ARITH_EvIb	(0x83)
327#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
328#define OPC_ANDN        (0xf2 | P_EXT38)
329#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
330#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
331#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
332#define OPC_BSF         (0xbc | P_EXT)
333#define OPC_BSR         (0xbd | P_EXT)
334#define OPC_BSWAP	(0xc8 | P_EXT)
335#define OPC_CALL_Jz	(0xe8)
336#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
337#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
338#define OPC_DEC_r32	(0x48)
339#define OPC_IMUL_GvEv	(0xaf | P_EXT)
340#define OPC_IMUL_GvEvIb	(0x6b)
341#define OPC_IMUL_GvEvIz	(0x69)
342#define OPC_INC_r32	(0x40)
343#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
344#define OPC_JCC_short	(0x70)		/* ... plus condition code */
345#define OPC_JMP_long	(0xe9)
346#define OPC_JMP_short	(0xeb)
347#define OPC_LEA         (0x8d)
348#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
349#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
350#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
351#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
352#define OPC_MOVB_EvIz   (0xc6)
353#define OPC_MOVL_EvIz	(0xc7)
354#define OPC_MOVL_Iv     (0xb8)
355#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
356#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
357#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
358#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
359#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
360#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
361#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
362#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
363#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
364#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
365#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
366#define OPC_MOVSBL	(0xbe | P_EXT)
367#define OPC_MOVSWL	(0xbf | P_EXT)
368#define OPC_MOVSLQ	(0x63 | P_REXW)
369#define OPC_MOVZBL	(0xb6 | P_EXT)
370#define OPC_MOVZWL	(0xb7 | P_EXT)
371#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
372#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
373#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
374#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
375#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
376#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
377#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
378#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
379#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
380#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
381#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
382#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
383#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
384#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
385#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
386#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
387#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
388#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
389#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
390#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
391#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
392#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
393#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
394#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
395#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
396#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
397#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
398#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
399#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
400#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
401#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
402#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
403#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
404#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
405#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
406#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
407#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
408#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
409#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
410#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
411#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
412#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
413#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
414#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
415#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
416#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
417#define OPC_POR         (0xeb | P_EXT | P_DATA16)
418#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
419#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
420#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
421#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
422#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
423#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
424#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
425#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
426#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
427#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
428#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
429#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
430#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
431#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
432#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
433#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
434#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
435#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
436#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
437#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
438#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
439#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
440#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
441#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
442#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
443#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
444#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
445#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
446#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
447#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
448#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
449#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
450#define OPC_POP_r32	(0x58)
451#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
452#define OPC_PUSH_r32	(0x50)
453#define OPC_PUSH_Iv	(0x68)
454#define OPC_PUSH_Ib	(0x6a)
455#define OPC_RET		(0xc3)
456#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
457#define OPC_SHIFT_1	(0xd1)
458#define OPC_SHIFT_Ib	(0xc1)
459#define OPC_SHIFT_cl	(0xd3)
460#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
461#define OPC_SHUFPS      (0xc6 | P_EXT)
462#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
463#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
464#define OPC_SHRD_Ib     (0xac | P_EXT)
465#define OPC_TESTL	(0x85)
466#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
467#define OPC_UD2         (0x0b | P_EXT)
468#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
469#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
470#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
471#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
472#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
473#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
474#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
475#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
476#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
477#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
478#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_REXW)
479#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
480#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
481#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_REXW)
482#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
483#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
484#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_REXW)
485#define OPC_VZEROUPPER  (0x77 | P_EXT)
486#define OPC_XCHG_ax_r32	(0x90)
487
488#define OPC_GRP3_Ev	(0xf7)
489#define OPC_GRP5	(0xff)
490#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
491
492/* Group 1 opcode extensions for 0x80-0x83.
493   These are also used as modifiers for OPC_ARITH.  */
494#define ARITH_ADD 0
495#define ARITH_OR  1
496#define ARITH_ADC 2
497#define ARITH_SBB 3
498#define ARITH_AND 4
499#define ARITH_SUB 5
500#define ARITH_XOR 6
501#define ARITH_CMP 7
502
503/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
504#define SHIFT_ROL 0
505#define SHIFT_ROR 1
506#define SHIFT_SHL 4
507#define SHIFT_SHR 5
508#define SHIFT_SAR 7
509
510/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
511#define EXT3_NOT   2
512#define EXT3_NEG   3
513#define EXT3_MUL   4
514#define EXT3_IMUL  5
515#define EXT3_DIV   6
516#define EXT3_IDIV  7
517
518/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
519#define EXT5_INC_Ev	0
520#define EXT5_DEC_Ev	1
521#define EXT5_CALLN_Ev	2
522#define EXT5_JMPN_Ev	4
523
524/* Condition codes to be added to OPC_JCC_{long,short}.  */
525#define JCC_JMP (-1)
526#define JCC_JO  0x0
527#define JCC_JNO 0x1
528#define JCC_JB  0x2
529#define JCC_JAE 0x3
530#define JCC_JE  0x4
531#define JCC_JNE 0x5
532#define JCC_JBE 0x6
533#define JCC_JA  0x7
534#define JCC_JS  0x8
535#define JCC_JNS 0x9
536#define JCC_JP  0xa
537#define JCC_JNP 0xb
538#define JCC_JL  0xc
539#define JCC_JGE 0xd
540#define JCC_JLE 0xe
541#define JCC_JG  0xf
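/* For example, "jne" is OPC_JCC_short + JCC_JNE = 0x75 in the short
   form, and 0x0f 0x85 (OPC_JCC_long + JCC_JNE) in the long form. */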
542
543static const uint8_t tcg_cond_to_jcc[] = {
544    [TCG_COND_EQ] = JCC_JE,
545    [TCG_COND_NE] = JCC_JNE,
546    [TCG_COND_LT] = JCC_JL,
547    [TCG_COND_GE] = JCC_JGE,
548    [TCG_COND_LE] = JCC_JLE,
549    [TCG_COND_GT] = JCC_JG,
550    [TCG_COND_LTU] = JCC_JB,
551    [TCG_COND_GEU] = JCC_JAE,
552    [TCG_COND_LEU] = JCC_JBE,
553    [TCG_COND_GTU] = JCC_JA,
554};
555
556#if TCG_TARGET_REG_BITS == 64
557static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
558{
559    int rex;
560
561    if (opc & P_GS) {
562        tcg_out8(s, 0x65);
563    }
564    if (opc & P_DATA16) {
565        /* We should never be asking for both 16 and 64-bit operation.  */
566        tcg_debug_assert((opc & P_REXW) == 0);
567        tcg_out8(s, 0x66);
568    }
569    if (opc & P_SIMDF3) {
570        tcg_out8(s, 0xf3);
571    } else if (opc & P_SIMDF2) {
572        tcg_out8(s, 0xf2);
573    }
574
575    rex = 0;
576    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
577    rex |= (r & 8) >> 1;                /* REX.R */
578    rex |= (x & 8) >> 2;                /* REX.X */
579    rex |= (rm & 8) >> 3;               /* REX.B */
580
581    /* P_REXB_{R,RM} indicates that the given register is the low byte.
582       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
583       as otherwise the encoding indicates %[abcd]h.  Note that the values
584       that are ORed in merely indicate that the REX byte must be present;
585       those bits get discarded in output.  */
586    rex |= opc & (r >= 4 ? P_REXB_R : 0);
587    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
588
589    if (rex) {
590        tcg_out8(s, (uint8_t)(rex | 0x40));
591    }
592
593    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
594        tcg_out8(s, 0x0f);
595        if (opc & P_EXT38) {
596            tcg_out8(s, 0x38);
597        } else if (opc & P_EXT3A) {
598            tcg_out8(s, 0x3a);
599        }
600    }
601
602    tcg_out8(s, opc);
603}
604#else
605static void tcg_out_opc(TCGContext *s, int opc)
606{
607    if (opc & P_DATA16) {
608        tcg_out8(s, 0x66);
609    }
610    if (opc & P_SIMDF3) {
611        tcg_out8(s, 0xf3);
612    } else if (opc & P_SIMDF2) {
613        tcg_out8(s, 0xf2);
614    }
615    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
616        tcg_out8(s, 0x0f);
617        if (opc & P_EXT38) {
618            tcg_out8(s, 0x38);
619        } else if (opc & P_EXT3A) {
620            tcg_out8(s, 0x3a);
621        }
622    }
623    tcg_out8(s, opc);
624}
625/* Discard the register arguments to tcg_out_opc early, so as not to penalize
626   the 32-bit compilation paths.  This method works with all versions of gcc,
627   whereas relying on the optimizer to remove them may not always succeed.  */
628#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
629#endif
630
631static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
632{
633    tcg_out_opc(s, opc, r, rm, 0);
634    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
635}
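/* A worked example: on x86_64, tcg_out_modrm(s, OPC_MOVL_GvEv + P_REXW,
   TCG_REG_R8, TCG_REG_RAX) emits 0x4c 0x8b 0xc0, i.e. "movq %rax, %r8":
   a REX byte with W and R set, the 0x8b opcode, and a ModRM byte with
   mod = 3 selecting register-direct operands. */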
636
637static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
638                            int rm, int index)
639{
640    int tmp;
641
642    /* Use the two byte form if possible, which cannot encode
643       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
644    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
645        && ((rm | index) & 8) == 0) {
646        /* Two byte VEX prefix.  */
647        tcg_out8(s, 0xc5);
648
649        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
650    } else {
651        /* Three byte VEX prefix.  */
652        tcg_out8(s, 0xc4);
653
654        /* VEX.m-mmmm */
655        if (opc & P_EXT3A) {
656            tmp = 3;
657        } else if (opc & P_EXT38) {
658            tmp = 2;
659        } else if (opc & P_EXT) {
660            tmp = 1;
661        } else {
662            g_assert_not_reached();
663        }
664        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
665        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
666        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
667        tcg_out8(s, tmp);
668
669        tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
670    }
671
672    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
673    /* VEX.pp */
674    if (opc & P_DATA16) {
675        tmp |= 1;                          /* 0x66 */
676    } else if (opc & P_SIMDF3) {
677        tmp |= 2;                          /* 0xf3 */
678    } else if (opc & P_SIMDF2) {
679        tmp |= 3;                          /* 0xf2 */
680    }
681    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
682    tcg_out8(s, tmp);
683    tcg_out8(s, opc);
684}
685
686static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
687{
688    tcg_out_vex_opc(s, opc, r, v, rm, 0);
689    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
690}
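/* Similarly, tcg_out_vex_modrm(s, OPC_PXOR, 1, 2, 3) emits the two-byte
   VEX form 0xc5 0xe9 0xef 0xcb, i.e. "vpxor %xmm3, %xmm2, %xmm1":
   pp = 01 standing in for the 0x66 prefix, vvvv holding the inverted
   second-source register, the 0xef opcode and a register-direct ModRM. */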
691
692/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
693   We handle either RM or INDEX being absent, indicated by a negative value.
694   In 64-bit mode for absolute addresses, ~RM is the size of the immediate operand
695   that will follow the instruction.  */
696
697static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
698                               int shift, intptr_t offset)
699{
700    int mod, len;
701
702    if (index < 0 && rm < 0) {
703        if (TCG_TARGET_REG_BITS == 64) {
704            /* Try for a rip-relative addressing mode.  This has replaced
705               the 32-bit-mode absolute addressing encoding.  */
706            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
707            intptr_t disp = offset - pc;
708            if (disp == (int32_t)disp) {
709                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
710                tcg_out32(s, disp);
711                return;
712            }
713
714            /* Try for an absolute address encoding.  This requires the
715               use of the MODRM+SIB encoding and is therefore larger than
716               rip-relative addressing.  */
717            if (offset == (int32_t)offset) {
718                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
719                tcg_out8(s, (4 << 3) | 5);
720                tcg_out32(s, offset);
721                return;
722            }
723
724            /* ??? The memory isn't directly addressable.  */
725            g_assert_not_reached();
726        } else {
727            /* Absolute address.  */
728            tcg_out8(s, (r << 3) | 5);
729            tcg_out32(s, offset);
730            return;
731        }
732    }
733
734    /* Find the length of the immediate addend.  Note that the encoding
735       that would be used for (%ebp) indicates absolute addressing.  */
736    if (rm < 0) {
737        mod = 0, len = 4, rm = 5;
738    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
739        mod = 0, len = 0;
740    } else if (offset == (int8_t)offset) {
741        mod = 0x40, len = 1;
742    } else {
743        mod = 0x80, len = 4;
744    }
745
746    /* Use a single byte MODRM format if possible.  Note that the encoding
747       that would be used for %esp is the escape to the two byte form.  */
748    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
749        /* Single byte MODRM format.  */
750        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
751    } else {
752        /* Two byte MODRM+SIB format.  */
753
754        /* Note that the encoding that would place %esp into the index
755           field indicates no index register.  In 64-bit mode, the REX.X
756           bit counts, so %r12 can be used as the index.  */
757        if (index < 0) {
758            index = 4;
759        } else {
760            tcg_debug_assert(index != TCG_REG_ESP);
761        }
762
763        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
764        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
765    }
766
767    if (len == 1) {
768        tcg_out8(s, offset);
769    } else if (len == 4) {
770        tcg_out32(s, offset);
771    }
772}
773
774static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
775                                     int index, int shift, intptr_t offset)
776{
777    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
778    tcg_out_sib_offset(s, r, rm, index, shift, offset);
779}
780
781static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
782                                         int rm, int index, int shift,
783                                         intptr_t offset)
784{
785    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
786    tcg_out_sib_offset(s, r, rm, index, shift, offset);
787}
788
789/* A simplification of the above with no index or shift.  */
790static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
791                                        int rm, intptr_t offset)
792{
793    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
794}
795
796static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
797                                            int v, int rm, intptr_t offset)
798{
799    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
800}
801
802/* Output an opcode with an expected reference to the constant pool.  */
803static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
804{
805    tcg_out_opc(s, opc, r, 0, 0);
806    /* Absolute for 32-bit, pc-relative for 64-bit.  */
807    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
808    tcg_out32(s, 0);
809}
810
811/* Output an opcode with an expected reference to the constant pool.  */
812static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
813{
814    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
815    /* Absolute for 32-bit, pc-relative for 64-bit.  */
816    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
817    tcg_out32(s, 0);
818}
819
820/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
821static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
822{
823    /* Propagate an opcode prefix, such as P_REXW.  */
824    int ext = subop & ~0x7;
825    subop &= 0x7;
826
827    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
828}
829
830static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
831{
832    int rexw = 0;
833
834    if (arg == ret) {
835        return true;
836    }
837    switch (type) {
838    case TCG_TYPE_I64:
839        rexw = P_REXW;
840        /* fallthru */
841    case TCG_TYPE_I32:
842        if (ret < 16) {
843            if (arg < 16) {
844                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
845            } else {
846                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
847            }
848        } else {
849            if (arg < 16) {
850                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
851            } else {
852                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
853            }
854        }
855        break;
856
857    case TCG_TYPE_V64:
858        tcg_debug_assert(ret >= 16 && arg >= 16);
859        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
860        break;
861    case TCG_TYPE_V128:
862        tcg_debug_assert(ret >= 16 && arg >= 16);
863        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
864        break;
865    case TCG_TYPE_V256:
866        tcg_debug_assert(ret >= 16 && arg >= 16);
867        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
868        break;
869
870    default:
871        g_assert_not_reached();
872    }
873    return true;
874}
875
876static const int avx2_dup_insn[4] = {
877    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
878    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
879};
880
881static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
882                            TCGReg r, TCGReg a)
883{
884    if (have_avx2) {
885        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
886        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
887    } else {
888        switch (vece) {
889        case MO_8:
890            /* ??? With zero in a register, use PSHUFB.  */
891            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
892            a = r;
893            /* FALLTHRU */
894        case MO_16:
895            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
896            a = r;
897            /* FALLTHRU */
898        case MO_32:
899            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
900            /* imm8 operand: all output lanes selected from input lane 0.  */
901            tcg_out8(s, 0);
902            break;
903        case MO_64:
904            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
905            break;
906        default:
907            g_assert_not_reached();
908        }
909    }
910    return true;
911}
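/* In the non-AVX2 path above the broadcast is built up stepwise: for
   MO_8, punpcklbw duplicates the byte into 16-bit lanes, punpcklwd
   widens that to 32-bit lanes, and pshufd with an imm8 of 0 replicates
   lane 0 across the vector; MO_16 and MO_32 enter the same chain at the
   appropriate later step, while MO_64 uses punpcklqdq directly. */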
912
913static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
914                             TCGReg r, TCGReg base, intptr_t offset)
915{
916    if (have_avx2) {
917        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
918        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
919                                 r, 0, base, offset);
920    } else {
921        switch (vece) {
922        case MO_64:
923            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
924            break;
925        case MO_32:
926            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
927            break;
928        case MO_16:
929            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
930            tcg_out8(s, 0); /* imm8 */
931            tcg_out_dup_vec(s, type, vece, r, r);
932            break;
933        case MO_8:
934            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
935            tcg_out8(s, 0); /* imm8 */
936            tcg_out_dup_vec(s, type, vece, r, r);
937            break;
938        default:
939            g_assert_not_reached();
940        }
941    }
942    return true;
943}
944
945static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
946                             TCGReg ret, int64_t arg)
947{
948    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
949
950    if (arg == 0) {
951        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
952        return;
953    }
954    if (arg == -1) {
955        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
956        return;
957    }
958
959    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
960        if (have_avx2) {
961            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
962        } else {
963            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
964        }
965        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
966    } else {
967        if (type == TCG_TYPE_V64) {
968            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
969        } else if (have_avx2) {
970            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
971        } else {
972            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
973        }
974        if (TCG_TARGET_REG_BITS == 64) {
975            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
976        } else {
977            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
978        }
979    }
980}
981
982static void tcg_out_movi_vec(TCGContext *s, TCGType type,
983                             TCGReg ret, tcg_target_long arg)
984{
985    if (arg == 0) {
986        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
987        return;
988    }
989    if (arg == -1) {
990        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
991        return;
992    }
993
994    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
995    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
996    if (TCG_TARGET_REG_BITS == 64) {
997        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
998    } else {
999        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1000    }
1001}
1002
1003static void tcg_out_movi_int(TCGContext *s, TCGType type,
1004                             TCGReg ret, tcg_target_long arg)
1005{
1006    tcg_target_long diff;
1007
1008    if (arg == 0) {
1009        tgen_arithr(s, ARITH_XOR, ret, ret);
1010        return;
1011    }
1012    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1013        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1014        tcg_out32(s, arg);
1015        return;
1016    }
1017    if (arg == (int32_t)arg) {
1018        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1019        tcg_out32(s, arg);
1020        return;
1021    }
1022
1023    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1024    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1025    if (diff == (int32_t)diff) {
1026        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1027        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1028        tcg_out32(s, diff);
1029        return;
1030    }
1031
1032    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1033    tcg_out64(s, arg);
1034}
1035
1036static void tcg_out_movi(TCGContext *s, TCGType type,
1037                         TCGReg ret, tcg_target_long arg)
1038{
1039    switch (type) {
1040    case TCG_TYPE_I32:
1041#if TCG_TARGET_REG_BITS == 64
1042    case TCG_TYPE_I64:
1043#endif
1044        if (ret < 16) {
1045            tcg_out_movi_int(s, type, ret, arg);
1046        } else {
1047            tcg_out_movi_vec(s, type, ret, arg);
1048        }
1049        break;
1050    default:
1051        g_assert_not_reached();
1052    }
1053}
1054
1055static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1056{
1057    if (val == (int8_t)val) {
1058        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1059        tcg_out8(s, val);
1060    } else if (val == (int32_t)val) {
1061        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1062        tcg_out32(s, val);
1063    } else {
1064        tcg_abort();
1065    }
1066}
1067
1068static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1069{
1070    /* Given the strength of x86 memory ordering, we only need to care about
1071       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1072       faster than "mfence", so don't bother with the SSE insn.  */
1073    if (a0 & TCG_MO_ST_LD) {
1074        tcg_out8(s, 0xf0);
1075        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1076        tcg_out8(s, 0);
1077    }
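    /* (The sequence emitted above is 0xf0 0x83 0x0c 0x24 0x00, i.e.
       "lock orl $0, (%esp)"; any locked read-modify-write acts as a
       full barrier on x86.) */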
1078}
1079
1080static inline void tcg_out_push(TCGContext *s, int reg)
1081{
1082    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1083}
1084
1085static inline void tcg_out_pop(TCGContext *s, int reg)
1086{
1087    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1088}
1089
1090static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1091                       TCGReg arg1, intptr_t arg2)
1092{
1093    switch (type) {
1094    case TCG_TYPE_I32:
1095        if (ret < 16) {
1096            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1097        } else {
1098            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1099        }
1100        break;
1101    case TCG_TYPE_I64:
1102        if (ret < 16) {
1103            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1104            break;
1105        }
1106        /* FALLTHRU */
1107    case TCG_TYPE_V64:
1108        /* There is no instruction that can validate 8-byte alignment.  */
1109        tcg_debug_assert(ret >= 16);
1110        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1111        break;
1112    case TCG_TYPE_V128:
1113        /*
1114         * The gvec infrastructure asserts that v128 vector loads
1115         * and stores use a 16-byte aligned offset.  Validate that the
1116         * final pointer is aligned by using an insn that will SIGSEGV.
1117         */
1118        tcg_debug_assert(ret >= 16);
1119        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1120        break;
1121    case TCG_TYPE_V256:
1122        /*
1123         * The gvec infrastructure only requires 16-byte alignment,
1124         * so here we must use an unaligned load.
1125         */
1126        tcg_debug_assert(ret >= 16);
1127        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1128                                 ret, 0, arg1, arg2);
1129        break;
1130    default:
1131        g_assert_not_reached();
1132    }
1133}
1134
1135static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1136                       TCGReg arg1, intptr_t arg2)
1137{
1138    switch (type) {
1139    case TCG_TYPE_I32:
1140        if (arg < 16) {
1141            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1142        } else {
1143            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1144        }
1145        break;
1146    case TCG_TYPE_I64:
1147        if (arg < 16) {
1148            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1149            break;
1150        }
1151        /* FALLTHRU */
1152    case TCG_TYPE_V64:
1153        /* There is no instruction that can validate 8-byte alignment.  */
1154        tcg_debug_assert(arg >= 16);
1155        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1156        break;
1157    case TCG_TYPE_V128:
1158        /*
1159         * The gvec infrastructure asserts that v128 vector loads
1160         * and stores use a 16-byte aligned offset.  Validate that the
1161         * final pointer is aligned by using an insn that will SIGSEGV.
1162         */
1163        tcg_debug_assert(arg >= 16);
1164        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1165        break;
1166    case TCG_TYPE_V256:
1167        /*
1168         * The gvec infrastructure only requires 16-byte alignment,
1169         * so here we must use an unaligned store.
1170         */
1171        tcg_debug_assert(arg >= 16);
1172        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1173                                 arg, 0, arg1, arg2);
1174        break;
1175    default:
1176        g_assert_not_reached();
1177    }
1178}
1179
1180static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1181                        TCGReg base, intptr_t ofs)
1182{
1183    int rexw = 0;
1184    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1185        if (val != (int32_t)val) {
1186            return false;
1187        }
1188        rexw = P_REXW;
1189    } else if (type != TCG_TYPE_I32) {
1190        return false;
1191    }
1192    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1193    tcg_out32(s, val);
1194    return true;
1195}
1196
1197static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1198{
1199    /* Propagate an opcode prefix, such as P_DATA16.  */
1200    int ext = subopc & ~0x7;
1201    subopc &= 0x7;
1202
1203    if (count == 1) {
1204        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1205    } else {
1206        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1207        tcg_out8(s, count);
1208    }
1209}
1210
1211static inline void tcg_out_bswap32(TCGContext *s, int reg)
1212{
1213    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1214}
1215
1216static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1217{
1218    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1219}
1220
1221static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1222{
1223    /* movzbl */
1224    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1225    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1226}
1227
1228static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1229{
1230    /* movsbl */
1231    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1232    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1233}
1234
1235static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1236{
1237    /* movzwl */
1238    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1239}
1240
1241static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1242{
1243    /* movsw[lq] */
1244    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1245}
1246
1247static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1248{
1249    /* 32-bit mov zero extends.  */
1250    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1251}
1252
1253static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1254{
1255    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1256}
1257
1258static inline void tcg_out_bswap64(TCGContext *s, int reg)
1259{
1260    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1261}
1262
1263static void tgen_arithi(TCGContext *s, int c, int r0,
1264                        tcg_target_long val, int cf)
1265{
1266    int rexw = 0;
1267
1268    if (TCG_TARGET_REG_BITS == 64) {
1269        rexw = c & -8;
1270        c &= 7;
1271    }
1272
1273    /* ??? While INC/DEC are 2 bytes shorter than ADDL $1, they also induce
1274       partial-flags-update stalls on Pentium 4 and are not recommended
1275       by current Intel optimization manuals.  */
1276    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1277        int is_inc = (c == ARITH_ADD) ^ (val < 0);
1278        if (TCG_TARGET_REG_BITS == 64) {
1279            /* The single-byte increment encodings are re-tasked as the
1280               REX prefixes.  Use the MODRM encoding.  */
1281            tcg_out_modrm(s, OPC_GRP5 + rexw,
1282                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1283        } else {
1284            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1285        }
1286        return;
1287    }
1288
1289    if (c == ARITH_AND) {
1290        if (TCG_TARGET_REG_BITS == 64) {
1291            if (val == 0xffffffffu) {
1292                tcg_out_ext32u(s, r0, r0);
1293                return;
1294            }
1295            if (val == (uint32_t)val) {
1296                /* AND with no high bits set can use a 32-bit operation.  */
1297                rexw = 0;
1298            }
1299        }
1300        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1301            tcg_out_ext8u(s, r0, r0);
1302            return;
1303        }
1304        if (val == 0xffffu) {
1305            tcg_out_ext16u(s, r0, r0);
1306            return;
1307        }
1308    }
1309
1310    if (val == (int8_t)val) {
1311        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1312        tcg_out8(s, val);
1313        return;
1314    }
1315    if (rexw == 0 || val == (int32_t)val) {
1316        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1317        tcg_out32(s, val);
1318        return;
1319    }
1320
1321    tcg_abort();
1322}
1323
1324static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1325{
1326    if (val != 0) {
1327        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1328    }
1329}
1330
1331/* Use SMALL != 0 to force a short forward branch.  */
1332static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1333{
1334    int32_t val, val1;
1335
1336    if (l->has_value) {
1337        val = tcg_pcrel_diff(s, l->u.value_ptr);
1338        val1 = val - 2;
1339        if ((int8_t)val1 == val1) {
1340            if (opc == -1) {
1341                tcg_out8(s, OPC_JMP_short);
1342            } else {
1343                tcg_out8(s, OPC_JCC_short + opc);
1344            }
1345            tcg_out8(s, val1);
1346        } else {
1347            if (small) {
1348                tcg_abort();
1349            }
1350            if (opc == -1) {
1351                tcg_out8(s, OPC_JMP_long);
1352                tcg_out32(s, val - 5);
1353            } else {
1354                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1355                tcg_out32(s, val - 6);
1356            }
1357        }
1358    } else if (small) {
1359        if (opc == -1) {
1360            tcg_out8(s, OPC_JMP_short);
1361        } else {
1362            tcg_out8(s, OPC_JCC_short + opc);
1363        }
1364        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1365        s->code_ptr += 1;
1366    } else {
1367        if (opc == -1) {
1368            tcg_out8(s, OPC_JMP_long);
1369        } else {
1370            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1371        }
1372        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1373        s->code_ptr += 4;
1374    }
1375}
1376
1377static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1378                        int const_arg2, int rexw)
1379{
1380    if (const_arg2) {
1381        if (arg2 == 0) {
1382            /* test r, r */
1383            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1384        } else {
1385            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1386        }
1387    } else {
1388        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1389    }
1390}
1391
1392static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1393                             TCGArg arg1, TCGArg arg2, int const_arg2,
1394                             TCGLabel *label, int small)
1395{
1396    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1397    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1398}
1399
1400#if TCG_TARGET_REG_BITS == 64
1401static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1402                             TCGArg arg1, TCGArg arg2, int const_arg2,
1403                             TCGLabel *label, int small)
1404{
1405    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1406    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1407}
1408#else
1409/* XXX: we implement it at the target level to avoid having to
1410   handle temporaries that cross basic blocks */
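/* For the ordered comparisons below, the scheme is: branch to the
   target if the signed/unsigned comparison of the high halves already
   decides the result, skip ahead if the high halves differ the other
   way, and otherwise compare the low halves unsigned. */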
1411static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1412                            const int *const_args, int small)
1413{
1414    TCGLabel *label_next = gen_new_label();
1415    TCGLabel *label_this = arg_label(args[5]);
1416
1417    switch(args[4]) {
1418    case TCG_COND_EQ:
1419        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1420                         label_next, 1);
1421        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1422                         label_this, small);
1423        break;
1424    case TCG_COND_NE:
1425        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1426                         label_this, small);
1427        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1428                         label_this, small);
1429        break;
1430    case TCG_COND_LT:
1431        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1432                         label_this, small);
1433        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1434        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1435                         label_this, small);
1436        break;
1437    case TCG_COND_LE:
1438        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1439                         label_this, small);
1440        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1441        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1442                         label_this, small);
1443        break;
1444    case TCG_COND_GT:
1445        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1446                         label_this, small);
1447        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1448        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1449                         label_this, small);
1450        break;
1451    case TCG_COND_GE:
1452        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1453                         label_this, small);
1454        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1455        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1456                         label_this, small);
1457        break;
1458    case TCG_COND_LTU:
1459        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1460                         label_this, small);
1461        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1462        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1463                         label_this, small);
1464        break;
1465    case TCG_COND_LEU:
1466        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1467                         label_this, small);
1468        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1469        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1470                         label_this, small);
1471        break;
1472    case TCG_COND_GTU:
1473        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1474                         label_this, small);
1475        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1476        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1477                         label_this, small);
1478        break;
1479    case TCG_COND_GEU:
1480        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1481                         label_this, small);
1482        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1483        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1484                         label_this, small);
1485        break;
1486    default:
1487        tcg_abort();
1488    }
1489    tcg_out_label(s, label_next);
1490}
1491#endif
1492
1493static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1494                              TCGArg arg1, TCGArg arg2, int const_arg2)
1495{
1496    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1497    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1498    tcg_out_ext8u(s, dest, dest);
1499}
1500
1501#if TCG_TARGET_REG_BITS == 64
1502static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1503                              TCGArg arg1, TCGArg arg2, int const_arg2)
1504{
1505    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1506    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1507    tcg_out_ext8u(s, dest, dest);
1508}
1509#else
1510static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1511                             const int *const_args)
1512{
1513    TCGArg new_args[6];
1514    TCGLabel *label_true, *label_over;
1515
1516    memcpy(new_args, args+1, 5*sizeof(TCGArg));
1517
1518    if (args[0] == args[1] || args[0] == args[2]
1519        || (!const_args[3] && args[0] == args[3])
1520        || (!const_args[4] && args[0] == args[4])) {
1521        /* When the destination overlaps with one of the argument
1522           registers, don't do anything tricky.  */
1523        label_true = gen_new_label();
1524        label_over = gen_new_label();
1525
1526        new_args[5] = label_arg(label_true);
1527        tcg_out_brcond2(s, new_args, const_args+1, 1);
1528
1529        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1530        tcg_out_jxx(s, JCC_JMP, label_over, 1);
1531        tcg_out_label(s, label_true);
1532
1533        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1534        tcg_out_label(s, label_over);
1535    } else {
1536        /* When the destination does not overlap one of the arguments,
1537           clear the destination first, jump if cond false, and emit an
1538           increment in the true case.  This results in smaller code.  */
1539
1540        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1541
1542        label_over = gen_new_label();
1543        new_args[4] = tcg_invert_cond(new_args[4]);
1544        new_args[5] = label_arg(label_over);
1545        tcg_out_brcond2(s, new_args, const_args+1, 1);
1546
1547        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1548        tcg_out_label(s, label_over);
1549    }
1550}
1551#endif
1552
1553static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1554                         TCGReg dest, TCGReg v1)
1555{
1556    if (have_cmov) {
1557        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1558    } else {
1559        TCGLabel *over = gen_new_label();
1560        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1561        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1562        tcg_out_label(s, over);
1563    }
1564}
1565
1566static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1567                              TCGReg c1, TCGArg c2, int const_c2,
1568                              TCGReg v1)
1569{
1570    tcg_out_cmp(s, c1, c2, const_c2, 0);
1571    tcg_out_cmov(s, cond, 0, dest, v1);
1572}
1573
1574#if TCG_TARGET_REG_BITS == 64
1575static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1576                              TCGReg c1, TCGArg c2, int const_c2,
1577                              TCGReg v1)
1578{
1579    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1580    tcg_out_cmov(s, cond, P_REXW, dest, v1);
1581}
1582#endif
1583
1584static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1585                        TCGArg arg2, bool const_a2)
1586{
1587    if (have_bmi1) {
1588        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1589        if (const_a2) {
1590            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1591        } else {
1592            tcg_debug_assert(dest != arg2);
1593            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1594        }
1595    } else {
1596        tcg_debug_assert(dest != arg2);
1597        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1598        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1599    }
1600}
1601
1602static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1603                        TCGArg arg2, bool const_a2)
1604{
1605    if (have_lzcnt) {
1606        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1607        if (const_a2) {
1608            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1609        } else {
1610            tcg_debug_assert(dest != arg2);
1611            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1612        }
1613    } else {
1614        tcg_debug_assert(!const_a2);
1615        tcg_debug_assert(dest != arg1);
1616        tcg_debug_assert(dest != arg2);
1617
1618        /* Recall that the output of BSR is the index not the count.  */
1619        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1620        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
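        /* For a bit index i in [0, 31] (or [0, 63]), i ^ 31 == 31 - i,
           so the XOR above converts the index of the highest set bit
           into the leading-zero count.  */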
1621
1622        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1623        tcg_out_cmp(s, arg1, 0, 1, rexw);
1624        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1625    }
1626}
1627
1628static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1629{
1630    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1631
1632    if (disp == (int32_t)disp) {
1633        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1634        tcg_out32(s, disp);
1635    } else {
1636        /* rip-relative addressing into the constant pool.
1637           This is 6 + 8 = 14 bytes, as compared to using an
1638           immediate load 10 + 6 = 16 bytes, plus we may
1639           be able to re-use the pool constant for more calls.  */
1640        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1641        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
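        /* A ModRM byte with mod=00 and r/m=101 selects RIP-relative
           addressing in 64-bit mode; the 32-bit displacement emitted
           below is relocated to point at the 8-byte pool entry.  */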
1642        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1643        tcg_out32(s, 0);
1644    }
1645}
1646
1647static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
1648{
1649    tcg_out_branch(s, 1, dest);
1650}
1651
1652static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1653{
1654    tcg_out_branch(s, 0, dest);
1655}
1656
1657static void tcg_out_nopn(TCGContext *s, int n)
1658{
1659    int i;
1660    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1661     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1662     * duplicate prefix, and all of the interesting recent cores can
1663     * decode and discard the duplicates in a single cycle.
1664     */
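    /* For example, n == 3 emits 66 66 90: two prefixes plus the nop.  */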
1665    tcg_debug_assert(n >= 1);
1666    for (i = 1; i < n; ++i) {
1667        tcg_out8(s, 0x66);
1668    }
1669    tcg_out8(s, 0x90);
1670}
1671
1672#if defined(CONFIG_SOFTMMU)
1673#include "../tcg-ldst.c.inc"
1674
1675/* helper signature: helper_ret_ld_mmu(CPUArchState *env, target_ulong addr,
1676 *                                     TCGMemOpIdx oi, uintptr_t ra)
1677 */
1678static void * const qemu_ld_helpers[16] = {
1679    [MO_UB]   = helper_ret_ldub_mmu,
1680    [MO_LEUW] = helper_le_lduw_mmu,
1681    [MO_LEUL] = helper_le_ldul_mmu,
1682    [MO_LEQ]  = helper_le_ldq_mmu,
1683    [MO_BEUW] = helper_be_lduw_mmu,
1684    [MO_BEUL] = helper_be_ldul_mmu,
1685    [MO_BEQ]  = helper_be_ldq_mmu,
1686};
1687
1688/* helper signature: helper_ret_st_mmu(CPUArchState *env, target_ulong addr,
1689 *                                     uintxx_t val, TCGMemOpIdx oi, uintptr_t ra)
1690 */
1691static void * const qemu_st_helpers[16] = {
1692    [MO_UB]   = helper_ret_stb_mmu,
1693    [MO_LEUW] = helper_le_stw_mmu,
1694    [MO_LEUL] = helper_le_stl_mmu,
1695    [MO_LEQ]  = helper_le_stq_mmu,
1696    [MO_BEUW] = helper_be_stw_mmu,
1697    [MO_BEUL] = helper_be_stl_mmu,
1698    [MO_BEQ]  = helper_be_stq_mmu,
1699};
1700
1701/* Perform the TLB load and compare.
1702
1703   Inputs:
1704   ADDRLO and ADDRHI contain the low and high part of the address.
1705
1706   MEM_INDEX and OPC are the memory context and memory operation of the load.
1707
1708   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1709   This should be offsetof addr_read or addr_write.
1710
1711   Outputs:
1712   LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1713   positions of the displacements of forward jumps to the TLB miss case.
1714
1715   Second argument register is loaded with the low part of the address.
1716   In the TLB hit case, it has been adjusted as indicated by the TLB
1717   and so is a host address.  In the TLB miss case, it continues to
1718   hold a guest address.
1719
1720   First argument register is clobbered.  */
1721
1722static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1723                                    int mem_index, MemOp opc,
1724                                    tcg_insn_unit **label_ptr, int which)
1725{
1726    const TCGReg r0 = TCG_REG_L0;
1727    const TCGReg r1 = TCG_REG_L1;
1728    TCGType ttype = TCG_TYPE_I32;
1729    TCGType tlbtype = TCG_TYPE_I32;
1730    int trexw = 0, hrexw = 0, tlbrexw = 0;
1731    unsigned a_bits = get_alignment_bits(opc);
1732    unsigned s_bits = opc & MO_SIZE;
1733    unsigned a_mask = (1 << a_bits) - 1;
1734    unsigned s_mask = (1 << s_bits) - 1;
1735    target_ulong tlb_mask;
1736
1737    if (TCG_TARGET_REG_BITS == 64) {
1738        if (TARGET_LONG_BITS == 64) {
1739            ttype = TCG_TYPE_I64;
1740            trexw = P_REXW;
1741        }
1742        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1743            hrexw = P_REXW;
1744            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
1745                tlbtype = TCG_TYPE_I64;
1746                tlbrexw = P_REXW;
1747            }
1748        }
1749    }
1750
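    /*
     * Index the fast TLB table:
     *   r0  = addrlo >> (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS)
     *   r0 &= fast->mask     (CPUTLBDescFast.mask)
     *   r0 += fast->table    (CPUTLBDescFast.table)
     * leaving r0 pointing at the CPUTLBEntry for this address.
     */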
1751    tcg_out_mov(s, tlbtype, r0, addrlo);
1752    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1753                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1754
1755    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
1756                         TLB_MASK_TABLE_OFS(mem_index) +
1757                         offsetof(CPUTLBDescFast, mask));
1758
1759    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
1760                         TLB_MASK_TABLE_OFS(mem_index) +
1761                         offsetof(CPUTLBDescFast, table));
1762
1763    /* If the required alignment is at least as large as the access, simply
1764       copy the address and mask.  For lesser alignments, check that we don't
1765       cross pages for the complete access.  */
1766    if (a_bits >= s_bits) {
1767        tcg_out_mov(s, ttype, r1, addrlo);
1768    } else {
1769        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1770    }
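    /* For example, a 4-byte access with byte alignment has
       s_mask - a_mask == 3, so the LEA above adds 3 to the address; if
       that pushes the result into the next page, the comparison below
       fails and we take the slow path rather than letting the access
       straddle the page boundary.  */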
1771    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1772    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1773
1774    /* cmp 0(r0), r1 */
1775    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
1776
1777    /* Prepare for both the fast path add of the tlb addend, and the slow
1778       path function argument setup.  */
1779    tcg_out_mov(s, ttype, r1, addrlo);
1780
1781    /* jne slow_path */
1782    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1783    label_ptr[0] = s->code_ptr;
1784    s->code_ptr += 4;
1785
1786    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1787        /* cmp 4(r0), addrhi */
1788        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
1789
1790        /* jne slow_path */
1791        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1792        label_ptr[1] = s->code_ptr;
1793        s->code_ptr += 4;
1794    }
1795
1796    /* TLB Hit.  */
1797
1798    /* add addend(r0), r1 */
1799    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1800                         offsetof(CPUTLBEntry, addend));
1801}
1802
1803/*
1804 * Record the context of a call to the out-of-line helper code for the slow
1805 * path of a load or store, so that we can later generate the correct helper code.
1806 */
1807static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1808                                TCGMemOpIdx oi,
1809                                TCGReg datalo, TCGReg datahi,
1810                                TCGReg addrlo, TCGReg addrhi,
1811                                tcg_insn_unit *raddr,
1812                                tcg_insn_unit **label_ptr)
1813{
1814    TCGLabelQemuLdst *label = new_ldst_label(s);
1815
1816    label->is_ld = is_ld;
1817    label->oi = oi;
1818    label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1819    label->datalo_reg = datalo;
1820    label->datahi_reg = datahi;
1821    label->addrlo_reg = addrlo;
1822    label->addrhi_reg = addrhi;
1823    label->raddr = tcg_splitwx_to_rx(raddr);
1824    label->label_ptr[0] = label_ptr[0];
1825    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1826        label->label_ptr[1] = label_ptr[1];
1827    }
1828}
1829
1830/*
1831 * Generate code for the slow path for a load at the end of the block.
1832 */
1833static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1834{
1835    TCGMemOpIdx oi = l->oi;
1836    MemOp opc = get_memop(oi);
1837    TCGReg data_reg;
1838    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1839    int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1840
1841    /* resolve label address */
1842    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1843    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1844        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1845    }
1846
1847    if (TCG_TARGET_REG_BITS == 32) {
1848        int ofs = 0;
1849
1850        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1851        ofs += 4;
1852
1853        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1854        ofs += 4;
1855
1856        if (TARGET_LONG_BITS == 64) {
1857            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1858            ofs += 4;
1859        }
1860
1861        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1862        ofs += 4;
1863
1864        tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1865    } else {
1866        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1867        /* The second argument is already loaded with addrlo.  */
1868        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1869        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1870                     (uintptr_t)l->raddr);
1871    }
1872
1873    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1874
1875    data_reg = l->datalo_reg;
1876    switch (opc & MO_SSIZE) {
1877    case MO_SB:
1878        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1879        break;
1880    case MO_SW:
1881        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1882        break;
1883#if TCG_TARGET_REG_BITS == 64
1884    case MO_SL:
1885        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1886        break;
1887#endif
1888    case MO_UB:
1889    case MO_UW:
1890        /* Note that the helpers have zero-extended to tcg_target_long.  */
1891    case MO_UL:
1892        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1893        break;
1894    case MO_Q:
1895        if (TCG_TARGET_REG_BITS == 64) {
1896            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1897        } else if (data_reg == TCG_REG_EDX) {
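            /* The helper returned the value in EDX:EAX; writing the low
               half into EDX first would clobber the high half, so swap
               the two registers instead.  */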
1898            /* xchg %edx, %eax */
1899            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1900            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1901        } else {
1902            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1903            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1904        }
1905        break;
1906    default:
1907        tcg_abort();
1908    }
1909
1910    /* Jump to the code corresponding to the next IR of qemu_ld */
1911    tcg_out_jmp(s, l->raddr);
1912    return true;
1913}
1914
1915/*
1916 * Generate code for the slow path for a store at the end of the block.
1917 */
1918static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1919{
1920    TCGMemOpIdx oi = l->oi;
1921    MemOp opc = get_memop(oi);
1922    MemOp s_bits = opc & MO_SIZE;
1923    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1924    TCGReg retaddr;
1925
1926    /* resolve label address */
1927    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1928    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1929        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1930    }
1931
1932    if (TCG_TARGET_REG_BITS == 32) {
1933        int ofs = 0;
1934
1935        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1936        ofs += 4;
1937
1938        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1939        ofs += 4;
1940
1941        if (TARGET_LONG_BITS == 64) {
1942            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1943            ofs += 4;
1944        }
1945
1946        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1947        ofs += 4;
1948
1949        if (s_bits == MO_64) {
1950            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1951            ofs += 4;
1952        }
1953
1954        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1955        ofs += 4;
1956
1957        retaddr = TCG_REG_EAX;
1958        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1959        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1960    } else {
1961        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1962        /* The second argument is already loaded with addrlo.  */
1963        tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1964                    tcg_target_call_iarg_regs[2], l->datalo_reg);
1965        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1966
1967        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1968            retaddr = tcg_target_call_iarg_regs[4];
1969            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1970        } else {
1971            retaddr = TCG_REG_RAX;
1972            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1973            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1974                       TCG_TARGET_CALL_STACK_OFFSET);
1975        }
1976    }
1977
1978    /* "Tail call" to the helper, with the return address back inline.  */
1979    tcg_out_push(s, retaddr);
1980    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1981    return true;
1982}
1983#elif TCG_TARGET_REG_BITS == 32
1984# define x86_guest_base_seg     0
1985# define x86_guest_base_index   -1
1986# define x86_guest_base_offset  guest_base
1987#else
1988static int x86_guest_base_seg;
1989static int x86_guest_base_index = -1;
1990static int32_t x86_guest_base_offset;
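/*
 * For 64-bit user-only builds: if the prologue manages to install guest_base
 * as the %gs segment base (see setup_guest_base_seg below), guest memory can
 * be reached with a plain segment override instead of materializing
 * guest_base in a register or displacement.
 */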
1991# if defined(__x86_64__) && defined(__linux__)
1992#  include <asm/prctl.h>
1993#  include <sys/prctl.h>
1994int arch_prctl(int code, unsigned long addr);
1995static inline int setup_guest_base_seg(void)
1996{
1997    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1998        return P_GS;
1999    }
2000    return 0;
2001}
2002# elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
2003#  include <machine/sysarch.h>
2004static inline int setup_guest_base_seg(void)
2005{
2006    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
2007        return P_GS;
2008    }
2009    return 0;
2010}
2011# else
2012static inline int setup_guest_base_seg(void)
2013{
2014    return 0;
2015}
2016# endif
2017#endif /* SOFTMMU */
2018
2019static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2020                                   TCGReg base, int index, intptr_t ofs,
2021                                   int seg, bool is64, MemOp memop)
2022{
2023    bool use_movbe = false;
2024    int rexw = is64 * P_REXW;
2025    int movop = OPC_MOVL_GvEv;
2026
2027    /* Do big-endian loads with movbe.  */
2028    if (memop & MO_BSWAP) {
2029        tcg_debug_assert(have_movbe);
2030        use_movbe = true;
2031        movop = OPC_MOVBE_GyMy;
2032    }
2033
2034    switch (memop & MO_SSIZE) {
2035    case MO_UB:
2036        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
2037                                 base, index, 0, ofs);
2038        break;
2039    case MO_SB:
2040        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
2041                                 base, index, 0, ofs);
2042        break;
2043    case MO_UW:
2044        if (use_movbe) {
2045            /* There is no extending movbe; only the low 16 bits are modified.  */
2046            if (datalo != base && datalo != index) {
2047                /* XOR breaks dependency chains.  */
2048                tgen_arithr(s, ARITH_XOR, datalo, datalo);
2049                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2050                                         datalo, base, index, 0, ofs);
2051            } else {
2052                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2053                                         datalo, base, index, 0, ofs);
2054                tcg_out_ext16u(s, datalo, datalo);
2055            }
2056        } else {
2057            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
2058                                     base, index, 0, ofs);
2059        }
2060        break;
2061    case MO_SW:
2062        if (use_movbe) {
2063            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2064                                     datalo, base, index, 0, ofs);
2065            tcg_out_ext16s(s, datalo, datalo, rexw);
2066        } else {
2067            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
2068                                     datalo, base, index, 0, ofs);
2069        }
2070        break;
2071    case MO_UL:
2072        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2073        break;
2074#if TCG_TARGET_REG_BITS == 64
2075    case MO_SL:
2076        if (use_movbe) {
2077            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo,
2078                                     base, index, 0, ofs);
2079            tcg_out_ext32s(s, datalo, datalo);
2080        } else {
2081            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
2082                                     base, index, 0, ofs);
2083        }
2084        break;
2085#endif
2086    case MO_Q:
2087        if (TCG_TARGET_REG_BITS == 64) {
2088            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2089                                     base, index, 0, ofs);
2090        } else {
2091            if (use_movbe) {
2092                TCGReg t = datalo;
2093                datalo = datahi;
2094                datahi = t;
2095            }
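            /* If the low-word load would overwrite the base register,
               load the high word first.  */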
2096            if (base != datalo) {
2097                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2098                                         base, index, 0, ofs);
2099                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2100                                         base, index, 0, ofs + 4);
2101            } else {
2102                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2103                                         base, index, 0, ofs + 4);
2104                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2105                                         base, index, 0, ofs);
2106            }
2107        }
2108        break;
2109    default:
2110        g_assert_not_reached();
2111    }
2112}
2113
2114/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2115   EAX. This will be useful once fixed-register globals are less
2116   common. */
2117static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2118{
2119    TCGReg datalo, datahi, addrlo;
2120    TCGReg addrhi __attribute__((unused));
2121    TCGMemOpIdx oi;
2122    MemOp opc;
2123#if defined(CONFIG_SOFTMMU)
2124    int mem_index;
2125    tcg_insn_unit *label_ptr[2];
2126#endif
2127
2128    datalo = *args++;
2129    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2130    addrlo = *args++;
2131    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2132    oi = *args++;
2133    opc = get_memop(oi);
2134
2135#if defined(CONFIG_SOFTMMU)
2136    mem_index = get_mmuidx(oi);
2137
2138    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2139                     label_ptr, offsetof(CPUTLBEntry, addr_read));
2140
2141    /* TLB Hit.  */
2142    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2143
2144    /* Record the current context of the load in an ldst label.  */
2145    add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2146                        s->code_ptr, label_ptr);
2147#else
2148    tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2149                           x86_guest_base_offset, x86_guest_base_seg,
2150                           is64, opc);
2151#endif
2152}
2153
2154static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2155                                   TCGReg base, int index, intptr_t ofs,
2156                                   int seg, MemOp memop)
2157{
2158    bool use_movbe = false;
2159    int movop = OPC_MOVL_EvGv;
2160
2161    /*
2162     * Do big-endian stores with movbe or softmmu.
2163     * User-only without movbe will have its swapping done generically.
2164     */
2165    if (memop & MO_BSWAP) {
2166        tcg_debug_assert(have_movbe);
2167        use_movbe = true;
2168        movop = OPC_MOVBE_MyGy;
2169    }
2170
2171    switch (memop & MO_SIZE) {
2172    case MO_8:
2173        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2174        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2175        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2176                                 datalo, base, index, 0, ofs);
2177        break;
2178    case MO_16:
2179        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2180                                 base, index, 0, ofs);
2181        break;
2182    case MO_32:
2183        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2184        break;
2185    case MO_64:
2186        if (TCG_TARGET_REG_BITS == 64) {
2187            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2188                                     base, index, 0, ofs);
2189        } else {
2190            if (use_movbe) {
2191                TCGReg t = datalo;
2192                datalo = datahi;
2193                datahi = t;
2194            }
2195            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2196                                     base, index, 0, ofs);
2197            tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2198                                     base, index, 0, ofs + 4);
2199        }
2200        break;
2201    default:
2202        g_assert_not_reached();
2203    }
2204}
2205
2206static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2207{
2208    TCGReg datalo, datahi, addrlo;
2209    TCGReg addrhi __attribute__((unused));
2210    TCGMemOpIdx oi;
2211    MemOp opc;
2212#if defined(CONFIG_SOFTMMU)
2213    int mem_index;
2214    tcg_insn_unit *label_ptr[2];
2215#endif
2216
2217    datalo = *args++;
2218    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2219    addrlo = *args++;
2220    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2221    oi = *args++;
2222    opc = get_memop(oi);
2223
2224#if defined(CONFIG_SOFTMMU)
2225    mem_index = get_mmuidx(oi);
2226
2227    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2228                     label_ptr, offsetof(CPUTLBEntry, addr_write));
2229
2230    /* TLB Hit.  */
2231    tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2232
2233    /* Record the current context of the store in an ldst label.  */
2234    add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2235                        s->code_ptr, label_ptr);
2236#else
2237    tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2238                           x86_guest_base_offset, x86_guest_base_seg, opc);
2239#endif
2240}
2241
2242static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2243                              const TCGArg *args, const int *const_args)
2244{
2245    TCGArg a0, a1, a2;
2246    int c, const_a2, vexop, rexw = 0;
2247
2248#if TCG_TARGET_REG_BITS == 64
2249# define OP_32_64(x) \
2250        case glue(glue(INDEX_op_, x), _i64): \
2251            rexw = P_REXW; /* FALLTHRU */    \
2252        case glue(glue(INDEX_op_, x), _i32)
2253#else
2254# define OP_32_64(x) \
2255        case glue(glue(INDEX_op_, x), _i32)
2256#endif
2257
2258    /* Hoist the loads of the most common arguments.  */
2259    a0 = args[0];
2260    a1 = args[1];
2261    a2 = args[2];
2262    const_a2 = const_args[2];
2263
2264    switch (opc) {
2265    case INDEX_op_exit_tb:
2266        /* Reuse the zeroing that exists for goto_ptr.  */
2267        if (a0 == 0) {
2268            tcg_out_jmp(s, tcg_code_gen_epilogue);
2269        } else {
2270            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2271            tcg_out_jmp(s, tb_ret_addr);
2272        }
2273        break;
2274    case INDEX_op_goto_tb:
2275        if (s->tb_jmp_insn_offset) {
2276            /* direct jump method */
2277            int gap;
2278            /* jump displacement must be aligned for atomic patching;
2279             * see if we need to add extra nops before jump
2280             */
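            /* For example, if s->code_ptr % 4 == 1, gap is 3 and a
               two-byte nop is emitted so that the 32-bit displacement
               following the JMP opcode byte is 4-byte aligned.  */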
2281            gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2282            if (gap != 1) {
2283                tcg_out_nopn(s, gap - 1);
2284            }
2285            tcg_out8(s, OPC_JMP_long); /* jmp im */
2286            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2287            tcg_out32(s, 0);
2288        } else {
2289            /* indirect jump method */
2290            tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2291                                 (intptr_t)(s->tb_jmp_target_addr + a0));
2292        }
2293        set_jmp_reset_offset(s, a0);
2294        break;
2295    case INDEX_op_goto_ptr:
2296        /* jmp to the given host address (could be epilogue) */
2297        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2298        break;
2299    case INDEX_op_br:
2300        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2301        break;
2302    OP_32_64(ld8u):
2303        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2304        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2305        break;
2306    OP_32_64(ld8s):
2307        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2308        break;
2309    OP_32_64(ld16u):
2310        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2311        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2312        break;
2313    OP_32_64(ld16s):
2314        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2315        break;
2316#if TCG_TARGET_REG_BITS == 64
2317    case INDEX_op_ld32u_i64:
2318#endif
2319    case INDEX_op_ld_i32:
2320        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2321        break;
2322
2323    OP_32_64(st8):
2324        if (const_args[0]) {
2325            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2326            tcg_out8(s, a0);
2327        } else {
2328            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2329        }
2330        break;
2331    OP_32_64(st16):
2332        if (const_args[0]) {
2333            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2334            tcg_out16(s, a0);
2335        } else {
2336            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2337        }
2338        break;
2339#if TCG_TARGET_REG_BITS == 64
2340    case INDEX_op_st32_i64:
2341#endif
2342    case INDEX_op_st_i32:
2343        if (const_args[0]) {
2344            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2345            tcg_out32(s, a0);
2346        } else {
2347            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2348        }
2349        break;
2350
2351    OP_32_64(add):
2352        /* For 3-operand addition, use LEA.  */
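        /* e.g. "lea 0x10(%esi),%edi" computes a0 = a1 + 16 in a single
           instruction, without requiring a0 to overlap a1 and without
           modifying the flags.  */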
2353        if (a0 != a1) {
2354            TCGArg c3 = 0;
2355            if (const_a2) {
2356                c3 = a2, a2 = -1;
2357            } else if (a0 == a2) {
2358                /* Watch out for dest = src + dest, since we've removed
2359                   the matching constraint on the add.  */
2360                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2361                break;
2362            }
2363
2364            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2365            break;
2366        }
2367        c = ARITH_ADD;
2368        goto gen_arith;
2369    OP_32_64(sub):
2370        c = ARITH_SUB;
2371        goto gen_arith;
2372    OP_32_64(and):
2373        c = ARITH_AND;
2374        goto gen_arith;
2375    OP_32_64(or):
2376        c = ARITH_OR;
2377        goto gen_arith;
2378    OP_32_64(xor):
2379        c = ARITH_XOR;
2380        goto gen_arith;
2381    gen_arith:
2382        if (const_a2) {
2383            tgen_arithi(s, c + rexw, a0, a2, 0);
2384        } else {
2385            tgen_arithr(s, c + rexw, a0, a2);
2386        }
2387        break;
2388
2389    OP_32_64(andc):
2390        if (const_a2) {
2391            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2392            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2393        } else {
2394            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2395        }
2396        break;
2397
2398    OP_32_64(mul):
2399        if (const_a2) {
2400            int32_t val;
2401            val = a2;
2402            if (val == (int8_t)val) {
2403                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2404                tcg_out8(s, val);
2405            } else {
2406                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2407                tcg_out32(s, val);
2408            }
2409        } else {
2410            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2411        }
2412        break;
2413
2414    OP_32_64(div2):
2415        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2416        break;
2417    OP_32_64(divu2):
2418        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2419        break;
2420
2421    OP_32_64(shl):
2422        /* For small constant 3-operand shift, use LEA.  */
2423        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2424            if (a2 - 1 == 0) {
2425                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2426                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2427            } else {
2428                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2429                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2430            }
2431            break;
2432        }
2433        c = SHIFT_SHL;
2434        vexop = OPC_SHLX;
2435        goto gen_shift_maybe_vex;
2436    OP_32_64(shr):
2437        c = SHIFT_SHR;
2438        vexop = OPC_SHRX;
2439        goto gen_shift_maybe_vex;
2440    OP_32_64(sar):
2441        c = SHIFT_SAR;
2442        vexop = OPC_SARX;
2443        goto gen_shift_maybe_vex;
2444    OP_32_64(rotl):
2445        c = SHIFT_ROL;
2446        goto gen_shift;
2447    OP_32_64(rotr):
2448        c = SHIFT_ROR;
2449        goto gen_shift;
2450    gen_shift_maybe_vex:
2451        if (have_bmi2) {
2452            if (!const_a2) {
2453                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2454                break;
2455            }
2456            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2457        }
2458        /* FALLTHRU */
2459    gen_shift:
2460        if (const_a2) {
2461            tcg_out_shifti(s, c + rexw, a0, a2);
2462        } else {
2463            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2464        }
2465        break;
2466
2467    OP_32_64(ctz):
2468        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2469        break;
2470    OP_32_64(clz):
2471        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2472        break;
2473    OP_32_64(ctpop):
2474        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2475        break;
2476
2477    case INDEX_op_brcond_i32:
2478        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2479        break;
2480    case INDEX_op_setcond_i32:
2481        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2482        break;
2483    case INDEX_op_movcond_i32:
2484        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2485        break;
2486
2487    OP_32_64(bswap16):
2488        tcg_out_rolw_8(s, a0);
2489        break;
2490    OP_32_64(bswap32):
2491        tcg_out_bswap32(s, a0);
2492        break;
2493
2494    OP_32_64(neg):
2495        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2496        break;
2497    OP_32_64(not):
2498        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2499        break;
2500
2501    OP_32_64(ext8s):
2502        tcg_out_ext8s(s, a0, a1, rexw);
2503        break;
2504    OP_32_64(ext16s):
2505        tcg_out_ext16s(s, a0, a1, rexw);
2506        break;
2507    OP_32_64(ext8u):
2508        tcg_out_ext8u(s, a0, a1);
2509        break;
2510    OP_32_64(ext16u):
2511        tcg_out_ext16u(s, a0, a1);
2512        break;
2513
2514    case INDEX_op_qemu_ld_i32:
2515        tcg_out_qemu_ld(s, args, 0);
2516        break;
2517    case INDEX_op_qemu_ld_i64:
2518        tcg_out_qemu_ld(s, args, 1);
2519        break;
2520    case INDEX_op_qemu_st_i32:
2521    case INDEX_op_qemu_st8_i32:
2522        tcg_out_qemu_st(s, args, 0);
2523        break;
2524    case INDEX_op_qemu_st_i64:
2525        tcg_out_qemu_st(s, args, 1);
2526        break;
2527
2528    OP_32_64(mulu2):
2529        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2530        break;
2531    OP_32_64(muls2):
2532        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2533        break;
2534    OP_32_64(add2):
2535        if (const_args[4]) {
2536            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2537        } else {
2538            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2539        }
2540        if (const_args[5]) {
2541            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2542        } else {
2543            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2544        }
2545        break;
2546    OP_32_64(sub2):
2547        if (const_args[4]) {
2548            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2549        } else {
2550            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2551        }
2552        if (const_args[5]) {
2553            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2554        } else {
2555            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2556        }
2557        break;
2558
2559#if TCG_TARGET_REG_BITS == 32
2560    case INDEX_op_brcond2_i32:
2561        tcg_out_brcond2(s, args, const_args, 0);
2562        break;
2563    case INDEX_op_setcond2_i32:
2564        tcg_out_setcond2(s, args, const_args);
2565        break;
2566#else /* TCG_TARGET_REG_BITS == 64 */
2567    case INDEX_op_ld32s_i64:
2568        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2569        break;
2570    case INDEX_op_ld_i64:
2571        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2572        break;
2573    case INDEX_op_st_i64:
2574        if (const_args[0]) {
2575            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2576            tcg_out32(s, a0);
2577        } else {
2578            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2579        }
2580        break;
2581
2582    case INDEX_op_brcond_i64:
2583        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2584        break;
2585    case INDEX_op_setcond_i64:
2586        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2587        break;
2588    case INDEX_op_movcond_i64:
2589        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2590        break;
2591
2592    case INDEX_op_bswap64_i64:
2593        tcg_out_bswap64(s, a0);
2594        break;
2595    case INDEX_op_extu_i32_i64:
2596    case INDEX_op_ext32u_i64:
2597    case INDEX_op_extrl_i64_i32:
2598        tcg_out_ext32u(s, a0, a1);
2599        break;
2600    case INDEX_op_ext_i32_i64:
2601    case INDEX_op_ext32s_i64:
2602        tcg_out_ext32s(s, a0, a1);
2603        break;
2604    case INDEX_op_extrh_i64_i32:
2605        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2606        break;
2607#endif
2608
2609    OP_32_64(deposit):
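        /* Only the byte and word insertions that map directly onto x86
           partial-register moves are accepted here (see the "Q"
           constraint); other deposits are expected to be expanded by
           common code.  */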
2610        if (args[3] == 0 && args[4] == 8) {
2611            /* deposit a2 into bits 0..7 of a0 */
2612            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2613        } else if (args[3] == 8 && args[4] == 8) {
2614            /* deposit a2 into bits 8..15 of a0 */
2615            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2616        } else if (args[3] == 0 && args[4] == 16) {
2617            /* deposit a2 into bits 0..15 of a0 */
2618            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2619        } else {
2620            tcg_abort();
2621        }
2622        break;
2623
2624    case INDEX_op_extract_i64:
2625        if (a2 + args[3] == 32) {
2626            /* This is a 32-bit zero-extending right shift.  */
2627            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2628            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2629            break;
2630        }
2631        /* FALLTHRU */
2632    case INDEX_op_extract_i32:
2633        /* On the off-chance that we can use the high-byte registers,
2634           do so; otherwise emit the same ext16 + shift pattern that we
2635           would have gotten from the normal tcg-op.c expansion.  */
2636        tcg_debug_assert(a2 == 8 && args[3] == 8);
2637        if (a1 < 4 && a0 < 8) {
2638            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2639        } else {
2640            tcg_out_ext16u(s, a0, a1);
2641            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2642        }
2643        break;
2644
2645    case INDEX_op_sextract_i32:
2646        /* We don't implement sextract_i64, as we cannot sign-extend to
2647           64-bits without using the REX prefix that explicitly excludes
2648           access to the high-byte registers.  */
2649        tcg_debug_assert(a2 == 8 && args[3] == 8);
2650        if (a1 < 4 && a0 < 8) {
2651            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2652        } else {
2653            tcg_out_ext16s(s, a0, a1, 0);
2654            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2655        }
2656        break;
2657
2658    OP_32_64(extract2):
2659        /* Note that SHRD outputs to the r/m operand.  */
2660        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2661        tcg_out8(s, args[3]);
2662        break;
2663
2664    case INDEX_op_mb:
2665        tcg_out_mb(s, a0);
2666        break;
2667    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2668    case INDEX_op_mov_i64:
2669    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2670    default:
2671        tcg_abort();
2672    }
2673
2674#undef OP_32_64
2675}
2676
2677static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2678                           unsigned vecl, unsigned vece,
2679                           const TCGArg *args, const int *const_args)
2680{
2681    static int const add_insn[4] = {
2682        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2683    };
2684    static int const ssadd_insn[4] = {
2685        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2686    };
2687    static int const usadd_insn[4] = {
2688        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2689    };
2690    static int const sub_insn[4] = {
2691        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2692    };
2693    static int const sssub_insn[4] = {
2694        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2695    };
2696    static int const ussub_insn[4] = {
2697        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2698    };
2699    static int const mul_insn[4] = {
2700        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2701    };
2702    static int const shift_imm_insn[4] = {
2703        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2704    };
2705    static int const cmpeq_insn[4] = {
2706        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2707    };
2708    static int const cmpgt_insn[4] = {
2709        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2710    };
2711    static int const punpckl_insn[4] = {
2712        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2713    };
2714    static int const punpckh_insn[4] = {
2715        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2716    };
2717    static int const packss_insn[4] = {
2718        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2719    };
2720    static int const packus_insn[4] = {
2721        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2722    };
2723    static int const smin_insn[4] = {
2724        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
2725    };
2726    static int const smax_insn[4] = {
2727        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
2728    };
2729    static int const umin_insn[4] = {
2730        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
2731    };
2732    static int const umax_insn[4] = {
2733        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
2734    };
2735    static int const shlv_insn[4] = {
2736        /* TODO: AVX512 adds support for MO_16.  */
2737        OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
2738    };
2739    static int const shrv_insn[4] = {
2740        /* TODO: AVX512 adds support for MO_16.  */
2741        OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
2742    };
2743    static int const sarv_insn[4] = {
2744        /* TODO: AVX512 adds support for MO_16, MO_64.  */
2745        OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
2746    };
2747    static int const shls_insn[4] = {
2748        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2749    };
2750    static int const shrs_insn[4] = {
2751        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2752    };
2753    static int const sars_insn[4] = {
2754        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
2755    };
2756    static int const abs_insn[4] = {
2757        /* TODO: AVX512 adds support for MO_64.  */
2758        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
2759    };
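    /* Entries above that read OPC_UD2 have no direct SSE/AVX encoding for
       that element size; gen_simd asserts the sentinel is never emitted,
       the unsupported cases having been rejected or expanded earlier.  */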
2760
2761    TCGType type = vecl + TCG_TYPE_V64;
2762    int insn, sub;
2763    TCGArg a0, a1, a2;
2764
2765    a0 = args[0];
2766    a1 = args[1];
2767    a2 = args[2];
2768
2769    switch (opc) {
2770    case INDEX_op_add_vec:
2771        insn = add_insn[vece];
2772        goto gen_simd;
2773    case INDEX_op_ssadd_vec:
2774        insn = ssadd_insn[vece];
2775        goto gen_simd;
2776    case INDEX_op_usadd_vec:
2777        insn = usadd_insn[vece];
2778        goto gen_simd;
2779    case INDEX_op_sub_vec:
2780        insn = sub_insn[vece];
2781        goto gen_simd;
2782    case INDEX_op_sssub_vec:
2783        insn = sssub_insn[vece];
2784        goto gen_simd;
2785    case INDEX_op_ussub_vec:
2786        insn = ussub_insn[vece];
2787        goto gen_simd;
2788    case INDEX_op_mul_vec:
2789        insn = mul_insn[vece];
2790        goto gen_simd;
2791    case INDEX_op_and_vec:
2792        insn = OPC_PAND;
2793        goto gen_simd;
2794    case INDEX_op_or_vec:
2795        insn = OPC_POR;
2796        goto gen_simd;
2797    case INDEX_op_xor_vec:
2798        insn = OPC_PXOR;
2799        goto gen_simd;
2800    case INDEX_op_smin_vec:
2801        insn = smin_insn[vece];
2802        goto gen_simd;
2803    case INDEX_op_umin_vec:
2804        insn = umin_insn[vece];
2805        goto gen_simd;
2806    case INDEX_op_smax_vec:
2807        insn = smax_insn[vece];
2808        goto gen_simd;
2809    case INDEX_op_umax_vec:
2810        insn = umax_insn[vece];
2811        goto gen_simd;
2812    case INDEX_op_shlv_vec:
2813        insn = shlv_insn[vece];
2814        goto gen_simd;
2815    case INDEX_op_shrv_vec:
2816        insn = shrv_insn[vece];
2817        goto gen_simd;
2818    case INDEX_op_sarv_vec:
2819        insn = sarv_insn[vece];
2820        goto gen_simd;
2821    case INDEX_op_shls_vec:
2822        insn = shls_insn[vece];
2823        goto gen_simd;
2824    case INDEX_op_shrs_vec:
2825        insn = shrs_insn[vece];
2826        goto gen_simd;
2827    case INDEX_op_sars_vec:
2828        insn = sars_insn[vece];
2829        goto gen_simd;
2830    case INDEX_op_x86_punpckl_vec:
2831        insn = punpckl_insn[vece];
2832        goto gen_simd;
2833    case INDEX_op_x86_punpckh_vec:
2834        insn = punpckh_insn[vece];
2835        goto gen_simd;
2836    case INDEX_op_x86_packss_vec:
2837        insn = packss_insn[vece];
2838        goto gen_simd;
2839    case INDEX_op_x86_packus_vec:
2840        insn = packus_insn[vece];
2841        goto gen_simd;
2842#if TCG_TARGET_REG_BITS == 32
2843    case INDEX_op_dup2_vec:
2844        /* First merge the two 32-bit inputs to a single 64-bit element. */
2845        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
2846        /* Then replicate the 64-bit elements across the rest of the vector. */
2847        if (type != TCG_TYPE_V64) {
2848            tcg_out_dup_vec(s, type, MO_64, a0, a0);
2849        }
2850        break;
2851#endif
2852    case INDEX_op_abs_vec:
2853        insn = abs_insn[vece];
2854        a2 = a1;
2855        a1 = 0;
2856        goto gen_simd;
2857    gen_simd:
2858        tcg_debug_assert(insn != OPC_UD2);
2859        if (type == TCG_TYPE_V256) {
2860            insn |= P_VEXL;
2861        }
2862        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2863        break;
2864
2865    case INDEX_op_cmp_vec:
2866        sub = args[3];
2867        if (sub == TCG_COND_EQ) {
2868            insn = cmpeq_insn[vece];
2869        } else if (sub == TCG_COND_GT) {
2870            insn = cmpgt_insn[vece];
2871        } else {
2872            g_assert_not_reached();
2873        }
2874        goto gen_simd;
2875
2876    case INDEX_op_andc_vec:
2877        insn = OPC_PANDN;
2878        if (type == TCG_TYPE_V256) {
2879            insn |= P_VEXL;
2880        }
2881        tcg_out_vex_modrm(s, insn, a0, a2, a1);
2882        break;
2883
2884    case INDEX_op_shli_vec:
2885        sub = 6;
2886        goto gen_shift;
2887    case INDEX_op_shri_vec:
2888        sub = 2;
2889        goto gen_shift;
2890    case INDEX_op_sari_vec:
2891        tcg_debug_assert(vece != MO_64);
2892        sub = 4;
2893    gen_shift:
2894        tcg_debug_assert(vece != MO_8);
2895        insn = shift_imm_insn[vece];
2896        if (type == TCG_TYPE_V256) {
2897            insn |= P_VEXL;
2898        }
2899        tcg_out_vex_modrm(s, insn, sub, a0, a1);
2900        tcg_out8(s, a2);
2901        break;
2902
2903    case INDEX_op_ld_vec:
2904        tcg_out_ld(s, type, a0, a1, a2);
2905        break;
2906    case INDEX_op_st_vec:
2907        tcg_out_st(s, type, a0, a1, a2);
2908        break;
2909    case INDEX_op_dupm_vec:
2910        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2911        break;
2912
2913    case INDEX_op_x86_shufps_vec:
2914        insn = OPC_SHUFPS;
2915        sub = args[3];
2916        goto gen_simd_imm8;
2917    case INDEX_op_x86_blend_vec:
2918        if (vece == MO_16) {
2919            insn = OPC_PBLENDW;
2920        } else if (vece == MO_32) {
2921            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2922        } else {
2923            g_assert_not_reached();
2924        }
2925        sub = args[3];
2926        goto gen_simd_imm8;
2927    case INDEX_op_x86_vperm2i128_vec:
2928        insn = OPC_VPERM2I128;
2929        sub = args[3];
2930        goto gen_simd_imm8;
2931    gen_simd_imm8:
2932        if (type == TCG_TYPE_V256) {
2933            insn |= P_VEXL;
2934        }
2935        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2936        tcg_out8(s, sub);
2937        break;
2938
2939    case INDEX_op_x86_vpblendvb_vec:
2940        insn = OPC_VPBLENDVB;
2941        if (type == TCG_TYPE_V256) {
2942            insn |= P_VEXL;
2943        }
2944        tcg_out_vex_modrm(s, insn, a0, a1, a2);
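        /* The selector register (args[3]) is encoded in bits 7:4 of the
           trailing immediate byte.  */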
2945        tcg_out8(s, args[3] << 4);
2946        break;
2947
2948    case INDEX_op_x86_psrldq_vec:
2949        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2950        tcg_out8(s, a2);
2951        break;
2952
2953    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
2954    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
2955    default:
2956        g_assert_not_reached();
2957    }
2958}
2959
2960static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
2961{
2962    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
2963    static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
2964    static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
2965    static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
2966    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
2967    static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
2968    static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
2969    static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
2970    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
2971    static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
2972    static const TCGTargetOpDef r_0_r = { .args_ct_str = { "r", "0", "r" } };
2973    static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
2974    static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
2975    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
2976    static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
2977    static const TCGTargetOpDef s_L = { .args_ct_str = { "s", "L" } };
2978    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
2979    static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
2980    static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
2981    static const TCGTargetOpDef s_L_L = { .args_ct_str = { "s", "L", "L" } };
2982    static const TCGTargetOpDef r_r_L_L
2983        = { .args_ct_str = { "r", "r", "L", "L" } };
2984    static const TCGTargetOpDef L_L_L_L
2985        = { .args_ct_str = { "L", "L", "L", "L" } };
2986    static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
2987    static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
2988    static const TCGTargetOpDef x_x_x_x
2989        = { .args_ct_str = { "x", "x", "x", "x" } };
2990    static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
2991
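    /*
     * Rough key to the constraint strings below: "r" any GPR, "q" a GPR
     * usable as a byte operand, "Q" a GPR with an addressable second byte,
     * "L" a GPR usable for qemu_ld/st (avoiding the softmmu scratch regs),
     * "x" a vector register, "e" a sign-extended 32-bit constant, "i" any
     * constant, a digit ties the argument to the matching output, and "&"
     * marks an early-clobber output.
     */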
2992    switch (op) {
2993    case INDEX_op_goto_ptr:
2994        return &r;
2995
2996    case INDEX_op_ld8u_i32:
2997    case INDEX_op_ld8u_i64:
2998    case INDEX_op_ld8s_i32:
2999    case INDEX_op_ld8s_i64:
3000    case INDEX_op_ld16u_i32:
3001    case INDEX_op_ld16u_i64:
3002    case INDEX_op_ld16s_i32:
3003    case INDEX_op_ld16s_i64:
3004    case INDEX_op_ld_i32:
3005    case INDEX_op_ld32u_i64:
3006    case INDEX_op_ld32s_i64:
3007    case INDEX_op_ld_i64:
3008        return &r_r;
3009
3010    case INDEX_op_st8_i32:
3011    case INDEX_op_st8_i64:
3012        return &qi_r;
3013    case INDEX_op_st16_i32:
3014    case INDEX_op_st16_i64:
3015    case INDEX_op_st_i32:
3016    case INDEX_op_st32_i64:
3017        return &ri_r;
3018    case INDEX_op_st_i64:
3019        return &re_r;
3020
3021    case INDEX_op_add_i32:
3022    case INDEX_op_add_i64:
3023        return &r_r_re;
3024    case INDEX_op_sub_i32:
3025    case INDEX_op_sub_i64:
3026    case INDEX_op_mul_i32:
3027    case INDEX_op_mul_i64:
3028    case INDEX_op_or_i32:
3029    case INDEX_op_or_i64:
3030    case INDEX_op_xor_i32:
3031    case INDEX_op_xor_i64:
3032        return &r_0_re;
3033
3034    case INDEX_op_and_i32:
3035    case INDEX_op_and_i64:
3036        {
3037            static const TCGTargetOpDef and
3038                = { .args_ct_str = { "r", "0", "reZ" } };
3039            return &and;
3040        }
3041        break;
3042    case INDEX_op_andc_i32:
3043    case INDEX_op_andc_i64:
3044        {
3045            static const TCGTargetOpDef andc
3046                = { .args_ct_str = { "r", "r", "rI" } };
3047            return &andc;
3048        }
3049        break;
3050
3051    case INDEX_op_shl_i32:
3052    case INDEX_op_shl_i64:
3053    case INDEX_op_shr_i32:
3054    case INDEX_op_shr_i64:
3055    case INDEX_op_sar_i32:
3056    case INDEX_op_sar_i64:
3057        return have_bmi2 ? &r_r_ri : &r_0_ci;
3058    case INDEX_op_rotl_i32:
3059    case INDEX_op_rotl_i64:
3060    case INDEX_op_rotr_i32:
3061    case INDEX_op_rotr_i64:
3062        return &r_0_ci;
3063
3064    case INDEX_op_brcond_i32:
3065    case INDEX_op_brcond_i64:
3066        return &r_re;
3067
3068    case INDEX_op_bswap16_i32:
3069    case INDEX_op_bswap16_i64:
3070    case INDEX_op_bswap32_i32:
3071    case INDEX_op_bswap32_i64:
3072    case INDEX_op_bswap64_i64:
3073    case INDEX_op_neg_i32:
3074    case INDEX_op_neg_i64:
3075    case INDEX_op_not_i32:
3076    case INDEX_op_not_i64:
3077    case INDEX_op_extrh_i64_i32:
3078        return &r_0;
3079
3080    case INDEX_op_ext8s_i32:
3081    case INDEX_op_ext8s_i64:
3082    case INDEX_op_ext8u_i32:
3083    case INDEX_op_ext8u_i64:
3084        return &r_q;
3085    case INDEX_op_ext16s_i32:
3086    case INDEX_op_ext16s_i64:
3087    case INDEX_op_ext16u_i32:
3088    case INDEX_op_ext16u_i64:
3089    case INDEX_op_ext32s_i64:
3090    case INDEX_op_ext32u_i64:
3091    case INDEX_op_ext_i32_i64:
3092    case INDEX_op_extu_i32_i64:
3093    case INDEX_op_extrl_i64_i32:
3094    case INDEX_op_extract_i32:
3095    case INDEX_op_extract_i64:
3096    case INDEX_op_sextract_i32:
3097    case INDEX_op_ctpop_i32:
3098    case INDEX_op_ctpop_i64:
3099        return &r_r;
3100    case INDEX_op_extract2_i32:
3101    case INDEX_op_extract2_i64:
3102        return &r_0_r;
3103
3104    case INDEX_op_deposit_i32:
3105    case INDEX_op_deposit_i64:
3106        {
3107            static const TCGTargetOpDef dep
3108                = { .args_ct_str = { "Q", "0", "Q" } };
3109            return &dep;
3110        }
3111    case INDEX_op_setcond_i32:
3112    case INDEX_op_setcond_i64:
3113        {
3114            static const TCGTargetOpDef setc
3115                = { .args_ct_str = { "q", "r", "re" } };
3116            return &setc;
3117        }
3118    case INDEX_op_movcond_i32:
3119    case INDEX_op_movcond_i64:
3120        {
3121            static const TCGTargetOpDef movc
3122                = { .args_ct_str = { "r", "r", "re", "r", "0" } };
3123            return &movc;
3124        }
3125    case INDEX_op_div2_i32:
3126    case INDEX_op_div2_i64:
3127    case INDEX_op_divu2_i32:
3128    case INDEX_op_divu2_i64:
3129        {
3130            static const TCGTargetOpDef div2
3131                = { .args_ct_str = { "a", "d", "0", "1", "r" } };
3132            return &div2;
3133        }
3134    case INDEX_op_mulu2_i32:
3135    case INDEX_op_mulu2_i64:
3136    case INDEX_op_muls2_i32:
3137    case INDEX_op_muls2_i64:
3138        {
3139            static const TCGTargetOpDef mul2
3140                = { .args_ct_str = { "a", "d", "a", "r" } };
3141            return &mul2;
3142        }
3143    case INDEX_op_add2_i32:
3144    case INDEX_op_add2_i64:
3145    case INDEX_op_sub2_i32:
3146    case INDEX_op_sub2_i64:
3147        {
3148            static const TCGTargetOpDef arith2
3149                = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
3150            return &arith2;
3151        }
3152    case INDEX_op_ctz_i32:
3153    case INDEX_op_ctz_i64:
3154        {
3155            static const TCGTargetOpDef ctz[2] = {
3156                { .args_ct_str = { "&r", "r", "r" } },
3157                { .args_ct_str = { "&r", "r", "rW" } },
3158            };
3159            return &ctz[have_bmi1];
3160        }
3161    case INDEX_op_clz_i32:
3162    case INDEX_op_clz_i64:
3163        {
3164            static const TCGTargetOpDef clz[2] = {
3165                { .args_ct_str = { "&r", "r", "r" } },
3166                { .args_ct_str = { "&r", "r", "rW" } },
3167            };
3168            return &clz[have_lzcnt];
3169        }
3170
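    /*
     * The guest load/store ops need one host register per 32 bits of
     * guest address and of data, so a 64-bit guest address or a 64-bit
     * value on a 32-bit host occupies a register pair, giving the longer
     * constraint lists below.
     */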
3171    case INDEX_op_qemu_ld_i32:
3172        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
3173    case INDEX_op_qemu_st_i32:
3174        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
3175    case INDEX_op_qemu_st8_i32:
3176        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &s_L : &s_L_L;
3177    case INDEX_op_qemu_ld_i64:
3178        return (TCG_TARGET_REG_BITS == 64 ? &r_L
3179                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
3180                : &r_r_L_L);
3181    case INDEX_op_qemu_st_i64:
3182        return (TCG_TARGET_REG_BITS == 64 ? &L_L
3183                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
3184                : &L_L_L_L);
3185
3186    case INDEX_op_brcond2_i32:
3187        {
3188            static const TCGTargetOpDef b2
3189                = { .args_ct_str = { "r", "r", "ri", "ri" } };
3190            return &b2;
3191        }
3192    case INDEX_op_setcond2_i32:
3193        {
3194            static const TCGTargetOpDef s2
3195                = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
3196            return &s2;
3197        }
3198
3199    case INDEX_op_ld_vec:
3200    case INDEX_op_st_vec:
3201    case INDEX_op_dupm_vec:
3202        return &x_r;
3203
3204    case INDEX_op_add_vec:
3205    case INDEX_op_sub_vec:
3206    case INDEX_op_mul_vec:
3207    case INDEX_op_and_vec:
3208    case INDEX_op_or_vec:
3209    case INDEX_op_xor_vec:
3210    case INDEX_op_andc_vec:
3211    case INDEX_op_ssadd_vec:
3212    case INDEX_op_usadd_vec:
3213    case INDEX_op_sssub_vec:
3214    case INDEX_op_ussub_vec:
3215    case INDEX_op_smin_vec:
3216    case INDEX_op_umin_vec:
3217    case INDEX_op_smax_vec:
3218    case INDEX_op_umax_vec:
3219    case INDEX_op_shlv_vec:
3220    case INDEX_op_shrv_vec:
3221    case INDEX_op_sarv_vec:
3222    case INDEX_op_shls_vec:
3223    case INDEX_op_shrs_vec:
3224    case INDEX_op_sars_vec:
3225    case INDEX_op_rotls_vec:
3226    case INDEX_op_cmp_vec:
3227    case INDEX_op_x86_shufps_vec:
3228    case INDEX_op_x86_blend_vec:
3229    case INDEX_op_x86_packss_vec:
3230    case INDEX_op_x86_packus_vec:
3231    case INDEX_op_x86_vperm2i128_vec:
3232    case INDEX_op_x86_punpckl_vec:
3233    case INDEX_op_x86_punpckh_vec:
3234#if TCG_TARGET_REG_BITS == 32
3235    case INDEX_op_dup2_vec:
3236#endif
3237        return &x_x_x;
3238    case INDEX_op_abs_vec:
3239    case INDEX_op_dup_vec:
3240    case INDEX_op_shli_vec:
3241    case INDEX_op_shri_vec:
3242    case INDEX_op_sari_vec:
3243    case INDEX_op_x86_psrldq_vec:
3244        return &x_x;
3245    case INDEX_op_x86_vpblendvb_vec:
3246        return &x_x_x_x;
3247
3248    default:
3249        break;
3250    }
3251    return NULL;
3252}
3253
3254int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3255{
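    /*
     * Return 1 if the opcode is supported directly for this element size,
     * 0 if it is not supported at all, and -1 if it can be implemented
     * by expansion via tcg_expand_vec_op.
     */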
3256    switch (opc) {
3257    case INDEX_op_add_vec:
3258    case INDEX_op_sub_vec:
3259    case INDEX_op_and_vec:
3260    case INDEX_op_or_vec:
3261    case INDEX_op_xor_vec:
3262    case INDEX_op_andc_vec:
3263        return 1;
3264    case INDEX_op_rotli_vec:
3265    case INDEX_op_cmp_vec:
3266    case INDEX_op_cmpsel_vec:
3267        return -1;
3268
3269    case INDEX_op_shli_vec:
3270    case INDEX_op_shri_vec:
3271        /* We must expand the operation for MO_8.  */
3272        return vece == MO_8 ? -1 : 1;
3273
3274    case INDEX_op_sari_vec:
3275        /* We must expand the operation for MO_8.  */
3276        if (vece == MO_8) {
3277            return -1;
3278        }
3279        /* We can emulate this for MO_64, but it does not pay off
3280           unless we're producing at least 4 values.  */
3281        if (vece == MO_64) {
3282            return type >= TCG_TYPE_V256 ? -1 : 0;
3283        }
3284        return 1;
3285
3286    case INDEX_op_shls_vec:
3287    case INDEX_op_shrs_vec:
3288        return vece >= MO_16;
3289    case INDEX_op_sars_vec:
3290        return vece >= MO_16 && vece <= MO_32;
3291    case INDEX_op_rotls_vec:
3292        return vece >= MO_16 ? -1 : 0;
3293
3294    case INDEX_op_shlv_vec:
3295    case INDEX_op_shrv_vec:
3296        return have_avx2 && vece >= MO_32;
3297    case INDEX_op_sarv_vec:
3298        return have_avx2 && vece == MO_32;
3299    case INDEX_op_rotlv_vec:
3300    case INDEX_op_rotrv_vec:
3301        return have_avx2 && vece >= MO_32 ? -1 : 0;
3302
3303    case INDEX_op_mul_vec:
3304        if (vece == MO_8) {
3305            /* We can expand the operation for MO_8.  */
3306            return -1;
3307        }
3308        if (vece == MO_64) {
3309            return 0;
3310        }
3311        return 1;
3312
3313    case INDEX_op_ssadd_vec:
3314    case INDEX_op_usadd_vec:
3315    case INDEX_op_sssub_vec:
3316    case INDEX_op_ussub_vec:
3317        return vece <= MO_16;
3318    case INDEX_op_smin_vec:
3319    case INDEX_op_smax_vec:
3320    case INDEX_op_umin_vec:
3321    case INDEX_op_umax_vec:
3322    case INDEX_op_abs_vec:
3323        return vece <= MO_32;
3324
3325    default:
3326        return 0;
3327    }
3328}
3329
3330static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3331                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3332{
3333    TCGv_vec t1, t2;
3334
3335    tcg_debug_assert(vece == MO_8);
3336
3337    t1 = tcg_temp_new_vec(type);
3338    t2 = tcg_temp_new_vec(type);
3339
3340    /*
3341     * Unpack to W, shift, and repack.  Tricky bits:
3342     * (1) Use punpck*bw x,x to produce DDCCBBAA,
3343     *     i.e. duplicate into the other half of the 16-bit lane.
3344     * (2) For right-shift, add 8 so that the high half of the lane
3345     *     becomes zero.  For left-shift and left-rotate, we must
3346     *     shift up and down again.
3347     * (3) Step 2 leaves high half zero such that PACKUSWB
3348     *     (pack with unsigned saturation) does not modify
3349     *     the quantity.
3350     */
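    /*
     * For example, a right shift by 3 of the byte 0xa5: the unpack yields
     * the 16-bit lane 0xa5a5, shifting right by 3 + 8 = 11 yields 0x0014,
     * and the unsigned saturating pack stores 0x14, which is 0xa5 >> 3.
     */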
3351    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3352              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3353    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3354              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3355
3356    if (opc != INDEX_op_rotli_vec) {
3357        imm += 8;
3358    }
3359    if (opc == INDEX_op_shri_vec) {
3360        tcg_gen_shri_vec(MO_16, t1, t1, imm);
3361        tcg_gen_shri_vec(MO_16, t2, t2, imm);
3362    } else {
3363        tcg_gen_shli_vec(MO_16, t1, t1, imm);
3364        tcg_gen_shli_vec(MO_16, t2, t2, imm);
3365        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3366        tcg_gen_shri_vec(MO_16, t2, t2, 8);
3367    }
3368
3369    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3370              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3371    tcg_temp_free_vec(t1);
3372    tcg_temp_free_vec(t2);
3373}
3374
3375static void expand_vec_sari(TCGType type, unsigned vece,
3376                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3377{
3378    TCGv_vec t1, t2;
3379
3380    switch (vece) {
3381    case MO_8:
3382        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3383        t1 = tcg_temp_new_vec(type);
3384        t2 = tcg_temp_new_vec(type);
3385        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3386                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3387        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3388                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3389        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3390        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3391        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3392                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3393        tcg_temp_free_vec(t1);
3394        tcg_temp_free_vec(t2);
3395        break;
3396
3397    case MO_64:
3398        if (imm <= 32) {
3399            /*
3400             * We can emulate a small sign extend by performing an arithmetic
3401             * 32-bit shift and overwriting the high half of a 64-bit logical
3402             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3403             * does not, so we have to bound the smaller shift -- we get the
3404             * same result in the high half either way.
3405             */
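            /*
             * The 0xaa (0b10101010) blend mask selects the odd 32-bit
             * elements, i.e. the high half of each 64-bit lane, from the
             * arithmetic shift in t1, while the even elements keep the
             * logical shift already in v0.
             */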
3406            t1 = tcg_temp_new_vec(type);
3407            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3408            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3409            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3410                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3411                      tcgv_vec_arg(t1), 0xaa);
3412            tcg_temp_free_vec(t1);
3413        } else {
3414            /* Otherwise we will need to use a compare vs 0 to produce
3415             * the sign-extend, shift and merge.
3416             */
3417            t1 = tcg_const_zeros_vec(type);
3418            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3419            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3420            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3421            tcg_gen_or_vec(MO_64, v0, v0, t1);
3422            tcg_temp_free_vec(t1);
3423        }
3424        break;
3425
3426    default:
3427        g_assert_not_reached();
3428    }
3429}
3430
3431static void expand_vec_rotli(TCGType type, unsigned vece,
3432                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3433{
3434    TCGv_vec t;
3435
3436    if (vece == MO_8) {
3437        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3438        return;
3439    }
3440
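    /* rol(x, i) == (x << i) | (x >> (width - i)), width = 8 << vece bits. */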
3441    t = tcg_temp_new_vec(type);
3442    tcg_gen_shli_vec(vece, t, v1, imm);
3443    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3444    tcg_gen_or_vec(vece, v0, v0, t);
3445    tcg_temp_free_vec(t);
3446}
3447
3448static void expand_vec_rotls(TCGType type, unsigned vece,
3449                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3450{
3451    TCGv_i32 rsh;
3452    TCGv_vec t;
3453
3454    tcg_debug_assert(vece != MO_8);
3455
3456    t = tcg_temp_new_vec(type);
3457    rsh = tcg_temp_new_i32();
3458
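    /* The right shift count is (width - lsh) mod width, i.e.
       (-lsh) & (width - 1); combine the two variable shifts.  */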
3459    tcg_gen_neg_i32(rsh, lsh);
3460    tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3461    tcg_gen_shls_vec(vece, t, v1, lsh);
3462    tcg_gen_shrs_vec(vece, v0, v1, rsh);
3463    tcg_gen_or_vec(vece, v0, v0, t);
3464    tcg_temp_free_vec(t);
3465    tcg_temp_free_i32(rsh);
3466}
3467
3468static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3469                            TCGv_vec v1, TCGv_vec sh, bool right)
3470{
3471    TCGv_vec t = tcg_temp_new_vec(type);
3472
3473    tcg_gen_dupi_vec(vece, t, 8 << vece);
3474    tcg_gen_sub_vec(vece, t, t, sh);
3475    if (right) {
3476        tcg_gen_shlv_vec(vece, t, v1, t);
3477        tcg_gen_shrv_vec(vece, v0, v1, sh);
3478    } else {
3479        tcg_gen_shrv_vec(vece, t, v1, t);
3480        tcg_gen_shlv_vec(vece, v0, v1, sh);
3481    }
3482    tcg_gen_or_vec(vece, v0, v0, t);
3483    tcg_temp_free_vec(t);
3484}
3485
3486static void expand_vec_mul(TCGType type, unsigned vece,
3487                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3488{
3489    TCGv_vec t1, t2, t3, t4, zero;
3490
3491    tcg_debug_assert(vece == MO_8);
3492
3493    /*
3494     * Unpack v1 bytes to words, 0 | x.
3495     * Unpack v2 bytes to words, y | 0.
3496     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3497     * Shift logical right by 8 bits to clear the high 8 bits before
3498     * using an unsigned saturated pack.
3499     *
3500     * The difference between the V64, V128 and V256 cases is merely how
3501     * we distribute the expansion between temporaries.
3502     */
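    /*
     * For example, x = 3 and y = 5 unpack to the 16-bit lanes 0x0003 and
     * 0x0500; their product is 0x0f00, the shift right by 8 yields 0x000f,
     * and the pack stores the byte result 0x0f.
     */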
3503    switch (type) {
3504    case TCG_TYPE_V64:
3505        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3506        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3507        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3508        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3509                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3510        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3511                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3512        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3513        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3514        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3515                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3516        tcg_temp_free_vec(t1);
3517        tcg_temp_free_vec(t2);
3518        break;
3519
3520    case TCG_TYPE_V128:
3521    case TCG_TYPE_V256:
3522        t1 = tcg_temp_new_vec(type);
3523        t2 = tcg_temp_new_vec(type);
3524        t3 = tcg_temp_new_vec(type);
3525        t4 = tcg_temp_new_vec(type);
3526        zero = tcg_constant_vec(type, MO_8, 0);
3527        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3528                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3529        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3530                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3531        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3532                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3533        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3534                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3535        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3536        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3537        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3538        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3539        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3540                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3541        tcg_temp_free_vec(t1);
3542        tcg_temp_free_vec(t2);
3543        tcg_temp_free_vec(t3);
3544        tcg_temp_free_vec(t4);
3545        break;
3546
3547    default:
3548        g_assert_not_reached();
3549    }
3550}
3551
3552static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3553                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3554{
3555    enum {
3556        NEED_INV  = 1,
3557        NEED_SWAP = 2,
3558        NEED_BIAS = 4,
3559        NEED_UMIN = 8,
3560        NEED_UMAX = 16,
3561    };
3562    TCGv_vec t1, t2, t3;
3563    uint8_t fixup;
3564
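    /*
     * SSE/AVX provide only equality (PCMPEQ*) and signed greater-than
     * (PCMPGT*), so every other condition is reduced to one of those:
     * invert the result, swap the operands, rewrite an unsigned compare
     * as an equality test against umin/umax (unsigned x <= y iff
     * umin(x, y) == x) where those instructions exist, or bias both
     * operands by the sign bit so that a signed compare gives the
     * unsigned result.
     */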
3565    switch (cond) {
3566    case TCG_COND_EQ:
3567    case TCG_COND_GT:
3568        fixup = 0;
3569        break;
3570    case TCG_COND_NE:
3571    case TCG_COND_LE:
3572        fixup = NEED_INV;
3573        break;
3574    case TCG_COND_LT:
3575        fixup = NEED_SWAP;
3576        break;
3577    case TCG_COND_GE:
3578        fixup = NEED_SWAP | NEED_INV;
3579        break;
3580    case TCG_COND_LEU:
3581        if (vece <= MO_32) {
3582            fixup = NEED_UMIN;
3583        } else {
3584            fixup = NEED_BIAS | NEED_INV;
3585        }
3586        break;
3587    case TCG_COND_GTU:
3588        if (vece <= MO_32) {
3589            fixup = NEED_UMIN | NEED_INV;
3590        } else {
3591            fixup = NEED_BIAS;
3592        }
3593        break;
3594    case TCG_COND_GEU:
3595        if (vece <= MO_32) {
3596            fixup = NEED_UMAX;
3597        } else {
3598            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3599        }
3600        break;
3601    case TCG_COND_LTU:
3602        if (vece <= MO_32) {
3603            fixup = NEED_UMAX | NEED_INV;
3604        } else {
3605            fixup = NEED_BIAS | NEED_SWAP;
3606        }
3607        break;
3608    default:
3609        g_assert_not_reached();
3610    }
3611
3612    if (fixup & NEED_INV) {
3613        cond = tcg_invert_cond(cond);
3614    }
3615    if (fixup & NEED_SWAP) {
3616        t1 = v1, v1 = v2, v2 = t1;
3617        cond = tcg_swap_cond(cond);
3618    }
3619
3620    t1 = t2 = NULL;
3621    if (fixup & (NEED_UMIN | NEED_UMAX)) {
3622        t1 = tcg_temp_new_vec(type);
3623        if (fixup & NEED_UMIN) {
3624            tcg_gen_umin_vec(vece, t1, v1, v2);
3625        } else {
3626            tcg_gen_umax_vec(vece, t1, v1, v2);
3627        }
3628        v2 = t1;
3629        cond = TCG_COND_EQ;
3630    } else if (fixup & NEED_BIAS) {
3631        t1 = tcg_temp_new_vec(type);
3632        t2 = tcg_temp_new_vec(type);
3633        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3634        tcg_gen_sub_vec(vece, t1, v1, t3);
3635        tcg_gen_sub_vec(vece, t2, v2, t3);
3636        v1 = t1;
3637        v2 = t2;
3638        cond = tcg_signed_cond(cond);
3639    }
3640
3641    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3642    /* Expand directly; do not recurse.  */
3643    vec_gen_4(INDEX_op_cmp_vec, type, vece,
3644              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3645
3646    if (t1) {
3647        tcg_temp_free_vec(t1);
3648        if (t2) {
3649            tcg_temp_free_vec(t2);
3650        }
3651    }
3652    return fixup & NEED_INV;
3653}
3654
3655static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3656                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3657{
3658    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3659        tcg_gen_not_vec(vece, v0, v0);
3660    }
3661}
3662
3663static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3664                              TCGv_vec c1, TCGv_vec c2,
3665                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3666{
3667    TCGv_vec t = tcg_temp_new_vec(type);
3668
3669    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3670        /* Invert the sense of the compare by swapping arguments.  */
3671        TCGv_vec x;
3672        x = v3, v3 = v4, v4 = x;
3673    }
3674    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3675              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3676              tcgv_vec_arg(v3), tcgv_vec_arg(t));
3677    tcg_temp_free_vec(t);
3678}
3679
3680void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3681                       TCGArg a0, ...)
3682{
3683    va_list va;
3684    TCGArg a2;
3685    TCGv_vec v0, v1, v2, v3, v4;
3686
3687    va_start(va, a0);
3688    v0 = temp_tcgv_vec(arg_temp(a0));
3689    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3690    a2 = va_arg(va, TCGArg);
3691
3692    switch (opc) {
3693    case INDEX_op_shli_vec:
3694    case INDEX_op_shri_vec:
3695        expand_vec_shi(type, vece, opc, v0, v1, a2);
3696        break;
3697
3698    case INDEX_op_sari_vec:
3699        expand_vec_sari(type, vece, v0, v1, a2);
3700        break;
3701
3702    case INDEX_op_rotli_vec:
3703        expand_vec_rotli(type, vece, v0, v1, a2);
3704        break;
3705
3706    case INDEX_op_rotls_vec:
3707        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3708        break;
3709
3710    case INDEX_op_rotlv_vec:
3711        v2 = temp_tcgv_vec(arg_temp(a2));
3712        expand_vec_rotv(type, vece, v0, v1, v2, false);
3713        break;
3714    case INDEX_op_rotrv_vec:
3715        v2 = temp_tcgv_vec(arg_temp(a2));
3716        expand_vec_rotv(type, vece, v0, v1, v2, true);
3717        break;
3718
3719    case INDEX_op_mul_vec:
3720        v2 = temp_tcgv_vec(arg_temp(a2));
3721        expand_vec_mul(type, vece, v0, v1, v2);
3722        break;
3723
3724    case INDEX_op_cmp_vec:
3725        v2 = temp_tcgv_vec(arg_temp(a2));
3726        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3727        break;
3728
3729    case INDEX_op_cmpsel_vec:
3730        v2 = temp_tcgv_vec(arg_temp(a2));
3731        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3732        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3733        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3734        break;
3735
3736    default:
3737        break;
3738    }
3739
3740    va_end(va);
3741}
3742
3743static const int tcg_target_callee_save_regs[] = {
3744#if TCG_TARGET_REG_BITS == 64
3745    TCG_REG_RBP,
3746    TCG_REG_RBX,
3747#if defined(_WIN64)
3748    TCG_REG_RDI,
3749    TCG_REG_RSI,
3750#endif
3751    TCG_REG_R12,
3752    TCG_REG_R13,
3753    TCG_REG_R14, /* Currently used for the global env. */
3754    TCG_REG_R15,
3755#else
3756    TCG_REG_EBP, /* Currently used for the global env. */
3757    TCG_REG_EBX,
3758    TCG_REG_ESI,
3759    TCG_REG_EDI,
3760#endif
3761};
3762
3763/* Compute frame size via macros, to share between tcg_target_qemu_prologue
3764   and tcg_register_jit.  */
3765
3766#define PUSH_SIZE \
3767    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3768     * (TCG_TARGET_REG_BITS / 8))
3769
3770#define FRAME_SIZE \
3771    ((PUSH_SIZE \
3772      + TCG_STATIC_CALL_ARGS_SIZE \
3773      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3774      + TCG_TARGET_STACK_ALIGN - 1) \
3775     & ~(TCG_TARGET_STACK_ALIGN - 1))
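/*
 * As an illustration, assuming the usual TCG_STATIC_CALL_ARGS_SIZE of 128
 * and CPU_TEMP_BUF_NLONGS of 128: a 64-bit SysV host saves six registers,
 * so PUSH_SIZE = (1 + 6) * 8 = 56 bytes (the extra slot is the return
 * address pushed by the call into this code), and FRAME_SIZE rounds
 * 56 + 128 + 1024 up to the 16-byte stack alignment, i.e. 1216 bytes.
 */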
3776
3777/* Generate global QEMU prologue and epilogue code */
3778static void tcg_target_qemu_prologue(TCGContext *s)
3779{
3780    int i, stack_addend;
3781
3782    /* TB prologue */
3783
3784    /* Reserve some stack space, also for TCG temps.  */
3785    stack_addend = FRAME_SIZE - PUSH_SIZE;
3786    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3787                  CPU_TEMP_BUF_NLONGS * sizeof(long));
3788
3789    /* Save all callee saved registers.  */
3790    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3791        tcg_out_push(s, tcg_target_callee_save_regs[i]);
3792    }
3793
3794#if TCG_TARGET_REG_BITS == 32
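    /*
     * With the cdecl convention the two arguments (env, tb pointer) sit
     * just above the return address, so after pushing the callee-saved
     * registers env is at (nregs + 1) * 4 and the tb pointer at
     * (nregs + 2) * 4 from %esp; the jump below adds stack_addend to
     * account for the frame allocated in between.
     */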
3795    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3796               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3797    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3798    /* jmp *tb.  */
3799    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3800                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3801                         + stack_addend);
3802#else
3803# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
3804    if (guest_base) {
3805        int seg = setup_guest_base_seg();
3806        if (seg != 0) {
3807            x86_guest_base_seg = seg;
3808        } else if (guest_base == (int32_t)guest_base) {
3809            x86_guest_base_offset = guest_base;
3810        } else {
3811            /* Choose R12 because, as a base, it requires a SIB byte. */
3812            x86_guest_base_index = TCG_REG_R12;
3813            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
3814            tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
3815        }
3816    }
3817# endif
3818    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3819    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3820    /* jmp *tb.  */
3821    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3822#endif
3823
3824    /*
3825     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3826     * and fall through to the rest of the epilogue.
3827     */
3828    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
3829    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3830
3831    /* TB epilogue */
3832    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
3833
3834    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3835
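    /*
     * If AVX may have been used, clear the upper YMM state before
     * returning so that the caller's SSE code does not pay AVX-to-SSE
     * transition penalties.
     */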
3836    if (have_avx2) {
3837        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3838    }
3839    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3840        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3841    }
3842    tcg_out_opc(s, OPC_RET, 0, 0, 0);
3843}
3844
3845static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3846{
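    /* 0x90 is the single-byte x86 NOP. */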
3847    memset(p, 0x90, count);
3848}
3849
3850static void tcg_target_init(TCGContext *s)
3851{
3852#ifdef CONFIG_CPUID_H
3853    unsigned a, b, c, d, b7 = 0;
3854    int max = __get_cpuid_max(0, 0);
3855
3856    if (max >= 7) {
3857        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
3858        __cpuid_count(7, 0, a, b7, c, d);
3859        have_bmi1 = (b7 & bit_BMI) != 0;
3860        have_bmi2 = (b7 & bit_BMI2) != 0;
3861    }
3862
3863    if (max >= 1) {
3864        __cpuid(1, a, b, c, d);
3865#ifndef have_cmov
3866        /* For 32-bit, 99% certainty that we're running on hardware that
3867           supports cmov, but we still need to check.  In case cmov is not
3868           available, we'll use a small forward branch.  */
3869        have_cmov = (d & bit_CMOV) != 0;
3870#endif
3871
3872        /* MOVBE was introduced with Intel Atom and Haswell CPUs, so we
3873           need to probe for it.  */
3874        have_movbe = (c & bit_MOVBE) != 0;
3875        have_popcnt = (c & bit_POPCNT) != 0;
3876
3877        /* There are a number of things we must check before we can be
3878           sure of not hitting invalid opcode.  */
3879        if (c & bit_OSXSAVE) {
3880            unsigned xcrl, xcrh;
3881            /* The xgetbv instruction is not available to older versions of
3882             * the assembler, so we encode the instruction manually.
3883             */
3884            asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
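            /*
             * XCR0 bits 1 and 2 indicate that the OS saves and restores
             * the XMM and YMM state; both must be enabled before AVX
             * instructions may be used.
             */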
3885            if ((xcrl & 6) == 6) {
3886                have_avx1 = (c & bit_AVX) != 0;
3887                have_avx2 = (b7 & bit_AVX2) != 0;
3888            }
3889        }
3890    }
3891
3892    max = __get_cpuid_max(0x80000000, 0);
3893    if (max >= 1) {
3894        __cpuid(0x80000001, a, b, c, d);
3895        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
3896        have_lzcnt = (c & bit_LZCNT) != 0;
3897    }
3898#endif /* CONFIG_CPUID_H */
3899
3900    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3901    if (TCG_TARGET_REG_BITS == 64) {
3902        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3903    }
3904    if (have_avx1) {
3905        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3906        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3907    }
3908    if (have_avx2) {
3909        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3910    }
3911
3912    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3913    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3914    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3915    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3916    if (TCG_TARGET_REG_BITS == 64) {
3917#if !defined(_WIN64)
3918        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3919        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3920#endif
3921        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3922        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3923        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3924        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3925    }
3926
3927    s->reserved_regs = 0;
3928    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3929}
3930
3931typedef struct {
3932    DebugFrameHeader h;
3933    uint8_t fde_def_cfa[4];
3934    uint8_t fde_reg_ofs[14];
3935} DebugFrame;
3936
3937/* We're expecting a 2 byte uleb128 encoded value.  */
3938QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3939
3940#if !defined(__ELF__)
3941    /* Host machine without ELF. */
3942#elif TCG_TARGET_REG_BITS == 64
3943#define ELF_HOST_MACHINE EM_X86_64
3944static const DebugFrame debug_frame = {
3945    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3946    .h.cie.id = -1,
3947    .h.cie.version = 1,
3948    .h.cie.code_align = 1,
3949    .h.cie.data_align = 0x78,             /* sleb128 -8 */
3950    .h.cie.return_column = 16,
3951
3952    /* Total FDE size does not include the "len" member.  */
3953    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3954
3955    .fde_def_cfa = {
3956        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
3957        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3958        (FRAME_SIZE >> 7)
3959    },
3960    .fde_reg_ofs = {
3961        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
3962        /* The following ordering must match tcg_target_callee_save_regs.  */
3963        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
3964        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
3965        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
3966        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
3967        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
3968        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
3969    }
3970};
3971#else
3972#define ELF_HOST_MACHINE EM_386
3973static const DebugFrame debug_frame = {
3974    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3975    .h.cie.id = -1,
3976    .h.cie.version = 1,
3977    .h.cie.code_align = 1,
3978    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
3979    .h.cie.return_column = 8,
3980
3981    /* Total FDE size does not include the "len" member.  */
3982    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3983
3984    .fde_def_cfa = {
3985        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
3986        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3987        (FRAME_SIZE >> 7)
3988    },
3989    .fde_reg_ofs = {
3990        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
3991        /* The following ordering must match tcg_target_callee_save_regs.  */
3992        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
3993        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
3994        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
3995        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
3996    }
3997};
3998#endif
3999
4000#if defined(ELF_HOST_MACHINE)
4001void tcg_register_jit(const void *buf, size_t buf_size)
4002{
4003    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
4004}
4005#endif
4006