xref: /openbmc/qemu/tcg/i386/tcg-target.c.inc (revision b68a92f4cb16115025f41bc59e1b2f182a610370)
1/*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#include "../tcg-pool.c.inc"
26
#ifdef CONFIG_DEBUG_TCG
/*
 * Printable register names, compiled only when CONFIG_DEBUG_TCG is defined.
 * The table is positional: eight general registers (spelled %r.. or %e..
 * depending on TCG_TARGET_REG_BITS), then %r8-%r15, then the %xmm names
 * (%xmm8-%xmm15 are listed for 64-bit builds only).
 */
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif
42
/*
 * Register allocation order table: general registers first (the list
 * depends on TCG_TARGET_REG_BITS), followed by the vector registers.
 * NOTE(review): presumably earlier entries are preferred by the register
 * allocator; the consumer of this table is not visible here.
 */
static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI treats xmm6-xmm15 as callee-saved (non-volatile), and
       we do not save any of them.  Therefore on Win64 only xmm0-xmm5 may
       be allocated; the registers below are listed for other hosts only.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};
92
/*
 * Registers carrying integer call arguments, in argument order.
 *   Win64:          RCX, RDX, R8, R9
 *   other 64-bit:   RDI, RSI, RDX, RCX, R8, R9
 *   32-bit:         none (the table is empty)
 */
static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};
110
/*
 * Registers carrying call return values: EAX always, plus EDX as a second
 * entry on 32-bit builds.
 */
static const int tcg_target_call_oarg_regs[] = {
    TCG_REG_EAX,
#if TCG_TARGET_REG_BITS == 32
    TCG_REG_EDX
#endif
};
117
/* Constants we accept.  These are flag bits stored in TCGArgConstraint.ct
   and tested by tcg_target_const_match():
     S32 - value is unchanged by truncation to int32_t
     U32 - value is unchanged by truncation to uint32_t
     I32 - the bitwise inverse of the value is unchanged by truncation
           to int32_t
     WSZ - value equals the operand width (32 for TCG_TYPE_I32, else 64)  */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800
123
/* Registers used with the L constraint, which are the first two argument
   registers on x86_64, and two arbitrary call-clobbered registers (EAX and
   EDX) on i386.  The 'L' case of target_parse_constraint() removes both
   from the set of registers it allows.  */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif
134
135/* The host compiler should supply <cpuid.h> to enable runtime features
136   detection, as we're not going to go so far as our own inline assembly.
137   If not available, default values will be assumed.  */
138#if defined(CONFIG_CPUID_H)
139#include "qemu/cpuid.h"
140#endif
141
/* For 64-bit, we always know that CMOV is available.  For 32-bit it is a
   file-local flag when CONFIG_CPUID_H is defined, and the constant 0
   otherwise.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
#elif defined(CONFIG_CPUID_H)
static bool have_cmov;
#else
# define have_cmov 0
#endif

/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   it there.  Therefore we always define the variable.  */
bool have_bmi1;
bool have_popcnt;
bool have_avx1;
bool have_avx2;

/* File-local feature flags.  Without CONFIG_CPUID_H they collapse to the
   compile-time constant 0.  */
#ifdef CONFIG_CPUID_H
static bool have_movbe;
static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_movbe 0
# define have_bmi2 0
# define have_lzcnt 0
#endif
167
/* NOTE(review): presumably the address generated code jumps to when leaving
   a translation block; it is not assigned in this part of the file --
   confirm against the prologue generator.  */
static tcg_insn_unit *tb_ret_addr;
169
170static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
171                        intptr_t value, intptr_t addend)
172{
173    value += addend;
174    switch(type) {
175    case R_386_PC32:
176        value -= (uintptr_t)code_ptr;
177        if (value != (int32_t)value) {
178            return false;
179        }
180        /* FALLTHRU */
181    case R_386_32:
182        tcg_patch32(code_ptr, value);
183        break;
184    case R_386_PC8:
185        value -= (uintptr_t)code_ptr;
186        if (value != (int8_t)value) {
187            return false;
188        }
189        tcg_patch8(code_ptr, value);
190        break;
191    default:
192        tcg_abort();
193    }
194    return true;
195}
196
/* Register-set bit masks, one bit per register number.  64-bit: bits 0-15
   are the general registers and bits 16-31 the vector registers.  32-bit:
   bits 0-7 general, bits 16-23 vector.  */
#if TCG_TARGET_REG_BITS == 64
#define ALL_GENERAL_REGS   0x0000ffffu
#define ALL_VECTOR_REGS    0xffff0000u
#else
#define ALL_GENERAL_REGS   0x000000ffu
#define ALL_VECTOR_REGS    0x00ff0000u
#endif
204
205/* parse target specific constraints */
206static const char *target_parse_constraint(TCGArgConstraint *ct,
207                                           const char *ct_str, TCGType type)
208{
209    switch(*ct_str++) {
210    case 'a':
211        tcg_regset_set_reg(ct->regs, TCG_REG_EAX);
212        break;
213    case 'b':
214        tcg_regset_set_reg(ct->regs, TCG_REG_EBX);
215        break;
216    case 'c':
217        tcg_regset_set_reg(ct->regs, TCG_REG_ECX);
218        break;
219    case 'd':
220        tcg_regset_set_reg(ct->regs, TCG_REG_EDX);
221        break;
222    case 'S':
223        tcg_regset_set_reg(ct->regs, TCG_REG_ESI);
224        break;
225    case 'D':
226        tcg_regset_set_reg(ct->regs, TCG_REG_EDI);
227        break;
228    case 'q':
229        /* A register that can be used as a byte operand.  */
230        ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
231        break;
232    case 'Q':
233        /* A register with an addressable second byte (e.g. %ah).  */
234        ct->regs = 0xf;
235        break;
236    case 'r':
237        /* A general register.  */
238        ct->regs |= ALL_GENERAL_REGS;
239        break;
240    case 'W':
241        /* With TZCNT/LZCNT, we can have operand-size as an input.  */
242        ct->ct |= TCG_CT_CONST_WSZ;
243        break;
244    case 'x':
245        /* A vector register.  */
246        ct->regs |= ALL_VECTOR_REGS;
247        break;
248
249        /* qemu_ld/st address constraint */
250    case 'L':
251        ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
252        tcg_regset_reset_reg(ct->regs, TCG_REG_L0);
253        tcg_regset_reset_reg(ct->regs, TCG_REG_L1);
254        break;
255
256    case 'e':
257        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
258        break;
259    case 'Z':
260        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
261        break;
262    case 'I':
263        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
264        break;
265
266    default:
267        return NULL;
268    }
269    return ct_str;
270}
271
272/* test if a constant matches the constraint */
273static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
274                                         const TCGArgConstraint *arg_ct)
275{
276    int ct = arg_ct->ct;
277    if (ct & TCG_CT_CONST) {
278        return 1;
279    }
280    if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
281        return 1;
282    }
283    if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
284        return 1;
285    }
286    if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
287        return 1;
288    }
289    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
290        return 1;
291    }
292    return 0;
293}
294
/* Low three bits of a register number, as placed in ModRM/SIB fields.  */
# define LOWREGMASK(x)	((x) & 7)

/* Prefix flags.  These are ORed into an opcode value above its low byte;
   tcg_out_opc() and tcg_out_vex_opc() turn them into prefix/escape bytes
   (or VEX fields).  On 32-bit builds the REX-related flags and P_GS are 0,
   so ORing them into an opcode has no effect.  */
#define P_EXT		0x100		/* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         0x1000          /* Set REX.W = 1 */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_REXW		0
# define P_REXB_R	0
# define P_REXB_RM	0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
315
/* Opcode table.  Each value is the final opcode byte ORed with the P_*
   flags that select its prefix and escape bytes.  */
#define OPC_ARITH_EvIz	(0x81)
#define OPC_ARITH_EvIb	(0x83)
#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP	(0xc8 | P_EXT)
#define OPC_CALL_Jz	(0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32	(0x48)
#define OPC_IMUL_GvEv	(0xaf | P_EXT)
#define OPC_IMUL_GvEvIb	(0x6b)
#define OPC_IMUL_GvEvIz	(0x69)
#define OPC_INC_r32	(0x40)
#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
#define OPC_JCC_short	(0x70)		/* ... plus condition code */
#define OPC_JMP_long	(0xe9)
#define OPC_JMP_short	(0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz	(0xc7)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL	(0xbe | P_EXT)
#define OPC_MOVSWL	(0xbf | P_EXT)
#define OPC_MOVSLQ	(0x63 | P_REXW)
#define OPC_MOVZBL	(0xb6 | P_EXT)
#define OPC_MOVZWL	(0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32	(0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32	(0x50)
#define OPC_PUSH_Iv	(0x68)
#define OPC_PUSH_Ib	(0x6a)
#define OPC_RET		(0xc3)
#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1	(0xd1)
#define OPC_SHIFT_Ib	(0xc1)
#define OPC_SHIFT_cl	(0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL	(0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_REXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_REXW)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_REXW)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32	(0x90)

/* Group opcodes; the operation is selected by an extension value placed in
   the ModRM reg field (see the EXT3_* and EXT5_* constants).  */
#define OPC_GRP3_Ev	(0xf7)
#define OPC_GRP5	(0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
482
/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH_GvEv, shifted left by 3
   (see OPC_ADD_GvEv and tgen_arithr).  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3_Ev.  */
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev	0
#define EXT5_DEC_Ev	1
#define EXT5_CALLN_Ev	2
#define EXT5_JMPN_Ev	4
514
/* Condition codes to be added to OPC_JCC_{long,short}.  The same 4-bit
   codes are added to OPC_CMOVCC and OPC_SETCC, per the comments on those
   opcodes.
   NOTE(review): JCC_JMP (-1) is not a hardware condition code; presumably
   it is a marker for an unconditional jump -- confirm at its users.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf
533
/* Map a TCG comparison condition to the x86 condition code.  Signed
   comparisons use the L/GE/LE/G codes, unsigned ones the B/AE/BE/A codes.
   Conditions without an initializer here read back as 0 (JCC_JO).  */
static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};
546
547#if TCG_TARGET_REG_BITS == 64
548static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
549{
550    int rex;
551
552    if (opc & P_GS) {
553        tcg_out8(s, 0x65);
554    }
555    if (opc & P_DATA16) {
556        /* We should never be asking for both 16 and 64-bit operation.  */
557        tcg_debug_assert((opc & P_REXW) == 0);
558        tcg_out8(s, 0x66);
559    }
560    if (opc & P_SIMDF3) {
561        tcg_out8(s, 0xf3);
562    } else if (opc & P_SIMDF2) {
563        tcg_out8(s, 0xf2);
564    }
565
566    rex = 0;
567    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
568    rex |= (r & 8) >> 1;                /* REX.R */
569    rex |= (x & 8) >> 2;                /* REX.X */
570    rex |= (rm & 8) >> 3;               /* REX.B */
571
572    /* P_REXB_{R,RM} indicates that the given register is the low byte.
573       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
574       as otherwise the encoding indicates %[abcd]h.  Note that the values
575       that are ORed in merely indicate that the REX byte must be present;
576       those bits get discarded in output.  */
577    rex |= opc & (r >= 4 ? P_REXB_R : 0);
578    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
579
580    if (rex) {
581        tcg_out8(s, (uint8_t)(rex | 0x40));
582    }
583
584    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
585        tcg_out8(s, 0x0f);
586        if (opc & P_EXT38) {
587            tcg_out8(s, 0x38);
588        } else if (opc & P_EXT3A) {
589            tcg_out8(s, 0x3a);
590        }
591    }
592
593    tcg_out8(s, opc);
594}
595#else
596static void tcg_out_opc(TCGContext *s, int opc)
597{
598    if (opc & P_DATA16) {
599        tcg_out8(s, 0x66);
600    }
601    if (opc & P_SIMDF3) {
602        tcg_out8(s, 0xf3);
603    } else if (opc & P_SIMDF2) {
604        tcg_out8(s, 0xf2);
605    }
606    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
607        tcg_out8(s, 0x0f);
608        if (opc & P_EXT38) {
609            tcg_out8(s, 0x38);
610        } else if (opc & P_EXT3A) {
611            tcg_out8(s, 0x3a);
612        }
613    }
614    tcg_out8(s, opc);
615}
616/* Discard the register arguments to tcg_out_opc early, so as not to penalize
617   the 32-bit compilation paths.  This method works with all versions of gcc,
618   whereas relying on optimization may not be able to exclude them.  */
619#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
620#endif
621
622static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
623{
624    tcg_out_opc(s, opc, r, rm, 0);
625    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
626}
627
628static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
629                            int rm, int index)
630{
631    int tmp;
632
633    /* Use the two byte form if possible, which cannot encode
634       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
635    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
636        && ((rm | index) & 8) == 0) {
637        /* Two byte VEX prefix.  */
638        tcg_out8(s, 0xc5);
639
640        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
641    } else {
642        /* Three byte VEX prefix.  */
643        tcg_out8(s, 0xc4);
644
645        /* VEX.m-mmmm */
646        if (opc & P_EXT3A) {
647            tmp = 3;
648        } else if (opc & P_EXT38) {
649            tmp = 2;
650        } else if (opc & P_EXT) {
651            tmp = 1;
652        } else {
653            g_assert_not_reached();
654        }
655        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
656        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
657        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
658        tcg_out8(s, tmp);
659
660        tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
661    }
662
663    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
664    /* VEX.pp */
665    if (opc & P_DATA16) {
666        tmp |= 1;                          /* 0x66 */
667    } else if (opc & P_SIMDF3) {
668        tmp |= 2;                          /* 0xf3 */
669    } else if (opc & P_SIMDF2) {
670        tmp |= 3;                          /* 0xf2 */
671    }
672    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
673    tcg_out8(s, tmp);
674    tcg_out8(s, opc);
675}
676
677static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
678{
679    tcg_out_vex_opc(s, opc, r, v, rm, 0);
680    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
681}
682
683/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
684   We handle either RM and INDEX missing with a negative value.  In 64-bit
685   mode for absolute addresses, ~RM is the size of the immediate operand
686   that will follow the instruction.  */
687
688static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
689                               int shift, intptr_t offset)
690{
691    int mod, len;
692
693    if (index < 0 && rm < 0) {
694        if (TCG_TARGET_REG_BITS == 64) {
695            /* Try for a rip-relative addressing mode.  This has replaced
696               the 32-bit-mode absolute addressing encoding.  */
697            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
698            intptr_t disp = offset - pc;
699            if (disp == (int32_t)disp) {
700                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
701                tcg_out32(s, disp);
702                return;
703            }
704
705            /* Try for an absolute address encoding.  This requires the
706               use of the MODRM+SIB encoding and is therefore larger than
707               rip-relative addressing.  */
708            if (offset == (int32_t)offset) {
709                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
710                tcg_out8(s, (4 << 3) | 5);
711                tcg_out32(s, offset);
712                return;
713            }
714
715            /* ??? The memory isn't directly addressable.  */
716            g_assert_not_reached();
717        } else {
718            /* Absolute address.  */
719            tcg_out8(s, (r << 3) | 5);
720            tcg_out32(s, offset);
721            return;
722        }
723    }
724
725    /* Find the length of the immediate addend.  Note that the encoding
726       that would be used for (%ebp) indicates absolute addressing.  */
727    if (rm < 0) {
728        mod = 0, len = 4, rm = 5;
729    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
730        mod = 0, len = 0;
731    } else if (offset == (int8_t)offset) {
732        mod = 0x40, len = 1;
733    } else {
734        mod = 0x80, len = 4;
735    }
736
737    /* Use a single byte MODRM format if possible.  Note that the encoding
738       that would be used for %esp is the escape to the two byte form.  */
739    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
740        /* Single byte MODRM format.  */
741        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
742    } else {
743        /* Two byte MODRM+SIB format.  */
744
745        /* Note that the encoding that would place %esp into the index
746           field indicates no index register.  In 64-bit mode, the REX.X
747           bit counts, so %r12 can be used as the index.  */
748        if (index < 0) {
749            index = 4;
750        } else {
751            tcg_debug_assert(index != TCG_REG_ESP);
752        }
753
754        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
755        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
756    }
757
758    if (len == 1) {
759        tcg_out8(s, offset);
760    } else if (len == 4) {
761        tcg_out32(s, offset);
762    }
763}
764
765static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
766                                     int index, int shift, intptr_t offset)
767{
768    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
769    tcg_out_sib_offset(s, r, rm, index, shift, offset);
770}
771
772static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
773                                         int rm, int index, int shift,
774                                         intptr_t offset)
775{
776    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
777    tcg_out_sib_offset(s, r, rm, index, shift, offset);
778}
779
780/* A simplification of the above with no index or shift.  */
781static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
782                                        int rm, intptr_t offset)
783{
784    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
785}
786
787static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
788                                            int v, int rm, intptr_t offset)
789{
790    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
791}
792
793/* Output an opcode with an expected reference to the constant pool.  */
794static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
795{
796    tcg_out_opc(s, opc, r, 0, 0);
797    /* Absolute for 32-bit, pc-relative for 64-bit.  */
798    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
799    tcg_out32(s, 0);
800}
801
802/* Output an opcode with an expected reference to the constant pool.  */
803static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
804{
805    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
806    /* Absolute for 32-bit, pc-relative for 64-bit.  */
807    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
808    tcg_out32(s, 0);
809}
810
811/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
812static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
813{
814    /* Propagate an opcode prefix, such as P_REXW.  */
815    int ext = subop & ~0x7;
816    subop &= 0x7;
817
818    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
819}
820
821static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
822{
823    int rexw = 0;
824
825    if (arg == ret) {
826        return true;
827    }
828    switch (type) {
829    case TCG_TYPE_I64:
830        rexw = P_REXW;
831        /* fallthru */
832    case TCG_TYPE_I32:
833        if (ret < 16) {
834            if (arg < 16) {
835                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
836            } else {
837                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
838            }
839        } else {
840            if (arg < 16) {
841                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
842            } else {
843                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
844            }
845        }
846        break;
847
848    case TCG_TYPE_V64:
849        tcg_debug_assert(ret >= 16 && arg >= 16);
850        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
851        break;
852    case TCG_TYPE_V128:
853        tcg_debug_assert(ret >= 16 && arg >= 16);
854        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
855        break;
856    case TCG_TYPE_V256:
857        tcg_debug_assert(ret >= 16 && arg >= 16);
858        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
859        break;
860
861    default:
862        g_assert_not_reached();
863    }
864    return true;
865}
866
/* AVX2 broadcast opcodes, indexed by the element-size code (vece) passed to
   tcg_out_dup_vec / tcg_out_dupm_vec: byte, word, dword, qword.
   The table has exactly four entries and callers index it unchecked.  */
static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};
871
872static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
873                            TCGReg r, TCGReg a)
874{
875    if (have_avx2) {
876        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
877        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
878    } else {
879        switch (vece) {
880        case MO_8:
881            /* ??? With zero in a register, use PSHUFB.  */
882            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
883            a = r;
884            /* FALLTHRU */
885        case MO_16:
886            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
887            a = r;
888            /* FALLTHRU */
889        case MO_32:
890            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
891            /* imm8 operand: all output lanes selected from input lane 0.  */
892            tcg_out8(s, 0);
893            break;
894        case MO_64:
895            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
896            break;
897        default:
898            g_assert_not_reached();
899        }
900    }
901    return true;
902}
903
904static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
905                             TCGReg r, TCGReg base, intptr_t offset)
906{
907    if (have_avx2) {
908        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
909        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
910                                 r, 0, base, offset);
911    } else {
912        switch (vece) {
913        case MO_64:
914            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
915            break;
916        case MO_32:
917            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
918            break;
919        case MO_16:
920            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
921            tcg_out8(s, 0); /* imm8 */
922            tcg_out_dup_vec(s, type, vece, r, r);
923            break;
924        case MO_8:
925            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
926            tcg_out8(s, 0); /* imm8 */
927            tcg_out_dup_vec(s, type, vece, r, r);
928            break;
929        default:
930            g_assert_not_reached();
931        }
932    }
933    return true;
934}
935
936static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
937                             TCGReg ret, tcg_target_long arg)
938{
939    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
940
941    if (arg == 0) {
942        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
943        return;
944    }
945    if (arg == -1) {
946        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
947        return;
948    }
949
950    if (TCG_TARGET_REG_BITS == 64) {
951        if (type == TCG_TYPE_V64) {
952            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
953        } else if (have_avx2) {
954            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
955        } else {
956            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
957        }
958        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
959    } else {
960        if (have_avx2) {
961            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
962        } else {
963            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
964        }
965        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
966    }
967}
968
969static void tcg_out_movi(TCGContext *s, TCGType type,
970                         TCGReg ret, tcg_target_long arg)
971{
972    tcg_target_long diff;
973
974    switch (type) {
975    case TCG_TYPE_I32:
976#if TCG_TARGET_REG_BITS == 64
977    case TCG_TYPE_I64:
978#endif
979        if (ret < 16) {
980            break;
981        }
982        /* fallthru */
983    case TCG_TYPE_V64:
984    case TCG_TYPE_V128:
985    case TCG_TYPE_V256:
986        tcg_debug_assert(ret >= 16);
987        tcg_out_dupi_vec(s, type, ret, arg);
988        return;
989    default:
990        g_assert_not_reached();
991    }
992
993    if (arg == 0) {
994        tgen_arithr(s, ARITH_XOR, ret, ret);
995        return;
996    }
997    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
998        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
999        tcg_out32(s, arg);
1000        return;
1001    }
1002    if (arg == (int32_t)arg) {
1003        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1004        tcg_out32(s, arg);
1005        return;
1006    }
1007
1008    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1009    diff = arg - ((uintptr_t)s->code_ptr + 7);
1010    if (diff == (int32_t)diff) {
1011        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1012        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1013        tcg_out32(s, diff);
1014        return;
1015    }
1016
1017    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1018    tcg_out64(s, arg);
1019}
1020
1021static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1022{
1023    if (val == (int8_t)val) {
1024        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1025        tcg_out8(s, val);
1026    } else if (val == (int32_t)val) {
1027        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1028        tcg_out32(s, val);
1029    } else {
1030        tcg_abort();
1031    }
1032}
1033
1034static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1035{
1036    /* Given the strength of x86 memory ordering, we only need care for
1037       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1038       faster than "mfence", so don't bother with the sse insn.  */
1039    if (a0 & TCG_MO_ST_LD) {
1040        tcg_out8(s, 0xf0);
1041        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1042        tcg_out8(s, 0);
1043    }
1044}
1045
1046static inline void tcg_out_push(TCGContext *s, int reg)
1047{
1048    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1049}
1050
1051static inline void tcg_out_pop(TCGContext *s, int reg)
1052{
1053    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1054}
1055
1056static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1057                       TCGReg arg1, intptr_t arg2)
1058{
1059    switch (type) {
1060    case TCG_TYPE_I32:
1061        if (ret < 16) {
1062            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1063        } else {
1064            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1065        }
1066        break;
1067    case TCG_TYPE_I64:
1068        if (ret < 16) {
1069            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1070            break;
1071        }
1072        /* FALLTHRU */
1073    case TCG_TYPE_V64:
1074        /* There is no instruction that can validate 8-byte alignment.  */
1075        tcg_debug_assert(ret >= 16);
1076        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1077        break;
1078    case TCG_TYPE_V128:
1079        /*
1080         * The gvec infrastructure is asserts that v128 vector loads
1081         * and stores use a 16-byte aligned offset.  Validate that the
1082         * final pointer is aligned by using an insn that will SIGSEGV.
1083         */
1084        tcg_debug_assert(ret >= 16);
1085        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1086        break;
1087    case TCG_TYPE_V256:
1088        /*
1089         * The gvec infrastructure only requires 16-byte alignment,
1090         * so here we must use an unaligned load.
1091         */
1092        tcg_debug_assert(ret >= 16);
1093        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1094                                 ret, 0, arg1, arg2);
1095        break;
1096    default:
1097        g_assert_not_reached();
1098    }
1099}
1100
1101static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1102                       TCGReg arg1, intptr_t arg2)
1103{
1104    switch (type) {
1105    case TCG_TYPE_I32:
1106        if (arg < 16) {
1107            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1108        } else {
1109            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1110        }
1111        break;
1112    case TCG_TYPE_I64:
1113        if (arg < 16) {
1114            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1115            break;
1116        }
1117        /* FALLTHRU */
1118    case TCG_TYPE_V64:
1119        /* There is no instruction that can validate 8-byte alignment.  */
1120        tcg_debug_assert(arg >= 16);
1121        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1122        break;
1123    case TCG_TYPE_V128:
1124        /*
1125         * The gvec infrastructure is asserts that v128 vector loads
1126         * and stores use a 16-byte aligned offset.  Validate that the
1127         * final pointer is aligned by using an insn that will SIGSEGV.
1128         */
1129        tcg_debug_assert(arg >= 16);
1130        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1131        break;
1132    case TCG_TYPE_V256:
1133        /*
1134         * The gvec infrastructure only requires 16-byte alignment,
1135         * so here we must use an unaligned store.
1136         */
1137        tcg_debug_assert(arg >= 16);
1138        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1139                                 arg, 0, arg1, arg2);
1140        break;
1141    default:
1142        g_assert_not_reached();
1143    }
1144}
1145
1146static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1147                        TCGReg base, intptr_t ofs)
1148{
1149    int rexw = 0;
1150    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1151        if (val != (int32_t)val) {
1152            return false;
1153        }
1154        rexw = P_REXW;
1155    } else if (type != TCG_TYPE_I32) {
1156        return false;
1157    }
1158    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1159    tcg_out32(s, val);
1160    return true;
1161}
1162
1163static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1164{
1165    /* Propagate an opcode prefix, such as P_DATA16.  */
1166    int ext = subopc & ~0x7;
1167    subopc &= 0x7;
1168
1169    if (count == 1) {
1170        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1171    } else {
1172        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1173        tcg_out8(s, count);
1174    }
1175}
1176
1177static inline void tcg_out_bswap32(TCGContext *s, int reg)
1178{
1179    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1180}
1181
1182static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1183{
1184    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1185}
1186
1187static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1188{
1189    /* movzbl */
1190    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1191    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1192}
1193
1194static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1195{
1196    /* movsbl */
1197    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1198    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1199}
1200
1201static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1202{
1203    /* movzwl */
1204    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1205}
1206
1207static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1208{
1209    /* movsw[lq] */
1210    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1211}
1212
1213static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1214{
1215    /* 32-bit mov zero extends.  */
1216    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1217}
1218
1219static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1220{
1221    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1222}
1223
1224static inline void tcg_out_bswap64(TCGContext *s, int reg)
1225{
1226    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1227}
1228
1229static void tgen_arithi(TCGContext *s, int c, int r0,
1230                        tcg_target_long val, int cf)
1231{
1232    int rexw = 0;
1233
1234    if (TCG_TARGET_REG_BITS == 64) {
1235        rexw = c & -8;
1236        c &= 7;
1237    }
1238
1239    /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1240       partial flags update stalls on Pentium4 and are not recommended
1241       by current Intel optimization manuals.  */
1242    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1243        int is_inc = (c == ARITH_ADD) ^ (val < 0);
1244        if (TCG_TARGET_REG_BITS == 64) {
1245            /* The single-byte increment encodings are re-tasked as the
1246               REX prefixes.  Use the MODRM encoding.  */
1247            tcg_out_modrm(s, OPC_GRP5 + rexw,
1248                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1249        } else {
1250            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1251        }
1252        return;
1253    }
1254
1255    if (c == ARITH_AND) {
1256        if (TCG_TARGET_REG_BITS == 64) {
1257            if (val == 0xffffffffu) {
1258                tcg_out_ext32u(s, r0, r0);
1259                return;
1260            }
1261            if (val == (uint32_t)val) {
1262                /* AND with no high bits set can use a 32-bit operation.  */
1263                rexw = 0;
1264            }
1265        }
1266        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1267            tcg_out_ext8u(s, r0, r0);
1268            return;
1269        }
1270        if (val == 0xffffu) {
1271            tcg_out_ext16u(s, r0, r0);
1272            return;
1273        }
1274    }
1275
1276    if (val == (int8_t)val) {
1277        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1278        tcg_out8(s, val);
1279        return;
1280    }
1281    if (rexw == 0 || val == (int32_t)val) {
1282        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1283        tcg_out32(s, val);
1284        return;
1285    }
1286
1287    tcg_abort();
1288}
1289
1290static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1291{
1292    if (val != 0) {
1293        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1294    }
1295}
1296
1297/* Use SMALL != 0 to force a short forward branch.  */
1298static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1299{
1300    int32_t val, val1;
1301
1302    if (l->has_value) {
1303        val = tcg_pcrel_diff(s, l->u.value_ptr);
1304        val1 = val - 2;
1305        if ((int8_t)val1 == val1) {
1306            if (opc == -1) {
1307                tcg_out8(s, OPC_JMP_short);
1308            } else {
1309                tcg_out8(s, OPC_JCC_short + opc);
1310            }
1311            tcg_out8(s, val1);
1312        } else {
1313            if (small) {
1314                tcg_abort();
1315            }
1316            if (opc == -1) {
1317                tcg_out8(s, OPC_JMP_long);
1318                tcg_out32(s, val - 5);
1319            } else {
1320                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1321                tcg_out32(s, val - 6);
1322            }
1323        }
1324    } else if (small) {
1325        if (opc == -1) {
1326            tcg_out8(s, OPC_JMP_short);
1327        } else {
1328            tcg_out8(s, OPC_JCC_short + opc);
1329        }
1330        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1331        s->code_ptr += 1;
1332    } else {
1333        if (opc == -1) {
1334            tcg_out8(s, OPC_JMP_long);
1335        } else {
1336            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1337        }
1338        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1339        s->code_ptr += 4;
1340    }
1341}
1342
1343static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1344                        int const_arg2, int rexw)
1345{
1346    if (const_arg2) {
1347        if (arg2 == 0) {
1348            /* test r, r */
1349            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1350        } else {
1351            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1352        }
1353    } else {
1354        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1355    }
1356}
1357
1358static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1359                             TCGArg arg1, TCGArg arg2, int const_arg2,
1360                             TCGLabel *label, int small)
1361{
1362    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1363    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1364}
1365
1366#if TCG_TARGET_REG_BITS == 64
1367static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1368                             TCGArg arg1, TCGArg arg2, int const_arg2,
1369                             TCGLabel *label, int small)
1370{
1371    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1372    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1373}
1374#else
1375/* XXX: we implement it at the target level to avoid having to
1376   handle cross basic blocks temporaries */
1377static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1378                            const int *const_args, int small)
1379{
1380    TCGLabel *label_next = gen_new_label();
1381    TCGLabel *label_this = arg_label(args[5]);
1382
1383    switch(args[4]) {
1384    case TCG_COND_EQ:
1385        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1386                         label_next, 1);
1387        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1388                         label_this, small);
1389        break;
1390    case TCG_COND_NE:
1391        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1392                         label_this, small);
1393        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1394                         label_this, small);
1395        break;
1396    case TCG_COND_LT:
1397        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1398                         label_this, small);
1399        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1400        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1401                         label_this, small);
1402        break;
1403    case TCG_COND_LE:
1404        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1405                         label_this, small);
1406        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1407        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1408                         label_this, small);
1409        break;
1410    case TCG_COND_GT:
1411        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1412                         label_this, small);
1413        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1414        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1415                         label_this, small);
1416        break;
1417    case TCG_COND_GE:
1418        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1419                         label_this, small);
1420        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1421        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1422                         label_this, small);
1423        break;
1424    case TCG_COND_LTU:
1425        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1426                         label_this, small);
1427        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1428        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1429                         label_this, small);
1430        break;
1431    case TCG_COND_LEU:
1432        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1433                         label_this, small);
1434        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1435        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1436                         label_this, small);
1437        break;
1438    case TCG_COND_GTU:
1439        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1440                         label_this, small);
1441        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1442        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1443                         label_this, small);
1444        break;
1445    case TCG_COND_GEU:
1446        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1447                         label_this, small);
1448        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1449        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1450                         label_this, small);
1451        break;
1452    default:
1453        tcg_abort();
1454    }
1455    tcg_out_label(s, label_next, s->code_ptr);
1456}
1457#endif
1458
1459static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1460                              TCGArg arg1, TCGArg arg2, int const_arg2)
1461{
1462    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1463    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1464    tcg_out_ext8u(s, dest, dest);
1465}
1466
1467#if TCG_TARGET_REG_BITS == 64
1468static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1469                              TCGArg arg1, TCGArg arg2, int const_arg2)
1470{
1471    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1472    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1473    tcg_out_ext8u(s, dest, dest);
1474}
1475#else
1476static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1477                             const int *const_args)
1478{
1479    TCGArg new_args[6];
1480    TCGLabel *label_true, *label_over;
1481
1482    memcpy(new_args, args+1, 5*sizeof(TCGArg));
1483
1484    if (args[0] == args[1] || args[0] == args[2]
1485        || (!const_args[3] && args[0] == args[3])
1486        || (!const_args[4] && args[0] == args[4])) {
1487        /* When the destination overlaps with one of the argument
1488           registers, don't do anything tricky.  */
1489        label_true = gen_new_label();
1490        label_over = gen_new_label();
1491
1492        new_args[5] = label_arg(label_true);
1493        tcg_out_brcond2(s, new_args, const_args+1, 1);
1494
1495        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1496        tcg_out_jxx(s, JCC_JMP, label_over, 1);
1497        tcg_out_label(s, label_true, s->code_ptr);
1498
1499        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1500        tcg_out_label(s, label_over, s->code_ptr);
1501    } else {
1502        /* When the destination does not overlap one of the arguments,
1503           clear the destination first, jump if cond false, and emit an
1504           increment in the true case.  This results in smaller code.  */
1505
1506        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1507
1508        label_over = gen_new_label();
1509        new_args[4] = tcg_invert_cond(new_args[4]);
1510        new_args[5] = label_arg(label_over);
1511        tcg_out_brcond2(s, new_args, const_args+1, 1);
1512
1513        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1514        tcg_out_label(s, label_over, s->code_ptr);
1515    }
1516}
1517#endif
1518
1519static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1520                         TCGReg dest, TCGReg v1)
1521{
1522    if (have_cmov) {
1523        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1524    } else {
1525        TCGLabel *over = gen_new_label();
1526        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1527        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1528        tcg_out_label(s, over, s->code_ptr);
1529    }
1530}
1531
1532static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1533                              TCGReg c1, TCGArg c2, int const_c2,
1534                              TCGReg v1)
1535{
1536    tcg_out_cmp(s, c1, c2, const_c2, 0);
1537    tcg_out_cmov(s, cond, 0, dest, v1);
1538}
1539
1540#if TCG_TARGET_REG_BITS == 64
1541static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1542                              TCGReg c1, TCGArg c2, int const_c2,
1543                              TCGReg v1)
1544{
1545    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1546    tcg_out_cmov(s, cond, P_REXW, dest, v1);
1547}
1548#endif
1549
1550static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1551                        TCGArg arg2, bool const_a2)
1552{
1553    if (have_bmi1) {
1554        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1555        if (const_a2) {
1556            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1557        } else {
1558            tcg_debug_assert(dest != arg2);
1559            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1560        }
1561    } else {
1562        tcg_debug_assert(dest != arg2);
1563        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1564        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1565    }
1566}
1567
1568static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1569                        TCGArg arg2, bool const_a2)
1570{
1571    if (have_lzcnt) {
1572        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1573        if (const_a2) {
1574            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1575        } else {
1576            tcg_debug_assert(dest != arg2);
1577            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1578        }
1579    } else {
1580        tcg_debug_assert(!const_a2);
1581        tcg_debug_assert(dest != arg1);
1582        tcg_debug_assert(dest != arg2);
1583
1584        /* Recall that the output of BSR is the index not the count.  */
1585        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1586        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1587
1588        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1589        tcg_out_cmp(s, arg1, 0, 1, rexw);
1590        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1591    }
1592}
1593
1594static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
1595{
1596    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1597
1598    if (disp == (int32_t)disp) {
1599        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1600        tcg_out32(s, disp);
1601    } else {
1602        /* rip-relative addressing into the constant pool.
1603           This is 6 + 8 = 14 bytes, as compared to using an
1604           an immediate load 10 + 6 = 16 bytes, plus we may
1605           be able to re-use the pool constant for more calls.  */
1606        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1607        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1608        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1609        tcg_out32(s, 0);
1610    }
1611}
1612
1613static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
1614{
1615    tcg_out_branch(s, 1, dest);
1616}
1617
1618static void tcg_out_jmp(TCGContext *s, tcg_insn_unit *dest)
1619{
1620    tcg_out_branch(s, 0, dest);
1621}
1622
1623static void tcg_out_nopn(TCGContext *s, int n)
1624{
1625    int i;
1626    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1627     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1628     * duplicate prefix, and all of the interesting recent cores can
1629     * decode and discard the duplicates in a single cycle.
1630     */
1631    tcg_debug_assert(n >= 1);
1632    for (i = 1; i < n; ++i) {
1633        tcg_out8(s, 0x66);
1634    }
1635    tcg_out8(s, 0x90);
1636}
1637
1638#if defined(CONFIG_SOFTMMU)
1639#include "../tcg-ldst.c.inc"
1640
/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
 *                                     int mmu_idx, uintptr_t ra)
 *
 * Load helpers, indexed by MemOp size and endianness value.  Only the
 * unsigned variants are listed; every other slot of the 16-entry table
 * is left NULL by the designated initializers.
 */
static void * const qemu_ld_helpers[16] = {
    [MO_UB]   = helper_ret_ldub_mmu,
    [MO_LEUW] = helper_le_lduw_mmu,
    [MO_LEUL] = helper_le_ldul_mmu,
    [MO_LEQ]  = helper_le_ldq_mmu,
    [MO_BEUW] = helper_be_lduw_mmu,
    [MO_BEUL] = helper_be_ldul_mmu,
    [MO_BEQ]  = helper_be_ldq_mmu,
};
1653
/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
 *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
 *
 * Store helpers, indexed by MemOp size and endianness value.  Slots
 * not listed are left NULL by the designated initializers.
 */
static void * const qemu_st_helpers[16] = {
    [MO_UB]   = helper_ret_stb_mmu,
    [MO_LEUW] = helper_le_stw_mmu,
    [MO_LEUL] = helper_le_stl_mmu,
    [MO_LEQ]  = helper_le_stq_mmu,
    [MO_BEUW] = helper_be_stw_mmu,
    [MO_BEUL] = helper_be_stl_mmu,
    [MO_BEQ]  = helper_be_stq_mmu,
};
1666
1667/* Perform the TLB load and compare.
1668
1669   Inputs:
1670   ADDRLO and ADDRHI contain the low and high part of the address.
1671
1672   MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1673
1674   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1675   This should be offsetof addr_read or addr_write.
1676
1677   Outputs:
1678   LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1679   positions of the displacements of forward jumps to the TLB miss case.
1680
1681   Second argument register is loaded with the low part of the address.
1682   In the TLB hit case, it has been adjusted as indicated by the TLB
1683   and so is a host address.  In the TLB miss case, it continues to
1684   hold a guest address.
1685
1686   First argument register is clobbered.  */
1687
1688static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1689                                    int mem_index, MemOp opc,
1690                                    tcg_insn_unit **label_ptr, int which)
1691{
1692    const TCGReg r0 = TCG_REG_L0;
1693    const TCGReg r1 = TCG_REG_L1;
1694    TCGType ttype = TCG_TYPE_I32;
1695    TCGType tlbtype = TCG_TYPE_I32;
1696    int trexw = 0, hrexw = 0, tlbrexw = 0;
1697    unsigned a_bits = get_alignment_bits(opc);
1698    unsigned s_bits = opc & MO_SIZE;
1699    unsigned a_mask = (1 << a_bits) - 1;
1700    unsigned s_mask = (1 << s_bits) - 1;
1701    target_ulong tlb_mask;
1702
1703    if (TCG_TARGET_REG_BITS == 64) {
1704        if (TARGET_LONG_BITS == 64) {
1705            ttype = TCG_TYPE_I64;
1706            trexw = P_REXW;
1707        }
1708        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1709            hrexw = P_REXW;
1710            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
1711                tlbtype = TCG_TYPE_I64;
1712                tlbrexw = P_REXW;
1713            }
1714        }
1715    }
1716
1717    tcg_out_mov(s, tlbtype, r0, addrlo);
1718    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1719                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1720
1721    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
1722                         TLB_MASK_TABLE_OFS(mem_index) +
1723                         offsetof(CPUTLBDescFast, mask));
1724
1725    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
1726                         TLB_MASK_TABLE_OFS(mem_index) +
1727                         offsetof(CPUTLBDescFast, table));
1728
1729    /* If the required alignment is at least as large as the access, simply
1730       copy the address and mask.  For lesser alignments, check that we don't
1731       cross pages for the complete access.  */
1732    if (a_bits >= s_bits) {
1733        tcg_out_mov(s, ttype, r1, addrlo);
1734    } else {
1735        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1736    }
1737    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1738    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1739
1740    /* cmp 0(r0), r1 */
1741    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
1742
1743    /* Prepare for both the fast path add of the tlb addend, and the slow
1744       path function argument setup.  */
1745    tcg_out_mov(s, ttype, r1, addrlo);
1746
1747    /* jne slow_path */
1748    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1749    label_ptr[0] = s->code_ptr;
1750    s->code_ptr += 4;
1751
1752    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1753        /* cmp 4(r0), addrhi */
1754        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
1755
1756        /* jne slow_path */
1757        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1758        label_ptr[1] = s->code_ptr;
1759        s->code_ptr += 4;
1760    }
1761
1762    /* TLB Hit.  */
1763
1764    /* add addend(r0), r1 */
1765    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1766                         offsetof(CPUTLBEntry, addend));
1767}
1768
1769/*
1770 * Record the context of a call to the out of line helper code for the slow path
1771 * for a load or store, so that we can later generate the correct helper code
1772 */
1773static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1774                                TCGMemOpIdx oi,
1775                                TCGReg datalo, TCGReg datahi,
1776                                TCGReg addrlo, TCGReg addrhi,
1777                                tcg_insn_unit *raddr,
1778                                tcg_insn_unit **label_ptr)
1779{
1780    TCGLabelQemuLdst *label = new_ldst_label(s);
1781
1782    label->is_ld = is_ld;
1783    label->oi = oi;
1784    label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1785    label->datalo_reg = datalo;
1786    label->datahi_reg = datahi;
1787    label->addrlo_reg = addrlo;
1788    label->addrhi_reg = addrhi;
1789    label->raddr = raddr;
1790    label->label_ptr[0] = label_ptr[0];
1791    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1792        label->label_ptr[1] = label_ptr[1];
1793    }
1794}
1795
1796/*
1797 * Generate code for the slow path for a load at the end of block
1798 */
1799static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1800{
1801    TCGMemOpIdx oi = l->oi;
1802    MemOp opc = get_memop(oi);
1803    TCGReg data_reg;
1804    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1805    int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1806
1807    /* resolve label address */
1808    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1809    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1810        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1811    }
1812
1813    if (TCG_TARGET_REG_BITS == 32) {
1814        int ofs = 0;
1815
1816        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1817        ofs += 4;
1818
1819        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1820        ofs += 4;
1821
1822        if (TARGET_LONG_BITS == 64) {
1823            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1824            ofs += 4;
1825        }
1826
1827        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1828        ofs += 4;
1829
1830        tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1831    } else {
1832        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1833        /* The second argument is already loaded with addrlo.  */
1834        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1835        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1836                     (uintptr_t)l->raddr);
1837    }
1838
1839    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1840
1841    data_reg = l->datalo_reg;
1842    switch (opc & MO_SSIZE) {
1843    case MO_SB:
1844        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1845        break;
1846    case MO_SW:
1847        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1848        break;
1849#if TCG_TARGET_REG_BITS == 64
1850    case MO_SL:
1851        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1852        break;
1853#endif
1854    case MO_UB:
1855    case MO_UW:
1856        /* Note that the helpers have zero-extended to tcg_target_long.  */
1857    case MO_UL:
1858        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1859        break;
1860    case MO_Q:
1861        if (TCG_TARGET_REG_BITS == 64) {
1862            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1863        } else if (data_reg == TCG_REG_EDX) {
1864            /* xchg %edx, %eax */
1865            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1866            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1867        } else {
1868            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1869            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1870        }
1871        break;
1872    default:
1873        tcg_abort();
1874    }
1875
1876    /* Jump to the code corresponding to next IR of qemu_st */
1877    tcg_out_jmp(s, l->raddr);
1878    return true;
1879}
1880
1881/*
1882 * Generate code for the slow path for a store at the end of block
1883 */
1884static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1885{
1886    TCGMemOpIdx oi = l->oi;
1887    MemOp opc = get_memop(oi);
1888    MemOp s_bits = opc & MO_SIZE;
1889    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1890    TCGReg retaddr;
1891
1892    /* resolve label address */
1893    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1894    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1895        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1896    }
1897
1898    if (TCG_TARGET_REG_BITS == 32) {
1899        int ofs = 0;
1900
1901        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1902        ofs += 4;
1903
1904        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1905        ofs += 4;
1906
1907        if (TARGET_LONG_BITS == 64) {
1908            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1909            ofs += 4;
1910        }
1911
1912        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1913        ofs += 4;
1914
1915        if (s_bits == MO_64) {
1916            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1917            ofs += 4;
1918        }
1919
1920        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1921        ofs += 4;
1922
1923        retaddr = TCG_REG_EAX;
1924        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1925        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1926    } else {
1927        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1928        /* The second argument is already loaded with addrlo.  */
1929        tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1930                    tcg_target_call_iarg_regs[2], l->datalo_reg);
1931        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1932
1933        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1934            retaddr = tcg_target_call_iarg_regs[4];
1935            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1936        } else {
1937            retaddr = TCG_REG_RAX;
1938            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1939            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1940                       TCG_TARGET_CALL_STACK_OFFSET);
1941        }
1942    }
1943
1944    /* "Tail call" to the helper, with the return address back inline.  */
1945    tcg_out_push(s, retaddr);
1946    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1947    return true;
1948}
1949#elif TCG_TARGET_REG_BITS == 32
1950# define x86_guest_base_seg     0
1951# define x86_guest_base_index   -1
1952# define x86_guest_base_offset  guest_base
1953#else
/*
 * Addressing parameters for user-only guest memory accesses.  They are
 * passed by tcg_out_qemu_ld/tcg_out_qemu_st to the *_direct emitters as the
 * seg, index and ofs arguments respectively.
 * NOTE(review): where these are assigned is not visible here; presumably
 * during backend initialisation from guest_base -- confirm.
 */
static int x86_guest_base_seg;          /* segment-prefix flag, 0 if unused */
static int x86_guest_base_index = -1;   /* presumably -1 means "no index" */
static int32_t x86_guest_base_offset;   /* displacement added to the address */
1957# if defined(__x86_64__) && defined(__linux__)
1958#  include <asm/prctl.h>
1959#  include <sys/prctl.h>
1960int arch_prctl(int code, unsigned long addr);
1961static inline int setup_guest_base_seg(void)
1962{
1963    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1964        return P_GS;
1965    }
1966    return 0;
1967}
1968# elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
1969#  include <machine/sysarch.h>
1970static inline int setup_guest_base_seg(void)
1971{
1972    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1973        return P_GS;
1974    }
1975    return 0;
1976}
1977# else
/*
 * Fallback for hosts matched by neither the Linux/x86-64 nor the FreeBSD
 * variant: nothing is set up and 0 (no segment prefix) is returned.
 */
static inline int setup_guest_base_seg(void)
{
    return 0;
}
1982# endif
1983#endif /* SOFTMMU */
1984
/*
 * Emit the host instruction(s) that perform a guest load from the host
 * address described by (base, index, ofs), optionally with the prefix
 * flag(s) in seg added to the opcode.
 *
 * datalo receives the value; datahi is used only for a 64-bit load on a
 * 32-bit host.  is64 selects a 64-bit destination for the sign-extending
 * cases.  memop supplies the size, signedness and byte-swap request.
 */
static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   TCGReg base, int index, intptr_t ofs,
                                   int seg, bool is64, MemOp memop)
{
    /* real_bswap: the access asks for a byte swap.
       bswap: a separate swap instruction is still needed after the load;
       this is cleared below when MOVBE does the swap as part of the load.  */
    const MemOp real_bswap = memop & MO_BSWAP;
    MemOp bswap = real_bswap;
    int rexw = is64 * P_REXW;
    int movop = OPC_MOVL_GvEv;

    if (have_movbe && real_bswap) {
        bswap = 0;
        movop = OPC_MOVBE_GyMy;
    }

    switch (memop & MO_SSIZE) {
    case MO_UB:
        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
                                 base, index, 0, ofs);
        break;
    case MO_SB:
        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
                                 base, index, 0, ofs);
        break;
    case MO_UW:
        /* This load always uses MOVZWL rather than movop, so the swap must
           be explicit whenever one was requested (real_bswap, not bswap).  */
        tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
                                 base, index, 0, ofs);
        if (real_bswap) {
            tcg_out_rolw_8(s, datalo);
        }
        break;
    case MO_SW:
        if (real_bswap) {
            /* Load and swap first (16-bit MOVBE, or MOVZWL + rotate), then
               sign-extend the swapped value in the register.  */
            if (have_movbe) {
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
                                         datalo, base, index, 0, ofs);
            } else {
                tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
                                         base, index, 0, ofs);
                tcg_out_rolw_8(s, datalo);
            }
            tcg_out_modrm(s, OPC_MOVSWL + rexw, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
                                     datalo, base, index, 0, ofs);
        }
        break;
    case MO_UL:
        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
        if (bswap) {
            tcg_out_bswap32(s, datalo);
        }
        break;
#if TCG_TARGET_REG_BITS == 64
    case MO_SL:
        if (real_bswap) {
            /* Swapped: load 32 bits (swapping via MOVBE or BSWAP), then
               sign-extend in the register.  */
            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
                                     base, index, 0, ofs);
            if (bswap) {
                tcg_out_bswap32(s, datalo);
            }
            tcg_out_ext32s(s, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
                                     base, index, 0, ofs);
        }
        break;
#endif
    case MO_Q:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
                                     base, index, 0, ofs);
            if (bswap) {
                tcg_out_bswap64(s, datalo);
            }
        } else {
            /* 32-bit host: two 32-bit loads.  For a byte-swapped access the
               two words also trade places, so exchange the destinations.  */
            if (real_bswap) {
                int t = datalo;
                datalo = datahi;
                datahi = t;
            }
            /* If base doubles as the first destination, load the other
               word first so that base is still intact for the second
               load.  */
            if (base != datalo) {
                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
                                         base, index, 0, ofs);
                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
                                         base, index, 0, ofs + 4);
            } else {
                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
                                         base, index, 0, ofs + 4);
                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
                                         base, index, 0, ofs);
            }
            if (bswap) {
                tcg_out_bswap32(s, datalo);
                tcg_out_bswap32(s, datahi);
            }
        }
        break;
    default:
        tcg_abort();
    }
}
2086
2087/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2088   EAX. It will be useful once fixed registers globals are less
2089   common. */
2090static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2091{
2092    TCGReg datalo, datahi, addrlo;
2093    TCGReg addrhi __attribute__((unused));
2094    TCGMemOpIdx oi;
2095    MemOp opc;
2096#if defined(CONFIG_SOFTMMU)
2097    int mem_index;
2098    tcg_insn_unit *label_ptr[2];
2099#endif
2100
2101    datalo = *args++;
2102    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2103    addrlo = *args++;
2104    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2105    oi = *args++;
2106    opc = get_memop(oi);
2107
2108#if defined(CONFIG_SOFTMMU)
2109    mem_index = get_mmuidx(oi);
2110
2111    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2112                     label_ptr, offsetof(CPUTLBEntry, addr_read));
2113
2114    /* TLB Hit.  */
2115    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2116
2117    /* Record the current context of a load into ldst label */
2118    add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2119                        s->code_ptr, label_ptr);
2120#else
2121    tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2122                           x86_guest_base_offset, x86_guest_base_seg,
2123                           is64, opc);
2124#endif
2125}
2126
/*
 * Emit the host instruction(s) that perform a guest store to the host
 * address described by (base, index, ofs), optionally with the prefix
 * flag(s) in seg added to the opcode.
 *
 * datalo holds the value; datahi is used only for a 64-bit store on a
 * 32-bit host.  memop supplies the size and byte-swap request.  The data
 * registers themselves are never modified: any swap is done on a copy in
 * the scratch register.
 */
static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   TCGReg base, int index, intptr_t ofs,
                                   int seg, MemOp memop)
{
    /* ??? Ideally we wouldn't need a scratch register.  For user-only,
       we could perform the bswap twice to restore the original value
       instead of moving to the scratch.  But as it is, the L constraint
       means that TCG_REG_L0 is definitely free here.  */
    const TCGReg scratch = TCG_REG_L0;
    /* real_bswap: the access asks for a byte swap.
       bswap: a separate swap instruction is needed before the store; this
       is cleared below when MOVBE does the swap as part of the store.  */
    const MemOp real_bswap = memop & MO_BSWAP;
    MemOp bswap = real_bswap;
    int movop = OPC_MOVL_EvGv;

    if (have_movbe && real_bswap) {
        bswap = 0;
        movop = OPC_MOVBE_MyGy;
    }

    switch (memop & MO_SIZE) {
    case MO_8:
        /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
           Use the scratch register if necessary.  */
        if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
            datalo = scratch;
        }
        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
                                 datalo, base, index, 0, ofs);
        break;
    case MO_16:
        if (bswap) {
            /* Swap a copy, then store from the copy.  */
            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
            tcg_out_rolw_8(s, scratch);
            datalo = scratch;
        }
        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
                                 base, index, 0, ofs);
        break;
    case MO_32:
        if (bswap) {
            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
            tcg_out_bswap32(s, scratch);
            datalo = scratch;
        }
        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
        break;
    case MO_64:
        if (TCG_TARGET_REG_BITS == 64) {
            if (bswap) {
                tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
                tcg_out_bswap64(s, scratch);
                datalo = scratch;
            }
            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
                                     base, index, 0, ofs);
        } else if (bswap) {
            /* 32-bit host, explicit swap: each word is swapped in the
               scratch register, and the words trade places in memory
               (high word at ofs, low word at ofs + 4).  */
            tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
            tcg_out_bswap32(s, scratch);
            tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
                                     base, index, 0, ofs);
            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
            tcg_out_bswap32(s, scratch);
            tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
                                     base, index, 0, ofs + 4);
        } else {
            /* 32-bit host, no explicit swap.  If a swap was requested it is
               performed by MOVBE (movop), and only the two words need to
               trade places.  */
            if (real_bswap) {
                int t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
                                     base, index, 0, ofs);
            tcg_out_modrm_sib_offset(s, movop + seg, datahi,
                                     base, index, 0, ofs + 4);
        }
        break;
    default:
        tcg_abort();
    }
}
2207
2208static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2209{
2210    TCGReg datalo, datahi, addrlo;
2211    TCGReg addrhi __attribute__((unused));
2212    TCGMemOpIdx oi;
2213    MemOp opc;
2214#if defined(CONFIG_SOFTMMU)
2215    int mem_index;
2216    tcg_insn_unit *label_ptr[2];
2217#endif
2218
2219    datalo = *args++;
2220    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2221    addrlo = *args++;
2222    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2223    oi = *args++;
2224    opc = get_memop(oi);
2225
2226#if defined(CONFIG_SOFTMMU)
2227    mem_index = get_mmuidx(oi);
2228
2229    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2230                     label_ptr, offsetof(CPUTLBEntry, addr_write));
2231
2232    /* TLB Hit.  */
2233    tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2234
2235    /* Record the current context of a store into ldst label */
2236    add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2237                        s->code_ptr, label_ptr);
2238#else
2239    tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2240                           x86_guest_base_offset, x86_guest_base_seg, opc);
2241#endif
2242}
2243
/*
 * Emit host code for a single integer TCG opcode.
 *
 * opc selects the operation, args holds its operands and const_args[i] is
 * nonzero when args[i] is a constant rather than a register.  Opcodes that
 * are not handled here (including mov, movi and call, which are emitted
 * through other entry points) reach the default case and abort.
 */
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                              const TCGArg *args, const int *const_args)
{
    TCGArg a0, a1, a2;
    int c, const_a2, vexop, rexw = 0;

    /* OP_32_64(x) expands to the case label(s) for INDEX_op_x_i32 and, on a
       64-bit host, INDEX_op_x_i64.  The _i64 label sets rexw = P_REXW and
       falls through into the shared body.  */
#if TCG_TARGET_REG_BITS == 64
# define OP_32_64(x) \
        case glue(glue(INDEX_op_, x), _i64): \
            rexw = P_REXW; /* FALLTHRU */    \
        case glue(glue(INDEX_op_, x), _i32)
#else
# define OP_32_64(x) \
        case glue(glue(INDEX_op_, x), _i32)
#endif

    /* Hoist the loads of the most common arguments.  */
    a0 = args[0];
    a1 = args[1];
    a2 = args[2];
    const_a2 = const_args[2];

    switch (opc) {
    case INDEX_op_exit_tb:
        /* Reuse the zeroing that exists for goto_ptr.  */
        if (a0 == 0) {
            tcg_out_jmp(s, s->code_gen_epilogue);
        } else {
            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
            tcg_out_jmp(s, tb_ret_addr);
        }
        break;
    case INDEX_op_goto_tb:
        if (s->tb_jmp_insn_offset) {
            /* direct jump method */
            int gap;
            /* jump displacement must be aligned for atomic patching;
             * see if we need to add extra nops before jump
             */
            gap = tcg_pcrel_diff(s, QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4));
            if (gap != 1) {
                tcg_out_nopn(s, gap - 1);
            }
            tcg_out8(s, OPC_JMP_long); /* jmp im */
            /* Record where the 32-bit displacement lives, then emit a
               zero placeholder for it.  */
            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
            tcg_out32(s, 0);
        } else {
            /* indirect jump method */
            tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
                                 (intptr_t)(s->tb_jmp_target_addr + a0));
        }
        set_jmp_reset_offset(s, a0);
        break;
    case INDEX_op_goto_ptr:
        /* jmp to the given host address (could be epilogue) */
        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
        break;
    case INDEX_op_br:
        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
        break;
    OP_32_64(ld8u):
        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
        break;
    OP_32_64(ld8s):
        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
        break;
    OP_32_64(ld16u):
        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
        break;
    OP_32_64(ld16s):
        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
        break;
#if TCG_TARGET_REG_BITS == 64
    case INDEX_op_ld32u_i64:
#endif
    case INDEX_op_ld_i32:
        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
        break;

    /* Stores: a0 is the value (register, or immediate when const_args[0]),
       a1 and a2 are passed on as the address operands.  */
    OP_32_64(st8):
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
            tcg_out8(s, a0);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
        }
        break;
    OP_32_64(st16):
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
            tcg_out16(s, a0);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
        }
        break;
#if TCG_TARGET_REG_BITS == 64
    case INDEX_op_st32_i64:
#endif
    case INDEX_op_st_i32:
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
            tcg_out32(s, a0);
        } else {
            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
        }
        break;

    OP_32_64(add):
        /* For 3-operand addition, use LEA.  */
        if (a0 != a1) {
            TCGArg c3 = 0;
            if (const_a2) {
                /* Constant addend: it becomes the LEA displacement and no
                   index register is used.  */
                c3 = a2, a2 = -1;
            } else if (a0 == a2) {
                /* Watch out for dest = src + dest, since we've removed
                   the matching constraint on the add.  */
                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
                break;
            }

            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
            break;
        }
        /* a0 == a1: plain two-operand add.  */
        c = ARITH_ADD;
        goto gen_arith;
    OP_32_64(sub):
        c = ARITH_SUB;
        goto gen_arith;
    OP_32_64(and):
        c = ARITH_AND;
        goto gen_arith;
    OP_32_64(or):
        c = ARITH_OR;
        goto gen_arith;
    OP_32_64(xor):
        c = ARITH_XOR;
        goto gen_arith;
    gen_arith:
        /* Two-operand form operating on a0 in place; a1 is not referenced
           (presumably constrained to match a0 -- TODO confirm against the
           operand constraints).  */
        if (const_a2) {
            tgen_arithi(s, c + rexw, a0, a2, 0);
        } else {
            tgen_arithr(s, c + rexw, a0, a2);
        }
        break;

    OP_32_64(andc):
        if (const_a2) {
            /* Constant operand: copy a1 into a0 and AND with ~a2.  */
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
        } else {
            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
        }
        break;

    OP_32_64(mul):
        if (const_a2) {
            int32_t val;
            val = a2;
            /* Use the 8-bit immediate form when the constant fits.  */
            if (val == (int8_t)val) {
                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
                tcg_out8(s, val);
            } else {
                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
                tcg_out32(s, val);
            }
        } else {
            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
        }
        break;

    /* Only args[4] is encoded for the divisions; the remaining operands
       are not referenced here.  */
    OP_32_64(div2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
        break;
    OP_32_64(divu2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
        break;

    OP_32_64(shl):
        /* For small constant 3-operand shift, use LEA.  */
        /* NOTE(review): (a2 - 1) < 3 accepts shift counts 1..3 and rejects
           0 provided TCGArg is an unsigned type -- confirm.  */
        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
            if (a2 - 1 == 0) {
                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
            } else {
                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
            }
            break;
        }
        c = SHIFT_SHL;
        vexop = OPC_SHLX;
        goto gen_shift_maybe_vex;
    OP_32_64(shr):
        c = SHIFT_SHR;
        vexop = OPC_SHRX;
        goto gen_shift_maybe_vex;
    OP_32_64(sar):
        c = SHIFT_SAR;
        vexop = OPC_SARX;
        goto gen_shift_maybe_vex;
    OP_32_64(rotl):
        c = SHIFT_ROL;
        goto gen_shift;
    OP_32_64(rotr):
        c = SHIFT_ROR;
        goto gen_shift;
    gen_shift_maybe_vex:
        if (have_bmi2) {
            /* Variable count: use the VEX-encoded shift selected above.
               Constant count: copy a1 into a0, then fall through to the
               immediate shift on a0.  */
            if (!const_a2) {
                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
                break;
            }
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
        }
        /* FALLTHRU */
    gen_shift:
        /* In-place shift/rotate of a0, by an immediate or (per the opcode
           name OPC_SHIFT_cl) by a count taken from %cl.  */
        if (const_a2) {
            tcg_out_shifti(s, c + rexw, a0, a2);
        } else {
            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
        }
        break;

    OP_32_64(ctz):
        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
        break;
    OP_32_64(clz):
        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
        break;
    OP_32_64(ctpop):
        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
        break;

    case INDEX_op_brcond_i32:
        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
        break;
    case INDEX_op_setcond_i32:
        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
        break;
    case INDEX_op_movcond_i32:
        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
        break;

    OP_32_64(bswap16):
        tcg_out_rolw_8(s, a0);
        break;
    OP_32_64(bswap32):
        tcg_out_bswap32(s, a0);
        break;

    OP_32_64(neg):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
        break;
    OP_32_64(not):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
        break;

    OP_32_64(ext8s):
        tcg_out_ext8s(s, a0, a1, rexw);
        break;
    OP_32_64(ext16s):
        tcg_out_ext16s(s, a0, a1, rexw);
        break;
    OP_32_64(ext8u):
        tcg_out_ext8u(s, a0, a1);
        break;
    OP_32_64(ext16u):
        tcg_out_ext16u(s, a0, a1);
        break;

    /* Guest memory accesses: the last argument is the is64 flag.  */
    case INDEX_op_qemu_ld_i32:
        tcg_out_qemu_ld(s, args, 0);
        break;
    case INDEX_op_qemu_ld_i64:
        tcg_out_qemu_ld(s, args, 1);
        break;
    case INDEX_op_qemu_st_i32:
        tcg_out_qemu_st(s, args, 0);
        break;
    case INDEX_op_qemu_st_i64:
        tcg_out_qemu_st(s, args, 1);
        break;

    /* Only args[3] is encoded for the widening multiplies; the remaining
       operands are not referenced here.  */
    OP_32_64(mulu2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
        break;
    OP_32_64(muls2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
        break;
    /* Double-word add/sub: ADD/SUB of args[4] into a0, followed by ADC/SBB
       of args[5] into a1.  */
    OP_32_64(add2):
        if (const_args[4]) {
            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
        } else {
            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
        }
        if (const_args[5]) {
            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
        } else {
            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
        }
        break;
    OP_32_64(sub2):
        if (const_args[4]) {
            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
        } else {
            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
        }
        if (const_args[5]) {
            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
        } else {
            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
        }
        break;

#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_brcond2_i32:
        tcg_out_brcond2(s, args, const_args, 0);
        break;
    case INDEX_op_setcond2_i32:
        tcg_out_setcond2(s, args, const_args);
        break;
#else /* TCG_TARGET_REG_BITS == 64 */
    case INDEX_op_ld32s_i64:
        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
        break;
    case INDEX_op_ld_i64:
        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
        break;
    case INDEX_op_st_i64:
        if (const_args[0]) {
            /* Constant store: a 32-bit immediate is emitted after the
               REX.W-prefixed opcode.  */
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
            tcg_out32(s, a0);
        } else {
            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
        }
        break;

    case INDEX_op_brcond_i64:
        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
        break;
    case INDEX_op_setcond_i64:
        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
        break;
    case INDEX_op_movcond_i64:
        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
        break;

    case INDEX_op_bswap64_i64:
        tcg_out_bswap64(s, a0);
        break;
    case INDEX_op_extu_i32_i64:
    case INDEX_op_ext32u_i64:
    case INDEX_op_extrl_i64_i32:
        tcg_out_ext32u(s, a0, a1);
        break;
    case INDEX_op_ext_i32_i64:
    case INDEX_op_ext32s_i64:
        tcg_out_ext32s(s, a0, a1);
        break;
    case INDEX_op_extrh_i64_i32:
        /* In-place 64-bit logical right shift of a0 by 32.  */
        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
        break;
#endif

    /* Only three (offset, length) combinations are handled, each with a
       single register-to-register move; anything else aborts.  */
    OP_32_64(deposit):
        if (args[3] == 0 && args[4] == 8) {
            /* load bits 0..7 */
            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
        } else if (args[3] == 8 && args[4] == 8) {
            /* load bits 8..15 */
            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
        } else if (args[3] == 0 && args[4] == 16) {
            /* load bits 0..15 */
            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
        } else {
            tcg_abort();
        }
        break;

    case INDEX_op_extract_i64:
        if (a2 + args[3] == 32) {
            /* This is a 32-bit zero-extending right shift.  */
            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
            break;
        }
        /* FALLTHRU */
    case INDEX_op_extract_i32:
        /* On the off-chance that we can use the high-byte registers.
           Otherwise we emit the same ext16 + shift pattern that we
           would have gotten from the normal tcg-op.c expansion.  */
        tcg_debug_assert(a2 == 8 && args[3] == 8);
        if (a1 < 4 && a0 < 8) {
            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
        } else {
            tcg_out_ext16u(s, a0, a1);
            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
        }
        break;

    case INDEX_op_sextract_i32:
        /* We don't implement sextract_i64, as we cannot sign-extend to
           64-bits without using the REX prefix that explicitly excludes
           access to the high-byte registers.  */
        tcg_debug_assert(a2 == 8 && args[3] == 8);
        if (a1 < 4 && a0 < 8) {
            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
        } else {
            tcg_out_ext16s(s, a0, a1, 0);
            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
        }
        break;

    OP_32_64(extract2):
        /* Note that SHRD outputs to the r/m operand.  */
        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
        tcg_out8(s, args[3]);
        break;

    case INDEX_op_mb:
        tcg_out_mb(s, a0);
        break;
    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
    case INDEX_op_mov_i64:
    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
    case INDEX_op_movi_i64:
    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
    default:
        tcg_abort();
    }

#undef OP_32_64
}
2679
/*
 * Emit host code for a single vector opcode.
 *
 * The operation works on vector type TCG_TYPE_V64 + vecl with elements
 * of size vece; the operands are taken from args[].  const_args is not
 * consulted here.
 */
static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
                           unsigned vecl, unsigned vece,
                           const TCGArg *args, const int *const_args)
{
    /*
     * Opcode tables indexed by vece.  OPC_UD2 fills the slots for
     * element sizes that have no instruction.
     */

    /* Integer and saturating arithmetic. */
    static const int add_insn[4] = {
        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
    };
    static const int sub_insn[4] = {
        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
    };
    static const int ssadd_insn[4] = {
        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
    };
    static const int usadd_insn[4] = {
        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
    };
    static const int sssub_insn[4] = {
        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
    };
    static const int ussub_insn[4] = {
        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
    };
    static const int mul_insn[4] = {
        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
    };
    static const int abs_insn[4] = {
        /* TODO: AVX512 adds support for MO_64.  */
        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
    };

    /* Minimum and maximum. */
    static const int smin_insn[4] = {
        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
    };
    static const int smax_insn[4] = {
        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
    };
    static const int umin_insn[4] = {
        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
    };
    static const int umax_insn[4] = {
        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
    };

    /* Comparisons. */
    static const int cmpeq_insn[4] = {
        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
    };
    static const int cmpgt_insn[4] = {
        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
    };

    /* Shifts: by immediate, by per-element vector, by scalar. */
    static const int shift_imm_insn[4] = {
        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
    };
    static const int shlv_insn[4] = {
        /* TODO: AVX512 adds support for MO_16.  */
        OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
    };
    static const int shrv_insn[4] = {
        /* TODO: AVX512 adds support for MO_16.  */
        OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
    };
    static const int sarv_insn[4] = {
        /* TODO: AVX512 adds support for MO_16, MO_64.  */
        OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
    };
    static const int shls_insn[4] = {
        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
    };
    static const int shrs_insn[4] = {
        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
    };
    static const int sars_insn[4] = {
        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
    };

    /* Unpack and pack. */
    static const int punpckl_insn[4] = {
        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
    };
    static const int punpckh_insn[4] = {
        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
    };
    static const int packss_insn[4] = {
        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
    };
    static const int packus_insn[4] = {
        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
    };

    const TCGType type = vecl + TCG_TYPE_V64;
    /* Merged into most opcodes below when the type is TCG_TYPE_V256. */
    const int vexl = (type == TCG_TYPE_V256 ? P_VEXL : 0);
    TCGArg a0 = args[0];
    TCGArg a1 = args[1];
    TCGArg a2 = args[2];
    const int *by_size;
    int insn, aux;

    /*
     * Stage 1: opcodes that are a plain three-operand instruction
     * picked from one of the tables above.
     */
    switch (opc) {
    case INDEX_op_add_vec:
        by_size = add_insn;
        break;
    case INDEX_op_ssadd_vec:
        by_size = ssadd_insn;
        break;
    case INDEX_op_usadd_vec:
        by_size = usadd_insn;
        break;
    case INDEX_op_sub_vec:
        by_size = sub_insn;
        break;
    case INDEX_op_sssub_vec:
        by_size = sssub_insn;
        break;
    case INDEX_op_ussub_vec:
        by_size = ussub_insn;
        break;
    case INDEX_op_mul_vec:
        by_size = mul_insn;
        break;
    case INDEX_op_smin_vec:
        by_size = smin_insn;
        break;
    case INDEX_op_umin_vec:
        by_size = umin_insn;
        break;
    case INDEX_op_smax_vec:
        by_size = smax_insn;
        break;
    case INDEX_op_umax_vec:
        by_size = umax_insn;
        break;
    case INDEX_op_shlv_vec:
        by_size = shlv_insn;
        break;
    case INDEX_op_shrv_vec:
        by_size = shrv_insn;
        break;
    case INDEX_op_sarv_vec:
        by_size = sarv_insn;
        break;
    case INDEX_op_shls_vec:
        by_size = shls_insn;
        break;
    case INDEX_op_shrs_vec:
        by_size = shrs_insn;
        break;
    case INDEX_op_sars_vec:
        by_size = sars_insn;
        break;
    case INDEX_op_x86_punpckl_vec:
        by_size = punpckl_insn;
        break;
    case INDEX_op_x86_punpckh_vec:
        by_size = punpckh_insn;
        break;
    case INDEX_op_x86_packss_vec:
        by_size = packss_insn;
        break;
    case INDEX_op_x86_packus_vec:
        by_size = packus_insn;
        break;
    default:
        by_size = NULL;
        break;
    }

    if (by_size != NULL) {
        insn = by_size[vece];
        tcg_debug_assert(insn != OPC_UD2);
        tcg_out_vex_modrm(s, insn | vexl, a0, a1, a2);
        return;
    }

    /*
     * Stage 2: everything else.  Cases that only choose an opcode
     * "break" to the shared three-operand emission after the switch;
     * cases with their own encoding emit it and return.
     */
    switch (opc) {
    case INDEX_op_and_vec:
        insn = OPC_PAND;
        break;
    case INDEX_op_or_vec:
        insn = OPC_POR;
        break;
    case INDEX_op_xor_vec:
        insn = OPC_PXOR;
        break;

    case INDEX_op_abs_vec:
        /* One input only: it moves to the last slot, the middle is 0. */
        insn = abs_insn[vece];
        a2 = a1;
        a1 = 0;
        break;

    case INDEX_op_cmp_vec:
        /* args[3] is the condition; only EQ and GT are accepted. */
        aux = args[3];
        if (aux == TCG_COND_GT) {
            insn = cmpgt_insn[vece];
        } else if (aux == TCG_COND_EQ) {
            insn = cmpeq_insn[vece];
        } else {
            g_assert_not_reached();
        }
        break;

    case INDEX_op_andc_vec:
        /* The two inputs are handed to PANDN in swapped order. */
        tcg_out_vex_modrm(s, OPC_PANDN | vexl, a0, a2, a1);
        return;

#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_dup2_vec:
        /* First merge the two 32-bit inputs to a single 64-bit element. */
        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
        /* Then replicate the 64-bit elements across the rest of the vector. */
        if (type != TCG_TYPE_V64) {
            tcg_out_dup_vec(s, type, MO_64, a0, a0);
        }
        return;
#endif

    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
    case INDEX_op_sari_vec:
        /*
         * Shift by immediate: a per-operation constant (6, 2 or 4)
         * occupies the first operand slot and a2 is emitted as the
         * trailing immediate byte.
         */
        if (opc == INDEX_op_sari_vec) {
            tcg_debug_assert(vece != MO_64);
            aux = 4;
        } else {
            aux = (opc == INDEX_op_shli_vec ? 6 : 2);
        }
        tcg_debug_assert(vece != MO_8);
        tcg_out_vex_modrm(s, shift_imm_insn[vece] | vexl, aux, a0, a1);
        tcg_out8(s, a2);
        return;

    case INDEX_op_ld_vec:
        tcg_out_ld(s, type, a0, a1, a2);
        return;
    case INDEX_op_st_vec:
        tcg_out_st(s, type, a0, a1, a2);
        return;
    case INDEX_op_dupm_vec:
        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
        return;

    case INDEX_op_x86_shufps_vec:
    case INDEX_op_x86_blend_vec:
    case INDEX_op_x86_vperm2i128_vec:
        /* Three operands followed by the immediate byte from args[3]. */
        if (opc == INDEX_op_x86_shufps_vec) {
            insn = OPC_SHUFPS;
        } else if (opc == INDEX_op_x86_vperm2i128_vec) {
            insn = OPC_VPERM2I128;
        } else if (vece == MO_16) {
            insn = OPC_PBLENDW;
        } else if (vece == MO_32) {
            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
        } else {
            g_assert_not_reached();
        }
        aux = args[3];
        tcg_out_vex_modrm(s, insn | vexl, a0, a1, a2);
        tcg_out8(s, aux);
        return;

    case INDEX_op_x86_vpblendvb_vec:
        /* The fourth operand is emitted in the high nibble of the byte. */
        tcg_out_vex_modrm(s, OPC_VPBLENDVB | vexl, a0, a1, a2);
        tcg_out8(s, args[3] << 4);
        return;

    case INDEX_op_x86_psrldq_vec:
        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
        tcg_out8(s, a2);
        return;

    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
    case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi.  */
    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
    default:
        g_assert_not_reached();
    }

    /* Shared emission for the opcode-only cases above. */
    tcg_debug_assert(insn != OPC_UD2);
    tcg_out_vex_modrm(s, insn | vexl, a0, a1, a2);
}
2963
/*
 * Look up the operand-constraint definition for @op.
 *
 * Returns a pointer to a static TCGTargetOpDef holding one constraint
 * string per operand, or NULL when @op has no entry in the switch.
 * Definition names spell out their constraint strings, with '_'
 * separating the operands.
 */
static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
{
#define CT_DEF(name, ...) \
    static const TCGTargetOpDef name = { .args_ct_str = { __VA_ARGS__ } }

    /* One operand. */
    CT_DEF(r, "r");

    /* Two operands. */
    CT_DEF(ri_r, "ri", "r");
    CT_DEF(re_r, "re", "r");
    CT_DEF(qi_r, "qi", "r");
    CT_DEF(r_r, "r", "r");
    CT_DEF(r_q, "r", "q");
    CT_DEF(r_re, "r", "re");
    CT_DEF(r_0, "r", "0");
    CT_DEF(r_L, "r", "L");
    CT_DEF(L_L, "L", "L");
    CT_DEF(x_x, "x", "x");
    CT_DEF(x_r, "x", "r");

    /* Three operands. */
    CT_DEF(r_r_ri, "r", "r", "ri");
    CT_DEF(r_r_re, "r", "r", "re");
    CT_DEF(r_r_rI, "r", "r", "rI");
    CT_DEF(r_0_r, "r", "0", "r");
    CT_DEF(r_0_re, "r", "0", "re");
    CT_DEF(r_0_reZ, "r", "0", "reZ");
    CT_DEF(r_0_ci, "r", "0", "ci");
    CT_DEF(r_L_L, "r", "L", "L");
    CT_DEF(r_r_L, "r", "r", "L");
    CT_DEF(L_L_L, "L", "L", "L");
    CT_DEF(Q_0_Q, "Q", "0", "Q");
    CT_DEF(q_r_re, "q", "r", "re");
    CT_DEF(x_x_x, "x", "x", "x");

    /* Four or more operands. */
    CT_DEF(r_r_L_L, "r", "r", "L", "L");
    CT_DEF(L_L_L_L, "L", "L", "L", "L");
    CT_DEF(x_x_x_x, "x", "x", "x", "x");
    CT_DEF(a_d_a_r, "a", "d", "a", "r");
    CT_DEF(r_r_ri_ri, "r", "r", "ri", "ri");
    CT_DEF(r_r_re_r_0, "r", "r", "re", "r", "0");
    CT_DEF(a_d_0_1_r, "a", "d", "0", "1", "r");
    CT_DEF(r_r_r_ri_ri, "r", "r", "r", "ri", "ri");
    CT_DEF(r_r_0_1_re_re, "r", "r", "0", "1", "re", "re");

#undef CT_DEF

    /* Two variants each; the second is selected by a host feature flag. */
    static const TCGTargetOpDef ctz[2] = {
        { .args_ct_str = { "&r", "r", "r" } },
        { .args_ct_str = { "&r", "r", "rW" } },
    };
    static const TCGTargetOpDef clz[2] = {
        { .args_ct_str = { "&r", "r", "r" } },
        { .args_ct_str = { "&r", "r", "rW" } },
    };

    switch (op) {
    case INDEX_op_goto_ptr:
        return &r;

    /* Loads. */
    case INDEX_op_ld8u_i32:
    case INDEX_op_ld8u_i64:
    case INDEX_op_ld8s_i32:
    case INDEX_op_ld8s_i64:
    case INDEX_op_ld16u_i32:
    case INDEX_op_ld16u_i64:
    case INDEX_op_ld16s_i32:
    case INDEX_op_ld16s_i64:
    case INDEX_op_ld_i32:
    case INDEX_op_ld32u_i64:
    case INDEX_op_ld32s_i64:
    case INDEX_op_ld_i64:
        return &r_r;

    /* Stores. */
    case INDEX_op_st8_i32:
    case INDEX_op_st8_i64:
        return &qi_r;
    case INDEX_op_st16_i32:
    case INDEX_op_st16_i64:
    case INDEX_op_st_i32:
    case INDEX_op_st32_i64:
        return &ri_r;
    case INDEX_op_st_i64:
        return &re_r;

    /* Arithmetic and logic. */
    case INDEX_op_add_i32:
    case INDEX_op_add_i64:
        return &r_r_re;
    case INDEX_op_sub_i32:
    case INDEX_op_sub_i64:
    case INDEX_op_mul_i32:
    case INDEX_op_mul_i64:
    case INDEX_op_or_i32:
    case INDEX_op_or_i64:
    case INDEX_op_xor_i32:
    case INDEX_op_xor_i64:
        return &r_0_re;
    case INDEX_op_and_i32:
    case INDEX_op_and_i64:
        return &r_0_reZ;
    case INDEX_op_andc_i32:
    case INDEX_op_andc_i64:
        return &r_r_rI;

    /* Shifts and rotates. */
    case INDEX_op_shl_i32:
    case INDEX_op_shl_i64:
    case INDEX_op_shr_i32:
    case INDEX_op_shr_i64:
    case INDEX_op_sar_i32:
    case INDEX_op_sar_i64:
        if (have_bmi2) {
            return &r_r_ri;
        }
        return &r_0_ci;
    case INDEX_op_rotl_i32:
    case INDEX_op_rotl_i64:
    case INDEX_op_rotr_i32:
    case INDEX_op_rotr_i64:
        return &r_0_ci;

    case INDEX_op_brcond_i32:
    case INDEX_op_brcond_i64:
        return &r_re;

    /* Operations whose output shares the register of the input. */
    case INDEX_op_bswap16_i32:
    case INDEX_op_bswap16_i64:
    case INDEX_op_bswap32_i32:
    case INDEX_op_bswap32_i64:
    case INDEX_op_bswap64_i64:
    case INDEX_op_neg_i32:
    case INDEX_op_neg_i64:
    case INDEX_op_not_i32:
    case INDEX_op_not_i64:
    case INDEX_op_extrh_i64_i32:
        return &r_0;

    /* Extensions and extracts. */
    case INDEX_op_ext8s_i32:
    case INDEX_op_ext8s_i64:
    case INDEX_op_ext8u_i32:
    case INDEX_op_ext8u_i64:
        return &r_q;
    case INDEX_op_ext16s_i32:
    case INDEX_op_ext16s_i64:
    case INDEX_op_ext16u_i32:
    case INDEX_op_ext16u_i64:
    case INDEX_op_ext32s_i64:
    case INDEX_op_ext32u_i64:
    case INDEX_op_ext_i32_i64:
    case INDEX_op_extu_i32_i64:
    case INDEX_op_extrl_i64_i32:
    case INDEX_op_extract_i32:
    case INDEX_op_extract_i64:
    case INDEX_op_sextract_i32:
    case INDEX_op_ctpop_i32:
    case INDEX_op_ctpop_i64:
        return &r_r;
    case INDEX_op_extract2_i32:
    case INDEX_op_extract2_i64:
        return &r_0_r;

    case INDEX_op_deposit_i32:
    case INDEX_op_deposit_i64:
        return &Q_0_Q;
    case INDEX_op_setcond_i32:
    case INDEX_op_setcond_i64:
        return &q_r_re;
    case INDEX_op_movcond_i32:
    case INDEX_op_movcond_i64:
        return &r_r_re_r_0;

    /* Double-width arithmetic. */
    case INDEX_op_div2_i32:
    case INDEX_op_div2_i64:
    case INDEX_op_divu2_i32:
    case INDEX_op_divu2_i64:
        return &a_d_0_1_r;
    case INDEX_op_mulu2_i32:
    case INDEX_op_mulu2_i64:
    case INDEX_op_muls2_i32:
    case INDEX_op_muls2_i64:
        return &a_d_a_r;
    case INDEX_op_add2_i32:
    case INDEX_op_add2_i64:
    case INDEX_op_sub2_i32:
    case INDEX_op_sub2_i64:
        return &r_r_0_1_re_re;

    /* Bit counting: the table entry depends on the host feature. */
    case INDEX_op_ctz_i32:
    case INDEX_op_ctz_i64:
        return &ctz[have_bmi1];
    case INDEX_op_clz_i32:
    case INDEX_op_clz_i64:
        return &clz[have_lzcnt];

    /*
     * Guest memory accesses: the operand count depends on how the
     * guest address width and the 64-bit data width compare with the
     * host register width.
     */
    case INDEX_op_qemu_ld_i32:
        if (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS) {
            return &r_L;
        }
        return &r_L_L;
    case INDEX_op_qemu_st_i32:
        if (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS) {
            return &L_L;
        }
        return &L_L_L;
    case INDEX_op_qemu_ld_i64:
        if (TCG_TARGET_REG_BITS == 64) {
            return &r_L;
        }
        if (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS) {
            return &r_r_L;
        }
        return &r_r_L_L;
    case INDEX_op_qemu_st_i64:
        if (TCG_TARGET_REG_BITS == 64) {
            return &L_L;
        }
        if (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS) {
            return &L_L_L;
        }
        return &L_L_L_L;

    case INDEX_op_brcond2_i32:
        return &r_r_ri_ri;
    case INDEX_op_setcond2_i32:
        return &r_r_r_ri_ri;

    /* Vector operations. */
    case INDEX_op_ld_vec:
    case INDEX_op_st_vec:
    case INDEX_op_dupm_vec:
        return &x_r;

    case INDEX_op_add_vec:
    case INDEX_op_sub_vec:
    case INDEX_op_mul_vec:
    case INDEX_op_and_vec:
    case INDEX_op_or_vec:
    case INDEX_op_xor_vec:
    case INDEX_op_andc_vec:
    case INDEX_op_ssadd_vec:
    case INDEX_op_usadd_vec:
    case INDEX_op_sssub_vec:
    case INDEX_op_ussub_vec:
    case INDEX_op_smin_vec:
    case INDEX_op_umin_vec:
    case INDEX_op_smax_vec:
    case INDEX_op_umax_vec:
    case INDEX_op_shlv_vec:
    case INDEX_op_shrv_vec:
    case INDEX_op_sarv_vec:
    case INDEX_op_shls_vec:
    case INDEX_op_shrs_vec:
    case INDEX_op_sars_vec:
    case INDEX_op_rotls_vec:
    case INDEX_op_cmp_vec:
    case INDEX_op_x86_shufps_vec:
    case INDEX_op_x86_blend_vec:
    case INDEX_op_x86_packss_vec:
    case INDEX_op_x86_packus_vec:
    case INDEX_op_x86_vperm2i128_vec:
    case INDEX_op_x86_punpckl_vec:
    case INDEX_op_x86_punpckh_vec:
#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_dup2_vec:
#endif
        return &x_x_x;
    case INDEX_op_abs_vec:
    case INDEX_op_dup_vec:
    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
    case INDEX_op_sari_vec:
    case INDEX_op_x86_psrldq_vec:
        return &x_x;
    case INDEX_op_x86_vpblendvb_vec:
        return &x_x_x_x;

    default:
        break;
    }
    return NULL;
}
3253
/*
 * Report how the vector opcode @opc is handled for vector type @type
 * and element size @vece.
 *
 * The result is 1, 0 or -1.  Per the comments below, -1 marks
 * operations that are expanded rather than emitted as-is.
 * NOTE(review): 1 presumably means "emitted directly" and 0 "not
 * supported" -- confirm against the function's declaration.
 */
int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
{
    switch (opc) {
    case INDEX_op_add_vec:
    case INDEX_op_sub_vec:
    case INDEX_op_and_vec:
    case INDEX_op_or_vec:
    case INDEX_op_xor_vec:
    case INDEX_op_andc_vec:
        return 1;

    case INDEX_op_rotli_vec:
    case INDEX_op_cmp_vec:
    case INDEX_op_cmpsel_vec:
        return -1;

    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
        /* We must expand the operation for MO_8.  */
        if (vece == MO_8) {
            return -1;
        }
        return 1;

    case INDEX_op_sari_vec:
        switch (vece) {
        case MO_8:
            /* We must expand the operation for MO_8.  */
            return -1;
        case MO_64:
            /*
             * We can emulate this for MO_64, but it does not pay off
             * unless we're producing at least 4 values.
             */
            return type >= TCG_TYPE_V256 ? -1 : 0;
        default:
            return 1;
        }

    case INDEX_op_shls_vec:
    case INDEX_op_shrs_vec:
        return vece >= MO_16 ? 1 : 0;
    case INDEX_op_sars_vec:
        if (vece < MO_16) {
            return 0;
        }
        return vece <= MO_32 ? 1 : 0;
    case INDEX_op_rotls_vec:
        if (vece < MO_16) {
            return 0;
        }
        return -1;

    /* Per-element variable shifts and rotates depend on have_avx2. */
    case INDEX_op_shlv_vec:
    case INDEX_op_shrv_vec:
        if (!have_avx2) {
            return 0;
        }
        return vece >= MO_32 ? 1 : 0;
    case INDEX_op_sarv_vec:
        if (!have_avx2) {
            return 0;
        }
        return vece == MO_32 ? 1 : 0;
    case INDEX_op_rotlv_vec:
    case INDEX_op_rotrv_vec:
        if (have_avx2 && vece >= MO_32) {
            return -1;
        }
        return 0;

    case INDEX_op_mul_vec:
        switch (vece) {
        case MO_8:
            /* We can expand the operation for MO_8.  */
            return -1;
        case MO_64:
            return 0;
        default:
            return 1;
        }

    case INDEX_op_ssadd_vec:
    case INDEX_op_usadd_vec:
    case INDEX_op_sssub_vec:
    case INDEX_op_ussub_vec:
        return vece <= MO_16 ? 1 : 0;

    case INDEX_op_smin_vec:
    case INDEX_op_smax_vec:
    case INDEX_op_umin_vec:
    case INDEX_op_umax_vec:
    case INDEX_op_abs_vec:
        return vece <= MO_32 ? 1 : 0;

    default:
        return 0;
    }
}
3329
/*
 * Expand a byte-element (MO_8) shift or rotate by the immediate @imm
 * into 16-bit operations.  @opc is tested against INDEX_op_shri_vec
 * and INDEX_op_rotli_vec; any other value takes the left-shift path.
 * The result is written to @v0 from the input @v1.
 */
static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    tcg_debug_assert(vece == MO_8);

    TCGv_vec lo = tcg_temp_new_vec(type);
    TCGv_vec hi = tcg_temp_new_vec(type);

    /*
     * Widen to 16-bit lanes, shift, and narrow again:
     * (1) punpck*bw x,x yields DDCCBBAA, i.e. each byte is duplicated
     *     into both halves of its 16-bit lane.
     * (2) A right shift goes 8 further than requested, so that the
     *     high half of the lane ends up zero.  Left shifts and left
     *     rotates are shifted up and then back down by 8.
     * (3) With the high half zero, PACKUSWB (pack with unsigned
     *     saturation) leaves the value unmodified.
     */
    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
              tcgv_vec_arg(lo), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
              tcgv_vec_arg(hi), tcgv_vec_arg(v1), tcgv_vec_arg(v1));

    if (opc == INDEX_op_shri_vec) {
        tcg_gen_shri_vec(MO_16, lo, lo, imm + 8);
        tcg_gen_shri_vec(MO_16, hi, hi, imm + 8);
    } else {
        /* A rotate shifts up by imm alone; a left shift by imm + 8. */
        const TCGArg up = (opc == INDEX_op_rotli_vec ? imm : imm + 8);

        tcg_gen_shli_vec(MO_16, lo, lo, up);
        tcg_gen_shli_vec(MO_16, hi, hi, up);
        tcg_gen_shri_vec(MO_16, lo, lo, 8);
        tcg_gen_shri_vec(MO_16, hi, hi, 8);
    }

    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
              tcgv_vec_arg(v0), tcgv_vec_arg(lo), tcgv_vec_arg(hi));

    tcg_temp_free_vec(lo);
    tcg_temp_free_vec(hi);
}
3374
/*
 * Expand an arithmetic right shift by the immediate @imm for the
 * element sizes handled here: MO_8 and MO_64.  Any other @vece hits
 * g_assert_not_reached().  The result is written to @v0 from @v1.
 */
static void expand_vec_sari(TCGType type, unsigned vece,
                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    if (vece == MO_8) {
        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
        TCGv_vec lo = tcg_temp_new_vec(type);
        TCGv_vec hi = tcg_temp_new_vec(type);

        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(lo), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(hi), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        tcg_gen_sari_vec(MO_16, lo, lo, imm + 8);
        tcg_gen_sari_vec(MO_16, hi, hi, imm + 8);
        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(lo), tcgv_vec_arg(hi));
        tcg_temp_free_vec(lo);
        tcg_temp_free_vec(hi);
    } else if (vece == MO_64) {
        TCGv_vec sign;

        if (imm > 32) {
            /*
             * Large shift: compare against zero to get the sign
             * extension, then shift it into place and merge it with
             * the logical shift result.
             */
            sign = tcg_const_zeros_vec(type);
            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, sign, sign, v1);
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            tcg_gen_shli_vec(MO_64, sign, sign, 64 - imm);
            tcg_gen_or_vec(MO_64, v0, v0, sign);
        } else {
            /*
             * We can emulate a small sign extend by performing an
             * arithmetic 32-bit shift and overwriting the high half of
             * a 64-bit logical shift.  Note that the ISA says shift of
             * 32 is valid, but TCG does not, so we have to bound the
             * smaller shift -- we get the same result in the high half
             * either way.
             */
            sign = tcg_temp_new_vec(type);
            tcg_gen_sari_vec(MO_32, sign, v1, MIN(imm, 31));
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
                      tcgv_vec_arg(sign), 0xaa);
        }
        tcg_temp_free_vec(sign);
    } else {
        g_assert_not_reached();
    }
}
3430
/*
 * Expand a rotate-left by the immediate @imm.  MO_8 is delegated to
 * expand_vec_shi(); wider elements combine a left shift by @imm with a
 * right shift by (element width - @imm).  The result goes to @v0.
 */
static void expand_vec_rotli(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    if (vece == MO_8) {
        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
    } else {
        TCGv_vec upper = tcg_temp_new_vec(type);

        tcg_gen_shli_vec(vece, upper, v1, imm);
        /* 8 << vece is the element width in bits. */
        tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
        tcg_gen_or_vec(vece, v0, v0, upper);
        tcg_temp_free_vec(upper);
    }
}
3447
/*
 * Expand a rotate-left by the scalar count @lsh as a left shift by
 * @lsh OR'ed with a right shift by the negated count, masked to the
 * element width.  MO_8 is not handled (debug assertion).  The result
 * goes to @v0.
 */
static void expand_vec_rotls(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
{
    tcg_debug_assert(vece != MO_8);

    TCGv_vec upper = tcg_temp_new_vec(type);
    TCGv_i32 down = tcg_temp_new_i32();

    /* down = -lsh & (element width - 1) */
    tcg_gen_neg_i32(down, lsh);
    tcg_gen_andi_i32(down, down, (8 << vece) - 1);

    tcg_gen_shls_vec(vece, upper, v1, lsh);
    tcg_gen_shrs_vec(vece, v0, v1, down);
    tcg_gen_or_vec(vece, v0, v0, upper);

    tcg_temp_free_vec(upper);
    tcg_temp_free_i32(down);
}
3467
/*
 * Expand a rotate by the per-element counts in @sh.  @right selects
 * rotate-right, otherwise rotate-left.  The rotate is built from a
 * shift by @sh in the requested direction OR'ed with a shift by
 * (element width - @sh) in the opposite one.  The result goes to @v0.
 */
static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
                            TCGv_vec v1, TCGv_vec sh, bool right)
{
    TCGv_vec other = tcg_temp_new_vec(type);

    /* other = (8 << vece) - sh, the complementary shift count. */
    tcg_gen_dupi_vec(vece, other, 8 << vece);
    tcg_gen_sub_vec(vece, other, other, sh);

    if (!right) {
        tcg_gen_shrv_vec(vece, other, v1, other);
        tcg_gen_shlv_vec(vece, v0, v1, sh);
    } else {
        tcg_gen_shlv_vec(vece, other, v1, other);
        tcg_gen_shrv_vec(vece, v0, v1, sh);
    }

    tcg_gen_or_vec(vece, v0, v0, other);
    tcg_temp_free_vec(other);
}
3485
3486static void expand_vec_mul(TCGType type, unsigned vece,
3487                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3488{
3489    TCGv_vec t1, t2, t3, t4;
3490
3491    tcg_debug_assert(vece == MO_8);
3492
3493    /*
3494     * Unpack v1 bytes to words, 0 | x.
3495     * Unpack v2 bytes to words, y | 0.
3496     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3497     * Shift logical right by 8 bits to clear the high 8 bytes before
3498     * using an unsigned saturated pack.
3499     *
3500     * The difference between the V64, V128 and V256 cases is merely how
3501     * we distribute the expansion between temporaries.
3502     */
3503    switch (type) {
3504    case TCG_TYPE_V64:
3505        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3506        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3507        tcg_gen_dup16i_vec(t2, 0);
3508        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3509                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2));
3510        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3511                  tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2));
3512        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3513        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3514        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3515                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3516        tcg_temp_free_vec(t1);
3517        tcg_temp_free_vec(t2);
3518        break;
3519
3520    case TCG_TYPE_V128:
3521    case TCG_TYPE_V256:
3522        t1 = tcg_temp_new_vec(type);
3523        t2 = tcg_temp_new_vec(type);
3524        t3 = tcg_temp_new_vec(type);
3525        t4 = tcg_temp_new_vec(type);
3526        tcg_gen_dup16i_vec(t4, 0);
3527        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3528                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
3529        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3530                  tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
3531        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3532                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
3533        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3534                  tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
3535        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3536        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3537        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3538        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3539        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3540                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3541        tcg_temp_free_vec(t1);
3542        tcg_temp_free_vec(t2);
3543        tcg_temp_free_vec(t3);
3544        tcg_temp_free_vec(t4);
3545        break;
3546
3547    default:
3548        g_assert_not_reached();
3549    }
3550}
3551
3552static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3553                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3554{
3555    enum {
3556        NEED_INV  = 1,
3557        NEED_SWAP = 2,
3558        NEED_BIAS = 4,
3559        NEED_UMIN = 8,
3560        NEED_UMAX = 16,
3561    };
3562    TCGv_vec t1, t2;
3563    uint8_t fixup;
3564
3565    switch (cond) {
3566    case TCG_COND_EQ:
3567    case TCG_COND_GT:
3568        fixup = 0;
3569        break;
3570    case TCG_COND_NE:
3571    case TCG_COND_LE:
3572        fixup = NEED_INV;
3573        break;
3574    case TCG_COND_LT:
3575        fixup = NEED_SWAP;
3576        break;
3577    case TCG_COND_GE:
3578        fixup = NEED_SWAP | NEED_INV;
3579        break;
3580    case TCG_COND_LEU:
3581        if (vece <= MO_32) {
3582            fixup = NEED_UMIN;
3583        } else {
3584            fixup = NEED_BIAS | NEED_INV;
3585        }
3586        break;
3587    case TCG_COND_GTU:
3588        if (vece <= MO_32) {
3589            fixup = NEED_UMIN | NEED_INV;
3590        } else {
3591            fixup = NEED_BIAS;
3592        }
3593        break;
3594    case TCG_COND_GEU:
3595        if (vece <= MO_32) {
3596            fixup = NEED_UMAX;
3597        } else {
3598            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3599        }
3600        break;
3601    case TCG_COND_LTU:
3602        if (vece <= MO_32) {
3603            fixup = NEED_UMAX | NEED_INV;
3604        } else {
3605            fixup = NEED_BIAS | NEED_SWAP;
3606        }
3607        break;
3608    default:
3609        g_assert_not_reached();
3610    }
3611
3612    if (fixup & NEED_INV) {
3613        cond = tcg_invert_cond(cond);
3614    }
3615    if (fixup & NEED_SWAP) {
3616        t1 = v1, v1 = v2, v2 = t1;
3617        cond = tcg_swap_cond(cond);
3618    }
3619
3620    t1 = t2 = NULL;
3621    if (fixup & (NEED_UMIN | NEED_UMAX)) {
3622        t1 = tcg_temp_new_vec(type);
3623        if (fixup & NEED_UMIN) {
3624            tcg_gen_umin_vec(vece, t1, v1, v2);
3625        } else {
3626            tcg_gen_umax_vec(vece, t1, v1, v2);
3627        }
3628        v2 = t1;
3629        cond = TCG_COND_EQ;
3630    } else if (fixup & NEED_BIAS) {
3631        t1 = tcg_temp_new_vec(type);
3632        t2 = tcg_temp_new_vec(type);
3633        tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
3634        tcg_gen_sub_vec(vece, t1, v1, t2);
3635        tcg_gen_sub_vec(vece, t2, v2, t2);
3636        v1 = t1;
3637        v2 = t2;
3638        cond = tcg_signed_cond(cond);
3639    }
3640
3641    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3642    /* Expand directly; do not recurse.  */
3643    vec_gen_4(INDEX_op_cmp_vec, type, vece,
3644              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3645
3646    if (t1) {
3647        tcg_temp_free_vec(t1);
3648        if (t2) {
3649            tcg_temp_free_vec(t2);
3650        }
3651    }
3652    return fixup & NEED_INV;
3653}
3654
3655static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3656                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3657{
3658    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3659        tcg_gen_not_vec(vece, v0, v0);
3660    }
3661}
3662
3663static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3664                              TCGv_vec c1, TCGv_vec c2,
3665                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3666{
3667    TCGv_vec t = tcg_temp_new_vec(type);
3668
3669    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3670        /* Invert the sense of the compare by swapping arguments.  */
3671        TCGv_vec x;
3672        x = v3, v3 = v4, v4 = x;
3673    }
3674    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3675              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3676              tcgv_vec_arg(v3), tcgv_vec_arg(t));
3677    tcg_temp_free_vec(t);
3678}
3679
3680void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3681                       TCGArg a0, ...)
3682{
3683    va_list va;
3684    TCGArg a2;
3685    TCGv_vec v0, v1, v2, v3, v4;
3686
3687    va_start(va, a0);
3688    v0 = temp_tcgv_vec(arg_temp(a0));
3689    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3690    a2 = va_arg(va, TCGArg);
3691
3692    switch (opc) {
3693    case INDEX_op_shli_vec:
3694    case INDEX_op_shri_vec:
3695        expand_vec_shi(type, vece, opc, v0, v1, a2);
3696        break;
3697
3698    case INDEX_op_sari_vec:
3699        expand_vec_sari(type, vece, v0, v1, a2);
3700        break;
3701
3702    case INDEX_op_rotli_vec:
3703        expand_vec_rotli(type, vece, v0, v1, a2);
3704        break;
3705
3706    case INDEX_op_rotls_vec:
3707        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3708        break;
3709
3710    case INDEX_op_rotlv_vec:
3711        v2 = temp_tcgv_vec(arg_temp(a2));
3712        expand_vec_rotv(type, vece, v0, v1, v2, false);
3713        break;
3714    case INDEX_op_rotrv_vec:
3715        v2 = temp_tcgv_vec(arg_temp(a2));
3716        expand_vec_rotv(type, vece, v0, v1, v2, true);
3717        break;
3718
3719    case INDEX_op_mul_vec:
3720        v2 = temp_tcgv_vec(arg_temp(a2));
3721        expand_vec_mul(type, vece, v0, v1, v2);
3722        break;
3723
3724    case INDEX_op_cmp_vec:
3725        v2 = temp_tcgv_vec(arg_temp(a2));
3726        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3727        break;
3728
3729    case INDEX_op_cmpsel_vec:
3730        v2 = temp_tcgv_vec(arg_temp(a2));
3731        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3732        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3733        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3734        break;
3735
3736    default:
3737        break;
3738    }
3739
3740    va_end(va);
3741}
3742
3743static const int tcg_target_callee_save_regs[] = {
3744#if TCG_TARGET_REG_BITS == 64
3745    TCG_REG_RBP,
3746    TCG_REG_RBX,
3747#if defined(_WIN64)
3748    TCG_REG_RDI,
3749    TCG_REG_RSI,
3750#endif
3751    TCG_REG_R12,
3752    TCG_REG_R13,
3753    TCG_REG_R14, /* Currently used for the global env. */
3754    TCG_REG_R15,
3755#else
3756    TCG_REG_EBP, /* Currently used for the global env. */
3757    TCG_REG_EBX,
3758    TCG_REG_ESI,
3759    TCG_REG_EDI,
3760#endif
3761};
3762
/*
 * Frame geometry, computed via macros so that tcg_target_qemu_prologue
 * and tcg_register_jit share a single definition.
 */

/*
 * Bytes occupied by one register-sized slot per callee-saved register,
 * plus one extra slot (presumably the return address; the debug_frame
 * tables place the return column in the first slot).
 */
#define PUSH_SIZE \
    ((TCG_TARGET_REG_BITS / 8) \
     * (ARRAY_SIZE(tcg_target_callee_save_regs) + 1))

/*
 * PUSH_SIZE plus the static call-argument area and the TCG temp buffer,
 * rounded up to a multiple of TCG_TARGET_STACK_ALIGN (the mask form
 * assumes that alignment is a power of two).
 */
#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + (TCG_TARGET_STACK_ALIGN - 1)) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
3776
3777/* Generate global QEMU prologue and epilogue code */
3778static void tcg_target_qemu_prologue(TCGContext *s)
3779{
3780    int i, stack_addend;
3781
3782    /* TB prologue */
3783
3784    /* Reserve some stack space, also for TCG temps.  */
3785    stack_addend = FRAME_SIZE - PUSH_SIZE;
3786    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3787                  CPU_TEMP_BUF_NLONGS * sizeof(long));
3788
3789    /* Save all callee saved registers.  */
3790    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3791        tcg_out_push(s, tcg_target_callee_save_regs[i]);
3792    }
3793
3794#if TCG_TARGET_REG_BITS == 32
3795    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3796               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3797    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3798    /* jmp *tb.  */
3799    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3800                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3801                         + stack_addend);
3802#else
3803# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
3804    if (guest_base) {
3805        int seg = setup_guest_base_seg();
3806        if (seg != 0) {
3807            x86_guest_base_seg = seg;
3808        } else if (guest_base == (int32_t)guest_base) {
3809            x86_guest_base_offset = guest_base;
3810        } else {
3811            /* Choose R12 because, as a base, it requires a SIB byte. */
3812            x86_guest_base_index = TCG_REG_R12;
3813            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
3814            tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
3815        }
3816    }
3817# endif
3818    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3819    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3820    /* jmp *tb.  */
3821    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3822#endif
3823
3824    /*
3825     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3826     * and fall through to the rest of the epilogue.
3827     */
3828    s->code_gen_epilogue = s->code_ptr;
3829    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3830
3831    /* TB epilogue */
3832    tb_ret_addr = s->code_ptr;
3833
3834    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3835
3836    if (have_avx2) {
3837        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3838    }
3839    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3840        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3841    }
3842    tcg_out_opc(s, OPC_RET, 0, 0, 0);
3843}
3844
3845static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3846{
3847    memset(p, 0x90, count);
3848}
3849
3850static void tcg_target_init(TCGContext *s)
3851{
3852#ifdef CONFIG_CPUID_H
3853    unsigned a, b, c, d, b7 = 0;
3854    int max = __get_cpuid_max(0, 0);
3855
3856    if (max >= 7) {
3857        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
3858        __cpuid_count(7, 0, a, b7, c, d);
3859        have_bmi1 = (b7 & bit_BMI) != 0;
3860        have_bmi2 = (b7 & bit_BMI2) != 0;
3861    }
3862
3863    if (max >= 1) {
3864        __cpuid(1, a, b, c, d);
3865#ifndef have_cmov
3866        /* For 32-bit, 99% certainty that we're running on hardware that
3867           supports cmov, but we still need to check.  In case cmov is not
3868           available, we'll use a small forward branch.  */
3869        have_cmov = (d & bit_CMOV) != 0;
3870#endif
3871
3872        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3873           need to probe for it.  */
3874        have_movbe = (c & bit_MOVBE) != 0;
3875        have_popcnt = (c & bit_POPCNT) != 0;
3876
3877        /* There are a number of things we must check before we can be
3878           sure of not hitting invalid opcode.  */
3879        if (c & bit_OSXSAVE) {
3880            unsigned xcrl, xcrh;
3881            /* The xgetbv instruction is not available to older versions of
3882             * the assembler, so we encode the instruction manually.
3883             */
3884            asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
3885            if ((xcrl & 6) == 6) {
3886                have_avx1 = (c & bit_AVX) != 0;
3887                have_avx2 = (b7 & bit_AVX2) != 0;
3888            }
3889        }
3890    }
3891
3892    max = __get_cpuid_max(0x8000000, 0);
3893    if (max >= 1) {
3894        __cpuid(0x80000001, a, b, c, d);
3895        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
3896        have_lzcnt = (c & bit_LZCNT) != 0;
3897    }
3898#endif /* CONFIG_CPUID_H */
3899
3900    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3901    if (TCG_TARGET_REG_BITS == 64) {
3902        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3903    }
3904    if (have_avx1) {
3905        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3906        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3907    }
3908    if (have_avx2) {
3909        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3910    }
3911
3912    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3913    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3914    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3915    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3916    if (TCG_TARGET_REG_BITS == 64) {
3917#if !defined(_WIN64)
3918        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3919        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3920#endif
3921        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3922        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3923        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3924        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3925    }
3926
3927    s->reserved_regs = 0;
3928    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3929}
3930
3931typedef struct {
3932    DebugFrameHeader h;
3933    uint8_t fde_def_cfa[4];
3934    uint8_t fde_reg_ofs[14];
3935} DebugFrame;
3936
3937/* We're expecting a 2 byte uleb128 encoded value.  */
3938QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3939
3940#if !defined(__ELF__)
3941    /* Host machine without ELF. */
3942#elif TCG_TARGET_REG_BITS == 64
3943#define ELF_HOST_MACHINE EM_X86_64
3944static const DebugFrame debug_frame = {
3945    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3946    .h.cie.id = -1,
3947    .h.cie.version = 1,
3948    .h.cie.code_align = 1,
3949    .h.cie.data_align = 0x78,             /* sleb128 -8 */
3950    .h.cie.return_column = 16,
3951
3952    /* Total FDE size does not include the "len" member.  */
3953    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3954
3955    .fde_def_cfa = {
3956        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
3957        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3958        (FRAME_SIZE >> 7)
3959    },
3960    .fde_reg_ofs = {
3961        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
3962        /* The following ordering must match tcg_target_callee_save_regs.  */
3963        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
3964        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
3965        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
3966        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
3967        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
3968        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
3969    }
3970};
3971#else
3972#define ELF_HOST_MACHINE EM_386
3973static const DebugFrame debug_frame = {
3974    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3975    .h.cie.id = -1,
3976    .h.cie.version = 1,
3977    .h.cie.code_align = 1,
3978    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
3979    .h.cie.return_column = 8,
3980
3981    /* Total FDE size does not include the "len" member.  */
3982    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3983
3984    .fde_def_cfa = {
3985        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
3986        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3987        (FRAME_SIZE >> 7)
3988    },
3989    .fde_reg_ofs = {
3990        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
3991        /* The following ordering must match tcg_target_callee_save_regs.  */
3992        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
3993        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
3994        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
3995        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
3996    }
3997};
3998#endif
3999
#ifdef ELF_HOST_MACHINE
/*
 * Pass the code buffer BUF of BUF_SIZE bytes, together with the static
 * debug_frame table and its size, to tcg_register_jit_int.
 */
void tcg_register_jit(void *buf, size_t buf_size)
{
    const DebugFrame *frame = &debug_frame;

    tcg_register_jit_int(buf, buf_size, frame, sizeof(*frame));
}
#endif
4005#endif
4006