xref: /openbmc/qemu/tcg/i386/tcg-target.c.inc (revision 4c4465ff)
1/*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#include "../tcg-pool.c.inc"
26
27#ifdef CONFIG_DEBUG_TCG
28static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
29#if TCG_TARGET_REG_BITS == 64
30    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
31#else
32    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
33#endif
34    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
35    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
36#if TCG_TARGET_REG_BITS == 64
37    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
38    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
39#endif
40};
41#endif
42
43static const int tcg_target_reg_alloc_order[] = {
44#if TCG_TARGET_REG_BITS == 64
45    TCG_REG_RBP,
46    TCG_REG_RBX,
47    TCG_REG_R12,
48    TCG_REG_R13,
49    TCG_REG_R14,
50    TCG_REG_R15,
51    TCG_REG_R10,
52    TCG_REG_R11,
53    TCG_REG_R9,
54    TCG_REG_R8,
55    TCG_REG_RCX,
56    TCG_REG_RDX,
57    TCG_REG_RSI,
58    TCG_REG_RDI,
59    TCG_REG_RAX,
60#else
61    TCG_REG_EBX,
62    TCG_REG_ESI,
63    TCG_REG_EDI,
64    TCG_REG_EBP,
65    TCG_REG_ECX,
66    TCG_REG_EDX,
67    TCG_REG_EAX,
68#endif
69    TCG_REG_XMM0,
70    TCG_REG_XMM1,
71    TCG_REG_XMM2,
72    TCG_REG_XMM3,
73    TCG_REG_XMM4,
74    TCG_REG_XMM5,
75#ifndef _WIN64
76    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
77       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
78    TCG_REG_XMM6,
79    TCG_REG_XMM7,
80#if TCG_TARGET_REG_BITS == 64
81    TCG_REG_XMM8,
82    TCG_REG_XMM9,
83    TCG_REG_XMM10,
84    TCG_REG_XMM11,
85    TCG_REG_XMM12,
86    TCG_REG_XMM13,
87    TCG_REG_XMM14,
88    TCG_REG_XMM15,
89#endif
90#endif
91};
92
93static const int tcg_target_call_iarg_regs[] = {
94#if TCG_TARGET_REG_BITS == 64
95#if defined(_WIN64)
96    TCG_REG_RCX,
97    TCG_REG_RDX,
98#else
99    TCG_REG_RDI,
100    TCG_REG_RSI,
101    TCG_REG_RDX,
102    TCG_REG_RCX,
103#endif
104    TCG_REG_R8,
105    TCG_REG_R9,
106#else
107    /* 32-bit mode uses a stack-based calling convention (GCC default). */
108#endif
109};
110
111static const int tcg_target_call_oarg_regs[] = {
112    TCG_REG_EAX,
113#if TCG_TARGET_REG_BITS == 32
114    TCG_REG_EDX
115#endif
116};
117
118/* Constants we accept.  */
119#define TCG_CT_CONST_S32 0x100
120#define TCG_CT_CONST_U32 0x200
121#define TCG_CT_CONST_I32 0x400
122#define TCG_CT_CONST_WSZ 0x800
123
124/* Registers used with L constraint, which are the first argument
125   registers on x86_64, and two random call clobbered registers on
126   i386. */
127#if TCG_TARGET_REG_BITS == 64
128# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
129# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
130#else
131# define TCG_REG_L0 TCG_REG_EAX
132# define TCG_REG_L1 TCG_REG_EDX
133#endif
134
135/* The host compiler should supply <cpuid.h> to enable runtime feature
136   detection, as we're not going to go so far as writing our own inline assembly.
137   If it is not available, default values will be assumed.  */
138#if defined(CONFIG_CPUID_H)
139#include "qemu/cpuid.h"
140#endif
141
142/* For 64-bit, we always know that CMOV is available.  */
143#if TCG_TARGET_REG_BITS == 64
144# define have_cmov 1
145#elif defined(CONFIG_CPUID_H)
146static bool have_cmov;
147#else
148# define have_cmov 0
149#endif
150
151/* We need these symbols in tcg-target.h, and we can't properly conditionalize
152   them there.  Therefore we always define the variables.  */
153bool have_bmi1;
154bool have_popcnt;
155bool have_avx1;
156bool have_avx2;
157bool have_movbe;
158
159#ifdef CONFIG_CPUID_H
160static bool have_bmi2;
161static bool have_lzcnt;
162#else
163# define have_bmi2 0
164# define have_lzcnt 0
165#endif
166
167static const tcg_insn_unit *tb_ret_addr;
168
169static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
170                        intptr_t value, intptr_t addend)
171{
172    value += addend;
173    switch(type) {
174    case R_386_PC32:
175        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
176        if (value != (int32_t)value) {
177            return false;
178        }
179        /* FALLTHRU */
180    case R_386_32:
181        tcg_patch32(code_ptr, value);
182        break;
183    case R_386_PC8:
184        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
185        if (value != (int8_t)value) {
186            return false;
187        }
188        tcg_patch8(code_ptr, value);
189        break;
190    default:
191        tcg_abort();
192    }
193    return true;
194}
195
196#if TCG_TARGET_REG_BITS == 64
197#define ALL_GENERAL_REGS   0x0000ffffu
198#define ALL_VECTOR_REGS    0xffff0000u
199#else
200#define ALL_GENERAL_REGS   0x000000ffu
201#define ALL_VECTOR_REGS    0x00ff0000u
202#endif
203
204/* parse target specific constraints */
205static const char *target_parse_constraint(TCGArgConstraint *ct,
206                                           const char *ct_str, TCGType type)
207{
208    switch(*ct_str++) {
209    case 'a':
210        tcg_regset_set_reg(ct->regs, TCG_REG_EAX);
211        break;
212    case 'b':
213        tcg_regset_set_reg(ct->regs, TCG_REG_EBX);
214        break;
215    case 'c':
216        tcg_regset_set_reg(ct->regs, TCG_REG_ECX);
217        break;
218    case 'd':
219        tcg_regset_set_reg(ct->regs, TCG_REG_EDX);
220        break;
221    case 'S':
222        tcg_regset_set_reg(ct->regs, TCG_REG_ESI);
223        break;
224    case 'D':
225        tcg_regset_set_reg(ct->regs, TCG_REG_EDI);
226        break;
227    case 'q':
228        /* A register that can be used as a byte operand.  */
229        ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
230        break;
231    case 'Q':
232        /* A register with an addressable second byte (e.g. %ah).  */
233        ct->regs = 0xf;
234        break;
235    case 'r':
236        /* A general register.  */
237        ct->regs |= ALL_GENERAL_REGS;
238        break;
239    case 'W':
240        /* With TZCNT/LZCNT, we can have operand-size as an input.  */
241        ct->ct |= TCG_CT_CONST_WSZ;
242        break;
243    case 'x':
244        /* A vector register.  */
245        ct->regs |= ALL_VECTOR_REGS;
246        break;
247
248    case 'L':
249        /* qemu_ld/st data+address constraint */
250        ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
251#ifdef CONFIG_SOFTMMU
252        tcg_regset_reset_reg(ct->regs, TCG_REG_L0);
253        tcg_regset_reset_reg(ct->regs, TCG_REG_L1);
254#endif
255        break;
256    case 's':
257        /* qemu_st8_i32 data constraint */
258        ct->regs = 0xf;
259#ifdef CONFIG_SOFTMMU
260        tcg_regset_reset_reg(ct->regs, TCG_REG_L0);
261        tcg_regset_reset_reg(ct->regs, TCG_REG_L1);
262#endif
263        break;
264
265    case 'e':
266        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
267        break;
268    case 'Z':
269        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
270        break;
271    case 'I':
272        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
273        break;
274
275    default:
276        return NULL;
277    }
278    return ct_str;
279}
280
281/* test if a constant matches the constraint */
282static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
283                                         const TCGArgConstraint *arg_ct)
284{
285    int ct = arg_ct->ct;
286    if (ct & TCG_CT_CONST) {
287        return 1;
288    }
289    if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
290        return 1;
291    }
292    if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
293        return 1;
294    }
295    if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
296        return 1;
297    }
298    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
299        return 1;
300    }
301    return 0;
302}
303
304# define LOWREGMASK(x)	((x) & 7)
305
306#define P_EXT		0x100		/* 0x0f opcode prefix */
307#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
308#define P_DATA16        0x400           /* 0x66 opcode prefix */
309#if TCG_TARGET_REG_BITS == 64
310# define P_REXW         0x1000          /* Set REX.W = 1 */
311# define P_REXB_R       0x2000          /* REG field as byte register */
312# define P_REXB_RM      0x4000          /* R/M field as byte register */
313# define P_GS           0x8000          /* gs segment override */
314#else
315# define P_REXW		0
316# define P_REXB_R	0
317# define P_REXB_RM	0
318# define P_GS           0
319#endif
320#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
321#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
322#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
323#define P_VEXL          0x80000         /* Set VEX.L = 1 */
324
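/* Each OPC_* value below is the instruction's final opcode byte ORed with the
   prefix flags above; e.g. OPC_MOVZBL (0xb6 | P_EXT) is emitted as 0f b6, and
   OPC_PXOR (0xef | P_EXT | P_DATA16) as 66 0f ef (see tcg_out_opc below).  */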
325#define OPC_ARITH_EvIz	(0x81)
326#define OPC_ARITH_EvIb	(0x83)
327#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
328#define OPC_ANDN        (0xf2 | P_EXT38)
329#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
330#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
331#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
332#define OPC_BSF         (0xbc | P_EXT)
333#define OPC_BSR         (0xbd | P_EXT)
334#define OPC_BSWAP	(0xc8 | P_EXT)
335#define OPC_CALL_Jz	(0xe8)
336#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
337#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
338#define OPC_DEC_r32	(0x48)
339#define OPC_IMUL_GvEv	(0xaf | P_EXT)
340#define OPC_IMUL_GvEvIb	(0x6b)
341#define OPC_IMUL_GvEvIz	(0x69)
342#define OPC_INC_r32	(0x40)
343#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
344#define OPC_JCC_short	(0x70)		/* ... plus condition code */
345#define OPC_JMP_long	(0xe9)
346#define OPC_JMP_short	(0xeb)
347#define OPC_LEA         (0x8d)
348#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
349#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
350#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
351#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
352#define OPC_MOVB_EvIz   (0xc6)
353#define OPC_MOVL_EvIz	(0xc7)
354#define OPC_MOVL_Iv     (0xb8)
355#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
356#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
357#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
358#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
359#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
360#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
361#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
362#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
363#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
364#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
365#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
366#define OPC_MOVSBL	(0xbe | P_EXT)
367#define OPC_MOVSWL	(0xbf | P_EXT)
368#define OPC_MOVSLQ	(0x63 | P_REXW)
369#define OPC_MOVZBL	(0xb6 | P_EXT)
370#define OPC_MOVZWL	(0xb7 | P_EXT)
371#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
372#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
373#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
374#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
375#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
376#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
377#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
378#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
379#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
380#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
381#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
382#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
383#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
384#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
385#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
386#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
387#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
388#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
389#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
390#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
391#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
392#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
393#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
394#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
395#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
396#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
397#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
398#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
399#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
400#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
401#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
402#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
403#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
404#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
405#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
406#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
407#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
408#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
409#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
410#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
411#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
412#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
413#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
414#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
415#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
416#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
417#define OPC_POR         (0xeb | P_EXT | P_DATA16)
418#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
419#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
420#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
421#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
422#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
423#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
424#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
425#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
426#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
427#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
428#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
429#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
430#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
431#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
432#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
433#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
434#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
435#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
436#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
437#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
438#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
439#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
440#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
441#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
442#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
443#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
444#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
445#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
446#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
447#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
448#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
449#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
450#define OPC_POP_r32	(0x58)
451#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
452#define OPC_PUSH_r32	(0x50)
453#define OPC_PUSH_Iv	(0x68)
454#define OPC_PUSH_Ib	(0x6a)
455#define OPC_RET		(0xc3)
456#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
457#define OPC_SHIFT_1	(0xd1)
458#define OPC_SHIFT_Ib	(0xc1)
459#define OPC_SHIFT_cl	(0xd3)
460#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
461#define OPC_SHUFPS      (0xc6 | P_EXT)
462#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
463#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
464#define OPC_SHRD_Ib     (0xac | P_EXT)
465#define OPC_TESTL	(0x85)
466#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
467#define OPC_UD2         (0x0b | P_EXT)
468#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
469#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
470#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
471#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
472#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
473#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
474#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
475#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
476#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
477#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
478#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_REXW)
479#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
480#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
481#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_REXW)
482#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
483#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
484#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_REXW)
485#define OPC_VZEROUPPER  (0x77 | P_EXT)
486#define OPC_XCHG_ax_r32	(0x90)
487
488#define OPC_GRP3_Ev	(0xf7)
489#define OPC_GRP5	(0xff)
490#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
491
492/* Group 1 opcode extensions for 0x80-0x83.
493   These are also used as modifiers for OPC_ARITH.  */
494#define ARITH_ADD 0
495#define ARITH_OR  1
496#define ARITH_ADC 2
497#define ARITH_SBB 3
498#define ARITH_AND 4
499#define ARITH_SUB 5
500#define ARITH_XOR 6
501#define ARITH_CMP 7
502
503/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
504#define SHIFT_ROL 0
505#define SHIFT_ROR 1
506#define SHIFT_SHL 4
507#define SHIFT_SHR 5
508#define SHIFT_SAR 7
509
510/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
511#define EXT3_NOT   2
512#define EXT3_NEG   3
513#define EXT3_MUL   4
514#define EXT3_IMUL  5
515#define EXT3_DIV   6
516#define EXT3_IDIV  7
517
518/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
519#define EXT5_INC_Ev	0
520#define EXT5_DEC_Ev	1
521#define EXT5_CALLN_Ev	2
522#define EXT5_JMPN_Ev	4
523
524/* Condition codes to be added to OPC_JCC_{long,short}.  */
525#define JCC_JMP (-1)
526#define JCC_JO  0x0
527#define JCC_JNO 0x1
528#define JCC_JB  0x2
529#define JCC_JAE 0x3
530#define JCC_JE  0x4
531#define JCC_JNE 0x5
532#define JCC_JBE 0x6
533#define JCC_JA  0x7
534#define JCC_JS  0x8
535#define JCC_JNS 0x9
536#define JCC_JP  0xa
537#define JCC_JNP 0xb
538#define JCC_JL  0xc
539#define JCC_JGE 0xd
540#define JCC_JLE 0xe
541#define JCC_JG  0xf
542
543static const uint8_t tcg_cond_to_jcc[] = {
544    [TCG_COND_EQ] = JCC_JE,
545    [TCG_COND_NE] = JCC_JNE,
546    [TCG_COND_LT] = JCC_JL,
547    [TCG_COND_GE] = JCC_JGE,
548    [TCG_COND_LE] = JCC_JLE,
549    [TCG_COND_GT] = JCC_JG,
550    [TCG_COND_LTU] = JCC_JB,
551    [TCG_COND_GEU] = JCC_JAE,
552    [TCG_COND_LEU] = JCC_JBE,
553    [TCG_COND_GTU] = JCC_JA,
554};
555
556#if TCG_TARGET_REG_BITS == 64
557static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
558{
559    int rex;
560
561    if (opc & P_GS) {
562        tcg_out8(s, 0x65);
563    }
564    if (opc & P_DATA16) {
565        /* We should never be asking for both 16 and 64-bit operation.  */
566        tcg_debug_assert((opc & P_REXW) == 0);
567        tcg_out8(s, 0x66);
568    }
569    if (opc & P_SIMDF3) {
570        tcg_out8(s, 0xf3);
571    } else if (opc & P_SIMDF2) {
572        tcg_out8(s, 0xf2);
573    }
574
575    rex = 0;
576    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
577    rex |= (r & 8) >> 1;                /* REX.R */
578    rex |= (x & 8) >> 2;                /* REX.X */
579    rex |= (rm & 8) >> 3;               /* REX.B */
580
581    /* P_REXB_{R,RM} indicates that the given register is the low byte.
582       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
583       as otherwise the encoding indicates %[abcd]h.  Note that the values
584       that are ORed in merely indicate that the REX byte must be present;
585       those bits get discarded in output.  */
586    rex |= opc & (r >= 4 ? P_REXB_R : 0);
587    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
588
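    /* Worked example: a P_REXW opcode with r = %r8 and rm = %rax sets
       REX.W (0x08) and REX.R (0x04), so the prefix byte emitted below is
       0x40 | 0x0c = 0x4c.  */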
589    if (rex) {
590        tcg_out8(s, (uint8_t)(rex | 0x40));
591    }
592
593    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
594        tcg_out8(s, 0x0f);
595        if (opc & P_EXT38) {
596            tcg_out8(s, 0x38);
597        } else if (opc & P_EXT3A) {
598            tcg_out8(s, 0x3a);
599        }
600    }
601
602    tcg_out8(s, opc);
603}
604#else
605static void tcg_out_opc(TCGContext *s, int opc)
606{
607    if (opc & P_DATA16) {
608        tcg_out8(s, 0x66);
609    }
610    if (opc & P_SIMDF3) {
611        tcg_out8(s, 0xf3);
612    } else if (opc & P_SIMDF2) {
613        tcg_out8(s, 0xf2);
614    }
615    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
616        tcg_out8(s, 0x0f);
617        if (opc & P_EXT38) {
618            tcg_out8(s, 0x38);
619        } else if (opc & P_EXT3A) {
620            tcg_out8(s, 0x3a);
621        }
622    }
623    tcg_out8(s, opc);
624}
625/* Discard the register arguments to tcg_out_opc early, so as not to penalize
626   the 32-bit compilation paths.  This method works with all versions of gcc,
627   whereas relying on optimization may not be able to exclude them.  */
628#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
629#endif
630
631static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
632{
633    tcg_out_opc(s, opc, r, rm, 0);
634    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
635}
636
637static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
638                            int rm, int index)
639{
640    int tmp;
641
642    /* Use the two byte form if possible, which cannot encode
643       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
644    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
645        && ((rm | index) & 8) == 0) {
646        /* Two byte VEX prefix.  */
647        tcg_out8(s, 0xc5);
648
649        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
650    } else {
651        /* Three byte VEX prefix.  */
652        tcg_out8(s, 0xc4);
653
654        /* VEX.m-mmmm */
655        if (opc & P_EXT3A) {
656            tmp = 3;
657        } else if (opc & P_EXT38) {
658            tmp = 2;
659        } else if (opc & P_EXT) {
660            tmp = 1;
661        } else {
662            g_assert_not_reached();
663        }
664        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
665        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
666        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
667        tcg_out8(s, tmp);
668
669        tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
670    }
671
672    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
673    /* VEX.pp */
674    if (opc & P_DATA16) {
675        tmp |= 1;                          /* 0x66 */
676    } else if (opc & P_SIMDF3) {
677        tmp |= 2;                          /* 0xf3 */
678    } else if (opc & P_SIMDF2) {
679        tmp |= 3;                          /* 0xf2 */
680    }
681    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
682    tcg_out8(s, tmp);
683    tcg_out8(s, opc);
684}
685
686static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
687{
688    tcg_out_vex_opc(s, opc, r, v, rm, 0);
689    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
690}
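
/* Worked example: tcg_out_vex_modrm(s, OPC_PXOR, TCG_REG_XMM0, TCG_REG_XMM0,
   TCG_REG_XMM0), i.e. "vpxor %xmm0, %xmm0, %xmm0", takes the two-byte VEX
   path above and emits c5 f9 ef c0.  */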
691
692/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
693   Either RM or INDEX (or both) may be omitted by passing a negative value.  In 64-bit
694   mode for absolute addresses, ~RM is the size of the immediate operand
695   that will follow the instruction.  */
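/* For reference, ModRM is mod<<6 | reg<<3 | rm and SIB is
   scale<<6 | index<<3 | base; e.g. addressing 8(%rbx,%rcx,4) with r = %rax
   yields modrm 0x44 (reg=0, rm=SIB escape), sib 0x8b, disp8 0x08.  */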
696
697static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
698                               int shift, intptr_t offset)
699{
700    int mod, len;
701
702    if (index < 0 && rm < 0) {
703        if (TCG_TARGET_REG_BITS == 64) {
704            /* Try for a rip-relative addressing mode.  This has replaced
705               the 32-bit-mode absolute addressing encoding.  */
706            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
707            intptr_t disp = offset - pc;
708            if (disp == (int32_t)disp) {
709                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
710                tcg_out32(s, disp);
711                return;
712            }
713
714            /* Try for an absolute address encoding.  This requires the
715               use of the MODRM+SIB encoding and is therefore larger than
716               rip-relative addressing.  */
717            if (offset == (int32_t)offset) {
718                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
719                tcg_out8(s, (4 << 3) | 5);
720                tcg_out32(s, offset);
721                return;
722            }
723
724            /* ??? The memory isn't directly addressable.  */
725            g_assert_not_reached();
726        } else {
727            /* Absolute address.  */
728            tcg_out8(s, (r << 3) | 5);
729            tcg_out32(s, offset);
730            return;
731        }
732    }
733
734    /* Find the length of the immediate addend.  Note that the encoding
735       that would be used for (%ebp) indicates absolute addressing.  */
736    if (rm < 0) {
737        mod = 0, len = 4, rm = 5;
738    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
739        mod = 0, len = 0;
740    } else if (offset == (int8_t)offset) {
741        mod = 0x40, len = 1;
742    } else {
743        mod = 0x80, len = 4;
744    }
745
746    /* Use a single byte MODRM format if possible.  Note that the encoding
747       that would be used for %esp is the escape to the two byte form.  */
748    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
749        /* Single byte MODRM format.  */
750        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
751    } else {
752        /* Two byte MODRM+SIB format.  */
753
754        /* Note that the encoding that would place %esp into the index
755           field indicates no index register.  In 64-bit mode, the REX.X
756           bit counts, so %r12 can be used as the index.  */
757        if (index < 0) {
758            index = 4;
759        } else {
760            tcg_debug_assert(index != TCG_REG_ESP);
761        }
762
763        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
764        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
765    }
766
767    if (len == 1) {
768        tcg_out8(s, offset);
769    } else if (len == 4) {
770        tcg_out32(s, offset);
771    }
772}
773
774static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
775                                     int index, int shift, intptr_t offset)
776{
777    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
778    tcg_out_sib_offset(s, r, rm, index, shift, offset);
779}
780
781static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
782                                         int rm, int index, int shift,
783                                         intptr_t offset)
784{
785    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
786    tcg_out_sib_offset(s, r, rm, index, shift, offset);
787}
788
789/* A simplification of the above with no index or shift.  */
790static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
791                                        int rm, intptr_t offset)
792{
793    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
794}
795
796static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
797                                            int v, int rm, intptr_t offset)
798{
799    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
800}
801
802/* Output an opcode with an expected reference to the constant pool.  */
803static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
804{
805    tcg_out_opc(s, opc, r, 0, 0);
806    /* Absolute for 32-bit, pc-relative for 64-bit.  */
807    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
808    tcg_out32(s, 0);
809}
810
811/* Output an opcode with an expected reference to the constant pool.  */
812static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
813{
814    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
815    /* Absolute for 32-bit, pc-relative for 64-bit.  */
816    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
817    tcg_out32(s, 0);
818}
819
820/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
821static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
822{
823    /* Propagate an opcode prefix, such as P_REXW.  */
824    int ext = subop & ~0x7;
825    subop &= 0x7;
826
827    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
828}
829
830static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
831{
832    int rexw = 0;
833
834    if (arg == ret) {
835        return true;
836    }
837    switch (type) {
838    case TCG_TYPE_I64:
839        rexw = P_REXW;
840        /* fallthru */
841    case TCG_TYPE_I32:
842        if (ret < 16) {
843            if (arg < 16) {
844                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
845            } else {
846                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
847            }
848        } else {
849            if (arg < 16) {
850                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
851            } else {
852                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
853            }
854        }
855        break;
856
857    case TCG_TYPE_V64:
858        tcg_debug_assert(ret >= 16 && arg >= 16);
859        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
860        break;
861    case TCG_TYPE_V128:
862        tcg_debug_assert(ret >= 16 && arg >= 16);
863        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
864        break;
865    case TCG_TYPE_V256:
866        tcg_debug_assert(ret >= 16 && arg >= 16);
867        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
868        break;
869
870    default:
871        g_assert_not_reached();
872    }
873    return true;
874}
875
876static const int avx2_dup_insn[4] = {
877    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
878    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
879};
880
881static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
882                            TCGReg r, TCGReg a)
883{
884    if (have_avx2) {
885        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
886        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
887    } else {
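        /* Without AVX2, interleave the element with itself to widen it to
           32 bits (MO_8 -> MO_16 -> MO_32 via the fallthroughs below), then
           replicate lane 0 with PSHUFD; MO_64 needs only PUNPCKLQDQ.  */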
888        switch (vece) {
889        case MO_8:
890            /* ??? With zero in a register, use PSHUFB.  */
891            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
892            a = r;
893            /* FALLTHRU */
894        case MO_16:
895            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
896            a = r;
897            /* FALLTHRU */
898        case MO_32:
899            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
900            /* imm8 operand: all output lanes selected from input lane 0.  */
901            tcg_out8(s, 0);
902            break;
903        case MO_64:
904            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
905            break;
906        default:
907            g_assert_not_reached();
908        }
909    }
910    return true;
911}
912
913static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
914                             TCGReg r, TCGReg base, intptr_t offset)
915{
916    if (have_avx2) {
917        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
918        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
919                                 r, 0, base, offset);
920    } else {
921        switch (vece) {
922        case MO_64:
923            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
924            break;
925        case MO_32:
926            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
927            break;
928        case MO_16:
929            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
930            tcg_out8(s, 0); /* imm8 */
931            tcg_out_dup_vec(s, type, vece, r, r);
932            break;
933        case MO_8:
934            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
935            tcg_out8(s, 0); /* imm8 */
936            tcg_out_dup_vec(s, type, vece, r, r);
937            break;
938        default:
939            g_assert_not_reached();
940        }
941    }
942    return true;
943}
944
945static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
946                             TCGReg ret, tcg_target_long arg)
947{
948    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
949
950    if (arg == 0) {
951        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
952        return;
953    }
954    if (arg == -1) {
955        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
956        return;
957    }
958
959    if (TCG_TARGET_REG_BITS == 64) {
960        if (type == TCG_TYPE_V64) {
961            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
962        } else if (have_avx2) {
963            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
964        } else {
965            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
966        }
967        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
968    } else {
969        if (have_avx2) {
970            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
971        } else {
972            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
973        }
974        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
975    }
976}
977
978static void tcg_out_movi(TCGContext *s, TCGType type,
979                         TCGReg ret, tcg_target_long arg)
980{
981    tcg_target_long diff;
982
983    switch (type) {
984    case TCG_TYPE_I32:
985#if TCG_TARGET_REG_BITS == 64
986    case TCG_TYPE_I64:
987#endif
988        if (ret < 16) {
989            break;
990        }
991        /* fallthru */
992    case TCG_TYPE_V64:
993    case TCG_TYPE_V128:
994    case TCG_TYPE_V256:
995        tcg_debug_assert(ret >= 16);
996        tcg_out_dupi_vec(s, type, ret, arg);
997        return;
998    default:
999        g_assert_not_reached();
1000    }
1001
1002    if (arg == 0) {
1003        tgen_arithr(s, ARITH_XOR, ret, ret);
1004        return;
1005    }
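    /* A 32-bit mov zero-extends to 64 bits, so it also covers any 64-bit
       constant whose high 32 bits are clear.  */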
1006    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1007        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1008        tcg_out32(s, arg);
1009        return;
1010    }
1011    if (arg == (int32_t)arg) {
1012        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1013        tcg_out32(s, arg);
1014        return;
1015    }
1016
1017    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
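    /* (lea: REX.W + 0x8d + modrm + disp32 = 7 bytes;
        movq: REX.W + (0xb8 + reg) + imm64 = 10 bytes.)  */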
1018    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1019    if (diff == (int32_t)diff) {
1020        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1021        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1022        tcg_out32(s, diff);
1023        return;
1024    }
1025
1026    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1027    tcg_out64(s, arg);
1028}
1029
1030static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1031{
1032    if (val == (int8_t)val) {
1033        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1034        tcg_out8(s, val);
1035    } else if (val == (int32_t)val) {
1036        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1037        tcg_out32(s, val);
1038    } else {
1039        tcg_abort();
1040    }
1041}
1042
1043static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1044{
1045    /* Given the strength of x86 memory ordering, we only need to care about
1046       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1047       faster than "mfence", so don't bother with the SSE insn.  */
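    /* The three calls below assemble to f0 83 0c 24 00: the lock prefix,
       OPC_ARITH_EvIb, a ModRM byte selecting /1 (ARITH_OR) with the SIB
       escape, the SIB byte for a plain %esp base, and the zero immediate.  */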
1048    if (a0 & TCG_MO_ST_LD) {
1049        tcg_out8(s, 0xf0);
1050        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1051        tcg_out8(s, 0);
1052    }
1053}
1054
1055static inline void tcg_out_push(TCGContext *s, int reg)
1056{
1057    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1058}
1059
1060static inline void tcg_out_pop(TCGContext *s, int reg)
1061{
1062    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1063}
1064
1065static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1066                       TCGReg arg1, intptr_t arg2)
1067{
1068    switch (type) {
1069    case TCG_TYPE_I32:
1070        if (ret < 16) {
1071            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1072        } else {
1073            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1074        }
1075        break;
1076    case TCG_TYPE_I64:
1077        if (ret < 16) {
1078            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1079            break;
1080        }
1081        /* FALLTHRU */
1082    case TCG_TYPE_V64:
1083        /* There is no instruction that can validate 8-byte alignment.  */
1084        tcg_debug_assert(ret >= 16);
1085        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1086        break;
1087    case TCG_TYPE_V128:
1088        /*
1089         * The gvec infrastructure asserts that v128 vector loads
1090         * and stores use a 16-byte aligned offset.  Validate that the
1091         * final pointer is aligned by using an insn that will SIGSEGV.
1092         */
1093        tcg_debug_assert(ret >= 16);
1094        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1095        break;
1096    case TCG_TYPE_V256:
1097        /*
1098         * The gvec infrastructure only requires 16-byte alignment,
1099         * so here we must use an unaligned load.
1100         */
1101        tcg_debug_assert(ret >= 16);
1102        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1103                                 ret, 0, arg1, arg2);
1104        break;
1105    default:
1106        g_assert_not_reached();
1107    }
1108}
1109
1110static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1111                       TCGReg arg1, intptr_t arg2)
1112{
1113    switch (type) {
1114    case TCG_TYPE_I32:
1115        if (arg < 16) {
1116            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1117        } else {
1118            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1119        }
1120        break;
1121    case TCG_TYPE_I64:
1122        if (arg < 16) {
1123            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1124            break;
1125        }
1126        /* FALLTHRU */
1127    case TCG_TYPE_V64:
1128        /* There is no instruction that can validate 8-byte alignment.  */
1129        tcg_debug_assert(arg >= 16);
1130        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1131        break;
1132    case TCG_TYPE_V128:
1133        /*
1134         * The gvec infrastructure asserts that v128 vector loads
1135         * and stores use a 16-byte aligned offset.  Validate that the
1136         * final pointer is aligned by using an insn that will SIGSEGV.
1137         */
1138        tcg_debug_assert(arg >= 16);
1139        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1140        break;
1141    case TCG_TYPE_V256:
1142        /*
1143         * The gvec infrastructure only requires 16-byte alignment,
1144         * so here we must use an unaligned store.
1145         */
1146        tcg_debug_assert(arg >= 16);
1147        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1148                                 arg, 0, arg1, arg2);
1149        break;
1150    default:
1151        g_assert_not_reached();
1152    }
1153}
1154
1155static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1156                        TCGReg base, intptr_t ofs)
1157{
1158    int rexw = 0;
1159    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1160        if (val != (int32_t)val) {
1161            return false;
1162        }
1163        rexw = P_REXW;
1164    } else if (type != TCG_TYPE_I32) {
1165        return false;
1166    }
1167    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1168    tcg_out32(s, val);
1169    return true;
1170}
1171
1172static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1173{
1174    /* Propagate an opcode prefix, such as P_DATA16.  */
1175    int ext = subopc & ~0x7;
1176    subopc &= 0x7;
1177
1178    if (count == 1) {
1179        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1180    } else {
1181        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1182        tcg_out8(s, count);
1183    }
1184}
1185
1186static inline void tcg_out_bswap32(TCGContext *s, int reg)
1187{
1188    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1189}
1190
1191static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1192{
1193    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1194}
1195
1196static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1197{
1198    /* movzbl */
1199    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1200    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1201}
1202
1203static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1204{
1205    /* movsbl */
1206    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1207    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1208}
1209
1210static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1211{
1212    /* movzwl */
1213    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1214}
1215
1216static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1217{
1218    /* movsw[lq] */
1219    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1220}
1221
1222static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1223{
1224    /* 32-bit mov zero extends.  */
1225    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1226}
1227
1228static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1229{
1230    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1231}
1232
1233static inline void tcg_out_bswap64(TCGContext *s, int reg)
1234{
1235    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1236}
1237
1238static void tgen_arithi(TCGContext *s, int c, int r0,
1239                        tcg_target_long val, int cf)
1240{
1241    int rexw = 0;
1242
1243    if (TCG_TARGET_REG_BITS == 64) {
1244        rexw = c & -8;
1245        c &= 7;
1246    }
1247
1248    /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1249       partial flags update stalls on Pentium4 and are not recommended
1250       by current Intel optimization manuals.  */
1251    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1252        int is_inc = (c == ARITH_ADD) ^ (val < 0);
1253        if (TCG_TARGET_REG_BITS == 64) {
1254            /* The single-byte increment encodings are re-tasked as the
1255               REX prefixes.  Use the MODRM encoding.  */
1256            tcg_out_modrm(s, OPC_GRP5 + rexw,
1257                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1258        } else {
1259            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1260        }
1261        return;
1262    }
1263
1264    if (c == ARITH_AND) {
1265        if (TCG_TARGET_REG_BITS == 64) {
1266            if (val == 0xffffffffu) {
1267                tcg_out_ext32u(s, r0, r0);
1268                return;
1269            }
1270            if (val == (uint32_t)val) {
1271                /* AND with no high bits set can use a 32-bit operation.  */
1272                rexw = 0;
1273            }
1274        }
1275        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1276            tcg_out_ext8u(s, r0, r0);
1277            return;
1278        }
1279        if (val == 0xffffu) {
1280            tcg_out_ext16u(s, r0, r0);
1281            return;
1282        }
1283    }
1284
1285    if (val == (int8_t)val) {
1286        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1287        tcg_out8(s, val);
1288        return;
1289    }
1290    if (rexw == 0 || val == (int32_t)val) {
1291        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1292        tcg_out32(s, val);
1293        return;
1294    }
1295
1296    tcg_abort();
1297}
1298
1299static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1300{
1301    if (val != 0) {
1302        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1303    }
1304}
1305
1306/* Use SMALL != 0 to force a short forward branch.  */
1307static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1308{
1309    int32_t val, val1;
1310
1311    if (l->has_value) {
1312        val = tcg_pcrel_diff(s, l->u.value_ptr);
1313        val1 = val - 2;
1314        if ((int8_t)val1 == val1) {
1315            if (opc == -1) {
1316                tcg_out8(s, OPC_JMP_short);
1317            } else {
1318                tcg_out8(s, OPC_JCC_short + opc);
1319            }
1320            tcg_out8(s, val1);
1321        } else {
1322            if (small) {
1323                tcg_abort();
1324            }
1325            if (opc == -1) {
1326                tcg_out8(s, OPC_JMP_long);
1327                tcg_out32(s, val - 5);
1328            } else {
1329                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1330                tcg_out32(s, val - 6);
1331            }
1332        }
1333    } else if (small) {
1334        if (opc == -1) {
1335            tcg_out8(s, OPC_JMP_short);
1336        } else {
1337            tcg_out8(s, OPC_JCC_short + opc);
1338        }
1339        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1340        s->code_ptr += 1;
1341    } else {
1342        if (opc == -1) {
1343            tcg_out8(s, OPC_JMP_long);
1344        } else {
1345            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1346        }
1347        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1348        s->code_ptr += 4;
1349    }
1350}
1351
1352static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1353                        int const_arg2, int rexw)
1354{
1355    if (const_arg2) {
1356        if (arg2 == 0) {
1357            /* test r, r */
1358            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1359        } else {
1360            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1361        }
1362    } else {
1363        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1364    }
1365}
1366
1367static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1368                             TCGArg arg1, TCGArg arg2, int const_arg2,
1369                             TCGLabel *label, int small)
1370{
1371    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1372    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1373}
1374
1375#if TCG_TARGET_REG_BITS == 64
1376static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1377                             TCGArg arg1, TCGArg arg2, int const_arg2,
1378                             TCGLabel *label, int small)
1379{
1380    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1381    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1382}
1383#else
1384/* XXX: we implement it at the target level to avoid having to
1385   handle temporaries that live across basic blocks.  */
1386static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1387                            const int *const_args, int small)
1388{
1389    TCGLabel *label_next = gen_new_label();
1390    TCGLabel *label_this = arg_label(args[5]);
1391
1392    switch(args[4]) {
1393    case TCG_COND_EQ:
1394        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1395                         label_next, 1);
1396        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1397                         label_this, small);
1398        break;
1399    case TCG_COND_NE:
1400        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1401                         label_this, small);
1402        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1403                         label_this, small);
1404        break;
1405    case TCG_COND_LT:
1406        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1407                         label_this, small);
1408        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1409        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1410                         label_this, small);
1411        break;
1412    case TCG_COND_LE:
1413        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1414                         label_this, small);
1415        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1416        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1417                         label_this, small);
1418        break;
1419    case TCG_COND_GT:
1420        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1421                         label_this, small);
1422        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1423        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1424                         label_this, small);
1425        break;
1426    case TCG_COND_GE:
1427        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1428                         label_this, small);
1429        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1430        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1431                         label_this, small);
1432        break;
1433    case TCG_COND_LTU:
1434        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1435                         label_this, small);
1436        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1437        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1438                         label_this, small);
1439        break;
1440    case TCG_COND_LEU:
1441        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1442                         label_this, small);
1443        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1444        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1445                         label_this, small);
1446        break;
1447    case TCG_COND_GTU:
1448        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1449                         label_this, small);
1450        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1451        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1452                         label_this, small);
1453        break;
1454    case TCG_COND_GEU:
1455        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1456                         label_this, small);
1457        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1458        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1459                         label_this, small);
1460        break;
1461    default:
1462        tcg_abort();
1463    }
1464    tcg_out_label(s, label_next);
1465}
1466#endif
1467
1468static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1469                              TCGArg arg1, TCGArg arg2, int const_arg2)
1470{
1471    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1472    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1473    tcg_out_ext8u(s, dest, dest);
1474}
1475
1476#if TCG_TARGET_REG_BITS == 64
1477static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1478                              TCGArg arg1, TCGArg arg2, int const_arg2)
1479{
1480    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1481    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1482    tcg_out_ext8u(s, dest, dest);
1483}
1484#else
1485static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1486                             const int *const_args)
1487{
1488    TCGArg new_args[6];
1489    TCGLabel *label_true, *label_over;
1490
1491    memcpy(new_args, args+1, 5*sizeof(TCGArg));
1492
1493    if (args[0] == args[1] || args[0] == args[2]
1494        || (!const_args[3] && args[0] == args[3])
1495        || (!const_args[4] && args[0] == args[4])) {
1496        /* When the destination overlaps with one of the argument
1497           registers, don't do anything tricky.  */
1498        label_true = gen_new_label();
1499        label_over = gen_new_label();
1500
1501        new_args[5] = label_arg(label_true);
1502        tcg_out_brcond2(s, new_args, const_args+1, 1);
1503
1504        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1505        tcg_out_jxx(s, JCC_JMP, label_over, 1);
1506        tcg_out_label(s, label_true);
1507
1508        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1509        tcg_out_label(s, label_over);
1510    } else {
1511        /* When the destination does not overlap one of the arguments,
1512           clear the destination first, jump if cond false, and emit an
1513           increment in the true case.  This results in smaller code.  */
1514
1515        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1516
1517        label_over = gen_new_label();
1518        new_args[4] = tcg_invert_cond(new_args[4]);
1519        new_args[5] = label_arg(label_over);
1520        tcg_out_brcond2(s, new_args, const_args+1, 1);
1521
1522        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1523        tcg_out_label(s, label_over);
1524    }
1525}
1526#endif
1527
1528static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1529                         TCGReg dest, TCGReg v1)
1530{
1531    if (have_cmov) {
1532        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1533    } else {
1534        TCGLabel *over = gen_new_label();
1535        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1536        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1537        tcg_out_label(s, over);
1538    }
1539}
1540
1541static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1542                              TCGReg c1, TCGArg c2, int const_c2,
1543                              TCGReg v1)
1544{
1545    tcg_out_cmp(s, c1, c2, const_c2, 0);
1546    tcg_out_cmov(s, cond, 0, dest, v1);
1547}
1548
1549#if TCG_TARGET_REG_BITS == 64
1550static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1551                              TCGReg c1, TCGArg c2, int const_c2,
1552                              TCGReg v1)
1553{
1554    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1555    tcg_out_cmov(s, cond, P_REXW, dest, v1);
1556}
1557#endif
1558
1559static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1560                        TCGArg arg2, bool const_a2)
1561{
1562    if (have_bmi1) {
1563        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1564        if (const_a2) {
1565            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1566        } else {
1567            tcg_debug_assert(dest != arg2);
1568            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1569        }
1570    } else {
1571        tcg_debug_assert(dest != arg2);
1572        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1573        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1574    }
1575}
1576
1577static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1578                        TCGArg arg2, bool const_a2)
1579{
1580    if (have_lzcnt) {
1581        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1582        if (const_a2) {
1583            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1584        } else {
1585            tcg_debug_assert(dest != arg2);
1586            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1587        }
1588    } else {
1589        tcg_debug_assert(!const_a2);
1590        tcg_debug_assert(dest != arg1);
1591        tcg_debug_assert(dest != arg2);
1592
1593        /* Recall that the output of BSR is the index not the count.  */
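        /* For a nonzero input, clz = 31 - bsr (or 63 - bsr for 64-bit), and
           31 - n == n ^ 31 for 0 <= n <= 31, so the XOR below converts the
           bit index into a leading-zero count.  */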
1594        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1595        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1596
1597        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1598        tcg_out_cmp(s, arg1, 0, 1, rexw);
1599        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1600    }
1601}
1602
1603static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1604{
1605    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1606
1607    if (disp == (int32_t)disp) {
1608        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1609        tcg_out32(s, disp);
1610    } else {
1611        /* rip-relative addressing into the constant pool.
1612           This is 6 + 8 = 14 bytes, as compared to using an
1613           immediate load of 10 + 6 = 16 bytes, plus we may
1614           be able to re-use the pool constant for more calls.  */
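        /* Encoding sketch: the two bytes below form FF /2 (call) or FF /4
           (jmp) with ModRM mod=00, r/m=101, i.e. a RIP-relative memory
           operand; the 32-bit displacement is then patched to point at an
           8-byte constant-pool slot holding the target address.  */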
1615        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1616        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1617        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1618        tcg_out32(s, 0);
1619    }
1620}
1621
1622static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
1623{
1624    tcg_out_branch(s, 1, dest);
1625}
1626
1627static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1628{
1629    tcg_out_branch(s, 0, dest);
1630}
1631
1632static void tcg_out_nopn(TCGContext *s, int n)
1633{
1634    int i;
1635    /* Emit n - 1 operand size prefixes for the standard one byte nop,
1636     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1637     * duplicate prefix, and all of the interesting recent cores can
1638     * decode and discard the duplicates in a single cycle.
1639     */
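    /* For example, n == 1 emits just 0x90, while n == 3 emits
       0x66 0x66 0x90.  */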
1640    tcg_debug_assert(n >= 1);
1641    for (i = 1; i < n; ++i) {
1642        tcg_out8(s, 0x66);
1643    }
1644    tcg_out8(s, 0x90);
1645}
1646
1647#if defined(CONFIG_SOFTMMU)
1648#include "../tcg-ldst.c.inc"
1649
1650/* helper signature: helper_ret_ld_mmu(CPUArchState *env, target_ulong addr,
1651 *                                     TCGMemOpIdx oi, uintptr_t ra)
1652 */
1653static void * const qemu_ld_helpers[16] = {
1654    [MO_UB]   = helper_ret_ldub_mmu,
1655    [MO_LEUW] = helper_le_lduw_mmu,
1656    [MO_LEUL] = helper_le_ldul_mmu,
1657    [MO_LEQ]  = helper_le_ldq_mmu,
1658    [MO_BEUW] = helper_be_lduw_mmu,
1659    [MO_BEUL] = helper_be_ldul_mmu,
1660    [MO_BEQ]  = helper_be_ldq_mmu,
1661};
1662
1663/* helper signature: helper_ret_st_mmu(CPUArchState *env, target_ulong addr,
1664 *                                     uintxx_t val, TCGMemOpIdx oi, uintptr_t ra)
1665 */
1666static void * const qemu_st_helpers[16] = {
1667    [MO_UB]   = helper_ret_stb_mmu,
1668    [MO_LEUW] = helper_le_stw_mmu,
1669    [MO_LEUL] = helper_le_stl_mmu,
1670    [MO_LEQ]  = helper_le_stq_mmu,
1671    [MO_BEUW] = helper_be_stw_mmu,
1672    [MO_BEUL] = helper_be_stl_mmu,
1673    [MO_BEQ]  = helper_be_stq_mmu,
1674};
1675
1676/* Perform the TLB load and compare.
1677
1678   Inputs:
1679   ADDRLO and ADDRHI contain the low and high part of the address.
1680
1681   MEM_INDEX and OPC are the memory context and memop of the access.
1682
1683   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1684   This should be offsetof addr_read or addr_write.
1685
1686   Outputs:
1687   LABEL_PTRS is filled with the displacement positions of the forward jumps
1688   to the TLB miss case: one entry, or two if the address spans two registers.
1689
1690   Second argument register is loaded with the low part of the address.
1691   In the TLB hit case, it has been adjusted as indicated by the TLB
1692   and so is a host address.  In the TLB miss case, it continues to
1693   hold a guest address.
1694
1695   First argument register is clobbered.  */
1696
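/* Rough shape of the emitted fast path for a 64-bit guest on a 64-bit host
   (illustration only; r0/r1 are the TCG_REG_L0/L1 scratch registers):

       mov   addrlo, r0
       shr   $(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS), r0
       and   mask(env), r0
       add   table(env), r0              # r0 = &CPUTLBEntry for this address
       lea   s_mask-a_mask(addrlo), r1   # or mov, if alignment >= size
       and   $(TARGET_PAGE_MASK | a_mask), r1
       cmp   which(r0), r1
       mov   addrlo, r1
       jne   slow_path
       add   addend(r0), r1              # r1 = host address
*/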
1697static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1698                                    int mem_index, MemOp opc,
1699                                    tcg_insn_unit **label_ptr, int which)
1700{
1701    const TCGReg r0 = TCG_REG_L0;
1702    const TCGReg r1 = TCG_REG_L1;
1703    TCGType ttype = TCG_TYPE_I32;
1704    TCGType tlbtype = TCG_TYPE_I32;
1705    int trexw = 0, hrexw = 0, tlbrexw = 0;
1706    unsigned a_bits = get_alignment_bits(opc);
1707    unsigned s_bits = opc & MO_SIZE;
1708    unsigned a_mask = (1 << a_bits) - 1;
1709    unsigned s_mask = (1 << s_bits) - 1;
1710    target_ulong tlb_mask;
1711
1712    if (TCG_TARGET_REG_BITS == 64) {
1713        if (TARGET_LONG_BITS == 64) {
1714            ttype = TCG_TYPE_I64;
1715            trexw = P_REXW;
1716        }
1717        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1718            hrexw = P_REXW;
1719            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
1720                tlbtype = TCG_TYPE_I64;
1721                tlbrexw = P_REXW;
1722            }
1723        }
1724    }
1725
1726    tcg_out_mov(s, tlbtype, r0, addrlo);
1727    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1728                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1729
1730    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
1731                         TLB_MASK_TABLE_OFS(mem_index) +
1732                         offsetof(CPUTLBDescFast, mask));
1733
1734    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
1735                         TLB_MASK_TABLE_OFS(mem_index) +
1736                         offsetof(CPUTLBDescFast, table));
1737
1738    /* If the required alignment is at least as large as the access, simply
1739       copy the address and mask.  For lesser alignments, check that we don't
1740       cross pages for the complete access.  */
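    /* Worked example: a 4-byte access with no alignment requirement has
       s_mask = 3 and a_mask = 0, so we compare (addr + 3) & TARGET_PAGE_MASK
       against the tag in the TLB entry; an access that straddles a page
       boundary then mismatches and takes the jne below to the slow path.  */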
1741    if (a_bits >= s_bits) {
1742        tcg_out_mov(s, ttype, r1, addrlo);
1743    } else {
1744        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1745    }
1746    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1747    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1748
1749    /* cmp 0(r0), r1 */
1750    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
1751
1752    /* Prepare for both the fast path add of the tlb addend, and the slow
1753       path function argument setup.  */
1754    tcg_out_mov(s, ttype, r1, addrlo);
1755
1756    /* jne slow_path */
1757    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1758    label_ptr[0] = s->code_ptr;
1759    s->code_ptr += 4;
1760
1761    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1762        /* cmp 4(r0), addrhi */
1763        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
1764
1765        /* jne slow_path */
1766        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1767        label_ptr[1] = s->code_ptr;
1768        s->code_ptr += 4;
1769    }
1770
1771    /* TLB Hit.  */
1772
1773    /* add addend(r0), r1 */
1774    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1775                         offsetof(CPUTLBEntry, addend));
1776}
1777
1778/*
1779 * Record the context of a call to the out-of-line helper code for the slow
1780 * path for a load or store, so that we can later generate the correct helper code.
1781 */
1782static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1783                                TCGMemOpIdx oi,
1784                                TCGReg datalo, TCGReg datahi,
1785                                TCGReg addrlo, TCGReg addrhi,
1786                                tcg_insn_unit *raddr,
1787                                tcg_insn_unit **label_ptr)
1788{
1789    TCGLabelQemuLdst *label = new_ldst_label(s);
1790
1791    label->is_ld = is_ld;
1792    label->oi = oi;
1793    label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1794    label->datalo_reg = datalo;
1795    label->datahi_reg = datahi;
1796    label->addrlo_reg = addrlo;
1797    label->addrhi_reg = addrhi;
1798    label->raddr = tcg_splitwx_to_rx(raddr);
1799    label->label_ptr[0] = label_ptr[0];
1800    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1801        label->label_ptr[1] = label_ptr[1];
1802    }
1803}
1804
1805/*
1806 * Generate code for the slow path for a load at the end of the block
1807 */
1808static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1809{
1810    TCGMemOpIdx oi = l->oi;
1811    MemOp opc = get_memop(oi);
1812    TCGReg data_reg;
1813    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1814    int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1815
1816    /* resolve label address */
1817    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1818    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1819        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1820    }
1821
1822    if (TCG_TARGET_REG_BITS == 32) {
1823        int ofs = 0;
1824
1825        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1826        ofs += 4;
1827
1828        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1829        ofs += 4;
1830
1831        if (TARGET_LONG_BITS == 64) {
1832            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1833            ofs += 4;
1834        }
1835
1836        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1837        ofs += 4;
1838
1839        tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1840    } else {
1841        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1842        /* The second argument is already loaded with addrlo.  */
1843        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1844        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1845                     (uintptr_t)l->raddr);
1846    }
1847
1848    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1849
1850    data_reg = l->datalo_reg;
1851    switch (opc & MO_SSIZE) {
1852    case MO_SB:
1853        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1854        break;
1855    case MO_SW:
1856        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1857        break;
1858#if TCG_TARGET_REG_BITS == 64
1859    case MO_SL:
1860        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1861        break;
1862#endif
1863    case MO_UB:
1864    case MO_UW:
1865        /* Note that the helpers have zero-extended to tcg_target_long.  */
1866    case MO_UL:
1867        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1868        break;
1869    case MO_Q:
1870        if (TCG_TARGET_REG_BITS == 64) {
1871            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1872        } else if (data_reg == TCG_REG_EDX) {
1873            /* xchg %edx, %eax */
1874            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1875            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1876        } else {
1877            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1878            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1879        }
1880        break;
1881    default:
1882        tcg_abort();
1883    }
1884
1885    /* Jump to the code corresponding to the next IR of qemu_ld */
1886    tcg_out_jmp(s, l->raddr);
1887    return true;
1888}
1889
1890/*
1891 * Generate code for the slow path for a store at the end of the block
1892 */
1893static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1894{
1895    TCGMemOpIdx oi = l->oi;
1896    MemOp opc = get_memop(oi);
1897    MemOp s_bits = opc & MO_SIZE;
1898    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1899    TCGReg retaddr;
1900
1901    /* resolve label address */
1902    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1903    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1904        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1905    }
1906
1907    if (TCG_TARGET_REG_BITS == 32) {
1908        int ofs = 0;
1909
1910        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1911        ofs += 4;
1912
1913        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1914        ofs += 4;
1915
1916        if (TARGET_LONG_BITS == 64) {
1917            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1918            ofs += 4;
1919        }
1920
1921        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1922        ofs += 4;
1923
1924        if (s_bits == MO_64) {
1925            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1926            ofs += 4;
1927        }
1928
1929        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1930        ofs += 4;
1931
1932        retaddr = TCG_REG_EAX;
1933        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1934        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1935    } else {
1936        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1937        /* The second argument is already loaded with addrlo.  */
1938        tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1939                    tcg_target_call_iarg_regs[2], l->datalo_reg);
1940        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1941
1942        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1943            retaddr = tcg_target_call_iarg_regs[4];
1944            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1945        } else {
1946            retaddr = TCG_REG_RAX;
1947            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1948            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1949                       TCG_TARGET_CALL_STACK_OFFSET);
1950        }
1951    }
1952
1953    /* "Tail call" to the helper, with the return address back inline.  */
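    /* Pushing l->raddr and then jumping means the helper's own RET returns
       directly to the code following the original memory op, exactly as if
       the helper had been called from there.  */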
1954    tcg_out_push(s, retaddr);
1955    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1956    return true;
1957}
1958#elif TCG_TARGET_REG_BITS == 32
1959# define x86_guest_base_seg     0
1960# define x86_guest_base_index   -1
1961# define x86_guest_base_offset  guest_base
1962#else
1963static int x86_guest_base_seg;
1964static int x86_guest_base_index = -1;
1965static int32_t x86_guest_base_offset;
1966# if defined(__x86_64__) && defined(__linux__)
1967#  include <asm/prctl.h>
1968#  include <sys/prctl.h>
1969int arch_prctl(int code, unsigned long addr);
1970static inline int setup_guest_base_seg(void)
1971{
1972    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1973        return P_GS;
1974    }
1975    return 0;
1976}
1977# elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
1978#  include <machine/sysarch.h>
1979static inline int setup_guest_base_seg(void)
1980{
1981    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1982        return P_GS;
1983    }
1984    return 0;
1985}
1986# else
1987static inline int setup_guest_base_seg(void)
1988{
1989    return 0;
1990}
1991# endif
1992#endif /* SOFTMMU */
1993
1994static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1995                                   TCGReg base, int index, intptr_t ofs,
1996                                   int seg, bool is64, MemOp memop)
1997{
1998    bool use_movbe = false;
1999    int rexw = is64 * P_REXW;
2000    int movop = OPC_MOVL_GvEv;
2001
2002    /* Do big-endian loads with movbe.  */
2003    if (memop & MO_BSWAP) {
2004        tcg_debug_assert(have_movbe);
2005        use_movbe = true;
2006        movop = OPC_MOVBE_GyMy;
2007    }
2008
2009    switch (memop & MO_SSIZE) {
2010    case MO_UB:
2011        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
2012                                 base, index, 0, ofs);
2013        break;
2014    case MO_SB:
2015        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
2016                                 base, index, 0, ofs);
2017        break;
2018    case MO_UW:
2019        if (use_movbe) {
2020            /* There is no extending movbe; only the low 16 bits are modified.  */
2021            if (datalo != base && datalo != index) {
2022                /* XOR breaks dependency chains.  */
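                /* The 16-bit movbe writes only the low half of the register,
                   which would otherwise carry a false dependency on its
                   previous contents; zeroing it first also supplies the
                   zero-extension.  */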
2023                tgen_arithr(s, ARITH_XOR, datalo, datalo);
2024                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2025                                         datalo, base, index, 0, ofs);
2026            } else {
2027                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2028                                         datalo, base, index, 0, ofs);
2029                tcg_out_ext16u(s, datalo, datalo);
2030            }
2031        } else {
2032            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
2033                                     base, index, 0, ofs);
2034        }
2035        break;
2036    case MO_SW:
2037        if (use_movbe) {
2038            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2039                                     datalo, base, index, 0, ofs);
2040            tcg_out_ext16s(s, datalo, datalo, rexw);
2041        } else {
2042            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
2043                                     datalo, base, index, 0, ofs);
2044        }
2045        break;
2046    case MO_UL:
2047        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2048        break;
2049#if TCG_TARGET_REG_BITS == 64
2050    case MO_SL:
2051        if (use_movbe) {
2052            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo,
2053                                     base, index, 0, ofs);
2054            tcg_out_ext32s(s, datalo, datalo);
2055        } else {
2056            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
2057                                     base, index, 0, ofs);
2058        }
2059        break;
2060#endif
2061    case MO_Q:
2062        if (TCG_TARGET_REG_BITS == 64) {
2063            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2064                                     base, index, 0, ofs);
2065        } else {
2066            if (use_movbe) {
2067                TCGReg t = datalo;
2068                datalo = datahi;
2069                datahi = t;
2070            }
2071            if (base != datalo) {
2072                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2073                                         base, index, 0, ofs);
2074                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2075                                         base, index, 0, ofs + 4);
2076            } else {
2077                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2078                                         base, index, 0, ofs + 4);
2079                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2080                                         base, index, 0, ofs);
2081            }
2082        }
2083        break;
2084    default:
2085        g_assert_not_reached();
2086    }
2087}
2088
2089/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2090   EAX. It will be useful once fixed-register globals are less
2091   common. */
2092static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2093{
2094    TCGReg datalo, datahi, addrlo;
2095    TCGReg addrhi __attribute__((unused));
2096    TCGMemOpIdx oi;
2097    MemOp opc;
2098#if defined(CONFIG_SOFTMMU)
2099    int mem_index;
2100    tcg_insn_unit *label_ptr[2];
2101#endif
2102
2103    datalo = *args++;
2104    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2105    addrlo = *args++;
2106    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2107    oi = *args++;
2108    opc = get_memop(oi);
2109
2110#if defined(CONFIG_SOFTMMU)
2111    mem_index = get_mmuidx(oi);
2112
2113    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2114                     label_ptr, offsetof(CPUTLBEntry, addr_read));
2115
2116    /* TLB Hit.  */
2117    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2118
2119    /* Record the current context of a load into ldst label */
2120    add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2121                        s->code_ptr, label_ptr);
2122#else
2123    tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2124                           x86_guest_base_offset, x86_guest_base_seg,
2125                           is64, opc);
2126#endif
2127}
2128
2129static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2130                                   TCGReg base, int index, intptr_t ofs,
2131                                   int seg, MemOp memop)
2132{
2133    bool use_movbe = false;
2134    int movop = OPC_MOVL_EvGv;
2135
2136    /*
2137     * Do big-endian stores with movbe or softmmu.
2138     * User-only without movbe will have its swapping done generically.
2139     */
2140    if (memop & MO_BSWAP) {
2141        tcg_debug_assert(have_movbe);
2142        use_movbe = true;
2143        movop = OPC_MOVBE_MyGy;
2144    }
2145
2146    switch (memop & MO_SIZE) {
2147    case MO_8:
2148        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2149        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2150        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2151                                 datalo, base, index, 0, ofs);
2152        break;
2153    case MO_16:
2154        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2155                                 base, index, 0, ofs);
2156        break;
2157    case MO_32:
2158        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2159        break;
2160    case MO_64:
2161        if (TCG_TARGET_REG_BITS == 64) {
2162            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2163                                     base, index, 0, ofs);
2164        } else {
2165            if (use_movbe) {
2166                TCGReg t = datalo;
2167                datalo = datahi;
2168                datahi = t;
2169            }
2170            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2171                                     base, index, 0, ofs);
2172            tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2173                                     base, index, 0, ofs + 4);
2174        }
2175        break;
2176    default:
2177        g_assert_not_reached();
2178    }
2179}
2180
2181static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2182{
2183    TCGReg datalo, datahi, addrlo;
2184    TCGReg addrhi __attribute__((unused));
2185    TCGMemOpIdx oi;
2186    MemOp opc;
2187#if defined(CONFIG_SOFTMMU)
2188    int mem_index;
2189    tcg_insn_unit *label_ptr[2];
2190#endif
2191
2192    datalo = *args++;
2193    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2194    addrlo = *args++;
2195    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2196    oi = *args++;
2197    opc = get_memop(oi);
2198
2199#if defined(CONFIG_SOFTMMU)
2200    mem_index = get_mmuidx(oi);
2201
2202    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2203                     label_ptr, offsetof(CPUTLBEntry, addr_write));
2204
2205    /* TLB Hit.  */
2206    tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2207
2208    /* Record the current context of a store into ldst label */
2209    add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2210                        s->code_ptr, label_ptr);
2211#else
2212    tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2213                           x86_guest_base_offset, x86_guest_base_seg, opc);
2214#endif
2215}
2216
2217static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2218                              const TCGArg *args, const int *const_args)
2219{
2220    TCGArg a0, a1, a2;
2221    int c, const_a2, vexop, rexw = 0;
2222
2223#if TCG_TARGET_REG_BITS == 64
2224# define OP_32_64(x) \
2225        case glue(glue(INDEX_op_, x), _i64): \
2226            rexw = P_REXW; /* FALLTHRU */    \
2227        case glue(glue(INDEX_op_, x), _i32)
2228#else
2229# define OP_32_64(x) \
2230        case glue(glue(INDEX_op_, x), _i32)
2231#endif
2232
2233    /* Hoist the loads of the most common arguments.  */
2234    a0 = args[0];
2235    a1 = args[1];
2236    a2 = args[2];
2237    const_a2 = const_args[2];
2238
2239    switch (opc) {
2240    case INDEX_op_exit_tb:
2241        /* Reuse the zeroing that exists for goto_ptr.  */
2242        if (a0 == 0) {
2243            tcg_out_jmp(s, tcg_code_gen_epilogue);
2244        } else {
2245            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2246            tcg_out_jmp(s, tb_ret_addr);
2247        }
2248        break;
2249    case INDEX_op_goto_tb:
2250        if (s->tb_jmp_insn_offset) {
2251            /* direct jump method */
2252            int gap;
2253            /* jump displacement must be aligned for atomic patching;
2254             * see if we need to add extra nops before jump
2255             * see if we need to add extra nops before the jump
2256            gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
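            /* The JMP below is one opcode byte plus a 4-byte displacement;
               aligning code_ptr + 1 to 4 bytes lets that displacement later
               be rewritten with a single naturally aligned 32-bit store.  */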
2257            if (gap != 1) {
2258                tcg_out_nopn(s, gap - 1);
2259            }
2260            tcg_out8(s, OPC_JMP_long); /* jmp im */
2261            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2262            tcg_out32(s, 0);
2263        } else {
2264            /* indirect jump method */
2265            tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2266                                 (intptr_t)(s->tb_jmp_target_addr + a0));
2267        }
2268        set_jmp_reset_offset(s, a0);
2269        break;
2270    case INDEX_op_goto_ptr:
2271        /* jmp to the given host address (could be epilogue) */
2272        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2273        break;
2274    case INDEX_op_br:
2275        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2276        break;
2277    OP_32_64(ld8u):
2278        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2279        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2280        break;
2281    OP_32_64(ld8s):
2282        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2283        break;
2284    OP_32_64(ld16u):
2285        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2286        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2287        break;
2288    OP_32_64(ld16s):
2289        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2290        break;
2291#if TCG_TARGET_REG_BITS == 64
2292    case INDEX_op_ld32u_i64:
2293#endif
2294    case INDEX_op_ld_i32:
2295        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2296        break;
2297
2298    OP_32_64(st8):
2299        if (const_args[0]) {
2300            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2301            tcg_out8(s, a0);
2302        } else {
2303            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2304        }
2305        break;
2306    OP_32_64(st16):
2307        if (const_args[0]) {
2308            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2309            tcg_out16(s, a0);
2310        } else {
2311            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2312        }
2313        break;
2314#if TCG_TARGET_REG_BITS == 64
2315    case INDEX_op_st32_i64:
2316#endif
2317    case INDEX_op_st_i32:
2318        if (const_args[0]) {
2319            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2320            tcg_out32(s, a0);
2321        } else {
2322            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2323        }
2324        break;
2325
2326    OP_32_64(add):
2327        /* For 3-operand addition, use LEA.  */
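        /* For example:
               add a0, a1, a2    ->  lea (a1,a2), a0
               add a0, a1, $imm  ->  lea $imm(a1), a0
           avoiding the two-operand ADD's requirement that a0 == a1.  */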
2328        if (a0 != a1) {
2329            TCGArg c3 = 0;
2330            if (const_a2) {
2331                c3 = a2, a2 = -1;
2332            } else if (a0 == a2) {
2333                /* Watch out for dest = src + dest, since we've removed
2334                   the matching constraint on the add.  */
2335                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2336                break;
2337            }
2338
2339            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2340            break;
2341        }
2342        c = ARITH_ADD;
2343        goto gen_arith;
2344    OP_32_64(sub):
2345        c = ARITH_SUB;
2346        goto gen_arith;
2347    OP_32_64(and):
2348        c = ARITH_AND;
2349        goto gen_arith;
2350    OP_32_64(or):
2351        c = ARITH_OR;
2352        goto gen_arith;
2353    OP_32_64(xor):
2354        c = ARITH_XOR;
2355        goto gen_arith;
2356    gen_arith:
2357        if (const_a2) {
2358            tgen_arithi(s, c + rexw, a0, a2, 0);
2359        } else {
2360            tgen_arithr(s, c + rexw, a0, a2);
2361        }
2362        break;
2363
2364    OP_32_64(andc):
2365        if (const_a2) {
2366            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2367            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2368        } else {
2369            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2370        }
2371        break;
2372
2373    OP_32_64(mul):
2374        if (const_a2) {
2375            int32_t val;
2376            val = a2;
2377            if (val == (int8_t)val) {
2378                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2379                tcg_out8(s, val);
2380            } else {
2381                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2382                tcg_out32(s, val);
2383            }
2384        } else {
2385            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2386        }
2387        break;
2388
2389    OP_32_64(div2):
2390        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2391        break;
2392    OP_32_64(divu2):
2393        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2394        break;
2395
2396    OP_32_64(shl):
2397        /* For small constant 3-operand shift, use LEA.  */
2398        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2399            if (a2 - 1 == 0) {
2400                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2401                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2402            } else {
2403                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2404                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2405            }
2406            break;
2407        }
2408        c = SHIFT_SHL;
2409        vexop = OPC_SHLX;
2410        goto gen_shift_maybe_vex;
2411    OP_32_64(shr):
2412        c = SHIFT_SHR;
2413        vexop = OPC_SHRX;
2414        goto gen_shift_maybe_vex;
2415    OP_32_64(sar):
2416        c = SHIFT_SAR;
2417        vexop = OPC_SARX;
2418        goto gen_shift_maybe_vex;
2419    OP_32_64(rotl):
2420        c = SHIFT_ROL;
2421        goto gen_shift;
2422    OP_32_64(rotr):
2423        c = SHIFT_ROR;
2424        goto gen_shift;
2425    gen_shift_maybe_vex:
2426        if (have_bmi2) {
2427            if (!const_a2) {
2428                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2429                break;
2430            }
2431            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2432        }
2433        /* FALLTHRU */
2434    gen_shift:
2435        if (const_a2) {
2436            tcg_out_shifti(s, c + rexw, a0, a2);
2437        } else {
2438            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2439        }
2440        break;
2441
2442    OP_32_64(ctz):
2443        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2444        break;
2445    OP_32_64(clz):
2446        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2447        break;
2448    OP_32_64(ctpop):
2449        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2450        break;
2451
2452    case INDEX_op_brcond_i32:
2453        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2454        break;
2455    case INDEX_op_setcond_i32:
2456        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2457        break;
2458    case INDEX_op_movcond_i32:
2459        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2460        break;
2461
2462    OP_32_64(bswap16):
2463        tcg_out_rolw_8(s, a0);
2464        break;
2465    OP_32_64(bswap32):
2466        tcg_out_bswap32(s, a0);
2467        break;
2468
2469    OP_32_64(neg):
2470        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2471        break;
2472    OP_32_64(not):
2473        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2474        break;
2475
2476    OP_32_64(ext8s):
2477        tcg_out_ext8s(s, a0, a1, rexw);
2478        break;
2479    OP_32_64(ext16s):
2480        tcg_out_ext16s(s, a0, a1, rexw);
2481        break;
2482    OP_32_64(ext8u):
2483        tcg_out_ext8u(s, a0, a1);
2484        break;
2485    OP_32_64(ext16u):
2486        tcg_out_ext16u(s, a0, a1);
2487        break;
2488
2489    case INDEX_op_qemu_ld_i32:
2490        tcg_out_qemu_ld(s, args, 0);
2491        break;
2492    case INDEX_op_qemu_ld_i64:
2493        tcg_out_qemu_ld(s, args, 1);
2494        break;
2495    case INDEX_op_qemu_st_i32:
2496    case INDEX_op_qemu_st8_i32:
2497        tcg_out_qemu_st(s, args, 0);
2498        break;
2499    case INDEX_op_qemu_st_i64:
2500        tcg_out_qemu_st(s, args, 1);
2501        break;
2502
2503    OP_32_64(mulu2):
2504        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2505        break;
2506    OP_32_64(muls2):
2507        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2508        break;
2509    OP_32_64(add2):
2510        if (const_args[4]) {
2511            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2512        } else {
2513            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2514        }
2515        if (const_args[5]) {
2516            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2517        } else {
2518            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2519        }
2520        break;
2521    OP_32_64(sub2):
2522        if (const_args[4]) {
2523            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2524        } else {
2525            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2526        }
2527        if (const_args[5]) {
2528            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2529        } else {
2530            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2531        }
2532        break;
2533
2534#if TCG_TARGET_REG_BITS == 32
2535    case INDEX_op_brcond2_i32:
2536        tcg_out_brcond2(s, args, const_args, 0);
2537        break;
2538    case INDEX_op_setcond2_i32:
2539        tcg_out_setcond2(s, args, const_args);
2540        break;
2541#else /* TCG_TARGET_REG_BITS == 64 */
2542    case INDEX_op_ld32s_i64:
2543        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2544        break;
2545    case INDEX_op_ld_i64:
2546        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2547        break;
2548    case INDEX_op_st_i64:
2549        if (const_args[0]) {
2550            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2551            tcg_out32(s, a0);
2552        } else {
2553            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2554        }
2555        break;
2556
2557    case INDEX_op_brcond_i64:
2558        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2559        break;
2560    case INDEX_op_setcond_i64:
2561        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2562        break;
2563    case INDEX_op_movcond_i64:
2564        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2565        break;
2566
2567    case INDEX_op_bswap64_i64:
2568        tcg_out_bswap64(s, a0);
2569        break;
2570    case INDEX_op_extu_i32_i64:
2571    case INDEX_op_ext32u_i64:
2572    case INDEX_op_extrl_i64_i32:
2573        tcg_out_ext32u(s, a0, a1);
2574        break;
2575    case INDEX_op_ext_i32_i64:
2576    case INDEX_op_ext32s_i64:
2577        tcg_out_ext32s(s, a0, a1);
2578        break;
2579    case INDEX_op_extrh_i64_i32:
2580        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2581        break;
2582#endif
2583
2584    OP_32_64(deposit):
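        /* The "Q"/"0" constraints chosen for deposit (see tcg_target_op_def
           below) force a0 == a1 and keep the operands encodable as byte
           registers, so the insertion is a partial-register store: the low
           byte, the second byte (a0 + 4 selects the %ah-style register), or
           the low 16 bits.  */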
2585        if (args[3] == 0 && args[4] == 8) {
2586            /* load bits 0..7 */
2587            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2588        } else if (args[3] == 8 && args[4] == 8) {
2589            /* load bits 8..15 */
2590            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2591        } else if (args[3] == 0 && args[4] == 16) {
2592            /* load bits 0..15 */
2593            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2594        } else {
2595            tcg_abort();
2596        }
2597        break;
2598
2599    case INDEX_op_extract_i64:
2600        if (a2 + args[3] == 32) {
2601            /* This is a 32-bit zero-extending right shift.  */
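            /* With ofs + len == 32, extract(x, ofs, len) == (uint32_t)x >> ofs;
               the 32-bit mov below already zero-extends the upper half.  */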
2602            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2603            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2604            break;
2605        }
2606        /* FALLTHRU */
2607    case INDEX_op_extract_i32:
2608        /* On the off-chance that we can use the high-byte registers.
2609           Otherwise we emit the same ext16 + shift pattern that we
2610           would have gotten from the normal tcg-op.c expansion.  */
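        /* For a1 < 4, the encoding a1 + 4 selects the matching high-byte
           register (%ah/%ch/%dh/%bh), so extract(x, 8, 8) is a single movzbl
           from it; a0 < 8 avoids a REX prefix, which would make the
           high-byte registers inaccessible.  */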
2611        tcg_debug_assert(a2 == 8 && args[3] == 8);
2612        if (a1 < 4 && a0 < 8) {
2613            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2614        } else {
2615            tcg_out_ext16u(s, a0, a1);
2616            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2617        }
2618        break;
2619
2620    case INDEX_op_sextract_i32:
2621        /* We don't implement sextract_i64, as we cannot sign-extend to
2622           64 bits without using the REX prefix that explicitly excludes
2623           access to the high-byte registers.  */
2624        tcg_debug_assert(a2 == 8 && args[3] == 8);
2625        if (a1 < 4 && a0 < 8) {
2626            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2627        } else {
2628            tcg_out_ext16s(s, a0, a1, 0);
2629            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2630        }
2631        break;
2632
2633    OP_32_64(extract2):
2634        /* Note that SHRD outputs to the r/m operand.  */
2635        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2636        tcg_out8(s, args[3]);
2637        break;
2638
2639    case INDEX_op_mb:
2640        tcg_out_mb(s, a0);
2641        break;
2642    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2643    case INDEX_op_mov_i64:
2644    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
2645    case INDEX_op_movi_i64:
2646    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2647    default:
2648        tcg_abort();
2649    }
2650
2651#undef OP_32_64
2652}
2653
2654static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2655                           unsigned vecl, unsigned vece,
2656                           const TCGArg *args, const int *const_args)
2657{
2658    static int const add_insn[4] = {
2659        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2660    };
2661    static int const ssadd_insn[4] = {
2662        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2663    };
2664    static int const usadd_insn[4] = {
2665        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2666    };
2667    static int const sub_insn[4] = {
2668        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2669    };
2670    static int const sssub_insn[4] = {
2671        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2672    };
2673    static int const ussub_insn[4] = {
2674        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2675    };
2676    static int const mul_insn[4] = {
2677        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2678    };
2679    static int const shift_imm_insn[4] = {
2680        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2681    };
2682    static int const cmpeq_insn[4] = {
2683        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2684    };
2685    static int const cmpgt_insn[4] = {
2686        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2687    };
2688    static int const punpckl_insn[4] = {
2689        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2690    };
2691    static int const punpckh_insn[4] = {
2692        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2693    };
2694    static int const packss_insn[4] = {
2695        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2696    };
2697    static int const packus_insn[4] = {
2698        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2699    };
2700    static int const smin_insn[4] = {
2701        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
2702    };
2703    static int const smax_insn[4] = {
2704        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
2705    };
2706    static int const umin_insn[4] = {
2707        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
2708    };
2709    static int const umax_insn[4] = {
2710        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
2711    };
2712    static int const shlv_insn[4] = {
2713        /* TODO: AVX512 adds support for MO_16.  */
2714        OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
2715    };
2716    static int const shrv_insn[4] = {
2717        /* TODO: AVX512 adds support for MO_16.  */
2718        OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
2719    };
2720    static int const sarv_insn[4] = {
2721        /* TODO: AVX512 adds support for MO_16, MO_64.  */
2722        OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
2723    };
2724    static int const shls_insn[4] = {
2725        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2726    };
2727    static int const shrs_insn[4] = {
2728        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2729    };
2730    static int const sars_insn[4] = {
2731        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
2732    };
2733    static int const abs_insn[4] = {
2734        /* TODO: AVX512 adds support for MO_64.  */
2735        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
2736    };
2737
2738    TCGType type = vecl + TCG_TYPE_V64;
2739    int insn, sub;
2740    TCGArg a0, a1, a2;
2741
2742    a0 = args[0];
2743    a1 = args[1];
2744    a2 = args[2];
2745
2746    switch (opc) {
2747    case INDEX_op_add_vec:
2748        insn = add_insn[vece];
2749        goto gen_simd;
2750    case INDEX_op_ssadd_vec:
2751        insn = ssadd_insn[vece];
2752        goto gen_simd;
2753    case INDEX_op_usadd_vec:
2754        insn = usadd_insn[vece];
2755        goto gen_simd;
2756    case INDEX_op_sub_vec:
2757        insn = sub_insn[vece];
2758        goto gen_simd;
2759    case INDEX_op_sssub_vec:
2760        insn = sssub_insn[vece];
2761        goto gen_simd;
2762    case INDEX_op_ussub_vec:
2763        insn = ussub_insn[vece];
2764        goto gen_simd;
2765    case INDEX_op_mul_vec:
2766        insn = mul_insn[vece];
2767        goto gen_simd;
2768    case INDEX_op_and_vec:
2769        insn = OPC_PAND;
2770        goto gen_simd;
2771    case INDEX_op_or_vec:
2772        insn = OPC_POR;
2773        goto gen_simd;
2774    case INDEX_op_xor_vec:
2775        insn = OPC_PXOR;
2776        goto gen_simd;
2777    case INDEX_op_smin_vec:
2778        insn = smin_insn[vece];
2779        goto gen_simd;
2780    case INDEX_op_umin_vec:
2781        insn = umin_insn[vece];
2782        goto gen_simd;
2783    case INDEX_op_smax_vec:
2784        insn = smax_insn[vece];
2785        goto gen_simd;
2786    case INDEX_op_umax_vec:
2787        insn = umax_insn[vece];
2788        goto gen_simd;
2789    case INDEX_op_shlv_vec:
2790        insn = shlv_insn[vece];
2791        goto gen_simd;
2792    case INDEX_op_shrv_vec:
2793        insn = shrv_insn[vece];
2794        goto gen_simd;
2795    case INDEX_op_sarv_vec:
2796        insn = sarv_insn[vece];
2797        goto gen_simd;
2798    case INDEX_op_shls_vec:
2799        insn = shls_insn[vece];
2800        goto gen_simd;
2801    case INDEX_op_shrs_vec:
2802        insn = shrs_insn[vece];
2803        goto gen_simd;
2804    case INDEX_op_sars_vec:
2805        insn = sars_insn[vece];
2806        goto gen_simd;
2807    case INDEX_op_x86_punpckl_vec:
2808        insn = punpckl_insn[vece];
2809        goto gen_simd;
2810    case INDEX_op_x86_punpckh_vec:
2811        insn = punpckh_insn[vece];
2812        goto gen_simd;
2813    case INDEX_op_x86_packss_vec:
2814        insn = packss_insn[vece];
2815        goto gen_simd;
2816    case INDEX_op_x86_packus_vec:
2817        insn = packus_insn[vece];
2818        goto gen_simd;
2819#if TCG_TARGET_REG_BITS == 32
2820    case INDEX_op_dup2_vec:
2821        /* First merge the two 32-bit inputs to a single 64-bit element. */
2822        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
2823        /* Then replicate the 64-bit elements across the rest of the vector. */
2824        if (type != TCG_TYPE_V64) {
2825            tcg_out_dup_vec(s, type, MO_64, a0, a0);
2826        }
2827        break;
2828#endif
2829    case INDEX_op_abs_vec:
2830        insn = abs_insn[vece];
2831        a2 = a1;
2832        a1 = 0;
2833        goto gen_simd;
2834    gen_simd:
2835        tcg_debug_assert(insn != OPC_UD2);
2836        if (type == TCG_TYPE_V256) {
2837            insn |= P_VEXL;
2838        }
2839        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2840        break;
2841
2842    case INDEX_op_cmp_vec:
2843        sub = args[3];
2844        if (sub == TCG_COND_EQ) {
2845            insn = cmpeq_insn[vece];
2846        } else if (sub == TCG_COND_GT) {
2847            insn = cmpgt_insn[vece];
2848        } else {
2849            g_assert_not_reached();
2850        }
2851        goto gen_simd;
2852
2853    case INDEX_op_andc_vec:
2854        insn = OPC_PANDN;
2855        if (type == TCG_TYPE_V256) {
2856            insn |= P_VEXL;
2857        }
2858        tcg_out_vex_modrm(s, insn, a0, a2, a1);
2859        break;
2860
2861    case INDEX_op_shli_vec:
2862        sub = 6;
2863        goto gen_shift;
2864    case INDEX_op_shri_vec:
2865        sub = 2;
2866        goto gen_shift;
2867    case INDEX_op_sari_vec:
2868        tcg_debug_assert(vece != MO_64);
2869        sub = 4;
2870    gen_shift:
2871        tcg_debug_assert(vece != MO_8);
2872        insn = shift_imm_insn[vece];
2873        if (type == TCG_TYPE_V256) {
2874            insn |= P_VEXL;
2875        }
2876        tcg_out_vex_modrm(s, insn, sub, a0, a1);
2877        tcg_out8(s, a2);
2878        break;
2879
2880    case INDEX_op_ld_vec:
2881        tcg_out_ld(s, type, a0, a1, a2);
2882        break;
2883    case INDEX_op_st_vec:
2884        tcg_out_st(s, type, a0, a1, a2);
2885        break;
2886    case INDEX_op_dupm_vec:
2887        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2888        break;
2889
2890    case INDEX_op_x86_shufps_vec:
2891        insn = OPC_SHUFPS;
2892        sub = args[3];
2893        goto gen_simd_imm8;
2894    case INDEX_op_x86_blend_vec:
2895        if (vece == MO_16) {
2896            insn = OPC_PBLENDW;
2897        } else if (vece == MO_32) {
2898            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2899        } else {
2900            g_assert_not_reached();
2901        }
2902        sub = args[3];
2903        goto gen_simd_imm8;
2904    case INDEX_op_x86_vperm2i128_vec:
2905        insn = OPC_VPERM2I128;
2906        sub = args[3];
2907        goto gen_simd_imm8;
2908    gen_simd_imm8:
2909        if (type == TCG_TYPE_V256) {
2910            insn |= P_VEXL;
2911        }
2912        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2913        tcg_out8(s, sub);
2914        break;
2915
2916    case INDEX_op_x86_vpblendvb_vec:
2917        insn = OPC_VPBLENDVB;
2918        if (type == TCG_TYPE_V256) {
2919            insn |= P_VEXL;
2920        }
2921        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2922        tcg_out8(s, args[3] << 4);
2923        break;
2924
2925    case INDEX_op_x86_psrldq_vec:
2926        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2927        tcg_out8(s, a2);
2928        break;
2929
2930    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
2931    case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi.  */
2932    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
2933    default:
2934        g_assert_not_reached();
2935    }
2936}
2937
2938static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
2939{
2940    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
2941    static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
2942    static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
2943    static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
2944    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
2945    static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
2946    static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
2947    static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
2948    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
2949    static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
2950    static const TCGTargetOpDef r_0_r = { .args_ct_str = { "r", "0", "r" } };
2951    static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
2952    static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
2953    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
2954    static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
2955    static const TCGTargetOpDef s_L = { .args_ct_str = { "s", "L" } };
2956    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
2957    static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
2958    static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
2959    static const TCGTargetOpDef s_L_L = { .args_ct_str = { "s", "L", "L" } };
2960    static const TCGTargetOpDef r_r_L_L
2961        = { .args_ct_str = { "r", "r", "L", "L" } };
2962    static const TCGTargetOpDef L_L_L_L
2963        = { .args_ct_str = { "L", "L", "L", "L" } };
2964    static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
2965    static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
2966    static const TCGTargetOpDef x_x_x_x
2967        = { .args_ct_str = { "x", "x", "x", "x" } };
2968    static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
2969
2970    switch (op) {
2971    case INDEX_op_goto_ptr:
2972        return &r;
2973
2974    case INDEX_op_ld8u_i32:
2975    case INDEX_op_ld8u_i64:
2976    case INDEX_op_ld8s_i32:
2977    case INDEX_op_ld8s_i64:
2978    case INDEX_op_ld16u_i32:
2979    case INDEX_op_ld16u_i64:
2980    case INDEX_op_ld16s_i32:
2981    case INDEX_op_ld16s_i64:
2982    case INDEX_op_ld_i32:
2983    case INDEX_op_ld32u_i64:
2984    case INDEX_op_ld32s_i64:
2985    case INDEX_op_ld_i64:
2986        return &r_r;
2987
2988    case INDEX_op_st8_i32:
2989    case INDEX_op_st8_i64:
2990        return &qi_r;
2991    case INDEX_op_st16_i32:
2992    case INDEX_op_st16_i64:
2993    case INDEX_op_st_i32:
2994    case INDEX_op_st32_i64:
2995        return &ri_r;
2996    case INDEX_op_st_i64:
2997        return &re_r;
2998
2999    case INDEX_op_add_i32:
3000    case INDEX_op_add_i64:
3001        return &r_r_re;
3002    case INDEX_op_sub_i32:
3003    case INDEX_op_sub_i64:
3004    case INDEX_op_mul_i32:
3005    case INDEX_op_mul_i64:
3006    case INDEX_op_or_i32:
3007    case INDEX_op_or_i64:
3008    case INDEX_op_xor_i32:
3009    case INDEX_op_xor_i64:
3010        return &r_0_re;
3011
3012    case INDEX_op_and_i32:
3013    case INDEX_op_and_i64:
3014        {
3015            static const TCGTargetOpDef and
3016                = { .args_ct_str = { "r", "0", "reZ" } };
3017            return &and;
3018        }
3019        break;
3020    case INDEX_op_andc_i32:
3021    case INDEX_op_andc_i64:
3022        {
3023            static const TCGTargetOpDef andc
3024                = { .args_ct_str = { "r", "r", "rI" } };
3025            return &andc;
3026        }
3027        break;
3028
3029    case INDEX_op_shl_i32:
3030    case INDEX_op_shl_i64:
3031    case INDEX_op_shr_i32:
3032    case INDEX_op_shr_i64:
3033    case INDEX_op_sar_i32:
3034    case INDEX_op_sar_i64:
3035        return have_bmi2 ? &r_r_ri : &r_0_ci;
3036    case INDEX_op_rotl_i32:
3037    case INDEX_op_rotl_i64:
3038    case INDEX_op_rotr_i32:
3039    case INDEX_op_rotr_i64:
3040        return &r_0_ci;
3041
3042    case INDEX_op_brcond_i32:
3043    case INDEX_op_brcond_i64:
3044        return &r_re;
3045
3046    case INDEX_op_bswap16_i32:
3047    case INDEX_op_bswap16_i64:
3048    case INDEX_op_bswap32_i32:
3049    case INDEX_op_bswap32_i64:
3050    case INDEX_op_bswap64_i64:
3051    case INDEX_op_neg_i32:
3052    case INDEX_op_neg_i64:
3053    case INDEX_op_not_i32:
3054    case INDEX_op_not_i64:
3055    case INDEX_op_extrh_i64_i32:
3056        return &r_0;
3057
3058    case INDEX_op_ext8s_i32:
3059    case INDEX_op_ext8s_i64:
3060    case INDEX_op_ext8u_i32:
3061    case INDEX_op_ext8u_i64:
3062        return &r_q;
3063    case INDEX_op_ext16s_i32:
3064    case INDEX_op_ext16s_i64:
3065    case INDEX_op_ext16u_i32:
3066    case INDEX_op_ext16u_i64:
3067    case INDEX_op_ext32s_i64:
3068    case INDEX_op_ext32u_i64:
3069    case INDEX_op_ext_i32_i64:
3070    case INDEX_op_extu_i32_i64:
3071    case INDEX_op_extrl_i64_i32:
3072    case INDEX_op_extract_i32:
3073    case INDEX_op_extract_i64:
3074    case INDEX_op_sextract_i32:
3075    case INDEX_op_ctpop_i32:
3076    case INDEX_op_ctpop_i64:
3077        return &r_r;
3078    case INDEX_op_extract2_i32:
3079    case INDEX_op_extract2_i64:
3080        return &r_0_r;
3081
3082    case INDEX_op_deposit_i32:
3083    case INDEX_op_deposit_i64:
3084        {
3085            static const TCGTargetOpDef dep
3086                = { .args_ct_str = { "Q", "0", "Q" } };
3087            return &dep;
3088        }
3089    case INDEX_op_setcond_i32:
3090    case INDEX_op_setcond_i64:
3091        {
3092            static const TCGTargetOpDef setc
3093                = { .args_ct_str = { "q", "r", "re" } };
3094            return &setc;
3095        }
3096    case INDEX_op_movcond_i32:
3097    case INDEX_op_movcond_i64:
3098        {
3099            static const TCGTargetOpDef movc
3100                = { .args_ct_str = { "r", "r", "re", "r", "0" } };
3101            return &movc;
3102        }
3103    case INDEX_op_div2_i32:
3104    case INDEX_op_div2_i64:
3105    case INDEX_op_divu2_i32:
3106    case INDEX_op_divu2_i64:
3107        {
3108            static const TCGTargetOpDef div2
3109                = { .args_ct_str = { "a", "d", "0", "1", "r" } };
3110            return &div2;
3111        }
3112    case INDEX_op_mulu2_i32:
3113    case INDEX_op_mulu2_i64:
3114    case INDEX_op_muls2_i32:
3115    case INDEX_op_muls2_i64:
3116        {
3117            static const TCGTargetOpDef mul2
3118                = { .args_ct_str = { "a", "d", "a", "r" } };
3119            return &mul2;
3120        }
3121    case INDEX_op_add2_i32:
3122    case INDEX_op_add2_i64:
3123    case INDEX_op_sub2_i32:
3124    case INDEX_op_sub2_i64:
3125        {
3126            static const TCGTargetOpDef arith2
3127                = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
3128            return &arith2;
3129        }
3130    case INDEX_op_ctz_i32:
3131    case INDEX_op_ctz_i64:
3132        {
3133            static const TCGTargetOpDef ctz[2] = {
3134                { .args_ct_str = { "&r", "r", "r" } },
3135                { .args_ct_str = { "&r", "r", "rW" } },
3136            };
3137            return &ctz[have_bmi1];
3138        }
3139    case INDEX_op_clz_i32:
3140    case INDEX_op_clz_i64:
3141        {
3142            static const TCGTargetOpDef clz[2] = {
3143                { .args_ct_str = { "&r", "r", "r" } },
3144                { .args_ct_str = { "&r", "r", "rW" } },
3145            };
3146            return &clz[have_lzcnt];
3147        }
3148
3149    case INDEX_op_qemu_ld_i32:
3150        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
3151    case INDEX_op_qemu_st_i32:
3152        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
3153    case INDEX_op_qemu_st8_i32:
3154        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &s_L : &s_L_L;
3155    case INDEX_op_qemu_ld_i64:
3156        return (TCG_TARGET_REG_BITS == 64 ? &r_L
3157                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
3158                : &r_r_L_L);
3159    case INDEX_op_qemu_st_i64:
3160        return (TCG_TARGET_REG_BITS == 64 ? &L_L
3161                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
3162                : &L_L_L_L);
3163
3164    case INDEX_op_brcond2_i32:
3165        {
3166            static const TCGTargetOpDef b2
3167                = { .args_ct_str = { "r", "r", "ri", "ri" } };
3168            return &b2;
3169        }
3170    case INDEX_op_setcond2_i32:
3171        {
3172            static const TCGTargetOpDef s2
3173                = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
3174            return &s2;
3175        }
3176
3177    case INDEX_op_ld_vec:
3178    case INDEX_op_st_vec:
3179    case INDEX_op_dupm_vec:
3180        return &x_r;
3181
3182    case INDEX_op_add_vec:
3183    case INDEX_op_sub_vec:
3184    case INDEX_op_mul_vec:
3185    case INDEX_op_and_vec:
3186    case INDEX_op_or_vec:
3187    case INDEX_op_xor_vec:
3188    case INDEX_op_andc_vec:
3189    case INDEX_op_ssadd_vec:
3190    case INDEX_op_usadd_vec:
3191    case INDEX_op_sssub_vec:
3192    case INDEX_op_ussub_vec:
3193    case INDEX_op_smin_vec:
3194    case INDEX_op_umin_vec:
3195    case INDEX_op_smax_vec:
3196    case INDEX_op_umax_vec:
3197    case INDEX_op_shlv_vec:
3198    case INDEX_op_shrv_vec:
3199    case INDEX_op_sarv_vec:
3200    case INDEX_op_shls_vec:
3201    case INDEX_op_shrs_vec:
3202    case INDEX_op_sars_vec:
3203    case INDEX_op_rotls_vec:
3204    case INDEX_op_cmp_vec:
3205    case INDEX_op_x86_shufps_vec:
3206    case INDEX_op_x86_blend_vec:
3207    case INDEX_op_x86_packss_vec:
3208    case INDEX_op_x86_packus_vec:
3209    case INDEX_op_x86_vperm2i128_vec:
3210    case INDEX_op_x86_punpckl_vec:
3211    case INDEX_op_x86_punpckh_vec:
3212#if TCG_TARGET_REG_BITS == 32
3213    case INDEX_op_dup2_vec:
3214#endif
3215        return &x_x_x;
3216    case INDEX_op_abs_vec:
3217    case INDEX_op_dup_vec:
3218    case INDEX_op_shli_vec:
3219    case INDEX_op_shri_vec:
3220    case INDEX_op_sari_vec:
3221    case INDEX_op_x86_psrldq_vec:
3222        return &x_x;
3223    case INDEX_op_x86_vpblendvb_vec:
3224        return &x_x_x_x;
3225
3226    default:
3227        break;
3228    }
3229    return NULL;
3230}
3231
3232int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3233{
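    /*
     * Return 1 if the opcode is supported natively, 0 if it is not
     * supported, and -1 if it should be expanded via tcg_expand_vec_op.
     */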
3234    switch (opc) {
3235    case INDEX_op_add_vec:
3236    case INDEX_op_sub_vec:
3237    case INDEX_op_and_vec:
3238    case INDEX_op_or_vec:
3239    case INDEX_op_xor_vec:
3240    case INDEX_op_andc_vec:
3241        return 1;
3242    case INDEX_op_rotli_vec:
3243    case INDEX_op_cmp_vec:
3244    case INDEX_op_cmpsel_vec:
3245        return -1;
3246
3247    case INDEX_op_shli_vec:
3248    case INDEX_op_shri_vec:
3249        /* We must expand the operation for MO_8.  */
3250        return vece == MO_8 ? -1 : 1;
3251
3252    case INDEX_op_sari_vec:
3253        /* We must expand the operation for MO_8.  */
3254        if (vece == MO_8) {
3255            return -1;
3256        }
3257        /* We can emulate this for MO_64, but it does not pay off
3258           unless we're producing at least 4 values.  */
3259        if (vece == MO_64) {
3260            return type >= TCG_TYPE_V256 ? -1 : 0;
3261        }
3262        return 1;
3263
3264    case INDEX_op_shls_vec:
3265    case INDEX_op_shrs_vec:
3266        return vece >= MO_16;
3267    case INDEX_op_sars_vec:
3268        return vece >= MO_16 && vece <= MO_32;
3269    case INDEX_op_rotls_vec:
3270        return vece >= MO_16 ? -1 : 0;
3271
3272    case INDEX_op_shlv_vec:
3273    case INDEX_op_shrv_vec:
3274        return have_avx2 && vece >= MO_32;
3275    case INDEX_op_sarv_vec:
3276        return have_avx2 && vece == MO_32;
3277    case INDEX_op_rotlv_vec:
3278    case INDEX_op_rotrv_vec:
3279        return have_avx2 && vece >= MO_32 ? -1 : 0;
3280
3281    case INDEX_op_mul_vec:
3282        if (vece == MO_8) {
3283            /* We can expand the operation for MO_8.  */
3284            return -1;
3285        }
3286        if (vece == MO_64) {
3287            return 0;
3288        }
3289        return 1;
3290
3291    case INDEX_op_ssadd_vec:
3292    case INDEX_op_usadd_vec:
3293    case INDEX_op_sssub_vec:
3294    case INDEX_op_ussub_vec:
3295        return vece <= MO_16;
3296    case INDEX_op_smin_vec:
3297    case INDEX_op_smax_vec:
3298    case INDEX_op_umin_vec:
3299    case INDEX_op_umax_vec:
3300    case INDEX_op_abs_vec:
3301        return vece <= MO_32;
3302
3303    default:
3304        return 0;
3305    }
3306}
3307
3308static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3309                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3310{
3311    TCGv_vec t1, t2;
3312
3313    tcg_debug_assert(vece == MO_8);
3314
3315    t1 = tcg_temp_new_vec(type);
3316    t2 = tcg_temp_new_vec(type);
3317
3318    /*
3319     * Unpack to W, shift, and repack.  Tricky bits:
3320     * (1) Use punpck*bw x,x to produce DDCCBBAA,
3321     *     i.e. duplicate into the other half of the 16-bit lane.
3322     * (2) For right-shift, add 8 so that the high half of the lane
3323     *     becomes zero.  For left-shift and left-rotate, we must
3324     *     shift up and down again.
3325     * (3) Step 2 leaves the high half zero, so that PACKUSWB
3326     *     (pack with unsigned saturation) does not modify
3327     *     the quantity.
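     *     E.g. a logical right shift by 3 of byte 0xa5: each lane holds
     *     0xa5a5, and 0xa5a5 >> (3 + 8) = 0x0014 = 0xa5 >> 3.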
3328     */
3329    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3330              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3331    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3332              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3333
3334    if (opc != INDEX_op_rotli_vec) {
3335        imm += 8;
3336    }
3337    if (opc == INDEX_op_shri_vec) {
3338        tcg_gen_shri_vec(MO_16, t1, t1, imm);
3339        tcg_gen_shri_vec(MO_16, t2, t2, imm);
3340    } else {
3341        tcg_gen_shli_vec(MO_16, t1, t1, imm);
3342        tcg_gen_shli_vec(MO_16, t2, t2, imm);
3343        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3344        tcg_gen_shri_vec(MO_16, t2, t2, 8);
3345    }
3346
3347    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3348              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3349    tcg_temp_free_vec(t1);
3350    tcg_temp_free_vec(t2);
3351}
3352
3353static void expand_vec_sari(TCGType type, unsigned vece,
3354                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3355{
3356    TCGv_vec t1, t2;
3357
3358    switch (vece) {
3359    case MO_8:
3360        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3361        t1 = tcg_temp_new_vec(type);
3362        t2 = tcg_temp_new_vec(type);
3363        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3364                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3365        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3366                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3367        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3368        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3369        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3370                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3371        tcg_temp_free_vec(t1);
3372        tcg_temp_free_vec(t2);
3373        break;
3374
3375    case MO_64:
3376        if (imm <= 32) {
3377            /*
3378             * We can emulate a small sign extend by performing an arithmetic
3379             * 32-bit shift and overwriting the high half of a 64-bit logical
3380             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3381             * does not, so we have to bound the smaller shift -- we get the
3382             * same result in the high half either way.
3383             */
3384            t1 = tcg_temp_new_vec(type);
3385            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3386            tcg_gen_shri_vec(MO_64, v0, v1, imm);
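            /* Blend immediate 0xaa selects the odd 32-bit elements, i.e.
               the arithmetically shifted high halves, from t1.  */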
3387            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3388                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3389                      tcgv_vec_arg(t1), 0xaa);
3390            tcg_temp_free_vec(t1);
3391        } else {
3392            /* Otherwise we will need to use a compare vs 0 to produce
3393             * the sign-extend, shift and merge.
3394             */
3395            t1 = tcg_const_zeros_vec(type);
3396            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3397            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3398            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3399            tcg_gen_or_vec(MO_64, v0, v0, t1);
3400            tcg_temp_free_vec(t1);
3401        }
3402        break;
3403
3404    default:
3405        g_assert_not_reached();
3406    }
3407}
3408
3409static void expand_vec_rotli(TCGType type, unsigned vece,
3410                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3411{
3412    TCGv_vec t;
3413
3414    if (vece == MO_8) {
3415        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3416        return;
3417    }
3418
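    /* v0 = (v1 << imm) | (v1 >> (width - imm)), with width = 8 << vece. */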
3419    t = tcg_temp_new_vec(type);
3420    tcg_gen_shli_vec(vece, t, v1, imm);
3421    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3422    tcg_gen_or_vec(vece, v0, v0, t);
3423    tcg_temp_free_vec(t);
3424}
3425
3426static void expand_vec_rotls(TCGType type, unsigned vece,
3427                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3428{
3429    TCGv_i32 rsh;
3430    TCGv_vec t;
3431
3432    tcg_debug_assert(vece != MO_8);
3433
3434    t = tcg_temp_new_vec(type);
3435    rsh = tcg_temp_new_i32();
3436
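    /* The right-shift count is (width - lsh) mod width,
       computed below as -lsh & (width - 1).  */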
3437    tcg_gen_neg_i32(rsh, lsh);
3438    tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3439    tcg_gen_shls_vec(vece, t, v1, lsh);
3440    tcg_gen_shrs_vec(vece, v0, v1, rsh);
3441    tcg_gen_or_vec(vece, v0, v0, t);
3442    tcg_temp_free_vec(t);
3443    tcg_temp_free_i32(rsh);
3444}
3445
3446static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3447                            TCGv_vec v1, TCGv_vec sh, bool right)
3448{
3449    TCGv_vec t = tcg_temp_new_vec(type);
3450
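    /* t = width - sh: the complementary per-element shift count. */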
3451    tcg_gen_dupi_vec(vece, t, 8 << vece);
3452    tcg_gen_sub_vec(vece, t, t, sh);
3453    if (right) {
3454        tcg_gen_shlv_vec(vece, t, v1, t);
3455        tcg_gen_shrv_vec(vece, v0, v1, sh);
3456    } else {
3457        tcg_gen_shrv_vec(vece, t, v1, t);
3458        tcg_gen_shlv_vec(vece, v0, v1, sh);
3459    }
3460    tcg_gen_or_vec(vece, v0, v0, t);
3461    tcg_temp_free_vec(t);
3462}
3463
3464static void expand_vec_mul(TCGType type, unsigned vece,
3465                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3466{
3467    TCGv_vec t1, t2, t3, t4;
3468
3469    tcg_debug_assert(vece == MO_8);
3470
3471    /*
3472     * Unpack v1 bytes to words, 0 | x.
3473     * Unpack v2 bytes to words, y | 0.
3474     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3475     * Shift logical right by 8 bits to clear the high 8 bits before
3476     * using an unsigned saturated pack.
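     * For example, with x = 3 and y = 100: (0x0003 * 0x6400) & 0xffff
     * = 0x2c00, and 0x2c00 >> 8 = 0x2c = 44 = (3 * 100) & 0xff.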
3477     *
3478     * The difference between the V64, V128 and V256 cases is merely how
3479     * we distribute the expansion between temporaries.
3480     */
3481    switch (type) {
3482    case TCG_TYPE_V64:
3483        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3484        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3485        tcg_gen_dup16i_vec(t2, 0);
3486        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3487                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2));
3488        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3489                  tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2));
3490        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3491        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3492        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3493                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3494        tcg_temp_free_vec(t1);
3495        tcg_temp_free_vec(t2);
3496        break;
3497
3498    case TCG_TYPE_V128:
3499    case TCG_TYPE_V256:
3500        t1 = tcg_temp_new_vec(type);
3501        t2 = tcg_temp_new_vec(type);
3502        t3 = tcg_temp_new_vec(type);
3503        t4 = tcg_temp_new_vec(type);
3504        tcg_gen_dup16i_vec(t4, 0);
3505        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3506                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
3507        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3508                  tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
3509        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3510                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
3511        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3512                  tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
3513        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3514        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3515        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3516        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3517        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3518                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3519        tcg_temp_free_vec(t1);
3520        tcg_temp_free_vec(t2);
3521        tcg_temp_free_vec(t3);
3522        tcg_temp_free_vec(t4);
3523        break;
3524
3525    default:
3526        g_assert_not_reached();
3527    }
3528}
3529
3530static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3531                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3532{
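    /*
     * SSE/AVX2 provide only EQ and signed GT comparisons.  Every other
     * condition is synthesized from those by some combination of inverting
     * the result, swapping the operands, biasing both operands by the sign
     * bit (turning an unsigned compare into a signed one), or replacing
     * one operand with an unsigned min/max so that EQ gives the answer.
     */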
3533    enum {
3534        NEED_INV  = 1,
3535        NEED_SWAP = 2,
3536        NEED_BIAS = 4,
3537        NEED_UMIN = 8,
3538        NEED_UMAX = 16,
3539    };
3540    TCGv_vec t1, t2;
3541    uint8_t fixup;
3542
3543    switch (cond) {
3544    case TCG_COND_EQ:
3545    case TCG_COND_GT:
3546        fixup = 0;
3547        break;
3548    case TCG_COND_NE:
3549    case TCG_COND_LE:
3550        fixup = NEED_INV;
3551        break;
3552    case TCG_COND_LT:
3553        fixup = NEED_SWAP;
3554        break;
3555    case TCG_COND_GE:
3556        fixup = NEED_SWAP | NEED_INV;
3557        break;
3558    case TCG_COND_LEU:
3559        if (vece <= MO_32) {
3560            fixup = NEED_UMIN;
3561        } else {
3562            fixup = NEED_BIAS | NEED_INV;
3563        }
3564        break;
3565    case TCG_COND_GTU:
3566        if (vece <= MO_32) {
3567            fixup = NEED_UMIN | NEED_INV;
3568        } else {
3569            fixup = NEED_BIAS;
3570        }
3571        break;
3572    case TCG_COND_GEU:
3573        if (vece <= MO_32) {
3574            fixup = NEED_UMAX;
3575        } else {
3576            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3577        }
3578        break;
3579    case TCG_COND_LTU:
3580        if (vece <= MO_32) {
3581            fixup = NEED_UMAX | NEED_INV;
3582        } else {
3583            fixup = NEED_BIAS | NEED_SWAP;
3584        }
3585        break;
3586    default:
3587        g_assert_not_reached();
3588    }
3589
3590    if (fixup & NEED_INV) {
3591        cond = tcg_invert_cond(cond);
3592    }
3593    if (fixup & NEED_SWAP) {
3594        t1 = v1, v1 = v2, v2 = t1;
3595        cond = tcg_swap_cond(cond);
3596    }
3597
3598    t1 = t2 = NULL;
3599    if (fixup & (NEED_UMIN | NEED_UMAX)) {
3600        t1 = tcg_temp_new_vec(type);
3601        if (fixup & NEED_UMIN) {
3602            tcg_gen_umin_vec(vece, t1, v1, v2);
3603        } else {
3604            tcg_gen_umax_vec(vece, t1, v1, v2);
3605        }
3606        v2 = t1;
3607        cond = TCG_COND_EQ;
3608    } else if (fixup & NEED_BIAS) {
3609        t1 = tcg_temp_new_vec(type);
3610        t2 = tcg_temp_new_vec(type);
3611        tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
3612        tcg_gen_sub_vec(vece, t1, v1, t2);
3613        tcg_gen_sub_vec(vece, t2, v2, t2);
3614        v1 = t1;
3615        v2 = t2;
3616        cond = tcg_signed_cond(cond);
3617    }
3618
3619    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3620    /* Expand directly; do not recurse.  */
3621    vec_gen_4(INDEX_op_cmp_vec, type, vece,
3622              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3623
3624    if (t1) {
3625        tcg_temp_free_vec(t1);
3626        if (t2) {
3627            tcg_temp_free_vec(t2);
3628        }
3629    }
3630    return fixup & NEED_INV;
3631}
3632
3633static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3634                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3635{
3636    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3637        tcg_gen_not_vec(vece, v0, v0);
3638    }
3639}
3640
3641static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3642                              TCGv_vec c1, TCGv_vec c2,
3643                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3644{
3645    TCGv_vec t = tcg_temp_new_vec(type);
3646
3647    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3648        /* The compare result is inverted; compensate by swapping v3/v4.  */
3649        TCGv_vec x;
3650        x = v3, v3 = v4, v4 = x;
3651    }
3652    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3653              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3654              tcgv_vec_arg(v3), tcgv_vec_arg(t));
3655    tcg_temp_free_vec(t);
3656}
3657
3658void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3659                       TCGArg a0, ...)
3660{
3661    va_list va;
3662    TCGArg a2;
3663    TCGv_vec v0, v1, v2, v3, v4;
3664
3665    va_start(va, a0);
3666    v0 = temp_tcgv_vec(arg_temp(a0));
3667    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3668    a2 = va_arg(va, TCGArg);
3669
3670    switch (opc) {
3671    case INDEX_op_shli_vec:
3672    case INDEX_op_shri_vec:
3673        expand_vec_shi(type, vece, opc, v0, v1, a2);
3674        break;
3675
3676    case INDEX_op_sari_vec:
3677        expand_vec_sari(type, vece, v0, v1, a2);
3678        break;
3679
3680    case INDEX_op_rotli_vec:
3681        expand_vec_rotli(type, vece, v0, v1, a2);
3682        break;
3683
3684    case INDEX_op_rotls_vec:
3685        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3686        break;
3687
3688    case INDEX_op_rotlv_vec:
3689        v2 = temp_tcgv_vec(arg_temp(a2));
3690        expand_vec_rotv(type, vece, v0, v1, v2, false);
3691        break;
3692    case INDEX_op_rotrv_vec:
3693        v2 = temp_tcgv_vec(arg_temp(a2));
3694        expand_vec_rotv(type, vece, v0, v1, v2, true);
3695        break;
3696
3697    case INDEX_op_mul_vec:
3698        v2 = temp_tcgv_vec(arg_temp(a2));
3699        expand_vec_mul(type, vece, v0, v1, v2);
3700        break;
3701
3702    case INDEX_op_cmp_vec:
3703        v2 = temp_tcgv_vec(arg_temp(a2));
3704        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3705        break;
3706
3707    case INDEX_op_cmpsel_vec:
3708        v2 = temp_tcgv_vec(arg_temp(a2));
3709        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3710        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3711        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3712        break;
3713
3714    default:
3715        break;
3716    }
3717
3718    va_end(va);
3719}
3720
3721static const int tcg_target_callee_save_regs[] = {
3722#if TCG_TARGET_REG_BITS == 64
3723    TCG_REG_RBP,
3724    TCG_REG_RBX,
3725#if defined(_WIN64)
3726    TCG_REG_RDI,
3727    TCG_REG_RSI,
3728#endif
3729    TCG_REG_R12,
3730    TCG_REG_R13,
3731    TCG_REG_R14, /* Currently used for the global env. */
3732    TCG_REG_R15,
3733#else
3734    TCG_REG_EBP, /* Currently used for the global env. */
3735    TCG_REG_EBX,
3736    TCG_REG_ESI,
3737    TCG_REG_EDI,
3738#endif
3739};
3740
3741/* Compute frame size via macros, to share between tcg_target_qemu_prologue
3742   and tcg_register_jit.  */
3743
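/* PUSH_SIZE counts the return address pushed by our caller plus all of
   the callee-saved registers saved in the prologue.  */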
3744#define PUSH_SIZE \
3745    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3746     * (TCG_TARGET_REG_BITS / 8))
3747
3748#define FRAME_SIZE \
3749    ((PUSH_SIZE \
3750      + TCG_STATIC_CALL_ARGS_SIZE \
3751      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3752      + TCG_TARGET_STACK_ALIGN - 1) \
3753     & ~(TCG_TARGET_STACK_ALIGN - 1))
3754
3755/* Generate global QEMU prologue and epilogue code */
3756static void tcg_target_qemu_prologue(TCGContext *s)
3757{
3758    int i, stack_addend;
3759
3760    /* TB prologue */
3761
3762    /* Reserve some stack space, also for TCG temps.  */
3763    stack_addend = FRAME_SIZE - PUSH_SIZE;
3764    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3765                  CPU_TEMP_BUF_NLONGS * sizeof(long));
3766
3767    /* Save all callee saved registers.  */
3768    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3769        tcg_out_push(s, tcg_target_callee_save_regs[i]);
3770    }
3771
3772#if TCG_TARGET_REG_BITS == 32
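    /* The two arguments (env, tb) were passed on the stack; after saving
       the callee-saved registers they are at (nregs + 1) * 4 and
       (nregs + 2) * 4 from %esp.  The tb access below also adds
       stack_addend, since %esp has been lowered again by then.  */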
3773    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3774               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3775    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3776    /* jmp *tb.  */
3777    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3778                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3779                         + stack_addend);
3780#else
3781# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
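    /*
     * For user-only emulation, fold guest_base into the generated
     * addressing: prefer a segment base if the host lets us install one,
     * else a 32-bit displacement, else dedicate a register to the base.
     */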
3782    if (guest_base) {
3783        int seg = setup_guest_base_seg();
3784        if (seg != 0) {
3785            x86_guest_base_seg = seg;
3786        } else if (guest_base == (int32_t)guest_base) {
3787            x86_guest_base_offset = guest_base;
3788        } else {
3789            /* Choose R12 because, as a base, it requires a SIB byte. */
3790            x86_guest_base_index = TCG_REG_R12;
3791            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
3792            tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
3793        }
3794    }
3795# endif
3796    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3797    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3798    /* jmp *tb.  */
3799    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3800#endif
3801
3802    /*
3803     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3804     * and fall through to the rest of the epilogue.
3805     */
3806    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
3807    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3808
3809    /* TB epilogue */
3810    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
3811
3812    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3813
3814    if (have_avx2) {
3815        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3816    }
3817    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3818        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3819    }
3820    tcg_out_opc(s, OPC_RET, 0, 0, 0);
3821}
3822
3823static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3824{
3825    memset(p, 0x90, count);
3826}
3827
3828static void tcg_target_init(TCGContext *s)
3829{
3830#ifdef CONFIG_CPUID_H
3831    unsigned a, b, c, d, b7 = 0;
3832    int max = __get_cpuid_max(0, 0);
3833
3834    if (max >= 7) {
3835        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
3836        __cpuid_count(7, 0, a, b7, c, d);
3837        have_bmi1 = (b7 & bit_BMI) != 0;
3838        have_bmi2 = (b7 & bit_BMI2) != 0;
3839    }
3840
3841    if (max >= 1) {
3842        __cpuid(1, a, b, c, d);
3843#ifndef have_cmov
3844        /* For 32-bit, it is a near certainty that we're running on
3845           hardware that supports cmov, but we still need to check.  If
3846           cmov is not available, we'll use a small forward branch.  */
3847        have_cmov = (d & bit_CMOV) != 0;
3848#endif
3849
3850        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3851           need to probe for it.  */
3852        have_movbe = (c & bit_MOVBE) != 0;
3853        have_popcnt = (c & bit_POPCNT) != 0;
3854
3855        /* There are a number of things we must check before we can be
3856           sure of not raising an invalid-opcode exception.  */
3857        if (c & bit_OSXSAVE) {
3858            unsigned xcrl, xcrh;
3859            /* The xgetbv instruction is not recognized by older versions
3860             * of the assembler, so we encode the instruction manually.
3861             */
3862            asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
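            /* XCR0 bits 1 and 2 set means the OS preserves the XMM and
               YMM register state across context switches, which is a
               prerequisite for using AVX.  */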
3863            if ((xcrl & 6) == 6) {
3864                have_avx1 = (c & bit_AVX) != 0;
3865                have_avx2 = (b7 & bit_AVX2) != 0;
3866            }
3867        }
3868    }
3869
3870    max = __get_cpuid_max(0x80000000, 0);
3871    if (max >= 1) {
3872        __cpuid(0x80000001, a, b, c, d);
3873        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
3874        have_lzcnt = (c & bit_LZCNT) != 0;
3875    }
3876#endif /* CONFIG_CPUID_H */
3877
3878    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3879    if (TCG_TARGET_REG_BITS == 64) {
3880        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3881    }
3882    if (have_avx1) {
3883        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3884        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3885    }
3886    if (have_avx2) {
3887        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3888    }
3889
3890    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3891    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3892    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3893    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3894    if (TCG_TARGET_REG_BITS == 64) {
3895#if !defined(_WIN64)
3896        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3897        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3898#endif
3899        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3900        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3901        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3902        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3903    }
3904
3905    s->reserved_regs = 0;
3906    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3907}
3908
3909typedef struct {
3910    DebugFrameHeader h;
3911    uint8_t fde_def_cfa[4];
3912    uint8_t fde_reg_ofs[14];
3913} DebugFrame;
3914
3915/* We're expecting a 2 byte uleb128 encoded value.  */
3916QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3917
3918#if !defined(__ELF__)
3919    /* Host machine without ELF. */
3920#elif TCG_TARGET_REG_BITS == 64
3921#define ELF_HOST_MACHINE EM_X86_64
3922static const DebugFrame debug_frame = {
3923    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3924    .h.cie.id = -1,
3925    .h.cie.version = 1,
3926    .h.cie.code_align = 1,
3927    .h.cie.data_align = 0x78,             /* sleb128 -8 */
3928    .h.cie.return_column = 16,
3929
3930    /* Total FDE size does not include the "len" member.  */
3931    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3932
3933    .fde_def_cfa = {
3934        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
3935        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3936        (FRAME_SIZE >> 7)
3937    },
3938    .fde_reg_ofs = {
3939        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
3940        /* The following ordering must match tcg_target_callee_save_regs.  */
3941        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
3942        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
3943        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
3944        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
3945        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
3946        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
3947    }
3948};
3949#else
3950#define ELF_HOST_MACHINE EM_386
3951static const DebugFrame debug_frame = {
3952    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3953    .h.cie.id = -1,
3954    .h.cie.version = 1,
3955    .h.cie.code_align = 1,
3956    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
3957    .h.cie.return_column = 8,
3958
3959    /* Total FDE size does not include the "len" member.  */
3960    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3961
3962    .fde_def_cfa = {
3963        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
3964        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3965        (FRAME_SIZE >> 7)
3966    },
3967    .fde_reg_ofs = {
3968        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
3969        /* The following ordering must match tcg_target_callee_save_regs.  */
3970        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
3971        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
3972        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
3973        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
3974    }
3975};
3976#endif
3977
3978#if defined(ELF_HOST_MACHINE)
3979void tcg_register_jit(const void *buf, size_t buf_size)
3980{
3981    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
3982}
3983#endif
3984