1/*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#include "../tcg-pool.c.inc"
26
27#ifdef CONFIG_DEBUG_TCG
28static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
29#if TCG_TARGET_REG_BITS == 64
30    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
31#else
32    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
33#endif
34    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
35    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
36#if TCG_TARGET_REG_BITS == 64
37    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
38    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
39#endif
40};
41#endif
42
43static const int tcg_target_reg_alloc_order[] = {
44#if TCG_TARGET_REG_BITS == 64
45    TCG_REG_RBP,
46    TCG_REG_RBX,
47    TCG_REG_R12,
48    TCG_REG_R13,
49    TCG_REG_R14,
50    TCG_REG_R15,
51    TCG_REG_R10,
52    TCG_REG_R11,
53    TCG_REG_R9,
54    TCG_REG_R8,
55    TCG_REG_RCX,
56    TCG_REG_RDX,
57    TCG_REG_RSI,
58    TCG_REG_RDI,
59    TCG_REG_RAX,
60#else
61    TCG_REG_EBX,
62    TCG_REG_ESI,
63    TCG_REG_EDI,
64    TCG_REG_EBP,
65    TCG_REG_ECX,
66    TCG_REG_EDX,
67    TCG_REG_EAX,
68#endif
69    TCG_REG_XMM0,
70    TCG_REG_XMM1,
71    TCG_REG_XMM2,
72    TCG_REG_XMM3,
73    TCG_REG_XMM4,
74    TCG_REG_XMM5,
75#ifndef _WIN64
76    /* The Win64 ABI has xmm6-xmm15 as callee-saved, and we do not save
77       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
78    TCG_REG_XMM6,
79    TCG_REG_XMM7,
80#if TCG_TARGET_REG_BITS == 64
81    TCG_REG_XMM8,
82    TCG_REG_XMM9,
83    TCG_REG_XMM10,
84    TCG_REG_XMM11,
85    TCG_REG_XMM12,
86    TCG_REG_XMM13,
87    TCG_REG_XMM14,
88    TCG_REG_XMM15,
89#endif
90#endif
91};
92
93static const int tcg_target_call_iarg_regs[] = {
94#if TCG_TARGET_REG_BITS == 64
95#if defined(_WIN64)
96    TCG_REG_RCX,
97    TCG_REG_RDX,
98#else
99    TCG_REG_RDI,
100    TCG_REG_RSI,
101    TCG_REG_RDX,
102    TCG_REG_RCX,
103#endif
104    TCG_REG_R8,
105    TCG_REG_R9,
106#else
107    /* 32-bit mode uses a stack-based calling convention (GCC default). */
108#endif
109};
110
111static const int tcg_target_call_oarg_regs[] = {
112    TCG_REG_EAX,
113#if TCG_TARGET_REG_BITS == 32
114    TCG_REG_EDX
115#endif
116};
117
118/* Constants we accept.  */
119#define TCG_CT_CONST_S32 0x100
120#define TCG_CT_CONST_U32 0x200
121#define TCG_CT_CONST_I32 0x400
122#define TCG_CT_CONST_WSZ 0x800
123
124/* Registers used with the L constraint, which are the first two argument
125   registers on x86_64, and two arbitrary call-clobbered registers on
126   i386.  */
127#if TCG_TARGET_REG_BITS == 64
128# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
129# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
130#else
131# define TCG_REG_L0 TCG_REG_EAX
132# define TCG_REG_L1 TCG_REG_EDX
133#endif
134
135/* The host compiler should supply <cpuid.h> to enable runtime feature
136   detection, as we're not going to go so far as our own inline assembly.
137   If not available, default values will be assumed.  */
138#if defined(CONFIG_CPUID_H)
139#include "qemu/cpuid.h"
140#endif
141
142/* For 64-bit, we always know that CMOV is available.  */
143#if TCG_TARGET_REG_BITS == 64
144# define have_cmov 1
145#elif defined(CONFIG_CPUID_H)
146static bool have_cmov;
147#else
148# define have_cmov 0
149#endif
150
151/* We need these symbols in tcg-target.h, and we can't properly conditionalize
152   them there.  Therefore we always define the variables.  */
153bool have_bmi1;
154bool have_popcnt;
155bool have_avx1;
156bool have_avx2;
157bool have_movbe;
158
159#ifdef CONFIG_CPUID_H
160static bool have_bmi2;
161static bool have_lzcnt;
162#else
163# define have_bmi2 0
164# define have_lzcnt 0
165#endif
166
167static const tcg_insn_unit *tb_ret_addr;
168
169static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
170                        intptr_t value, intptr_t addend)
171{
172    value += addend;
173    switch(type) {
174    case R_386_PC32:
175        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
176        if (value != (int32_t)value) {
177            return false;
178        }
179        /* FALLTHRU */
180    case R_386_32:
181        tcg_patch32(code_ptr, value);
182        break;
183    case R_386_PC8:
184        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
185        if (value != (int8_t)value) {
186            return false;
187        }
188        tcg_patch8(code_ptr, value);
189        break;
190    default:
191        tcg_abort();
192    }
193    return true;
194}
195
196#if TCG_TARGET_REG_BITS == 64
197#define ALL_GENERAL_REGS   0x0000ffffu
198#define ALL_VECTOR_REGS    0xffff0000u
199#else
200#define ALL_GENERAL_REGS   0x000000ffu
201#define ALL_VECTOR_REGS    0x00ff0000u
202#endif
203
204/* parse target specific constraints */
205static const char *target_parse_constraint(TCGArgConstraint *ct,
206                                           const char *ct_str, TCGType type)
207{
208    switch(*ct_str++) {
209    case 'a':
210        tcg_regset_set_reg(ct->regs, TCG_REG_EAX);
211        break;
212    case 'b':
213        tcg_regset_set_reg(ct->regs, TCG_REG_EBX);
214        break;
215    case 'c':
216        tcg_regset_set_reg(ct->regs, TCG_REG_ECX);
217        break;
218    case 'd':
219        tcg_regset_set_reg(ct->regs, TCG_REG_EDX);
220        break;
221    case 'S':
222        tcg_regset_set_reg(ct->regs, TCG_REG_ESI);
223        break;
224    case 'D':
225        tcg_regset_set_reg(ct->regs, TCG_REG_EDI);
226        break;
227    case 'q':
228        /* A register that can be used as a byte operand.  */
229        ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
230        break;
231    case 'Q':
232        /* A register with an addressable second byte (e.g. %ah).  */
233        ct->regs = 0xf;
234        break;
235    case 'r':
236        /* A general register.  */
237        ct->regs |= ALL_GENERAL_REGS;
238        break;
239    case 'W':
240        /* With TZCNT/LZCNT, we can have operand-size as an input.  */
241        ct->ct |= TCG_CT_CONST_WSZ;
242        break;
243    case 'x':
244        /* A vector register.  */
245        ct->regs |= ALL_VECTOR_REGS;
246        break;
247
248    case 'L':
249        /* qemu_ld/st data+address constraint */
250        ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
251#ifdef CONFIG_SOFTMMU
252        tcg_regset_reset_reg(ct->regs, TCG_REG_L0);
253        tcg_regset_reset_reg(ct->regs, TCG_REG_L1);
254#endif
255        break;
256    case 's':
257        /* qemu_st8_i32 data constraint */
258        ct->regs = 0xf;
259#ifdef CONFIG_SOFTMMU
260        tcg_regset_reset_reg(ct->regs, TCG_REG_L0);
261        tcg_regset_reset_reg(ct->regs, TCG_REG_L1);
262#endif
263        break;
264
265    case 'e':
266        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
267        break;
268    case 'Z':
269        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
270        break;
271    case 'I':
272        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
273        break;
274
275    default:
276        return NULL;
277    }
278    return ct_str;
279}
280
281/* test if a constant matches the constraint */
282static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
283                                         const TCGArgConstraint *arg_ct)
284{
285    int ct = arg_ct->ct;
286    if (ct & TCG_CT_CONST) {
287        return 1;
288    }
289    if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
290        return 1;
291    }
292    if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
293        return 1;
294    }
295    if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
296        return 1;
297    }
298    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
299        return 1;
300    }
301    return 0;
302}
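
/*
 * Illustrative examples of the tests above, assuming a 64-bit host:
 * 0xffffffff80000000 sign-extends from 32 bits, so it satisfies
 * TCG_CT_CONST_S32 ('e') but not TCG_CT_CONST_U32 ('Z'), while
 * 0x0000000080000000 zero-extends and so satisfies only 'Z'.
 * TCG_CT_CONST_WSZ ('W') accepts nothing but the operation width
 * itself, i.e. exactly 32 or 64.
 */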
303
304# define LOWREGMASK(x)	((x) & 7)
305
306#define P_EXT		0x100		/* 0x0f opcode prefix */
307#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
308#define P_DATA16        0x400           /* 0x66 opcode prefix */
309#if TCG_TARGET_REG_BITS == 64
310# define P_REXW         0x1000          /* Set REX.W = 1 */
311# define P_REXB_R       0x2000          /* REG field as byte register */
312# define P_REXB_RM      0x4000          /* R/M field as byte register */
313# define P_GS           0x8000          /* gs segment override */
314#else
315# define P_REXW		0
316# define P_REXB_R	0
317# define P_REXB_RM	0
318# define P_GS           0
319#endif
320#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
321#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
322#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
323#define P_VEXL          0x80000         /* Set VEX.L = 1 */
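
/*
 * Illustrative example of how these flags compose with the opcode
 * values below: OPC_MOVZWL is (0xb7 | P_EXT), so tcg_out_opc emits the
 * 0x0f escape byte followed by 0xb7; or-ing in P_REXW on x86_64 adds a
 * REX prefix with REX.W set, turning "movzwl" into "movzwq".
 */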
324
325#define OPC_ARITH_EvIz	(0x81)
326#define OPC_ARITH_EvIb	(0x83)
327#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
328#define OPC_ANDN        (0xf2 | P_EXT38)
329#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
330#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
331#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
332#define OPC_BSF         (0xbc | P_EXT)
333#define OPC_BSR         (0xbd | P_EXT)
334#define OPC_BSWAP	(0xc8 | P_EXT)
335#define OPC_CALL_Jz	(0xe8)
336#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
337#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
338#define OPC_DEC_r32	(0x48)
339#define OPC_IMUL_GvEv	(0xaf | P_EXT)
340#define OPC_IMUL_GvEvIb	(0x6b)
341#define OPC_IMUL_GvEvIz	(0x69)
342#define OPC_INC_r32	(0x40)
343#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
344#define OPC_JCC_short	(0x70)		/* ... plus condition code */
345#define OPC_JMP_long	(0xe9)
346#define OPC_JMP_short	(0xeb)
347#define OPC_LEA         (0x8d)
348#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
349#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
350#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
351#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
352#define OPC_MOVB_EvIz   (0xc6)
353#define OPC_MOVL_EvIz	(0xc7)
354#define OPC_MOVL_Iv     (0xb8)
355#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
356#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
357#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
358#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
359#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
360#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
361#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
362#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
363#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
364#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
365#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
366#define OPC_MOVSBL	(0xbe | P_EXT)
367#define OPC_MOVSWL	(0xbf | P_EXT)
368#define OPC_MOVSLQ	(0x63 | P_REXW)
369#define OPC_MOVZBL	(0xb6 | P_EXT)
370#define OPC_MOVZWL	(0xb7 | P_EXT)
371#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
372#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
373#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
374#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
375#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
376#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
377#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
378#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
379#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
380#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
381#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
382#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
383#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
384#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
385#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
386#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
387#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
388#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
389#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
390#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
391#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
392#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
393#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
394#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
395#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
396#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
397#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
398#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
399#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
400#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
401#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
402#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
403#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
404#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
405#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
406#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
407#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
408#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
409#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
410#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
411#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
412#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
413#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
414#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
415#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
416#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
417#define OPC_POR         (0xeb | P_EXT | P_DATA16)
418#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
419#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
420#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
421#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
422#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
423#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
424#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
425#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
426#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
427#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
428#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
429#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
430#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
431#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
432#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
433#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
434#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
435#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
436#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
437#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
438#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
439#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
440#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
441#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
442#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
443#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
444#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
445#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
446#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
447#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
448#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
449#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
450#define OPC_POP_r32	(0x58)
451#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
452#define OPC_PUSH_r32	(0x50)
453#define OPC_PUSH_Iv	(0x68)
454#define OPC_PUSH_Ib	(0x6a)
455#define OPC_RET		(0xc3)
456#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
457#define OPC_SHIFT_1	(0xd1)
458#define OPC_SHIFT_Ib	(0xc1)
459#define OPC_SHIFT_cl	(0xd3)
460#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
461#define OPC_SHUFPS      (0xc6 | P_EXT)
462#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
463#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
464#define OPC_SHRD_Ib     (0xac | P_EXT)
465#define OPC_TESTL	(0x85)
466#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
467#define OPC_UD2         (0x0b | P_EXT)
468#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
469#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
470#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
471#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
472#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
473#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
474#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
475#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
476#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
477#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
478#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_REXW)
479#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
480#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
481#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_REXW)
482#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
483#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
484#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_REXW)
485#define OPC_VZEROUPPER  (0x77 | P_EXT)
486#define OPC_XCHG_ax_r32	(0x90)
487
488#define OPC_GRP3_Ev	(0xf7)
489#define OPC_GRP5	(0xff)
490#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
491
492/* Group 1 opcode extensions for 0x80-0x83.
493   These are also used as modifiers for OPC_ARITH.  */
494#define ARITH_ADD 0
495#define ARITH_OR  1
496#define ARITH_ADC 2
497#define ARITH_SBB 3
498#define ARITH_AND 4
499#define ARITH_SUB 5
500#define ARITH_XOR 6
501#define ARITH_CMP 7
502
503/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
504#define SHIFT_ROL 0
505#define SHIFT_ROR 1
506#define SHIFT_SHL 4
507#define SHIFT_SHR 5
508#define SHIFT_SAR 7
509
510/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
511#define EXT3_NOT   2
512#define EXT3_NEG   3
513#define EXT3_MUL   4
514#define EXT3_IMUL  5
515#define EXT3_DIV   6
516#define EXT3_IDIV  7
517
518/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
519#define EXT5_INC_Ev	0
520#define EXT5_DEC_Ev	1
521#define EXT5_CALLN_Ev	2
522#define EXT5_JMPN_Ev	4
523
524/* Condition codes to be added to OPC_JCC_{long,short}.  */
525#define JCC_JMP (-1)
526#define JCC_JO  0x0
527#define JCC_JNO 0x1
528#define JCC_JB  0x2
529#define JCC_JAE 0x3
530#define JCC_JE  0x4
531#define JCC_JNE 0x5
532#define JCC_JBE 0x6
533#define JCC_JA  0x7
534#define JCC_JS  0x8
535#define JCC_JNS 0x9
536#define JCC_JP  0xa
537#define JCC_JNP 0xb
538#define JCC_JL  0xc
539#define JCC_JGE 0xd
540#define JCC_JLE 0xe
541#define JCC_JG  0xf
542
543static const uint8_t tcg_cond_to_jcc[] = {
544    [TCG_COND_EQ] = JCC_JE,
545    [TCG_COND_NE] = JCC_JNE,
546    [TCG_COND_LT] = JCC_JL,
547    [TCG_COND_GE] = JCC_JGE,
548    [TCG_COND_LE] = JCC_JLE,
549    [TCG_COND_GT] = JCC_JG,
550    [TCG_COND_LTU] = JCC_JB,
551    [TCG_COND_GEU] = JCC_JAE,
552    [TCG_COND_LEU] = JCC_JBE,
553    [TCG_COND_GTU] = JCC_JA,
554};
555
556#if TCG_TARGET_REG_BITS == 64
557static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
558{
559    int rex;
560
561    if (opc & P_GS) {
562        tcg_out8(s, 0x65);
563    }
564    if (opc & P_DATA16) {
565        /* We should never be asking for both 16 and 64-bit operation.  */
566        tcg_debug_assert((opc & P_REXW) == 0);
567        tcg_out8(s, 0x66);
568    }
569    if (opc & P_SIMDF3) {
570        tcg_out8(s, 0xf3);
571    } else if (opc & P_SIMDF2) {
572        tcg_out8(s, 0xf2);
573    }
574
575    rex = 0;
576    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
577    rex |= (r & 8) >> 1;                /* REX.R */
578    rex |= (x & 8) >> 2;                /* REX.X */
579    rex |= (rm & 8) >> 3;               /* REX.B */
580
581    /* P_REXB_{R,RM} indicates that the given register is the low byte.
582       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
583       as otherwise the encoding indicates %[abcd]h.  Note that the values
584       that are ORed in merely indicate that the REX byte must be present;
585       those bits get discarded in output.  */
586    rex |= opc & (r >= 4 ? P_REXB_R : 0);
587    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
588
589    if (rex) {
590        tcg_out8(s, (uint8_t)(rex | 0x40));
591    }
592
593    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
594        tcg_out8(s, 0x0f);
595        if (opc & P_EXT38) {
596            tcg_out8(s, 0x38);
597        } else if (opc & P_EXT3A) {
598            tcg_out8(s, 0x3a);
599        }
600    }
601
602    tcg_out8(s, opc);
603}
604#else
605static void tcg_out_opc(TCGContext *s, int opc)
606{
607    if (opc & P_DATA16) {
608        tcg_out8(s, 0x66);
609    }
610    if (opc & P_SIMDF3) {
611        tcg_out8(s, 0xf3);
612    } else if (opc & P_SIMDF2) {
613        tcg_out8(s, 0xf2);
614    }
615    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
616        tcg_out8(s, 0x0f);
617        if (opc & P_EXT38) {
618            tcg_out8(s, 0x38);
619        } else if (opc & P_EXT3A) {
620            tcg_out8(s, 0x3a);
621        }
622    }
623    tcg_out8(s, opc);
624}
625/* Discard the register arguments to tcg_out_opc early, so as not to penalize
626   the 32-bit compilation paths.  This method works with all versions of gcc,
627   whereas relying on optimization to eliminate them might not.  */
628#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
629#endif
630
631static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
632{
633    tcg_out_opc(s, opc, r, rm, 0);
634    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
635}
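
/*
 * Illustrative example (a hypothetical call, not taken from this file):
 * tcg_out_modrm(s, OPC_ADD_GvEv + P_REXW, TCG_REG_RAX, TCG_REG_R12)
 * emits 0x49 (REX.W plus REX.B for %r12), the opcode byte 0x03 and the
 * ModRM byte 0xc4, i.e. "addq %r12, %rax".
 */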
636
637static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
638                            int rm, int index)
639{
640    int tmp;
641
642    /* Use the two byte form if possible, which cannot encode
643       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
644    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
645        && ((rm | index) & 8) == 0) {
646        /* Two byte VEX prefix.  */
647        tcg_out8(s, 0xc5);
648
649        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
650    } else {
651        /* Three byte VEX prefix.  */
652        tcg_out8(s, 0xc4);
653
654        /* VEX.m-mmmm */
655        if (opc & P_EXT3A) {
656            tmp = 3;
657        } else if (opc & P_EXT38) {
658            tmp = 2;
659        } else if (opc & P_EXT) {
660            tmp = 1;
661        } else {
662            g_assert_not_reached();
663        }
664        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
665        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
666        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
667        tcg_out8(s, tmp);
668
669        tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
670    }
671
672    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
673    /* VEX.pp */
674    if (opc & P_DATA16) {
675        tmp |= 1;                          /* 0x66 */
676    } else if (opc & P_SIMDF3) {
677        tmp |= 2;                          /* 0xf3 */
678    } else if (opc & P_SIMDF2) {
679        tmp |= 3;                          /* 0xf2 */
680    }
681    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
682    tcg_out8(s, tmp);
683    tcg_out8(s, opc);
684}
685
686static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
687{
688    tcg_out_vex_opc(s, opc, r, v, rm, 0);
689    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
690}
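
/*
 * Illustrative example: tcg_out_vex_modrm(s, OPC_PXOR, TCG_REG_XMM0,
 * TCG_REG_XMM0, TCG_REG_XMM0), as used by tcg_out_dupi_vec to zero a
 * vector register, qualifies for the two byte VEX form and emits
 * 0xc5 0xf9 0xef 0xc0, i.e. "vpxor %xmm0, %xmm0, %xmm0".
 */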
691
692/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
693   RM and/or INDEX may be omitted by passing a negative value.  In 64-bit
694   mode for absolute addresses, ~RM is the size of the immediate operand
695   that will follow the instruction.  */
696
697static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
698                               int shift, intptr_t offset)
699{
700    int mod, len;
701
702    if (index < 0 && rm < 0) {
703        if (TCG_TARGET_REG_BITS == 64) {
704            /* Try for a rip-relative addressing mode.  This has replaced
705               the 32-bit-mode absolute addressing encoding.  */
706            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
707            intptr_t disp = offset - pc;
708            if (disp == (int32_t)disp) {
709                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
710                tcg_out32(s, disp);
711                return;
712            }
713
714            /* Try for an absolute address encoding.  This requires the
715               use of the MODRM+SIB encoding and is therefore larger than
716               rip-relative addressing.  */
717            if (offset == (int32_t)offset) {
718                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
719                tcg_out8(s, (4 << 3) | 5);
720                tcg_out32(s, offset);
721                return;
722            }
723
724            /* ??? The memory isn't directly addressable.  */
725            g_assert_not_reached();
726        } else {
727            /* Absolute address.  */
728            tcg_out8(s, (r << 3) | 5);
729            tcg_out32(s, offset);
730            return;
731        }
732    }
733
734    /* Find the length of the immediate addend.  Note that the encoding
735       that would be used for (%ebp) indicates absolute addressing.  */
736    if (rm < 0) {
737        mod = 0, len = 4, rm = 5;
738    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
739        mod = 0, len = 0;
740    } else if (offset == (int8_t)offset) {
741        mod = 0x40, len = 1;
742    } else {
743        mod = 0x80, len = 4;
744    }
745
746    /* Use a single byte MODRM format if possible.  Note that the encoding
747       that would be used for %esp is the escape to the two byte form.  */
748    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
749        /* Single byte MODRM format.  */
750        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
751    } else {
752        /* Two byte MODRM+SIB format.  */
753
754        /* Note that the encoding that would place %esp into the index
755           field indicates no index register.  In 64-bit mode, the REX.X
756           bit counts, so %r12 can be used as the index.  */
757        if (index < 0) {
758            index = 4;
759        } else {
760            tcg_debug_assert(index != TCG_REG_ESP);
761        }
762
763        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
764        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
765    }
766
767    if (len == 1) {
768        tcg_out8(s, offset);
769    } else if (len == 4) {
770        tcg_out32(s, offset);
771    }
772}
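
/*
 * Illustrative example: for r = %eax, rm = %ebp, no index and offset 8,
 * the code above must use mod = 0x40 with a one byte displacement (the
 * mod = 0 encoding of %ebp would mean absolute addressing), so it emits
 * the ModRM byte 0x45 followed by 0x08 -- the "8(%ebp)" operand of,
 * say, OPC_MOVL_GvEv.
 */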
773
774static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
775                                     int index, int shift, intptr_t offset)
776{
777    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
778    tcg_out_sib_offset(s, r, rm, index, shift, offset);
779}
780
781static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
782                                         int rm, int index, int shift,
783                                         intptr_t offset)
784{
785    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
786    tcg_out_sib_offset(s, r, rm, index, shift, offset);
787}
788
789/* A simplification of the above with no index or shift.  */
790static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
791                                        int rm, intptr_t offset)
792{
793    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
794}
795
796static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
797                                            int v, int rm, intptr_t offset)
798{
799    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
800}
801
802/* Output an opcode with an expected reference to the constant pool.  */
803static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
804{
805    tcg_out_opc(s, opc, r, 0, 0);
806    /* Absolute for 32-bit, pc-relative for 64-bit.  */
807    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
808    tcg_out32(s, 0);
809}
810
811/* Output an opcode with an expected reference to the constant pool.  */
812static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
813{
814    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
815    /* Absolute for 32-bit, pc-relative for 64-bit.  */
816    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
817    tcg_out32(s, 0);
818}
819
820/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
821static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
822{
823    /* Propagate an opcode prefix, such as P_REXW.  */
824    int ext = subop & ~0x7;
825    subop &= 0x7;
826
827    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
828}
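
/*
 * Illustrative example: tgen_arithr(s, ARITH_SUB + P_REXW, TCG_REG_RAX,
 * TCG_REG_RBX) forms opcode 0x2b (OPC_ARITH_GvEv | ARITH_SUB << 3) and
 * emits 0x48 0x2b 0xc3, i.e. "subq %rbx, %rax".
 */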
829
830static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
831{
832    int rexw = 0;
833
834    if (arg == ret) {
835        return true;
836    }
837    switch (type) {
838    case TCG_TYPE_I64:
839        rexw = P_REXW;
840        /* fallthru */
841    case TCG_TYPE_I32:
842        if (ret < 16) {
843            if (arg < 16) {
844                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
845            } else {
846                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
847            }
848        } else {
849            if (arg < 16) {
850                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
851            } else {
852                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
853            }
854        }
855        break;
856
857    case TCG_TYPE_V64:
858        tcg_debug_assert(ret >= 16 && arg >= 16);
859        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
860        break;
861    case TCG_TYPE_V128:
862        tcg_debug_assert(ret >= 16 && arg >= 16);
863        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
864        break;
865    case TCG_TYPE_V256:
866        tcg_debug_assert(ret >= 16 && arg >= 16);
867        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
868        break;
869
870    default:
871        g_assert_not_reached();
872    }
873    return true;
874}
875
876static const int avx2_dup_insn[4] = {
877    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
878    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
879};
880
881static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
882                            TCGReg r, TCGReg a)
883{
884    if (have_avx2) {
885        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
886        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
887    } else {
888        switch (vece) {
889        case MO_8:
890            /* ??? With zero in a register, use PSHUFB.  */
891            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
892            a = r;
893            /* FALLTHRU */
894        case MO_16:
895            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
896            a = r;
897            /* FALLTHRU */
898        case MO_32:
899            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
900            /* imm8 operand: all output lanes selected from input lane 0.  */
901            tcg_out8(s, 0);
902            break;
903        case MO_64:
904            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
905            break;
906        default:
907            g_assert_not_reached();
908        }
909    }
910    return true;
911}
912
913static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
914                             TCGReg r, TCGReg base, intptr_t offset)
915{
916    if (have_avx2) {
917        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
918        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
919                                 r, 0, base, offset);
920    } else {
921        switch (vece) {
922        case MO_64:
923            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
924            break;
925        case MO_32:
926            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
927            break;
928        case MO_16:
929            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
930            tcg_out8(s, 0); /* imm8 */
931            tcg_out_dup_vec(s, type, vece, r, r);
932            break;
933        case MO_8:
934            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
935            tcg_out8(s, 0); /* imm8 */
936            tcg_out_dup_vec(s, type, vece, r, r);
937            break;
938        default:
939            g_assert_not_reached();
940        }
941    }
942    return true;
943}
944
945static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
946                             TCGReg ret, tcg_target_long arg)
947{
948    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
949
950    if (arg == 0) {
951        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
952        return;
953    }
954    if (arg == -1) {
955        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
956        return;
957    }
958
959    if (TCG_TARGET_REG_BITS == 64) {
960        if (type == TCG_TYPE_V64) {
961            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
962        } else if (have_avx2) {
963            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
964        } else {
965            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
966        }
967        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
968    } else {
969        if (have_avx2) {
970            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
971        } else {
972            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
973        }
974        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
975    }
976}
977
978static void tcg_out_movi(TCGContext *s, TCGType type,
979                         TCGReg ret, tcg_target_long arg)
980{
981    tcg_target_long diff;
982
983    switch (type) {
984    case TCG_TYPE_I32:
985#if TCG_TARGET_REG_BITS == 64
986    case TCG_TYPE_I64:
987#endif
988        if (ret < 16) {
989            break;
990        }
991        /* fallthru */
992    case TCG_TYPE_V64:
993    case TCG_TYPE_V128:
994    case TCG_TYPE_V256:
995        tcg_debug_assert(ret >= 16);
996        tcg_out_dupi_vec(s, type, ret, arg);
997        return;
998    default:
999        g_assert_not_reached();
1000    }
1001
1002    if (arg == 0) {
1003        tgen_arithr(s, ARITH_XOR, ret, ret);
1004        return;
1005    }
1006    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1007        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1008        tcg_out32(s, arg);
1009        return;
1010    }
1011    if (arg == (int32_t)arg) {
1012        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1013        tcg_out32(s, arg);
1014        return;
1015    }
1016
1017    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1018    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1019    if (diff == (int32_t)diff) {
1020        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1021        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1022        tcg_out32(s, diff);
1023        return;
1024    }
1025
1026    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1027    tcg_out64(s, arg);
1028}
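
/*
 * Illustrative examples of the choices above for an integer destination
 * on a 64-bit host: 0 becomes "xorl %reg, %reg"; 0x12345678 uses the
 * 5 byte "movl $imm32, %reg"; -1 uses the sign-extended 7 byte
 * "movq $imm32, %reg"; an address near the generated code may use the
 * 7 byte pc-relative "leaq"; anything else falls back to the 10 byte
 * "movabsq".
 */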
1029
1030static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1031{
1032    if (val == (int8_t)val) {
1033        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1034        tcg_out8(s, val);
1035    } else if (val == (int32_t)val) {
1036        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1037        tcg_out32(s, val);
1038    } else {
1039        tcg_abort();
1040    }
1041}
1042
1043static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1044{
1045    /* Given the strength of x86 memory ordering, we only need to care
1046       about store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1047       faster than "mfence", so don't bother with the SSE insn.  */
1048    if (a0 & TCG_MO_ST_LD) {
1049        tcg_out8(s, 0xf0);
1050        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1051        tcg_out8(s, 0);
1052    }
1053}
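
/*
 * Illustrative example: for TCG_MO_ST_LD the three calls above produce
 * 0xf0 0x83 0x0c 0x24 0x00, i.e. "lock orl $0, (%esp)" -- the lock
 * prefix, OPC_ARITH_EvIb with /1 (ARITH_OR) and a SIB byte for %esp,
 * then the zero immediate.
 */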
1054
1055static inline void tcg_out_push(TCGContext *s, int reg)
1056{
1057    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1058}
1059
1060static inline void tcg_out_pop(TCGContext *s, int reg)
1061{
1062    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1063}
1064
1065static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1066                       TCGReg arg1, intptr_t arg2)
1067{
1068    switch (type) {
1069    case TCG_TYPE_I32:
1070        if (ret < 16) {
1071            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1072        } else {
1073            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1074        }
1075        break;
1076    case TCG_TYPE_I64:
1077        if (ret < 16) {
1078            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1079            break;
1080        }
1081        /* FALLTHRU */
1082    case TCG_TYPE_V64:
1083        /* There is no instruction that can validate 8-byte alignment.  */
1084        tcg_debug_assert(ret >= 16);
1085        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1086        break;
1087    case TCG_TYPE_V128:
1088        /*
1089         * The gvec infrastructure asserts that v128 vector loads
1090         * and stores use a 16-byte aligned offset.  Validate that the
1091         * final pointer is aligned by using an insn that will SIGSEGV.
1092         */
1093        tcg_debug_assert(ret >= 16);
1094        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1095        break;
1096    case TCG_TYPE_V256:
1097        /*
1098         * The gvec infrastructure only requires 16-byte alignment,
1099         * so here we must use an unaligned load.
1100         */
1101        tcg_debug_assert(ret >= 16);
1102        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1103                                 ret, 0, arg1, arg2);
1104        break;
1105    default:
1106        g_assert_not_reached();
1107    }
1108}
1109
1110static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1111                       TCGReg arg1, intptr_t arg2)
1112{
1113    switch (type) {
1114    case TCG_TYPE_I32:
1115        if (arg < 16) {
1116            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1117        } else {
1118            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1119        }
1120        break;
1121    case TCG_TYPE_I64:
1122        if (arg < 16) {
1123            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1124            break;
1125        }
1126        /* FALLTHRU */
1127    case TCG_TYPE_V64:
1128        /* There is no instruction that can validate 8-byte alignment.  */
1129        tcg_debug_assert(arg >= 16);
1130        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1131        break;
1132    case TCG_TYPE_V128:
1133        /*
1134         * The gvec infrastructure asserts that v128 vector loads
1135         * and stores use a 16-byte aligned offset.  Validate that the
1136         * final pointer is aligned by using an insn that will SIGSEGV.
1137         */
1138        tcg_debug_assert(arg >= 16);
1139        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1140        break;
1141    case TCG_TYPE_V256:
1142        /*
1143         * The gvec infrastructure only requires 16-byte alignment,
1144         * so here we must use an unaligned store.
1145         */
1146        tcg_debug_assert(arg >= 16);
1147        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1148                                 arg, 0, arg1, arg2);
1149        break;
1150    default:
1151        g_assert_not_reached();
1152    }
1153}
1154
1155static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1156                        TCGReg base, intptr_t ofs)
1157{
1158    int rexw = 0;
1159    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1160        if (val != (int32_t)val) {
1161            return false;
1162        }
1163        rexw = P_REXW;
1164    } else if (type != TCG_TYPE_I32) {
1165        return false;
1166    }
1167    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1168    tcg_out32(s, val);
1169    return true;
1170}
1171
1172static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1173{
1174    /* Propagate an opcode prefix, such as P_DATA16.  */
1175    int ext = subopc & ~0x7;
1176    subopc &= 0x7;
1177
1178    if (count == 1) {
1179        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1180    } else {
1181        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1182        tcg_out8(s, count);
1183    }
1184}
1185
1186static inline void tcg_out_bswap32(TCGContext *s, int reg)
1187{
1188    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1189}
1190
1191static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1192{
1193    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1194}
1195
1196static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1197{
1198    /* movzbl */
1199    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1200    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1201}
1202
1203static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1204{
1205    /* movsbl */
1206    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1207    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1208}
1209
1210static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1211{
1212    /* movzwl */
1213    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1214}
1215
1216static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1217{
1218    /* movsw[lq] */
1219    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1220}
1221
1222static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1223{
1224    /* 32-bit mov zero extends.  */
1225    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1226}
1227
1228static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1229{
1230    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1231}
1232
1233static inline void tcg_out_bswap64(TCGContext *s, int reg)
1234{
1235    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1236}
1237
1238static void tgen_arithi(TCGContext *s, int c, int r0,
1239                        tcg_target_long val, int cf)
1240{
1241    int rexw = 0;
1242
1243    if (TCG_TARGET_REG_BITS == 64) {
1244        rexw = c & -8;
1245        c &= 7;
1246    }
1247
1248    /* ??? While INC is 2 bytes shorter than ADDL $1, it also induces
1249       partial flags update stalls on Pentium 4 and is not recommended
1250       by current Intel optimization manuals.  */
1251    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1252        int is_inc = (c == ARITH_ADD) ^ (val < 0);
1253        if (TCG_TARGET_REG_BITS == 64) {
1254            /* The single-byte increment encodings are re-tasked as the
1255               REX prefixes.  Use the MODRM encoding.  */
1256            tcg_out_modrm(s, OPC_GRP5 + rexw,
1257                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1258        } else {
1259            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1260        }
1261        return;
1262    }
1263
1264    if (c == ARITH_AND) {
1265        if (TCG_TARGET_REG_BITS == 64) {
1266            if (val == 0xffffffffu) {
1267                tcg_out_ext32u(s, r0, r0);
1268                return;
1269            }
1270            if (val == (uint32_t)val) {
1271                /* AND with no high bits set can use a 32-bit operation.  */
1272                rexw = 0;
1273            }
1274        }
1275        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1276            tcg_out_ext8u(s, r0, r0);
1277            return;
1278        }
1279        if (val == 0xffffu) {
1280            tcg_out_ext16u(s, r0, r0);
1281            return;
1282        }
1283    }
1284
1285    if (val == (int8_t)val) {
1286        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1287        tcg_out8(s, val);
1288        return;
1289    }
1290    if (rexw == 0 || val == (int32_t)val) {
1291        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1292        tcg_out32(s, val);
1293        return;
1294    }
1295
1296    tcg_abort();
1297}
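
/*
 * Illustrative examples of the special cases above: "add $1" with no
 * flags consumer becomes "inc" (two bytes shorter on i386; on x86_64,
 * where 0x40-0x4f are reused as REX prefixes, it uses OPC_GRP5 /0
 * instead); "and $0xff" on a byte-addressable register becomes
 * "movzbl"; other immediates use opcode 0x83 with a one byte immediate
 * when the value fits in a signed byte, else 0x81 with a four byte
 * immediate.
 */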
1298
1299static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1300{
1301    if (val != 0) {
1302        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1303    }
1304}
1305
1306/* Use SMALL != 0 to force a short forward branch.  */
1307static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1308{
1309    int32_t val, val1;
1310
1311    if (l->has_value) {
1312        val = tcg_pcrel_diff(s, l->u.value_ptr);
1313        val1 = val - 2;
1314        if ((int8_t)val1 == val1) {
1315            if (opc == -1) {
1316                tcg_out8(s, OPC_JMP_short);
1317            } else {
1318                tcg_out8(s, OPC_JCC_short + opc);
1319            }
1320            tcg_out8(s, val1);
1321        } else {
1322            if (small) {
1323                tcg_abort();
1324            }
1325            if (opc == -1) {
1326                tcg_out8(s, OPC_JMP_long);
1327                tcg_out32(s, val - 5);
1328            } else {
1329                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1330                tcg_out32(s, val - 6);
1331            }
1332        }
1333    } else if (small) {
1334        if (opc == -1) {
1335            tcg_out8(s, OPC_JMP_short);
1336        } else {
1337            tcg_out8(s, OPC_JCC_short + opc);
1338        }
1339        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1340        s->code_ptr += 1;
1341    } else {
1342        if (opc == -1) {
1343            tcg_out8(s, OPC_JMP_long);
1344        } else {
1345            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1346        }
1347        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1348        s->code_ptr += 4;
1349    }
1350}
1351
1352static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1353                        int const_arg2, int rexw)
1354{
1355    if (const_arg2) {
1356        if (arg2 == 0) {
1357            /* test r, r */
1358            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1359        } else {
1360            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1361        }
1362    } else {
1363        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1364    }
1365}
1366
1367static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1368                             TCGArg arg1, TCGArg arg2, int const_arg2,
1369                             TCGLabel *label, int small)
1370{
1371    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1372    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1373}
1374
1375#if TCG_TARGET_REG_BITS == 64
1376static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1377                             TCGArg arg1, TCGArg arg2, int const_arg2,
1378                             TCGLabel *label, int small)
1379{
1380    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1381    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1382}
1383#else
1384/* XXX: we implement it at the target level to avoid having to
1385   handle temporaries that live across basic blocks */
1386static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1387                            const int *const_args, int small)
1388{
1389    TCGLabel *label_next = gen_new_label();
1390    TCGLabel *label_this = arg_label(args[5]);
1391
1392    switch(args[4]) {
1393    case TCG_COND_EQ:
1394        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1395                         label_next, 1);
1396        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1397                         label_this, small);
1398        break;
1399    case TCG_COND_NE:
1400        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1401                         label_this, small);
1402        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1403                         label_this, small);
1404        break;
1405    case TCG_COND_LT:
1406        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1407                         label_this, small);
1408        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1409        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1410                         label_this, small);
1411        break;
1412    case TCG_COND_LE:
1413        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1414                         label_this, small);
1415        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1416        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1417                         label_this, small);
1418        break;
1419    case TCG_COND_GT:
1420        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1421                         label_this, small);
1422        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1423        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1424                         label_this, small);
1425        break;
1426    case TCG_COND_GE:
1427        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1428                         label_this, small);
1429        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1430        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1431                         label_this, small);
1432        break;
1433    case TCG_COND_LTU:
1434        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1435                         label_this, small);
1436        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1437        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1438                         label_this, small);
1439        break;
1440    case TCG_COND_LEU:
1441        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1442                         label_this, small);
1443        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1444        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1445                         label_this, small);
1446        break;
1447    case TCG_COND_GTU:
1448        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1449                         label_this, small);
1450        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1451        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1452                         label_this, small);
1453        break;
1454    case TCG_COND_GEU:
1455        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1456                         label_this, small);
1457        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1458        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1459                         label_this, small);
1460        break;
1461    default:
1462        tcg_abort();
1463    }
1464    tcg_out_label(s, label_next);
1465}
1466#endif
1467
1468static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1469                              TCGArg arg1, TCGArg arg2, int const_arg2)
1470{
1471    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1472    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1473    tcg_out_ext8u(s, dest, dest);
1474}
1475
1476#if TCG_TARGET_REG_BITS == 64
1477static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1478                              TCGArg arg1, TCGArg arg2, int const_arg2)
1479{
1480    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1481    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1482    tcg_out_ext8u(s, dest, dest);
1483}
1484#else
1485static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1486                             const int *const_args)
1487{
1488    TCGArg new_args[6];
1489    TCGLabel *label_true, *label_over;
1490
1491    memcpy(new_args, args+1, 5*sizeof(TCGArg));
1492
1493    if (args[0] == args[1] || args[0] == args[2]
1494        || (!const_args[3] && args[0] == args[3])
1495        || (!const_args[4] && args[0] == args[4])) {
1496        /* When the destination overlaps with one of the argument
1497           registers, don't do anything tricky.  */
1498        label_true = gen_new_label();
1499        label_over = gen_new_label();
1500
1501        new_args[5] = label_arg(label_true);
1502        tcg_out_brcond2(s, new_args, const_args+1, 1);
1503
1504        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1505        tcg_out_jxx(s, JCC_JMP, label_over, 1);
1506        tcg_out_label(s, label_true);
1507
1508        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1509        tcg_out_label(s, label_over);
1510    } else {
1511        /* When the destination does not overlap one of the arguments,
1512           clear the destination first, jump if cond false, and emit an
1513           increment in the true case.  This results in smaller code.  */
1514
1515        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1516
1517        label_over = gen_new_label();
1518        new_args[4] = tcg_invert_cond(new_args[4]);
1519        new_args[5] = label_arg(label_over);
1520        tcg_out_brcond2(s, new_args, const_args+1, 1);
1521
1522        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1523        tcg_out_label(s, label_over);
1524    }
1525}
1526#endif
1527
1528static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1529                         TCGReg dest, TCGReg v1)
1530{
1531    if (have_cmov) {
1532        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1533    } else {
1534        TCGLabel *over = gen_new_label();
1535        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1536        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1537        tcg_out_label(s, over);
1538    }
1539}
1540
1541static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1542                              TCGReg c1, TCGArg c2, int const_c2,
1543                              TCGReg v1)
1544{
1545    tcg_out_cmp(s, c1, c2, const_c2, 0);
1546    tcg_out_cmov(s, cond, 0, dest, v1);
1547}
1548
1549#if TCG_TARGET_REG_BITS == 64
1550static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1551                              TCGReg c1, TCGArg c2, int const_c2,
1552                              TCGReg v1)
1553{
1554    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1555    tcg_out_cmov(s, cond, P_REXW, dest, v1);
1556}
1557#endif
1558
1559static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1560                        TCGArg arg2, bool const_a2)
1561{
1562    if (have_bmi1) {
1563        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1564        if (const_a2) {
1565            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1566        } else {
1567            tcg_debug_assert(dest != arg2);
1568            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1569        }
1570    } else {
1571        tcg_debug_assert(dest != arg2);
1572        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1573        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1574    }
1575}
1576
1577static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1578                        TCGArg arg2, bool const_a2)
1579{
1580    if (have_lzcnt) {
1581        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1582        if (const_a2) {
1583            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1584        } else {
1585            tcg_debug_assert(dest != arg2);
1586            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1587        }
1588    } else {
1589        tcg_debug_assert(!const_a2);
1590        tcg_debug_assert(dest != arg1);
1591        tcg_debug_assert(dest != arg2);
1592
1593        /* Recall that the output of BSR is the index not the count.  */
1594        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
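        /* For a nonzero input, clz = (width - 1) - bsr_index, and since
           0 <= bsr_index <= width - 1 this equals bsr_index ^ (width - 1);
           e.g. a 32-bit input of 0x00008000 has bsr_index 15, 15 ^ 31 = 16.  */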
1595        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1596
1597        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1598        tcg_out_cmp(s, arg1, 0, 1, rexw);
1599        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1600    }
1601}
1602
1603static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1604{
1605    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1606
1607    if (disp == (int32_t)disp) {
1608        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1609        tcg_out32(s, disp);
1610    } else {
1611        /* rip-relative addressing into the constant pool.
1612           This is 6 + 8 = 14 bytes, as compared to using an
1613           immediate load of 10 + 6 = 16 bytes, plus we may
1614           be able to re-use the pool constant for more calls.  */
1615        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
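        /* ModRM byte: mod=00, reg=/2 (CALLN) or /4 (JMPN), r/m=101, i.e. a
           disp32 memory operand that is RIP-relative on x86-64.  */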
1616        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1617        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1618        tcg_out32(s, 0);
1619    }
1620}
1621
1622static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
1623{
1624    tcg_out_branch(s, 1, dest);
1625}
1626
1627static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1628{
1629    tcg_out_branch(s, 0, dest);
1630}
1631
1632static void tcg_out_nopn(TCGContext *s, int n)
1633{
1634    int i;
1635    /* Emit n-1 operand size prefixes for the standard one byte nop,
1636     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1637     * duplicate prefixes, and all of the interesting recent cores can
1638     * decode and discard the duplicates in a single cycle.
1639     */
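    /* For example, tcg_out_nopn(s, 3) emits 66 66 90.  */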
1640    tcg_debug_assert(n >= 1);
1641    for (i = 1; i < n; ++i) {
1642        tcg_out8(s, 0x66);
1643    }
1644    tcg_out8(s, 0x90);
1645}
1646
1647#if defined(CONFIG_SOFTMMU)
1648#include "../tcg-ldst.c.inc"
1649
1650/* helper signature: helper_ret_ld_mmu(CPUArchState *env, target_ulong addr,
1651 *                                     TCGMemOpIdx oi, uintptr_t ra)
1652 */
1653static void * const qemu_ld_helpers[16] = {
1654    [MO_UB]   = helper_ret_ldub_mmu,
1655    [MO_LEUW] = helper_le_lduw_mmu,
1656    [MO_LEUL] = helper_le_ldul_mmu,
1657    [MO_LEQ]  = helper_le_ldq_mmu,
1658    [MO_BEUW] = helper_be_lduw_mmu,
1659    [MO_BEUL] = helper_be_ldul_mmu,
1660    [MO_BEQ]  = helper_be_ldq_mmu,
1661};
1662
1663/* helper signature: helper_ret_st_mmu(CPUArchState *env, target_ulong addr,
1664 *                                     uintxx_t val, TCGMemOpIdx oi, uintptr_t ra)
1665 */
1666static void * const qemu_st_helpers[16] = {
1667    [MO_UB]   = helper_ret_stb_mmu,
1668    [MO_LEUW] = helper_le_stw_mmu,
1669    [MO_LEUL] = helper_le_stl_mmu,
1670    [MO_LEQ]  = helper_le_stq_mmu,
1671    [MO_BEUW] = helper_be_stw_mmu,
1672    [MO_BEUL] = helper_be_stl_mmu,
1673    [MO_BEQ]  = helper_be_stq_mmu,
1674};
1675
1676/* Perform the TLB load and compare.
1677
1678   Inputs:
1679   ADDRLO and ADDRHI contain the low and high part of the address.
1680
1681   MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1682
1683   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1684   This should be offsetof addr_read or addr_write.
1685
1686   Outputs:
1687   LABEL_PTRS is filled with 1 or 2 (guest address split across two host
1688   registers) positions of the displacements of forward jumps to the TLB miss case.
1689
1690   Second argument register is loaded with the low part of the address.
1691   In the TLB hit case, it has been adjusted as indicated by the TLB
1692   and so is a host address.  In the TLB miss case, it continues to
1693   hold a guest address.
1694
1695   First argument register is clobbered.  */
1696
1697static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1698                                    int mem_index, MemOp opc,
1699                                    tcg_insn_unit **label_ptr, int which)
1700{
1701    const TCGReg r0 = TCG_REG_L0;
1702    const TCGReg r1 = TCG_REG_L1;
1703    TCGType ttype = TCG_TYPE_I32;
1704    TCGType tlbtype = TCG_TYPE_I32;
1705    int trexw = 0, hrexw = 0, tlbrexw = 0;
1706    unsigned a_bits = get_alignment_bits(opc);
1707    unsigned s_bits = opc & MO_SIZE;
1708    unsigned a_mask = (1 << a_bits) - 1;
1709    unsigned s_mask = (1 << s_bits) - 1;
1710    target_ulong tlb_mask;
1711
1712    if (TCG_TARGET_REG_BITS == 64) {
1713        if (TARGET_LONG_BITS == 64) {
1714            ttype = TCG_TYPE_I64;
1715            trexw = P_REXW;
1716        }
1717        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1718            hrexw = P_REXW;
1719            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
1720                tlbtype = TCG_TYPE_I64;
1721                tlbrexw = P_REXW;
1722            }
1723        }
1724    }
1725
1726    tcg_out_mov(s, tlbtype, r0, addrlo);
1727    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1728                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1729
1730    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
1731                         TLB_MASK_TABLE_OFS(mem_index) +
1732                         offsetof(CPUTLBDescFast, mask));
1733
1734    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
1735                         TLB_MASK_TABLE_OFS(mem_index) +
1736                         offsetof(CPUTLBDescFast, table));
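    /* r0 now points at the CPUTLBEntry to compare against: roughly
       table + ((addr >> (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS)) & mask),
       using the per-mmu-index mask and table loaded above.  */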
1737
1738    /* If the required alignment is at least as large as the access, simply
1739       copy the address and mask.  For lesser alignments, check that we don't
1740       cross pages for the complete access.  */
1741    if (a_bits >= s_bits) {
1742        tcg_out_mov(s, ttype, r1, addrlo);
1743    } else {
1744        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1745    }
1746    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1747    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
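    /* For example, with 4KiB pages and a naturally aligned 4-byte access,
       tlb_mask is TARGET_PAGE_MASK | 3: r1 retains the page number plus any
       misaligned low bits, so the compare below sends both TLB misses and
       unaligned accesses to the slow path.  */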
1748
1749    /* cmp 0(r0), r1 */
1750    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
1751
1752    /* Prepare for both the fast path add of the tlb addend, and the slow
1753       path function argument setup.  */
1754    tcg_out_mov(s, ttype, r1, addrlo);
1755
1756    /* jne slow_path */
1757    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1758    label_ptr[0] = s->code_ptr;
1759    s->code_ptr += 4;
1760
1761    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1762        /* cmp 4(r0), addrhi */
1763        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
1764
1765        /* jne slow_path */
1766        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1767        label_ptr[1] = s->code_ptr;
1768        s->code_ptr += 4;
1769    }
1770
1771    /* TLB Hit.  */
1772
1773    /* add addend(r0), r1 */
1774    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1775                         offsetof(CPUTLBEntry, addend));
1776}
1777
1778/*
1779 * Record the context of a call to the out-of-line helper code for the slow
1780 * path of a load or store, so that we can later generate the correct helper code.
1781 */
1782static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1783                                TCGMemOpIdx oi,
1784                                TCGReg datalo, TCGReg datahi,
1785                                TCGReg addrlo, TCGReg addrhi,
1786                                tcg_insn_unit *raddr,
1787                                tcg_insn_unit **label_ptr)
1788{
1789    TCGLabelQemuLdst *label = new_ldst_label(s);
1790
1791    label->is_ld = is_ld;
1792    label->oi = oi;
1793    label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1794    label->datalo_reg = datalo;
1795    label->datahi_reg = datahi;
1796    label->addrlo_reg = addrlo;
1797    label->addrhi_reg = addrhi;
1798    /* TODO: Cast goes away when all hosts converted */
1799    label->raddr = (void *)tcg_splitwx_to_rx(raddr);
1800    label->label_ptr[0] = label_ptr[0];
1801    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1802        label->label_ptr[1] = label_ptr[1];
1803    }
1804}
1805
1806/*
1807 * Generate code for the slow path for a load at the end of block
1808 */
1809static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1810{
1811    TCGMemOpIdx oi = l->oi;
1812    MemOp opc = get_memop(oi);
1813    TCGReg data_reg;
1814    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1815    int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1816
1817    /* resolve label address */
1818    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1819    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1820        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1821    }
1822
1823    if (TCG_TARGET_REG_BITS == 32) {
1824        int ofs = 0;
1825
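        /* Build the helper arguments on the stack.  Assuming a 64-bit guest
           address, the layout ends up as:
               0(%esp)  env
               4(%esp)  addrlo
               8(%esp)  addrhi
              12(%esp)  oi
              16(%esp)  return address (l->raddr)  */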
1826        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1827        ofs += 4;
1828
1829        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1830        ofs += 4;
1831
1832        if (TARGET_LONG_BITS == 64) {
1833            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1834            ofs += 4;
1835        }
1836
1837        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1838        ofs += 4;
1839
1840        tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1841    } else {
1842        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1843        /* The second argument is already loaded with addrlo.  */
1844        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1845        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1846                     (uintptr_t)l->raddr);
1847    }
1848
1849    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1850
1851    data_reg = l->datalo_reg;
1852    switch (opc & MO_SSIZE) {
1853    case MO_SB:
1854        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1855        break;
1856    case MO_SW:
1857        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1858        break;
1859#if TCG_TARGET_REG_BITS == 64
1860    case MO_SL:
1861        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1862        break;
1863#endif
1864    case MO_UB:
1865    case MO_UW:
1866        /* Note that the helpers have zero-extended to tcg_target_long.  */
1867    case MO_UL:
1868        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1869        break;
1870    case MO_Q:
1871        if (TCG_TARGET_REG_BITS == 64) {
1872            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1873        } else if (data_reg == TCG_REG_EDX) {
1874            /* xchg %edx, %eax */
1875            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1876            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1877        } else {
1878            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1879            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1880        }
1881        break;
1882    default:
1883        tcg_abort();
1884    }
1885
1886    /* Jump to the code corresponding to the next IR of the qemu_ld.  */
1887    tcg_out_jmp(s, l->raddr);
1888    return true;
1889}
1890
1891/*
1892 * Generate code for the slow path for a store at the end of block
1893 */
1894static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1895{
1896    TCGMemOpIdx oi = l->oi;
1897    MemOp opc = get_memop(oi);
1898    MemOp s_bits = opc & MO_SIZE;
1899    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1900    TCGReg retaddr;
1901
1902    /* resolve label address */
1903    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1904    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1905        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1906    }
1907
1908    if (TCG_TARGET_REG_BITS == 32) {
1909        int ofs = 0;
1910
1911        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1912        ofs += 4;
1913
1914        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1915        ofs += 4;
1916
1917        if (TARGET_LONG_BITS == 64) {
1918            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1919            ofs += 4;
1920        }
1921
1922        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1923        ofs += 4;
1924
1925        if (s_bits == MO_64) {
1926            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1927            ofs += 4;
1928        }
1929
1930        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1931        ofs += 4;
1932
1933        retaddr = TCG_REG_EAX;
1934        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1935        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1936    } else {
1937        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1938        /* The second argument is already loaded with addrlo.  */
1939        tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1940                    tcg_target_call_iarg_regs[2], l->datalo_reg);
1941        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1942
1943        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1944            retaddr = tcg_target_call_iarg_regs[4];
1945            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1946        } else {
1947            retaddr = TCG_REG_RAX;
1948            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1949            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1950                       TCG_TARGET_CALL_STACK_OFFSET);
1951        }
1952    }
1953
1954    /* "Tail call" to the helper, with the return address back inline.  */
1955    tcg_out_push(s, retaddr);
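    /* The pushed value becomes the helper's return address, so the helper
       returns straight to the code following the qemu_st fast path.  */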
1956    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1957    return true;
1958}
1959#elif TCG_TARGET_REG_BITS == 32
1960# define x86_guest_base_seg     0
1961# define x86_guest_base_index   -1
1962# define x86_guest_base_offset  guest_base
1963#else
1964static int x86_guest_base_seg;
1965static int x86_guest_base_index = -1;
1966static int32_t x86_guest_base_offset;
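/*
 * These three combine in tcg_out_qemu_ld/st_direct below: the guest address
 * supplies the base register while index, offset and segment come from the
 * values set up here, so a user-only access is roughly
 * "movl %seg:offset(%base,%index), %reg".
 */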
1967# if defined(__x86_64__) && defined(__linux__)
1968#  include <asm/prctl.h>
1969#  include <sys/prctl.h>
1970int arch_prctl(int code, unsigned long addr);
1971static inline int setup_guest_base_seg(void)
1972{
1973    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1974        return P_GS;
1975    }
1976    return 0;
1977}
1978# elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
1979#  include <machine/sysarch.h>
1980static inline int setup_guest_base_seg(void)
1981{
1982    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1983        return P_GS;
1984    }
1985    return 0;
1986}
1987# else
1988static inline int setup_guest_base_seg(void)
1989{
1990    return 0;
1991}
1992# endif
1993#endif /* SOFTMMU */
1994
1995static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1996                                   TCGReg base, int index, intptr_t ofs,
1997                                   int seg, bool is64, MemOp memop)
1998{
1999    bool use_movbe = false;
2000    int rexw = is64 * P_REXW;
2001    int movop = OPC_MOVL_GvEv;
2002
2003    /* Do big-endian loads with movbe.  */
2004    if (memop & MO_BSWAP) {
2005        tcg_debug_assert(have_movbe);
2006        use_movbe = true;
2007        movop = OPC_MOVBE_GyMy;
2008    }
2009
2010    switch (memop & MO_SSIZE) {
2011    case MO_UB:
2012        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
2013                                 base, index, 0, ofs);
2014        break;
2015    case MO_SB:
2016        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
2017                                 base, index, 0, ofs);
2018        break;
2019    case MO_UW:
2020        if (use_movbe) {
2021            /* There is no extending movbe; only the low 16 bits are modified.  */
2022            if (datalo != base && datalo != index) {
2023                /* XOR breaks dependency chains.  */
2024                tgen_arithr(s, ARITH_XOR, datalo, datalo);
2025                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2026                                         datalo, base, index, 0, ofs);
2027            } else {
2028                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2029                                         datalo, base, index, 0, ofs);
2030                tcg_out_ext16u(s, datalo, datalo);
2031            }
2032        } else {
2033            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
2034                                     base, index, 0, ofs);
2035        }
2036        break;
2037    case MO_SW:
2038        if (use_movbe) {
2039            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2040                                     datalo, base, index, 0, ofs);
2041            tcg_out_ext16s(s, datalo, datalo, rexw);
2042        } else {
2043            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
2044                                     datalo, base, index, 0, ofs);
2045        }
2046        break;
2047    case MO_UL:
2048        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2049        break;
2050#if TCG_TARGET_REG_BITS == 64
2051    case MO_SL:
2052        if (use_movbe) {
2053            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo,
2054                                     base, index, 0, ofs);
2055            tcg_out_ext32s(s, datalo, datalo);
2056        } else {
2057            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
2058                                     base, index, 0, ofs);
2059        }
2060        break;
2061#endif
2062    case MO_Q:
2063        if (TCG_TARGET_REG_BITS == 64) {
2064            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2065                                     base, index, 0, ofs);
2066        } else {
2067            if (use_movbe) {
2068                TCGReg t = datalo;
2069                datalo = datahi;
2070                datahi = t;
2071            }
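            /* Order the two 32-bit loads so that the base register is not
               overwritten before it has been used for the second load.  */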
2072            if (base != datalo) {
2073                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2074                                         base, index, 0, ofs);
2075                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2076                                         base, index, 0, ofs + 4);
2077            } else {
2078                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2079                                         base, index, 0, ofs + 4);
2080                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2081                                         base, index, 0, ofs);
2082            }
2083        }
2084        break;
2085    default:
2086        g_assert_not_reached();
2087    }
2088}
2089
2090/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2091   EAX. It will be useful once fixed-register globals are less
2092   common. */
2093static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2094{
2095    TCGReg datalo, datahi, addrlo;
2096    TCGReg addrhi __attribute__((unused));
2097    TCGMemOpIdx oi;
2098    MemOp opc;
2099#if defined(CONFIG_SOFTMMU)
2100    int mem_index;
2101    tcg_insn_unit *label_ptr[2];
2102#endif
2103
2104    datalo = *args++;
2105    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2106    addrlo = *args++;
2107    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2108    oi = *args++;
2109    opc = get_memop(oi);
2110
2111#if defined(CONFIG_SOFTMMU)
2112    mem_index = get_mmuidx(oi);
2113
2114    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2115                     label_ptr, offsetof(CPUTLBEntry, addr_read));
2116
2117    /* TLB Hit.  */
2118    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2119
2120    /* Record the current context of a load into ldst label */
2121    add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2122                        s->code_ptr, label_ptr);
2123#else
2124    tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2125                           x86_guest_base_offset, x86_guest_base_seg,
2126                           is64, opc);
2127#endif
2128}
2129
2130static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2131                                   TCGReg base, int index, intptr_t ofs,
2132                                   int seg, MemOp memop)
2133{
2134    bool use_movbe = false;
2135    int movop = OPC_MOVL_EvGv;
2136
2137    /*
2138     * Do big-endian stores with movbe or softmmu.
2139     * User-only without movbe will have its swapping done generically.
2140     */
2141    if (memop & MO_BSWAP) {
2142        tcg_debug_assert(have_movbe);
2143        use_movbe = true;
2144        movop = OPC_MOVBE_MyGy;
2145    }
2146
2147    switch (memop & MO_SIZE) {
2148    case MO_8:
2149        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2150        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2151        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2152                                 datalo, base, index, 0, ofs);
2153        break;
2154    case MO_16:
2155        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2156                                 base, index, 0, ofs);
2157        break;
2158    case MO_32:
2159        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2160        break;
2161    case MO_64:
2162        if (TCG_TARGET_REG_BITS == 64) {
2163            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2164                                     base, index, 0, ofs);
2165        } else {
2166            if (use_movbe) {
2167                TCGReg t = datalo;
2168                datalo = datahi;
2169                datahi = t;
2170            }
2171            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2172                                     base, index, 0, ofs);
2173            tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2174                                     base, index, 0, ofs + 4);
2175        }
2176        break;
2177    default:
2178        g_assert_not_reached();
2179    }
2180}
2181
2182static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2183{
2184    TCGReg datalo, datahi, addrlo;
2185    TCGReg addrhi __attribute__((unused));
2186    TCGMemOpIdx oi;
2187    MemOp opc;
2188#if defined(CONFIG_SOFTMMU)
2189    int mem_index;
2190    tcg_insn_unit *label_ptr[2];
2191#endif
2192
2193    datalo = *args++;
2194    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2195    addrlo = *args++;
2196    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2197    oi = *args++;
2198    opc = get_memop(oi);
2199
2200#if defined(CONFIG_SOFTMMU)
2201    mem_index = get_mmuidx(oi);
2202
2203    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2204                     label_ptr, offsetof(CPUTLBEntry, addr_write));
2205
2206    /* TLB Hit.  */
2207    tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2208
2209    /* Record the current context of a store into ldst label */
2210    add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2211                        s->code_ptr, label_ptr);
2212#else
2213    tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2214                           x86_guest_base_offset, x86_guest_base_seg, opc);
2215#endif
2216}
2217
2218static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2219                              const TCGArg *args, const int *const_args)
2220{
2221    TCGArg a0, a1, a2;
2222    int c, const_a2, vexop, rexw = 0;
2223
2224#if TCG_TARGET_REG_BITS == 64
2225# define OP_32_64(x) \
2226        case glue(glue(INDEX_op_, x), _i64): \
2227            rexw = P_REXW; /* FALLTHRU */    \
2228        case glue(glue(INDEX_op_, x), _i32)
2229#else
2230# define OP_32_64(x) \
2231        case glue(glue(INDEX_op_, x), _i32)
2232#endif
2233
2234    /* Hoist the loads of the most common arguments.  */
2235    a0 = args[0];
2236    a1 = args[1];
2237    a2 = args[2];
2238    const_a2 = const_args[2];
2239
2240    switch (opc) {
2241    case INDEX_op_exit_tb:
2242        /* Reuse the zeroing that exists for goto_ptr.  */
2243        if (a0 == 0) {
2244            tcg_out_jmp(s, tcg_code_gen_epilogue);
2245        } else {
2246            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2247            tcg_out_jmp(s, tb_ret_addr);
2248        }
2249        break;
2250    case INDEX_op_goto_tb:
2251        if (s->tb_jmp_insn_offset) {
2252            /* direct jump method */
2253            int gap;
2254            /* The jump displacement must be aligned for atomic patching;
2255             * see if we need to add extra nops before the jump.
2256             */
2257            gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2258            if (gap != 1) {
2259                tcg_out_nopn(s, gap - 1);
2260            }
2261            tcg_out8(s, OPC_JMP_long); /* jmp im */
2262            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2263            tcg_out32(s, 0);
2264        } else {
2265            /* indirect jump method */
2266            tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2267                                 (intptr_t)(s->tb_jmp_target_addr + a0));
2268        }
2269        set_jmp_reset_offset(s, a0);
2270        break;
2271    case INDEX_op_goto_ptr:
2272        /* jmp to the given host address (could be epilogue) */
2273        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2274        break;
2275    case INDEX_op_br:
2276        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2277        break;
2278    OP_32_64(ld8u):
2279        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2280        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2281        break;
2282    OP_32_64(ld8s):
2283        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2284        break;
2285    OP_32_64(ld16u):
2286        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2287        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2288        break;
2289    OP_32_64(ld16s):
2290        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2291        break;
2292#if TCG_TARGET_REG_BITS == 64
2293    case INDEX_op_ld32u_i64:
2294#endif
2295    case INDEX_op_ld_i32:
2296        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2297        break;
2298
2299    OP_32_64(st8):
2300        if (const_args[0]) {
2301            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2302            tcg_out8(s, a0);
2303        } else {
2304            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2305        }
2306        break;
2307    OP_32_64(st16):
2308        if (const_args[0]) {
2309            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2310            tcg_out16(s, a0);
2311        } else {
2312            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2313        }
2314        break;
2315#if TCG_TARGET_REG_BITS == 64
2316    case INDEX_op_st32_i64:
2317#endif
2318    case INDEX_op_st_i32:
2319        if (const_args[0]) {
2320            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2321            tcg_out32(s, a0);
2322        } else {
2323            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2324        }
2325        break;
2326
2327    OP_32_64(add):
2328        /* For 3-operand addition, use LEA.  */
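        /* E.g. "add d,s,$imm" becomes "lea imm(s),d" and "add d,s1,s2"
           becomes "lea (s1,s2),d", leaving both inputs untouched.  */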
2329        if (a0 != a1) {
2330            TCGArg c3 = 0;
2331            if (const_a2) {
2332                c3 = a2, a2 = -1;
2333            } else if (a0 == a2) {
2334                /* Watch out for dest = src + dest, since we've removed
2335                   the matching constraint on the add.  */
2336                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2337                break;
2338            }
2339
2340            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2341            break;
2342        }
2343        c = ARITH_ADD;
2344        goto gen_arith;
2345    OP_32_64(sub):
2346        c = ARITH_SUB;
2347        goto gen_arith;
2348    OP_32_64(and):
2349        c = ARITH_AND;
2350        goto gen_arith;
2351    OP_32_64(or):
2352        c = ARITH_OR;
2353        goto gen_arith;
2354    OP_32_64(xor):
2355        c = ARITH_XOR;
2356        goto gen_arith;
2357    gen_arith:
2358        if (const_a2) {
2359            tgen_arithi(s, c + rexw, a0, a2, 0);
2360        } else {
2361            tgen_arithr(s, c + rexw, a0, a2);
2362        }
2363        break;
2364
2365    OP_32_64(andc):
2366        if (const_a2) {
2367            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2368            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2369        } else {
2370            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2371        }
2372        break;
2373
2374    OP_32_64(mul):
2375        if (const_a2) {
2376            int32_t val;
2377            val = a2;
2378            if (val == (int8_t)val) {
2379                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2380                tcg_out8(s, val);
2381            } else {
2382                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2383                tcg_out32(s, val);
2384            }
2385        } else {
2386            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2387        }
2388        break;
2389
2390    OP_32_64(div2):
2391        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2392        break;
2393    OP_32_64(divu2):
2394        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2395        break;
2396
2397    OP_32_64(shl):
2398        /* For small constant 3-operand shift, use LEA.  */
2399        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2400            if (a2 - 1 == 0) {
2401                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2402                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2403            } else {
2404                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2405                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2406            }
2407            break;
2408        }
2409        c = SHIFT_SHL;
2410        vexop = OPC_SHLX;
2411        goto gen_shift_maybe_vex;
2412    OP_32_64(shr):
2413        c = SHIFT_SHR;
2414        vexop = OPC_SHRX;
2415        goto gen_shift_maybe_vex;
2416    OP_32_64(sar):
2417        c = SHIFT_SAR;
2418        vexop = OPC_SARX;
2419        goto gen_shift_maybe_vex;
2420    OP_32_64(rotl):
2421        c = SHIFT_ROL;
2422        goto gen_shift;
2423    OP_32_64(rotr):
2424        c = SHIFT_ROR;
2425        goto gen_shift;
2426    gen_shift_maybe_vex:
2427        if (have_bmi2) {
2428            if (!const_a2) {
2429                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2430                break;
2431            }
2432            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2433        }
2434        /* FALLTHRU */
2435    gen_shift:
2436        if (const_a2) {
2437            tcg_out_shifti(s, c + rexw, a0, a2);
2438        } else {
2439            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2440        }
2441        break;
2442
2443    OP_32_64(ctz):
2444        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2445        break;
2446    OP_32_64(clz):
2447        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2448        break;
2449    OP_32_64(ctpop):
2450        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2451        break;
2452
2453    case INDEX_op_brcond_i32:
2454        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2455        break;
2456    case INDEX_op_setcond_i32:
2457        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2458        break;
2459    case INDEX_op_movcond_i32:
2460        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2461        break;
2462
2463    OP_32_64(bswap16):
2464        tcg_out_rolw_8(s, a0);
2465        break;
2466    OP_32_64(bswap32):
2467        tcg_out_bswap32(s, a0);
2468        break;
2469
2470    OP_32_64(neg):
2471        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2472        break;
2473    OP_32_64(not):
2474        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2475        break;
2476
2477    OP_32_64(ext8s):
2478        tcg_out_ext8s(s, a0, a1, rexw);
2479        break;
2480    OP_32_64(ext16s):
2481        tcg_out_ext16s(s, a0, a1, rexw);
2482        break;
2483    OP_32_64(ext8u):
2484        tcg_out_ext8u(s, a0, a1);
2485        break;
2486    OP_32_64(ext16u):
2487        tcg_out_ext16u(s, a0, a1);
2488        break;
2489
2490    case INDEX_op_qemu_ld_i32:
2491        tcg_out_qemu_ld(s, args, 0);
2492        break;
2493    case INDEX_op_qemu_ld_i64:
2494        tcg_out_qemu_ld(s, args, 1);
2495        break;
2496    case INDEX_op_qemu_st_i32:
2497    case INDEX_op_qemu_st8_i32:
2498        tcg_out_qemu_st(s, args, 0);
2499        break;
2500    case INDEX_op_qemu_st_i64:
2501        tcg_out_qemu_st(s, args, 1);
2502        break;
2503
2504    OP_32_64(mulu2):
2505        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2506        break;
2507    OP_32_64(muls2):
2508        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2509        break;
2510    OP_32_64(add2):
2511        if (const_args[4]) {
2512            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2513        } else {
2514            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2515        }
2516        if (const_args[5]) {
2517            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2518        } else {
2519            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2520        }
2521        break;
2522    OP_32_64(sub2):
2523        if (const_args[4]) {
2524            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2525        } else {
2526            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2527        }
2528        if (const_args[5]) {
2529            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2530        } else {
2531            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2532        }
2533        break;
2534
2535#if TCG_TARGET_REG_BITS == 32
2536    case INDEX_op_brcond2_i32:
2537        tcg_out_brcond2(s, args, const_args, 0);
2538        break;
2539    case INDEX_op_setcond2_i32:
2540        tcg_out_setcond2(s, args, const_args);
2541        break;
2542#else /* TCG_TARGET_REG_BITS == 64 */
2543    case INDEX_op_ld32s_i64:
2544        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2545        break;
2546    case INDEX_op_ld_i64:
2547        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2548        break;
2549    case INDEX_op_st_i64:
2550        if (const_args[0]) {
2551            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2552            tcg_out32(s, a0);
2553        } else {
2554            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2555        }
2556        break;
2557
2558    case INDEX_op_brcond_i64:
2559        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2560        break;
2561    case INDEX_op_setcond_i64:
2562        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2563        break;
2564    case INDEX_op_movcond_i64:
2565        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2566        break;
2567
2568    case INDEX_op_bswap64_i64:
2569        tcg_out_bswap64(s, a0);
2570        break;
2571    case INDEX_op_extu_i32_i64:
2572    case INDEX_op_ext32u_i64:
2573    case INDEX_op_extrl_i64_i32:
2574        tcg_out_ext32u(s, a0, a1);
2575        break;
2576    case INDEX_op_ext_i32_i64:
2577    case INDEX_op_ext32s_i64:
2578        tcg_out_ext32s(s, a0, a1);
2579        break;
2580    case INDEX_op_extrh_i64_i32:
2581        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2582        break;
2583#endif
2584
2585    OP_32_64(deposit):
2586        if (args[3] == 0 && args[4] == 8) {
2587            /* load bits 0..7 */
2588            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2589        } else if (args[3] == 8 && args[4] == 8) {
2590            /* load bits 8..15 */
2591            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2592        } else if (args[3] == 0 && args[4] == 16) {
2593            /* load bits 0..15 */
2594            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2595        } else {
2596            tcg_abort();
2597        }
2598        break;
2599
2600    case INDEX_op_extract_i64:
2601        if (a2 + args[3] == 32) {
2602            /* This is a 32-bit zero-extending right shift.  */
2603            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2604            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2605            break;
2606        }
2607        /* FALLTHRU */
2608    case INDEX_op_extract_i32:
2609        /* On the off-chance that we can use the high-byte registers, do so.
2610           Otherwise we emit the same ext16 + shift pattern that we
2611           would have gotten from the normal tcg-op.c expansion.  */
2612        tcg_debug_assert(a2 == 8 && args[3] == 8);
2613        if (a1 < 4 && a0 < 8) {
2614            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2615        } else {
2616            tcg_out_ext16u(s, a0, a1);
2617            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2618        }
2619        break;
2620
2621    case INDEX_op_sextract_i32:
2622        /* We don't implement sextract_i64, as we cannot sign-extend to
2623           64 bits without using the REX prefix that explicitly excludes
2624           access to the high-byte registers.  */
2625        tcg_debug_assert(a2 == 8 && args[3] == 8);
2626        if (a1 < 4 && a0 < 8) {
2627            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2628        } else {
2629            tcg_out_ext16s(s, a0, a1, 0);
2630            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2631        }
2632        break;
2633
2634    OP_32_64(extract2):
2635        /* Note that SHRD outputs to the r/m operand.  */
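        /* I.e. dest = (a1 >> pos) | (a2 << (bits - pos)), with dest
           constrained to be the same register as a1.  */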
2636        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2637        tcg_out8(s, args[3]);
2638        break;
2639
2640    case INDEX_op_mb:
2641        tcg_out_mb(s, a0);
2642        break;
2643    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2644    case INDEX_op_mov_i64:
2645    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
2646    case INDEX_op_movi_i64:
2647    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2648    default:
2649        tcg_abort();
2650    }
2651
2652#undef OP_32_64
2653}
2654
2655static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2656                           unsigned vecl, unsigned vece,
2657                           const TCGArg *args, const int *const_args)
2658{
2659    static int const add_insn[4] = {
2660        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2661    };
2662    static int const ssadd_insn[4] = {
2663        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2664    };
2665    static int const usadd_insn[4] = {
2666        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2667    };
2668    static int const sub_insn[4] = {
2669        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2670    };
2671    static int const sssub_insn[4] = {
2672        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2673    };
2674    static int const ussub_insn[4] = {
2675        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2676    };
2677    static int const mul_insn[4] = {
2678        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2679    };
2680    static int const shift_imm_insn[4] = {
2681        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2682    };
2683    static int const cmpeq_insn[4] = {
2684        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2685    };
2686    static int const cmpgt_insn[4] = {
2687        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2688    };
2689    static int const punpckl_insn[4] = {
2690        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2691    };
2692    static int const punpckh_insn[4] = {
2693        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2694    };
2695    static int const packss_insn[4] = {
2696        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2697    };
2698    static int const packus_insn[4] = {
2699        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2700    };
2701    static int const smin_insn[4] = {
2702        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
2703    };
2704    static int const smax_insn[4] = {
2705        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
2706    };
2707    static int const umin_insn[4] = {
2708        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
2709    };
2710    static int const umax_insn[4] = {
2711        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
2712    };
2713    static int const shlv_insn[4] = {
2714        /* TODO: AVX512 adds support for MO_16.  */
2715        OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
2716    };
2717    static int const shrv_insn[4] = {
2718        /* TODO: AVX512 adds support for MO_16.  */
2719        OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
2720    };
2721    static int const sarv_insn[4] = {
2722        /* TODO: AVX512 adds support for MO_16, MO_64.  */
2723        OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
2724    };
2725    static int const shls_insn[4] = {
2726        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2727    };
2728    static int const shrs_insn[4] = {
2729        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2730    };
2731    static int const sars_insn[4] = {
2732        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
2733    };
2734    static int const abs_insn[4] = {
2735        /* TODO: AVX512 adds support for MO_64.  */
2736        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
2737    };
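    /* In all of the tables above, OPC_UD2 marks element sizes with no
       direct encoding; such combinations are filtered out by
       tcg_can_emit_vec_op() or expanded beforehand, and are asserted
       against before emission below.  */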
2738
2739    TCGType type = vecl + TCG_TYPE_V64;
2740    int insn, sub;
2741    TCGArg a0, a1, a2;
2742
2743    a0 = args[0];
2744    a1 = args[1];
2745    a2 = args[2];
2746
2747    switch (opc) {
2748    case INDEX_op_add_vec:
2749        insn = add_insn[vece];
2750        goto gen_simd;
2751    case INDEX_op_ssadd_vec:
2752        insn = ssadd_insn[vece];
2753        goto gen_simd;
2754    case INDEX_op_usadd_vec:
2755        insn = usadd_insn[vece];
2756        goto gen_simd;
2757    case INDEX_op_sub_vec:
2758        insn = sub_insn[vece];
2759        goto gen_simd;
2760    case INDEX_op_sssub_vec:
2761        insn = sssub_insn[vece];
2762        goto gen_simd;
2763    case INDEX_op_ussub_vec:
2764        insn = ussub_insn[vece];
2765        goto gen_simd;
2766    case INDEX_op_mul_vec:
2767        insn = mul_insn[vece];
2768        goto gen_simd;
2769    case INDEX_op_and_vec:
2770        insn = OPC_PAND;
2771        goto gen_simd;
2772    case INDEX_op_or_vec:
2773        insn = OPC_POR;
2774        goto gen_simd;
2775    case INDEX_op_xor_vec:
2776        insn = OPC_PXOR;
2777        goto gen_simd;
2778    case INDEX_op_smin_vec:
2779        insn = smin_insn[vece];
2780        goto gen_simd;
2781    case INDEX_op_umin_vec:
2782        insn = umin_insn[vece];
2783        goto gen_simd;
2784    case INDEX_op_smax_vec:
2785        insn = smax_insn[vece];
2786        goto gen_simd;
2787    case INDEX_op_umax_vec:
2788        insn = umax_insn[vece];
2789        goto gen_simd;
2790    case INDEX_op_shlv_vec:
2791        insn = shlv_insn[vece];
2792        goto gen_simd;
2793    case INDEX_op_shrv_vec:
2794        insn = shrv_insn[vece];
2795        goto gen_simd;
2796    case INDEX_op_sarv_vec:
2797        insn = sarv_insn[vece];
2798        goto gen_simd;
2799    case INDEX_op_shls_vec:
2800        insn = shls_insn[vece];
2801        goto gen_simd;
2802    case INDEX_op_shrs_vec:
2803        insn = shrs_insn[vece];
2804        goto gen_simd;
2805    case INDEX_op_sars_vec:
2806        insn = sars_insn[vece];
2807        goto gen_simd;
2808    case INDEX_op_x86_punpckl_vec:
2809        insn = punpckl_insn[vece];
2810        goto gen_simd;
2811    case INDEX_op_x86_punpckh_vec:
2812        insn = punpckh_insn[vece];
2813        goto gen_simd;
2814    case INDEX_op_x86_packss_vec:
2815        insn = packss_insn[vece];
2816        goto gen_simd;
2817    case INDEX_op_x86_packus_vec:
2818        insn = packus_insn[vece];
2819        goto gen_simd;
2820#if TCG_TARGET_REG_BITS == 32
2821    case INDEX_op_dup2_vec:
2822        /* First merge the two 32-bit inputs to a single 64-bit element. */
2823        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
2824        /* Then replicate the 64-bit elements across the rest of the vector. */
2825        if (type != TCG_TYPE_V64) {
2826            tcg_out_dup_vec(s, type, MO_64, a0, a0);
2827        }
2828        break;
2829#endif
2830    case INDEX_op_abs_vec:
2831        insn = abs_insn[vece];
2832        a2 = a1;
2833        a1 = 0;
2834        goto gen_simd;
2835    gen_simd:
2836        tcg_debug_assert(insn != OPC_UD2);
2837        if (type == TCG_TYPE_V256) {
2838            insn |= P_VEXL;
2839        }
2840        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2841        break;
2842
2843    case INDEX_op_cmp_vec:
2844        sub = args[3];
2845        if (sub == TCG_COND_EQ) {
2846            insn = cmpeq_insn[vece];
2847        } else if (sub == TCG_COND_GT) {
2848            insn = cmpgt_insn[vece];
2849        } else {
2850            g_assert_not_reached();
2851        }
2852        goto gen_simd;
2853
2854    case INDEX_op_andc_vec:
2855        insn = OPC_PANDN;
2856        if (type == TCG_TYPE_V256) {
2857            insn |= P_VEXL;
2858        }
2859        tcg_out_vex_modrm(s, insn, a0, a2, a1);
2860        break;
2861
2862    case INDEX_op_shli_vec:
2863        sub = 6;
2864        goto gen_shift;
2865    case INDEX_op_shri_vec:
2866        sub = 2;
2867        goto gen_shift;
2868    case INDEX_op_sari_vec:
2869        tcg_debug_assert(vece != MO_64);
2870        sub = 4;
2871    gen_shift:
2872        tcg_debug_assert(vece != MO_8);
2873        insn = shift_imm_insn[vece];
2874        if (type == TCG_TYPE_V256) {
2875            insn |= P_VEXL;
2876        }
2877        tcg_out_vex_modrm(s, insn, sub, a0, a1);
2878        tcg_out8(s, a2);
2879        break;
2880
2881    case INDEX_op_ld_vec:
2882        tcg_out_ld(s, type, a0, a1, a2);
2883        break;
2884    case INDEX_op_st_vec:
2885        tcg_out_st(s, type, a0, a1, a2);
2886        break;
2887    case INDEX_op_dupm_vec:
2888        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2889        break;
2890
2891    case INDEX_op_x86_shufps_vec:
2892        insn = OPC_SHUFPS;
2893        sub = args[3];
2894        goto gen_simd_imm8;
2895    case INDEX_op_x86_blend_vec:
2896        if (vece == MO_16) {
2897            insn = OPC_PBLENDW;
2898        } else if (vece == MO_32) {
2899            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2900        } else {
2901            g_assert_not_reached();
2902        }
2903        sub = args[3];
2904        goto gen_simd_imm8;
2905    case INDEX_op_x86_vperm2i128_vec:
2906        insn = OPC_VPERM2I128;
2907        sub = args[3];
2908        goto gen_simd_imm8;
2909    gen_simd_imm8:
2910        if (type == TCG_TYPE_V256) {
2911            insn |= P_VEXL;
2912        }
2913        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2914        tcg_out8(s, sub);
2915        break;
2916
2917    case INDEX_op_x86_vpblendvb_vec:
2918        insn = OPC_VPBLENDVB;
2919        if (type == TCG_TYPE_V256) {
2920            insn |= P_VEXL;
2921        }
2922        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2923        tcg_out8(s, args[3] << 4);
2924        break;
2925
2926    case INDEX_op_x86_psrldq_vec:
2927        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2928        tcg_out8(s, a2);
2929        break;
2930
2931    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
2932    case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi.  */
2933    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
2934    default:
2935        g_assert_not_reached();
2936    }
2937}
2938
2939static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
2940{
2941    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
2942    static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
2943    static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
2944    static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
2945    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
2946    static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
2947    static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
2948    static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
2949    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
2950    static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
2951    static const TCGTargetOpDef r_0_r = { .args_ct_str = { "r", "0", "r" } };
2952    static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
2953    static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
2954    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
2955    static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
2956    static const TCGTargetOpDef s_L = { .args_ct_str = { "s", "L" } };
2957    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
2958    static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
2959    static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
2960    static const TCGTargetOpDef s_L_L = { .args_ct_str = { "s", "L", "L" } };
2961    static const TCGTargetOpDef r_r_L_L
2962        = { .args_ct_str = { "r", "r", "L", "L" } };
2963    static const TCGTargetOpDef L_L_L_L
2964        = { .args_ct_str = { "L", "L", "L", "L" } };
2965    static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
2966    static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
2967    static const TCGTargetOpDef x_x_x_x
2968        = { .args_ct_str = { "x", "x", "x", "x" } };
2969    static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
2970
2971    switch (op) {
2972    case INDEX_op_goto_ptr:
2973        return &r;
2974
2975    case INDEX_op_ld8u_i32:
2976    case INDEX_op_ld8u_i64:
2977    case INDEX_op_ld8s_i32:
2978    case INDEX_op_ld8s_i64:
2979    case INDEX_op_ld16u_i32:
2980    case INDEX_op_ld16u_i64:
2981    case INDEX_op_ld16s_i32:
2982    case INDEX_op_ld16s_i64:
2983    case INDEX_op_ld_i32:
2984    case INDEX_op_ld32u_i64:
2985    case INDEX_op_ld32s_i64:
2986    case INDEX_op_ld_i64:
2987        return &r_r;
2988
2989    case INDEX_op_st8_i32:
2990    case INDEX_op_st8_i64:
2991        return &qi_r;
2992    case INDEX_op_st16_i32:
2993    case INDEX_op_st16_i64:
2994    case INDEX_op_st_i32:
2995    case INDEX_op_st32_i64:
2996        return &ri_r;
2997    case INDEX_op_st_i64:
2998        return &re_r;
2999
3000    case INDEX_op_add_i32:
3001    case INDEX_op_add_i64:
3002        return &r_r_re;
3003    case INDEX_op_sub_i32:
3004    case INDEX_op_sub_i64:
3005    case INDEX_op_mul_i32:
3006    case INDEX_op_mul_i64:
3007    case INDEX_op_or_i32:
3008    case INDEX_op_or_i64:
3009    case INDEX_op_xor_i32:
3010    case INDEX_op_xor_i64:
3011        return &r_0_re;
3012
3013    case INDEX_op_and_i32:
3014    case INDEX_op_and_i64:
3015        {
3016            static const TCGTargetOpDef and
3017                = { .args_ct_str = { "r", "0", "reZ" } };
3018            return &and;
3019        }
3020        break;
3021    case INDEX_op_andc_i32:
3022    case INDEX_op_andc_i64:
3023        {
3024            static const TCGTargetOpDef andc
3025                = { .args_ct_str = { "r", "r", "rI" } };
3026            return &andc;
3027        }
3028        break;
3029
3030    case INDEX_op_shl_i32:
3031    case INDEX_op_shl_i64:
3032    case INDEX_op_shr_i32:
3033    case INDEX_op_shr_i64:
3034    case INDEX_op_sar_i32:
3035    case INDEX_op_sar_i64:
3036        return have_bmi2 ? &r_r_ri : &r_0_ci;
3037    case INDEX_op_rotl_i32:
3038    case INDEX_op_rotl_i64:
3039    case INDEX_op_rotr_i32:
3040    case INDEX_op_rotr_i64:
3041        return &r_0_ci;
3042
3043    case INDEX_op_brcond_i32:
3044    case INDEX_op_brcond_i64:
3045        return &r_re;
3046
3047    case INDEX_op_bswap16_i32:
3048    case INDEX_op_bswap16_i64:
3049    case INDEX_op_bswap32_i32:
3050    case INDEX_op_bswap32_i64:
3051    case INDEX_op_bswap64_i64:
3052    case INDEX_op_neg_i32:
3053    case INDEX_op_neg_i64:
3054    case INDEX_op_not_i32:
3055    case INDEX_op_not_i64:
3056    case INDEX_op_extrh_i64_i32:
3057        return &r_0;
3058
3059    case INDEX_op_ext8s_i32:
3060    case INDEX_op_ext8s_i64:
3061    case INDEX_op_ext8u_i32:
3062    case INDEX_op_ext8u_i64:
3063        return &r_q;
3064    case INDEX_op_ext16s_i32:
3065    case INDEX_op_ext16s_i64:
3066    case INDEX_op_ext16u_i32:
3067    case INDEX_op_ext16u_i64:
3068    case INDEX_op_ext32s_i64:
3069    case INDEX_op_ext32u_i64:
3070    case INDEX_op_ext_i32_i64:
3071    case INDEX_op_extu_i32_i64:
3072    case INDEX_op_extrl_i64_i32:
3073    case INDEX_op_extract_i32:
3074    case INDEX_op_extract_i64:
3075    case INDEX_op_sextract_i32:
3076    case INDEX_op_ctpop_i32:
3077    case INDEX_op_ctpop_i64:
3078        return &r_r;
3079    case INDEX_op_extract2_i32:
3080    case INDEX_op_extract2_i64:
3081        return &r_0_r;
3082
3083    case INDEX_op_deposit_i32:
3084    case INDEX_op_deposit_i64:
3085        {
3086            static const TCGTargetOpDef dep
3087                = { .args_ct_str = { "Q", "0", "Q" } };
3088            return &dep;
3089        }
3090    case INDEX_op_setcond_i32:
3091    case INDEX_op_setcond_i64:
3092        {
3093            static const TCGTargetOpDef setc
3094                = { .args_ct_str = { "q", "r", "re" } };
3095            return &setc;
3096        }
3097    case INDEX_op_movcond_i32:
3098    case INDEX_op_movcond_i64:
3099        {
3100            static const TCGTargetOpDef movc
3101                = { .args_ct_str = { "r", "r", "re", "r", "0" } };
3102            return &movc;
3103        }
3104    case INDEX_op_div2_i32:
3105    case INDEX_op_div2_i64:
3106    case INDEX_op_divu2_i32:
3107    case INDEX_op_divu2_i64:
3108        {
3109            static const TCGTargetOpDef div2
3110                = { .args_ct_str = { "a", "d", "0", "1", "r" } };
3111            return &div2;
3112        }
3113    case INDEX_op_mulu2_i32:
3114    case INDEX_op_mulu2_i64:
3115    case INDEX_op_muls2_i32:
3116    case INDEX_op_muls2_i64:
3117        {
3118            static const TCGTargetOpDef mul2
3119                = { .args_ct_str = { "a", "d", "a", "r" } };
3120            return &mul2;
3121        }
3122    case INDEX_op_add2_i32:
3123    case INDEX_op_add2_i64:
3124    case INDEX_op_sub2_i32:
3125    case INDEX_op_sub2_i64:
3126        {
3127            static const TCGTargetOpDef arith2
3128                = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
3129            return &arith2;
3130        }
3131    case INDEX_op_ctz_i32:
3132    case INDEX_op_ctz_i64:
3133        {
3134            static const TCGTargetOpDef ctz[2] = {
3135                { .args_ct_str = { "&r", "r", "r" } },
3136                { .args_ct_str = { "&r", "r", "rW" } },
3137            };
3138            return &ctz[have_bmi1];
3139        }
3140    case INDEX_op_clz_i32:
3141    case INDEX_op_clz_i64:
3142        {
3143            static const TCGTargetOpDef clz[2] = {
3144                { .args_ct_str = { "&r", "r", "r" } },
3145                { .args_ct_str = { "&r", "r", "rW" } },
3146            };
3147            return &clz[have_lzcnt];
3148        }
3149
3150    case INDEX_op_qemu_ld_i32:
3151        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
3152    case INDEX_op_qemu_st_i32:
3153        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
3154    case INDEX_op_qemu_st8_i32:
3155        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &s_L : &s_L_L;
3156    case INDEX_op_qemu_ld_i64:
3157        return (TCG_TARGET_REG_BITS == 64 ? &r_L
3158                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
3159                : &r_r_L_L);
3160    case INDEX_op_qemu_st_i64:
3161        return (TCG_TARGET_REG_BITS == 64 ? &L_L
3162                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
3163                : &L_L_L_L);
3164
3165    case INDEX_op_brcond2_i32:
3166        {
3167            static const TCGTargetOpDef b2
3168                = { .args_ct_str = { "r", "r", "ri", "ri" } };
3169            return &b2;
3170        }
3171    case INDEX_op_setcond2_i32:
3172        {
3173            static const TCGTargetOpDef s2
3174                = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
3175            return &s2;
3176        }
3177
3178    case INDEX_op_ld_vec:
3179    case INDEX_op_st_vec:
3180    case INDEX_op_dupm_vec:
3181        return &x_r;
3182
3183    case INDEX_op_add_vec:
3184    case INDEX_op_sub_vec:
3185    case INDEX_op_mul_vec:
3186    case INDEX_op_and_vec:
3187    case INDEX_op_or_vec:
3188    case INDEX_op_xor_vec:
3189    case INDEX_op_andc_vec:
3190    case INDEX_op_ssadd_vec:
3191    case INDEX_op_usadd_vec:
3192    case INDEX_op_sssub_vec:
3193    case INDEX_op_ussub_vec:
3194    case INDEX_op_smin_vec:
3195    case INDEX_op_umin_vec:
3196    case INDEX_op_smax_vec:
3197    case INDEX_op_umax_vec:
3198    case INDEX_op_shlv_vec:
3199    case INDEX_op_shrv_vec:
3200    case INDEX_op_sarv_vec:
3201    case INDEX_op_shls_vec:
3202    case INDEX_op_shrs_vec:
3203    case INDEX_op_sars_vec:
3204    case INDEX_op_rotls_vec:
3205    case INDEX_op_cmp_vec:
3206    case INDEX_op_x86_shufps_vec:
3207    case INDEX_op_x86_blend_vec:
3208    case INDEX_op_x86_packss_vec:
3209    case INDEX_op_x86_packus_vec:
3210    case INDEX_op_x86_vperm2i128_vec:
3211    case INDEX_op_x86_punpckl_vec:
3212    case INDEX_op_x86_punpckh_vec:
3213#if TCG_TARGET_REG_BITS == 32
3214    case INDEX_op_dup2_vec:
3215#endif
3216        return &x_x_x;
3217    case INDEX_op_abs_vec:
3218    case INDEX_op_dup_vec:
3219    case INDEX_op_shli_vec:
3220    case INDEX_op_shri_vec:
3221    case INDEX_op_sari_vec:
3222    case INDEX_op_x86_psrldq_vec:
3223        return &x_x;
3224    case INDEX_op_x86_vpblendvb_vec:
3225        return &x_x_x_x;
3226
3227    default:
3228        break;
3229    }
3230    return NULL;
3231}
3232
3233int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3234{
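    /*
     * Return value convention: 1 means the opcode is supported directly
     * for this element size, 0 means it is not supported at all, and -1
     * means it is not supported natively but can be synthesized via
     * tcg_expand_vec_op() below.
     */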
3235    switch (opc) {
3236    case INDEX_op_add_vec:
3237    case INDEX_op_sub_vec:
3238    case INDEX_op_and_vec:
3239    case INDEX_op_or_vec:
3240    case INDEX_op_xor_vec:
3241    case INDEX_op_andc_vec:
3242        return 1;
3243    case INDEX_op_rotli_vec:
3244    case INDEX_op_cmp_vec:
3245    case INDEX_op_cmpsel_vec:
3246        return -1;
3247
3248    case INDEX_op_shli_vec:
3249    case INDEX_op_shri_vec:
3250        /* We must expand the operation for MO_8.  */
3251        return vece == MO_8 ? -1 : 1;
3252
3253    case INDEX_op_sari_vec:
3254        /* We must expand the operation for MO_8.  */
3255        if (vece == MO_8) {
3256            return -1;
3257        }
3258        /* We can emulate this for MO_64, but it does not pay off
3259           unless we're producing at least 4 values.  */
3260        if (vece == MO_64) {
3261            return type >= TCG_TYPE_V256 ? -1 : 0;
3262        }
3263        return 1;
3264
3265    case INDEX_op_shls_vec:
3266    case INDEX_op_shrs_vec:
3267        return vece >= MO_16;
3268    case INDEX_op_sars_vec:
3269        return vece >= MO_16 && vece <= MO_32;
3270    case INDEX_op_rotls_vec:
3271        return vece >= MO_16 ? -1 : 0;
3272
3273    case INDEX_op_shlv_vec:
3274    case INDEX_op_shrv_vec:
3275        return have_avx2 && vece >= MO_32;
3276    case INDEX_op_sarv_vec:
3277        return have_avx2 && vece == MO_32;
3278    case INDEX_op_rotlv_vec:
3279    case INDEX_op_rotrv_vec:
3280        return have_avx2 && vece >= MO_32 ? -1 : 0;
3281
3282    case INDEX_op_mul_vec:
3283        if (vece == MO_8) {
3284            /* We can expand the operation for MO_8.  */
3285            return -1;
3286        }
3287        if (vece == MO_64) {
3288            return 0;
3289        }
3290        return 1;
3291
3292    case INDEX_op_ssadd_vec:
3293    case INDEX_op_usadd_vec:
3294    case INDEX_op_sssub_vec:
3295    case INDEX_op_ussub_vec:
3296        return vece <= MO_16;
3297    case INDEX_op_smin_vec:
3298    case INDEX_op_smax_vec:
3299    case INDEX_op_umin_vec:
3300    case INDEX_op_umax_vec:
3301    case INDEX_op_abs_vec:
3302        return vece <= MO_32;
3303
3304    default:
3305        return 0;
3306    }
3307}
3308
3309static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3310                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3311{
3312    TCGv_vec t1, t2;
3313
3314    tcg_debug_assert(vece == MO_8);
3315
3316    t1 = tcg_temp_new_vec(type);
3317    t2 = tcg_temp_new_vec(type);
3318
3319    /*
3320     * Unpack to W, shift, and repack.  Tricky bits:
3321     * (1) Use punpck*bw x,x to produce DDCCBBAA,
3322     *     i.e. duplicate in other half of the 16-bit lane.
3323     * (2) For right-shift, add 8 so that the high half of the lane
3324     *     becomes zero.  For left-shift and left-rotate, we must
3325     *     shift up and down again.
3326     * (3) Step 2 leaves high half zero such that PACKUSWB
3327     *     (pack with unsigned saturation) does not modify
3328     *     the quantity.
3329     */
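    /*
     * For example (right shift, imm = 3): after the unpack a byte A sits in
     * the 16-bit lane (A << 8) | A.  Shifting that lane right by imm + 8 = 11
     * yields A >> 3 with a zero high byte, and PACKUSWB narrows it back to a
     * byte without saturating.  For a left shift, the lane is shifted left by
     * imm + 8 and back right by 8, leaving (A << imm) & 0xff, again with a
     * zero high byte.
     */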
3330    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3331              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3332    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3333              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3334
3335    if (opc != INDEX_op_rotli_vec) {
3336        imm += 8;
3337    }
3338    if (opc == INDEX_op_shri_vec) {
3339        tcg_gen_shri_vec(MO_16, t1, t1, imm);
3340        tcg_gen_shri_vec(MO_16, t2, t2, imm);
3341    } else {
3342        tcg_gen_shli_vec(MO_16, t1, t1, imm);
3343        tcg_gen_shli_vec(MO_16, t2, t2, imm);
3344        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3345        tcg_gen_shri_vec(MO_16, t2, t2, 8);
3346    }
3347
3348    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3349              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3350    tcg_temp_free_vec(t1);
3351    tcg_temp_free_vec(t2);
3352}
3353
3354static void expand_vec_sari(TCGType type, unsigned vece,
3355                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3356{
3357    TCGv_vec t1, t2;
3358
3359    switch (vece) {
3360    case MO_8:
3361        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3362        t1 = tcg_temp_new_vec(type);
3363        t2 = tcg_temp_new_vec(type);
3364        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3365                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3366        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3367                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3368        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3369        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3370        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3371                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3372        tcg_temp_free_vec(t1);
3373        tcg_temp_free_vec(t2);
3374        break;
3375
3376    case MO_64:
3377        if (imm <= 32) {
3378            /*
3379             * We can emulate a small sign extend by performing an arithmetic
3380             * 32-bit shift and overwriting the high half of a 64-bit logical
3381             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3382             * does not, so we have to bound the smaller shift -- we get the
3383             * same result in the high half either way.
3384             */
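            /*
             * E.g. for x = 0xffffffff_00000000 and imm = 8: the 64-bit
             * logical shift gives 0x00ffffff_ff000000, whose low dword is
             * already correct; the 32-bit arithmetic shift of the high
             * dword gives 0xffffffff, and blending with 0xaa (select the
             * odd dword of each lane from t1) produces the expected
             * 0xffffffff_ff000000.
             */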
3385            t1 = tcg_temp_new_vec(type);
3386            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3387            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3388            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3389                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3390                      tcgv_vec_arg(t1), 0xaa);
3391            tcg_temp_free_vec(t1);
3392        } else {
3393            /* Otherwise we will need to use a compare vs 0 to produce
3394             * the sign-extend, shift and merge.
3395             */
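            /* With t1 initially zero, GT computes 0 > v1, i.e. an all-ones
             * mask exactly where v1 is negative; shifting that mask left by
             * 64 - imm and OR-ing it over the logical shift fills the top
             * imm bits with copies of the sign bit.
             */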
3396            t1 = tcg_const_zeros_vec(type);
3397            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3398            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3399            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3400            tcg_gen_or_vec(MO_64, v0, v0, t1);
3401            tcg_temp_free_vec(t1);
3402        }
3403        break;
3404
3405    default:
3406        g_assert_not_reached();
3407    }
3408}
3409
3410static void expand_vec_rotli(TCGType type, unsigned vece,
3411                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3412{
3413    TCGv_vec t;
3414
3415    if (vece == MO_8) {
3416        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3417        return;
3418    }
3419
3420    t = tcg_temp_new_vec(type);
3421    tcg_gen_shli_vec(vece, t, v1, imm);
3422    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3423    tcg_gen_or_vec(vece, v0, v0, t);
3424    tcg_temp_free_vec(t);
3425}
3426
3427static void expand_vec_rotls(TCGType type, unsigned vece,
3428                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3429{
3430    TCGv_i32 rsh;
3431    TCGv_vec t;
3432
3433    tcg_debug_assert(vece != MO_8);
3434
3435    t = tcg_temp_new_vec(type);
3436    rsh = tcg_temp_new_i32();
3437
3438    tcg_gen_neg_i32(rsh, lsh);
3439    tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3440    tcg_gen_shls_vec(vece, t, v1, lsh);
3441    tcg_gen_shrs_vec(vece, v0, v1, rsh);
3442    tcg_gen_or_vec(vece, v0, v0, t);
3443    tcg_temp_free_vec(t);
3444    tcg_temp_free_i32(rsh);
3445}
3446
3447static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3448                            TCGv_vec v1, TCGv_vec sh, bool right)
3449{
3450    TCGv_vec t = tcg_temp_new_vec(type);
3451
3452    tcg_gen_dupi_vec(vece, t, 8 << vece);
3453    tcg_gen_sub_vec(vece, t, t, sh);
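    /*
     * t now holds the complementary count, (8 << vece) - sh.  When a lane
     * of sh is zero that count equals the element width; the AVX2 variable
     * shifts used here return 0 for out-of-range counts, so the OR below
     * still produces the unrotated value for that lane.
     */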
3454    if (right) {
3455        tcg_gen_shlv_vec(vece, t, v1, t);
3456        tcg_gen_shrv_vec(vece, v0, v1, sh);
3457    } else {
3458        tcg_gen_shrv_vec(vece, t, v1, t);
3459        tcg_gen_shlv_vec(vece, v0, v1, sh);
3460    }
3461    tcg_gen_or_vec(vece, v0, v0, t);
3462    tcg_temp_free_vec(t);
3463}
3464
3465static void expand_vec_mul(TCGType type, unsigned vece,
3466                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3467{
3468    TCGv_vec t1, t2, t3, t4;
3469
3470    tcg_debug_assert(vece == MO_8);
3471
3472    /*
3473     * Unpack v1 bytes to words, 0 | x.
3474     * Unpack v2 bytes to words, y | 0.
3475     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3476     * Shift logical right by 8 bits to clear the high 8 bits before
3477     * using an unsigned saturated pack.
3478     *
3479     * The difference between the V64, V128 and V256 cases is merely how
3480     * we distribute the expansion between temporaries.
3481     */
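    /*
     * Concretely, for one byte pair x and y: the unpacks give the 16-bit
     * lanes 0x00xx and 0xyy00, whose 16-bit product is (x * y) << 8
     * truncated to 16 bits; the shift right by 8 leaves the low 8 bits of
     * x * y with a zero high byte, which PACKUSWB then narrows without
     * saturating.
     */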
3482    switch (type) {
3483    case TCG_TYPE_V64:
3484        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3485        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3486        tcg_gen_dup16i_vec(t2, 0);
3487        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3488                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2));
3489        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3490                  tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2));
3491        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3492        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3493        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3494                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3495        tcg_temp_free_vec(t1);
3496        tcg_temp_free_vec(t2);
3497        break;
3498
3499    case TCG_TYPE_V128:
3500    case TCG_TYPE_V256:
3501        t1 = tcg_temp_new_vec(type);
3502        t2 = tcg_temp_new_vec(type);
3503        t3 = tcg_temp_new_vec(type);
3504        t4 = tcg_temp_new_vec(type);
3505        tcg_gen_dup16i_vec(t4, 0);
3506        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3507                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
3508        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3509                  tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
3510        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3511                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
3512        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3513                  tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
3514        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3515        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3516        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3517        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3518        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3519                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3520        tcg_temp_free_vec(t1);
3521        tcg_temp_free_vec(t2);
3522        tcg_temp_free_vec(t3);
3523        tcg_temp_free_vec(t4);
3524        break;
3525
3526    default:
3527        g_assert_not_reached();
3528    }
3529}
3530
3531static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3532                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3533{
3534    enum {
3535        NEED_INV  = 1,
3536        NEED_SWAP = 2,
3537        NEED_BIAS = 4,
3538        NEED_UMIN = 8,
3539        NEED_UMAX = 16,
3540    };
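    /*
     * SSE/AVX only provide equality and signed greater-than compares, so
     * everything else is rewritten in terms of those: NEED_INV inverts the
     * result afterwards, NEED_SWAP exchanges the operands, NEED_UMIN/UMAX
     * use e.g. "v1 == umin(v1, v2)" for an unsigned v1 <= v2 (the packed
     * unsigned min/max exist only up to 32-bit elements), and NEED_BIAS
     * subtracts the sign bit (e.g. 1 << 63 for MO_64) from both operands
     * so that an unsigned compare becomes the corresponding signed one.
     */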
3541    TCGv_vec t1, t2;
3542    uint8_t fixup;
3543
3544    switch (cond) {
3545    case TCG_COND_EQ:
3546    case TCG_COND_GT:
3547        fixup = 0;
3548        break;
3549    case TCG_COND_NE:
3550    case TCG_COND_LE:
3551        fixup = NEED_INV;
3552        break;
3553    case TCG_COND_LT:
3554        fixup = NEED_SWAP;
3555        break;
3556    case TCG_COND_GE:
3557        fixup = NEED_SWAP | NEED_INV;
3558        break;
3559    case TCG_COND_LEU:
3560        if (vece <= MO_32) {
3561            fixup = NEED_UMIN;
3562        } else {
3563            fixup = NEED_BIAS | NEED_INV;
3564        }
3565        break;
3566    case TCG_COND_GTU:
3567        if (vece <= MO_32) {
3568            fixup = NEED_UMIN | NEED_INV;
3569        } else {
3570            fixup = NEED_BIAS;
3571        }
3572        break;
3573    case TCG_COND_GEU:
3574        if (vece <= MO_32) {
3575            fixup = NEED_UMAX;
3576        } else {
3577            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3578        }
3579        break;
3580    case TCG_COND_LTU:
3581        if (vece <= MO_32) {
3582            fixup = NEED_UMAX | NEED_INV;
3583        } else {
3584            fixup = NEED_BIAS | NEED_SWAP;
3585        }
3586        break;
3587    default:
3588        g_assert_not_reached();
3589    }
3590
3591    if (fixup & NEED_INV) {
3592        cond = tcg_invert_cond(cond);
3593    }
3594    if (fixup & NEED_SWAP) {
3595        t1 = v1, v1 = v2, v2 = t1;
3596        cond = tcg_swap_cond(cond);
3597    }
3598
3599    t1 = t2 = NULL;
3600    if (fixup & (NEED_UMIN | NEED_UMAX)) {
3601        t1 = tcg_temp_new_vec(type);
3602        if (fixup & NEED_UMIN) {
3603            tcg_gen_umin_vec(vece, t1, v1, v2);
3604        } else {
3605            tcg_gen_umax_vec(vece, t1, v1, v2);
3606        }
3607        v2 = t1;
3608        cond = TCG_COND_EQ;
3609    } else if (fixup & NEED_BIAS) {
3610        t1 = tcg_temp_new_vec(type);
3611        t2 = tcg_temp_new_vec(type);
3612        tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
3613        tcg_gen_sub_vec(vece, t1, v1, t2);
3614        tcg_gen_sub_vec(vece, t2, v2, t2);
3615        v1 = t1;
3616        v2 = t2;
3617        cond = tcg_signed_cond(cond);
3618    }
3619
3620    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3621    /* Expand directly; do not recurse.  */
3622    vec_gen_4(INDEX_op_cmp_vec, type, vece,
3623              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3624
3625    if (t1) {
3626        tcg_temp_free_vec(t1);
3627        if (t2) {
3628            tcg_temp_free_vec(t2);
3629        }
3630    }
3631    return fixup & NEED_INV;
3632}
3633
3634static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3635                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3636{
3637    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3638        tcg_gen_not_vec(vece, v0, v0);
3639    }
3640}
3641
3642static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3643                              TCGv_vec c1, TCGv_vec c2,
3644                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3645{
3646    TCGv_vec t = tcg_temp_new_vec(type);
3647
3648    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3649        /* Invert the sense of the compare by swapping arguments.  */
3650        TCGv_vec x;
3651        x = v3, v3 = v4, v4 = x;
3652    }
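    /*
     * VPBLENDVB chooses every byte of the result from one of the two data
     * operands according to the top bit of the corresponding byte of the
     * mask in t.  Since the compare above yields all-ones or all-zeros per
     * element, this byte-granular blend is correct for any element size.
     */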
3653    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3654              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3655              tcgv_vec_arg(v3), tcgv_vec_arg(t));
3656    tcg_temp_free_vec(t);
3657}
3658
3659void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3660                       TCGArg a0, ...)
3661{
3662    va_list va;
3663    TCGArg a2;
3664    TCGv_vec v0, v1, v2, v3, v4;
3665
3666    va_start(va, a0);
3667    v0 = temp_tcgv_vec(arg_temp(a0));
3668    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3669    a2 = va_arg(va, TCGArg);
3670
3671    switch (opc) {
3672    case INDEX_op_shli_vec:
3673    case INDEX_op_shri_vec:
3674        expand_vec_shi(type, vece, opc, v0, v1, a2);
3675        break;
3676
3677    case INDEX_op_sari_vec:
3678        expand_vec_sari(type, vece, v0, v1, a2);
3679        break;
3680
3681    case INDEX_op_rotli_vec:
3682        expand_vec_rotli(type, vece, v0, v1, a2);
3683        break;
3684
3685    case INDEX_op_rotls_vec:
3686        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3687        break;
3688
3689    case INDEX_op_rotlv_vec:
3690        v2 = temp_tcgv_vec(arg_temp(a2));
3691        expand_vec_rotv(type, vece, v0, v1, v2, false);
3692        break;
3693    case INDEX_op_rotrv_vec:
3694        v2 = temp_tcgv_vec(arg_temp(a2));
3695        expand_vec_rotv(type, vece, v0, v1, v2, true);
3696        break;
3697
3698    case INDEX_op_mul_vec:
3699        v2 = temp_tcgv_vec(arg_temp(a2));
3700        expand_vec_mul(type, vece, v0, v1, v2);
3701        break;
3702
3703    case INDEX_op_cmp_vec:
3704        v2 = temp_tcgv_vec(arg_temp(a2));
3705        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3706        break;
3707
3708    case INDEX_op_cmpsel_vec:
3709        v2 = temp_tcgv_vec(arg_temp(a2));
3710        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3711        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3712        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3713        break;
3714
3715    default:
3716        break;
3717    }
3718
3719    va_end(va);
3720}
3721
3722static const int tcg_target_callee_save_regs[] = {
3723#if TCG_TARGET_REG_BITS == 64
3724    TCG_REG_RBP,
3725    TCG_REG_RBX,
3726#if defined(_WIN64)
3727    TCG_REG_RDI,
3728    TCG_REG_RSI,
3729#endif
3730    TCG_REG_R12,
3731    TCG_REG_R13,
3732    TCG_REG_R14, /* Currently used for the global env. */
3733    TCG_REG_R15,
3734#else
3735    TCG_REG_EBP, /* Currently used for the global env. */
3736    TCG_REG_EBX,
3737    TCG_REG_ESI,
3738    TCG_REG_EDI,
3739#endif
3740};
3741
3742/* Compute frame size via macros, to share between tcg_target_qemu_prologue
3743   and tcg_register_jit.  */
3744
3745#define PUSH_SIZE \
3746    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3747     * (TCG_TARGET_REG_BITS / 8))
3748
3749#define FRAME_SIZE \
3750    ((PUSH_SIZE \
3751      + TCG_STATIC_CALL_ARGS_SIZE \
3752      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3753      + TCG_TARGET_STACK_ALIGN - 1) \
3754     & ~(TCG_TARGET_STACK_ALIGN - 1))
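/*
 * For example, on a 64-bit non-Windows host there are 6 callee-saved
 * registers, so PUSH_SIZE is (1 + 6) * 8 = 56 bytes including the return
 * address.  Assuming the usual values TCG_STATIC_CALL_ARGS_SIZE == 128 and
 * CPU_TEMP_BUF_NLONGS == 128, FRAME_SIZE rounds 56 + 128 + 128 * 8 up to a
 * 16-byte boundary, i.e. 1216 bytes.
 */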
3755
3756/* Generate global QEMU prologue and epilogue code */
3757static void tcg_target_qemu_prologue(TCGContext *s)
3758{
3759    int i, stack_addend;
3760
3761    /* TB prologue */
3762
3763    /* Reserve some stack space, also for TCG temps.  */
3764    stack_addend = FRAME_SIZE - PUSH_SIZE;
3765    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3766                  CPU_TEMP_BUF_NLONGS * sizeof(long));
3767
3768    /* Save all callee saved registers.  */
3769    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3770        tcg_out_push(s, tcg_target_callee_save_regs[i]);
3771    }
3772
3773#if TCG_TARGET_REG_BITS == 32
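    /*
     * With the stack-based 32-bit calling convention the caller left
     * [retaddr][env][tb] on the stack, so after pushing the callee-saved
     * registers env sits at (nregs + 1) * 4 and tb at (nregs + 2) * 4;
     * the jmp below adds stack_addend because it executes after the frame
     * has been reserved.
     */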
3774    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3775               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3776    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3777    /* jmp *tb.  */
3778    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3779                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3780                         + stack_addend);
3781#else
3782# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
3783    if (guest_base) {
3784        int seg = setup_guest_base_seg();
3785        if (seg != 0) {
3786            x86_guest_base_seg = seg;
3787        } else if (guest_base == (int32_t)guest_base) {
3788            x86_guest_base_offset = guest_base;
3789        } else {
3790            /* Choose R12 because, as a base, it requires a SIB byte. */
3791            x86_guest_base_index = TCG_REG_R12;
3792            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
3793            tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
3794        }
3795    }
3796# endif
3797    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3798    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3799    /* jmp *tb.  */
3800    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3801#endif
3802
3803    /*
3804     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3805     * and fall through to the rest of the epilogue.
3806     */
3807    /* TODO: Cast goes away when all hosts converted */
3808    tcg_code_gen_epilogue = (void *)tcg_splitwx_to_rx(s->code_ptr);
3809    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3810
3811    /* TB epilogue */
3812    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
3813
3814    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3815
3816    if (have_avx2) {
3817        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3818    }
3819    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3820        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3821    }
3822    tcg_out_opc(s, OPC_RET, 0, 0, 0);
3823}
3824
3825static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3826{
3827    memset(p, 0x90, count);
3828}
3829
3830static void tcg_target_init(TCGContext *s)
3831{
3832#ifdef CONFIG_CPUID_H
3833    unsigned a, b, c, d, b7 = 0;
3834    int max = __get_cpuid_max(0, 0);
3835
3836    if (max >= 7) {
3837        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
3838        __cpuid_count(7, 0, a, b7, c, d);
3839        have_bmi1 = (b7 & bit_BMI) != 0;
3840        have_bmi2 = (b7 & bit_BMI2) != 0;
3841    }
3842
3843    if (max >= 1) {
3844        __cpuid(1, a, b, c, d);
3845#ifndef have_cmov
3846        /* For 32-bit, 99% certainty that we're running on hardware that
3847           supports cmov, but we still need to check.  In case cmov is not
3848           available, we'll use a small forward branch.  */
3849        have_cmov = (d & bit_CMOV) != 0;
3850#endif
3851
3852        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3853           need to probe for it.  */
3854        have_movbe = (c & bit_MOVBE) != 0;
3855        have_popcnt = (c & bit_POPCNT) != 0;
3856
3857        /* There are a number of things we must check before we can be
3858           sure of not hitting invalid opcode.  */
3859        if (c & bit_OSXSAVE) {
3860            unsigned xcrl, xcrh;
3861            /* The xgetbv instruction is not available to older versions of
3862             * the assembler, so we encode the instruction manually.
3863             */
3864            asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
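            /*
             * xgetbv(0) returns XCR0; bit 1 covers SSE (XMM) state and
             * bit 2 covers AVX (YMM) state, so the mask of 6 checks that
             * the OS actually saves/restores both before we advertise AVX.
             */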
3865            if ((xcrl & 6) == 6) {
3866                have_avx1 = (c & bit_AVX) != 0;
3867                have_avx2 = (b7 & bit_AVX2) != 0;
3868            }
3869        }
3870    }
3871
3872    max = __get_cpuid_max(0x80000000, 0);
3873    if (max >= 0x80000001) {
3874        __cpuid(0x80000001, a, b, c, d);
3875        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
3876        have_lzcnt = (c & bit_LZCNT) != 0;
3877    }
3878#endif /* CONFIG_CPUID_H */
3879
3880    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3881    if (TCG_TARGET_REG_BITS == 64) {
3882        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3883    }
3884    if (have_avx1) {
3885        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3886        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3887    }
3888    if (have_avx2) {
3889        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3890    }
3891
3892    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3893    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3894    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3895    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3896    if (TCG_TARGET_REG_BITS == 64) {
3897#if !defined(_WIN64)
3898        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3899        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3900#endif
3901        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3902        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3903        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3904        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3905    }
3906
3907    s->reserved_regs = 0;
3908    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3909}
3910
3911typedef struct {
3912    DebugFrameHeader h;
3913    uint8_t fde_def_cfa[4];
3914    uint8_t fde_reg_ofs[14];
3915} DebugFrame;
3916
3917/* We're expecting a 2 byte uleb128 encoded value.  */
3918QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
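/*
 * A two-byte uleb128 holds 14 bits of payload, which is what the
 * fde_def_cfa encoding below assumes: the first byte carries the low 7
 * bits of FRAME_SIZE with the continuation bit (0x80) set, the second
 * byte the remaining bits.
 */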
3919
3920#if !defined(__ELF__)
3921    /* Host machine without ELF. */
3922#elif TCG_TARGET_REG_BITS == 64
3923#define ELF_HOST_MACHINE EM_X86_64
3924static const DebugFrame debug_frame = {
3925    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3926    .h.cie.id = -1,
3927    .h.cie.version = 1,
3928    .h.cie.code_align = 1,
3929    .h.cie.data_align = 0x78,             /* sleb128 -8 */
3930    .h.cie.return_column = 16,
3931
3932    /* Total FDE size does not include the "len" member.  */
3933    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3934
3935    .fde_def_cfa = {
3936        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
3937        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3938        (FRAME_SIZE >> 7)
3939    },
3940    .fde_reg_ofs = {
3941        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
3942        /* The following ordering must match tcg_target_callee_save_regs.  */
3943        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
3944        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
3945        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
3946        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
3947        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
3948        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
3949    }
3950};
3951#else
3952#define ELF_HOST_MACHINE EM_386
3953static const DebugFrame debug_frame = {
3954    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3955    .h.cie.id = -1,
3956    .h.cie.version = 1,
3957    .h.cie.code_align = 1,
3958    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
3959    .h.cie.return_column = 8,
3960
3961    /* Total FDE size does not include the "len" member.  */
3962    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3963
3964    .fde_def_cfa = {
3965        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
3966        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3967        (FRAME_SIZE >> 7)
3968    },
3969    .fde_reg_ofs = {
3970        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
3971        /* The following ordering must match tcg_target_callee_save_regs.  */
3972        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
3973        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
3974        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
3975        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
3976    }
3977};
3978#endif
3979
3980#if defined(ELF_HOST_MACHINE)
3981void tcg_register_jit(const void *buf, size_t buf_size)
3982{
3983    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
3984}
3985#endif
3986