tcg/i386/tcg-target.c.inc

/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/*
 * Used for function call generation.
 *
 * These macros describe the host calling convention to the common code:
 * the required stack alignment, the offset of the first stack argument,
 * and how 32/64/128-bit arguments and 128-bit return values are passed.
 */
#define TCG_TARGET_STACK_ALIGN 16
#if defined(_WIN64)
/* NOTE(review): presumably the 32-byte Win64 register home area - confirm. */
#define TCG_TARGET_CALL_STACK_OFFSET 32
#else
#define TCG_TARGET_CALL_STACK_OFFSET 0
#endif
#define TCG_TARGET_CALL_ARG_I32      TCG_CALL_ARG_NORMAL
#define TCG_TARGET_CALL_ARG_I64      TCG_CALL_ARG_NORMAL
/* 128-bit values differ per ABI: Win64, other 64-bit hosts, 32-bit hosts. */
#if defined(_WIN64)
# define TCG_TARGET_CALL_ARG_I128    TCG_CALL_ARG_BY_REF
# define TCG_TARGET_CALL_RET_I128    TCG_CALL_RET_BY_VEC
#elif TCG_TARGET_REG_BITS == 64
# define TCG_TARGET_CALL_ARG_I128    TCG_CALL_ARG_NORMAL
# define TCG_TARGET_CALL_RET_I128    TCG_CALL_RET_NORMAL
#else
# define TCG_TARGET_CALL_ARG_I128    TCG_CALL_ARG_NORMAL
# define TCG_TARGET_CALL_RET_I128    TCG_CALL_RET_BY_REF
#endif

#ifdef CONFIG_DEBUG_TCG
/*
 * Printable register names, in register-number order; only built when
 * CONFIG_DEBUG_TCG is defined.  The %r8-%r15 entries are listed for both
 * register widths so that the vector names always start at index 16,
 * matching the "< 16 means general register" tests used in this file.
 */
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

/*
 * Registers in allocation-preference order: general registers first
 * (a different list for 64-bit and 32-bit hosts), then vector registers.
 * NOTE(review): the consumer of this table is outside this chunk;
 * presumably the common TCG register allocator - confirm.
 */
static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saved, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated
       on Win64; every other host may use the registers below as well.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};

/*
 * Vector register used as a temporary.
 * NOTE(review): its uses are outside this chunk; confirm how it is kept
 * away from the register allocator.
 */
#define TCG_TMP_VEC  TCG_REG_XMM5

/*
 * Integer argument registers for function calls, in argument order.
 * Win64 lists RCX, RDX, R8, R9; other 64-bit hosts list RDI, RSI, RDX,
 * RCX, R8, R9; on 32-bit hosts the table is empty because arguments are
 * passed on the stack.
 */
static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};

/*
 * Return the host register carrying return-value slot SLOT for a call
 * whose result is delivered according to KIND.
 *
 * TCG_CALL_RET_NORMAL uses EAX for slot 0 and EDX for slot 1.  On Win64,
 * TCG_CALL_RET_BY_VEC has a single slot, in XMM0.  Any other kind is a
 * programming error.
 */
static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
{
    if (kind == TCG_CALL_RET_NORMAL) {
        tcg_debug_assert(slot >= 0 && slot <= 1);
        return slot == 0 ? TCG_REG_EAX : TCG_REG_EDX;
    }
#ifdef _WIN64
    if (kind == TCG_CALL_RET_BY_VEC) {
        tcg_debug_assert(slot == 0);
        return TCG_REG_XMM0;
    }
#endif
    g_assert_not_reached();
}

/*
 * Constants we accept.  Each bit is a constraint flag tested by
 * tcg_target_const_match(); the per-flag notes describe the check made
 * for 64-bit operations (for TCG_TYPE_I32 the S32/U32/I32/TST flags
 * accept any value).
 */
#define TCG_CT_CONST_S32 0x100      /* value fits in int32_t */
#define TCG_CT_CONST_U32 0x200      /* value fits in uint32_t */
#define TCG_CT_CONST_I32 0x400      /* ~value fits in int32_t */
#define TCG_CT_CONST_WSZ 0x800      /* value equals the operand width */
#define TCG_CT_CONST_TST 0x1000     /* test cond: uint32_t or power of 2 */
#define TCG_CT_CONST_ZERO 0x2000    /* value is zero */

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

/*
 * Register sets as bitmasks, bit N standing for register number N:
 * general registers occupy the low bits, vector registers start at
 * bit 16.  ALL_BYTEL_REGS is the subset usable as a byte operand; on
 * 32-bit hosts that is only the first four registers.
 */
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
# define ALL_BYTEL_REGS        0x0000000fu
#endif
/* L0 and L1 as a register mask when softmmu is in use, otherwise empty. */
#define SOFTMMU_RESERVE_REGS \
    (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)

/* Host CPU feature tests, derived from the cpuinfo flag word. */
#define have_bmi2       (cpuinfo & CPUINFO_BMI2)
#define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)

/*
 * NOTE(review): not referenced in this chunk; by its name, presumably the
 * address generated code jumps to in order to leave a translation block -
 * confirm against the prologue/exit_tb code.
 */
static const tcg_insn_unit *tb_ret_addr;

/*
 * Resolve one relocation at CODE_PTR.
 *
 * The target is VALUE + ADDEND.  R_386_32 stores it as an absolute 32-bit
 * value.  R_386_PC32 and R_386_PC8 store it relative to the executable
 * address of CODE_PTR, as a 32-bit or 8-bit field respectively.
 *
 * Returns false, without patching, if a pc-relative displacement does not
 * fit its field; returns true otherwise.  Any other TYPE is a programming
 * error.
 */
static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
                        intptr_t value, intptr_t addend)
{
    const intptr_t target = value + addend;
    intptr_t rel;

    switch (type) {
    case R_386_32:
        tcg_patch32(code_ptr, target);
        return true;

    case R_386_PC32:
        rel = target - (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (rel != (int32_t)rel) {
            return false;
        }
        tcg_patch32(code_ptr, rel);
        return true;

    case R_386_PC8:
        rel = target - (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (rel != (int8_t)rel) {
            return false;
        }
        tcg_patch8(code_ptr, rel);
        return true;

    default:
        g_assert_not_reached();
    }
}

/*
 * Test whether the constant VAL satisfies the constraint flags CT for an
 * operation of the given TYPE and comparison COND.  VECE is not examined.
 * Returns true when at least one requested constraint accepts VAL.
 */
static bool tcg_target_const_match(int64_t val, int ct,
                                   TCGType type, TCGCond cond, int vece)
{
    const bool is_i32 = (type == TCG_TYPE_I32);
    const int width = is_i32 ? 32 : 64;

    /* An unrestricted constant always matches. */
    if (ct & TCG_CT_CONST) {
        return true;
    }

    if (is_i32) {
        /* For 32-bit operations every value satisfies these range flags. */
        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 |
                  TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) {
            return true;
        }
    } else {
        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
            return true;
        }
        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
            return true;
        }
        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
            return true;
        }
        /*
         * TCG_CT_CONST_TST is combined with TCG_CT_CONST_S32, which has
         * already covered the plain TESTQ immediate.  In addition accept:
         *    TESTQ -> TESTL   (value fits in uint32_t)
         *    TESTQ -> BT      (value is a power of 2)
         */
        if ((ct & TCG_CT_CONST_TST) && is_tst_cond(cond)) {
            if (val == (uint32_t)val || is_power_of_2(val)) {
                return true;
            }
        }
    }

    /* The operand width itself: 32 or 64. */
    if ((ct & TCG_CT_CONST_WSZ) && val == width) {
        return true;
    }

    return (ct & TCG_CT_CONST_ZERO) && val == 0;
}

/* Low three bits of a register number, as placed in ModRM/SIB fields. */
# define LOWREGMASK(x)	((x) & 7)

/*
 * Flags ORed into an opcode value above the low 8 bits, which hold the
 * opcode byte itself.  The tcg_out_opc / tcg_out_vex_opc /
 * tcg_out_evex_opc emitters turn them into prefix bytes, escape bytes
 * or VEX/EVEX fields.  On 32-bit hosts the REX and %gs flags are defined
 * as 0 and therefore have no effect.
 */
#define P_EXT		0x100		/* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#define P_VEXW          0x1000          /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_REXW		0
# define P_REXB_R	0
# define P_REXB_RM	0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
#define P_EVEX          0x100000        /* Requires EVEX encoding */

/*
 * Opcode table.  Each OPC_* value is the final opcode byte in the low
 * 8 bits, ORed with the P_* flags that select the prefix/escape bytes
 * (or VEX/EVEX fields) to emit in front of it.  Entries carrying P_EVEX
 * are routed to the EVEX emitter by tcg_out_vex_modrm().
 */
#define OPC_ARITH_EbIb	(0x80)
#define OPC_ARITH_EvIz	(0x81)
#define OPC_ARITH_EvIb	(0x83)
#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP	(0xc8 | P_EXT)
#define OPC_CALL_Jz	(0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32	(0x48)
#define OPC_IMUL_GvEv	(0xaf | P_EXT)
#define OPC_IMUL_GvEvIb	(0x6b)
#define OPC_IMUL_GvEvIz	(0x69)
#define OPC_INC_r32	(0x40)
#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
#define OPC_JCC_short	(0x70)		/* ... plus condition code */
#define OPC_JMP_long	(0xe9)
#define OPC_JMP_short	(0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz	(0xc7)
#define OPC_MOVB_Ib     (0xb0)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL	(0xbe | P_EXT)
#define OPC_MOVSWL	(0xbf | P_EXT)
#define OPC_MOVSLQ	(0x63 | P_REXW)
#define OPC_MOVZBL	(0xb6 | P_EXT)
#define OPC_MOVZWL	(0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
#define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32	(0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32	(0x50)
#define OPC_PUSH_Iv	(0x68)
#define OPC_PUSH_Ib	(0x6a)
#define OPC_RET		(0xc3)
#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1	(0xd1)
#define OPC_SHIFT_Ib	(0xc1)
#define OPC_SHIFT_cl	(0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTB	(0x84)
#define OPC_TESTL	(0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPBLENDMB   (0x66 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPBLENDMW   (0x66 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPBLENDMD   (0x64 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPBLENDMQ   (0x64 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPCMPB      (0x3f | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPUB     (0x3e | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPW      (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPCMPUW     (0x3e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPCMPD      (0x1f | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPUD     (0x1e | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPQ      (0x1f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPCMPUQ     (0x1e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPMOVM2B    (0x28 | P_EXT38 | P_SIMDF3 | P_EVEX)
#define OPC_VPMOVM2W    (0x28 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
#define OPC_VPMOVM2D    (0x38 | P_EXT38 | P_SIMDF3 | P_EVEX)
#define OPC_VPMOVM2Q    (0x38 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPTESTMB    (0x26 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPTESTMW    (0x26 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPTESTMD    (0x27 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPTESTMQ    (0x27 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPTESTNMB   (0x26 | P_EXT38 | P_SIMDF3 | P_EVEX)
#define OPC_VPTESTNMW   (0x26 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
#define OPC_VPTESTNMD   (0x27 | P_EXT38 | P_SIMDF3 | P_EVEX)
#define OPC_VPTESTNMQ   (0x27 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32	(0x90)
#define OPC_XCHG_EvGv   (0x87)

/*
 * Group opcodes: the operation is chosen by an extension value passed
 * in the ModRM.reg position (see the EXT3_*, EXT5_* and OPC_GRPBT_*
 * values below).
 */
#define OPC_GRP3_Eb     (0xf6)
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
#define OPC_GRPBT       (0xba | P_EXT)

/* Opcode extensions for the bit-test group.  To be used with OPC_GRPBT.  */
#define OPC_GRPBT_BT    4
#define OPC_GRPBT_BTS   5
#define OPC_GRPBT_BTR   6
#define OPC_GRPBT_BTC   7

/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_TESTi 0
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev	0
#define EXT5_DEC_Ev	1
#define EXT5_CALLN_Ev	2
#define EXT5_JMPN_Ev	4

/* Condition codes to be added to OPC_JCC_{long,short}.
   NOTE(review): JCC_JMP (-1) is not a hardware condition code; presumably
   it marks an unconditional jump for callers outside this chunk.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf

/*
 * Map a TCG comparison condition to the x86 condition code (JCC_*) that
 * is added to a Jcc/SETcc/CMOVcc opcode.  Signed comparisons use the
 * L/GE/LE/G codes, unsigned ones the B/AE/BE/A codes; the two test
 * conditions reuse the equal / not-equal codes.
 */
static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
    [TCG_COND_TSTEQ] = JCC_JE,
    [TCG_COND_TSTNE] = JCC_JNE,
};

#if TCG_TARGET_REG_BITS == 64
/*
 * Emit everything up to and including the opcode byte of a legacy-encoded
 * instruction: optional %gs / 0x66 / 0xf3 / 0xf2 prefixes, an optional
 * REX prefix, the 0x0f [0x38 | 0x3a] escape bytes, then the low 8 bits
 * of OPC.
 *
 * R, RM and X are the register numbers destined for the ModRM.reg,
 * ModRM.rm (or SIB.base) and SIB.index fields; bit 3 of each is moved
 * into REX.R, REX.B and REX.X respectively.
 */
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
    int rex_bits;
    bool force_rex;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    if (opc & P_DATA16) {
        /* 16-bit and 64-bit operand size must never be requested together. */
        tcg_debug_assert((opc & P_REXW) == 0);
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    rex_bits = ((opc & P_REXW) ? 0x8 : 0x0)     /* REX.W */
             | ((r & 8) >> 1)                   /* REX.R */
             | ((x & 8) >> 2)                   /* REX.X */
             | ((rm & 8) >> 3);                 /* REX.B */

    /*
     * P_REXB_{R,RM} say that the given register is accessed as its low
     * byte.  %[abcd]l need no REX prefix, but %{si,di,bp,sp}l do: without
     * one the same encoding selects %[abcd]h.  In that case a REX prefix
     * must be present even if none of its W/R/X/B bits are set.
     */
    force_rex = ((opc & P_REXB_R) && r >= 4)
             || ((opc & P_REXB_RM) && rm >= 4);

    if (rex_bits || force_rex) {
        tcg_out8(s, (uint8_t)(rex_bits | 0x40));
    }

    /* Escape bytes; P_EXT38 takes precedence over P_EXT3A. */
    if (opc & P_EXT38) {
        tcg_out8(s, 0x0f);
        tcg_out8(s, 0x38);
    } else if (opc & P_EXT3A) {
        tcg_out8(s, 0x0f);
        tcg_out8(s, 0x3a);
    } else if (opc & P_EXT) {
        tcg_out8(s, 0x0f);
    }

    tcg_out8(s, opc);
}
#else
/*
 * 32-bit variant: emit the optional 0x66 / 0xf3 / 0xf2 prefixes, the
 * 0x0f [0x38 | 0x3a] escape bytes, then the low 8 bits of OPC.  There is
 * no REX prefix, so no register arguments are needed.
 */
static void tcg_out_opc(TCGContext *s, int opc)
{
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    /* Escape bytes; P_EXT38 takes precedence over P_EXT3A. */
    if (opc & P_EXT38) {
        tcg_out8(s, 0x0f);
        tcg_out8(s, 0x38);
    } else if (opc & P_EXT3A) {
        tcg_out8(s, 0x0f);
        tcg_out8(s, 0x3a);
    } else if (opc & P_EXT) {
        tcg_out8(s, 0x0f);
    }

    tcg_out8(s, opc);
}
/* Drop the register arguments to tcg_out_opc at the preprocessor level, so
   that the 32-bit compilation paths never pay for them.  This works with
   every version of gcc, whereas relying on the optimizer to remove them
   may not.  */
#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
#endif

/*
 * Emit OPC followed by a register-direct ModRM byte (mod = 11) with R in
 * the reg field and RM in the r/m field.
 */
static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
{
    const int modrm = 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm);

    tcg_out_opc(s, opc, r, rm, 0);
    tcg_out8(s, modrm);
}

/*
 * Emit a VEX prefix (two- or three-byte form) and the opcode byte of OPC,
 * preceded by a %gs override when P_GS is set.
 *
 * R, RM and INDEX supply bit 3 for the inverted VEX.R, VEX.B and VEX.X
 * fields; V is the register number encoded (inverted) in VEX.vvvv.
 */
static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
                            int rm, int index)
{
    const int vex_r = (r & 8) ? 0 : 0x80;      /* VEX.R */
    bool short_form;
    int last, pp;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }

    /* The two byte form cannot encode VEX.W, VEX.B, VEX.X, or an
       m-mmmm field other than P_EXT; use it whenever none is needed.  */
    short_form = (opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
                 && ((rm | index) & 8) == 0;

    if (short_form) {
        /* Two byte VEX prefix.  */
        tcg_out8(s, 0xc5);
        last = vex_r;
    } else {
        int second;

        /* Three byte VEX prefix.  */
        tcg_out8(s, 0xc4);

        /* VEX.m-mmmm */
        if (opc & P_EXT3A) {
            second = 3;
        } else if (opc & P_EXT38) {
            second = 2;
        } else if (opc & P_EXT) {
            second = 1;
        } else {
            g_assert_not_reached();
        }
        second |= vex_r;
        second |= (index & 8) ? 0 : 0x40;      /* VEX.X */
        second |= (rm & 8) ? 0 : 0x20;         /* VEX.B */
        tcg_out8(s, second);

        last = (opc & P_VEXW) ? 0x80 : 0;      /* VEX.W */
    }

    /* VEX.pp: 0x66 -> 1, 0xf3 -> 2, 0xf2 -> 3, checked in that order.  */
    pp = (opc & P_DATA16) ? 1
       : (opc & P_SIMDF3) ? 2
       : (opc & P_SIMDF2) ? 3
       : 0;

    last |= (opc & P_VEXL) ? 0x04 : 0;         /* VEX.L */
    last |= pp;
    last |= (~v & 15) << 3;                    /* VEX.vvvv */
    tcg_out8(s, last);

    tcg_out8(s, opc);
}

/*
 * Emit a four-byte EVEX prefix and the opcode byte of OPC.
 *
 * R, RM and INDEX supply bit 3 for the inverted EVEX R, B and X fields;
 * V goes (inverted) into the vvvv field; AAA and Z are deposited into
 * the top byte of the prefix as given.  Asserts have_avx512vl.
 */
static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
                             int rm, int index, int aaa, bool z)
{
    /* The whole 4-byte evex prefix as one word; R' and V' already set. */
    uint32_t prefix = 0x08041062;
    int map, simd;

    tcg_debug_assert(have_avx512vl);

    /* EVEX.mm */
    if (opc & P_EXT3A) {
        map = 3;
    } else if (opc & P_EXT38) {
        map = 2;
    } else if (opc & P_EXT) {
        map = 1;
    } else {
        g_assert_not_reached();
    }

    /* EVEX.pp: 0x66 -> 1, 0xf3 -> 2, 0xf2 -> 3, checked in that order. */
    simd = (opc & P_DATA16) ? 1
         : (opc & P_SIMDF3) ? 2
         : (opc & P_SIMDF2) ? 3
         : 0;

    prefix = deposit32(prefix, 8, 2, map);
    prefix = deposit32(prefix, 13, 1, (rm & 8) == 0);       /* EVEX.RXB.B */
    prefix = deposit32(prefix, 14, 1, (index & 8) == 0);    /* EVEX.RXB.X */
    prefix = deposit32(prefix, 15, 1, (r & 8) == 0);        /* EVEX.RXB.R */
    prefix = deposit32(prefix, 16, 2, simd);
    prefix = deposit32(prefix, 19, 4, ~v);
    prefix = deposit32(prefix, 23, 1, (opc & P_VEXW) != 0);
    prefix = deposit32(prefix, 24, 3, aaa);
    prefix = deposit32(prefix, 29, 2, (opc & P_VEXL) != 0);
    prefix = deposit32(prefix, 31, 1, z);

    tcg_out32(s, prefix);
    tcg_out8(s, opc);
}

/*
 * Emit a VEX- or EVEX-encoded instruction with register-direct operands.
 * Opcodes flagged P_EVEX get an EVEX prefix (no mask register, no
 * zeroing); all others get a VEX prefix.  The ModRM byte has mod = 11
 * with R in the reg field and RM in the r/m field.
 */
static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
{
    const int modrm = 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm);

    if (!(opc & P_EVEX)) {
        tcg_out_vex_opc(s, opc, r, v, rm, 0);
    } else {
        tcg_out_evex_opc(s, opc, r, v, rm, 0, 0, false);
    }
    tcg_out8(s, modrm);
}

/*
 * As tcg_out_vex_modrm(), additionally setting P_VEXL in the opcode when
 * TYPE is TCG_TYPE_V256.
 */
static void tcg_out_vex_modrm_type(TCGContext *s, int opc,
                                   int r, int v, int rm, TCGType type)
{
    const int vex_l = (type == TCG_TYPE_V256) ? P_VEXL : 0;

    tcg_out_vex_modrm(s, opc | vex_l, r, v, rm);
}

/*
 * Emit an EVEX-encoded instruction with register-direct operands, passing
 * AAA and Z through to the prefix.  P_VEXL is set in the opcode when TYPE
 * is TCG_TYPE_V256.
 */
static void tcg_out_evex_modrm_type(TCGContext *s, int opc, int r, int v,
                                    int rm, int aaa, bool z, TCGType type)
{
    const int vex_l = (type == TCG_TYPE_V256) ? P_VEXL : 0;
    const int modrm = 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm);

    tcg_out_evex_opc(s, opc | vex_l, r, v, rm, 0, aaa, z);
    tcg_out8(s, modrm);
}

/*
 * Output the ModRM byte, the optional SIB byte and the displacement for a
 * full "rm + (index<<shift) + offset" address mode.  The prefixes and the
 * opcode byte must already have been emitted by the caller.
 *
 *   r      - value for the ModRM.reg field; only its low 3 bits are
 *            encoded here.
 *   rm     - base register, or a negative value if there is none.
 *   index  - index register, or a negative value if there is none.
 *   shift  - scale for INDEX, stored in the SIB scale field.
 *   offset - displacement; the absolute address when both RM and INDEX
 *            are missing.
 *
 * Either or both of RM and INDEX may be missing.  In 64-bit mode for
 * absolute addresses, ~RM is the size of the immediate operand that will
 * follow the instruction.
 */

static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
                               int shift, intptr_t offset)
{
    int mod, len;

    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding.  The
               displacement is relative to the end of the instruction:
               ModRM + disp32 (5 bytes) plus the trailing immediate.  */
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
            if (disp == (int32_t)disp) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                tcg_out32(s, disp);
                return;
            }

            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing.  */
            if (offset == (int32_t)offset) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);
                return;
            }

            /* ??? The memory isn't directly addressable.  */
            g_assert_not_reached();
        } else {
            /* Absolute address.  Mask R like every other ModRM emission
               in this file: a register number of 8 or more (vector
               registers are numbered from 16) would otherwise spill
               into the mod field.  */
            tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
            tcg_out32(s, offset);
            return;
        }
    }

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing.  */
    if (rm < 0) {
        /* Index without base: mod 0 with base field 5 and a disp32.  */
        mod = 0;
        len = 4;
        rm = 5;
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
        mod = 0;
        len = 0;
    } else if (offset == (int8_t)offset) {
        mod = 0x40;
        len = 1;
    } else {
        mod = 0x80;
        len = 4;
    }

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form.  */
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
        /* Single byte MODRM format.  */
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    } else {
        /* Two byte MODRM+SIB format.  */

        /* Note that the encoding that would place %esp into the index
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index.  */
        if (index < 0) {
            index = 4;
        } else {
            tcg_debug_assert(index != TCG_REG_ESP);
        }

        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
    }

    if (len == 1) {
        tcg_out8(s, offset);
    } else if (len == 4) {
        tcg_out32(s, offset);
    }
}

/*
 * Emit OPC with a memory operand "rm + (index<<shift) + offset".
 * A missing (negative) RM or INDEX is presented to the opcode emitter as
 * register 0; the address encoder still receives the original values.
 */
static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                                     int index, int shift, intptr_t offset)
{
    tcg_out_opc(s, opc, r, (rm >= 0 ? rm : 0), (index >= 0 ? index : 0));
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

/*
 * VEX-prefixed counterpart of tcg_out_modrm_sib_offset(): emit OPC with
 * V in VEX.vvvv and a memory operand "rm + (index<<shift) + offset".
 * A missing (negative) RM or INDEX is presented to the prefix emitter as
 * register 0.
 */
static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
                                         int rm, int index, int shift,
                                         intptr_t offset)
{
    const int base_for_prefix = (rm >= 0) ? rm : 0;
    const int index_for_prefix = (index >= 0) ? index : 0;

    tcg_out_vex_opc(s, opc, r, v, base_for_prefix, index_for_prefix);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

/* Emit OPC with a memory operand "rm + offset": the full address-mode
   emitter called with no index register and a zero shift.  */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
{
    const int no_index = -1;
    const int no_shift = 0;

    tcg_out_modrm_sib_offset(s, opc, r, rm, no_index, no_shift, offset);
}

/* VEX-prefixed form of the above: memory operand "rm + offset", with no
   index register and a zero shift.  */
static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
                                            int v, int rm, intptr_t offset)
{
    const int no_index = -1;
    const int no_shift = 0;

    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, no_index, no_shift,
                                 offset);
}

/* Emit OPC with a memory operand that is expected to refer to the
   constant pool.  The 32-bit displacement is written as zero here.  */
static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
{
    /* mod = 00, r/m = 101: absolute for 32-bit, pc-relative for 64-bit.  */
    const int modrm = (LOWREGMASK(r) << 3) | 5;

    tcg_out_opc(s, opc, r, 0, 0);
    tcg_out8(s, modrm);
    tcg_out32(s, 0);
}

/* Emit a VEX-prefixed OPC with a memory operand that is expected to refer
   to the constant pool.  The 32-bit displacement is written as zero
   here.  */
static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
{
    /* mod = 00, r/m = 101: absolute for 32-bit, pc-relative for 64-bit.  */
    const int modrm = (LOWREGMASK(r) << 3) | 5;

    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
    tcg_out8(s, modrm);
    tcg_out32(s, 0);
}

/*
 * Generate "dest op= src" between two registers.  SUBOP uses the same
 * ARITH_* codes as tgen_arithi: the low three bits select the operation
 * and any higher bits are an opcode prefix (such as P_REXW) that is
 * carried over onto the emitted opcode.
 */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
    int prefix = subop & ~0x7;
    int op = subop & 0x7;

    tcg_out_modrm(s, OPC_ARITH_GvEv + (op << 3) + prefix, dest, src);
}

/*
 * Emit a register-to-register move of TYPE from ARG to RET.
 *
 * Register numbers below 16 take the non-VEX encodings and numbers from
 * 16 up take the VEX (vector) encodings.  Integer types may move between
 * the two groups; vector types require both registers to be >= 16.
 * Nothing is emitted when source and destination are the same register.
 * Always returns true.
 */
static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
    bool dst_is_vec, src_is_vec;
    int rexw;

    if (ret == arg) {
        return true;
    }

    switch (type) {
    case TCG_TYPE_I32:
    case TCG_TYPE_I64:
        rexw = (type == TCG_TYPE_I64 ? P_REXW : 0);
        dst_is_vec = ret >= 16;
        src_is_vec = arg >= 16;

        if (!dst_is_vec && !src_is_vec) {
            tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
        } else if (!dst_is_vec) {
            /* Source register is >= 16; note the swapped operand slots.  */
            tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
        } else if (!src_is_vec) {
            /* Destination register is >= 16.  */
            tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
        } else {
            /* Both registers are >= 16.  */
            tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        }
        return true;

    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        return true;

    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
        return true;

    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
        return true;

    default:
        g_assert_not_reached();
    }
    return true;
}

/*
 * AVX2 broadcast opcodes, indexed by the element-size code (vece) passed
 * to tcg_out_dup_vec and tcg_out_dupm_vec; the entries are listed in
 * byte, word, dword, qword order.
 */
static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};

/*
 * Broadcast the low element of vector register A, of size VECE, into
 * every element of vector register R.  Always returns true.
 *
 * With AVX2 a single broadcast instruction from avx2_dup_insn is used.
 * Otherwise the element is widened step by step with unpack instructions
 * until it is 32 bits wide and then replicated with PSHUFD; 64-bit
 * elements use PUNPCKLQDQ directly.
 */
static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg r, TCGReg a)
{
    if (have_avx2) {
        tcg_out_vex_modrm_type(s, avx2_dup_insn[vece], r, 0, a, type);
        return true;
    }

    /* The narrower cases deliberately fall through into the wider ones. */
    switch (vece) {
    case MO_8:
        /* ??? With zero in a register, use PSHUFB.  */
        tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
        a = r;
        /* FALLTHRU */
    case MO_16:
        tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
        a = r;
        /* FALLTHRU */
    case MO_32:
        tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
        /* imm8 operand: all output lanes selected from input lane 0.  */
        tcg_out8(s, 0);
        return true;

    case MO_64:
        tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
        return true;

    default:
        g_assert_not_reached();
    }
    return true;
}

/*
 * Load one element of size VECE from memory at OFFSET(BASE) and
 * broadcast it into every element of vector register R.  Always returns
 * true.
 *
 * With AVX2 the broadcast-from-memory form of avx2_dup_insn is used.
 * Otherwise 64-bit and 32-bit elements have a dedicated load-and-dup
 * instruction, while 8-bit and 16-bit elements are first inserted into
 * lane 0 of R and then replicated by tcg_out_dup_vec.
 */
static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg r, TCGReg base, intptr_t offset)
{
    int insert_opc;

    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);

        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
                                 r, 0, base, offset);
        return true;
    }

    switch (vece) {
    case MO_64:
        tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
        return true;
    case MO_32:
        tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
        return true;
    case MO_16:
        insert_opc = OPC_VPINSRW;
        break;
    case MO_8:
        insert_opc = OPC_VPINSRB;
        break;
    default:
        g_assert_not_reached();
    }

    /* Insert the element into lane 0 (imm8 = 0), then duplicate it.  */
    tcg_out_vex_modrm_offset(s, insert_opc, r, r, base, offset);
    tcg_out8(s, 0); /* imm8 */
    tcg_out_dup_vec(s, type, vece, r, r);
    return true;
}

/*
 * Load the constant ARG into vector register RET.
 *
 * All-zeros and all-ones are synthesized with PXOR and PCMPEQB.  Any
 * other value is loaded from the constant pool: the load instruction is
 * emitted with a zero displacement and a pool entry is registered
 * against the last four bytes emitted.
 */
static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg ret, int64_t arg)
{
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
    int load_opc;

    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
        return;
    }

    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
        /* 32-bit host, element narrower than 64 bits: 32-bit broadcast. */
        load_opc = have_avx2 ? OPC_VPBROADCASTD + vex_l : OPC_VBROADCASTSS;
        tcg_out_vex_modrm_pool(s, load_opc, ret);
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
        return;
    }

    /* Otherwise load or broadcast a 64-bit pool entry.  */
    if (type == TCG_TYPE_V64) {
        load_opc = OPC_MOVQ_VqWq;
    } else if (have_avx2) {
        load_opc = OPC_VPBROADCASTQ + vex_l;
    } else {
        load_opc = OPC_MOVDDUP;
    }
    tcg_out_vex_modrm_pool(s, load_opc, ret);

    if (TCG_TARGET_REG_BITS == 64) {
        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    } else {
        /* On a 32-bit host the value is passed as two halves.  */
        new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
    }
}

/*
 * Load the integer constant ARG of TYPE into vector register RET.
 *
 * Zero and -1 are synthesized with PXOR and PCMPEQB.  Any other value is
 * loaded from the constant pool with a MOVD/MOVQ whose displacement is
 * fixed up through a pool label.
 */
static void tcg_out_movi_vec(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    int rexw, rtype, addend;

    switch (arg) {
    case 0:
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    case -1:
        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
        return;
    default:
        break;
    }

    rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);

    /* Pc-relative relocation on 64-bit hosts, absolute on 32-bit.  */
    if (TCG_TARGET_REG_BITS == 64) {
        rtype = R_386_PC32;
        addend = -4;
    } else {
        rtype = R_386_32;
        addend = 0;
    }
    new_pool_label(s, arg, rtype, s->code_ptr - 4, addend);
}

/*
 * Load the constant ARG of TYPE into general register RET, trying the
 * encodings in this order:
 *   - XOR of the register with itself for zero;
 *   - a move of a 32-bit immediate, used for TCG_TYPE_I32 or when the
 *     value equals its own low 32 bits zero-extended;
 *   - a REX.W move of a sign-extended 32-bit immediate;
 *   - a 7 byte pc-relative LEA, when the displacement fits in 32 bits;
 *   - the 10 byte move of a full 64-bit immediate.
 */
static void tcg_out_movi_int(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    tcg_target_long disp;

    if (arg == 0) {
        tgen_arithr(s, ARITH_XOR, ret, ret);
        return;
    }

    if (type == TCG_TYPE_I32 || arg == (uint32_t)arg) {
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
        tcg_out32(s, arg);
        return;
    }

    if (arg == (int32_t)arg) {
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
        tcg_out32(s, arg);
        return;
    }

    /* Displacement measured from the end of the 7 byte LEA.  */
    disp = tcg_pcrel_diff(s, (const void *)arg) - 7;
    if (disp != (int32_t)disp) {
        /* Out of range for the LEA: fall back to the 64-bit immediate.  */
        tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
        tcg_out64(s, arg);
        return;
    }

    tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
    tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
    tcg_out32(s, disp);
}

/*
 * Load the integer constant ARG into register RET.  Registers below 16
 * are handled by tcg_out_movi_int, the rest by tcg_out_movi_vec.
 * Only TCG_TYPE_I32 is accepted, plus TCG_TYPE_I64 on 64-bit hosts;
 * any other type is a programming error.
 */
static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    bool type_ok = (type == TCG_TYPE_I32);

#if TCG_TARGET_REG_BITS == 64
    type_ok = type_ok || type == TCG_TYPE_I64;
#endif
    if (!type_ok) {
        g_assert_not_reached();
    }

    if (ret >= 16) {
        tcg_out_movi_vec(s, type, ret, arg);
    } else {
        tcg_out_movi_int(s, type, ret, arg);
    }
}

/*
 * Emit an XCHG of registers R1 and R2.  The P_REXW prefix is applied for
 * every TYPE other than TCG_TYPE_I32.  Always returns true.
 */
static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
{
    int rexw = P_REXW;

    if (type == TCG_TYPE_I32) {
        rexw = 0;
    }
    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
    return true;
}

/*
 * Compute RD = RS + IMM with a pointer-width LEA.
 * Only used for passing structs by reference, so IMM is required to fit
 * in a signed 32-bit displacement.
 */
static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
                             tcg_target_long imm)
{
    int lea_opc = OPC_LEA | P_REXW;

    tcg_debug_assert(imm == (int32_t)imm);
    tcg_out_modrm_offset(s, lea_opc, rd, rs, imm);
}

/*
 * Push the immediate VAL, using the 8-bit immediate form when VAL fits
 * in a signed byte and the 32-bit immediate form otherwise.  A value
 * that does not fit in a signed 32-bit immediate is a programming error.
 */
static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    /* Anything that fits in 8 bits also fits in 32, so test this first. */
    if (val != (int32_t)val) {
        g_assert_not_reached();
    }

    if (val != (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
        return;
    }

    tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
    tcg_out8(s, val);
}

/*
 * Emit a memory barrier for the ordering flags in A0.
 *
 * Given the strength of x86 memory ordering, only store-load ordering
 * needs an instruction; every other request emits nothing.
 * Experimentally, "lock orl $0,0(%esp)" is faster than "mfence", so
 * the sse insn is not used.
 */
static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    if (!(a0 & TCG_MO_ST_LD)) {
        return;
    }

    /* lock prefix */
    tcg_out8(s, 0xf0);
    /* orl $imm8, 0(%esp) */
    tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
    /* imm8 = 0 */
    tcg_out8(s, 0);
}

/* Push register REG; its low three bits are folded into the opcode.  */
static inline void tcg_out_push(TCGContext *s, int reg)
{
    int push_opc = OPC_PUSH_r32 + LOWREGMASK(reg);

    tcg_out_opc(s, push_opc, 0, reg, 0);
}

/* Pop into register REG; its low three bits are folded into the opcode.  */
static inline void tcg_out_pop(TCGContext *s, int reg)
{
    int pop_opc = OPC_POP_r32 + LOWREGMASK(reg);

    tcg_out_opc(s, pop_opc, 0, reg, 0);
}

/*
 * Load a value of TYPE from memory at ARG2(ARG1) into register RET.
 *
 * Integer types may target either a register below 16 (non-VEX MOV) or
 * a register from 16 up (VEX MOVD/MOVQ); vector types require RET >= 16.
 */
static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    bool int_type = (type == TCG_TYPE_I32 || type == TCG_TYPE_I64);

    /* Integer load into a register below 16.  */
    if (int_type && ret < 16) {
        int rexw = (type == TCG_TYPE_I64 ? P_REXW : 0);

        tcg_out_modrm_offset(s, OPC_MOVL_GvEv | rexw, ret, arg1, arg2);
        return;
    }

    switch (type) {
    case TCG_TYPE_I32:
        tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        break;

    case TCG_TYPE_I64:
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;

    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;

    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;

    default:
        g_assert_not_reached();
    }
}

/*
 * Store register ARG of TYPE to memory at ARG2(ARG1).
 *
 * Integer types may come from either a register below 16 (non-VEX MOV)
 * or a register from 16 up (VEX MOVD/MOVQ); vector types require
 * ARG >= 16.
 */
static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    bool int_type = (type == TCG_TYPE_I32 || type == TCG_TYPE_I64);

    /* Integer store from a register below 16.  */
    if (int_type && arg < 16) {
        int rexw = (type == TCG_TYPE_I64 ? P_REXW : 0);

        tcg_out_modrm_offset(s, OPC_MOVL_EvGv | rexw, arg, arg1, arg2);
        return;
    }

    switch (type) {
    case TCG_TYPE_I32:
        tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        break;

    case TCG_TYPE_I64:
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;

    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         *
         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
         * for _WIN64, which must have SSE2 but may not have AVX.
         */
        tcg_debug_assert(arg >= 16);
        if (!have_avx1) {
            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        }
        break;

    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;

    default:
        g_assert_not_reached();
    }
}

/*
 * Try to store the immediate VAL of TYPE directly to memory at OFS(BASE).
 *
 * Returns true when the store was emitted.  Returns false, emitting
 * nothing, when TYPE is not an integer type handled here or when a
 * 64-bit value does not fit in a sign-extended 32-bit immediate.
 */
static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    bool wide = (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64);

    if (!wide && type != TCG_TYPE_I32) {
        return false;
    }
    if (wide && val != (int32_t)val) {
        return false;
    }

    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | (wide ? P_REXW : 0),
                         0, base, ofs);
    tcg_out32(s, val);
    return true;
}

/*
 * Shift or rotate register REG by the constant COUNT.  The low three
 * bits of SUBOPC select the operation; any higher bits are an opcode
 * prefix (such as P_DATA16) carried over onto the emitted opcode.
 * A count of exactly 1 uses the form without an immediate byte.
 */
static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    int prefix = subopc & ~0x7;
    int op = subopc & 0x7;

    if (count != 1) {
        tcg_out_modrm(s, OPC_SHIFT_Ib + prefix, op, reg);
        tcg_out8(s, count);
        return;
    }
    tcg_out_modrm(s, OPC_SHIFT_1 + prefix, op, reg);
}

/* Emit BSWAP on register REG, with no P_REXW prefix.  */
static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    int bswap_opc = OPC_BSWAP + LOWREGMASK(reg);

    tcg_out_opc(s, bswap_opc, 0, reg, 0);
}

/* Rotate register REG left by 8, with the P_DATA16 prefix applied.  */
static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    int rol16 = SHIFT_ROL + P_DATA16;

    tcg_out_shifti(s, rol16, reg, 8);
}

/*
 * Zero-extend the low 8 bits of SRC into DEST.
 *
 * On a 32-bit host a source register numbered 4 or above is first
 * copied to DEST.  If DEST is also 4 or above the extension is done by
 * masking with 0xff; otherwise MOVZBL is applied to DEST in place.
 */
static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
{
    bool copy_first = (TCG_TARGET_REG_BITS == 32 && src >= 4);

    if (copy_first) {
        tcg_out_mov(s, TCG_TYPE_I32, dest, src);
        if (dest < 4) {
            src = dest;
        } else {
            tcg_out_modrm(s, OPC_ARITH_EvIz, ARITH_AND, dest);
            tcg_out32(s, 0xff);
            return;
        }
    }
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}

/*
 * Sign-extend the low 8 bits of SRC into DEST; the P_REXW prefix is
 * applied for every TYPE other than TCG_TYPE_I32.
 *
 * On a 32-bit host a source register numbered 4 or above is first
 * copied to DEST.  If DEST is also 4 or above the extension is done with
 * a shift left by 24 followed by an arithmetic shift right by 24;
 * otherwise MOVSBL is applied to DEST in place.
 */
static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    bool copy_first = (TCG_TARGET_REG_BITS == 32 && src >= 4);

    if (copy_first) {
        tcg_out_mov(s, TCG_TYPE_I32, dest, src);
        if (dest < 4) {
            src = dest;
        } else {
            tcg_out_shifti(s, SHIFT_SHL, dest, 24);
            tcg_out_shifti(s, SHIFT_SAR, dest, 24);
            return;
        }
    }
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
}

/* Zero-extend the low 16 bits of SRC into DEST (movzwl).  */
static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
{
    int movzw_opc = OPC_MOVZWL;

    tcg_out_modrm(s, movzw_opc, dest, src);
}

/*
 * Sign-extend the low 16 bits of SRC into DEST (movsw[lq]); the P_REXW
 * prefix is applied for every TYPE other than TCG_TYPE_I32.
 */
static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = P_REXW;

    if (type == TCG_TYPE_I32) {
        rexw = 0;
    }
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
}

/*
 * Zero-extend the low 32 bits of SRC into DEST.
 * A plain 32-bit mov zero extends, so no dedicated instruction is needed.
 */
static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
{
    int mov32_opc = OPC_MOVL_GvEv;

    tcg_out_modrm(s, mov32_opc, dest, src);
}

/*
 * Sign-extend the low 32 bits of SRC into DEST using OPC_MOVSLQ.
 * Only valid on 64-bit hosts.
 */
static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
{
    int movslq_opc = OPC_MOVSLQ;

    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, movslq_opc, dest, src);
}

/* Sign-extend a 32-bit value in SRC to 64 bits in DEST.  */
static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* Exactly the 32-bit sign extension; delegate to it.  */
    tcg_out_ext32s(s, dest, src);
}

/*
 * Zero-extend a 32-bit value in SRC to 64 bits in DEST.
 * Nothing is emitted when SRC and DEST are the same register.
 */
static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    if (dest == src) {
        return;
    }
    tcg_out_ext32u(s, dest, src);
}

/*
 * Extract the low 32 bits of the 64-bit value in SRC into DEST.
 * Implemented as a 32-bit zero extension, emitted unconditionally.
 */
static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* Unlike tcg_out_extu_i32_i64, no dest == src shortcut is taken.  */
    tcg_out_ext32u(s, dest, src);
}

/* Emit BSWAP on register REG with the P_REXW prefix.  */
static inline void tcg_out_bswap64(TCGContext *s, int reg)
{
    int bswap_opc = OPC_BSWAP + P_REXW + LOWREGMASK(reg);

    tcg_out_opc(s, bswap_opc, 0, reg, 0);
}

/*
 * Generate "r0 op= val" with an immediate operand.
 *
 * C is one of the ARITH_* sub-opcodes; on 64-bit hosts it may also carry
 * an opcode prefix such as P_REXW in its upper bits, which is split off
 * below.  CF is nonzero when the caller depends on the carry flag that
 * the instruction produces, which disables the INC/DEC and ADD<->SUB
 * rewrites.
 *
 * Several shorter encodings are tried per operation before falling back
 * to the generic 8-bit and 32-bit immediate forms.  A value that fits
 * neither form while a prefix is in effect is a programming error.
 */
static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64) {
        /* Separate the prefix bits from the 3-bit sub-opcode.  */
        rexw = c & -8;
        c &= 7;
    }

    switch (c) {
    case ARITH_ADD:
    case ARITH_SUB:
        if (!cf) {
            /*
             * ??? While INC is 2 bytes shorter than ADDL $1, they also induce
             * partial flags update stalls on Pentium4 and are not recommended
             * by current Intel optimization manuals.
             */
            if (val == 1 || val == -1) {
                /* ADD +1 and SUB -1 increment; ADD -1 and SUB +1 decrement. */
                int is_inc = (c == ARITH_ADD) ^ (val < 0);
                if (TCG_TARGET_REG_BITS == 64) {
                    /*
                     * The single-byte increment encodings are re-tasked
                     * as the REX prefixes.  Use the MODRM encoding.
                     */
                    tcg_out_modrm(s, OPC_GRP5 + rexw,
                                  (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
                } else {
                    tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
                }
                return;
            }
            if (val == 128) {
                /*
                 * Facilitate using an 8-bit immediate.  Carry is inverted
                 * by this transformation, so do it only if cf == 0.
                 */
                c ^= ARITH_ADD ^ ARITH_SUB;
                val = -128;
            }
        }
        break;

    case ARITH_AND:
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation.  */
                rexw = 0;
            }
        }
        /* Masks of 0xff and 0xffff are plain zero extensions.  */
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
        break;

    case ARITH_OR:
    case ARITH_XOR:
        /*
         * Values 0x80..0xff do not fit the signed 8-bit immediate form
         * below; use the byte-sized operation instead.
         */
        if (val >= 0x80 && val <= 0xff
            && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
            tcg_out8(s, val);
            return;
        }
        break;
    }

    /* Generic forms: signed 8-bit immediate, then 32-bit immediate.  */
    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    g_assert_not_reached();
}

/*
 * Add the constant VAL to register REG using a P_REXW-prefixed ADD.
 * Nothing is emitted when VAL is zero.
 */
static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
{
    if (val == 0) {
        return;
    }
    tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
}

/*
 * Emit a jump to label L.  OPC is a JCC_* condition code, or -1 for an
 * unconditional jump.
 *
 * If the label already has a value, the short (8-bit displacement) form
 * is used whenever the displacement fits, otherwise the long form.
 * If the label has no value yet, a relocation is recorded and space for
 * the displacement is reserved; set SMALL to force a short forward
 * branch in that case.
 */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
{
    bool unconditional = (opc == -1);
    int32_t disp, disp8;

    if (!l->has_value) {
        /* Target not yet known: emit the opcode and a relocation.  */
        if (small) {
            tcg_out8(s, unconditional ? OPC_JMP_short : OPC_JCC_short + opc);
            tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
            s->code_ptr += 1;
            return;
        }

        if (unconditional) {
            tcg_out8(s, OPC_JMP_long);
        } else {
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
        s->code_ptr += 4;
        return;
    }

    disp = tcg_pcrel_diff(s, l->u.value_ptr);

    /* The short forms are 2 bytes long.  */
    disp8 = disp - 2;
    if ((int8_t)disp8 == disp8) {
        tcg_out8(s, unconditional ? OPC_JMP_short : OPC_JCC_short + opc);
        tcg_out8(s, disp8);
        return;
    }

    tcg_debug_assert(!small);
    if (unconditional) {
        /* 5 byte instruction.  */
        tcg_out8(s, OPC_JMP_long);
        tcg_out32(s, disp - 5);
    } else {
        /* 6 byte instruction.  */
        tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        tcg_out32(s, disp - 6);
    }
}

/*
 * Emit a flag-setting comparison of ARG1 against ARG2 for condition COND
 * and return the JCC_* code with which the caller should consume the
 * resulting flags.
 *
 * ARG2 is an immediate when CONST_ARG2 is nonzero, otherwise a register.
 * REXW is the opcode prefix (P_REXW or 0) selecting the operand width.
 *
 * Ordinary conditions use CMP (or TEST against itself for a zero
 * immediate).  Test conditions (is_tst_cond) use TEST, with a series of
 * shorter encodings for particular immediate masks; some of those change
 * which flag carries the answer, hence the differing return values.
 */
static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1,
                       TCGArg arg2, int const_arg2, int rexw)
{
    int jz, js;

    if (!is_tst_cond(cond)) {
        if (!const_arg2) {
            tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
        } else if (arg2 == 0) {
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tcg_debug_assert(!rexw || arg2 == (int32_t)arg2);
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
        return tcg_cond_to_jcc[cond];
    }

    /* Test conditions: JZ is the zero-flag result, JS the sign-flag one. */
    jz = tcg_cond_to_jcc[cond];
    js = (cond == TCG_COND_TSTNE ? JCC_JS : JCC_JNS);

    if (!const_arg2) {
        tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2);
        return jz;
    }

    /* Mask confined to the low byte: use a byte-sized test.  */
    if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) {
        if (arg2 == 0x80) {
            /* Only the top bit of the byte: read it from the sign flag.  */
            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
            return js;
        }
        if (arg2 == 0xff) {
            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
            return jz;
        }
        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1);
        tcg_out8(s, arg2);
        return jz;
    }

    /*
     * Mask confined to bits 8..15: byte-sized test on register number
     * arg1 + 4, available only when arg1 < 4.
     */
    if ((arg2 & ~0xff00) == 0 && arg1 < 4) {
        if (arg2 == 0x8000) {
            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
            return js;
        }
        if (arg2 == 0xff00) {
            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
            return jz;
        }
        tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4);
        tcg_out8(s, arg2 >> 8);
        return jz;
    }

    /* Full 16-bit or 32-bit masks: test the register against itself.  */
    if (arg2 == 0xffff) {
        tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1);
        return jz;
    }
    if (arg2 == 0xffffffffu) {
        tcg_out_modrm(s, OPC_TESTL, arg1, arg1);
        return jz;
    }

    /* Single-bit mask: sign-flag test for bit 31/63, otherwise BT.  */
    if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) {
        int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE);
        int sh = ctz64(arg2);

        /* The 64-bit form is needed only for bit positions 32 and up.  */
        rexw = (sh & 32 ? P_REXW : 0);
        if ((sh & 31) == 31) {
            tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1);
            return js;
        } else {
            tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1);
            tcg_out8(s, sh);
            return jc;
        }
    }

    /* Generic TEST with a 32-bit immediate.  */
    if (rexw) {
        if (arg2 == (uint32_t)arg2) {
            /* No high bits in the mask: the 32-bit operation suffices.  */
            rexw = 0;
        } else {
            tcg_debug_assert(arg2 == (int32_t)arg2);
        }
    }
    tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1);
    tcg_out32(s, arg2);
    return jz;
}

/*
 * Compare ARG1 with ARG2 under condition COND and branch to LABEL when
 * it holds.  REXW, CONST_ARG2 and SMALL are passed through unchanged to
 * tcg_out_cmp and tcg_out_jxx respectively.
 */
static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
                           TCGArg arg1, TCGArg arg2, int const_arg2,
                           TCGLabel *label, bool small)
{
    int branch_cc;

    /* The comparison chooses which jump condition reads its flags.  */
    branch_cc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw);
    tcg_out_jxx(s, branch_cc, label, small);
}

#if TCG_TARGET_REG_BITS == 32
/*
 * Double-word compare and branch for 32-bit hosts.
 *
 * ARGS holds two operand words for each side (args[0]/args[1] against
 * args[2]/args[3]), the condition in args[4] and the target label in
 * args[5].  The args[1]/args[3] pair is the one compared with
 * tcg_high_cond in the general case.  CONST_ARGS flags which of the
 * second-side operands are immediates.
 */
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, bool small)
{
    TCGLabel *skip = gen_new_label();
    TCGLabel *target = arg_label(args[5]);
    TCGCond cond = args[4];
    TCGArg a_lo = args[0], a_hi = args[1];
    TCGArg b_lo = args[2], b_hi = args[3];
    int b_lo_const = const_args[2];
    int b_hi_const = const_args[3];

    if (cond == TCG_COND_EQ || cond == TCG_COND_TSTEQ) {
        /* Skip out if the first pair fails; branch if the second holds. */
        tcg_out_brcond(s, 0, tcg_invert_cond(cond),
                       a_lo, b_lo, b_lo_const, skip, 1);
        tcg_out_brcond(s, 0, cond, a_hi, b_hi, b_hi_const, target, small);
    } else if (cond == TCG_COND_NE || cond == TCG_COND_TSTNE) {
        /* Branch if either pair satisfies the condition.  */
        tcg_out_brcond(s, 0, cond, a_lo, b_lo, b_lo_const, target, small);
        tcg_out_brcond(s, 0, cond, a_hi, b_hi, b_hi_const, target, small);
    } else {
        /*
         * Ordered comparison: decide on the args[1]/args[3] pair when
         * those words differ, otherwise compare the other pair unsigned.
         */
        tcg_out_brcond(s, 0, tcg_high_cond(cond),
                       a_hi, b_hi, b_hi_const, target, small);
        tcg_out_jxx(s, JCC_JNE, skip, 1);
        tcg_out_brcond(s, 0, tcg_unsigned_cond(cond),
                       a_lo, b_lo, b_lo_const, target, small);
    }

    tcg_out_label(s, skip);
}
#endif

/*
 * Set DEST according to whether "ARG1 COND ARG2" holds: 1 or 0 when NEG
 * is false, -1 or 0 when NEG is true.
 *
 * ARG2 is an immediate when CONST_ARG2 is nonzero.  REXW is the operand
 * width prefix (P_REXW or 0) for the result; CMP_REXW starts out equal
 * to it and is the width used for the comparison itself.
 *
 * Conditions that can be rephrased as an unsigned less-than are routed
 * to the do_ltu path, which derives the result from the carry flag with
 * SBB.  Signed comparison against zero extracts the sign bit with a
 * shift.  Everything else uses the generic SETCC sequence at the bottom.
 * INV records that the complement of the computed predicate is wanted.
 */
static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
                            TCGArg dest, TCGArg arg1, TCGArg arg2,
                            int const_arg2, bool neg)
{
    int cmp_rexw = rexw;
    bool inv = false;
    bool cleared;
    int jcc;

    switch (cond) {
    case TCG_COND_NE:
        inv = true;
        /* fall through */
    case TCG_COND_EQ:
        /* If arg2 is 0, convert to LTU/GEU vs 1. */
        if (const_arg2 && arg2 == 0) {
            arg2 = 1;
            goto do_ltu;
        }
        break;

    case TCG_COND_TSTNE:
        inv = true;
        /* fall through */
    case TCG_COND_TSTEQ:
        /* If arg2 is -1, convert to LTU/GEU vs 1. */
        if (const_arg2 && arg2 == 0xffffffffu) {
            arg2 = 1;
            /* The mask covers only the low 32 bits: compare 32-bit.  */
            cmp_rexw = 0;
            goto do_ltu;
        }
        break;

    case TCG_COND_LEU:
        inv = true;
        /* fall through */
    case TCG_COND_GTU:
        /* If arg2 is a register, swap for LTU/GEU. */
        if (!const_arg2) {
            TCGReg t = arg1;
            arg1 = arg2;
            arg2 = t;
            goto do_ltu;
        }
        break;

    case TCG_COND_GEU:
        inv = true;
        /* fall through */
    case TCG_COND_LTU:
    do_ltu:
        /*
         * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
         * We can then use NEG or INC to produce the desired result.
         * This is always smaller than the SETCC expansion.
         */
        tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw);

        /* X - X - C = -C = (C ? -1 : 0) */
        tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
        if (inv && neg) {
            /* ~(C ? -1 : 0) = (C ? 0 : -1) */
            tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
        } else if (inv) {
            /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
            tgen_arithi(s, ARITH_ADD, dest, 1, 0);
        } else if (!neg) {
            /* -(C ? -1 : 0) = (C ? 1 : 0) */
            tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
        }
        /* The remaining case (neg && !inv) already holds (C ? -1 : 0).  */
        return;

    case TCG_COND_GE:
        inv = true;
        /* fall through */
    case TCG_COND_LT:
        /* If arg2 is 0, extract the sign bit. */
        if (const_arg2 && arg2 == 0) {
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1);
            if (inv) {
                /* For GE, complement first so the sign bit is inverted.  */
                tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
            }
            /* Arithmetic shift yields 0/-1, logical shift yields 0/1.  */
            tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
                           dest, rexw ? 63 : 31);
            return;
        }
        break;

    default:
        break;
    }

    /*
     * If dest does not overlap the inputs, clearing it first is preferred.
     * The XOR breaks any false dependency for the low-byte write to dest,
     * and is also one byte smaller than MOVZBL.
     */
    cleared = false;
    if (dest != arg1 && (const_arg2 || dest != arg2)) {
        tgen_arithr(s, ARITH_XOR, dest, dest);
        cleared = true;
    }

    jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw);
    tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest);

    if (!cleared) {
        /* SETCC wrote only the low byte; zero-extend the rest.  */
        tcg_out_ext8u(s, dest, dest);
    }
    if (neg) {
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
    }
}

#if TCG_TARGET_REG_BITS == 32
/*
 * Double-word setcond for 32-bit hosts: set args[0] to 1 or 0 according
 * to the comparison described by args[1..5], which has the same layout
 * as the leading arguments of tcg_out_brcond2.
 */
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg dest = args[0];
    TCGArg br_args[6];
    TCGLabel *is_true, *done;
    bool overlap;

    /* Operands and condition for brcond2; slot 5 (label) is set below.  */
    memcpy(br_args, args + 1, 5 * sizeof(TCGArg));

    overlap = dest == args[1] || dest == args[2]
              || (!const_args[3] && dest == args[3])
              || (!const_args[4] && dest == args[4]);

    if (!overlap) {
        /*
         * The destination does not overlap any argument: clear it
         * first, jump over if the condition is false, and increment in
         * the true case.  This results in smaller code.
         */
        tcg_out_movi(s, TCG_TYPE_I32, dest, 0);

        done = gen_new_label();
        br_args[4] = tcg_invert_cond(br_args[4]);
        br_args[5] = label_arg(done);
        tcg_out_brcond2(s, br_args, const_args + 1, 1);

        tgen_arithi(s, ARITH_ADD, dest, 1, 0);
        tcg_out_label(s, done);
        return;
    }

    /*
     * The destination overlaps one of the argument registers, so don't
     * do anything tricky: branch to the "true" store or fall into the
     * "false" store.
     */
    is_true = gen_new_label();
    done = gen_new_label();

    br_args[5] = label_arg(is_true);
    tcg_out_brcond2(s, br_args, const_args + 1, 1);

    tcg_out_movi(s, TCG_TYPE_I32, dest, 0);
    tcg_out_jxx(s, JCC_JMP, done, 1);
    tcg_out_label(s, is_true);

    tcg_out_movi(s, TCG_TYPE_I32, dest, 1);
    tcg_out_label(s, done);
}
#endif

/*
 * Emit a conditional move of V1 into DEST, taken when condition code JCC
 * holds.  REXW is the operand width prefix (P_REXW or 0).
 */
static void tcg_out_cmov(TCGContext *s, int jcc, int rexw,
                         TCGReg dest, TCGReg v1)
{
    int cmov_opc = OPC_CMOVCC | jcc | rexw;

    tcg_out_modrm(s, cmov_opc, dest, v1);
}

/*
 * Compare C1 with C2 (an immediate when CONST_C2 is nonzero) under COND
 * and, when the condition holds, move V1 into DEST.  Otherwise DEST is
 * left as it was.
 */
static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
                            TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
                            TCGReg v1)
{
    int move_cc;

    /* The comparison chooses which condition code reads its flags.  */
    move_cc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw);
    tcg_out_cmov(s, move_cc, rexw, dest, v1);
}

/*
 * Count trailing zeros of ARG1 into DEST, with ARG2 supplying the
 * result for a zero input.
 *
 * With BMI1, TZCNT is used; a constant ARG2 must then equal the operand
 * width (checked by assertion) and needs no further code, while a
 * register ARG2 is selected with a conditional move on JCC_JB.
 * Without BMI1, BSF is used followed by a conditional move on JCC_JE.
 */
static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (!have_bmi1) {
        tcg_debug_assert(dest != arg2);
        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
        tcg_out_cmov(s, JCC_JE, rexw, dest, arg2);
        return;
    }

    tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
    if (!const_a2) {
        tcg_debug_assert(dest != arg2);
        tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
    } else {
        tcg_debug_assert(arg2 == (rexw ? 64 : 32));
    }
}

/*
 * Count leading zeros of ARG1 into DEST, with ARG2 supplying the result
 * for a zero input.
 *
 * With LZCNT available, a constant ARG2 must equal the operand width
 * (checked by assertion) and needs no further code, while a register
 * ARG2 is selected with a conditional move on JCC_JB.
 * Otherwise BSR is used; ARG2 must then be a register, and DEST must
 * differ from both inputs because ARG1 is tested again afterwards.
 */
static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    int zero_cc;

    if (have_lzcnt) {
        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
        if (!const_a2) {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
        } else {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        }
        return;
    }

    tcg_debug_assert(!const_a2);
    tcg_debug_assert(dest != arg1);
    tcg_debug_assert(dest != arg2);

    /*
     * Recall that the output of BSR is the index not the count;
     * XOR with width - 1 converts one into the other.
     */
    tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
    tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);

    /* Since we have destroyed the flags from BSR, we have to re-test.  */
    zero_cc = tcg_out_cmp(s, TCG_COND_EQ, arg1, 0, 1, rexw);
    tcg_out_cmov(s, zero_cc, rexw, dest, arg2);
}

/*
 * Emit a call (CALL nonzero) or jump (CALL zero) to the absolute
 * address DEST.
 *
 * The direct form with a 32-bit pc-relative displacement is used when
 * DEST is in range.  Otherwise an indirect branch through a constant
 * pool entry holding DEST is emitted.
 */
static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
{
    /* Displacement measured from the end of the 5 byte direct form.  */
    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;

    if (disp != (int32_t)disp) {
        /*
         * rip-relative addressing into the constant pool.
         * This is 6 + 8 = 14 bytes, as compared to using an
         * immediate load 10 + 6 = 16 bytes, plus we may
         * be able to re-use the pool constant for more calls.
         */
        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
        tcg_out32(s, 0);
        return;
    }

    tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
    tcg_out32(s, disp);
}

/*
 * Emit a call to the helper at DEST.  INFO describes the helper; only
 * its out_kind is consulted here, to fix up the stack pointer after a
 * struct-returning call on 32-bit non-Windows hosts.
 */
static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
                         const TCGHelperInfo *info)
{
    tcg_out_branch(s, 1, dest);

#ifndef _WIN32
    if (TCG_TARGET_REG_BITS == 32) {
        if (info->out_kind == TCG_CALL_RET_BY_REF) {
            /*
             * The sysv i386 abi for struct return places a reference as
             * the first argument of the stack, and pops that argument
             * with the return statement.  Since we want to retain the
             * aligned stack pointer for the callee, we do not actually
             * push that argument before the call but rely on the normal
             * store to the stack slot.  We do however need to compensate
             * for the pop in order to restore our correct stack pointer
             * value, and pushing a garbage value back onto the stack is
             * the quickest way to do so.
             */
            tcg_out_push(s, TCG_REG_EAX);
        }
    }
#endif
}

/* Emit an unconditional jump to the absolute address DEST.  */
static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
{
    const int is_call = 0;

    tcg_out_branch(s, is_call, dest);
}

/*
 * Emit a single nop instruction that is @n bytes long (@n >= 1).
 *
 * The standard one byte nop, "xchg %eax,%eax", is lengthened with
 * operand size prefixes, forming "xchg %ax,%ax".  All cores accept the
 * duplicate prefix, and all of the interesting recent cores can decode
 * and discard the duplicates in a single cycle.
 */
static void tcg_out_nopn(TCGContext *s, int n)
{
    int bytes_left;

    tcg_debug_assert(n >= 1);

    /* Every byte except the last is a 0x66 prefix. */
    for (bytes_left = n; bytes_left > 1; bytes_left--) {
        tcg_out8(s, 0x66);
    }
    tcg_out8(s, 0x90);
}

/*
 * Host addressing operands for one guest memory access, filled in by
 * prepare_host_addr() and consumed by tcg_out_qemu_{ld,st}_direct().
 */
typedef struct {
    /* Base register; prepare_host_addr() stores the guest address reg here. */
    TCGReg base;
    /* Index register; x86_guest_base starts with -1, presumably "none". */
    int index;
    /* Constant displacement. */
    int ofs;
    /* Added to the opcode by the emitters; 0 when prepared for softmmu. */
    int seg;
    /* Atomicity and alignment as computed by atom_and_align_for_opc(). */
    TCGAtomAlign aa;
} HostAddress;

/*
 * Report whether the backend can byte-swap as part of the memory
 * access described by @memop.  This requires MOVBE.
 */
bool tcg_target_has_memory_bswap(MemOp memop)
{
    if (have_movbe) {
        if ((memop & MO_SIZE) >= MO_128) {
            /*
             * A 16-byte memop that needs 16-byte atomicity (VMOVDQA)
             * cannot be swapped, but one that may be split into a pair
             * of 64-bit operations (MOVBEQ) can.
             */
            TCGAtomAlign req = atom_and_align_for_opc(tcg_ctx, memop,
                                                      MO_ATOM_IFALIGN, true);
            return req.atom < MO_128;
        }
        return true;
    }
    return false;
}

/*
 * Because i686 has no register parameters and because x86_64 has xchg
 * to handle addr/data register overlap, we have placed all input arguments
 * before we might need a scratch reg.
 *
 * Even then, a scratch is only needed for l->raddr.  Rather than expose
 * a general-purpose scratch when we don't actually know it's available,
 * use the ra_gen hook to load into RAX if needed.
 */
#if TCG_TARGET_REG_BITS == 64
/*
 * ra_gen hook for the load/store helper argument setup: load the
 * return address l->raddr into register @arg, or into RAX when @arg is
 * negative, and return the register that was used.
 */
static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
{
    TCGReg dst = arg < 0 ? TCG_REG_RAX : arg;

    tcg_out_movi(s, TCG_TYPE_PTR, dst, (uintptr_t)l->raddr);
    return dst;
}

static const TCGLdstHelperParam ldst_helper_param = {
    .ra_gen = ldst_ra_gen,
};
#else
/* No ra_gen hook is installed when the host registers are not 64-bit. */
static const TCGLdstHelperParam ldst_helper_param = { };
#endif

/*
 * Split vector register @v into the integer pair @l (element 0) and
 * @h (element 1).  Elements are 32-bit for TCG_TYPE_I32, else REX.W
 * selects the wide form.
 */
static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
                                TCGReg l, TCGReg h, TCGReg v)
{
    const int rexw = (type != TCG_TYPE_I32) ? P_REXW : 0;

    /* vmov{d,q} %v, %l */
    tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);

    /* vpextr{d,q} $1, %v, %h -- the element index is the trailing imm8 */
    tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
    tcg_out8(s, 1);
}

/*
 * Assemble vector register @v from the integer pair @l (element 0) and
 * @h (element 1).  Elements are 32-bit for TCG_TYPE_I32, else REX.W
 * selects the wide form.
 */
static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
                                TCGReg v, TCGReg l, TCGReg h)
{
    const int rexw = (type != TCG_TYPE_I32) ? P_REXW : 0;

    /* vmov{d,q} %l, %v */
    tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);

    /* vpinsr{d,q} $1, %h, %v, %v -- the element index is the trailing imm8 */
    tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
    tcg_out8(s, 1);
}

/*
 * Emit the out-of-line slow path for the guest load described by @l:
 * point the fast path's pending branch(es) here, call the load helper
 * matching the access size, move the result into place and jump back
 * to l->raddr.  Always returns true.
 */
static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
{
    const MemOp size = get_memop(l->oi) & MO_SIZE;
    tcg_insn_unit *fixup;

    /*
     * Resolve the branch displacement(s); each is relative to the end
     * of its own 4-byte field.  The second slot is optional.
     */
    fixup = l->label_ptr[0];
    tcg_patch32(fixup, s->code_ptr - fixup - 4);

    fixup = l->label_ptr[1];
    if (fixup) {
        tcg_patch32(fixup, s->code_ptr - fixup - 4);
    }

    tcg_out_ld_helper_args(s, l, &ldst_helper_param);
    tcg_out_branch(s, 1, qemu_ld_helpers[size]);
    tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);

    tcg_out_jmp(s, l->raddr);
    return true;
}

/*
 * Emit the out-of-line slow path for the guest store described by @l:
 * point the fast path's pending branch(es) here, call the store helper
 * matching the access size and jump back to l->raddr.
 * Always returns true.
 */
static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
{
    const MemOp size = get_memop(l->oi) & MO_SIZE;
    tcg_insn_unit *fixup;

    /*
     * Resolve the branch displacement(s); each is relative to the end
     * of its own 4-byte field.  The second slot is optional.
     */
    fixup = l->label_ptr[0];
    tcg_patch32(fixup, s->code_ptr - fixup - 4);

    fixup = l->label_ptr[1];
    if (fixup) {
        tcg_patch32(fixup, s->code_ptr - fixup - 4);
    }

    tcg_out_st_helper_args(s, l, &ldst_helper_param);
    tcg_out_branch(s, 1, qemu_st_helpers[size]);

    tcg_out_jmp(s, l->raddr);
    return true;
}

#ifdef CONFIG_USER_ONLY
/*
 * Template host address for user-only accesses: prepare_host_addr()
 * copies it into the HostAddress when tcg_use_softmmu is false.
 * Only .index is set here (to -1); the other fields start out zero.
 * NOTE(review): presumably completed elsewhere once guest_base is known
 * -- not visible in this part of the file.
 */
static HostAddress x86_guest_base = {
    .index = -1
};

#if defined(__x86_64__) && defined(__linux__)
# include <asm/prctl.h>
# include <sys/prctl.h>
int arch_prctl(int code, unsigned long addr);
/*
 * Try to make the GS segment base equal to guest_base.
 * Returns P_GS on success, 0 if arch_prctl() failed.
 */
static inline int setup_guest_base_seg(void)
{
    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
        return P_GS;
    }
    return 0;
}
/* Self-define so the #ifndef fallback below is skipped. */
#define setup_guest_base_seg  setup_guest_base_seg
#elif defined(__x86_64__) && \
      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
# include <machine/sysarch.h>
/*
 * Try to make the GS segment base equal to guest_base.
 * Returns P_GS on success, 0 if sysarch() failed.
 */
static inline int setup_guest_base_seg(void)
{
    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
        return P_GS;
    }
    return 0;
}
/* Self-define so the #ifndef fallback below is skipped. */
#define setup_guest_base_seg  setup_guest_base_seg
#endif
#else
/*
 * Without CONFIG_USER_ONLY there is no guest base object; any use of
 * x86_guest_base expands to qemu_build_not_reached(), so it presumably
 * must only appear in code the compiler can prove dead.
 */
# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; }))
#endif /* CONFIG_USER_ONLY */
/* Fallback when no platform-specific implementation was provided above. */
#ifndef setup_guest_base_seg
# define setup_guest_base_seg()  0
#endif

/*
 * NOTE(review): presumably the lowest offset that common code accepts
 * from tlb_mask_table_ofs(); INT_MIN would leave it unrestricted -- confirm
 * against the user of this macro, which is not in this part of the file.
 */
#define MIN_TLB_MASK_TABLE_OFS  INT_MIN

/*
 * For softmmu, perform the TLB load and compare.
 * For useronly, perform any required alignment tests.
 * In both cases, return a TCGLabelQemuLdst structure if the slow path
 * is required and fill in @h with the host address for the fast path.
 *
 * @s:     code generation context
 * @h:     output: base/index/ofs/seg plus atomicity and alignment info
 * @addr:  register holding the guest address
 * @oi:    memory operation combined with the mmu index
 * @is_ld: true for a load (compare against addr_read),
 *         false for a store (compare against addr_write)
 *
 * Returns NULL when no slow path was emitted, which only happens for a
 * user-only access without an alignment requirement.  Only is_ld, oi,
 * addr_reg and label_ptr[0] of the returned label are set here.
 */
static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
                                           TCGReg addr, MemOpIdx oi, bool is_ld)
{
    TCGLabelQemuLdst *ldst = NULL;
    MemOp opc = get_memop(oi);
    MemOp s_bits = opc & MO_SIZE;
    unsigned a_mask;

    if (tcg_use_softmmu) {
        /* TCG_REG_L0 receives the TLB addend at the end of the TLB hit path. */
        h->index = TCG_REG_L0;
        h->ofs = 0;
        h->seg = 0;
    } else {
        *h = x86_guest_base;
    }
    h->base = addr;
    h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
    /* Low address bits that must be clear to satisfy the alignment. */
    a_mask = (1 << h->aa.align) - 1;

    if (tcg_use_softmmu) {
        int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
                            : offsetof(CPUTLBEntry, addr_write);
        /*
         * ttype/trexw: width used for guest address arithmetic.
         * hrexw: width used for the host pointer addition.
         * tlbtype/tlbrexw: width used for the initial copy and shift.
         */
        TCGType ttype = TCG_TYPE_I32;
        TCGType tlbtype = TCG_TYPE_I32;
        int trexw = 0, hrexw = 0, tlbrexw = 0;
        unsigned mem_index = get_mmuidx(oi);
        /* Low address bits covered by an access of this size. */
        unsigned s_mask = (1 << s_bits) - 1;
        int fast_ofs = tlb_mask_table_ofs(s, mem_index);
        int tlb_mask;

        /* With softmmu a slow path always exists. */
        ldst = new_ldst_label(s);
        ldst->is_ld = is_ld;
        ldst->oi = oi;
        ldst->addr_reg = addr;

        if (TCG_TARGET_REG_BITS == 64) {
            ttype = s->addr_type;
            trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
            if (TCG_TYPE_PTR == TCG_TYPE_I64) {
                hrexw = P_REXW;
                /* 32 bits suffice unless the index bits can extend past them. */
                if (s->page_bits + s->tlb_dyn_max_bits > 32) {
                    tlbtype = TCG_TYPE_I64;
                    tlbrexw = P_REXW;
                }
            }
        }

        /* L0 = (addr >> (page_bits - CPU_TLB_ENTRY_BITS)) & mask, + table */
        tcg_out_mov(s, tlbtype, TCG_REG_L0, addr);
        tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
                       s->page_bits - CPU_TLB_ENTRY_BITS);

        tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
                             fast_ofs + offsetof(CPUTLBDescFast, mask));

        tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
                             fast_ofs + offsetof(CPUTLBDescFast, table));

        /*
         * If the required alignment is at least as large as the access,
         * simply copy the address and mask.  For lesser alignments,
         * check that we don't cross pages for the complete access.
         */
        if (a_mask >= s_mask) {
            tcg_out_mov(s, ttype, TCG_REG_L1, addr);
        } else {
            tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
                                 addr, s_mask - a_mask);
        }
        /* Keep the page bits and the alignment bits for the comparison. */
        tlb_mask = s->page_mask | a_mask;
        tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);

        /* cmp cmp_ofs(TCG_REG_L0), TCG_REG_L1 */
        tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
                             TCG_REG_L1, TCG_REG_L0, cmp_ofs);

        /* jne slow_path */
        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
        /* Reserve the 4-byte displacement; the slow path emitter patches it. */
        ldst->label_ptr[0] = s->code_ptr;
        s->code_ptr += 4;

        /* TLB Hit.  */
        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
                   offsetof(CPUTLBEntry, addend));
    } else if (a_mask) {
        int jcc;

        /* User-only: a slow path is needed only to handle misalignment. */
        ldst = new_ldst_label(s);
        ldst->is_ld = is_ld;
        ldst->oi = oi;
        ldst->addr_reg = addr;

        /* jne slow_path */
        jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addr, a_mask, true, false);
        tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0);
        /* Reserve the 4-byte displacement; the slow path emitter patches it. */
        ldst->label_ptr[0] = s->code_ptr;
        s->code_ptr += 4;
    }

    return ldst;
}

/*
 * Emit the fast-path guest load from host address @h.
 *
 * @datalo: destination register (low part for two-register accesses)
 * @datahi: second destination register; only used by the 64-bit access
 *          on a 32-bit host and by the 128-bit access
 * @h:      host address and atomicity/alignment, from prepare_host_addr()
 * @type:   result type; anything but TCG_TYPE_I32 adds REX.W to the
 *          sign-extending forms
 * @memop:  size, signedness and byte-swap flags of the access
 *
 * A byte-swapped access requires MOVBE (asserted).
 */
static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   HostAddress h, TCGType type, MemOp memop)
{
    bool use_movbe = false;
    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    int movop = OPC_MOVL_GvEv;

    /* Do big-endian loads with movbe.  */
    if (memop & MO_BSWAP) {
        tcg_debug_assert(have_movbe);
        use_movbe = true;
        movop = OPC_MOVBE_GyMy;
    }

    switch (memop & MO_SSIZE) {
    case MO_UB:
        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_SB:
        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_UW:
        if (use_movbe) {
            /* There is no extending movbe; only low 16-bits are modified.  */
            if (datalo != h.base && datalo != h.index) {
                /* XOR breaks dependency chains.  */
                tgen_arithr(s, ARITH_XOR, datalo, datalo);
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                         datalo, h.base, h.index, 0, h.ofs);
            } else {
                /*
                 * datalo is part of the address and cannot be cleared
                 * first; zero-extend after the load instead.
                 */
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                         datalo, h.base, h.index, 0, h.ofs);
                tcg_out_ext16u(s, datalo, datalo);
            }
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        }
        break;
    case MO_SW:
        if (use_movbe) {
            /* Load 16 swapped bits, then sign-extend separately. */
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                     datalo, h.base, h.index, 0, h.ofs);
            tcg_out_ext16s(s, type, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
                                     datalo, h.base, h.index, 0, h.ofs);
        }
        break;
    case MO_UL:
        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
#if TCG_TARGET_REG_BITS == 64
    case MO_SL:
        if (use_movbe) {
            /* Load 32 swapped bits, then sign-extend separately. */
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_ext32s(s, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        }
        break;
#endif
    case MO_UQ:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            break;
        }
        /* 32-bit host: two 32-bit loads, at offsets 0 and 4. */
        if (use_movbe) {
            /* Each half is swapped by movbe; exchange the halves as well. */
            TCGReg t = datalo;
            datalo = datahi;
            datahi = t;
        }
        if (h.base == datalo || h.index == datalo) {
            /*
             * The first load would overwrite part of the address.
             * Form the address in datahi first, then load through it.
             */
            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
        } else {
            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 4);
        }
        break;

    case MO_128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);

        /*
         * Without 16-byte atomicity, use integer regs.
         * That is where we want the data, and it allows bswaps.
         */
        if (h.aa.atom < MO_128) {
            if (use_movbe) {
                /* Each half is swapped by movbe; exchange the halves as well. */
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            if (h.base == datalo || h.index == datalo) {
                /*
                 * The first load would overwrite part of the address.
                 * Form the address in datahi first, then load through it.
                 */
                tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datalo, datahi, 0);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datahi, datahi, 8);
            } else {
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
                                         h.base, h.index, 0, h.ofs + 8);
            }
            break;
        }

        /*
         * With 16-byte atomicity, a vector load is required.
         * If we already have 16-byte alignment, then VMOVDQA always works.
         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
         * Else we require a runtime test for alignment for VMOVDQA;
         * use VMOVDQU on the unaligned nonatomic path for simplicity.
         */
        if (h.aa.align >= MO_128) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else {
            /* l1: unaligned path; l2: join point after either load. */
            TCGLabel *l1 = gen_new_label();
            TCGLabel *l2 = gen_new_label();
            int jcc;

            /* Only h.base is tested for the low four address bits. */
            jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
            tcg_out_jxx(s, jcc, l1, true);

            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_jxx(s, JCC_JMP, l2, true);

            tcg_out_label(s, l1);
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_label(s, l2);
        }
        /* Move the loaded vector into the integer destination pair. */
        tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
        break;

    default:
        g_assert_not_reached();
    }
}

/*
 * Emit a complete guest load: address preparation (with its slow path
 * branch, if any) followed by the fast-path load into @datalo/@datahi.
 * When a slow path label was created, finish it with the data
 * registers, the data type and the address at which the fast path
 * resumes.
 */
static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
                            TCGReg addr, MemOpIdx oi, TCGType data_type)
{
    HostAddress host;
    TCGLabelQemuLdst *slow = prepare_host_addr(s, &host, addr, oi, true);

    tcg_out_qemu_ld_direct(s, datalo, datahi, host, data_type, get_memop(oi));

    if (!slow) {
        return;
    }
    slow->type = data_type;
    slow->datalo_reg = datalo;
    slow->datahi_reg = datahi;
    /* The slow path jumps back to just after the fast-path load. */
    slow->raddr = tcg_splitwx_to_rx(s->code_ptr);
}

/*
 * Emit the fast-path guest store to host address @h.
 *
 * @datalo: source register (low part for two-register accesses)
 * @datahi: second source register; only used by the 64-bit access on a
 *          32-bit host and by the 128-bit access
 * @h:      host address and atomicity/alignment, from prepare_host_addr()
 * @memop:  size and byte-swap flags of the access
 *
 * A byte-swapped access requires MOVBE (asserted).
 */
static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   HostAddress h, MemOp memop)
{
    bool use_movbe = false;
    int movop = OPC_MOVL_EvGv;

    /*
     * Do big-endian stores with movbe or system-mode.
     * User-only without movbe will have its swapping done generically.
     * NOTE(review): as written, MO_BSWAP always requires have_movbe here.
     */
    if (memop & MO_BSWAP) {
        tcg_debug_assert(have_movbe);
        use_movbe = true;
        movop = OPC_MOVBE_MyGy;
    }

    switch (memop & MO_SIZE) {
    case MO_8:
        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
                                 datalo, h.base, h.index, 0, h.ofs);
        break;
    case MO_16:
        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_32:
        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_64:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        } else {
            /* 32-bit host: two 32-bit stores, at offsets 0 and 4. */
            if (use_movbe) {
                /* Each half is swapped by movbe; exchange the halves as well. */
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 4);
        }
        break;

    case MO_128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);

        /*
         * Without 16-byte atomicity, use integer regs.
         * That is where we have the data, and it allows bswaps.
         */
        if (h.aa.atom < MO_128) {
            if (use_movbe) {
                /* Each half is swapped by movbe; exchange the halves as well. */
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 8);
            break;
        }

        /*
         * With 16-byte atomicity, a vector store is required.
         * If we already have 16-byte alignment, then VMOVDQA always works.
         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
         * Else we require a runtime test for alignment for VMOVDQA;
         * use VMOVDQU on the unaligned nonatomic path for simplicity.
         */
        /* Gather the integer pair into the temporary vector first. */
        tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
        if (h.aa.align >= MO_128) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else {
            /* l1: unaligned path; l2: join point after either store. */
            TCGLabel *l1 = gen_new_label();
            TCGLabel *l2 = gen_new_label();
            int jcc;

            /* Only h.base is tested for the low four address bits. */
            jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
            tcg_out_jxx(s, jcc, l1, true);

            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_jxx(s, JCC_JMP, l2, true);

            tcg_out_label(s, l1);
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_label(s, l2);
        }
        break;

    default:
        g_assert_not_reached();
    }
}

/*
 * Emit a complete guest store: address preparation (with its slow path
 * branch, if any) followed by the fast-path store of @datalo/@datahi.
 * When a slow path label was created, finish it with the data
 * registers, the data type and the address at which the fast path
 * resumes.
 */
static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
                            TCGReg addr, MemOpIdx oi, TCGType data_type)
{
    HostAddress host;
    TCGLabelQemuLdst *slow = prepare_host_addr(s, &host, addr, oi, false);

    tcg_out_qemu_st_direct(s, datalo, datahi, host, get_memop(oi));

    if (!slow) {
        return;
    }
    slow->type = data_type;
    slow->datalo_reg = datalo;
    slow->datahi_reg = datahi;
    /* The slow path jumps back to just after the fast-path store. */
    slow->raddr = tcg_splitwx_to_rx(s->code_ptr);
}

/*
 * Emit an exit from the translation block with return value @a0.
 * A non-zero value is loaded into EAX before jumping to tb_ret_addr;
 * zero jumps to tcg_code_gen_epilogue instead.
 */
static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
{
    if (a0 != 0) {
        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
        tcg_out_jmp(s, tb_ret_addr);
        return;
    }

    /* Reuse the zeroing that exists for goto_ptr.  */
    tcg_out_jmp(s, tcg_code_gen_epilogue);
}

/*
 * Emit the patchable direct jump for exit slot @which of the
 * translation block, recording the offset of its displacement and the
 * offset of the code that follows it.
 */
static void tcg_out_goto_tb(TCGContext *s, int which)
{
    /*
     * The displacement is patched atomically, so it must be aligned.
     * It follows one opcode byte: pad with a nop until code_ptr + 1 is
     * a multiple of 4.
     */
    int pad = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr - 1;

    if (pad != 0) {
        tcg_out_nopn(s, pad);
    }

    tcg_out8(s, OPC_JMP_long); /* jmp im */
    set_jmp_insn_offset(s, which);
    tcg_out32(s, 0);
    set_jmp_reset_offset(s, which);
}

/*
 * Redirect direct jump @n of @tb to tb->jmp_target_addr[n].
 *
 * @jmp_rx is the address of the 32-bit displacement as executed and
 * @jmp_rw the address through which it is written.  The displacement
 * is relative to the end of that field and is stored atomically.
 */
void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
                              uintptr_t jmp_rx, uintptr_t jmp_rw)
{
    uintptr_t target = tb->jmp_target_addr[n];
    uintptr_t field_end = jmp_rx + 4;

    /* patch the branch destination; no explicit icache flush is needed */
    qatomic_set((int32_t *)jmp_rw, target - field_end);
}


/*
 * Emit a0 = a1 + a2 for registers.  When the destination aliases one
 * of the sources a two-operand ADD is used; otherwise LEA provides a
 * three-operand add.
 */
static void tgen_add(TCGContext *s, TCGType type,
                     TCGReg a0, TCGReg a1, TCGReg a2)
{
    const int rexw = (type != TCG_TYPE_I32) ? P_REXW : 0;

    if (a0 != a1 && a0 != a2) {
        tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, 0);
        return;
    }

    /* Add whichever source is not already the destination. */
    tgen_arithr(s, ARITH_ADD + rexw, a0, a0 == a1 ? a2 : a1);
}

/*
 * Emit a0 = a1 + a2 with an immediate a2.  An in-place ADD is used
 * when a0 and a1 are the same register; otherwise LEA with a2 as the
 * displacement and no index register.
 */
static void tgen_addi(TCGContext *s, TCGType type,
                      TCGReg a0, TCGReg a1, tcg_target_long a2)
{
    const int rexw = (type != TCG_TYPE_I32) ? P_REXW : 0;

    if (a0 != a1) {
        tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, -1, 0, a2);
        return;
    }
    tgen_arithi(s, ARITH_ADD + rexw, a0, a2, false);
}

/*
 * Emitters for add: tgen_add takes a register second operand (out_rrr),
 * tgen_addi an immediate one (out_rri).
 */
static const TCGOutOpBinary outop_add = {
    .base = { .static_constraint = C_O1_I2(r, r, re) },
    .out_rri = tgen_addi,
    .out_rrr = tgen_add,
};

/*
 * Emit a0 &= a2.  a1 is not referenced: the operation is done in place
 * on a0 (outop_and's constraint presumably ties a1 to a0).
 */
static void tgen_and(TCGContext *s, TCGType type,
                     TCGReg a0, TCGReg a1, TCGReg a2)
{
    const int rexw = (type != TCG_TYPE_I32) ? P_REXW : 0;

    tgen_arithr(s, ARITH_AND + rexw, a0, a2);
}

/*
 * Emit a0 &= a2 with an immediate a2.  a1 is not referenced: the
 * operation is done in place on a0.
 */
static void tgen_andi(TCGContext *s, TCGType type,
                      TCGReg a0, TCGReg a1, tcg_target_long a2)
{
    const int rexw = (type != TCG_TYPE_I32) ? P_REXW : 0;

    tgen_arithi(s, ARITH_AND + rexw, a0, a2, false);
}

/*
 * Emitters for and: tgen_and takes a register second operand (out_rrr),
 * tgen_andi an immediate one (out_rri).  Both work in place on the
 * output register.
 */
static const TCGOutOpBinary outop_and = {
    .base = { .static_constraint = C_O1_I2(r, 0, reZ) },
    .out_rri = tgen_andi,
    .out_rrr = tgen_and,
};

/*
 * Emit the and-with-complement of a1 and a2 into a0 using the
 * VEX-encoded ANDN instruction (only offered when BMI1 is present,
 * see cset_andc).
 */
static void tgen_andc(TCGContext *s, TCGType type,
                      TCGReg a0, TCGReg a1, TCGReg a2)
{
    const int rexw = (type != TCG_TYPE_I32) ? P_REXW : 0;

    /* Note the operand order: a2 is passed before a1. */
    tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
}

/*
 * Dynamic constraint for andc: three registers when BMI1 is available,
 * otherwise the operation is reported as not implemented.
 * @type and @flags are not consulted.
 */
static TCGConstraintSetIndex cset_andc(TCGType type, unsigned flags)
{
    if (have_bmi1) {
        return C_O1_I2(r, r, r);
    }
    return C_NotImplemented;
}

/*
 * Emitter for andc: register form only (tgen_andc); whether it is
 * available is decided at run time by cset_andc.
 */
static const TCGOutOpBinary outop_andc = {
    .base = {
        .static_constraint = C_Dynamic,
        .dynamic_constraint = cset_andc,
    },
    .out_rrr = tgen_andc,
};

/* eqv is not implemented by this backend; no emitter is provided. */
static const TCGOutOpBinary outop_eqv = {
    .base = { .static_constraint = C_NotImplemented },
};

/*
 * Emit a0 *= a2 with the two-operand IMUL.  a1 is not referenced: the
 * operation is done in place on a0.
 */
static void tgen_mul(TCGContext *s, TCGType type,
                     TCGReg a0, TCGReg a1, TCGReg a2)
{
    const int rexw = (type != TCG_TYPE_I32) ? P_REXW : 0;

    tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
}

/*
 * Emit a0 *= a2 with an immediate a2, choosing the 8-bit immediate
 * form of IMUL when a2 fits in a signed byte and the 32-bit form
 * otherwise.  a1 is not referenced: a0 is both source and destination.
 */
static void tgen_muli(TCGContext *s, TCGType type,
                      TCGReg a0, TCGReg a1, tcg_target_long a2)
{
    const int rexw = (type != TCG_TYPE_I32) ? P_REXW : 0;
    const bool fits_imm8 = (a2 == (int8_t)a2);
    const int opc = fits_imm8 ? OPC_IMUL_GvEvIb : OPC_IMUL_GvEvIz;

    tcg_out_modrm(s, opc + rexw, a0, a0);
    if (fits_imm8) {
        tcg_out8(s, a2);
    } else {
        tcg_out32(s, a2);
    }
}

/*
 * Emitters for mul: tgen_mul takes a register second operand (out_rrr),
 * tgen_muli an immediate one (out_rri).  Both work in place on the
 * output register.
 */
static const TCGOutOpBinary outop_mul = {
    .base = { .static_constraint = C_O1_I2(r, 0, re) },
    .out_rri = tgen_muli,
    .out_rrr = tgen_mul,
};

/* nand is not implemented by this backend; no emitter is provided. */
static const TCGOutOpBinary outop_nand = {
    .base = { .static_constraint = C_NotImplemented },
};

/* nor is not implemented by this backend; no emitter is provided. */
static const TCGOutOpBinary outop_nor = {
    .base = { .static_constraint = C_NotImplemented },
};

/*
 * Emit a0 |= a2.  a1 is not referenced: the operation is done in place
 * on a0.
 */
static void tgen_or(TCGContext *s, TCGType type,
                    TCGReg a0, TCGReg a1, TCGReg a2)
{
    const int rexw = (type != TCG_TYPE_I32) ? P_REXW : 0;

    tgen_arithr(s, ARITH_OR + rexw, a0, a2);
}

/*
 * Emit a0 |= a2 with an immediate a2.  a1 is not referenced: the
 * operation is done in place on a0.
 */
static void tgen_ori(TCGContext *s, TCGType type,
                     TCGReg a0, TCGReg a1, tcg_target_long a2)
{
    const int rexw = (type != TCG_TYPE_I32) ? P_REXW : 0;

    tgen_arithi(s, ARITH_OR + rexw, a0, a2, false);
}

/*
 * Emitters for or: tgen_or takes a register second operand (out_rrr),
 * tgen_ori an immediate one (out_rri).  Both work in place on the
 * output register.
 */
static const TCGOutOpBinary outop_or = {
    .base = { .static_constraint = C_O1_I2(r, 0, re) },
    .out_rri = tgen_ori,
    .out_rrr = tgen_or,
};

/* orc is not implemented by this backend; no emitter is provided. */
static const TCGOutOpBinary outop_orc = {
    .base = { .static_constraint = C_NotImplemented },
};

/*
 * Emit a0 -= a2.  a1 is not referenced: the operation is done in place
 * on a0.
 */
static void tgen_sub(TCGContext *s, TCGType type,
                     TCGReg a0, TCGReg a1, TCGReg a2)
{
    const int rexw = (type != TCG_TYPE_I32) ? P_REXW : 0;

    tgen_arithr(s, ARITH_SUB + rexw, a0, a2);
}

/*
 * Emitter for sub: register form only (tgen_sub), working in place on
 * the output register.
 */
static const TCGOutOpSubtract outop_sub = {
    .base = { .static_constraint = C_O1_I2(r, 0, r) },
    .out_rrr = tgen_sub,
};

/*
 * Emit a0 ^= a2.  a1 is not referenced: the operation is done in place
 * on a0.
 */
static void tgen_xor(TCGContext *s, TCGType type,
                     TCGReg a0, TCGReg a1, TCGReg a2)
{
    const int rexw = (type != TCG_TYPE_I32) ? P_REXW : 0;

    tgen_arithr(s, ARITH_XOR + rexw, a0, a2);
}

/*
 * Emit a0 ^= a2 with an immediate a2.  a1 is not referenced: the
 * operation is done in place on a0.
 */
static void tgen_xori(TCGContext *s, TCGType type,
                      TCGReg a0, TCGReg a1, tcg_target_long a2)
{
    const int rexw = (type != TCG_TYPE_I32) ? P_REXW : 0;

    tgen_arithi(s, ARITH_XOR + rexw, a0, a2, false);
}

/*
 * Emitters for xor: tgen_xor takes a register second operand (out_rrr),
 * tgen_xori an immediate one (out_rri).  Both work in place on the
 * output register.
 */
static const TCGOutOpBinary outop_xor = {
    .base = { .static_constraint = C_O1_I2(r, 0, re) },
    .out_rri = tgen_xori,
    .out_rrr = tgen_xor,
};

/*
 * Emit an in-place negate of a0 (group-3 NEG).  a1 is not referenced.
 */
static void tgen_neg(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1)
{
    const int rexw = (type != TCG_TYPE_I32) ? P_REXW : 0;

    tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
}

/* Emitter for neg: tgen_neg, working in place on the output register. */
static const TCGOutOpUnary outop_neg = {
    .base = { .static_constraint = C_O1_I1(r, 0) },
    .out_rr = tgen_neg,
};

/*
 * Emit an in-place bitwise complement of a0 (group-3 NOT).
 * a1 is not referenced.
 */
static void tgen_not(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1)
{
    const int rexw = (type != TCG_TYPE_I32) ? P_REXW : 0;

    tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
}

/* Emitter for not: tgen_not, working in place on the output register. */
static const TCGOutOpUnary outop_not = {
    .base = { .static_constraint = C_O1_I1(r, 0) },
    .out_rr = tgen_not,
};


static void tcg_out_op(TCGContext *s, TCGOpcode opc, TCGType type,
                       const TCGArg args[TCG_MAX_OP_ARGS],
                       const int const_args[TCG_MAX_OP_ARGS])
{
    TCGArg a0, a1, a2;
    int c, const_a2, vexop, rexw;

#if TCG_TARGET_REG_BITS == 64
# define OP_32_64(x) \
        case glue(glue(INDEX_op_, x), _i64): \
        case glue(glue(INDEX_op_, x), _i32)
#else
# define OP_32_64(x) \
        case glue(glue(INDEX_op_, x), _i32)
#endif

    /* Hoist the loads of the most common arguments.  */
    a0 = args[0];
    a1 = args[1];
    a2 = args[2];
    const_a2 = const_args[2];
    rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;

    switch (opc) {
    case INDEX_op_goto_ptr:
        /* jmp to the given host address (could be epilogue) */
        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
        break;
    case INDEX_op_br:
        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
        break;
    OP_32_64(ld8u):
        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
        break;
    OP_32_64(ld8s):
        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
        break;
    OP_32_64(ld16u):
        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
        break;
    OP_32_64(ld16s):
        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
        break;
#if TCG_TARGET_REG_BITS == 64
    case INDEX_op_ld32u_i64:
#endif
    case INDEX_op_ld_i32:
        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
        break;

    OP_32_64(st8):
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
            tcg_out8(s, a0);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
        }
        break;
    OP_32_64(st16):
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
            tcg_out16(s, a0);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
        }
        break;
#if TCG_TARGET_REG_BITS == 64
    case INDEX_op_st32_i64:
#endif
    case INDEX_op_st_i32:
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
            tcg_out32(s, a0);
        } else {
            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
        }
        break;

    OP_32_64(div2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
        break;
    OP_32_64(divu2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
        break;

    OP_32_64(shl):
        /* For small constant 3-operand shift, use LEA.  */
        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
            if (a2 - 1 == 0) {
                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
            } else {
                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
            }
            break;
        }
        c = SHIFT_SHL;
        vexop = OPC_SHLX;
        goto gen_shift_maybe_vex;
    OP_32_64(shr):
        c = SHIFT_SHR;
        vexop = OPC_SHRX;
        goto gen_shift_maybe_vex;
    OP_32_64(sar):
        c = SHIFT_SAR;
        vexop = OPC_SARX;
        goto gen_shift_maybe_vex;
    OP_32_64(rotl):
        c = SHIFT_ROL;
        goto gen_shift;
    OP_32_64(rotr):
        c = SHIFT_ROR;
        goto gen_shift;
    gen_shift_maybe_vex:
        if (have_bmi2) {
            if (!const_a2) {
                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
                break;
            }
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
        }
        /* FALLTHRU */
    gen_shift:
        if (const_a2) {
            tcg_out_shifti(s, c + rexw, a0, a2);
        } else {
            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
        }
        break;

    OP_32_64(ctz):
        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
        break;
    OP_32_64(clz):
        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
        break;
    OP_32_64(ctpop):
        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
        break;

    OP_32_64(brcond):
        tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1],
                       arg_label(args[3]), 0);
        break;
    OP_32_64(setcond):
        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false);
        break;
    OP_32_64(negsetcond):
        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true);
        break;
    OP_32_64(movcond):
        tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]);
        break;

    OP_32_64(bswap16):
        if (a2 & TCG_BSWAP_OS) {
            /* Output must be sign-extended. */
            if (rexw) {
                tcg_out_bswap64(s, a0);
                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
            } else {
                tcg_out_bswap32(s, a0);
                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
            }
        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
            /* Output must be zero-extended, but input isn't. */
            tcg_out_bswap32(s, a0);
            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
        } else {
            tcg_out_rolw_8(s, a0);
        }
        break;
    OP_32_64(bswap32):
        tcg_out_bswap32(s, a0);
        if (rexw && (a2 & TCG_BSWAP_OS)) {
            tcg_out_ext32s(s, a0, a0);
        }
        break;

    case INDEX_op_qemu_ld_i32:
        tcg_out_qemu_ld(s, a0, -1, a1, a2, TCG_TYPE_I32);
        break;
    case INDEX_op_qemu_ld_i64:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_qemu_ld(s, a0, -1, a1, a2, TCG_TYPE_I64);
        } else {
            tcg_out_qemu_ld(s, a0, a1, a2, args[3], TCG_TYPE_I64);
        }
        break;
    case INDEX_op_qemu_ld_i128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
        tcg_out_qemu_ld(s, a0, a1, a2, args[3], TCG_TYPE_I128);
        break;

    case INDEX_op_qemu_st_i32:
    case INDEX_op_qemu_st8_i32:
        tcg_out_qemu_st(s, a0, -1, a1, a2, TCG_TYPE_I32);
        break;
    case INDEX_op_qemu_st_i64:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_qemu_st(s, a0, -1, a1, a2, TCG_TYPE_I64);
        } else {
            tcg_out_qemu_st(s, a0, a1, a2, args[3], TCG_TYPE_I64);
        }
        break;
    case INDEX_op_qemu_st_i128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
        tcg_out_qemu_st(s, a0, a1, a2, args[3], TCG_TYPE_I128);
        break;

    OP_32_64(mulu2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
        break;
    OP_32_64(muls2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
        break;
    OP_32_64(add2):
        if (const_args[4]) {
            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
        } else {
            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
        }
        if (const_args[5]) {
            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
        } else {
            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
        }
        break;
    OP_32_64(sub2):
        if (const_args[4]) {
            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
        } else {
            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
        }
        if (const_args[5]) {
            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
        } else {
            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
        }
        break;

#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_brcond2_i32:
        tcg_out_brcond2(s, args, const_args, 0);
        break;
    case INDEX_op_setcond2_i32:
        tcg_out_setcond2(s, args, const_args);
        break;
#else /* TCG_TARGET_REG_BITS == 64 */
    case INDEX_op_ld32s_i64:
        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
        break;
    case INDEX_op_ld_i64:
        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
        break;
    case INDEX_op_st_i64:
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
            tcg_out32(s, a0);
        } else {
            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
        }
        break;

    case INDEX_op_bswap64_i64:
        tcg_out_bswap64(s, a0);
        break;
    case INDEX_op_extrh_i64_i32:
        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
        break;
#endif

    OP_32_64(deposit):
        if (args[3] == 0 && args[4] == 8) {
            /* load bits 0..7 */
            if (const_a2) {
                tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0),
                            0, a0, 0);
                tcg_out8(s, a2);
            } else {
                tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
            }
        } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) {
            /* load bits 8..15 */
            if (const_a2) {
                tcg_out8(s, OPC_MOVB_Ib + a0 + 4);
                tcg_out8(s, a2);
            } else {
                tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
            }
        } else if (args[3] == 0 && args[4] == 16) {
            /* load bits 0..15 */
            if (const_a2) {
                tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0),
                            0, a0, 0);
                tcg_out16(s, a2);
            } else {
                tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
            }
        } else {
            g_assert_not_reached();
        }
        break;

    case INDEX_op_extract_i64:
        if (a2 + args[3] == 32) {
            if (a2 == 0) {
                tcg_out_ext32u(s, a0, a1);
                break;
            }
            /* This is a 32-bit zero-extending right shift.  */
            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
            break;
        }
        /* FALLTHRU */
    case INDEX_op_extract_i32:
        if (a2 == 0 && args[3] == 8) {
            tcg_out_ext8u(s, a0, a1);
        } else if (a2 == 0 && args[3] == 16) {
            tcg_out_ext16u(s, a0, a1);
        } else if (a2 == 8 && args[3] == 8) {
            /*
             * On the off-chance that we can use the high-byte registers.
             * Otherwise we emit the same ext16 + shift pattern that we
             * would have gotten from the normal tcg-op.c expansion.
             */
            if (a1 < 4 && a0 < 8) {
                tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
            } else {
                tcg_out_ext16u(s, a0, a1);
                tcg_out_shifti(s, SHIFT_SHR, a0, 8);
            }
        } else {
            g_assert_not_reached();
        }
        break;

    case INDEX_op_sextract_i64:
        if (a2 == 0 && args[3] == 8) {
            tcg_out_ext8s(s, TCG_TYPE_I64, a0, a1);
        } else if (a2 == 0 && args[3] == 16) {
            tcg_out_ext16s(s, TCG_TYPE_I64, a0, a1);
        } else if (a2 == 0 && args[3] == 32) {
            tcg_out_ext32s(s, a0, a1);
        } else {
            g_assert_not_reached();
        }
        break;

    case INDEX_op_sextract_i32:
        if (a2 == 0 && args[3] == 8) {
            tcg_out_ext8s(s, TCG_TYPE_I32, a0, a1);
        } else if (a2 == 0 && args[3] == 16) {
            tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
        } else if (a2 == 8 && args[3] == 8) {
            if (a1 < 4 && a0 < 8) {
                tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
            } else {
                tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
                tcg_out_shifti(s, SHIFT_SAR, a0, 8);
            }
        } else {
            g_assert_not_reached();
        }
        break;

    OP_32_64(extract2):
        /* Note that SHRD outputs to the r/m operand.  */
        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
        tcg_out8(s, args[3]);
        break;

    case INDEX_op_mb:
        tcg_out_mb(s, a0);
        break;
    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
    case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
    case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
    case INDEX_op_ext_i32_i64:  /* Always emitted via tcg_reg_alloc_op.  */
    case INDEX_op_extu_i32_i64:
    case INDEX_op_extrl_i64_i32:
    default:
        g_assert_not_reached();
    }

#undef OP_32_64
}

/*
 * Unsigned-minimum opcodes, indexed by element size (vece).
 * Used both for min/max emission and for lowering unsigned compares.
 */
static const int umin_insn[4] = {
    OPC_PMINUB,
    OPC_PMINUW,
    OPC_PMINUD,
    OPC_VPMINUQ,
};

/*
 * Unsigned-maximum opcodes, indexed by element size (vece).
 * Used both for min/max emission and for lowering unsigned compares.
 */
static const int umax_insn[4] = {
    OPC_PMAXUB,
    OPC_PMAXUW,
    OPC_PMAXUD,
    OPC_VPMAXUQ,
};

/*
 * Emit a vector compare of V1 against V2 into V0 using only the
 * PCMPEQ and PCMPGT forms.  Other conditions are first rewritten by
 * inverting the condition, swapping the operands, or reducing an
 * unsigned compare to an equality test against the unsigned min/max
 * of the operands (which is computed into TCG_TMP_VEC).
 *
 * Returns true when the emitted compare implements the inverse of the
 * requested COND; the caller must account for that.
 */
static bool tcg_out_cmp_vec_noinv(TCGContext *s, TCGType type, unsigned vece,
                                  TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond)
{
    /* Rewrites required to reach TCG_COND_EQ or TCG_COND_GT. */
    enum {
        NEED_INV  = 1,
        NEED_SWAP = 2,
        NEED_UMIN = 4,
        NEED_UMAX = 8,
        INVALID   = 16,
    };
    static const uint8_t cond_fixup[16] = {
        [0 ... 15] = INVALID,
        /* Directly supported. */
        [TCG_COND_EQ] = 0,
        [TCG_COND_GT] = 0,
        /* Signed forms: invert and/or swap into EQ or GT. */
        [TCG_COND_NE] = NEED_INV,
        [TCG_COND_LE] = NEED_INV,
        [TCG_COND_LT] = NEED_SWAP,
        [TCG_COND_GE] = NEED_SWAP | NEED_INV,
        /* Unsigned forms: compare V1 for equality with min/max(V1, V2). */
        [TCG_COND_LEU] = NEED_UMIN,
        [TCG_COND_GTU] = NEED_UMIN | NEED_INV,
        [TCG_COND_GEU] = NEED_UMAX,
        [TCG_COND_LTU] = NEED_UMAX | NEED_INV,
    };
    static int const cmpeq_insn[4] = {
        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
    };
    static int const cmpgt_insn[4] = {
        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
    };

    const int fixup = cond_fixup[cond];
    const bool inverted = (fixup & NEED_INV) != 0;

    assert(!(fixup & INVALID));

    if (inverted) {
        cond = tcg_invert_cond(cond);
    }

    if (fixup & NEED_SWAP) {
        TCGReg tmp = v2;

        v2 = v1;
        v1 = tmp;
        cond = tcg_swap_cond(cond);
    }

    if (fixup & (NEED_UMIN | NEED_UMAX)) {
        const int *minmax_insn = (fixup & NEED_UMIN) ? umin_insn : umax_insn;
        int minmax_opc = minmax_insn[vece];

        /* avx2 does not have 64-bit min/max; adjusted during expand. */
        assert(vece <= MO_32);

        tcg_out_vex_modrm_type(s, minmax_opc, TCG_TMP_VEC, v1, v2, type);
        v2 = TCG_TMP_VEC;
        cond = TCG_COND_EQ;
    }

    /* After the fixups only EQ and GT may remain. */
    if (cond == TCG_COND_EQ) {
        tcg_out_vex_modrm_type(s, cmpeq_insn[vece], v0, v1, v2, type);
    } else if (cond == TCG_COND_GT) {
        tcg_out_vex_modrm_type(s, cmpgt_insn[vece], v0, v1, v2, type);
    } else {
        g_assert_not_reached();
    }
    return inverted;
}

/*
 * Emit an AVX-512 compare (or test) of V1 with V2 whose result is
 * written to mask register k1.  TSTNE/TSTEQ use VPTESTM/VPTESTNM;
 * every other condition uses VPCMP or VPCMPU followed by a predicate
 * immediate byte taken from cond_ext[].
 */
static void tcg_out_cmp_vec_k1(TCGContext *s, TCGType type, unsigned vece,
                               TCGReg v1, TCGReg v2, TCGCond cond)
{
    static const int testm_insn[4] = {
        OPC_VPTESTMB, OPC_VPTESTMW, OPC_VPTESTMD, OPC_VPTESTMQ
    };
    static const int testnm_insn[4] = {
        OPC_VPTESTNMB, OPC_VPTESTNMW, OPC_VPTESTNMD, OPC_VPTESTNMQ
    };
    /* First index: 0 = signed compare, 1 = unsigned compare. */
    static const int cmpm_insn[2][4] = {
        { OPC_VPCMPB, OPC_VPCMPW, OPC_VPCMPD, OPC_VPCMPQ },
        { OPC_VPCMPUB, OPC_VPCMPUW, OPC_VPCMPUD, OPC_VPCMPUQ }
    };
    /* Immediate byte emitted after VPCMP*, shared by signed/unsigned. */
    static const int cond_ext[16] = {
        [TCG_COND_EQ] = 0,
        [TCG_COND_LT] = 1,
        [TCG_COND_LTU] = 1,
        [TCG_COND_LE] = 2,
        [TCG_COND_LEU] = 2,
        [TCG_COND_NEVER] = 3,
        [TCG_COND_NE] = 4,
        [TCG_COND_GE] = 5,
        [TCG_COND_GEU] = 5,
        [TCG_COND_GT] = 6,
        [TCG_COND_GTU] = 6,
        [TCG_COND_ALWAYS] = 7,
    };

    if (cond == TCG_COND_TSTNE) {
        tcg_out_vex_modrm_type(s, testm_insn[vece], /* k1 */ 1, v1, v2, type);
    } else if (cond == TCG_COND_TSTEQ) {
        tcg_out_vex_modrm_type(s, testnm_insn[vece], /* k1 */ 1, v1, v2, type);
    } else {
        int opc = cmpm_insn[is_unsigned_cond(cond)][vece];

        tcg_out_vex_modrm_type(s, opc, /* k1 */ 1, v1, v2, type);
        tcg_out8(s, cond_ext[cond]);
    }
}

/*
 * Convert the mask held in k1 into vector register DEST, using the
 * VPMOVM2{B,W,D,Q} form that matches the element size VECE.
 */
static void tcg_out_k1_to_vec(TCGContext *s, TCGType type,
                              unsigned vece, TCGReg dest)
{
    static const int movm_insn[] = {
        OPC_VPMOVM2B,
        OPC_VPMOVM2W,
        OPC_VPMOVM2D,
        OPC_VPMOVM2Q,
    };
    int opc = movm_insn[vece];

    tcg_out_vex_modrm_type(s, opc, dest, 0, /* k1 */ 1, type);
}

/*
 * Emit V0 = (V1 COND V2) as a vector of per-element results.
 */
static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond)
{
    /* Feature needed to compare into k1 and expand the mask again. */
    const bool have_mask_cmp = vece <= MO_16 ? have_avx512bw : have_avx512dq;
    /* Conditions for which the non-mask path needs no inversion. */
    const bool single_insn = (cond == TCG_COND_EQ ||
                              cond == TCG_COND_LT ||
                              cond == TCG_COND_GT);

    /*
     * With avx512, we have a complete set of comparisons into mask.
     * Unless there's a single insn expansion for the comparison,
     * expand via a mask in k1.
     */
    if (have_mask_cmp && !single_insn) {
        tcg_out_cmp_vec_k1(s, type, vece, v1, v2, cond);
        tcg_out_k1_to_vec(s, type, vece, v0);
        return;
    }

    if (!tcg_out_cmp_vec_noinv(s, type, vece, v0, v1, v2, cond)) {
        return;
    }

    /* The inverse condition was computed: complement V0 with all-ones. */
    tcg_out_dupi_vec(s, type, vece, TCG_TMP_VEC, -1);
    tcg_out_vex_modrm_type(s, OPC_PXOR, v0, v0, TCG_TMP_VEC, type);
}

/*
 * Compare-and-select through mask register k1: compare C1 with C2
 * into k1, then blend V3 and V4 under that mask with VPBLENDM*.
 * V3 == 0 denotes the constant zero operand rather than a register.
 */
static void tcg_out_cmpsel_vec_k1(TCGContext *s, TCGType type, unsigned vece,
                                  TCGReg v0, TCGReg c1, TCGReg c2,
                                  TCGReg v3, TCGReg v4, TCGCond cond)
{
    static const int vpblendm_insn[] = {
        OPC_VPBLENDMB,
        OPC_VPBLENDMW,
        OPC_VPBLENDMD,
        OPC_VPBLENDMQ,
    };
    const bool zero_masking = !v3;

    /* Swap to place constant in V4 to take advantage of zero-masking. */
    if (zero_masking) {
        v3 = v4;
        cond = tcg_invert_cond(cond);
    }

    tcg_out_cmp_vec_k1(s, type, vece, c1, c2, cond);
    tcg_out_evex_modrm_type(s, vpblendm_insn[vece], v0, v4, v3,
                            /* k1 */1, zero_masking, type);
}

/*
 * Emit V0 = (C1 COND C2 ? V3 : V4) per element.
 *
 * Uses the k1 mask path when the required AVX-512 feature is present.
 * Otherwise the compare result is built in TCG_TMP_VEC and combined
 * with PAND/PANDN (zero V3) or VPBLENDVB.
 */
static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece,
                               TCGReg v0, TCGReg c1, TCGReg c2,
                               TCGReg v3, TCGReg v4, TCGCond cond)
{
    const bool have_mask_blend = vece <= MO_16 ? have_avx512bw : have_avx512vl;
    bool inv;

    if (have_mask_blend) {
        tcg_out_cmpsel_vec_k1(s, type, vece, v0, c1, c2, v3, v4, cond);
        return;
    }

    inv = tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond);

    if (v3) {
        /* An inverted compare is absorbed by exchanging the two sources. */
        TCGReg src_v = inv ? v3 : v4;
        TCGReg src_rm = inv ? v4 : v3;

        tcg_out_vex_modrm_type(s, OPC_VPBLENDVB, v0, src_v, src_rm, type);
        /* Trailing byte names TCG_TMP_VEC (the mask) in its high nibble. */
        tcg_out8(s, (TCG_TMP_VEC - TCG_REG_XMM0) << 4);
    } else {
        /*
         * Since XMM0 is 16, the only way we get 0 into V3
         * is via the constant zero constraint.
         */
        int opc = inv ? OPC_PAND : OPC_PANDN;

        tcg_out_vex_modrm_type(s, opc, v0, TCG_TMP_VEC, v4, type);
    }
}

/*
 * Emit host code for a single vector opcode.
 *
 * @opc:        the vector opcode to emit
 * @vecl:       vector length selector; the TCGType is vecl + TCG_TYPE_V64
 * @vece:       element size, used to index the per-size opcode tables
 * @args:       operands of the op (registers or immediates, per opcode)
 * @const_args: per-operand constant flags (not referenced in this function)
 *
 * Most opcodes only select an instruction and jump to one of the shared
 * emission tails: gen_simd (three registers), gen_shift (shift by
 * immediate) or gen_simd_imm8 (three registers plus trailing imm8).
 */
static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
                           unsigned vecl, unsigned vece,
                           const TCGArg args[TCG_MAX_OP_ARGS],
                           const int const_args[TCG_MAX_OP_ARGS])
{
    /*
     * Per-element-size opcode tables, indexed by vece.  OPC_UD2 marks a
     * size for which no instruction is listed; the shared emission tails
     * assert that it is never selected.
     */
    static int const add_insn[4] = {
        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
    };
    static int const ssadd_insn[4] = {
        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
    };
    static int const usadd_insn[4] = {
        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
    };
    static int const sub_insn[4] = {
        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
    };
    static int const sssub_insn[4] = {
        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
    };
    static int const ussub_insn[4] = {
        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
    };
    static int const mul_insn[4] = {
        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
    };
    static int const shift_imm_insn[4] = {
        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
    };
    static int const punpckl_insn[4] = {
        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
    };
    static int const punpckh_insn[4] = {
        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
    };
    static int const packss_insn[4] = {
        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
    };
    static int const packus_insn[4] = {
        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
    };
    static int const smin_insn[4] = {
        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
    };
    static int const smax_insn[4] = {
        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
    };
    static int const rotlv_insn[4] = {
        OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
    };
    static int const rotrv_insn[4] = {
        OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
    };
    static int const shlv_insn[4] = {
        OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
    };
    static int const shrv_insn[4] = {
        OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
    };
    static int const sarv_insn[4] = {
        OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
    };
    static int const shls_insn[4] = {
        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
    };
    static int const shrs_insn[4] = {
        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
    };
    static int const sars_insn[4] = {
        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
    };
    static int const vpshldi_insn[4] = {
        OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
    };
    static int const vpshldv_insn[4] = {
        OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
    };
    static int const vpshrdv_insn[4] = {
        OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
    };
    static int const abs_insn[4] = {
        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
    };

    TCGType type = vecl + TCG_TYPE_V64;
    int insn, sub;
    TCGArg a0, a1, a2, a3;

    a0 = args[0];
    a1 = args[1];
    a2 = args[2];

    switch (opc) {
    case INDEX_op_add_vec:
        insn = add_insn[vece];
        goto gen_simd;
    case INDEX_op_ssadd_vec:
        insn = ssadd_insn[vece];
        goto gen_simd;
    case INDEX_op_usadd_vec:
        insn = usadd_insn[vece];
        goto gen_simd;
    case INDEX_op_sub_vec:
        insn = sub_insn[vece];
        goto gen_simd;
    case INDEX_op_sssub_vec:
        insn = sssub_insn[vece];
        goto gen_simd;
    case INDEX_op_ussub_vec:
        insn = ussub_insn[vece];
        goto gen_simd;
    case INDEX_op_mul_vec:
        insn = mul_insn[vece];
        goto gen_simd;
    case INDEX_op_and_vec:
        insn = OPC_PAND;
        goto gen_simd;
    case INDEX_op_or_vec:
        insn = OPC_POR;
        goto gen_simd;
    case INDEX_op_xor_vec:
        insn = OPC_PXOR;
        goto gen_simd;
    case INDEX_op_smin_vec:
        insn = smin_insn[vece];
        goto gen_simd;
    case INDEX_op_umin_vec:
        insn = umin_insn[vece];
        goto gen_simd;
    case INDEX_op_smax_vec:
        insn = smax_insn[vece];
        goto gen_simd;
    case INDEX_op_umax_vec:
        insn = umax_insn[vece];
        goto gen_simd;
    case INDEX_op_shlv_vec:
        insn = shlv_insn[vece];
        goto gen_simd;
    case INDEX_op_shrv_vec:
        insn = shrv_insn[vece];
        goto gen_simd;
    case INDEX_op_sarv_vec:
        insn = sarv_insn[vece];
        goto gen_simd;
    case INDEX_op_rotlv_vec:
        insn = rotlv_insn[vece];
        goto gen_simd;
    case INDEX_op_rotrv_vec:
        insn = rotrv_insn[vece];
        goto gen_simd;
    case INDEX_op_shls_vec:
        insn = shls_insn[vece];
        goto gen_simd;
    case INDEX_op_shrs_vec:
        insn = shrs_insn[vece];
        goto gen_simd;
    case INDEX_op_sars_vec:
        insn = sars_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_punpckl_vec:
        insn = punpckl_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_punpckh_vec:
        insn = punpckh_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_packss_vec:
        insn = packss_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_packus_vec:
        insn = packus_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_vpshldv_vec:
        insn = vpshldv_insn[vece];
        /*
         * Skip args[1]: tcg_target_op_def gives this op the constraint
         * C_O1_I3(x, 0, x, x), and only args[2]/args[3] are passed on as
         * the source operands.
         */
        a1 = a2;
        a2 = args[3];
        goto gen_simd;
    case INDEX_op_x86_vpshrdv_vec:
        insn = vpshrdv_insn[vece];
        /* Same operand shuffle as INDEX_op_x86_vpshldv_vec. */
        a1 = a2;
        a2 = args[3];
        goto gen_simd;
#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_dup2_vec:
        /* First merge the two 32-bit inputs to a single 64-bit element. */
        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
        /* Then replicate the 64-bit elements across the rest of the vector. */
        if (type != TCG_TYPE_V64) {
            tcg_out_dup_vec(s, type, MO_64, a0, a0);
        }
        break;
#endif
    case INDEX_op_abs_vec:
        insn = abs_insn[vece];
        /* Single-input op: the input goes last and the middle slot is 0. */
        a2 = a1;
        a1 = 0;
        goto gen_simd;
    gen_simd:
        /* Shared tail: emit insn with operands a0, a1, a2. */
        tcg_debug_assert(insn != OPC_UD2);
        tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type);
        break;

    case INDEX_op_cmp_vec:
        tcg_out_cmp_vec(s, type, vece, a0, a1, a2, args[3]);
        break;

    case INDEX_op_cmpsel_vec:
        tcg_out_cmpsel_vec(s, type, vece, a0, a1, a2,
                           args[3], args[4], args[5]);
        break;

    case INDEX_op_andc_vec:
        insn = OPC_PANDN;
        /* Note the swapped inputs: a2 is passed before a1 for PANDN. */
        tcg_out_vex_modrm_type(s, insn, a0, a2, a1, type);
        break;

    /*
     * Shift/rotate by immediate.  'sub' is a small per-operation selector
     * that gen_shift passes in the first operand slot of the instruction.
     */
    case INDEX_op_shli_vec:
        insn = shift_imm_insn[vece];
        sub = 6;
        goto gen_shift;
    case INDEX_op_shri_vec:
        insn = shift_imm_insn[vece];
        sub = 2;
        goto gen_shift;
    case INDEX_op_sari_vec:
        if (vece == MO_64) {
            /* No table entry is used here: request the EVEX + W form. */
            insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
        } else {
            insn = shift_imm_insn[vece];
        }
        sub = 4;
        goto gen_shift;
    case INDEX_op_rotli_vec:
        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
        if (vece == MO_64) {
            insn |= P_VEXW;
        }
        sub = 1;
        goto gen_shift;
    gen_shift:
        /*
         * Shared tail: 'sub' takes the first operand slot, a0 (output) and
         * a1 (input) follow, and the shift count a2 is emitted as an imm8.
         */
        tcg_debug_assert(vece != MO_8);
        tcg_out_vex_modrm_type(s, insn, sub, a0, a1, type);
        tcg_out8(s, a2);
        break;

    case INDEX_op_ld_vec:
        tcg_out_ld(s, type, a0, a1, a2);
        break;
    case INDEX_op_st_vec:
        tcg_out_st(s, type, a0, a1, a2);
        break;
    case INDEX_op_dupm_vec:
        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
        break;

    /* Three-register ops whose imm8 comes from args[3]. */
    case INDEX_op_x86_shufps_vec:
        insn = OPC_SHUFPS;
        sub = args[3];
        goto gen_simd_imm8;
    case INDEX_op_x86_blend_vec:
        if (vece == MO_16) {
            insn = OPC_PBLENDW;
        } else if (vece == MO_32) {
            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
        } else {
            g_assert_not_reached();
        }
        sub = args[3];
        goto gen_simd_imm8;
    case INDEX_op_x86_vperm2i128_vec:
        insn = OPC_VPERM2I128;
        sub = args[3];
        goto gen_simd_imm8;
    case INDEX_op_x86_vpshldi_vec:
        insn = vpshldi_insn[vece];
        sub = args[3];
        goto gen_simd_imm8;

    /*
     * Logical ops implemented with VPTERNLOGQ; 'sub' holds the imm8 that
     * selects the boolean function named in the trailing comment.
     */
    case INDEX_op_not_vec:
        insn = OPC_VPTERNLOGQ;
        /* Single input: use it for both source operands. */
        a2 = a1;
        sub = 0x33; /* !B */
        goto gen_simd_imm8;
    case INDEX_op_nor_vec:
        insn = OPC_VPTERNLOGQ;
        sub = 0x11; /* norCB */
        goto gen_simd_imm8;
    case INDEX_op_nand_vec:
        insn = OPC_VPTERNLOGQ;
        sub = 0x77; /* nandCB */
        goto gen_simd_imm8;
    case INDEX_op_eqv_vec:
        insn = OPC_VPTERNLOGQ;
        sub = 0x99; /* xnorCB */
        goto gen_simd_imm8;
    case INDEX_op_orc_vec:
        insn = OPC_VPTERNLOGQ;
        sub = 0xdd; /* orB!C */
        goto gen_simd_imm8;

    case INDEX_op_bitsel_vec:
        insn = OPC_VPTERNLOGQ;
        a3 = args[3];
        /*
         * Pick the imm8 according to which input, if any, already shares
         * a register with the output a0; when none does, a3 is first
         * copied into a0.
         */
        if (a0 == a1) {
            a1 = a2;
            a2 = a3;
            sub = 0xca; /* A?B:C */
        } else if (a0 == a2) {
            a2 = a3;
            sub = 0xe2; /* B?A:C */
        } else {
            tcg_out_mov(s, type, a0, a3);
            sub = 0xb8; /* B?C:A */
        }
        goto gen_simd_imm8;

    gen_simd_imm8:
        /* Shared tail: emit insn with a0, a1, a2 followed by imm8 'sub'. */
        tcg_debug_assert(insn != OPC_UD2);
        tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type);
        tcg_out8(s, sub);
        break;

    case INDEX_op_x86_psrldq_vec:
        /* Constant 3 in the first operand slot; imm8 count follows. */
        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
        tcg_out8(s, a2);
        break;

    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
    default:
        g_assert_not_reached();
    }
}

/*
 * Return the operand constraint set for opcode @op on this host.
 *
 * Each case returns one of the C_O<n>_I<m>(...) / C_N1_*(...) constraint
 * sets; opcodes not listed return C_NotImplemented.  A few answers depend
 * on detected host features (have_bmi1, have_bmi2, have_lzcnt) or on
 * TCG_TARGET_REG_BITS.  The @type and @flags parameters are not consulted
 * by this implementation.
 */
static TCGConstraintSetIndex
tcg_target_op_def(TCGOpcode op, TCGType type, unsigned flags)
{
    switch (op) {
    case INDEX_op_goto_ptr:
        return C_O0_I1(r);

    /* Host memory loads. */
    case INDEX_op_ld8u_i32:
    case INDEX_op_ld8u_i64:
    case INDEX_op_ld8s_i32:
    case INDEX_op_ld8s_i64:
    case INDEX_op_ld16u_i32:
    case INDEX_op_ld16u_i64:
    case INDEX_op_ld16s_i32:
    case INDEX_op_ld16s_i64:
    case INDEX_op_ld_i32:
    case INDEX_op_ld32u_i64:
    case INDEX_op_ld32s_i64:
    case INDEX_op_ld_i64:
        return C_O1_I1(r, r);

    /* Host memory stores; the constraint differs by store width. */
    case INDEX_op_st8_i32:
    case INDEX_op_st8_i64:
        return C_O0_I2(qi, r);

    case INDEX_op_st16_i32:
    case INDEX_op_st16_i64:
    case INDEX_op_st_i32:
    case INDEX_op_st32_i64:
        return C_O0_I2(ri, r);

    case INDEX_op_st_i64:
        return C_O0_I2(re, r);

    /* Shifts: a different constraint set is used when BMI2 is available. */
    case INDEX_op_shl_i32:
    case INDEX_op_shl_i64:
    case INDEX_op_shr_i32:
    case INDEX_op_shr_i64:
    case INDEX_op_sar_i32:
    case INDEX_op_sar_i64:
        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);

    case INDEX_op_rotl_i32:
    case INDEX_op_rotl_i64:
    case INDEX_op_rotr_i32:
    case INDEX_op_rotr_i64:
        return C_O1_I2(r, 0, ci);

    case INDEX_op_brcond_i32:
    case INDEX_op_brcond_i64:
        return C_O0_I2(r, reT);

    case INDEX_op_bswap16_i32:
    case INDEX_op_bswap16_i64:
    case INDEX_op_bswap32_i32:
    case INDEX_op_bswap32_i64:
    case INDEX_op_bswap64_i64:
    case INDEX_op_extrh_i64_i32:
        return C_O1_I1(r, 0);

    case INDEX_op_ext_i32_i64:
    case INDEX_op_extu_i32_i64:
    case INDEX_op_extrl_i64_i32:
    case INDEX_op_extract_i32:
    case INDEX_op_extract_i64:
    case INDEX_op_sextract_i32:
    case INDEX_op_sextract_i64:
    case INDEX_op_ctpop_i32:
    case INDEX_op_ctpop_i64:
        return C_O1_I1(r, r);

    case INDEX_op_extract2_i32:
    case INDEX_op_extract2_i64:
        return C_O1_I2(r, 0, r);

    case INDEX_op_deposit_i32:
    case INDEX_op_deposit_i64:
        return C_O1_I2(q, 0, qi);

    case INDEX_op_setcond_i32:
    case INDEX_op_setcond_i64:
    case INDEX_op_negsetcond_i32:
    case INDEX_op_negsetcond_i64:
        return C_O1_I2(q, r, reT);

    case INDEX_op_movcond_i32:
    case INDEX_op_movcond_i64:
        return C_O1_I4(r, r, reT, r, 0);

    /* Double-width divide and multiply. */
    case INDEX_op_div2_i32:
    case INDEX_op_div2_i64:
    case INDEX_op_divu2_i32:
    case INDEX_op_divu2_i64:
        return C_O2_I3(a, d, 0, 1, r);

    case INDEX_op_mulu2_i32:
    case INDEX_op_mulu2_i64:
    case INDEX_op_muls2_i32:
    case INDEX_op_muls2_i64:
        return C_O2_I2(a, d, a, r);

    case INDEX_op_add2_i32:
    case INDEX_op_add2_i64:
    case INDEX_op_sub2_i32:
    case INDEX_op_sub2_i64:
        return C_N1_O1_I4(r, r, 0, 1, re, re);

    /* Bit counting: constraint depends on BMI1 / LZCNT availability. */
    case INDEX_op_ctz_i32:
    case INDEX_op_ctz_i64:
        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);

    case INDEX_op_clz_i32:
    case INDEX_op_clz_i64:
        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);

    /* Guest memory accesses. */
    case INDEX_op_qemu_ld_i32:
        return C_O1_I1(r, L);

    case INDEX_op_qemu_st_i32:
        return C_O0_I2(L, L);
    case INDEX_op_qemu_st8_i32:
        return C_O0_I2(s, L);

    /* On a 32-bit host the 64-bit value occupies two operands. */
    case INDEX_op_qemu_ld_i64:
        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);

    case INDEX_op_qemu_st_i64:
        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);

    /* 128-bit guest accesses are only expected on 64-bit hosts. */
    case INDEX_op_qemu_ld_i128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
        return C_O2_I1(r, r, L);
    case INDEX_op_qemu_st_i128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
        return C_O0_I3(L, L, L);

    case INDEX_op_brcond2_i32:
        return C_O0_I4(r, r, ri, ri);

    case INDEX_op_setcond2_i32:
        return C_O1_I4(r, r, r, ri, ri);

    /* Vector loads and stores. */
    case INDEX_op_ld_vec:
    case INDEX_op_dupm_vec:
        return C_O1_I1(x, r);

    case INDEX_op_st_vec:
        return C_O0_I2(x, r);

    /* Vector ops with one output and two vector inputs. */
    case INDEX_op_add_vec:
    case INDEX_op_sub_vec:
    case INDEX_op_mul_vec:
    case INDEX_op_and_vec:
    case INDEX_op_or_vec:
    case INDEX_op_xor_vec:
    case INDEX_op_andc_vec:
    case INDEX_op_orc_vec:
    case INDEX_op_nand_vec:
    case INDEX_op_nor_vec:
    case INDEX_op_eqv_vec:
    case INDEX_op_ssadd_vec:
    case INDEX_op_usadd_vec:
    case INDEX_op_sssub_vec:
    case INDEX_op_ussub_vec:
    case INDEX_op_smin_vec:
    case INDEX_op_umin_vec:
    case INDEX_op_smax_vec:
    case INDEX_op_umax_vec:
    case INDEX_op_shlv_vec:
    case INDEX_op_shrv_vec:
    case INDEX_op_sarv_vec:
    case INDEX_op_rotlv_vec:
    case INDEX_op_rotrv_vec:
    case INDEX_op_shls_vec:
    case INDEX_op_shrs_vec:
    case INDEX_op_sars_vec:
    case INDEX_op_cmp_vec:
    case INDEX_op_x86_shufps_vec:
    case INDEX_op_x86_blend_vec:
    case INDEX_op_x86_packss_vec:
    case INDEX_op_x86_packus_vec:
    case INDEX_op_x86_vperm2i128_vec:
    case INDEX_op_x86_punpckl_vec:
    case INDEX_op_x86_punpckh_vec:
    case INDEX_op_x86_vpshldi_vec:
#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_dup2_vec:
#endif
        return C_O1_I2(x, x, x);

    /* Vector ops with one output and one vector input. */
    case INDEX_op_abs_vec:
    case INDEX_op_dup_vec:
    case INDEX_op_not_vec:
    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
    case INDEX_op_sari_vec:
    case INDEX_op_rotli_vec:
    case INDEX_op_x86_psrldq_vec:
        return C_O1_I1(x, x);

    case INDEX_op_x86_vpshldv_vec:
    case INDEX_op_x86_vpshrdv_vec:
        return C_O1_I3(x, 0, x, x);

    case INDEX_op_bitsel_vec:
        return C_O1_I3(x, x, x, x);
    case INDEX_op_cmpsel_vec:
        return C_O1_I4(x, x, x, xO, x);

    default:
        return C_NotImplemented;
    }
}

/*
 * Report how the backend handles vector opcode @opc for vector type
 * @type and element size @vece:
 *    1  the opcode is emitted directly,
 *   -1  the opcode must be expanded into other operations,
 *    0  the opcode is not supported for this combination.
 * Several answers depend on the detected AVX2 / AVX-512 features.
 */
int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
{
    switch (opc) {
    /* Supported for every element size. */
    case INDEX_op_add_vec:
    case INDEX_op_sub_vec:
    case INDEX_op_and_vec:
    case INDEX_op_or_vec:
    case INDEX_op_xor_vec:
    case INDEX_op_andc_vec:
    case INDEX_op_orc_vec:
    case INDEX_op_nand_vec:
    case INDEX_op_nor_vec:
    case INDEX_op_eqv_vec:
    case INDEX_op_not_vec:
    case INDEX_op_bitsel_vec:
        return 1;

    /* Compares always go through expansion. */
    case INDEX_op_cmp_vec:
    case INDEX_op_cmpsel_vec:
        return -1;

    case INDEX_op_rotli_vec:
        if (have_avx512vl && vece >= MO_32) {
            return 1;
        }
        return -1;

    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
        /* We must expand the operation for MO_8.  */
        return vece != MO_8 ? 1 : -1;

    case INDEX_op_sari_vec:
        if (vece == MO_8) {
            return -1;
        }
        if (vece == MO_16 || vece == MO_32) {
            return 1;
        }
        if (vece != MO_64) {
            return 0;
        }
        if (have_avx512vl) {
            return 1;
        }
        /*
         * We can emulate this for MO_64, but it does not pay off
         * unless we're producing at least 4 values.
         */
        return type >= TCG_TYPE_V256 ? -1 : 0;

    case INDEX_op_shls_vec:
    case INDEX_op_shrs_vec:
        return vece >= MO_16;

    case INDEX_op_sars_vec:
        if (vece == MO_16 || vece == MO_32) {
            return 1;
        }
        if (vece == MO_64) {
            return have_avx512vl;
        }
        return 0;

    case INDEX_op_rotls_vec:
        return vece >= MO_16 ? -1 : 0;

    case INDEX_op_shlv_vec:
    case INDEX_op_shrv_vec:
        if (vece == MO_16) {
            return have_avx512bw;
        }
        if (vece == MO_32 || vece == MO_64) {
            return have_avx2;
        }
        return 0;

    case INDEX_op_sarv_vec:
        if (vece == MO_16) {
            return have_avx512bw;
        }
        if (vece == MO_32) {
            return have_avx2;
        }
        if (vece == MO_64) {
            return have_avx512vl;
        }
        return 0;

    case INDEX_op_rotlv_vec:
    case INDEX_op_rotrv_vec:
        if (vece == MO_16) {
            return have_avx512vbmi2 ? -1 : 0;
        }
        if (vece == MO_32 || vece == MO_64) {
            if (have_avx512vl) {
                return 1;
            }
            return have_avx2 ? -1 : 0;
        }
        return 0;

    case INDEX_op_mul_vec:
        if (vece == MO_8) {
            return -1;
        }
        if (vece == MO_64) {
            return have_avx512dq;
        }
        return 1;

    case INDEX_op_ssadd_vec:
    case INDEX_op_usadd_vec:
    case INDEX_op_sssub_vec:
    case INDEX_op_ussub_vec:
        return vece <= MO_16;

    case INDEX_op_smin_vec:
    case INDEX_op_smax_vec:
    case INDEX_op_umin_vec:
    case INDEX_op_umax_vec:
    case INDEX_op_abs_vec:
        return vece <= MO_32 || have_avx512vl;

    default:
        return 0;
    }
}

/*
 * Expand a byte-element shift by immediate (left, or logical right when
 * @right is set).  The shift is performed on 16-bit elements and the
 * result is then ANDed with a per-byte mask of 0xff shifted by the same
 * amount and truncated to 8 bits.
 */
static void expand_vec_shi(TCGType type, unsigned vece, bool right,
                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    const uint8_t mask = right ? 0xff >> imm : 0xff << imm;

    tcg_debug_assert(vece == MO_8);

    if (right) {
        tcg_gen_shri_vec(MO_16, v0, v1, imm);
    } else {
        tcg_gen_shli_vec(MO_16, v0, v1, imm);
    }
    tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask));
}

/*
 * Expand an arithmetic right shift by immediate for the element sizes
 * that are expanded rather than emitted directly: MO_8 and MO_64.
 * Any other element size is a caller error.
 */
static void expand_vec_sari(TCGType type, unsigned vece,
                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    if (vece == MO_8) {
        /* Unpack to 16-bit, shift, and repack.  */
        TCGv_vec lo = tcg_temp_new_vec(type);
        TCGv_vec hi = tcg_temp_new_vec(type);

        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(lo), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(hi), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        /* V1 was unpacked with itself, hence the extra 8 bits of shift. */
        tcg_gen_sari_vec(MO_16, lo, lo, imm + 8);
        tcg_gen_sari_vec(MO_16, hi, hi, imm + 8);
        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(lo), tcgv_vec_arg(hi));
        tcg_temp_free_vec(lo);
        tcg_temp_free_vec(hi);
        return;
    }

    if (vece == MO_64) {
        TCGv_vec t = tcg_temp_new_vec(type);

        if (imm > 32) {
            /*
             * Use a compare vs 0 to produce the sign-extend, then
             * shift it into place and merge with the logical shift.
             */
            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t,
                            tcg_constant_vec(type, MO_64, 0), v1);
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            tcg_gen_shli_vec(MO_64, t, t, 64 - imm);
            tcg_gen_or_vec(MO_64, v0, v0, t);
        } else {
            /*
             * We can emulate a small sign extend by performing an arithmetic
             * 32-bit shift and overwriting the high half of a 64-bit logical
             * shift.  Note that the ISA says shift of 32 is valid, but TCG
             * does not, so we have to bound the smaller shift -- we get the
             * same result in the high half either way.
             */
            tcg_gen_sari_vec(MO_32, t, v1, MIN(imm, 31));
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
                      tcgv_vec_arg(t), 0xaa);
        }
        tcg_temp_free_vec(t);
        return;
    }

    g_assert_not_reached();
}

/*
 * Expand rotli_vec: rotate every element of V1 left by the immediate IMM
 * and write the result to V0.
 *
 * For elements wider than a byte, when AVX512VBMI2 is available, a single
 * x86_vpshldi_vec op is emitted with V1 supplied as both data operands.
 * Otherwise the rotate is assembled from a left shift by IMM, a logical
 * right shift by (element width - IMM), and an OR of the two.
 */
static void expand_vec_rotli(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    if (vece == MO_8 || !have_avx512vbmi2) {
        TCGv_vec upper = tcg_temp_new_vec(type);

        /* upper = v1 << imm;  v0 = v1 >> (element bits - imm).  */
        tcg_gen_shli_vec(vece, upper, v1, imm);
        tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
        tcg_gen_or_vec(vece, v0, v0, upper);
        tcg_temp_free_vec(upper);
    } else {
        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
    }
}

/*
 * Expand a vector rotate with per-element counts held in SH: rotate V1
 * into V0, to the right when RIGHT is set and to the left otherwise.
 *
 * With AVX512VBMI2 a single x86_vpshrdv_vec / x86_vpshldv_vec op is
 * emitted with V1 supplied as both data operands.  Otherwise the rotate
 * is built from two variable shifts and an OR.
 */
static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
                            TCGv_vec v1, TCGv_vec sh, bool right)
{
    TCGv_vec comp;

    if (have_avx512vbmi2) {
        if (right) {
            vec_gen_4(INDEX_op_x86_vpshrdv_vec, type, vece,
                      tcgv_vec_arg(v0), tcgv_vec_arg(v1),
                      tcgv_vec_arg(v1), tcgv_vec_arg(sh));
        } else {
            vec_gen_4(INDEX_op_x86_vpshldv_vec, type, vece,
                      tcgv_vec_arg(v0), tcgv_vec_arg(v1),
                      tcgv_vec_arg(v1), tcgv_vec_arg(sh));
        }
        return;
    }

    /* comp = (element width in bits) - sh, the complementary count.  */
    comp = tcg_temp_new_vec(type);
    tcg_gen_dupi_vec(vece, comp, 8 << vece);
    tcg_gen_sub_vec(vece, comp, comp, sh);

    if (!right) {
        /* Rotate left: (v1 >> comp) | (v1 << sh).  */
        tcg_gen_shrv_vec(vece, comp, v1, comp);
        tcg_gen_shlv_vec(vece, v0, v1, sh);
    } else {
        /* Rotate right: (v1 << comp) | (v1 >> sh).  */
        tcg_gen_shlv_vec(vece, comp, v1, comp);
        tcg_gen_shrv_vec(vece, v0, v1, sh);
    }
    tcg_gen_or_vec(vece, v0, v0, comp);
    tcg_temp_free_vec(comp);
}

/*
 * Expand rotls_vec: rotate every element of V1 left by the scalar count
 * LSH and write the result to V0.  Byte elements are not accepted.
 *
 * When the host can rotate by a vector of counts (AVX512VL for 32/64-bit
 * elements, AVX512VBMI2 for 16-bit elements) the scalar is broadcast and
 * the variable rotate is used.  Otherwise the rotate is composed from a
 * scalar left shift, a scalar right shift by the negated count masked to
 * the element width, and an OR.
 */
static void expand_vec_rotls(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
{
    TCGv_vec tmp = tcg_temp_new_vec(type);

    tcg_debug_assert(vece != MO_8);

    if (vece >= MO_32 && have_avx512vl) {
        tcg_gen_dup_i32_vec(vece, tmp, lsh);
        tcg_gen_rotlv_vec(vece, v0, v1, tmp);
    } else if (vece < MO_32 && have_avx512vbmi2) {
        tcg_gen_dup_i32_vec(vece, tmp, lsh);
        expand_vec_rotv(type, vece, v0, v1, tmp, false);
    } else {
        TCGv_i32 rsh = tcg_temp_new_i32();

        /* rsh = -lsh modulo the element width in bits.  */
        tcg_gen_neg_i32(rsh, lsh);
        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);

        tcg_gen_shls_vec(vece, tmp, v1, lsh);
        tcg_gen_shrs_vec(vece, v0, v1, rsh);
        tcg_gen_or_vec(vece, v0, v0, tmp);

        tcg_temp_free_i32(rsh);
    }

    tcg_temp_free_vec(tmp);
}

/*
 * Expand an 8-bit element multiply, V0 = V1 * V2, by way of 16-bit
 * multiplies.  Only VECE == MO_8 is accepted.
 *
 * Bytes of V1 are unpacked into words as (0 | x) and bytes of V2 as
 * (y | 0).  The 16-bit product then holds the 8-bit result x * y with
 * 8 bits of right padding.  A logical right shift by 8 brings the result
 * down and clears the high 8 bits of each word before the unsigned
 * saturating pack back to bytes.
 *
 * V64 needs only the low unpack, performed on V128 temporaries, and packs
 * the single product with itself.  V128 and V256 multiply the low and
 * high unpacked halves separately and pack the two products together.
 */
static void expand_vec_mul(TCGType type, unsigned vece,
                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
{
    TCGv_vec lo_a, lo_b, hi_a, hi_b, zero;

    tcg_debug_assert(vece == MO_8);

    if (type == TCG_TYPE_V64) {
        lo_a = tcg_temp_new_vec(TCG_TYPE_V128);
        lo_b = tcg_temp_new_vec(TCG_TYPE_V128);
        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);

        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(lo_a), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(lo_b), tcgv_vec_arg(zero), tcgv_vec_arg(v2));

        tcg_gen_mul_vec(MO_16, lo_a, lo_a, lo_b);
        tcg_gen_shri_vec(MO_16, lo_a, lo_a, 8);

        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(lo_a), tcgv_vec_arg(lo_a));

        tcg_temp_free_vec(lo_a);
        tcg_temp_free_vec(lo_b);
    } else if (type == TCG_TYPE_V128 || type == TCG_TYPE_V256) {
        lo_a = tcg_temp_new_vec(type);
        lo_b = tcg_temp_new_vec(type);
        hi_a = tcg_temp_new_vec(type);
        hi_b = tcg_temp_new_vec(type);
        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);

        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(lo_a), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(lo_b), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(hi_a), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(hi_b), tcgv_vec_arg(zero), tcgv_vec_arg(v2));

        tcg_gen_mul_vec(MO_16, lo_a, lo_a, lo_b);
        tcg_gen_mul_vec(MO_16, hi_a, hi_a, hi_b);
        tcg_gen_shri_vec(MO_16, lo_a, lo_a, 8);
        tcg_gen_shri_vec(MO_16, hi_a, hi_a, 8);

        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(lo_a), tcgv_vec_arg(hi_a));

        tcg_temp_free_vec(lo_a);
        tcg_temp_free_vec(lo_b);
        tcg_temp_free_vec(hi_a);
        tcg_temp_free_vec(hi_b);
    } else {
        g_assert_not_reached();
    }
}

/*
 * Prepare the operands of a vector comparison for code generation.
 *
 * Without AVX512, there are no 64-bit unsigned comparisons, so for that
 * case both inputs are biased by the sign bit, which turns the unsigned
 * ordering into a signed one.  *A1 and *A2 are then redirected to freshly
 * allocated temporaries holding the biased values, and the signed
 * counterpart of COND is returned.  In every other case the arguments
 * are left untouched and COND is returned unchanged; all other swapping
 * and inversion are handled during code generation.
 */
static TCGCond expand_vec_cond(TCGType type, unsigned vece,
                               TCGArg *a1, TCGArg *a2, TCGCond cond)
{
    TCGv_vec lhs, rhs, lhs_biased, rhs_biased, bias;

    if (vece != MO_64 || have_avx512dq || !is_unsigned_cond(cond)) {
        return cond;
    }

    lhs = temp_tcgv_vec(arg_temp(*a1));
    rhs = temp_tcgv_vec(arg_temp(*a2));
    lhs_biased = tcg_temp_new_vec(type);
    rhs_biased = tcg_temp_new_vec(type);
    /* Only the most significant bit of each element is set.  */
    bias = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));

    tcg_gen_sub_vec(vece, lhs_biased, lhs, bias);
    tcg_gen_sub_vec(vece, rhs_biased, rhs, bias);

    *a1 = tcgv_vec_arg(lhs_biased);
    *a2 = tcgv_vec_arg(rhs_biased);
    return tcg_signed_cond(cond);
}

/*
 * Expand cmp_vec: let expand_vec_cond adjust the operands and condition,
 * then emit the comparison op itself.  The op is generated directly with
 * vec_gen_4 so that this expansion does not recurse.
 */
static void expand_vec_cmp(TCGType type, unsigned vece, TCGArg a0,
                           TCGArg a1, TCGArg a2, TCGCond cond)
{
    TCGCond adjusted = expand_vec_cond(type, vece, &a1, &a2, cond);

    vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, adjusted);
}

/*
 * Expand cmpsel_vec: let expand_vec_cond adjust the compared operands
 * (A1, A2) and the condition, then emit the compare-and-select op itself.
 * The op is generated directly with vec_gen_6 so that this expansion does
 * not recurse.
 */
static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGArg a0,
                              TCGArg a1, TCGArg a2,
                              TCGArg a3, TCGArg a4, TCGCond cond)
{
    TCGCond adjusted = expand_vec_cond(type, vece, &a1, &a2, cond);

    vec_gen_6(INDEX_op_cmpsel_vec, type, vece, a0, a1, a2, a3, a4, adjusted);
}

/*
 * Dispatch the expansion of vector opcode OPC to the matching expand_vec_*
 * helper.
 *
 * A0 is the output argument.  Two further TCGArg values are always read
 * from the variable argument list; cmp_vec reads one more and cmpsel_vec
 * three more.  Opcodes not listed here are silently ignored.
 */
void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                       TCGArg a0, ...)
{
    va_list ap;
    TCGArg a1, a2;
    TCGv_vec dst, src;

    va_start(ap, a0);
    a1 = va_arg(ap, TCGArg);
    a2 = va_arg(ap, TCGArg);
    dst = temp_tcgv_vec(arg_temp(a0));
    src = temp_tcgv_vec(arg_temp(a1));

    switch (opc) {
    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
        /* Immediate shift; the flag selects the right-shift form.  */
        expand_vec_shi(type, vece, opc == INDEX_op_shri_vec, dst, src, a2);
        break;

    case INDEX_op_sari_vec:
        expand_vec_sari(type, vece, dst, src, a2);
        break;

    case INDEX_op_rotli_vec:
        expand_vec_rotli(type, vece, dst, src, a2);
        break;

    case INDEX_op_rotls_vec:
        expand_vec_rotls(type, vece, dst, src, temp_tcgv_i32(arg_temp(a2)));
        break;

    case INDEX_op_rotlv_vec:
    case INDEX_op_rotrv_vec:
        expand_vec_rotv(type, vece, dst, src, temp_tcgv_vec(arg_temp(a2)),
                        opc == INDEX_op_rotrv_vec);
        break;

    case INDEX_op_mul_vec:
        expand_vec_mul(type, vece, dst, src, temp_tcgv_vec(arg_temp(a2)));
        break;

    case INDEX_op_cmp_vec:
        {
            TCGArg cond = va_arg(ap, TCGArg);

            expand_vec_cmp(type, vece, a0, a1, a2, cond);
        }
        break;

    case INDEX_op_cmpsel_vec:
        {
            TCGArg sel_true = va_arg(ap, TCGArg);
            TCGArg sel_false = va_arg(ap, TCGArg);
            TCGArg cond = va_arg(ap, TCGArg);

            expand_vec_cmpsel(type, vece, a0, a1, a2,
                              sel_true, sel_false, cond);
        }
        break;

    default:
        break;
    }

    va_end(ap);
}

/*
 * Host registers saved by the prologue and restored by the epilogue.
 *
 * tcg_target_qemu_prologue pushes these in array order and pops them in
 * reverse order.  The DW_CFA_offset entries in debug_frame are written to
 * follow this same ordering, so the two must be kept in sync.
 */
static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    /* Only part of the list when building for Win64. */
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};

/*
 * Compute frame size via macros, to share between tcg_target_qemu_prologue
 * and the debug_frame data that tcg_register_jit hands over.
 */

/*
 * Bytes taken by one register-sized slot per callee-saved register, plus
 * one extra slot (presumably the return address: debug_frame records the
 * return-address column in the first slot below the CFA).
 */
#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

/*
 * PUSH_SIZE plus the static call-argument area and the TCG temp buffer,
 * rounded up to a multiple of TCG_TARGET_STACK_ALIGN.
 */
#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))

/*
 * Generate global QEMU prologue and epilogue code.
 *
 * Emitted sequence, in order:
 *   - push every register in tcg_target_callee_save_regs;
 *   - optionally set up guest_base addressing (segment, 32-bit offset,
 *     or a reserved index register);
 *   - load TCG_AREG0, lower the stack pointer by stack_addend, and jump
 *     indirectly to the translation block;
 *   - the goto_ptr return path (tcg_code_gen_epilogue), which zeroes EAX
 *     and falls through into
 *   - the epilogue (tb_ret_addr), which releases the stack space, pops
 *     the saved registers in reverse order and returns.
 */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* TB prologue */

    /*
     * Reserve some stack space, also for TCG temps.  The pushes below
     * account for PUSH_SIZE, so only the remainder of FRAME_SIZE is
     * subtracted from the stack pointer explicitly.
     */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers.  */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

    /*
     * Without softmmu and with a non-zero guest_base, pick how guest
     * addresses are rebased: a segment if setup_guest_base_seg() provides
     * one, else a constant offset when guest_base fits in a signed 32-bit
     * value, else a dedicated index register loaded with guest_base.
     */
    if (!tcg_use_softmmu && guest_base) {
        int seg = setup_guest_base_seg();
        if (seg != 0) {
            x86_guest_base.seg = seg;
        } else if (guest_base == (int32_t)guest_base) {
            x86_guest_base.ofs = guest_base;
        } else {
            assert(TCG_TARGET_REG_BITS == 64);
            /* Choose R12 because, as a base, it requires a SIB byte. */
            x86_guest_base.index = TCG_REG_R12;
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
            /* Keep the chosen register out of general use from now on. */
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
        }
    }

    if (TCG_TARGET_REG_BITS == 32) {
        /*
         * Both values are fetched from the stack, above the pushed
         * registers: TCG_AREG0 from slot N + 1 and the jump target from
         * slot N + 2, where N is the number of pushed registers.
         * Presumably these are the first and second call arguments,
         * mirroring the 64-bit branch below -- confirm against the caller.
         */
        tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
                   (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb.  The offset is rebased by stack_addend because ESP
           has just been lowered by that amount.  */
        tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                             (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                             + stack_addend);
    } else {
        /* TCG_AREG0 and the jump target arrive in the first two
           integer argument registers.  */
        tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb.  */
        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
    }

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);

    /* Undo the explicit stack adjustment made in the prologue. */
    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    /* Restore the saved registers in the reverse of the push order. */
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
}

/*
 * Backend hook that emits nothing on this host.  Presumably invoked by
 * the common TCG code at the start of each translation block, going by
 * its name -- confirm against the caller.
 */
static void tcg_out_tb_start(TCGContext *s)
{
    /* nothing to do */
}

/*
 * Fill COUNT instruction units starting at P with the x86 single-byte
 * NOP opcode.
 */
static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
    enum { X86_OPC_NOP = 0x90 };

    memset(p, X86_OPC_NOP, count);
}

/*
 * Initialise the backend's register sets: the registers reserved in S,
 * the registers available for each TCG type, and the registers that a
 * call clobbers.
 */
static void tcg_target_init(TCGContext *s)
{
    /* Registers that are never available for general use.  */
    s->reserved_regs = 0;
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
#ifdef _WIN64
    /* These are call saved, and we don't save them, so don't use them. */
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
#endif

    /* Call-clobbered set: every vector register plus EAX, EDX and ECX.  */
    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);

    /* Integer registers available per type.  */
    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;

    if (TCG_TARGET_REG_BITS == 64) {
        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;

        /* Additional call-clobbered registers on 64-bit hosts.  */
#if !defined(_WIN64)
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
#endif
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
    }

    /* Vector types are enabled according to the detected AVX level.  */
    if (have_avx1) {
        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
    }
    if (have_avx2) {
        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
    }
}

/*
 * Unwind description for generated code, as passed to
 * tcg_register_jit_int: the common header followed by the raw bytes of
 * the FDE's call-frame instructions.
 */
typedef struct {
    DebugFrameHeader h;
    /* DW_CFA_def_cfa, the stack-pointer register, 2-byte uleb128 offset. */
    uint8_t fde_def_cfa[4];
    /*
     * Two-byte DW_CFA_offset entries.  The 64-bit table fills all 14
     * bytes; the 32-bit table fills 10 and leaves the rest zero.
     */
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/*
 * We're expecting a 2 byte uleb128 encoded value: fde_def_cfa stores
 * FRAME_SIZE as two bytes of 7 payload bits each, so it must fit in
 * 14 bits.
 */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));

/*
 * Static unwind data for the generated-code frame, one variant per host
 * word size.  Nothing is defined on hosts without ELF, which also leaves
 * ELF_HOST_MACHINE undefined.
 */
#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
/*
 * NOTE(review): under _WIN64, tcg_target_callee_save_regs also contains
 * RDI and RSI, which this table does not describe.  Presumably that is
 * harmless because Windows hosts do not define __ELF__ -- confirm.
 */
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    /*
     * Each entry is (DW_CFA_offset | register number) followed by the
     * offset as a multiple of data_align.
     */
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    /*
     * Each entry is (DW_CFA_offset | register number) followed by the
     * offset as a multiple of data_align.  The unused tail of the array
     * is left zero.
     */
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif

#if defined(ELF_HOST_MACHINE)
/*
 * Register the code buffer BUF of BUF_SIZE bytes together with the static
 * debug_frame unwind data.  Only built when ELF_HOST_MACHINE is defined.
 */
void tcg_register_jit(const void *buf, size_t buf_size)
{
    const void *frame = &debug_frame;
    size_t frame_size = sizeof(debug_frame);

    tcg_register_jit_int(buf, buf_size, frame, frame_size);
}
#endif