/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Used for function call generation. */
#define TCG_TARGET_STACK_ALIGN 16
#if defined(_WIN64)
#define TCG_TARGET_CALL_STACK_OFFSET 32
#else
#define TCG_TARGET_CALL_STACK_OFFSET 0
#endif
#define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_NORMAL
#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
#if defined(_WIN64)
# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_BY_REF
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_BY_VEC
#elif TCG_TARGET_REG_BITS == 64
# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_NORMAL
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_NORMAL
#else
# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_NORMAL
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_BY_REF
#endif

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.
*/ 96 TCG_REG_XMM6, 97 TCG_REG_XMM7, 98#if TCG_TARGET_REG_BITS == 64 99 TCG_REG_XMM8, 100 TCG_REG_XMM9, 101 TCG_REG_XMM10, 102 TCG_REG_XMM11, 103 TCG_REG_XMM12, 104 TCG_REG_XMM13, 105 TCG_REG_XMM14, 106 TCG_REG_XMM15, 107#endif 108#endif 109}; 110 111#define TCG_TMP_VEC TCG_REG_XMM5 112 113static const int tcg_target_call_iarg_regs[] = { 114#if TCG_TARGET_REG_BITS == 64 115#if defined(_WIN64) 116 TCG_REG_RCX, 117 TCG_REG_RDX, 118#else 119 TCG_REG_RDI, 120 TCG_REG_RSI, 121 TCG_REG_RDX, 122 TCG_REG_RCX, 123#endif 124 TCG_REG_R8, 125 TCG_REG_R9, 126#else 127 /* 32 bit mode uses stack based calling convention (GCC default). */ 128#endif 129}; 130 131static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot) 132{ 133 switch (kind) { 134 case TCG_CALL_RET_NORMAL: 135 tcg_debug_assert(slot >= 0 && slot <= 1); 136 return slot ? TCG_REG_EDX : TCG_REG_EAX; 137#ifdef _WIN64 138 case TCG_CALL_RET_BY_VEC: 139 tcg_debug_assert(slot == 0); 140 return TCG_REG_XMM0; 141#endif 142 default: 143 g_assert_not_reached(); 144 } 145} 146 147/* Constants we accept. */ 148#define TCG_CT_CONST_S32 0x100 149#define TCG_CT_CONST_U32 0x200 150#define TCG_CT_CONST_I32 0x400 151#define TCG_CT_CONST_WSZ 0x800 152#define TCG_CT_CONST_TST 0x1000 153#define TCG_CT_CONST_ZERO 0x2000 154 155/* Registers used with L constraint, which are the first argument 156 registers on x86_64, and two random call clobbered registers on 157 i386. */ 158#if TCG_TARGET_REG_BITS == 64 159# define TCG_REG_L0 tcg_target_call_iarg_regs[0] 160# define TCG_REG_L1 tcg_target_call_iarg_regs[1] 161#else 162# define TCG_REG_L0 TCG_REG_EAX 163# define TCG_REG_L1 TCG_REG_EDX 164#endif 165 166#if TCG_TARGET_REG_BITS == 64 167# define ALL_GENERAL_REGS 0x0000ffffu 168# define ALL_VECTOR_REGS 0xffff0000u 169# define ALL_BYTEL_REGS ALL_GENERAL_REGS 170#else 171# define ALL_GENERAL_REGS 0x000000ffu 172# define ALL_VECTOR_REGS 0x00ff0000u 173# define ALL_BYTEL_REGS 0x0000000fu 174#endif 175#define SOFTMMU_RESERVE_REGS \ 176 (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0) 177 178#define have_bmi2 (cpuinfo & CPUINFO_BMI2) 179#define have_lzcnt (cpuinfo & CPUINFO_LZCNT) 180 181static const tcg_insn_unit *tb_ret_addr; 182 183static bool patch_reloc(tcg_insn_unit *code_ptr, int type, 184 intptr_t value, intptr_t addend) 185{ 186 value += addend; 187 switch(type) { 188 case R_386_PC32: 189 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 190 if (value != (int32_t)value) { 191 return false; 192 } 193 /* FALLTHRU */ 194 case R_386_32: 195 tcg_patch32(code_ptr, value); 196 break; 197 case R_386_PC8: 198 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 199 if (value != (int8_t)value) { 200 return false; 201 } 202 tcg_patch8(code_ptr, value); 203 break; 204 default: 205 g_assert_not_reached(); 206 } 207 return true; 208} 209 210/* test if a constant matches the constraint */ 211static bool tcg_target_const_match(int64_t val, int ct, 212 TCGType type, TCGCond cond, int vece) 213{ 214 if (ct & TCG_CT_CONST) { 215 return 1; 216 } 217 if (type == TCG_TYPE_I32) { 218 if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | 219 TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) { 220 return 1; 221 } 222 } else { 223 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) { 224 return 1; 225 } 226 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) { 227 return 1; 228 } 229 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) { 230 return 1; 231 } 232 /* 233 * This will be used in combination with TCG_CT_CONST_S32, 234 * so "normal" TESTQ is already matched. 
Also accept: 235 * TESTQ -> TESTL (uint32_t) 236 * TESTQ -> BT (is_power_of_2) 237 */ 238 if ((ct & TCG_CT_CONST_TST) 239 && is_tst_cond(cond) 240 && (val == (uint32_t)val || is_power_of_2(val))) { 241 return 1; 242 } 243 } 244 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) { 245 return 1; 246 } 247 if ((ct & TCG_CT_CONST_ZERO) && val == 0) { 248 return 1; 249 } 250 return 0; 251} 252 253# define LOWREGMASK(x) ((x) & 7) 254 255#define P_EXT 0x100 /* 0x0f opcode prefix */ 256#define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */ 257#define P_DATA16 0x400 /* 0x66 opcode prefix */ 258#define P_VEXW 0x1000 /* Set VEX.W = 1 */ 259#if TCG_TARGET_REG_BITS == 64 260# define P_REXW P_VEXW /* Set REX.W = 1; match VEXW */ 261# define P_REXB_R 0x2000 /* REG field as byte register */ 262# define P_REXB_RM 0x4000 /* R/M field as byte register */ 263# define P_GS 0x8000 /* gs segment override */ 264#else 265# define P_REXW 0 266# define P_REXB_R 0 267# define P_REXB_RM 0 268# define P_GS 0 269#endif 270#define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */ 271#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */ 272#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */ 273#define P_VEXL 0x80000 /* Set VEX.L = 1 */ 274#define P_EVEX 0x100000 /* Requires EVEX encoding */ 275 276#define OPC_ARITH_EbIb (0x80) 277#define OPC_ARITH_EvIz (0x81) 278#define OPC_ARITH_EvIb (0x83) 279#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */ 280#define OPC_ANDN (0xf2 | P_EXT38) 281#define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3)) 282#define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3)) 283#define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16) 284#define OPC_BSF (0xbc | P_EXT) 285#define OPC_BSR (0xbd | P_EXT) 286#define OPC_BSWAP (0xc8 | P_EXT) 287#define OPC_CALL_Jz (0xe8) 288#define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */ 289#define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3)) 290#define OPC_DEC_r32 (0x48) 291#define OPC_IMUL_GvEv (0xaf | P_EXT) 292#define OPC_IMUL_GvEvIb (0x6b) 293#define OPC_IMUL_GvEvIz (0x69) 294#define OPC_INC_r32 (0x40) 295#define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */ 296#define OPC_JCC_short (0x70) /* ... 
plus condition code */ 297#define OPC_JMP_long (0xe9) 298#define OPC_JMP_short (0xeb) 299#define OPC_LEA (0x8d) 300#define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3) 301#define OPC_MOVB_EvGv (0x88) /* stores, more or less */ 302#define OPC_MOVL_EvGv (0x89) /* stores, more or less */ 303#define OPC_MOVL_GvEv (0x8b) /* loads, more or less */ 304#define OPC_MOVB_EvIz (0xc6) 305#define OPC_MOVL_EvIz (0xc7) 306#define OPC_MOVB_Ib (0xb0) 307#define OPC_MOVL_Iv (0xb8) 308#define OPC_MOVBE_GyMy (0xf0 | P_EXT38) 309#define OPC_MOVBE_MyGy (0xf1 | P_EXT38) 310#define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16) 311#define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16) 312#define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2) 313#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16) 314#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16) 315#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3) 316#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3) 317#define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3) 318#define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16) 319#define OPC_MOVSBL (0xbe | P_EXT) 320#define OPC_MOVSWL (0xbf | P_EXT) 321#define OPC_MOVSLQ (0x63 | P_REXW) 322#define OPC_MOVZBL (0xb6 | P_EXT) 323#define OPC_MOVZWL (0xb7 | P_EXT) 324#define OPC_PABSB (0x1c | P_EXT38 | P_DATA16) 325#define OPC_PABSW (0x1d | P_EXT38 | P_DATA16) 326#define OPC_PABSD (0x1e | P_EXT38 | P_DATA16) 327#define OPC_VPABSQ (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 328#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16) 329#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16) 330#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16) 331#define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16) 332#define OPC_PADDB (0xfc | P_EXT | P_DATA16) 333#define OPC_PADDW (0xfd | P_EXT | P_DATA16) 334#define OPC_PADDD (0xfe | P_EXT | P_DATA16) 335#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16) 336#define OPC_PADDSB (0xec | P_EXT | P_DATA16) 337#define OPC_PADDSW (0xed | P_EXT | P_DATA16) 338#define OPC_PADDUB (0xdc | P_EXT | P_DATA16) 339#define OPC_PADDUW (0xdd | P_EXT | P_DATA16) 340#define OPC_PAND (0xdb | P_EXT | P_DATA16) 341#define OPC_PANDN (0xdf | P_EXT | P_DATA16) 342#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16) 343#define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16) 344#define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16) 345#define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16) 346#define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16) 347#define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16) 348#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16) 349#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16) 350#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16) 351#define OPC_PEXTRD (0x16 | P_EXT3A | P_DATA16) 352#define OPC_PINSRD (0x22 | P_EXT3A | P_DATA16) 353#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16) 354#define OPC_PMAXSW (0xee | P_EXT | P_DATA16) 355#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16) 356#define OPC_VPMAXSQ (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 357#define OPC_PMAXUB (0xde | P_EXT | P_DATA16) 358#define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16) 359#define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16) 360#define OPC_VPMAXUQ (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 361#define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16) 362#define OPC_PMINSW (0xea | P_EXT | P_DATA16) 363#define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16) 364#define OPC_VPMINSQ (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 365#define OPC_PMINUB (0xda | P_EXT | P_DATA16) 366#define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16) 367#define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16) 368#define OPC_VPMINUQ (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 369#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16) 
370#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16) 371#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16) 372#define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16) 373#define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16) 374#define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16) 375#define OPC_PMULLW (0xd5 | P_EXT | P_DATA16) 376#define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16) 377#define OPC_VPMULLQ (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 378#define OPC_POR (0xeb | P_EXT | P_DATA16) 379#define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16) 380#define OPC_PSHUFD (0x70 | P_EXT | P_DATA16) 381#define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2) 382#define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3) 383#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */ 384#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */ 385#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */ 386#define OPC_PSLLW (0xf1 | P_EXT | P_DATA16) 387#define OPC_PSLLD (0xf2 | P_EXT | P_DATA16) 388#define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16) 389#define OPC_PSRAW (0xe1 | P_EXT | P_DATA16) 390#define OPC_PSRAD (0xe2 | P_EXT | P_DATA16) 391#define OPC_VPSRAQ (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX) 392#define OPC_PSRLW (0xd1 | P_EXT | P_DATA16) 393#define OPC_PSRLD (0xd2 | P_EXT | P_DATA16) 394#define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16) 395#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16) 396#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16) 397#define OPC_PSUBD (0xfa | P_EXT | P_DATA16) 398#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16) 399#define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16) 400#define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16) 401#define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16) 402#define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16) 403#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16) 404#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16) 405#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16) 406#define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16) 407#define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16) 408#define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16) 409#define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16) 410#define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16) 411#define OPC_PXOR (0xef | P_EXT | P_DATA16) 412#define OPC_POP_r32 (0x58) 413#define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3) 414#define OPC_PUSH_r32 (0x50) 415#define OPC_PUSH_Iv (0x68) 416#define OPC_PUSH_Ib (0x6a) 417#define OPC_RET (0xc3) 418#define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... 
plus cc */ 419#define OPC_SHIFT_1 (0xd1) 420#define OPC_SHIFT_Ib (0xc1) 421#define OPC_SHIFT_cl (0xd3) 422#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3) 423#define OPC_SHUFPS (0xc6 | P_EXT) 424#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16) 425#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2) 426#define OPC_SHRD_Ib (0xac | P_EXT) 427#define OPC_TESTB (0x84) 428#define OPC_TESTL (0x85) 429#define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3) 430#define OPC_UD2 (0x0b | P_EXT) 431#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16) 432#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16) 433#define OPC_VPBLENDMB (0x66 | P_EXT38 | P_DATA16 | P_EVEX) 434#define OPC_VPBLENDMW (0x66 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 435#define OPC_VPBLENDMD (0x64 | P_EXT38 | P_DATA16 | P_EVEX) 436#define OPC_VPBLENDMQ (0x64 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 437#define OPC_VPCMPB (0x3f | P_EXT3A | P_DATA16 | P_EVEX) 438#define OPC_VPCMPUB (0x3e | P_EXT3A | P_DATA16 | P_EVEX) 439#define OPC_VPCMPW (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 440#define OPC_VPCMPUW (0x3e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 441#define OPC_VPCMPD (0x1f | P_EXT3A | P_DATA16 | P_EVEX) 442#define OPC_VPCMPUD (0x1e | P_EXT3A | P_DATA16 | P_EVEX) 443#define OPC_VPCMPQ (0x1f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 444#define OPC_VPCMPUQ (0x1e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 445#define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16) 446#define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16) 447#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16) 448#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16) 449#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16) 450#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16) 451#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16) 452#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16) 453#define OPC_VPMOVM2B (0x28 | P_EXT38 | P_SIMDF3 | P_EVEX) 454#define OPC_VPMOVM2W (0x28 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 455#define OPC_VPMOVM2D (0x38 | P_EXT38 | P_SIMDF3 | P_EVEX) 456#define OPC_VPMOVM2Q (0x38 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 457#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW) 458#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL) 459#define OPC_VPROLVD (0x15 | P_EXT38 | P_DATA16 | P_EVEX) 460#define OPC_VPROLVQ (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 461#define OPC_VPRORVD (0x14 | P_EXT38 | P_DATA16 | P_EVEX) 462#define OPC_VPRORVQ (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 463#define OPC_VPSHLDW (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 464#define OPC_VPSHLDD (0x71 | P_EXT3A | P_DATA16 | P_EVEX) 465#define OPC_VPSHLDQ (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 466#define OPC_VPSHLDVW (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 467#define OPC_VPSHLDVD (0x71 | P_EXT38 | P_DATA16 | P_EVEX) 468#define OPC_VPSHLDVQ (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 469#define OPC_VPSHRDVW (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 470#define OPC_VPSHRDVD (0x73 | P_EXT38 | P_DATA16 | P_EVEX) 471#define OPC_VPSHRDVQ (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 472#define OPC_VPSLLVW (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 473#define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16) 474#define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW) 475#define OPC_VPSRAVW (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 476#define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16) 477#define OPC_VPSRAVQ (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 478#define OPC_VPSRLVW (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 479#define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16) 480#define OPC_VPSRLVQ (0x45 | P_EXT38 | 
P_DATA16 | P_VEXW) 481#define OPC_VPTERNLOGQ (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 482#define OPC_VPTESTMB (0x26 | P_EXT38 | P_DATA16 | P_EVEX) 483#define OPC_VPTESTMW (0x26 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 484#define OPC_VPTESTMD (0x27 | P_EXT38 | P_DATA16 | P_EVEX) 485#define OPC_VPTESTMQ (0x27 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 486#define OPC_VPTESTNMB (0x26 | P_EXT38 | P_SIMDF3 | P_EVEX) 487#define OPC_VPTESTNMW (0x26 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 488#define OPC_VPTESTNMD (0x27 | P_EXT38 | P_SIMDF3 | P_EVEX) 489#define OPC_VPTESTNMQ (0x27 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 490#define OPC_VZEROUPPER (0x77 | P_EXT) 491#define OPC_XCHG_ax_r32 (0x90) 492#define OPC_XCHG_EvGv (0x87) 493 494#define OPC_GRP3_Eb (0xf6) 495#define OPC_GRP3_Ev (0xf7) 496#define OPC_GRP5 (0xff) 497#define OPC_GRP14 (0x73 | P_EXT | P_DATA16) 498#define OPC_GRPBT (0xba | P_EXT) 499 500#define OPC_GRPBT_BT 4 501#define OPC_GRPBT_BTS 5 502#define OPC_GRPBT_BTR 6 503#define OPC_GRPBT_BTC 7 504 505/* Group 1 opcode extensions for 0x80-0x83. 506 These are also used as modifiers for OPC_ARITH. */ 507#define ARITH_ADD 0 508#define ARITH_OR 1 509#define ARITH_ADC 2 510#define ARITH_SBB 3 511#define ARITH_AND 4 512#define ARITH_SUB 5 513#define ARITH_XOR 6 514#define ARITH_CMP 7 515 516/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */ 517#define SHIFT_ROL 0 518#define SHIFT_ROR 1 519#define SHIFT_SHL 4 520#define SHIFT_SHR 5 521#define SHIFT_SAR 7 522 523/* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */ 524#define EXT3_TESTi 0 525#define EXT3_NOT 2 526#define EXT3_NEG 3 527#define EXT3_MUL 4 528#define EXT3_IMUL 5 529#define EXT3_DIV 6 530#define EXT3_IDIV 7 531 532/* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */ 533#define EXT5_INC_Ev 0 534#define EXT5_DEC_Ev 1 535#define EXT5_CALLN_Ev 2 536#define EXT5_JMPN_Ev 4 537 538/* Condition codes to be added to OPC_JCC_{long,short}. */ 539#define JCC_JMP (-1) 540#define JCC_JO 0x0 541#define JCC_JNO 0x1 542#define JCC_JB 0x2 543#define JCC_JAE 0x3 544#define JCC_JE 0x4 545#define JCC_JNE 0x5 546#define JCC_JBE 0x6 547#define JCC_JA 0x7 548#define JCC_JS 0x8 549#define JCC_JNS 0x9 550#define JCC_JP 0xa 551#define JCC_JNP 0xb 552#define JCC_JL 0xc 553#define JCC_JGE 0xd 554#define JCC_JLE 0xe 555#define JCC_JG 0xf 556 557static const uint8_t tcg_cond_to_jcc[] = { 558 [TCG_COND_EQ] = JCC_JE, 559 [TCG_COND_NE] = JCC_JNE, 560 [TCG_COND_LT] = JCC_JL, 561 [TCG_COND_GE] = JCC_JGE, 562 [TCG_COND_LE] = JCC_JLE, 563 [TCG_COND_GT] = JCC_JG, 564 [TCG_COND_LTU] = JCC_JB, 565 [TCG_COND_GEU] = JCC_JAE, 566 [TCG_COND_LEU] = JCC_JBE, 567 [TCG_COND_GTU] = JCC_JA, 568 [TCG_COND_TSTEQ] = JCC_JE, 569 [TCG_COND_TSTNE] = JCC_JNE, 570}; 571 572#if TCG_TARGET_REG_BITS == 64 573static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x) 574{ 575 int rex; 576 577 if (opc & P_GS) { 578 tcg_out8(s, 0x65); 579 } 580 if (opc & P_DATA16) { 581 /* We should never be asking for both 16 and 64-bit operation. */ 582 tcg_debug_assert((opc & P_REXW) == 0); 583 tcg_out8(s, 0x66); 584 } 585 if (opc & P_SIMDF3) { 586 tcg_out8(s, 0xf3); 587 } else if (opc & P_SIMDF2) { 588 tcg_out8(s, 0xf2); 589 } 590 591 rex = 0; 592 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */ 593 rex |= (r & 8) >> 1; /* REX.R */ 594 rex |= (x & 8) >> 2; /* REX.X */ 595 rex |= (rm & 8) >> 3; /* REX.B */ 596 597 /* P_REXB_{R,RM} indicates that the given register is the low byte. 
598 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do, 599 as otherwise the encoding indicates %[abcd]h. Note that the values 600 that are ORed in merely indicate that the REX byte must be present; 601 those bits get discarded in output. */ 602 rex |= opc & (r >= 4 ? P_REXB_R : 0); 603 rex |= opc & (rm >= 4 ? P_REXB_RM : 0); 604 605 if (rex) { 606 tcg_out8(s, (uint8_t)(rex | 0x40)); 607 } 608 609 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 610 tcg_out8(s, 0x0f); 611 if (opc & P_EXT38) { 612 tcg_out8(s, 0x38); 613 } else if (opc & P_EXT3A) { 614 tcg_out8(s, 0x3a); 615 } 616 } 617 618 tcg_out8(s, opc); 619} 620#else 621static void tcg_out_opc(TCGContext *s, int opc) 622{ 623 if (opc & P_DATA16) { 624 tcg_out8(s, 0x66); 625 } 626 if (opc & P_SIMDF3) { 627 tcg_out8(s, 0xf3); 628 } else if (opc & P_SIMDF2) { 629 tcg_out8(s, 0xf2); 630 } 631 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 632 tcg_out8(s, 0x0f); 633 if (opc & P_EXT38) { 634 tcg_out8(s, 0x38); 635 } else if (opc & P_EXT3A) { 636 tcg_out8(s, 0x3a); 637 } 638 } 639 tcg_out8(s, opc); 640} 641/* Discard the register arguments to tcg_out_opc early, so as not to penalize 642 the 32-bit compilation paths. This method works with all versions of gcc, 643 whereas relying on optimization may not be able to exclude them. */ 644#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc) 645#endif 646 647static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) 648{ 649 tcg_out_opc(s, opc, r, rm, 0); 650 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 651} 652 653static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v, 654 int rm, int index) 655{ 656 int tmp; 657 658 if (opc & P_GS) { 659 tcg_out8(s, 0x65); 660 } 661 /* Use the two byte form if possible, which cannot encode 662 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */ 663 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT 664 && ((rm | index) & 8) == 0) { 665 /* Two byte VEX prefix. */ 666 tcg_out8(s, 0xc5); 667 668 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ 669 } else { 670 /* Three byte VEX prefix. */ 671 tcg_out8(s, 0xc4); 672 673 /* VEX.m-mmmm */ 674 if (opc & P_EXT3A) { 675 tmp = 3; 676 } else if (opc & P_EXT38) { 677 tmp = 2; 678 } else if (opc & P_EXT) { 679 tmp = 1; 680 } else { 681 g_assert_not_reached(); 682 } 683 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ 684 tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */ 685 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ 686 tcg_out8(s, tmp); 687 688 tmp = (opc & P_VEXW ? 0x80 : 0); /* VEX.W */ 689 } 690 691 tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */ 692 /* VEX.pp */ 693 if (opc & P_DATA16) { 694 tmp |= 1; /* 0x66 */ 695 } else if (opc & P_SIMDF3) { 696 tmp |= 2; /* 0xf3 */ 697 } else if (opc & P_SIMDF2) { 698 tmp |= 3; /* 0xf2 */ 699 } 700 tmp |= (~v & 15) << 3; /* VEX.vvvv */ 701 tcg_out8(s, tmp); 702 tcg_out8(s, opc); 703} 704 705static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v, 706 int rm, int index, int aaa, bool z) 707{ 708 /* The entire 4-byte evex prefix; with R' and V' set. 
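       Viewed as a little-endian uint32_t, byte 0 is the 0x62 escape,
       byte 1 is P0 (inverted R/X/B/R' plus the 2-bit mm field),
       byte 2 is P1 (W, inverted vvvv, the fixed '1' bit, and pp), and
       byte 3 is P2 (z, L'L, b, inverted V', and the aaa mask field).
       The deposit32() calls below fill in the per-instruction fields.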
*/ 709 uint32_t p = 0x08041062; 710 int mm, pp; 711 712 tcg_debug_assert(have_avx512vl); 713 714 /* EVEX.mm */ 715 if (opc & P_EXT3A) { 716 mm = 3; 717 } else if (opc & P_EXT38) { 718 mm = 2; 719 } else if (opc & P_EXT) { 720 mm = 1; 721 } else { 722 g_assert_not_reached(); 723 } 724 725 /* EVEX.pp */ 726 if (opc & P_DATA16) { 727 pp = 1; /* 0x66 */ 728 } else if (opc & P_SIMDF3) { 729 pp = 2; /* 0xf3 */ 730 } else if (opc & P_SIMDF2) { 731 pp = 3; /* 0xf2 */ 732 } else { 733 pp = 0; 734 } 735 736 p = deposit32(p, 8, 2, mm); 737 p = deposit32(p, 13, 1, (rm & 8) == 0); /* EVEX.RXB.B */ 738 p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */ 739 p = deposit32(p, 15, 1, (r & 8) == 0); /* EVEX.RXB.R */ 740 p = deposit32(p, 16, 2, pp); 741 p = deposit32(p, 19, 4, ~v); 742 p = deposit32(p, 23, 1, (opc & P_VEXW) != 0); 743 p = deposit32(p, 24, 3, aaa); 744 p = deposit32(p, 29, 2, (opc & P_VEXL) != 0); 745 p = deposit32(p, 31, 1, z); 746 747 tcg_out32(s, p); 748 tcg_out8(s, opc); 749} 750 751static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) 752{ 753 if (opc & P_EVEX) { 754 tcg_out_evex_opc(s, opc, r, v, rm, 0, 0, false); 755 } else { 756 tcg_out_vex_opc(s, opc, r, v, rm, 0); 757 } 758 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 759} 760 761static void tcg_out_vex_modrm_type(TCGContext *s, int opc, 762 int r, int v, int rm, TCGType type) 763{ 764 if (type == TCG_TYPE_V256) { 765 opc |= P_VEXL; 766 } 767 tcg_out_vex_modrm(s, opc, r, v, rm); 768} 769 770static void tcg_out_evex_modrm_type(TCGContext *s, int opc, int r, int v, 771 int rm, int aaa, bool z, TCGType type) 772{ 773 if (type == TCG_TYPE_V256) { 774 opc |= P_VEXL; 775 } 776 tcg_out_evex_opc(s, opc, r, v, rm, 0, aaa, z); 777 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 778} 779 780/* Output an opcode with a full "rm + (index<<shift) + offset" address mode. 781 We handle either RM and INDEX missing with a negative value. In 64-bit 782 mode for absolute addresses, ~RM is the size of the immediate operand 783 that will follow the instruction. */ 784 785static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index, 786 int shift, intptr_t offset) 787{ 788 int mod, len; 789 790 if (index < 0 && rm < 0) { 791 if (TCG_TARGET_REG_BITS == 64) { 792 /* Try for a rip-relative addressing mode. This has replaced 793 the 32-bit-mode absolute addressing encoding. */ 794 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm; 795 intptr_t disp = offset - pc; 796 if (disp == (int32_t)disp) { 797 tcg_out8(s, (LOWREGMASK(r) << 3) | 5); 798 tcg_out32(s, disp); 799 return; 800 } 801 802 /* Try for an absolute address encoding. This requires the 803 use of the MODRM+SIB encoding and is therefore larger than 804 rip-relative addressing. */ 805 if (offset == (int32_t)offset) { 806 tcg_out8(s, (LOWREGMASK(r) << 3) | 4); 807 tcg_out8(s, (4 << 3) | 5); 808 tcg_out32(s, offset); 809 return; 810 } 811 812 /* ??? The memory isn't directly addressable. */ 813 g_assert_not_reached(); 814 } else { 815 /* Absolute address. */ 816 tcg_out8(s, (r << 3) | 5); 817 tcg_out32(s, offset); 818 return; 819 } 820 } 821 822 /* Find the length of the immediate addend. Note that the encoding 823 that would be used for (%ebp) indicates absolute addressing. 
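       (With mod == 0, an r/m field of 5 means disp32 with no base --
       rip-relative in 64-bit mode -- so a plain (%ebp) or (%rbp) base must
       instead be encoded as mod == 1 with an explicit zero displacement,
       e.g. modrm 0x45 followed by 0x00 for 0x0(%ebp) with reg == %eax.)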
*/ 824 if (rm < 0) { 825 mod = 0, len = 4, rm = 5; 826 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) { 827 mod = 0, len = 0; 828 } else if (offset == (int8_t)offset) { 829 mod = 0x40, len = 1; 830 } else { 831 mod = 0x80, len = 4; 832 } 833 834 /* Use a single byte MODRM format if possible. Note that the encoding 835 that would be used for %esp is the escape to the two byte form. */ 836 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) { 837 /* Single byte MODRM format. */ 838 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 839 } else { 840 /* Two byte MODRM+SIB format. */ 841 842 /* Note that the encoding that would place %esp into the index 843 field indicates no index register. In 64-bit mode, the REX.X 844 bit counts, so %r12 can be used as the index. */ 845 if (index < 0) { 846 index = 4; 847 } else { 848 tcg_debug_assert(index != TCG_REG_ESP); 849 } 850 851 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4); 852 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm)); 853 } 854 855 if (len == 1) { 856 tcg_out8(s, offset); 857 } else if (len == 4) { 858 tcg_out32(s, offset); 859 } 860} 861 862static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, 863 int index, int shift, intptr_t offset) 864{ 865 tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 866 tcg_out_sib_offset(s, r, rm, index, shift, offset); 867} 868 869static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v, 870 int rm, int index, int shift, 871 intptr_t offset) 872{ 873 tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 874 tcg_out_sib_offset(s, r, rm, index, shift, offset); 875} 876 877/* A simplification of the above with no index or shift. */ 878static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, 879 int rm, intptr_t offset) 880{ 881 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset); 882} 883 884static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r, 885 int v, int rm, intptr_t offset) 886{ 887 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset); 888} 889 890/* Output an opcode with an expected reference to the constant pool. */ 891static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r) 892{ 893 tcg_out_opc(s, opc, r, 0, 0); 894 /* Absolute for 32-bit, pc-relative for 64-bit. */ 895 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 896 tcg_out32(s, 0); 897} 898 899/* Output an opcode with an expected reference to the constant pool. */ 900static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r) 901{ 902 tcg_out_vex_opc(s, opc, r, 0, 0, 0); 903 /* Absolute for 32-bit, pc-relative for 64-bit. */ 904 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 905 tcg_out32(s, 0); 906} 907 908/* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */ 909static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) 910{ 911 /* Propagate an opcode prefix, such as P_REXW. 
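       For example, tgen_arithr(s, ARITH_ADD + P_REXW, a, b) emits the
       64-bit register form of ADD: the prefix bits ride along in the
       subop value and are split back out below before being folded
       into the opcode.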
*/ 912 int ext = subop & ~0x7; 913 subop &= 0x7; 914 915 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src); 916} 917 918static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) 919{ 920 int rexw = 0; 921 922 if (arg == ret) { 923 return true; 924 } 925 switch (type) { 926 case TCG_TYPE_I64: 927 rexw = P_REXW; 928 /* fallthru */ 929 case TCG_TYPE_I32: 930 if (ret < 16) { 931 if (arg < 16) { 932 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg); 933 } else { 934 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret); 935 } 936 } else { 937 if (arg < 16) { 938 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg); 939 } else { 940 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 941 } 942 } 943 break; 944 945 case TCG_TYPE_V64: 946 tcg_debug_assert(ret >= 16 && arg >= 16); 947 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 948 break; 949 case TCG_TYPE_V128: 950 tcg_debug_assert(ret >= 16 && arg >= 16); 951 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg); 952 break; 953 case TCG_TYPE_V256: 954 tcg_debug_assert(ret >= 16 && arg >= 16); 955 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg); 956 break; 957 958 default: 959 g_assert_not_reached(); 960 } 961 return true; 962} 963 964static const int avx2_dup_insn[4] = { 965 OPC_VPBROADCASTB, OPC_VPBROADCASTW, 966 OPC_VPBROADCASTD, OPC_VPBROADCASTQ, 967}; 968 969static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, 970 TCGReg r, TCGReg a) 971{ 972 if (have_avx2) { 973 tcg_out_vex_modrm_type(s, avx2_dup_insn[vece], r, 0, a, type); 974 } else { 975 switch (vece) { 976 case MO_8: 977 /* ??? With zero in a register, use PSHUFB. */ 978 tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a); 979 a = r; 980 /* FALLTHRU */ 981 case MO_16: 982 tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a); 983 a = r; 984 /* FALLTHRU */ 985 case MO_32: 986 tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a); 987 /* imm8 operand: all output lanes selected from input lane 0. */ 988 tcg_out8(s, 0); 989 break; 990 case MO_64: 991 tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a); 992 break; 993 default: 994 g_assert_not_reached(); 995 } 996 } 997 return true; 998} 999 1000static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece, 1001 TCGReg r, TCGReg base, intptr_t offset) 1002{ 1003 if (have_avx2) { 1004 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0); 1005 tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l, 1006 r, 0, base, offset); 1007 } else { 1008 switch (vece) { 1009 case MO_64: 1010 tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset); 1011 break; 1012 case MO_32: 1013 tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset); 1014 break; 1015 case MO_16: 1016 tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset); 1017 tcg_out8(s, 0); /* imm8 */ 1018 tcg_out_dup_vec(s, type, vece, r, r); 1019 break; 1020 case MO_8: 1021 tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset); 1022 tcg_out8(s, 0); /* imm8 */ 1023 tcg_out_dup_vec(s, type, vece, r, r); 1024 break; 1025 default: 1026 g_assert_not_reached(); 1027 } 1028 } 1029 return true; 1030} 1031 1032static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, 1033 TCGReg ret, int64_t arg) 1034{ 1035 int vex_l = (type == TCG_TYPE_V256 ? 
P_VEXL : 0); 1036 1037 if (arg == 0) { 1038 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 1039 return; 1040 } 1041 if (arg == -1) { 1042 tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret); 1043 return; 1044 } 1045 1046 if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) { 1047 if (have_avx2) { 1048 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret); 1049 } else { 1050 tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret); 1051 } 1052 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 1053 } else { 1054 if (type == TCG_TYPE_V64) { 1055 tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret); 1056 } else if (have_avx2) { 1057 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret); 1058 } else { 1059 tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret); 1060 } 1061 if (TCG_TARGET_REG_BITS == 64) { 1062 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 1063 } else { 1064 new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32); 1065 } 1066 } 1067} 1068 1069static void tcg_out_movi_vec(TCGContext *s, TCGType type, 1070 TCGReg ret, tcg_target_long arg) 1071{ 1072 if (arg == 0) { 1073 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 1074 return; 1075 } 1076 if (arg == -1) { 1077 tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret); 1078 return; 1079 } 1080 1081 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 1082 tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret); 1083 if (TCG_TARGET_REG_BITS == 64) { 1084 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 1085 } else { 1086 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 1087 } 1088} 1089 1090static void tcg_out_movi_int(TCGContext *s, TCGType type, 1091 TCGReg ret, tcg_target_long arg) 1092{ 1093 tcg_target_long diff; 1094 1095 if (arg == 0) { 1096 tgen_arithr(s, ARITH_XOR, ret, ret); 1097 return; 1098 } 1099 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) { 1100 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0); 1101 tcg_out32(s, arg); 1102 return; 1103 } 1104 if (arg == (int32_t)arg) { 1105 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret); 1106 tcg_out32(s, arg); 1107 return; 1108 } 1109 1110 /* Try a 7 byte pc-relative lea before the 10 byte movq. */ 1111 diff = tcg_pcrel_diff(s, (const void *)arg) - 7; 1112 if (diff == (int32_t)diff) { 1113 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0); 1114 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5); 1115 tcg_out32(s, diff); 1116 return; 1117 } 1118 1119 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0); 1120 tcg_out64(s, arg); 1121} 1122 1123static void tcg_out_movi(TCGContext *s, TCGType type, 1124 TCGReg ret, tcg_target_long arg) 1125{ 1126 switch (type) { 1127 case TCG_TYPE_I32: 1128#if TCG_TARGET_REG_BITS == 64 1129 case TCG_TYPE_I64: 1130#endif 1131 if (ret < 16) { 1132 tcg_out_movi_int(s, type, ret, arg); 1133 } else { 1134 tcg_out_movi_vec(s, type, ret, arg); 1135 } 1136 break; 1137 default: 1138 g_assert_not_reached(); 1139 } 1140} 1141 1142static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2) 1143{ 1144 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1145 tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2); 1146 return true; 1147} 1148 1149static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs, 1150 tcg_target_long imm) 1151{ 1152 /* This function is only used for passing structs by reference. 
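       For example, with rd = %rdi, rs = %rsp and imm = 16 this emits
       "leaq 16(%rsp), %rdi" to pass the address of a stack slot.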
     */
    tcg_debug_assert(imm == (int32_t)imm);
    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        g_assert_not_reached();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we only need care for
       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the sse insn. */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}

static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
1264 * 1265 * This specific instance is also used by TCG_CALL_RET_BY_VEC, 1266 * for _WIN64, which must have SSE2 but may not have AVX. 1267 */ 1268 tcg_debug_assert(arg >= 16); 1269 if (have_avx1) { 1270 tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2); 1271 } else { 1272 tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2); 1273 } 1274 break; 1275 case TCG_TYPE_V256: 1276 /* 1277 * The gvec infrastructure only requires 16-byte alignment, 1278 * so here we must use an unaligned store. 1279 */ 1280 tcg_debug_assert(arg >= 16); 1281 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL, 1282 arg, 0, arg1, arg2); 1283 break; 1284 default: 1285 g_assert_not_reached(); 1286 } 1287} 1288 1289static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, 1290 TCGReg base, intptr_t ofs) 1291{ 1292 int rexw = 0; 1293 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) { 1294 if (val != (int32_t)val) { 1295 return false; 1296 } 1297 rexw = P_REXW; 1298 } else if (type != TCG_TYPE_I32) { 1299 return false; 1300 } 1301 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs); 1302 tcg_out32(s, val); 1303 return true; 1304} 1305 1306static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count) 1307{ 1308 /* Propagate an opcode prefix, such as P_DATA16. */ 1309 int ext = subopc & ~0x7; 1310 subopc &= 0x7; 1311 1312 if (count == 1) { 1313 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg); 1314 } else { 1315 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg); 1316 tcg_out8(s, count); 1317 } 1318} 1319 1320static inline void tcg_out_bswap32(TCGContext *s, int reg) 1321{ 1322 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0); 1323} 1324 1325static inline void tcg_out_rolw_8(TCGContext *s, int reg) 1326{ 1327 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8); 1328} 1329 1330static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src) 1331{ 1332 if (TCG_TARGET_REG_BITS == 32 && src >= 4) { 1333 tcg_out_mov(s, TCG_TYPE_I32, dest, src); 1334 if (dest >= 4) { 1335 tcg_out_modrm(s, OPC_ARITH_EvIz, ARITH_AND, dest); 1336 tcg_out32(s, 0xff); 1337 return; 1338 } 1339 src = dest; 1340 } 1341 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src); 1342} 1343 1344static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src) 1345{ 1346 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1347 1348 if (TCG_TARGET_REG_BITS == 32 && src >= 4) { 1349 tcg_out_mov(s, TCG_TYPE_I32, dest, src); 1350 if (dest >= 4) { 1351 tcg_out_shifti(s, SHIFT_SHL, dest, 24); 1352 tcg_out_shifti(s, SHIFT_SAR, dest, 24); 1353 return; 1354 } 1355 src = dest; 1356 } 1357 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src); 1358} 1359 1360static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src) 1361{ 1362 /* movzwl */ 1363 tcg_out_modrm(s, OPC_MOVZWL, dest, src); 1364} 1365 1366static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src) 1367{ 1368 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1369 /* movsw[lq] */ 1370 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src); 1371} 1372 1373static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src) 1374{ 1375 /* 32-bit mov zero extends. 
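       (In 64-bit mode, writing a 32-bit register clears bits 63:32 of the
       full register, so a plain movl with no REX.W is all that is needed.)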
*/ 1376 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src); 1377} 1378 1379static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src) 1380{ 1381 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 1382 tcg_out_modrm(s, OPC_MOVSLQ, dest, src); 1383} 1384 1385static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1386{ 1387 tcg_out_ext32s(s, dest, src); 1388} 1389 1390static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1391{ 1392 if (dest != src) { 1393 tcg_out_ext32u(s, dest, src); 1394 } 1395} 1396 1397static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src) 1398{ 1399 tcg_out_ext32u(s, dest, src); 1400} 1401 1402static inline void tcg_out_bswap64(TCGContext *s, int reg) 1403{ 1404 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0); 1405} 1406 1407static void tgen_arithi(TCGContext *s, int c, int r0, 1408 tcg_target_long val, int cf) 1409{ 1410 int rexw = 0; 1411 1412 if (TCG_TARGET_REG_BITS == 64) { 1413 rexw = c & -8; 1414 c &= 7; 1415 } 1416 1417 switch (c) { 1418 case ARITH_ADD: 1419 case ARITH_SUB: 1420 if (!cf) { 1421 /* 1422 * ??? While INC is 2 bytes shorter than ADDL $1, they also induce 1423 * partial flags update stalls on Pentium4 and are not recommended 1424 * by current Intel optimization manuals. 1425 */ 1426 if (val == 1 || val == -1) { 1427 int is_inc = (c == ARITH_ADD) ^ (val < 0); 1428 if (TCG_TARGET_REG_BITS == 64) { 1429 /* 1430 * The single-byte increment encodings are re-tasked 1431 * as the REX prefixes. Use the MODRM encoding. 1432 */ 1433 tcg_out_modrm(s, OPC_GRP5 + rexw, 1434 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0); 1435 } else { 1436 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0); 1437 } 1438 return; 1439 } 1440 if (val == 128) { 1441 /* 1442 * Facilitate using an 8-bit immediate. Carry is inverted 1443 * by this transformation, so do it only if cf == 0. 1444 */ 1445 c ^= ARITH_ADD ^ ARITH_SUB; 1446 val = -128; 1447 } 1448 } 1449 break; 1450 1451 case ARITH_AND: 1452 if (TCG_TARGET_REG_BITS == 64) { 1453 if (val == 0xffffffffu) { 1454 tcg_out_ext32u(s, r0, r0); 1455 return; 1456 } 1457 if (val == (uint32_t)val) { 1458 /* AND with no high bits set can use a 32-bit operation. */ 1459 rexw = 0; 1460 } 1461 } 1462 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1463 tcg_out_ext8u(s, r0, r0); 1464 return; 1465 } 1466 if (val == 0xffffu) { 1467 tcg_out_ext16u(s, r0, r0); 1468 return; 1469 } 1470 break; 1471 1472 case ARITH_OR: 1473 case ARITH_XOR: 1474 if (val >= 0x80 && val <= 0xff 1475 && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1476 tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0); 1477 tcg_out8(s, val); 1478 return; 1479 } 1480 break; 1481 } 1482 1483 if (val == (int8_t)val) { 1484 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0); 1485 tcg_out8(s, val); 1486 return; 1487 } 1488 if (rexw == 0 || val == (int32_t)val) { 1489 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0); 1490 tcg_out32(s, val); 1491 return; 1492 } 1493 1494 g_assert_not_reached(); 1495} 1496 1497static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val) 1498{ 1499 if (val != 0) { 1500 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0); 1501 } 1502} 1503 1504/* Set SMALL to force a short forward branch. 
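   A short branch uses a 1-byte rel8 displacement (2-byte insn); the long
   forms fall back to rel32, 5 bytes for JMP and 6 bytes for Jcc with its
   0x0f prefix -- hence the -2, -5 and -6 adjustments below.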
*/ 1505static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small) 1506{ 1507 int32_t val, val1; 1508 1509 if (l->has_value) { 1510 val = tcg_pcrel_diff(s, l->u.value_ptr); 1511 val1 = val - 2; 1512 if ((int8_t)val1 == val1) { 1513 if (opc == -1) { 1514 tcg_out8(s, OPC_JMP_short); 1515 } else { 1516 tcg_out8(s, OPC_JCC_short + opc); 1517 } 1518 tcg_out8(s, val1); 1519 } else { 1520 tcg_debug_assert(!small); 1521 if (opc == -1) { 1522 tcg_out8(s, OPC_JMP_long); 1523 tcg_out32(s, val - 5); 1524 } else { 1525 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1526 tcg_out32(s, val - 6); 1527 } 1528 } 1529 } else if (small) { 1530 if (opc == -1) { 1531 tcg_out8(s, OPC_JMP_short); 1532 } else { 1533 tcg_out8(s, OPC_JCC_short + opc); 1534 } 1535 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1); 1536 s->code_ptr += 1; 1537 } else { 1538 if (opc == -1) { 1539 tcg_out8(s, OPC_JMP_long); 1540 } else { 1541 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1542 } 1543 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4); 1544 s->code_ptr += 4; 1545 } 1546} 1547 1548static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1, 1549 TCGArg arg2, int const_arg2, int rexw) 1550{ 1551 int jz, js; 1552 1553 if (!is_tst_cond(cond)) { 1554 if (!const_arg2) { 1555 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2); 1556 } else if (arg2 == 0) { 1557 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1); 1558 } else { 1559 tcg_debug_assert(!rexw || arg2 == (int32_t)arg2); 1560 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0); 1561 } 1562 return tcg_cond_to_jcc[cond]; 1563 } 1564 1565 jz = tcg_cond_to_jcc[cond]; 1566 js = (cond == TCG_COND_TSTNE ? JCC_JS : JCC_JNS); 1567 1568 if (!const_arg2) { 1569 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2); 1570 return jz; 1571 } 1572 1573 if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) { 1574 if (arg2 == 0x80) { 1575 tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1); 1576 return js; 1577 } 1578 if (arg2 == 0xff) { 1579 tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1); 1580 return jz; 1581 } 1582 tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1); 1583 tcg_out8(s, arg2); 1584 return jz; 1585 } 1586 1587 if ((arg2 & ~0xff00) == 0 && arg1 < 4) { 1588 if (arg2 == 0x8000) { 1589 tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4); 1590 return js; 1591 } 1592 if (arg2 == 0xff00) { 1593 tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4); 1594 return jz; 1595 } 1596 tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4); 1597 tcg_out8(s, arg2 >> 8); 1598 return jz; 1599 } 1600 1601 if (arg2 == 0xffff) { 1602 tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1); 1603 return jz; 1604 } 1605 if (arg2 == 0xffffffffu) { 1606 tcg_out_modrm(s, OPC_TESTL, arg1, arg1); 1607 return jz; 1608 } 1609 1610 if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) { 1611 int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE); 1612 int sh = ctz64(arg2); 1613 1614 rexw = (sh & 32 ? 
P_REXW : 0); 1615 if ((sh & 31) == 31) { 1616 tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1); 1617 return js; 1618 } else { 1619 tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1); 1620 tcg_out8(s, sh); 1621 return jc; 1622 } 1623 } 1624 1625 if (rexw) { 1626 if (arg2 == (uint32_t)arg2) { 1627 rexw = 0; 1628 } else { 1629 tcg_debug_assert(arg2 == (int32_t)arg2); 1630 } 1631 } 1632 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1); 1633 tcg_out32(s, arg2); 1634 return jz; 1635} 1636 1637static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond, 1638 TCGArg arg1, TCGArg arg2, int const_arg2, 1639 TCGLabel *label, bool small) 1640{ 1641 int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw); 1642 tcg_out_jxx(s, jcc, label, small); 1643} 1644 1645static void tgen_brcond(TCGContext *s, TCGType type, TCGCond cond, 1646 TCGReg arg1, TCGReg arg2, TCGLabel *label) 1647{ 1648 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1649 tcg_out_brcond(s, rexw, cond, arg1, arg2, false, label, false); 1650} 1651 1652static void tgen_brcondi(TCGContext *s, TCGType type, TCGCond cond, 1653 TCGReg arg1, tcg_target_long arg2, TCGLabel *label) 1654{ 1655 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1656 tcg_out_brcond(s, rexw, cond, arg1, arg2, true, label, false); 1657} 1658 1659static const TCGOutOpBrcond outop_brcond = { 1660 .base.static_constraint = C_O0_I2(r, reT), 1661 .out_rr = tgen_brcond, 1662 .out_ri = tgen_brcondi, 1663}; 1664 1665#if TCG_TARGET_REG_BITS == 32 1666static void tcg_out_brcond2(TCGContext *s, const TCGArg *args, 1667 const int *const_args, bool small) 1668{ 1669 TCGLabel *label_next = gen_new_label(); 1670 TCGLabel *label_this = arg_label(args[5]); 1671 TCGCond cond = args[4]; 1672 1673 switch (cond) { 1674 case TCG_COND_EQ: 1675 case TCG_COND_TSTEQ: 1676 tcg_out_brcond(s, 0, tcg_invert_cond(cond), 1677 args[0], args[2], const_args[2], label_next, 1); 1678 tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3], 1679 label_this, small); 1680 break; 1681 1682 case TCG_COND_NE: 1683 case TCG_COND_TSTNE: 1684 tcg_out_brcond(s, 0, cond, args[0], args[2], const_args[2], 1685 label_this, small); 1686 tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3], 1687 label_this, small); 1688 break; 1689 1690 default: 1691 tcg_out_brcond(s, 0, tcg_high_cond(cond), args[1], 1692 args[3], const_args[3], label_this, small); 1693 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1694 tcg_out_brcond(s, 0, tcg_unsigned_cond(cond), args[0], 1695 args[2], const_args[2], label_this, small); 1696 break; 1697 } 1698 tcg_out_label(s, label_next); 1699} 1700#endif 1701 1702static void tcg_out_setcond(TCGContext *s, TCGType type, TCGCond cond, 1703 TCGReg dest, TCGReg arg1, TCGArg arg2, 1704 bool const_arg2, bool neg) 1705{ 1706 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1707 int cmp_rexw = rexw; 1708 bool inv = false; 1709 bool cleared; 1710 int jcc; 1711 1712 switch (cond) { 1713 case TCG_COND_NE: 1714 inv = true; 1715 /* fall through */ 1716 case TCG_COND_EQ: 1717 /* If arg2 is 0, convert to LTU/GEU vs 1. */ 1718 if (const_arg2 && arg2 == 0) { 1719 arg2 = 1; 1720 goto do_ltu; 1721 } 1722 break; 1723 1724 case TCG_COND_TSTNE: 1725 inv = true; 1726 /* fall through */ 1727 case TCG_COND_TSTEQ: 1728 /* If arg2 is -1, convert to LTU/GEU vs 1. 
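           (x & -1) is just x, so a test against an all-ones mask is the
           same as comparing x with zero, and x == 0 is exactly the
           unsigned x < 1 handled by the do_ltu path below.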
*/ 1729 if (const_arg2 && arg2 == 0xffffffffu) { 1730 arg2 = 1; 1731 cmp_rexw = 0; 1732 goto do_ltu; 1733 } 1734 break; 1735 1736 case TCG_COND_LEU: 1737 inv = true; 1738 /* fall through */ 1739 case TCG_COND_GTU: 1740 /* If arg2 is a register, swap for LTU/GEU. */ 1741 if (!const_arg2) { 1742 TCGReg t = arg1; 1743 arg1 = arg2; 1744 arg2 = t; 1745 goto do_ltu; 1746 } 1747 break; 1748 1749 case TCG_COND_GEU: 1750 inv = true; 1751 /* fall through */ 1752 case TCG_COND_LTU: 1753 do_ltu: 1754 /* 1755 * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU. 1756 * We can then use NEG or INC to produce the desired result. 1757 * This is always smaller than the SETCC expansion. 1758 */ 1759 tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw); 1760 1761 /* X - X - C = -C = (C ? -1 : 0) */ 1762 tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest); 1763 if (inv && neg) { 1764 /* ~(C ? -1 : 0) = (C ? 0 : -1) */ 1765 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest); 1766 } else if (inv) { 1767 /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */ 1768 tgen_arithi(s, ARITH_ADD, dest, 1, 0); 1769 } else if (!neg) { 1770 /* -(C ? -1 : 0) = (C ? 1 : 0) */ 1771 tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest); 1772 } 1773 return; 1774 1775 case TCG_COND_GE: 1776 inv = true; 1777 /* fall through */ 1778 case TCG_COND_LT: 1779 /* If arg2 is 0, extract the sign bit. */ 1780 if (const_arg2 && arg2 == 0) { 1781 tcg_out_mov(s, type, dest, arg1); 1782 if (inv) { 1783 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest); 1784 } 1785 tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw, 1786 dest, rexw ? 63 : 31); 1787 return; 1788 } 1789 break; 1790 1791 default: 1792 break; 1793 } 1794 1795 /* 1796 * If dest does not overlap the inputs, clearing it first is preferred. 1797 * The XOR breaks any false dependency for the low-byte write to dest, 1798 * and is also one byte smaller than MOVZBL. 
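     * (SETcc writes only the low 8 bits of dest, so without the XOR the
     * upper bits would keep stale data and the later use of dest would
     * depend on whatever last wrote the full register.)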
1799 */ 1800 cleared = false; 1801 if (dest != arg1 && (const_arg2 || dest != arg2)) { 1802 tgen_arithr(s, ARITH_XOR, dest, dest); 1803 cleared = true; 1804 } 1805 1806 jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw); 1807 tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest); 1808 1809 if (!cleared) { 1810 tcg_out_ext8u(s, dest, dest); 1811 } 1812 if (neg) { 1813 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest); 1814 } 1815} 1816 1817static void tgen_setcond(TCGContext *s, TCGType type, TCGCond cond, 1818 TCGReg dest, TCGReg arg1, TCGReg arg2) 1819{ 1820 tcg_out_setcond(s, type, cond, dest, arg1, arg2, false, false); 1821} 1822 1823static void tgen_setcondi(TCGContext *s, TCGType type, TCGCond cond, 1824 TCGReg dest, TCGReg arg1, tcg_target_long arg2) 1825{ 1826 tcg_out_setcond(s, type, cond, dest, arg1, arg2, true, false); 1827} 1828 1829static const TCGOutOpSetcond outop_setcond = { 1830 .base.static_constraint = C_O1_I2(q, r, reT), 1831 .out_rrr = tgen_setcond, 1832 .out_rri = tgen_setcondi, 1833}; 1834 1835static void tgen_negsetcond(TCGContext *s, TCGType type, TCGCond cond, 1836 TCGReg dest, TCGReg arg1, TCGReg arg2) 1837{ 1838 tcg_out_setcond(s, type, cond, dest, arg1, arg2, false, true); 1839} 1840 1841static void tgen_negsetcondi(TCGContext *s, TCGType type, TCGCond cond, 1842 TCGReg dest, TCGReg arg1, tcg_target_long arg2) 1843{ 1844 tcg_out_setcond(s, type, cond, dest, arg1, arg2, true, true); 1845} 1846 1847static const TCGOutOpSetcond outop_negsetcond = { 1848 .base.static_constraint = C_O1_I2(q, r, reT), 1849 .out_rrr = tgen_negsetcond, 1850 .out_rri = tgen_negsetcondi, 1851}; 1852 1853#if TCG_TARGET_REG_BITS == 32 1854static void tcg_out_setcond2(TCGContext *s, const TCGArg *args, 1855 const int *const_args) 1856{ 1857 TCGArg new_args[6]; 1858 TCGLabel *label_true, *label_over; 1859 1860 memcpy(new_args, args+1, 5*sizeof(TCGArg)); 1861 1862 if (args[0] == args[1] || args[0] == args[2] 1863 || (!const_args[3] && args[0] == args[3]) 1864 || (!const_args[4] && args[0] == args[4])) { 1865 /* When the destination overlaps with one of the argument 1866 registers, don't do anything tricky. */ 1867 label_true = gen_new_label(); 1868 label_over = gen_new_label(); 1869 1870 new_args[5] = label_arg(label_true); 1871 tcg_out_brcond2(s, new_args, const_args+1, 1); 1872 1873 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1874 tcg_out_jxx(s, JCC_JMP, label_over, 1); 1875 tcg_out_label(s, label_true); 1876 1877 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1); 1878 tcg_out_label(s, label_over); 1879 } else { 1880 /* When the destination does not overlap one of the arguments, 1881 clear the destination first, jump if cond false, and emit an 1882 increment in the true case. This results in smaller code. 
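           Compared with the branchy variant above, this saves the second
           movi and the unconditional jump over it.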
*/ 1883 1884 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1885 1886 label_over = gen_new_label(); 1887 new_args[4] = tcg_invert_cond(new_args[4]); 1888 new_args[5] = label_arg(label_over); 1889 tcg_out_brcond2(s, new_args, const_args+1, 1); 1890 1891 tgen_arithi(s, ARITH_ADD, args[0], 1, 0); 1892 tcg_out_label(s, label_over); 1893 } 1894} 1895#endif 1896 1897static void tcg_out_cmov(TCGContext *s, int jcc, int rexw, 1898 TCGReg dest, TCGReg v1) 1899{ 1900 tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1); 1901} 1902 1903static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond, 1904 TCGReg dest, TCGReg c1, TCGArg c2, int const_c2, 1905 TCGReg v1) 1906{ 1907 int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw); 1908 tcg_out_cmov(s, jcc, rexw, dest, v1); 1909} 1910 1911static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest) 1912{ 1913 intptr_t disp = tcg_pcrel_diff(s, dest) - 5; 1914 1915 if (disp == (int32_t)disp) { 1916 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0); 1917 tcg_out32(s, disp); 1918 } else { 1919 /* rip-relative addressing into the constant pool. 1920 This is 6 + 8 = 14 bytes, as compared to using an 1921 immediate load 10 + 6 = 16 bytes, plus we may 1922 be able to re-use the pool constant for more calls. */ 1923 tcg_out_opc(s, OPC_GRP5, 0, 0, 0); 1924 tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5); 1925 new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4); 1926 tcg_out32(s, 0); 1927 } 1928} 1929 1930static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest, 1931 const TCGHelperInfo *info) 1932{ 1933 tcg_out_branch(s, 1, dest); 1934 1935#ifndef _WIN32 1936 if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) { 1937 /* 1938 * The sysv i386 abi for struct return places a reference as the 1939 * first argument of the stack, and pops that argument with the 1940 * return statement. Since we want to retain the aligned stack 1941 * pointer for the callee, we do not want to actually push that 1942 * argument before the call but rely on the normal store to the 1943 * stack slot. But we do need to compensate for the pop in order 1944 * to reset our correct stack pointer value. 1945 * Pushing a garbage value back onto the stack is quickest. 1946 */ 1947 tcg_out_push(s, TCG_REG_EAX); 1948 } 1949#endif 1950} 1951 1952static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest) 1953{ 1954 tcg_out_branch(s, 0, dest); 1955} 1956 1957static void tcg_out_nopn(TCGContext *s, int n) 1958{ 1959 int i; 1960 /* Emit 1 or 2 operand size prefixes for the standard one byte nop, 1961 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the 1962 * duplicate prefix, and all of the interesting recent cores can 1963 * decode and discard the duplicates in a single cycle. 1964 */ 1965 tcg_debug_assert(n >= 1); 1966 for (i = 1; i < n; ++i) { 1967 tcg_out8(s, 0x66); 1968 } 1969 tcg_out8(s, 0x90); 1970} 1971 1972typedef struct { 1973 TCGReg base; 1974 int index; 1975 int ofs; 1976 int seg; 1977 TCGAtomAlign aa; 1978} HostAddress; 1979 1980bool tcg_target_has_memory_bswap(MemOp memop) 1981{ 1982 TCGAtomAlign aa; 1983 1984 if (!have_movbe) { 1985 return false; 1986 } 1987 if ((memop & MO_SIZE) < MO_128) { 1988 return true; 1989 } 1990 1991 /* 1992 * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA, 1993 * but do allow a pair of 64-bit operations, i.e. MOVBEQ. 
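 * (MOVBE has no 16-byte form and VMOVDQA/VMOVDQU cannot byte-swap, so a
 * swapped 16-byte access is only offered when the required atomicity lets
 * it be split into two MOVBEQ operations.)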
1994 */ 1995 aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true); 1996 return aa.atom < MO_128; 1997} 1998 1999/* 2000 * Because i686 has no register parameters and because x86_64 has xchg 2001 * to handle addr/data register overlap, we have placed all input arguments 2002 * before we might need a scratch reg. 2003 * 2004 * Even then, a scratch is only needed for l->raddr. Rather than expose 2005 * a general-purpose scratch when we don't actually know it's available, 2006 * use the ra_gen hook to load into RAX if needed. 2007 */ 2008#if TCG_TARGET_REG_BITS == 64 2009static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg) 2010{ 2011 if (arg < 0) { 2012 arg = TCG_REG_RAX; 2013 } 2014 tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr); 2015 return arg; 2016} 2017static const TCGLdstHelperParam ldst_helper_param = { 2018 .ra_gen = ldst_ra_gen 2019}; 2020#else 2021static const TCGLdstHelperParam ldst_helper_param = { }; 2022#endif 2023 2024static void tcg_out_vec_to_pair(TCGContext *s, TCGType type, 2025 TCGReg l, TCGReg h, TCGReg v) 2026{ 2027 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2028 2029 /* vpmov{d,q} %v, %l */ 2030 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l); 2031 /* vpextr{d,q} $1, %v, %h */ 2032 tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h); 2033 tcg_out8(s, 1); 2034} 2035 2036static void tcg_out_pair_to_vec(TCGContext *s, TCGType type, 2037 TCGReg v, TCGReg l, TCGReg h) 2038{ 2039 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2040 2041 /* vmov{d,q} %l, %v */ 2042 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l); 2043 /* vpinsr{d,q} $1, %h, %v, %v */ 2044 tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h); 2045 tcg_out8(s, 1); 2046} 2047 2048/* 2049 * Generate code for the slow path for a load at the end of the block 2050 */ 2051static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2052{ 2053 MemOp opc = get_memop(l->oi); 2054 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2055 2056 /* resolve label address */ 2057 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2058 if (label_ptr[1]) { 2059 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2060 } 2061 2062 tcg_out_ld_helper_args(s, l, &ldst_helper_param); 2063 tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]); 2064 tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param); 2065 2066 tcg_out_jmp(s, l->raddr); 2067 return true; 2068} 2069 2070/* 2071 * Generate code for the slow path for a store at the end of the block 2072 */ 2073static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2074{ 2075 MemOp opc = get_memop(l->oi); 2076 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2077 2078 /* resolve label address */ 2079 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2080 if (label_ptr[1]) { 2081 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2082 } 2083 2084 tcg_out_st_helper_args(s, l, &ldst_helper_param); 2085 tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]); 2086 2087 tcg_out_jmp(s, l->raddr); 2088 return true; 2089} 2090 2091#ifdef CONFIG_USER_ONLY 2092static HostAddress x86_guest_base = { 2093 .index = -1 2094}; 2095 2096#if defined(__x86_64__) && defined(__linux__) 2097# include <asm/prctl.h> 2098# include <sys/prctl.h> 2099int arch_prctl(int code, unsigned long addr); 2100static inline int setup_guest_base_seg(void) 2101{ 2102 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) { 2103 return P_GS; 2104 } 2105 return 0; 2106} 2107#define setup_guest_base_seg setup_guest_base_seg 2108#elif 
defined(__x86_64__) && \ 2109 (defined (__FreeBSD__) || defined (__FreeBSD_kernel__)) 2110# include <machine/sysarch.h> 2111static inline int setup_guest_base_seg(void) 2112{ 2113 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) { 2114 return P_GS; 2115 } 2116 return 0; 2117} 2118#define setup_guest_base_seg setup_guest_base_seg 2119#endif 2120#else 2121# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; })) 2122#endif /* CONFIG_USER_ONLY */ 2123#ifndef setup_guest_base_seg 2124# define setup_guest_base_seg() 0 2125#endif 2126 2127#define MIN_TLB_MASK_TABLE_OFS INT_MIN 2128 2129/* 2130 * For softmmu, perform the TLB load and compare. 2131 * For useronly, perform any required alignment tests. 2132 * In both cases, return a TCGLabelQemuLdst structure if the slow path 2133 * is required and fill in @h with the host address for the fast path. 2134 */ 2135static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, 2136 TCGReg addr, MemOpIdx oi, bool is_ld) 2137{ 2138 TCGLabelQemuLdst *ldst = NULL; 2139 MemOp opc = get_memop(oi); 2140 MemOp s_bits = opc & MO_SIZE; 2141 unsigned a_mask; 2142 2143 if (tcg_use_softmmu) { 2144 h->index = TCG_REG_L0; 2145 h->ofs = 0; 2146 h->seg = 0; 2147 } else { 2148 *h = x86_guest_base; 2149 } 2150 h->base = addr; 2151 h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128); 2152 a_mask = (1 << h->aa.align) - 1; 2153 2154 if (tcg_use_softmmu) { 2155 int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read) 2156 : offsetof(CPUTLBEntry, addr_write); 2157 TCGType ttype = TCG_TYPE_I32; 2158 TCGType tlbtype = TCG_TYPE_I32; 2159 int trexw = 0, hrexw = 0, tlbrexw = 0; 2160 unsigned mem_index = get_mmuidx(oi); 2161 unsigned s_mask = (1 << s_bits) - 1; 2162 int fast_ofs = tlb_mask_table_ofs(s, mem_index); 2163 int tlb_mask; 2164 2165 ldst = new_ldst_label(s); 2166 ldst->is_ld = is_ld; 2167 ldst->oi = oi; 2168 ldst->addr_reg = addr; 2169 2170 if (TCG_TARGET_REG_BITS == 64) { 2171 ttype = s->addr_type; 2172 trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW); 2173 if (TCG_TYPE_PTR == TCG_TYPE_I64) { 2174 hrexw = P_REXW; 2175 if (s->page_bits + s->tlb_dyn_max_bits > 32) { 2176 tlbtype = TCG_TYPE_I64; 2177 tlbrexw = P_REXW; 2178 } 2179 } 2180 } 2181 2182 tcg_out_mov(s, tlbtype, TCG_REG_L0, addr); 2183 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0, 2184 s->page_bits - CPU_TLB_ENTRY_BITS); 2185 2186 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0, 2187 fast_ofs + offsetof(CPUTLBDescFast, mask)); 2188 2189 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0, 2190 fast_ofs + offsetof(CPUTLBDescFast, table)); 2191 2192 /* 2193 * If the required alignment is at least as large as the access, 2194 * simply copy the address and mask. For lesser alignments, 2195 * check that we don't cross pages for the complete access. 2196 */ 2197 if (a_mask >= s_mask) { 2198 tcg_out_mov(s, ttype, TCG_REG_L1, addr); 2199 } else { 2200 tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1, 2201 addr, s_mask - a_mask); 2202 } 2203 tlb_mask = s->page_mask | a_mask; 2204 tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0); 2205 2206 /* cmp 0(TCG_REG_L0), TCG_REG_L1 */ 2207 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, 2208 TCG_REG_L1, TCG_REG_L0, cmp_ofs); 2209 2210 /* jne slow_path */ 2211 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2212 ldst->label_ptr[0] = s->code_ptr; 2213 s->code_ptr += 4; 2214 2215 /* TLB Hit. 
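           The load below fetches CPUTLBEntry.addend into TCG_REG_L0; the
           fast path then forms the host address as the guest address (base)
           plus TCG_REG_L0 (index), per the HostAddress filled in above.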
*/ 2216 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0, 2217 offsetof(CPUTLBEntry, addend)); 2218 } else if (a_mask) { 2219 int jcc; 2220 2221 ldst = new_ldst_label(s); 2222 ldst->is_ld = is_ld; 2223 ldst->oi = oi; 2224 ldst->addr_reg = addr; 2225 2226 /* jne slow_path */ 2227 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addr, a_mask, true, false); 2228 tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0); 2229 ldst->label_ptr[0] = s->code_ptr; 2230 s->code_ptr += 4; 2231 } 2232 2233 return ldst; 2234} 2235 2236static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2237 HostAddress h, TCGType type, MemOp memop) 2238{ 2239 bool use_movbe = false; 2240 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 2241 int movop = OPC_MOVL_GvEv; 2242 2243 /* Do big-endian loads with movbe. */ 2244 if (memop & MO_BSWAP) { 2245 tcg_debug_assert(have_movbe); 2246 use_movbe = true; 2247 movop = OPC_MOVBE_GyMy; 2248 } 2249 2250 switch (memop & MO_SSIZE) { 2251 case MO_UB: 2252 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo, 2253 h.base, h.index, 0, h.ofs); 2254 break; 2255 case MO_SB: 2256 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo, 2257 h.base, h.index, 0, h.ofs); 2258 break; 2259 case MO_UW: 2260 if (use_movbe) { 2261 /* There is no extending movbe; only low 16-bits are modified. */ 2262 if (datalo != h.base && datalo != h.index) { 2263 /* XOR breaks dependency chains. */ 2264 tgen_arithr(s, ARITH_XOR, datalo, datalo); 2265 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2266 datalo, h.base, h.index, 0, h.ofs); 2267 } else { 2268 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2269 datalo, h.base, h.index, 0, h.ofs); 2270 tcg_out_ext16u(s, datalo, datalo); 2271 } 2272 } else { 2273 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo, 2274 h.base, h.index, 0, h.ofs); 2275 } 2276 break; 2277 case MO_SW: 2278 if (use_movbe) { 2279 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2280 datalo, h.base, h.index, 0, h.ofs); 2281 tcg_out_ext16s(s, type, datalo, datalo); 2282 } else { 2283 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg, 2284 datalo, h.base, h.index, 0, h.ofs); 2285 } 2286 break; 2287 case MO_UL: 2288 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2289 h.base, h.index, 0, h.ofs); 2290 break; 2291#if TCG_TARGET_REG_BITS == 64 2292 case MO_SL: 2293 if (use_movbe) { 2294 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo, 2295 h.base, h.index, 0, h.ofs); 2296 tcg_out_ext32s(s, datalo, datalo); 2297 } else { 2298 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo, 2299 h.base, h.index, 0, h.ofs); 2300 } 2301 break; 2302#endif 2303 case MO_UQ: 2304 if (TCG_TARGET_REG_BITS == 64) { 2305 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2306 h.base, h.index, 0, h.ofs); 2307 break; 2308 } 2309 if (use_movbe) { 2310 TCGReg t = datalo; 2311 datalo = datahi; 2312 datahi = t; 2313 } 2314 if (h.base == datalo || h.index == datalo) { 2315 tcg_out_modrm_sib_offset(s, OPC_LEA, datahi, 2316 h.base, h.index, 0, h.ofs); 2317 tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0); 2318 tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4); 2319 } else { 2320 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2321 h.base, h.index, 0, h.ofs); 2322 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2323 h.base, h.index, 0, h.ofs + 4); 2324 } 2325 break; 2326 2327 case MO_128: 2328 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2329 2330 /* 2331 * Without 16-byte atomicity, use integer regs. 
2332 * That is where we want the data, and it allows bswaps. 2333 */ 2334 if (h.aa.atom < MO_128) { 2335 if (use_movbe) { 2336 TCGReg t = datalo; 2337 datalo = datahi; 2338 datahi = t; 2339 } 2340 if (h.base == datalo || h.index == datalo) { 2341 tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi, 2342 h.base, h.index, 0, h.ofs); 2343 tcg_out_modrm_offset(s, movop + P_REXW + h.seg, 2344 datalo, datahi, 0); 2345 tcg_out_modrm_offset(s, movop + P_REXW + h.seg, 2346 datahi, datahi, 8); 2347 } else { 2348 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2349 h.base, h.index, 0, h.ofs); 2350 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, 2351 h.base, h.index, 0, h.ofs + 8); 2352 } 2353 break; 2354 } 2355 2356 /* 2357 * With 16-byte atomicity, a vector load is required. 2358 * If we already have 16-byte alignment, then VMOVDQA always works. 2359 * Else if VMOVDQU has atomicity with dynamic alignment, use that. 2360 * Else use we require a runtime test for alignment for VMOVDQA; 2361 * use VMOVDQU on the unaligned nonatomic path for simplicity. 2362 */ 2363 if (h.aa.align >= MO_128) { 2364 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2365 TCG_TMP_VEC, 0, 2366 h.base, h.index, 0, h.ofs); 2367 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2368 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2369 TCG_TMP_VEC, 0, 2370 h.base, h.index, 0, h.ofs); 2371 } else { 2372 TCGLabel *l1 = gen_new_label(); 2373 TCGLabel *l2 = gen_new_label(); 2374 int jcc; 2375 2376 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2377 tcg_out_jxx(s, jcc, l1, true); 2378 2379 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2380 TCG_TMP_VEC, 0, 2381 h.base, h.index, 0, h.ofs); 2382 tcg_out_jxx(s, JCC_JMP, l2, true); 2383 2384 tcg_out_label(s, l1); 2385 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2386 TCG_TMP_VEC, 0, 2387 h.base, h.index, 0, h.ofs); 2388 tcg_out_label(s, l2); 2389 } 2390 tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC); 2391 break; 2392 2393 default: 2394 g_assert_not_reached(); 2395 } 2396} 2397 2398static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi, 2399 TCGReg addr, MemOpIdx oi, TCGType data_type) 2400{ 2401 TCGLabelQemuLdst *ldst; 2402 HostAddress h; 2403 2404 ldst = prepare_host_addr(s, &h, addr, oi, true); 2405 tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi)); 2406 2407 if (ldst) { 2408 ldst->type = data_type; 2409 ldst->datalo_reg = datalo; 2410 ldst->datahi_reg = datahi; 2411 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2412 } 2413} 2414 2415static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2416 HostAddress h, MemOp memop) 2417{ 2418 bool use_movbe = false; 2419 int movop = OPC_MOVL_EvGv; 2420 2421 /* 2422 * Do big-endian stores with movbe or system-mode. 2423 * User-only without movbe will have its swapping done generically. 2424 */ 2425 if (memop & MO_BSWAP) { 2426 tcg_debug_assert(have_movbe); 2427 use_movbe = true; 2428 movop = OPC_MOVBE_MyGy; 2429 } 2430 2431 switch (memop & MO_SIZE) { 2432 case MO_8: 2433 /* This is handled with constraints on INDEX_op_qemu_st8_i32. 
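       Without a REX prefix only %al/%bl/%cl/%dl have byte encodings, which
       is what the assert below enforces for i386; the 's' constraint on
       qemu_st8_i32 keeps the register allocator within that set.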
*/ 2434 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4); 2435 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg, 2436 datalo, h.base, h.index, 0, h.ofs); 2437 break; 2438 case MO_16: 2439 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo, 2440 h.base, h.index, 0, h.ofs); 2441 break; 2442 case MO_32: 2443 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2444 h.base, h.index, 0, h.ofs); 2445 break; 2446 case MO_64: 2447 if (TCG_TARGET_REG_BITS == 64) { 2448 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2449 h.base, h.index, 0, h.ofs); 2450 } else { 2451 if (use_movbe) { 2452 TCGReg t = datalo; 2453 datalo = datahi; 2454 datahi = t; 2455 } 2456 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2457 h.base, h.index, 0, h.ofs); 2458 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2459 h.base, h.index, 0, h.ofs + 4); 2460 } 2461 break; 2462 2463 case MO_128: 2464 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2465 2466 /* 2467 * Without 16-byte atomicity, use integer regs. 2468 * That is where we have the data, and it allows bswaps. 2469 */ 2470 if (h.aa.atom < MO_128) { 2471 if (use_movbe) { 2472 TCGReg t = datalo; 2473 datalo = datahi; 2474 datahi = t; 2475 } 2476 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2477 h.base, h.index, 0, h.ofs); 2478 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, 2479 h.base, h.index, 0, h.ofs + 8); 2480 break; 2481 } 2482 2483 /* 2484 * With 16-byte atomicity, a vector store is required. 2485 * If we already have 16-byte alignment, then VMOVDQA always works. 2486 * Else if VMOVDQU has atomicity with dynamic alignment, use that. 2487 * Else use we require a runtime test for alignment for VMOVDQA; 2488 * use VMOVDQU on the unaligned nonatomic path for simplicity. 2489 */ 2490 tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi); 2491 if (h.aa.align >= MO_128) { 2492 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2493 TCG_TMP_VEC, 0, 2494 h.base, h.index, 0, h.ofs); 2495 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2496 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2497 TCG_TMP_VEC, 0, 2498 h.base, h.index, 0, h.ofs); 2499 } else { 2500 TCGLabel *l1 = gen_new_label(); 2501 TCGLabel *l2 = gen_new_label(); 2502 int jcc; 2503 2504 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2505 tcg_out_jxx(s, jcc, l1, true); 2506 2507 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2508 TCG_TMP_VEC, 0, 2509 h.base, h.index, 0, h.ofs); 2510 tcg_out_jxx(s, JCC_JMP, l2, true); 2511 2512 tcg_out_label(s, l1); 2513 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2514 TCG_TMP_VEC, 0, 2515 h.base, h.index, 0, h.ofs); 2516 tcg_out_label(s, l2); 2517 } 2518 break; 2519 2520 default: 2521 g_assert_not_reached(); 2522 } 2523} 2524 2525static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi, 2526 TCGReg addr, MemOpIdx oi, TCGType data_type) 2527{ 2528 TCGLabelQemuLdst *ldst; 2529 HostAddress h; 2530 2531 ldst = prepare_host_addr(s, &h, addr, oi, false); 2532 tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi)); 2533 2534 if (ldst) { 2535 ldst->type = data_type; 2536 ldst->datalo_reg = datalo; 2537 ldst->datahi_reg = datahi; 2538 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2539 } 2540} 2541 2542static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0) 2543{ 2544 /* Reuse the zeroing that exists for goto_ptr. 
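       The goto_ptr return path at tcg_code_gen_epilogue loads 0 into EAX
       and falls through into the epilogue, so exit_tb(0) can jump straight
       there instead of materialising the return value here.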
*/ 2545 if (a0 == 0) { 2546 tcg_out_jmp(s, tcg_code_gen_epilogue); 2547 } else { 2548 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0); 2549 tcg_out_jmp(s, tb_ret_addr); 2550 } 2551} 2552 2553static void tcg_out_goto_tb(TCGContext *s, int which) 2554{ 2555 /* 2556 * Jump displacement must be aligned for atomic patching; 2557 * see if we need to add extra nops before jump 2558 */ 2559 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr; 2560 if (gap != 1) { 2561 tcg_out_nopn(s, gap - 1); 2562 } 2563 tcg_out8(s, OPC_JMP_long); /* jmp im */ 2564 set_jmp_insn_offset(s, which); 2565 tcg_out32(s, 0); 2566 set_jmp_reset_offset(s, which); 2567} 2568 2569void tb_target_set_jmp_target(const TranslationBlock *tb, int n, 2570 uintptr_t jmp_rx, uintptr_t jmp_rw) 2571{ 2572 /* patch the branch destination */ 2573 uintptr_t addr = tb->jmp_target_addr[n]; 2574 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4)); 2575 /* no need to flush icache explicitly */ 2576} 2577 2578 2579static void tgen_add(TCGContext *s, TCGType type, 2580 TCGReg a0, TCGReg a1, TCGReg a2) 2581{ 2582 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2583 2584 if (a0 == a1) { 2585 tgen_arithr(s, ARITH_ADD + rexw, a0, a2); 2586 } else if (a0 == a2) { 2587 tgen_arithr(s, ARITH_ADD + rexw, a0, a1); 2588 } else { 2589 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, 0); 2590 } 2591} 2592 2593static void tgen_addi(TCGContext *s, TCGType type, 2594 TCGReg a0, TCGReg a1, tcg_target_long a2) 2595{ 2596 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2597 2598 if (a0 == a1) { 2599 tgen_arithi(s, ARITH_ADD + rexw, a0, a2, false); 2600 } else { 2601 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, -1, 0, a2); 2602 } 2603} 2604 2605static const TCGOutOpBinary outop_add = { 2606 .base.static_constraint = C_O1_I2(r, r, re), 2607 .out_rrr = tgen_add, 2608 .out_rri = tgen_addi, 2609}; 2610 2611static void tgen_and(TCGContext *s, TCGType type, 2612 TCGReg a0, TCGReg a1, TCGReg a2) 2613{ 2614 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2615 tgen_arithr(s, ARITH_AND + rexw, a0, a2); 2616} 2617 2618static void tgen_andi(TCGContext *s, TCGType type, 2619 TCGReg a0, TCGReg a1, tcg_target_long a2) 2620{ 2621 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2622 tgen_arithi(s, ARITH_AND + rexw, a0, a2, false); 2623} 2624 2625static const TCGOutOpBinary outop_and = { 2626 .base.static_constraint = C_O1_I2(r, 0, reZ), 2627 .out_rrr = tgen_and, 2628 .out_rri = tgen_andi, 2629}; 2630 2631static void tgen_andc(TCGContext *s, TCGType type, 2632 TCGReg a0, TCGReg a1, TCGReg a2) 2633{ 2634 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2635 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1); 2636} 2637 2638static TCGConstraintSetIndex cset_andc(TCGType type, unsigned flags) 2639{ 2640 return have_bmi1 ? C_O1_I2(r, r, r) : C_NotImplemented; 2641} 2642 2643static const TCGOutOpBinary outop_andc = { 2644 .base.static_constraint = C_Dynamic, 2645 .base.dynamic_constraint = cset_andc, 2646 .out_rrr = tgen_andc, 2647}; 2648 2649static void tgen_clz(TCGContext *s, TCGType type, 2650 TCGReg a0, TCGReg a1, TCGReg a2) 2651{ 2652 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2653 int jcc; 2654 2655 if (have_lzcnt) { 2656 tcg_out_modrm(s, OPC_LZCNT + rexw, a0, a1); 2657 jcc = JCC_JB; 2658 } else { 2659 /* Recall that the output of BSR is the index not the count. */ 2660 tcg_out_modrm(s, OPC_BSR + rexw, a0, a1); 2661 tgen_arithi(s, ARITH_XOR + rexw, a0, rexw ? 63 : 31, 0); 2662 2663 /* Since we have destroyed the flags from BSR, we have to re-test. 
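       The non-LZCNT path is thus roughly:
         bsr a1,a0; xor $31 (or $63),a0; re-test a1 against zero; cmov a2,a0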
*/ 2664 jcc = tcg_out_cmp(s, TCG_COND_EQ, a1, 0, 1, rexw); 2665 } 2666 tcg_out_cmov(s, jcc, rexw, a0, a2); 2667} 2668 2669static void tgen_clzi(TCGContext *s, TCGType type, 2670 TCGReg a0, TCGReg a1, tcg_target_long a2) 2671{ 2672 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2673 tcg_out_modrm(s, OPC_LZCNT + rexw, a0, a1); 2674} 2675 2676static TCGConstraintSetIndex cset_clz(TCGType type, unsigned flags) 2677{ 2678 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 2679} 2680 2681static const TCGOutOpBinary outop_clz = { 2682 .base.static_constraint = C_Dynamic, 2683 .base.dynamic_constraint = cset_clz, 2684 .out_rrr = tgen_clz, 2685 .out_rri = tgen_clzi, 2686}; 2687 2688static void tgen_ctpop(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 2689{ 2690 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2691 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1); 2692} 2693 2694static TCGConstraintSetIndex cset_ctpop(TCGType type, unsigned flags) 2695{ 2696 return have_popcnt ? C_O1_I1(r, r) : C_NotImplemented; 2697} 2698 2699static const TCGOutOpUnary outop_ctpop = { 2700 .base.static_constraint = C_Dynamic, 2701 .base.dynamic_constraint = cset_ctpop, 2702 .out_rr = tgen_ctpop, 2703}; 2704 2705static void tgen_ctz(TCGContext *s, TCGType type, 2706 TCGReg a0, TCGReg a1, TCGReg a2) 2707{ 2708 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2709 int jcc; 2710 2711 if (have_bmi1) { 2712 tcg_out_modrm(s, OPC_TZCNT + rexw, a0, a1); 2713 jcc = JCC_JB; 2714 } else { 2715 tcg_out_modrm(s, OPC_BSF + rexw, a0, a1); 2716 jcc = JCC_JE; 2717 } 2718 tcg_out_cmov(s, jcc, rexw, a0, a2); 2719} 2720 2721static void tgen_ctzi(TCGContext *s, TCGType type, 2722 TCGReg a0, TCGReg a1, tcg_target_long a2) 2723{ 2724 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2725 tcg_out_modrm(s, OPC_TZCNT + rexw, a0, a1); 2726} 2727 2728static TCGConstraintSetIndex cset_ctz(TCGType type, unsigned flags) 2729{ 2730 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 2731} 2732 2733static const TCGOutOpBinary outop_ctz = { 2734 .base.static_constraint = C_Dynamic, 2735 .base.dynamic_constraint = cset_ctz, 2736 .out_rrr = tgen_ctz, 2737 .out_rri = tgen_ctzi, 2738}; 2739 2740static const TCGOutOpBinary outop_divs = { 2741 .base.static_constraint = C_NotImplemented, 2742}; 2743 2744static void tgen_divs2(TCGContext *s, TCGType type, 2745 TCGReg a0, TCGReg a1, TCGReg a4) 2746{ 2747 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2748 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, a4); 2749} 2750 2751static const TCGOutOpDivRem outop_divs2 = { 2752 .base.static_constraint = C_O2_I3(a, d, 0, 1, r), 2753 .out_rr01r = tgen_divs2, 2754}; 2755 2756static const TCGOutOpBinary outop_divu = { 2757 .base.static_constraint = C_NotImplemented, 2758}; 2759 2760static void tgen_divu2(TCGContext *s, TCGType type, 2761 TCGReg a0, TCGReg a1, TCGReg a4) 2762{ 2763 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2764 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, a4); 2765} 2766 2767static const TCGOutOpDivRem outop_divu2 = { 2768 .base.static_constraint = C_O2_I3(a, d, 0, 1, r), 2769 .out_rr01r = tgen_divu2, 2770}; 2771 2772static const TCGOutOpBinary outop_eqv = { 2773 .base.static_constraint = C_NotImplemented, 2774}; 2775 2776static void tgen_mul(TCGContext *s, TCGType type, 2777 TCGReg a0, TCGReg a1, TCGReg a2) 2778{ 2779 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 2780 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2); 2781} 2782 2783static void tgen_muli(TCGContext *s, TCGType type, 2784 TCGReg a0, TCGReg a1, tcg_target_long a2) 2785{ 2786 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2787 2788 if (a2 == (int8_t)a2) { 2789 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0); 2790 tcg_out8(s, a2); 2791 } else { 2792 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0); 2793 tcg_out32(s, a2); 2794 } 2795} 2796 2797static const TCGOutOpBinary outop_mul = { 2798 .base.static_constraint = C_O1_I2(r, 0, re), 2799 .out_rrr = tgen_mul, 2800 .out_rri = tgen_muli, 2801}; 2802 2803static void tgen_muls2(TCGContext *s, TCGType type, 2804 TCGReg a0, TCGReg a1, TCGReg a2, TCGReg a3) 2805{ 2806 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2807 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, a3); 2808} 2809 2810static const TCGOutOpMul2 outop_muls2 = { 2811 .base.static_constraint = C_O2_I2(a, d, a, r), 2812 .out_rrrr = tgen_muls2, 2813}; 2814 2815static const TCGOutOpBinary outop_mulsh = { 2816 .base.static_constraint = C_NotImplemented, 2817}; 2818 2819static const TCGOutOpBinary outop_muluh = { 2820 .base.static_constraint = C_NotImplemented, 2821}; 2822 2823static void tgen_mulu2(TCGContext *s, TCGType type, 2824 TCGReg a0, TCGReg a1, TCGReg a2, TCGReg a3) 2825{ 2826 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2827 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, a3); 2828} 2829 2830static const TCGOutOpMul2 outop_mulu2 = { 2831 .base.static_constraint = C_O2_I2(a, d, a, r), 2832 .out_rrrr = tgen_mulu2, 2833}; 2834 2835static const TCGOutOpBinary outop_nand = { 2836 .base.static_constraint = C_NotImplemented, 2837}; 2838 2839static const TCGOutOpBinary outop_nor = { 2840 .base.static_constraint = C_NotImplemented, 2841}; 2842 2843static void tgen_or(TCGContext *s, TCGType type, 2844 TCGReg a0, TCGReg a1, TCGReg a2) 2845{ 2846 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2847 tgen_arithr(s, ARITH_OR + rexw, a0, a2); 2848} 2849 2850static void tgen_ori(TCGContext *s, TCGType type, 2851 TCGReg a0, TCGReg a1, tcg_target_long a2) 2852{ 2853 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2854 tgen_arithi(s, ARITH_OR + rexw, a0, a2, false); 2855} 2856 2857static const TCGOutOpBinary outop_or = { 2858 .base.static_constraint = C_O1_I2(r, 0, re), 2859 .out_rrr = tgen_or, 2860 .out_rri = tgen_ori, 2861}; 2862 2863static const TCGOutOpBinary outop_orc = { 2864 .base.static_constraint = C_NotImplemented, 2865}; 2866 2867static const TCGOutOpBinary outop_rems = { 2868 .base.static_constraint = C_NotImplemented, 2869}; 2870 2871static const TCGOutOpBinary outop_remu = { 2872 .base.static_constraint = C_NotImplemented, 2873}; 2874 2875static void tgen_rotl(TCGContext *s, TCGType type, 2876 TCGReg a0, TCGReg a1, TCGReg a2) 2877{ 2878 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2879 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_ROL, a0); 2880} 2881 2882static void tgen_rotli(TCGContext *s, TCGType type, 2883 TCGReg a0, TCGReg a1, tcg_target_long a2) 2884{ 2885 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2886 tcg_out_shifti(s, SHIFT_ROL + rexw, a0, a2); 2887} 2888 2889static const TCGOutOpBinary outop_rotl = { 2890 .base.static_constraint = C_O1_I2(r, 0, ci), 2891 .out_rrr = tgen_rotl, 2892 .out_rri = tgen_rotli, 2893}; 2894 2895static void tgen_rotr(TCGContext *s, TCGType type, 2896 TCGReg a0, TCGReg a1, TCGReg a2) 2897{ 2898 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 2899 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_ROR, a0); 2900} 2901 2902static void tgen_rotri(TCGContext *s, TCGType type, 2903 TCGReg a0, TCGReg a1, tcg_target_long a2) 2904{ 2905 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2906 tcg_out_shifti(s, SHIFT_ROR + rexw, a0, a2); 2907} 2908 2909static const TCGOutOpBinary outop_rotr = { 2910 .base.static_constraint = C_O1_I2(r, 0, ci), 2911 .out_rrr = tgen_rotr, 2912 .out_rri = tgen_rotri, 2913}; 2914 2915static TCGConstraintSetIndex cset_shift(TCGType type, unsigned flags) 2916{ 2917 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci); 2918} 2919 2920static void tgen_sar(TCGContext *s, TCGType type, 2921 TCGReg a0, TCGReg a1, TCGReg a2) 2922{ 2923 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2924 if (have_bmi2) { 2925 tcg_out_vex_modrm(s, OPC_SARX + rexw, a0, a2, a1); 2926 } else { 2927 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_SAR, a0); 2928 } 2929} 2930 2931static void tgen_sari(TCGContext *s, TCGType type, 2932 TCGReg a0, TCGReg a1, tcg_target_long a2) 2933{ 2934 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2935 2936 tcg_out_mov(s, type, a0, a1); 2937 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, a2); 2938} 2939 2940static const TCGOutOpBinary outop_sar = { 2941 .base.static_constraint = C_Dynamic, 2942 .base.dynamic_constraint = cset_shift, 2943 .out_rrr = tgen_sar, 2944 .out_rri = tgen_sari, 2945}; 2946 2947static void tgen_shl(TCGContext *s, TCGType type, 2948 TCGReg a0, TCGReg a1, TCGReg a2) 2949{ 2950 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2951 if (have_bmi2) { 2952 tcg_out_vex_modrm(s, OPC_SHLX + rexw, a0, a2, a1); 2953 } else { 2954 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_SHL, a0); 2955 } 2956} 2957 2958static void tgen_shli(TCGContext *s, TCGType type, 2959 TCGReg a0, TCGReg a1, tcg_target_long a2) 2960{ 2961 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2962 2963 /* For small constant 3-operand shift, use LEA. */ 2964 if (a0 != a1 && a2 >= 1 && a2 <= 3) { 2965 if (a2 == 1) { 2966 /* shl $1,a1,a0 -> lea (a1,a1),a0 */ 2967 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0); 2968 } else { 2969 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */ 2970 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0); 2971 } 2972 return; 2973 } 2974 tcg_out_mov(s, type, a0, a1); 2975 tcg_out_shifti(s, SHIFT_SHL + rexw, a0, a2); 2976} 2977 2978static const TCGOutOpBinary outop_shl = { 2979 .base.static_constraint = C_Dynamic, 2980 .base.dynamic_constraint = cset_shift, 2981 .out_rrr = tgen_shl, 2982 .out_rri = tgen_shli, 2983}; 2984 2985static void tgen_shr(TCGContext *s, TCGType type, 2986 TCGReg a0, TCGReg a1, TCGReg a2) 2987{ 2988 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2989 if (have_bmi2) { 2990 tcg_out_vex_modrm(s, OPC_SHRX + rexw, a0, a2, a1); 2991 } else { 2992 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_SHR, a0); 2993 } 2994} 2995 2996static void tgen_shri(TCGContext *s, TCGType type, 2997 TCGReg a0, TCGReg a1, tcg_target_long a2) 2998{ 2999 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3000 3001 tcg_out_mov(s, type, a0, a1); 3002 tcg_out_shifti(s, SHIFT_SHR + rexw, a0, a2); 3003} 3004 3005static const TCGOutOpBinary outop_shr = { 3006 .base.static_constraint = C_Dynamic, 3007 .base.dynamic_constraint = cset_shift, 3008 .out_rrr = tgen_shr, 3009 .out_rri = tgen_shri, 3010}; 3011 3012static void tgen_sub(TCGContext *s, TCGType type, 3013 TCGReg a0, TCGReg a1, TCGReg a2) 3014{ 3015 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 3016 tgen_arithr(s, ARITH_SUB + rexw, a0, a2); 3017} 3018 3019static const TCGOutOpSubtract outop_sub = { 3020 .base.static_constraint = C_O1_I2(r, 0, r), 3021 .out_rrr = tgen_sub, 3022}; 3023 3024static void tgen_xor(TCGContext *s, TCGType type, 3025 TCGReg a0, TCGReg a1, TCGReg a2) 3026{ 3027 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3028 tgen_arithr(s, ARITH_XOR + rexw, a0, a2); 3029} 3030 3031static void tgen_xori(TCGContext *s, TCGType type, 3032 TCGReg a0, TCGReg a1, tcg_target_long a2) 3033{ 3034 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3035 tgen_arithi(s, ARITH_XOR + rexw, a0, a2, false); 3036} 3037 3038static const TCGOutOpBinary outop_xor = { 3039 .base.static_constraint = C_O1_I2(r, 0, re), 3040 .out_rrr = tgen_xor, 3041 .out_rri = tgen_xori, 3042}; 3043 3044static void tgen_neg(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 3045{ 3046 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3047 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0); 3048} 3049 3050static const TCGOutOpUnary outop_neg = { 3051 .base.static_constraint = C_O1_I1(r, 0), 3052 .out_rr = tgen_neg, 3053}; 3054 3055static void tgen_not(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 3056{ 3057 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3058 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0); 3059} 3060 3061static const TCGOutOpUnary outop_not = { 3062 .base.static_constraint = C_O1_I1(r, 0), 3063 .out_rr = tgen_not, 3064}; 3065 3066 3067static void tcg_out_op(TCGContext *s, TCGOpcode opc, TCGType type, 3068 const TCGArg args[TCG_MAX_OP_ARGS], 3069 const int const_args[TCG_MAX_OP_ARGS]) 3070{ 3071 TCGArg a0, a1, a2; 3072 int const_a2, rexw; 3073 3074#if TCG_TARGET_REG_BITS == 64 3075# define OP_32_64(x) \ 3076 case glue(glue(INDEX_op_, x), _i64): \ 3077 case glue(glue(INDEX_op_, x), _i32) 3078#else 3079# define OP_32_64(x) \ 3080 case glue(glue(INDEX_op_, x), _i32) 3081#endif 3082 3083 /* Hoist the loads of the most common arguments. */ 3084 a0 = args[0]; 3085 a1 = args[1]; 3086 a2 = args[2]; 3087 const_a2 = const_args[2]; 3088 rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3089 3090 switch (opc) { 3091 case INDEX_op_goto_ptr: 3092 /* jmp to the given host address (could be epilogue) */ 3093 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0); 3094 break; 3095 case INDEX_op_br: 3096 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0); 3097 break; 3098 OP_32_64(ld8u): 3099 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 3100 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2); 3101 break; 3102 OP_32_64(ld8s): 3103 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2); 3104 break; 3105 OP_32_64(ld16u): 3106 /* Note that we can ignore REXW for the zero-extend to 64-bit. 
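       Writing the 32-bit destination of MOVZWL already clears bits 63..32,
       so the 64-bit zero-extension comes for free.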
*/ 3107 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2); 3108 break; 3109 OP_32_64(ld16s): 3110 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2); 3111 break; 3112#if TCG_TARGET_REG_BITS == 64 3113 case INDEX_op_ld32u_i64: 3114#endif 3115 case INDEX_op_ld_i32: 3116 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2); 3117 break; 3118 3119 OP_32_64(st8): 3120 if (const_args[0]) { 3121 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2); 3122 tcg_out8(s, a0); 3123 } else { 3124 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2); 3125 } 3126 break; 3127 OP_32_64(st16): 3128 if (const_args[0]) { 3129 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2); 3130 tcg_out16(s, a0); 3131 } else { 3132 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2); 3133 } 3134 break; 3135#if TCG_TARGET_REG_BITS == 64 3136 case INDEX_op_st32_i64: 3137#endif 3138 case INDEX_op_st_i32: 3139 if (const_args[0]) { 3140 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2); 3141 tcg_out32(s, a0); 3142 } else { 3143 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2); 3144 } 3145 break; 3146 3147 OP_32_64(movcond): 3148 tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]); 3149 break; 3150 3151 OP_32_64(bswap16): 3152 if (a2 & TCG_BSWAP_OS) { 3153 /* Output must be sign-extended. */ 3154 if (rexw) { 3155 tcg_out_bswap64(s, a0); 3156 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48); 3157 } else { 3158 tcg_out_bswap32(s, a0); 3159 tcg_out_shifti(s, SHIFT_SAR, a0, 16); 3160 } 3161 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) { 3162 /* Output must be zero-extended, but input isn't. */ 3163 tcg_out_bswap32(s, a0); 3164 tcg_out_shifti(s, SHIFT_SHR, a0, 16); 3165 } else { 3166 tcg_out_rolw_8(s, a0); 3167 } 3168 break; 3169 OP_32_64(bswap32): 3170 tcg_out_bswap32(s, a0); 3171 if (rexw && (a2 & TCG_BSWAP_OS)) { 3172 tcg_out_ext32s(s, a0, a0); 3173 } 3174 break; 3175 3176 case INDEX_op_qemu_ld_i32: 3177 tcg_out_qemu_ld(s, a0, -1, a1, a2, TCG_TYPE_I32); 3178 break; 3179 case INDEX_op_qemu_ld_i64: 3180 if (TCG_TARGET_REG_BITS == 64) { 3181 tcg_out_qemu_ld(s, a0, -1, a1, a2, TCG_TYPE_I64); 3182 } else { 3183 tcg_out_qemu_ld(s, a0, a1, a2, args[3], TCG_TYPE_I64); 3184 } 3185 break; 3186 case INDEX_op_qemu_ld_i128: 3187 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3188 tcg_out_qemu_ld(s, a0, a1, a2, args[3], TCG_TYPE_I128); 3189 break; 3190 3191 case INDEX_op_qemu_st_i32: 3192 case INDEX_op_qemu_st8_i32: 3193 tcg_out_qemu_st(s, a0, -1, a1, a2, TCG_TYPE_I32); 3194 break; 3195 case INDEX_op_qemu_st_i64: 3196 if (TCG_TARGET_REG_BITS == 64) { 3197 tcg_out_qemu_st(s, a0, -1, a1, a2, TCG_TYPE_I64); 3198 } else { 3199 tcg_out_qemu_st(s, a0, a1, a2, args[3], TCG_TYPE_I64); 3200 } 3201 break; 3202 case INDEX_op_qemu_st_i128: 3203 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3204 tcg_out_qemu_st(s, a0, a1, a2, args[3], TCG_TYPE_I128); 3205 break; 3206 3207 OP_32_64(add2): 3208 if (const_args[4]) { 3209 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1); 3210 } else { 3211 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]); 3212 } 3213 if (const_args[5]) { 3214 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1); 3215 } else { 3216 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]); 3217 } 3218 break; 3219 OP_32_64(sub2): 3220 if (const_args[4]) { 3221 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1); 3222 } else { 3223 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]); 3224 } 3225 if (const_args[5]) { 3226 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1); 3227 } else { 3228 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]); 3229 } 3230 
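        /* For both add2 and sub2 the low and high halves are chained
           through the carry flag (ADD/ADC resp. SUB/SBB), so the two
           arithmetic ops are emitted back to back with nothing in between
           that could clobber the flags. */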
break; 3231 3232#if TCG_TARGET_REG_BITS == 32 3233 case INDEX_op_brcond2_i32: 3234 tcg_out_brcond2(s, args, const_args, 0); 3235 break; 3236 case INDEX_op_setcond2_i32: 3237 tcg_out_setcond2(s, args, const_args); 3238 break; 3239#else /* TCG_TARGET_REG_BITS == 64 */ 3240 case INDEX_op_ld32s_i64: 3241 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2); 3242 break; 3243 case INDEX_op_ld_i64: 3244 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2); 3245 break; 3246 case INDEX_op_st_i64: 3247 if (const_args[0]) { 3248 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2); 3249 tcg_out32(s, a0); 3250 } else { 3251 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2); 3252 } 3253 break; 3254 3255 case INDEX_op_bswap64_i64: 3256 tcg_out_bswap64(s, a0); 3257 break; 3258 case INDEX_op_extrh_i64_i32: 3259 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32); 3260 break; 3261#endif 3262 3263 OP_32_64(deposit): 3264 if (args[3] == 0 && args[4] == 8) { 3265 /* load bits 0..7 */ 3266 if (const_a2) { 3267 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0), 3268 0, a0, 0); 3269 tcg_out8(s, a2); 3270 } else { 3271 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0); 3272 } 3273 } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) { 3274 /* load bits 8..15 */ 3275 if (const_a2) { 3276 tcg_out8(s, OPC_MOVB_Ib + a0 + 4); 3277 tcg_out8(s, a2); 3278 } else { 3279 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4); 3280 } 3281 } else if (args[3] == 0 && args[4] == 16) { 3282 /* load bits 0..15 */ 3283 if (const_a2) { 3284 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0), 3285 0, a0, 0); 3286 tcg_out16(s, a2); 3287 } else { 3288 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0); 3289 } 3290 } else { 3291 g_assert_not_reached(); 3292 } 3293 break; 3294 3295 case INDEX_op_extract_i64: 3296 if (a2 + args[3] == 32) { 3297 if (a2 == 0) { 3298 tcg_out_ext32u(s, a0, a1); 3299 break; 3300 } 3301 /* This is a 32-bit zero-extending right shift. */ 3302 tcg_out_mov(s, TCG_TYPE_I32, a0, a1); 3303 tcg_out_shifti(s, SHIFT_SHR, a0, a2); 3304 break; 3305 } 3306 /* FALLTHRU */ 3307 case INDEX_op_extract_i32: 3308 if (a2 == 0 && args[3] == 8) { 3309 tcg_out_ext8u(s, a0, a1); 3310 } else if (a2 == 0 && args[3] == 16) { 3311 tcg_out_ext16u(s, a0, a1); 3312 } else if (a2 == 8 && args[3] == 8) { 3313 /* 3314 * On the off-chance that we can use the high-byte registers. 3315 * Otherwise we emit the same ext16 + shift pattern that we 3316 * would have gotten from the normal tcg-op.c expansion. 
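 * e.g. extract dest,src,8,8 with src in %eax..%edx becomes a single
 * movzbl from %ah/%ch/%dh/%bh (encoded below as register number a1 + 4),
 * provided dest needs no REX prefix.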
3317 */ 3318 if (a1 < 4 && a0 < 8) { 3319 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4); 3320 } else { 3321 tcg_out_ext16u(s, a0, a1); 3322 tcg_out_shifti(s, SHIFT_SHR, a0, 8); 3323 } 3324 } else { 3325 g_assert_not_reached(); 3326 } 3327 break; 3328 3329 case INDEX_op_sextract_i64: 3330 if (a2 == 0 && args[3] == 8) { 3331 tcg_out_ext8s(s, TCG_TYPE_I64, a0, a1); 3332 } else if (a2 == 0 && args[3] == 16) { 3333 tcg_out_ext16s(s, TCG_TYPE_I64, a0, a1); 3334 } else if (a2 == 0 && args[3] == 32) { 3335 tcg_out_ext32s(s, a0, a1); 3336 } else { 3337 g_assert_not_reached(); 3338 } 3339 break; 3340 3341 case INDEX_op_sextract_i32: 3342 if (a2 == 0 && args[3] == 8) { 3343 tcg_out_ext8s(s, TCG_TYPE_I32, a0, a1); 3344 } else if (a2 == 0 && args[3] == 16) { 3345 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 3346 } else if (a2 == 8 && args[3] == 8) { 3347 if (a1 < 4 && a0 < 8) { 3348 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4); 3349 } else { 3350 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 3351 tcg_out_shifti(s, SHIFT_SAR, a0, 8); 3352 } 3353 } else { 3354 g_assert_not_reached(); 3355 } 3356 break; 3357 3358 OP_32_64(extract2): 3359 /* Note that SHRD outputs to the r/m operand. */ 3360 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0); 3361 tcg_out8(s, args[3]); 3362 break; 3363 3364 case INDEX_op_mb: 3365 tcg_out_mb(s, a0); 3366 break; 3367 case INDEX_op_call: /* Always emitted via tcg_out_call. */ 3368 case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */ 3369 case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */ 3370 case INDEX_op_ext_i32_i64: /* Always emitted via tcg_reg_alloc_op. */ 3371 case INDEX_op_extu_i32_i64: 3372 case INDEX_op_extrl_i64_i32: 3373 default: 3374 g_assert_not_reached(); 3375 } 3376 3377#undef OP_32_64 3378} 3379 3380static int const umin_insn[4] = { 3381 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ 3382}; 3383 3384static int const umax_insn[4] = { 3385 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ 3386}; 3387 3388static bool tcg_out_cmp_vec_noinv(TCGContext *s, TCGType type, unsigned vece, 3389 TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond) 3390{ 3391 static int const cmpeq_insn[4] = { 3392 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ 3393 }; 3394 static int const cmpgt_insn[4] = { 3395 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ 3396 }; 3397 3398 enum { 3399 NEED_INV = 1, 3400 NEED_SWAP = 2, 3401 NEED_UMIN = 4, 3402 NEED_UMAX = 8, 3403 INVALID = 16, 3404 }; 3405 static const uint8_t cond_fixup[16] = { 3406 [0 ... 15] = INVALID, 3407 [TCG_COND_EQ] = 0, 3408 [TCG_COND_GT] = 0, 3409 [TCG_COND_NE] = NEED_INV, 3410 [TCG_COND_LE] = NEED_INV, 3411 [TCG_COND_LT] = NEED_SWAP, 3412 [TCG_COND_GE] = NEED_SWAP | NEED_INV, 3413 [TCG_COND_LEU] = NEED_UMIN, 3414 [TCG_COND_GTU] = NEED_UMIN | NEED_INV, 3415 [TCG_COND_GEU] = NEED_UMAX, 3416 [TCG_COND_LTU] = NEED_UMAX | NEED_INV, 3417 }; 3418 int fixup = cond_fixup[cond]; 3419 3420 assert(!(fixup & INVALID)); 3421 3422 if (fixup & NEED_INV) { 3423 cond = tcg_invert_cond(cond); 3424 } 3425 3426 if (fixup & NEED_SWAP) { 3427 TCGReg swap = v1; 3428 v1 = v2; 3429 v2 = swap; 3430 cond = tcg_swap_cond(cond); 3431 } 3432 3433 if (fixup & (NEED_UMIN | NEED_UMAX)) { 3434 int op = (fixup & NEED_UMIN ? umin_insn[vece] : umax_insn[vece]); 3435 3436 /* avx2 does not have 64-bit min/max; adjusted during expand. 
*/ 3437 assert(vece <= MO_32); 3438 3439 tcg_out_vex_modrm_type(s, op, TCG_TMP_VEC, v1, v2, type); 3440 v2 = TCG_TMP_VEC; 3441 cond = TCG_COND_EQ; 3442 } 3443 3444 switch (cond) { 3445 case TCG_COND_EQ: 3446 tcg_out_vex_modrm_type(s, cmpeq_insn[vece], v0, v1, v2, type); 3447 break; 3448 case TCG_COND_GT: 3449 tcg_out_vex_modrm_type(s, cmpgt_insn[vece], v0, v1, v2, type); 3450 break; 3451 default: 3452 g_assert_not_reached(); 3453 } 3454 return fixup & NEED_INV; 3455} 3456 3457static void tcg_out_cmp_vec_k1(TCGContext *s, TCGType type, unsigned vece, 3458 TCGReg v1, TCGReg v2, TCGCond cond) 3459{ 3460 static const int cmpm_insn[2][4] = { 3461 { OPC_VPCMPB, OPC_VPCMPW, OPC_VPCMPD, OPC_VPCMPQ }, 3462 { OPC_VPCMPUB, OPC_VPCMPUW, OPC_VPCMPUD, OPC_VPCMPUQ } 3463 }; 3464 static const int testm_insn[4] = { 3465 OPC_VPTESTMB, OPC_VPTESTMW, OPC_VPTESTMD, OPC_VPTESTMQ 3466 }; 3467 static const int testnm_insn[4] = { 3468 OPC_VPTESTNMB, OPC_VPTESTNMW, OPC_VPTESTNMD, OPC_VPTESTNMQ 3469 }; 3470 3471 static const int cond_ext[16] = { 3472 [TCG_COND_EQ] = 0, 3473 [TCG_COND_NE] = 4, 3474 [TCG_COND_LT] = 1, 3475 [TCG_COND_LTU] = 1, 3476 [TCG_COND_LE] = 2, 3477 [TCG_COND_LEU] = 2, 3478 [TCG_COND_NEVER] = 3, 3479 [TCG_COND_GE] = 5, 3480 [TCG_COND_GEU] = 5, 3481 [TCG_COND_GT] = 6, 3482 [TCG_COND_GTU] = 6, 3483 [TCG_COND_ALWAYS] = 7, 3484 }; 3485 3486 switch (cond) { 3487 case TCG_COND_TSTNE: 3488 tcg_out_vex_modrm_type(s, testm_insn[vece], /* k1 */ 1, v1, v2, type); 3489 break; 3490 case TCG_COND_TSTEQ: 3491 tcg_out_vex_modrm_type(s, testnm_insn[vece], /* k1 */ 1, v1, v2, type); 3492 break; 3493 default: 3494 tcg_out_vex_modrm_type(s, cmpm_insn[is_unsigned_cond(cond)][vece], 3495 /* k1 */ 1, v1, v2, type); 3496 tcg_out8(s, cond_ext[cond]); 3497 break; 3498 } 3499} 3500 3501static void tcg_out_k1_to_vec(TCGContext *s, TCGType type, 3502 unsigned vece, TCGReg dest) 3503{ 3504 static const int movm_insn[] = { 3505 OPC_VPMOVM2B, OPC_VPMOVM2W, OPC_VPMOVM2D, OPC_VPMOVM2Q 3506 }; 3507 tcg_out_vex_modrm_type(s, movm_insn[vece], dest, 0, /* k1 */ 1, type); 3508} 3509 3510static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece, 3511 TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond) 3512{ 3513 /* 3514 * With avx512, we have a complete set of comparisons into mask. 3515 * Unless there's a single insn expansion for the comparision, 3516 * expand via a mask in k1. 3517 */ 3518 if ((vece <= MO_16 ? have_avx512bw : have_avx512dq) 3519 && cond != TCG_COND_EQ 3520 && cond != TCG_COND_LT 3521 && cond != TCG_COND_GT) { 3522 tcg_out_cmp_vec_k1(s, type, vece, v1, v2, cond); 3523 tcg_out_k1_to_vec(s, type, vece, v0); 3524 return; 3525 } 3526 3527 if (tcg_out_cmp_vec_noinv(s, type, vece, v0, v1, v2, cond)) { 3528 tcg_out_dupi_vec(s, type, vece, TCG_TMP_VEC, -1); 3529 tcg_out_vex_modrm_type(s, OPC_PXOR, v0, v0, TCG_TMP_VEC, type); 3530 } 3531} 3532 3533static void tcg_out_cmpsel_vec_k1(TCGContext *s, TCGType type, unsigned vece, 3534 TCGReg v0, TCGReg c1, TCGReg c2, 3535 TCGReg v3, TCGReg v4, TCGCond cond) 3536{ 3537 static const int vpblendm_insn[] = { 3538 OPC_VPBLENDMB, OPC_VPBLENDMW, OPC_VPBLENDMD, OPC_VPBLENDMQ 3539 }; 3540 bool z = false; 3541 3542 /* Swap to place constant in V4 to take advantage of zero-masking. 
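       i.e. when v3 is the constant-zero operand we invert the condition and
       let EVEX zero-masking supply the zeros, rather than needing a zeroed
       register for the false case.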
*/ 3543 if (!v3) { 3544 z = true; 3545 v3 = v4; 3546 cond = tcg_invert_cond(cond); 3547 } 3548 3549 tcg_out_cmp_vec_k1(s, type, vece, c1, c2, cond); 3550 tcg_out_evex_modrm_type(s, vpblendm_insn[vece], v0, v4, v3, 3551 /* k1 */1, z, type); 3552} 3553 3554static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece, 3555 TCGReg v0, TCGReg c1, TCGReg c2, 3556 TCGReg v3, TCGReg v4, TCGCond cond) 3557{ 3558 bool inv; 3559 3560 if (vece <= MO_16 ? have_avx512bw : have_avx512vl) { 3561 tcg_out_cmpsel_vec_k1(s, type, vece, v0, c1, c2, v3, v4, cond); 3562 return; 3563 } 3564 3565 inv = tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond); 3566 3567 /* 3568 * Since XMM0 is 16, the only way we get 0 into V3 3569 * is via the constant zero constraint. 3570 */ 3571 if (!v3) { 3572 if (inv) { 3573 tcg_out_vex_modrm_type(s, OPC_PAND, v0, TCG_TMP_VEC, v4, type); 3574 } else { 3575 tcg_out_vex_modrm_type(s, OPC_PANDN, v0, TCG_TMP_VEC, v4, type); 3576 } 3577 } else { 3578 if (inv) { 3579 TCGReg swap = v3; 3580 v3 = v4; 3581 v4 = swap; 3582 } 3583 tcg_out_vex_modrm_type(s, OPC_VPBLENDVB, v0, v4, v3, type); 3584 tcg_out8(s, (TCG_TMP_VEC - TCG_REG_XMM0) << 4); 3585 } 3586} 3587 3588static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, 3589 unsigned vecl, unsigned vece, 3590 const TCGArg args[TCG_MAX_OP_ARGS], 3591 const int const_args[TCG_MAX_OP_ARGS]) 3592{ 3593 static int const add_insn[4] = { 3594 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ 3595 }; 3596 static int const ssadd_insn[4] = { 3597 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 3598 }; 3599 static int const usadd_insn[4] = { 3600 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2 3601 }; 3602 static int const sub_insn[4] = { 3603 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ 3604 }; 3605 static int const sssub_insn[4] = { 3606 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 3607 }; 3608 static int const ussub_insn[4] = { 3609 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2 3610 }; 3611 static int const mul_insn[4] = { 3612 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ 3613 }; 3614 static int const shift_imm_insn[4] = { 3615 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib 3616 }; 3617 static int const punpckl_insn[4] = { 3618 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ 3619 }; 3620 static int const punpckh_insn[4] = { 3621 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ 3622 }; 3623 static int const packss_insn[4] = { 3624 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 3625 }; 3626 static int const packus_insn[4] = { 3627 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 3628 }; 3629 static int const smin_insn[4] = { 3630 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ 3631 }; 3632 static int const smax_insn[4] = { 3633 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ 3634 }; 3635 static int const rotlv_insn[4] = { 3636 OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ 3637 }; 3638 static int const rotrv_insn[4] = { 3639 OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ 3640 }; 3641 static int const shlv_insn[4] = { 3642 OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ 3643 }; 3644 static int const shrv_insn[4] = { 3645 OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ 3646 }; 3647 static int const sarv_insn[4] = { 3648 OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ 3649 }; 3650 static int const shls_insn[4] = { 3651 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ 3652 }; 3653 static int const shrs_insn[4] = { 3654 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ 3655 }; 3656 static int const sars_insn[4] = { 3657 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ 3658 
}; 3659 static int const vpshldi_insn[4] = { 3660 OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ 3661 }; 3662 static int const vpshldv_insn[4] = { 3663 OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ 3664 }; 3665 static int const vpshrdv_insn[4] = { 3666 OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ 3667 }; 3668 static int const abs_insn[4] = { 3669 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ 3670 }; 3671 3672 TCGType type = vecl + TCG_TYPE_V64; 3673 int insn, sub; 3674 TCGArg a0, a1, a2, a3; 3675 3676 a0 = args[0]; 3677 a1 = args[1]; 3678 a2 = args[2]; 3679 3680 switch (opc) { 3681 case INDEX_op_add_vec: 3682 insn = add_insn[vece]; 3683 goto gen_simd; 3684 case INDEX_op_ssadd_vec: 3685 insn = ssadd_insn[vece]; 3686 goto gen_simd; 3687 case INDEX_op_usadd_vec: 3688 insn = usadd_insn[vece]; 3689 goto gen_simd; 3690 case INDEX_op_sub_vec: 3691 insn = sub_insn[vece]; 3692 goto gen_simd; 3693 case INDEX_op_sssub_vec: 3694 insn = sssub_insn[vece]; 3695 goto gen_simd; 3696 case INDEX_op_ussub_vec: 3697 insn = ussub_insn[vece]; 3698 goto gen_simd; 3699 case INDEX_op_mul_vec: 3700 insn = mul_insn[vece]; 3701 goto gen_simd; 3702 case INDEX_op_and_vec: 3703 insn = OPC_PAND; 3704 goto gen_simd; 3705 case INDEX_op_or_vec: 3706 insn = OPC_POR; 3707 goto gen_simd; 3708 case INDEX_op_xor_vec: 3709 insn = OPC_PXOR; 3710 goto gen_simd; 3711 case INDEX_op_smin_vec: 3712 insn = smin_insn[vece]; 3713 goto gen_simd; 3714 case INDEX_op_umin_vec: 3715 insn = umin_insn[vece]; 3716 goto gen_simd; 3717 case INDEX_op_smax_vec: 3718 insn = smax_insn[vece]; 3719 goto gen_simd; 3720 case INDEX_op_umax_vec: 3721 insn = umax_insn[vece]; 3722 goto gen_simd; 3723 case INDEX_op_shlv_vec: 3724 insn = shlv_insn[vece]; 3725 goto gen_simd; 3726 case INDEX_op_shrv_vec: 3727 insn = shrv_insn[vece]; 3728 goto gen_simd; 3729 case INDEX_op_sarv_vec: 3730 insn = sarv_insn[vece]; 3731 goto gen_simd; 3732 case INDEX_op_rotlv_vec: 3733 insn = rotlv_insn[vece]; 3734 goto gen_simd; 3735 case INDEX_op_rotrv_vec: 3736 insn = rotrv_insn[vece]; 3737 goto gen_simd; 3738 case INDEX_op_shls_vec: 3739 insn = shls_insn[vece]; 3740 goto gen_simd; 3741 case INDEX_op_shrs_vec: 3742 insn = shrs_insn[vece]; 3743 goto gen_simd; 3744 case INDEX_op_sars_vec: 3745 insn = sars_insn[vece]; 3746 goto gen_simd; 3747 case INDEX_op_x86_punpckl_vec: 3748 insn = punpckl_insn[vece]; 3749 goto gen_simd; 3750 case INDEX_op_x86_punpckh_vec: 3751 insn = punpckh_insn[vece]; 3752 goto gen_simd; 3753 case INDEX_op_x86_packss_vec: 3754 insn = packss_insn[vece]; 3755 goto gen_simd; 3756 case INDEX_op_x86_packus_vec: 3757 insn = packus_insn[vece]; 3758 goto gen_simd; 3759 case INDEX_op_x86_vpshldv_vec: 3760 insn = vpshldv_insn[vece]; 3761 a1 = a2; 3762 a2 = args[3]; 3763 goto gen_simd; 3764 case INDEX_op_x86_vpshrdv_vec: 3765 insn = vpshrdv_insn[vece]; 3766 a1 = a2; 3767 a2 = args[3]; 3768 goto gen_simd; 3769#if TCG_TARGET_REG_BITS == 32 3770 case INDEX_op_dup2_vec: 3771 /* First merge the two 32-bit inputs to a single 64-bit element. */ 3772 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2); 3773 /* Then replicate the 64-bit elements across the rest of the vector. 
*/ 3774 if (type != TCG_TYPE_V64) { 3775 tcg_out_dup_vec(s, type, MO_64, a0, a0); 3776 } 3777 break; 3778#endif 3779 case INDEX_op_abs_vec: 3780 insn = abs_insn[vece]; 3781 a2 = a1; 3782 a1 = 0; 3783 goto gen_simd; 3784 gen_simd: 3785 tcg_debug_assert(insn != OPC_UD2); 3786 tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type); 3787 break; 3788 3789 case INDEX_op_cmp_vec: 3790 tcg_out_cmp_vec(s, type, vece, a0, a1, a2, args[3]); 3791 break; 3792 3793 case INDEX_op_cmpsel_vec: 3794 tcg_out_cmpsel_vec(s, type, vece, a0, a1, a2, 3795 args[3], args[4], args[5]); 3796 break; 3797 3798 case INDEX_op_andc_vec: 3799 insn = OPC_PANDN; 3800 tcg_out_vex_modrm_type(s, insn, a0, a2, a1, type); 3801 break; 3802 3803 case INDEX_op_shli_vec: 3804 insn = shift_imm_insn[vece]; 3805 sub = 6; 3806 goto gen_shift; 3807 case INDEX_op_shri_vec: 3808 insn = shift_imm_insn[vece]; 3809 sub = 2; 3810 goto gen_shift; 3811 case INDEX_op_sari_vec: 3812 if (vece == MO_64) { 3813 insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX; 3814 } else { 3815 insn = shift_imm_insn[vece]; 3816 } 3817 sub = 4; 3818 goto gen_shift; 3819 case INDEX_op_rotli_vec: 3820 insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ 3821 if (vece == MO_64) { 3822 insn |= P_VEXW; 3823 } 3824 sub = 1; 3825 goto gen_shift; 3826 gen_shift: 3827 tcg_debug_assert(vece != MO_8); 3828 tcg_out_vex_modrm_type(s, insn, sub, a0, a1, type); 3829 tcg_out8(s, a2); 3830 break; 3831 3832 case INDEX_op_ld_vec: 3833 tcg_out_ld(s, type, a0, a1, a2); 3834 break; 3835 case INDEX_op_st_vec: 3836 tcg_out_st(s, type, a0, a1, a2); 3837 break; 3838 case INDEX_op_dupm_vec: 3839 tcg_out_dupm_vec(s, type, vece, a0, a1, a2); 3840 break; 3841 3842 case INDEX_op_x86_shufps_vec: 3843 insn = OPC_SHUFPS; 3844 sub = args[3]; 3845 goto gen_simd_imm8; 3846 case INDEX_op_x86_blend_vec: 3847 if (vece == MO_16) { 3848 insn = OPC_PBLENDW; 3849 } else if (vece == MO_32) { 3850 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS); 3851 } else { 3852 g_assert_not_reached(); 3853 } 3854 sub = args[3]; 3855 goto gen_simd_imm8; 3856 case INDEX_op_x86_vperm2i128_vec: 3857 insn = OPC_VPERM2I128; 3858 sub = args[3]; 3859 goto gen_simd_imm8; 3860 case INDEX_op_x86_vpshldi_vec: 3861 insn = vpshldi_insn[vece]; 3862 sub = args[3]; 3863 goto gen_simd_imm8; 3864 3865 case INDEX_op_not_vec: 3866 insn = OPC_VPTERNLOGQ; 3867 a2 = a1; 3868 sub = 0x33; /* !B */ 3869 goto gen_simd_imm8; 3870 case INDEX_op_nor_vec: 3871 insn = OPC_VPTERNLOGQ; 3872 sub = 0x11; /* norCB */ 3873 goto gen_simd_imm8; 3874 case INDEX_op_nand_vec: 3875 insn = OPC_VPTERNLOGQ; 3876 sub = 0x77; /* nandCB */ 3877 goto gen_simd_imm8; 3878 case INDEX_op_eqv_vec: 3879 insn = OPC_VPTERNLOGQ; 3880 sub = 0x99; /* xnorCB */ 3881 goto gen_simd_imm8; 3882 case INDEX_op_orc_vec: 3883 insn = OPC_VPTERNLOGQ; 3884 sub = 0xdd; /* orB!C */ 3885 goto gen_simd_imm8; 3886 3887 case INDEX_op_bitsel_vec: 3888 insn = OPC_VPTERNLOGQ; 3889 a3 = args[3]; 3890 if (a0 == a1) { 3891 a1 = a2; 3892 a2 = a3; 3893 sub = 0xca; /* A?B:C */ 3894 } else if (a0 == a2) { 3895 a2 = a3; 3896 sub = 0xe2; /* B?A:C */ 3897 } else { 3898 tcg_out_mov(s, type, a0, a3); 3899 sub = 0xb8; /* B?C:A */ 3900 } 3901 goto gen_simd_imm8; 3902 3903 gen_simd_imm8: 3904 tcg_debug_assert(insn != OPC_UD2); 3905 tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type); 3906 tcg_out8(s, sub); 3907 break; 3908 3909 case INDEX_op_x86_psrldq_vec: 3910 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1); 3911 tcg_out8(s, a2); 3912 break; 3913 3914 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. 
*/ 3915 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */ 3916 default: 3917 g_assert_not_reached(); 3918 } 3919} 3920 3921static TCGConstraintSetIndex 3922tcg_target_op_def(TCGOpcode op, TCGType type, unsigned flags) 3923{ 3924 switch (op) { 3925 case INDEX_op_goto_ptr: 3926 return C_O0_I1(r); 3927 3928 case INDEX_op_ld8u_i32: 3929 case INDEX_op_ld8u_i64: 3930 case INDEX_op_ld8s_i32: 3931 case INDEX_op_ld8s_i64: 3932 case INDEX_op_ld16u_i32: 3933 case INDEX_op_ld16u_i64: 3934 case INDEX_op_ld16s_i32: 3935 case INDEX_op_ld16s_i64: 3936 case INDEX_op_ld_i32: 3937 case INDEX_op_ld32u_i64: 3938 case INDEX_op_ld32s_i64: 3939 case INDEX_op_ld_i64: 3940 return C_O1_I1(r, r); 3941 3942 case INDEX_op_st8_i32: 3943 case INDEX_op_st8_i64: 3944 return C_O0_I2(qi, r); 3945 3946 case INDEX_op_st16_i32: 3947 case INDEX_op_st16_i64: 3948 case INDEX_op_st_i32: 3949 case INDEX_op_st32_i64: 3950 return C_O0_I2(ri, r); 3951 3952 case INDEX_op_st_i64: 3953 return C_O0_I2(re, r); 3954 3955 case INDEX_op_bswap16_i32: 3956 case INDEX_op_bswap16_i64: 3957 case INDEX_op_bswap32_i32: 3958 case INDEX_op_bswap32_i64: 3959 case INDEX_op_bswap64_i64: 3960 case INDEX_op_extrh_i64_i32: 3961 return C_O1_I1(r, 0); 3962 3963 case INDEX_op_ext_i32_i64: 3964 case INDEX_op_extu_i32_i64: 3965 case INDEX_op_extrl_i64_i32: 3966 case INDEX_op_extract_i32: 3967 case INDEX_op_extract_i64: 3968 case INDEX_op_sextract_i32: 3969 case INDEX_op_sextract_i64: 3970 return C_O1_I1(r, r); 3971 3972 case INDEX_op_extract2_i32: 3973 case INDEX_op_extract2_i64: 3974 return C_O1_I2(r, 0, r); 3975 3976 case INDEX_op_deposit_i32: 3977 case INDEX_op_deposit_i64: 3978 return C_O1_I2(q, 0, qi); 3979 3980 case INDEX_op_movcond_i32: 3981 case INDEX_op_movcond_i64: 3982 return C_O1_I4(r, r, reT, r, 0); 3983 3984 case INDEX_op_add2_i32: 3985 case INDEX_op_add2_i64: 3986 case INDEX_op_sub2_i32: 3987 case INDEX_op_sub2_i64: 3988 return C_N1_O1_I4(r, r, 0, 1, re, re); 3989 3990 case INDEX_op_qemu_ld_i32: 3991 return C_O1_I1(r, L); 3992 3993 case INDEX_op_qemu_st_i32: 3994 return C_O0_I2(L, L); 3995 case INDEX_op_qemu_st8_i32: 3996 return C_O0_I2(s, L); 3997 3998 case INDEX_op_qemu_ld_i64: 3999 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L); 4000 4001 case INDEX_op_qemu_st_i64: 4002 return TCG_TARGET_REG_BITS == 64 ? 
C_O0_I2(L, L) : C_O0_I3(L, L, L); 4003 4004 case INDEX_op_qemu_ld_i128: 4005 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 4006 return C_O2_I1(r, r, L); 4007 case INDEX_op_qemu_st_i128: 4008 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 4009 return C_O0_I3(L, L, L); 4010 4011 case INDEX_op_brcond2_i32: 4012 return C_O0_I4(r, r, ri, ri); 4013 4014 case INDEX_op_setcond2_i32: 4015 return C_O1_I4(r, r, r, ri, ri); 4016 4017 case INDEX_op_ld_vec: 4018 case INDEX_op_dupm_vec: 4019 return C_O1_I1(x, r); 4020 4021 case INDEX_op_st_vec: 4022 return C_O0_I2(x, r); 4023 4024 case INDEX_op_add_vec: 4025 case INDEX_op_sub_vec: 4026 case INDEX_op_mul_vec: 4027 case INDEX_op_and_vec: 4028 case INDEX_op_or_vec: 4029 case INDEX_op_xor_vec: 4030 case INDEX_op_andc_vec: 4031 case INDEX_op_orc_vec: 4032 case INDEX_op_nand_vec: 4033 case INDEX_op_nor_vec: 4034 case INDEX_op_eqv_vec: 4035 case INDEX_op_ssadd_vec: 4036 case INDEX_op_usadd_vec: 4037 case INDEX_op_sssub_vec: 4038 case INDEX_op_ussub_vec: 4039 case INDEX_op_smin_vec: 4040 case INDEX_op_umin_vec: 4041 case INDEX_op_smax_vec: 4042 case INDEX_op_umax_vec: 4043 case INDEX_op_shlv_vec: 4044 case INDEX_op_shrv_vec: 4045 case INDEX_op_sarv_vec: 4046 case INDEX_op_rotlv_vec: 4047 case INDEX_op_rotrv_vec: 4048 case INDEX_op_shls_vec: 4049 case INDEX_op_shrs_vec: 4050 case INDEX_op_sars_vec: 4051 case INDEX_op_cmp_vec: 4052 case INDEX_op_x86_shufps_vec: 4053 case INDEX_op_x86_blend_vec: 4054 case INDEX_op_x86_packss_vec: 4055 case INDEX_op_x86_packus_vec: 4056 case INDEX_op_x86_vperm2i128_vec: 4057 case INDEX_op_x86_punpckl_vec: 4058 case INDEX_op_x86_punpckh_vec: 4059 case INDEX_op_x86_vpshldi_vec: 4060#if TCG_TARGET_REG_BITS == 32 4061 case INDEX_op_dup2_vec: 4062#endif 4063 return C_O1_I2(x, x, x); 4064 4065 case INDEX_op_abs_vec: 4066 case INDEX_op_dup_vec: 4067 case INDEX_op_not_vec: 4068 case INDEX_op_shli_vec: 4069 case INDEX_op_shri_vec: 4070 case INDEX_op_sari_vec: 4071 case INDEX_op_rotli_vec: 4072 case INDEX_op_x86_psrldq_vec: 4073 return C_O1_I1(x, x); 4074 4075 case INDEX_op_x86_vpshldv_vec: 4076 case INDEX_op_x86_vpshrdv_vec: 4077 return C_O1_I3(x, 0, x, x); 4078 4079 case INDEX_op_bitsel_vec: 4080 return C_O1_I3(x, x, x, x); 4081 case INDEX_op_cmpsel_vec: 4082 return C_O1_I4(x, x, x, xO, x); 4083 4084 default: 4085 return C_NotImplemented; 4086 } 4087} 4088 4089int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) 4090{ 4091 switch (opc) { 4092 case INDEX_op_add_vec: 4093 case INDEX_op_sub_vec: 4094 case INDEX_op_and_vec: 4095 case INDEX_op_or_vec: 4096 case INDEX_op_xor_vec: 4097 case INDEX_op_andc_vec: 4098 case INDEX_op_orc_vec: 4099 case INDEX_op_nand_vec: 4100 case INDEX_op_nor_vec: 4101 case INDEX_op_eqv_vec: 4102 case INDEX_op_not_vec: 4103 case INDEX_op_bitsel_vec: 4104 return 1; 4105 case INDEX_op_cmp_vec: 4106 case INDEX_op_cmpsel_vec: 4107 return -1; 4108 4109 case INDEX_op_rotli_vec: 4110 return have_avx512vl && vece >= MO_32 ? 1 : -1; 4111 4112 case INDEX_op_shli_vec: 4113 case INDEX_op_shri_vec: 4114 /* We must expand the operation for MO_8. */ 4115 return vece == MO_8 ? -1 : 1; 4116 4117 case INDEX_op_sari_vec: 4118 switch (vece) { 4119 case MO_8: 4120 return -1; 4121 case MO_16: 4122 case MO_32: 4123 return 1; 4124 case MO_64: 4125 if (have_avx512vl) { 4126 return 1; 4127 } 4128 /* 4129 * We can emulate this for MO_64, but it does not pay off 4130 * unless we're producing at least 4 values. 4131 */ 4132 return type >= TCG_TYPE_V256 ? 
-1 : 0; 4133 } 4134 return 0; 4135 4136 case INDEX_op_shls_vec: 4137 case INDEX_op_shrs_vec: 4138 return vece >= MO_16; 4139 case INDEX_op_sars_vec: 4140 switch (vece) { 4141 case MO_16: 4142 case MO_32: 4143 return 1; 4144 case MO_64: 4145 return have_avx512vl; 4146 } 4147 return 0; 4148 case INDEX_op_rotls_vec: 4149 return vece >= MO_16 ? -1 : 0; 4150 4151 case INDEX_op_shlv_vec: 4152 case INDEX_op_shrv_vec: 4153 switch (vece) { 4154 case MO_16: 4155 return have_avx512bw; 4156 case MO_32: 4157 case MO_64: 4158 return have_avx2; 4159 } 4160 return 0; 4161 case INDEX_op_sarv_vec: 4162 switch (vece) { 4163 case MO_16: 4164 return have_avx512bw; 4165 case MO_32: 4166 return have_avx2; 4167 case MO_64: 4168 return have_avx512vl; 4169 } 4170 return 0; 4171 case INDEX_op_rotlv_vec: 4172 case INDEX_op_rotrv_vec: 4173 switch (vece) { 4174 case MO_16: 4175 return have_avx512vbmi2 ? -1 : 0; 4176 case MO_32: 4177 case MO_64: 4178 return have_avx512vl ? 1 : have_avx2 ? -1 : 0; 4179 } 4180 return 0; 4181 4182 case INDEX_op_mul_vec: 4183 switch (vece) { 4184 case MO_8: 4185 return -1; 4186 case MO_64: 4187 return have_avx512dq; 4188 } 4189 return 1; 4190 4191 case INDEX_op_ssadd_vec: 4192 case INDEX_op_usadd_vec: 4193 case INDEX_op_sssub_vec: 4194 case INDEX_op_ussub_vec: 4195 return vece <= MO_16; 4196 case INDEX_op_smin_vec: 4197 case INDEX_op_smax_vec: 4198 case INDEX_op_umin_vec: 4199 case INDEX_op_umax_vec: 4200 case INDEX_op_abs_vec: 4201 return vece <= MO_32 || have_avx512vl; 4202 4203 default: 4204 return 0; 4205 } 4206} 4207 4208static void expand_vec_shi(TCGType type, unsigned vece, bool right, 4209 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 4210{ 4211 uint8_t mask; 4212 4213 tcg_debug_assert(vece == MO_8); 4214 if (right) { 4215 mask = 0xff >> imm; 4216 tcg_gen_shri_vec(MO_16, v0, v1, imm); 4217 } else { 4218 mask = 0xff << imm; 4219 tcg_gen_shli_vec(MO_16, v0, v1, imm); 4220 } 4221 tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask)); 4222} 4223 4224static void expand_vec_sari(TCGType type, unsigned vece, 4225 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 4226{ 4227 TCGv_vec t1, t2; 4228 4229 switch (vece) { 4230 case MO_8: 4231 /* Unpack to 16-bit, shift, and repack. */ 4232 t1 = tcg_temp_new_vec(type); 4233 t2 = tcg_temp_new_vec(type); 4234 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 4235 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 4236 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 4237 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 4238 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8); 4239 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8); 4240 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, 4241 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 4242 tcg_temp_free_vec(t1); 4243 tcg_temp_free_vec(t2); 4244 break; 4245 4246 case MO_64: 4247 t1 = tcg_temp_new_vec(type); 4248 if (imm <= 32) { 4249 /* 4250 * We can emulate a small sign extend by performing an arithmetic 4251 * 32-bit shift and overwriting the high half of a 64-bit logical 4252 * shift. Note that the ISA says shift of 32 is valid, but TCG 4253 * does not, so we have to bound the smaller shift -- we get the 4254 * same result in the high half either way. 4255 */ 4256 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31)); 4257 tcg_gen_shri_vec(MO_64, v0, v1, imm); 4258 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32, 4259 tcgv_vec_arg(v0), tcgv_vec_arg(v0), 4260 tcgv_vec_arg(t1), 0xaa); 4261 } else { 4262 /* Otherwise we will need to use a compare vs 0 to produce 4263 * the sign-extend, shift and merge. 
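             * A sketch of the intent (illustrative only): the compare
             * yields t1 = -1 in each 64-bit lane where v1 is negative and
             * 0 elsewhere; v0 = v1 >> imm (logical) provides the low
             * 64 - imm bits, while t1 << (64 - imm) provides the sign
             * bits an arithmetic shift would have produced, and OR-ing
             * the two merges them into the sign-extended result.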
4264 */ 4265 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, 4266 tcg_constant_vec(type, MO_64, 0), v1); 4267 tcg_gen_shri_vec(MO_64, v0, v1, imm); 4268 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm); 4269 tcg_gen_or_vec(MO_64, v0, v0, t1); 4270 } 4271 tcg_temp_free_vec(t1); 4272 break; 4273 4274 default: 4275 g_assert_not_reached(); 4276 } 4277} 4278 4279static void expand_vec_rotli(TCGType type, unsigned vece, 4280 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 4281{ 4282 TCGv_vec t; 4283 4284 if (vece != MO_8 && have_avx512vbmi2) { 4285 vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece, 4286 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm); 4287 return; 4288 } 4289 4290 t = tcg_temp_new_vec(type); 4291 tcg_gen_shli_vec(vece, t, v1, imm); 4292 tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm); 4293 tcg_gen_or_vec(vece, v0, v0, t); 4294 tcg_temp_free_vec(t); 4295} 4296 4297static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0, 4298 TCGv_vec v1, TCGv_vec sh, bool right) 4299{ 4300 TCGv_vec t; 4301 4302 if (have_avx512vbmi2) { 4303 vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec, 4304 type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1), 4305 tcgv_vec_arg(v1), tcgv_vec_arg(sh)); 4306 return; 4307 } 4308 4309 t = tcg_temp_new_vec(type); 4310 tcg_gen_dupi_vec(vece, t, 8 << vece); 4311 tcg_gen_sub_vec(vece, t, t, sh); 4312 if (right) { 4313 tcg_gen_shlv_vec(vece, t, v1, t); 4314 tcg_gen_shrv_vec(vece, v0, v1, sh); 4315 } else { 4316 tcg_gen_shrv_vec(vece, t, v1, t); 4317 tcg_gen_shlv_vec(vece, v0, v1, sh); 4318 } 4319 tcg_gen_or_vec(vece, v0, v0, t); 4320 tcg_temp_free_vec(t); 4321} 4322 4323static void expand_vec_rotls(TCGType type, unsigned vece, 4324 TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh) 4325{ 4326 TCGv_vec t = tcg_temp_new_vec(type); 4327 4328 tcg_debug_assert(vece != MO_8); 4329 4330 if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) { 4331 tcg_gen_dup_i32_vec(vece, t, lsh); 4332 if (vece >= MO_32) { 4333 tcg_gen_rotlv_vec(vece, v0, v1, t); 4334 } else { 4335 expand_vec_rotv(type, vece, v0, v1, t, false); 4336 } 4337 } else { 4338 TCGv_i32 rsh = tcg_temp_new_i32(); 4339 4340 tcg_gen_neg_i32(rsh, lsh); 4341 tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1); 4342 tcg_gen_shls_vec(vece, t, v1, lsh); 4343 tcg_gen_shrs_vec(vece, v0, v1, rsh); 4344 tcg_gen_or_vec(vece, v0, v0, t); 4345 4346 tcg_temp_free_i32(rsh); 4347 } 4348 4349 tcg_temp_free_vec(t); 4350} 4351 4352static void expand_vec_mul(TCGType type, unsigned vece, 4353 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2) 4354{ 4355 TCGv_vec t1, t2, t3, t4, zero; 4356 4357 tcg_debug_assert(vece == MO_8); 4358 4359 /* 4360 * Unpack v1 bytes to words, 0 | x. 4361 * Unpack v2 bytes to words, y | 0. 4362 * This leaves the 8-bit result, x * y, with 8 bits of right padding. 4363 * Shift logical right by 8 bits to clear the high 8 bytes before 4364 * using an unsigned saturated pack. 4365 * 4366 * The difference between the V64, V128 and V256 cases is merely how 4367 * we distribute the expansion between temporaries. 
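     * Worked example for one byte pair (illustrative values): with
     * x = 0x7f and y = 0x03, the unpacks give the words 0x007f and
     * 0x0300; their 16-bit product truncates to 0x7d00, and shifting
     * right by 8 leaves 0x007d, which is (0x7f * 0x03) & 0xff, so the
     * unsigned saturating pack emits the byte 0x7d without saturating.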
4368 */ 4369 switch (type) { 4370 case TCG_TYPE_V64: 4371 t1 = tcg_temp_new_vec(TCG_TYPE_V128); 4372 t2 = tcg_temp_new_vec(TCG_TYPE_V128); 4373 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 4374 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 4375 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 4376 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 4377 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 4378 tcg_gen_mul_vec(MO_16, t1, t1, t2); 4379 tcg_gen_shri_vec(MO_16, t1, t1, 8); 4380 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, 4381 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1)); 4382 tcg_temp_free_vec(t1); 4383 tcg_temp_free_vec(t2); 4384 break; 4385 4386 case TCG_TYPE_V128: 4387 case TCG_TYPE_V256: 4388 t1 = tcg_temp_new_vec(type); 4389 t2 = tcg_temp_new_vec(type); 4390 t3 = tcg_temp_new_vec(type); 4391 t4 = tcg_temp_new_vec(type); 4392 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 4393 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 4394 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 4395 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 4396 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 4397 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 4398 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 4399 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 4400 tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 4401 tcg_gen_mul_vec(MO_16, t1, t1, t2); 4402 tcg_gen_mul_vec(MO_16, t3, t3, t4); 4403 tcg_gen_shri_vec(MO_16, t1, t1, 8); 4404 tcg_gen_shri_vec(MO_16, t3, t3, 8); 4405 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, 4406 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3)); 4407 tcg_temp_free_vec(t1); 4408 tcg_temp_free_vec(t2); 4409 tcg_temp_free_vec(t3); 4410 tcg_temp_free_vec(t4); 4411 break; 4412 4413 default: 4414 g_assert_not_reached(); 4415 } 4416} 4417 4418static TCGCond expand_vec_cond(TCGType type, unsigned vece, 4419 TCGArg *a1, TCGArg *a2, TCGCond cond) 4420{ 4421 /* 4422 * Without AVX512, there are no 64-bit unsigned comparisons. 4423 * We must bias the inputs so that they become signed. 4424 * All other swapping and inversion are handled during code generation. 4425 */ 4426 if (vece == MO_64 && !have_avx512dq && is_unsigned_cond(cond)) { 4427 TCGv_vec v1 = temp_tcgv_vec(arg_temp(*a1)); 4428 TCGv_vec v2 = temp_tcgv_vec(arg_temp(*a2)); 4429 TCGv_vec t1 = tcg_temp_new_vec(type); 4430 TCGv_vec t2 = tcg_temp_new_vec(type); 4431 TCGv_vec t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1)); 4432 4433 tcg_gen_sub_vec(vece, t1, v1, t3); 4434 tcg_gen_sub_vec(vece, t2, v2, t3); 4435 *a1 = tcgv_vec_arg(t1); 4436 *a2 = tcgv_vec_arg(t2); 4437 cond = tcg_signed_cond(cond); 4438 } 4439 return cond; 4440} 4441 4442static void expand_vec_cmp(TCGType type, unsigned vece, TCGArg a0, 4443 TCGArg a1, TCGArg a2, TCGCond cond) 4444{ 4445 cond = expand_vec_cond(type, vece, &a1, &a2, cond); 4446 /* Expand directly; do not recurse. */ 4447 vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond); 4448} 4449 4450static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGArg a0, 4451 TCGArg a1, TCGArg a2, 4452 TCGArg a3, TCGArg a4, TCGCond cond) 4453{ 4454 cond = expand_vec_cond(type, vece, &a1, &a2, cond); 4455 /* Expand directly; do not recurse. */ 4456 vec_gen_6(INDEX_op_cmpsel_vec, type, vece, a0, a1, a2, a3, a4, cond); 4457} 4458 4459void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, 4460 TCGArg a0, ...) 
4461{ 4462 va_list va; 4463 TCGArg a1, a2, a3, a4, a5; 4464 TCGv_vec v0, v1, v2; 4465 4466 va_start(va, a0); 4467 a1 = va_arg(va, TCGArg); 4468 a2 = va_arg(va, TCGArg); 4469 v0 = temp_tcgv_vec(arg_temp(a0)); 4470 v1 = temp_tcgv_vec(arg_temp(a1)); 4471 4472 switch (opc) { 4473 case INDEX_op_shli_vec: 4474 expand_vec_shi(type, vece, false, v0, v1, a2); 4475 break; 4476 case INDEX_op_shri_vec: 4477 expand_vec_shi(type, vece, true, v0, v1, a2); 4478 break; 4479 case INDEX_op_sari_vec: 4480 expand_vec_sari(type, vece, v0, v1, a2); 4481 break; 4482 4483 case INDEX_op_rotli_vec: 4484 expand_vec_rotli(type, vece, v0, v1, a2); 4485 break; 4486 4487 case INDEX_op_rotls_vec: 4488 expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2))); 4489 break; 4490 4491 case INDEX_op_rotlv_vec: 4492 v2 = temp_tcgv_vec(arg_temp(a2)); 4493 expand_vec_rotv(type, vece, v0, v1, v2, false); 4494 break; 4495 case INDEX_op_rotrv_vec: 4496 v2 = temp_tcgv_vec(arg_temp(a2)); 4497 expand_vec_rotv(type, vece, v0, v1, v2, true); 4498 break; 4499 4500 case INDEX_op_mul_vec: 4501 v2 = temp_tcgv_vec(arg_temp(a2)); 4502 expand_vec_mul(type, vece, v0, v1, v2); 4503 break; 4504 4505 case INDEX_op_cmp_vec: 4506 a3 = va_arg(va, TCGArg); 4507 expand_vec_cmp(type, vece, a0, a1, a2, a3); 4508 break; 4509 4510 case INDEX_op_cmpsel_vec: 4511 a3 = va_arg(va, TCGArg); 4512 a4 = va_arg(va, TCGArg); 4513 a5 = va_arg(va, TCGArg); 4514 expand_vec_cmpsel(type, vece, a0, a1, a2, a3, a4, a5); 4515 break; 4516 4517 default: 4518 break; 4519 } 4520 4521 va_end(va); 4522} 4523 4524static const int tcg_target_callee_save_regs[] = { 4525#if TCG_TARGET_REG_BITS == 64 4526 TCG_REG_RBP, 4527 TCG_REG_RBX, 4528#if defined(_WIN64) 4529 TCG_REG_RDI, 4530 TCG_REG_RSI, 4531#endif 4532 TCG_REG_R12, 4533 TCG_REG_R13, 4534 TCG_REG_R14, /* Currently used for the global env. */ 4535 TCG_REG_R15, 4536#else 4537 TCG_REG_EBP, /* Currently used for the global env. */ 4538 TCG_REG_EBX, 4539 TCG_REG_ESI, 4540 TCG_REG_EDI, 4541#endif 4542}; 4543 4544/* Compute frame size via macros, to share between tcg_target_qemu_prologue 4545 and tcg_register_jit. */ 4546 4547#define PUSH_SIZE \ 4548 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \ 4549 * (TCG_TARGET_REG_BITS / 8)) 4550 4551#define FRAME_SIZE \ 4552 ((PUSH_SIZE \ 4553 + TCG_STATIC_CALL_ARGS_SIZE \ 4554 + CPU_TEMP_BUF_NLONGS * sizeof(long) \ 4555 + TCG_TARGET_STACK_ALIGN - 1) \ 4556 & ~(TCG_TARGET_STACK_ALIGN - 1)) 4557 4558/* Generate global QEMU prologue and epilogue code */ 4559static void tcg_target_qemu_prologue(TCGContext *s) 4560{ 4561 int i, stack_addend; 4562 4563 /* TB prologue */ 4564 4565 /* Reserve some stack space, also for TCG temps. */ 4566 stack_addend = FRAME_SIZE - PUSH_SIZE; 4567 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE, 4568 CPU_TEMP_BUF_NLONGS * sizeof(long)); 4569 4570 /* Save all callee saved registers. */ 4571 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) { 4572 tcg_out_push(s, tcg_target_callee_save_regs[i]); 4573 } 4574 4575 if (!tcg_use_softmmu && guest_base) { 4576 int seg = setup_guest_base_seg(); 4577 if (seg != 0) { 4578 x86_guest_base.seg = seg; 4579 } else if (guest_base == (int32_t)guest_base) { 4580 x86_guest_base.ofs = guest_base; 4581 } else { 4582 assert(TCG_TARGET_REG_BITS == 64); 4583 /* Choose R12 because, as a base, it requires a SIB byte. 
*/ 4584 x86_guest_base.index = TCG_REG_R12; 4585 tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base); 4586 tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index); 4587 } 4588 } 4589 4590 if (TCG_TARGET_REG_BITS == 32) { 4591 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, 4592 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4); 4593 tcg_out_addi(s, TCG_REG_ESP, -stack_addend); 4594 /* jmp *tb. */ 4595 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP, 4596 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4 4597 + stack_addend); 4598 } else { 4599 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]); 4600 tcg_out_addi(s, TCG_REG_ESP, -stack_addend); 4601 /* jmp *tb. */ 4602 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]); 4603 } 4604 4605 /* 4606 * Return path for goto_ptr. Set return value to 0, a-la exit_tb, 4607 * and fall through to the rest of the epilogue. 4608 */ 4609 tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr); 4610 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0); 4611 4612 /* TB epilogue */ 4613 tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr); 4614 4615 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend); 4616 4617 if (have_avx2) { 4618 tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0); 4619 } 4620 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) { 4621 tcg_out_pop(s, tcg_target_callee_save_regs[i]); 4622 } 4623 tcg_out_opc(s, OPC_RET, 0, 0, 0); 4624} 4625 4626static void tcg_out_tb_start(TCGContext *s) 4627{ 4628 /* nothing to do */ 4629} 4630 4631static void tcg_out_nop_fill(tcg_insn_unit *p, int count) 4632{ 4633 memset(p, 0x90, count); 4634} 4635 4636static void tcg_target_init(TCGContext *s) 4637{ 4638 tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS; 4639 if (TCG_TARGET_REG_BITS == 64) { 4640 tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS; 4641 } 4642 if (have_avx1) { 4643 tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS; 4644 tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS; 4645 } 4646 if (have_avx2) { 4647 tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS; 4648 } 4649 4650 tcg_target_call_clobber_regs = ALL_VECTOR_REGS; 4651 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX); 4652 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX); 4653 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX); 4654 if (TCG_TARGET_REG_BITS == 64) { 4655#if !defined(_WIN64) 4656 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI); 4657 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI); 4658#endif 4659 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8); 4660 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9); 4661 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10); 4662 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11); 4663 } 4664 4665 s->reserved_regs = 0; 4666 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK); 4667 tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC); 4668#ifdef _WIN64 4669 /* These are call saved, and we don't save them, so don't use them. 
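       Reserving %xmm6-%xmm15 avoids having to save and restore them in
       the prologue/epilogue as the Win64 ABI would otherwise require,
       at the cost of leaving fewer vector registers for allocation.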
*/ 4670 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6); 4671 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7); 4672 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8); 4673 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9); 4674 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10); 4675 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11); 4676 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12); 4677 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13); 4678 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14); 4679 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15); 4680#endif 4681} 4682 4683typedef struct { 4684 DebugFrameHeader h; 4685 uint8_t fde_def_cfa[4]; 4686 uint8_t fde_reg_ofs[14]; 4687} DebugFrame; 4688 4689/* We're expecting a 2 byte uleb128 encoded value. */ 4690QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14)); 4691 4692#if !defined(__ELF__) 4693 /* Host machine without ELF. */ 4694#elif TCG_TARGET_REG_BITS == 64 4695#define ELF_HOST_MACHINE EM_X86_64 4696static const DebugFrame debug_frame = { 4697 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */ 4698 .h.cie.id = -1, 4699 .h.cie.version = 1, 4700 .h.cie.code_align = 1, 4701 .h.cie.data_align = 0x78, /* sleb128 -8 */ 4702 .h.cie.return_column = 16, 4703 4704 /* Total FDE size does not include the "len" member. */ 4705 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset), 4706 4707 .fde_def_cfa = { 4708 12, 7, /* DW_CFA_def_cfa %rsp, ... */ 4709 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */ 4710 (FRAME_SIZE >> 7) 4711 }, 4712 .fde_reg_ofs = { 4713 0x90, 1, /* DW_CFA_offset, %rip, -8 */ 4714 /* The following ordering must match tcg_target_callee_save_regs. */ 4715 0x86, 2, /* DW_CFA_offset, %rbp, -16 */ 4716 0x83, 3, /* DW_CFA_offset, %rbx, -24 */ 4717 0x8c, 4, /* DW_CFA_offset, %r12, -32 */ 4718 0x8d, 5, /* DW_CFA_offset, %r13, -40 */ 4719 0x8e, 6, /* DW_CFA_offset, %r14, -48 */ 4720 0x8f, 7, /* DW_CFA_offset, %r15, -56 */ 4721 } 4722}; 4723#else 4724#define ELF_HOST_MACHINE EM_386 4725static const DebugFrame debug_frame = { 4726 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */ 4727 .h.cie.id = -1, 4728 .h.cie.version = 1, 4729 .h.cie.code_align = 1, 4730 .h.cie.data_align = 0x7c, /* sleb128 -4 */ 4731 .h.cie.return_column = 8, 4732 4733 /* Total FDE size does not include the "len" member. */ 4734 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset), 4735 4736 .fde_def_cfa = { 4737 12, 4, /* DW_CFA_def_cfa %esp, ... */ 4738 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */ 4739 (FRAME_SIZE >> 7) 4740 }, 4741 .fde_reg_ofs = { 4742 0x88, 1, /* DW_CFA_offset, %eip, -4 */ 4743 /* The following ordering must match tcg_target_callee_save_regs. */ 4744 0x85, 2, /* DW_CFA_offset, %ebp, -8 */ 4745 0x83, 3, /* DW_CFA_offset, %ebx, -12 */ 4746 0x86, 4, /* DW_CFA_offset, %esi, -16 */ 4747 0x87, 5, /* DW_CFA_offset, %edi, -20 */ 4748 } 4749}; 4750#endif 4751 4752#if defined(ELF_HOST_MACHINE) 4753void tcg_register_jit(const void *buf, size_t buf_size) 4754{ 4755 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame)); 4756} 4757#endif 4758
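/*
 * Note on the fde_def_cfa encodings above: the DW_CFA_def_cfa offset
 * operand (the frame size) is a uleb128, 7 bits per byte with 0x80 as
 * the continuation flag.  The two bytes emitted are
 * (FRAME_SIZE & 0x7f) | 0x80 and FRAME_SIZE >> 7, which is why the
 * build-time assertion requires FRAME_SIZE < (1 << 14).  As a worked
 * example with a hypothetical FRAME_SIZE of 0x1a8, the bytes would be
 * { 0xa8, 0x03 }.
 */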