/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Used for function call generation. */
#define TCG_TARGET_STACK_ALIGN 16
#if defined(_WIN64)
#define TCG_TARGET_CALL_STACK_OFFSET 32
#else
#define TCG_TARGET_CALL_STACK_OFFSET 0
#endif
#define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_NORMAL
#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
#if defined(_WIN64)
# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_BY_REF
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_BY_VEC
#elif TCG_TARGET_REG_BITS == 64
# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_NORMAL
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_NORMAL
#else
# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_NORMAL
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_BY_REF
#endif

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};

#define TCG_TMP_VEC TCG_REG_XMM5

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};

static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
{
    switch (kind) {
    case TCG_CALL_RET_NORMAL:
        tcg_debug_assert(slot >= 0 && slot <= 1);
        return slot ? TCG_REG_EDX : TCG_REG_EAX;
#ifdef _WIN64
    case TCG_CALL_RET_BY_VEC:
        tcg_debug_assert(slot == 0);
        return TCG_REG_XMM0;
#endif
    default:
        g_assert_not_reached();
    }
}

/* Constants we accept. */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800
#define TCG_CT_CONST_TST 0x1000
#define TCG_CT_CONST_ZERO 0x2000

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS 0x0000ffffu
# define ALL_VECTOR_REGS 0xffff0000u
# define ALL_BYTEL_REGS ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS 0x000000ffu
# define ALL_VECTOR_REGS 0x00ff0000u
# define ALL_BYTEL_REGS 0x0000000fu
#endif
#define SOFTMMU_RESERVE_REGS \
    (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)

#define have_bmi2 (cpuinfo & CPUINFO_BMI2)
#define have_lzcnt (cpuinfo & CPUINFO_LZCNT)

static const tcg_insn_unit *tb_ret_addr;

static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
                        intptr_t value, intptr_t addend)
{
    value += addend;
    switch(type) {
    case R_386_PC32:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int32_t)value) {
            return false;
        }
        /* FALLTHRU */
    case R_386_32:
        tcg_patch32(code_ptr, value);
        break;
    case R_386_PC8:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int8_t)value) {
            return false;
        }
        tcg_patch8(code_ptr, value);
        break;
    default:
        g_assert_not_reached();
    }
    return true;
}

/* test if a constant matches the constraint */
static bool tcg_target_const_match(int64_t val, int ct,
                                   TCGType type, TCGCond cond, int vece)
{
    if (ct & TCG_CT_CONST) {
        return 1;
    }
    if (type == TCG_TYPE_I32) {
        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 |
                  TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) {
            return 1;
        }
    } else {
        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
            return 1;
        }
        /*
         * This will be used in combination with TCG_CT_CONST_S32,
         * so "normal" TESTQ is already matched.  Also accept:
         *    TESTQ -> TESTL (uint32_t)
         *    TESTQ -> BT (is_power_of_2)
         */
        if ((ct & TCG_CT_CONST_TST)
            && is_tst_cond(cond)
            && (val == (uint32_t)val || is_power_of_2(val))) {
            return 1;
        }
    }
    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_ZERO) && val == 0) {
        return 1;
    }
    return 0;
}

# define LOWREGMASK(x) ((x) & 7)

#define P_EXT 0x100 /* 0x0f opcode prefix */
#define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */
#define P_DATA16 0x400 /* 0x66 opcode prefix */
#define P_VEXW 0x1000 /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW P_VEXW /* Set REX.W = 1; match VEXW */
# define P_REXB_R 0x2000 /* REG field as byte register */
# define P_REXB_RM 0x4000 /* R/M field as byte register */
# define P_GS 0x8000 /* gs segment override */
#else
# define P_REXW 0
# define P_REXB_R 0
# define P_REXB_RM 0
# define P_GS 0
#endif
#define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */
#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */
#define P_VEXL 0x80000 /* Set VEX.L = 1 */
#define P_EVEX 0x100000 /* Requires EVEX encoding */
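
/*
 * For example (an illustration only, not tied to any particular caller):
 * an opcode defined below as (0x6f | P_EXT | P_SIMDF3) is emitted by
 * tcg_out_opc() as the byte sequence f3 0f 6f -- P_SIMDF3 contributes the
 * 0xf3 prefix, P_EXT the 0x0f escape, and the low 8 bits are the opcode
 * itself; any REX byte required by the register arguments is placed after
 * the legacy prefixes and before the escape.  For VEX/EVEX-encoded
 * instructions the same flags instead select the pp and m-mmmm/mm fields
 * of the prefix, see tcg_out_vex_opc() and tcg_out_evex_opc() below.
 */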
#define OPC_ARITH_EbIb (0x80)
#define OPC_ARITH_EvIz (0x81)
#define OPC_ARITH_EvIb (0x83)
#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN (0xf2 | P_EXT38)
#define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF (0xbc | P_EXT)
#define OPC_BSR (0xbd | P_EXT)
#define OPC_BSWAP (0xc8 | P_EXT)
#define OPC_CALL_Jz (0xe8)
#define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */
#define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32 (0x48)
#define OPC_IMUL_GvEv (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32 (0x40)
#define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */
#define OPC_JCC_short (0x70) /* ... plus condition code */
#define OPC_JMP_long (0xe9)
#define OPC_JMP_short (0xeb)
#define OPC_LEA (0x8d)
#define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv (0x88) /* stores, more or less */
#define OPC_MOVL_EvGv (0x89) /* stores, more or less */
#define OPC_MOVL_GvEv (0x8b) /* loads, more or less */
#define OPC_MOVB_EvIz (0xc6)
#define OPC_MOVL_EvIz (0xc7)
#define OPC_MOVB_Ib (0xb0)
#define OPC_MOVL_Iv (0xb8)
#define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL (0xbe | P_EXT)
#define OPC_MOVSWL (0xbf | P_EXT)
#define OPC_MOVSLQ (0x63 | P_REXW)
#define OPC_MOVZBL (0xb6 | P_EXT)
#define OPC_MOVZWL (0xb7 | P_EXT)
#define OPC_PABSB (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW (0xdd | P_EXT | P_DATA16)
#define OPC_PAND (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16)
#define OPC_PEXTRD (0x16 | P_EXT3A | P_DATA16)
#define OPC_PINSRD (0x22 | P_EXT3A | P_DATA16)
#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32 (0x58)
#define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32 (0x50)
#define OPC_PUSH_Iv (0x68)
#define OPC_PUSH_Ib (0x6a)
#define OPC_RET (0xc3)
#define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1 (0xd1)
#define OPC_SHIFT_Ib (0xc1)
#define OPC_SHIFT_cl (0xd3)
#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS (0xc6 | P_EXT)
#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib (0xac | P_EXT)
#define OPC_TESTB (0x84)
#define OPC_TESTL (0x85)
#define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2 (0x0b | P_EXT)
#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPBLENDMB (0x66 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPBLENDMW (0x66 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPBLENDMD (0x64 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPBLENDMQ (0x64 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPCMPB (0x3f | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPUB (0x3e | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPW (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPCMPUW (0x3e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPCMPD (0x1f | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPUD (0x1e | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPQ (0x1f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPCMPUQ (0x1e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPMOVM2B (0x28 | P_EXT38 | P_SIMDF3 | P_EVEX)
#define OPC_VPMOVM2W (0x28 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
#define OPC_VPMOVM2D (0x38 | P_EXT38 | P_SIMDF3 | P_EVEX)
#define OPC_VPMOVM2Q (0x38 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPTESTMB (0x26 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPTESTMW (0x26 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPTESTMD (0x27 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPTESTMQ (0x27 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPTESTNMB (0x26 | P_EXT38 | P_SIMDF3 | P_EVEX)
#define OPC_VPTESTNMW (0x26 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
#define OPC_VPTESTNMD (0x27 | P_EXT38 | P_SIMDF3 | P_EVEX)
#define OPC_VPTESTNMQ (0x27 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)
#define OPC_XCHG_EvGv (0x87)

#define OPC_GRP3_Eb (0xf6)
#define OPC_GRP3_Ev (0xf7)
#define OPC_GRP5 (0xff)
#define OPC_GRP14 (0x73 | P_EXT | P_DATA16)
#define OPC_GRPBT (0xba | P_EXT)

#define OPC_GRPBT_BT 4
#define OPC_GRPBT_BTS 5
#define OPC_GRPBT_BTR 6
#define OPC_GRPBT_BTC 7

/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH. */
#define ARITH_ADD 0
#define ARITH_OR 1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3. */
#define EXT3_TESTi 0
#define EXT3_NOT 2
#define EXT3_NEG 3
#define EXT3_MUL 4
#define EXT3_IMUL 5
#define EXT3_DIV 6
#define EXT3_IDIV 7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5. */
#define EXT5_INC_Ev 0
#define EXT5_DEC_Ev 1
#define EXT5_CALLN_Ev 2
#define EXT5_JMPN_Ev 4

/* Condition codes to be added to OPC_JCC_{long,short}. */
#define JCC_JMP (-1)
#define JCC_JO 0x0
#define JCC_JNO 0x1
#define JCC_JB 0x2
#define JCC_JAE 0x3
#define JCC_JE 0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA 0x7
#define JCC_JS 0x8
#define JCC_JNS 0x9
#define JCC_JP 0xa
#define JCC_JNP 0xb
#define JCC_JL 0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG 0xf

static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
    [TCG_COND_TSTEQ] = JCC_JE,
    [TCG_COND_TSTNE] = JCC_JNE,
};

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
    int rex;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    if (opc & P_DATA16) {
        /* We should never be asking for both 16 and 64-bit operation.  */
        tcg_debug_assert((opc & P_REXW) == 0);
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    rex = 0;
    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
    rex |= (r & 8) >> 1;                /* REX.R */
    rex |= (x & 8) >> 2;                /* REX.X */
    rex |= (rm & 8) >> 3;               /* REX.B */

    /* P_REXB_{R,RM} indicates that the given register is the low byte.
       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
       as otherwise the encoding indicates %[abcd]h.  Note that the values
       that are ORed in merely indicate that the REX byte must be present;
       those bits get discarded in output.  */
    rex |= opc & (r >= 4 ? P_REXB_R : 0);
    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);

    if (rex) {
        tcg_out8(s, (uint8_t)(rex | 0x40));
    }

    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }

    tcg_out8(s, opc);
}
#else
static void tcg_out_opc(TCGContext *s, int opc)
{
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }
    tcg_out8(s, opc);
}
/* Discard the register arguments to tcg_out_opc early, so as not to penalize
   the 32-bit compilation paths.  This method works with all versions of gcc,
   whereas relying on optimization may not be able to exclude them.  */
#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
#endif

static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
{
    tcg_out_opc(s, opc, r, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
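
/*
 * As a concrete illustration of the two helpers above (example chosen for
 * clarity, not taken from an existing caller):
 *   tcg_out_modrm(s, OPC_MOVL_GvEv + P_REXW, TCG_REG_RAX, TCG_REG_R12)
 * emits 49 8b c4, i.e. "movq %r12, %rax".  REX.W for the 64-bit operation
 * plus REX.B for %r12 gives 0x49, 0x8b is the opcode, and the ModRM byte
 * 0xc4 selects mod=3, reg=%rax, rm=%r12 -- only the low three register
 * bits appear in ModRM, the fourth bit travels in the REX prefix.
 */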
static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
                            int rm, int index)
{
    int tmp;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    /* Use the two byte form if possible, which cannot encode
       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
        && ((rm | index) & 8) == 0) {
        /* Two byte VEX prefix.  */
        tcg_out8(s, 0xc5);

        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
    } else {
        /* Three byte VEX prefix.  */
        tcg_out8(s, 0xc4);

        /* VEX.m-mmmm */
        if (opc & P_EXT3A) {
            tmp = 3;
        } else if (opc & P_EXT38) {
            tmp = 2;
        } else if (opc & P_EXT) {
            tmp = 1;
        } else {
            g_assert_not_reached();
        }
        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
        tcg_out8(s, tmp);

        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
    }

    tmp |= (opc & P_VEXL ? 0x04 : 0);          /* VEX.L */
    /* VEX.pp */
    if (opc & P_DATA16) {
        tmp |= 1;                              /* 0x66 */
    } else if (opc & P_SIMDF3) {
        tmp |= 2;                              /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        tmp |= 3;                              /* 0xf2 */
    }
    tmp |= (~v & 15) << 3;                     /* VEX.vvvv */
    tcg_out8(s, tmp);
    tcg_out8(s, opc);
}

static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
                             int rm, int index, int aaa, bool z)
{
    /* The entire 4-byte evex prefix; with R' and V' set. */
    uint32_t p = 0x08041062;
    int mm, pp;

    tcg_debug_assert(have_avx512vl);

    /* EVEX.mm */
    if (opc & P_EXT3A) {
        mm = 3;
    } else if (opc & P_EXT38) {
        mm = 2;
    } else if (opc & P_EXT) {
        mm = 1;
    } else {
        g_assert_not_reached();
    }

    /* EVEX.pp */
    if (opc & P_DATA16) {
        pp = 1;                                /* 0x66 */
    } else if (opc & P_SIMDF3) {
        pp = 2;                                /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        pp = 3;                                /* 0xf2 */
    } else {
        pp = 0;
    }

    p = deposit32(p, 8, 2, mm);
    p = deposit32(p, 13, 1, (rm & 8) == 0);    /* EVEX.RXB.B */
    p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */
    p = deposit32(p, 15, 1, (r & 8) == 0);     /* EVEX.RXB.R */
    p = deposit32(p, 16, 2, pp);
    p = deposit32(p, 19, 4, ~v);
    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
    p = deposit32(p, 24, 3, aaa);
    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
    p = deposit32(p, 31, 1, z);

    tcg_out32(s, p);
    tcg_out8(s, opc);
}

static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
{
    if (opc & P_EVEX) {
        tcg_out_evex_opc(s, opc, r, v, rm, 0, 0, false);
    } else {
        tcg_out_vex_opc(s, opc, r, v, rm, 0);
    }
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}

static void tcg_out_vex_modrm_type(TCGContext *s, int opc,
                                   int r, int v, int rm, TCGType type)
{
    if (type == TCG_TYPE_V256) {
        opc |= P_VEXL;
    }
    tcg_out_vex_modrm(s, opc, r, v, rm);
}

static void tcg_out_evex_modrm_type(TCGContext *s, int opc, int r, int v,
                                    int rm, int aaa, bool z, TCGType type)
{
    if (type == TCG_TYPE_V256) {
        opc |= P_VEXL;
    }
    tcg_out_evex_opc(s, opc, r, v, rm, 0, aaa, z);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}

/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
   We handle either RM or INDEX missing with a negative value.  In 64-bit
   mode for absolute addresses, ~RM is the size of the immediate operand
   that will follow the instruction.  */

static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
                               int shift, intptr_t offset)
{
    int mod, len;

    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding.  */
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
            if (disp == (int32_t)disp) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                tcg_out32(s, disp);
                return;
            }

            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing.  */
            if (offset == (int32_t)offset) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);
                return;
            }

            /* ??? The memory isn't directly addressable.  */
            g_assert_not_reached();
        } else {
            /* Absolute address.  */
            tcg_out8(s, (r << 3) | 5);
            tcg_out32(s, offset);
            return;
        }
    }

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing.  */
    if (rm < 0) {
        mod = 0, len = 4, rm = 5;
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
        mod = 0, len = 0;
    } else if (offset == (int8_t)offset) {
        mod = 0x40, len = 1;
    } else {
        mod = 0x80, len = 4;
    }

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form.  */
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
        /* Single byte MODRM format.  */
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    } else {
        /* Two byte MODRM+SIB format.  */

        /* Note that the encoding that would place %esp into the index
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index.  */
        if (index < 0) {
            index = 4;
        } else {
            tcg_debug_assert(index != TCG_REG_ESP);
        }

        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
    }

    if (len == 1) {
        tcg_out8(s, offset);
    } else if (len == 4) {
        tcg_out32(s, offset);
    }
}
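
/*
 * Worked example of the address-mode encoder above (illustration only,
 * not a call made elsewhere in this file): after tcg_out_opc() has emitted
 * the 0x8b opcode for OPC_MOVL_GvEv,
 *   tcg_out_sib_offset(s, TCG_REG_EAX, TCG_REG_EBX, -1, 0, 8)
 * appends 43 08, giving 8b 43 08 overall, i.e. "movl 8(%ebx), %eax":
 * ModRM 0x43 encodes mod=1 (disp8 follows), reg=%eax, rm=%ebx, and the
 * single displacement byte 0x08 completes the operand.
 */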
static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                                     int index, int shift, intptr_t offset)
{
    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
                                         int rm, int index, int shift,
                                         intptr_t offset)
{
    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

/* A simplification of the above with no index or shift. */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
{
    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}

static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
                                            int v, int rm, intptr_t offset)
{
    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
}

/* Output an opcode with an expected reference to the constant pool. */
static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_opc(s, opc, r, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit. */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}

/* Output an opcode with an expected reference to the constant pool. */
static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit. */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}
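
/*
 * The zero displacement emitted by the two pool helpers above is only a
 * placeholder: callers such as tcg_out_dupi_vec() and tcg_out_movi_vec()
 * register it with new_pool_label() (R_386_PC32 for 64-bit hosts,
 * R_386_32 for 32-bit hosts), and patch_reloc() rewrites the 32-bit field
 * once the constant pool address is known.
 */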
/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi. */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
    /* Propagate an opcode prefix, such as P_REXW. */
    int ext = subop & ~0x7;
    subop &= 0x7;

    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
}

static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
    int rexw = 0;

    if (arg == ret) {
        return true;
    }
    switch (type) {
    case TCG_TYPE_I64:
        rexw = P_REXW;
        /* fallthru */
    case TCG_TYPE_I32:
        if (ret < 16) {
            if (arg < 16) {
                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
            }
        } else {
            if (arg < 16) {
                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
            }
        }
        break;

    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
        break;

    default:
        g_assert_not_reached();
    }
    return true;
}

static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};

static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg r, TCGReg a)
{
    if (have_avx2) {
        tcg_out_vex_modrm_type(s, avx2_dup_insn[vece], r, 0, a, type);
    } else {
        switch (vece) {
        case MO_8:
            /* ??? With zero in a register, use PSHUFB. */
            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_16:
            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_32:
            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
            /* imm8 operand: all output lanes selected from input lane 0. */
            tcg_out8(s, 0);
            break;
        case MO_64:
            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}
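
/*
 * In the non-AVX2 path above, a narrow dup is widened one step at a time:
 * MO_8 falls through MO_16 and MO_32, so the byte is first interleaved
 * with itself (punpcklbw), then the resulting word (punpcklwd), and
 * finally pshufd $0 broadcasts element 0 to every 32-bit lane.  MO_64
 * instead uses punpcklqdq directly.
 */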
static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg r, TCGReg base, intptr_t offset)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
                                 r, 0, base, offset);
    } else {
        switch (vece) {
        case MO_64:
            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
            break;
        case MO_32:
            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
            break;
        case MO_16:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        case MO_8:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg ret, int64_t arg)
{
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);

    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
        return;
    }

    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
        if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
        }
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    } else {
        if (type == TCG_TYPE_V64) {
            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
        } else if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
        }
        if (TCG_TARGET_REG_BITS == 64) {
            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
        } else {
            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
        }
    }
}

static void tcg_out_movi_vec(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
        return;
    }

    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
    if (TCG_TARGET_REG_BITS == 64) {
        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    } else {
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    }
}

static void tcg_out_movi_int(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    tcg_target_long diff;

    if (arg == 0) {
        tgen_arithr(s, ARITH_XOR, ret, ret);
        return;
    }
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
        tcg_out32(s, arg);
        return;
    }
    if (arg == (int32_t)arg) {
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
        tcg_out32(s, arg);
        return;
    }

    /* Try a 7 byte pc-relative lea before the 10 byte movq. */
    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}

static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
#endif
        if (ret < 16) {
            tcg_out_movi_int(s, type, ret, arg);
        } else {
            tcg_out_movi_vec(s, type, ret, arg);
        }
        break;
    default:
        g_assert_not_reached();
    }
}
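
/*
 * Size comparison of the tcg_out_movi_int() strategies above, as an
 * illustrative example: tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_RAX,
 * 0x12345678) fits in an unsigned 32 bits, so it emits the 5-byte
 * b8 78 56 34 12 ("movl $0x12345678, %eax") and relies on the implicit
 * zero extension of 32-bit results.  A negative value fitting in a signed
 * 32 bits takes the 7-byte sign-extending movq $imm32 form, and only a
 * full 64-bit constant needs the 10-byte movabsq, or the 7-byte
 * rip-relative lea when the value is within +/- 2GB of the code.
 */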
static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
    return true;
}

static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
                             tcg_target_long imm)
{
    /* This function is only used for passing structs by reference. */
    tcg_debug_assert(imm == (int32_t)imm);
    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        g_assert_not_reached();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we only need care for
       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the sse insn. */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}

static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         *
         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
         * for _WIN64, which must have SSE2 but may not have AVX.
         */
        tcg_debug_assert(arg >= 16);
        if (have_avx1) {
            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
        }
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    } else if (type != TCG_TYPE_I32) {
        return false;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}

static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16. */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}

static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}

static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
{
    if (TCG_TARGET_REG_BITS == 32 && src >= 4) {
        tcg_out_mov(s, TCG_TYPE_I32, dest, src);
        if (dest >= 4) {
            tcg_out_modrm(s, OPC_ARITH_EvIz, ARITH_AND, dest);
            tcg_out32(s, 0xff);
            return;
        }
        src = dest;
    }
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}

static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;

    if (TCG_TARGET_REG_BITS == 32 && src >= 4) {
        tcg_out_mov(s, TCG_TYPE_I32, dest, src);
        if (dest >= 4) {
            tcg_out_shifti(s, SHIFT_SHL, dest, 24);
            tcg_out_shifti(s, SHIFT_SAR, dest, 24);
            return;
        }
        src = dest;
    }
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
}

static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* movzwl */
    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
}

static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    /* movsw[lq] */
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
}

static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* 32-bit mov zero extends. */
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
}
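
/*
 * For instance (illustrative call, not taken from an existing caller):
 * tcg_out_ext32u(s, TCG_REG_RAX, TCG_REG_RCX) emits the plain 32-bit move
 * 8b c1 ("movl %ecx, %eax"); the x86-64 rule that a 32-bit destination
 * zero-extends into bits 63:32 makes an explicit mask unnecessary.
 */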
static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
}

static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32s(s, dest, src);
}

static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    if (dest != src) {
        tcg_out_ext32u(s, dest, src);
    }
}

static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32u(s, dest, src);
}

static inline void tcg_out_bswap64(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
}

static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64) {
        rexw = c & -8;
        c &= 7;
    }

    switch (c) {
    case ARITH_ADD:
    case ARITH_SUB:
        if (!cf) {
            /*
             * ??? While INC is 2 bytes shorter than ADDL $1, it also induces
             * partial flags update stalls on Pentium4 and is not recommended
             * by current Intel optimization manuals.
             */
            if (val == 1 || val == -1) {
                int is_inc = (c == ARITH_ADD) ^ (val < 0);
                if (TCG_TARGET_REG_BITS == 64) {
                    /*
                     * The single-byte increment encodings are re-tasked
                     * as the REX prefixes.  Use the MODRM encoding.
                     */
                    tcg_out_modrm(s, OPC_GRP5 + rexw,
                                  (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
                } else {
                    tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
                }
                return;
            }
            if (val == 128) {
                /*
                 * Facilitate using an 8-bit immediate.  Carry is inverted
                 * by this transformation, so do it only if cf == 0.
                 */
                c ^= ARITH_ADD ^ ARITH_SUB;
                val = -128;
            }
        }
        break;

    case ARITH_AND:
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation. */
                rexw = 0;
            }
        }
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
        break;

    case ARITH_OR:
    case ARITH_XOR:
        if (val >= 0x80 && val <= 0xff
            && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
            tcg_out8(s, val);
            return;
        }
        break;
    }

    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    g_assert_not_reached();
}

static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
{
    if (val != 0) {
        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
    }
}
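
/*
 * Example of the immediate-size selection above (illustration only):
 * tcg_out_addi(s, TCG_REG_RAX, 8) reaches tgen_arithi() with
 * ARITH_ADD + P_REXW and an imm8-representable value, so it emits the
 * 4-byte form 48 83 c0 08 ("addq $8, %rax") instead of the 7-byte imm32
 * encoding 48 81 c0 08 00 00 00.
 */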
/* Set SMALL to force a short forward branch. */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
{
    int32_t val, val1;

    if (l->has_value) {
        val = tcg_pcrel_diff(s, l->u.value_ptr);
        val1 = val - 2;
        if ((int8_t)val1 == val1) {
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_short);
            } else {
                tcg_out8(s, OPC_JCC_short + opc);
            }
            tcg_out8(s, val1);
        } else {
            tcg_debug_assert(!small);
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_long);
                tcg_out32(s, val - 5);
            } else {
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
                tcg_out32(s, val - 6);
            }
        }
    } else if (small) {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_short);
        } else {
            tcg_out8(s, OPC_JCC_short + opc);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
        s->code_ptr += 1;
    } else {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_long);
        } else {
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
        s->code_ptr += 4;
    }
}
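
/*
 * The -2, -5 and -6 adjustments above account for instruction lengths,
 * since the branch displacement is relative to the end of the branch:
 * a short jmp/jcc is 2 bytes (eb or 7x plus rel8), a long jmp is 5 bytes
 * (e9 plus rel32), and a long jcc is 6 bytes (0f 8x plus rel32).
 */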
static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1,
                       TCGArg arg2, int const_arg2, int rexw)
{
    int jz, js;

    if (!is_tst_cond(cond)) {
        if (!const_arg2) {
            tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
        } else if (arg2 == 0) {
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tcg_debug_assert(!rexw || arg2 == (int32_t)arg2);
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
        return tcg_cond_to_jcc[cond];
    }

    jz = tcg_cond_to_jcc[cond];
    js = (cond == TCG_COND_TSTNE ? JCC_JS : JCC_JNS);

    if (!const_arg2) {
        tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2);
        return jz;
    }

    if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) {
        if (arg2 == 0x80) {
            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
            return js;
        }
        if (arg2 == 0xff) {
            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
            return jz;
        }
        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1);
        tcg_out8(s, arg2);
        return jz;
    }

    if ((arg2 & ~0xff00) == 0 && arg1 < 4) {
        if (arg2 == 0x8000) {
            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
            return js;
        }
        if (arg2 == 0xff00) {
            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
            return jz;
        }
        tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4);
        tcg_out8(s, arg2 >> 8);
        return jz;
    }

    if (arg2 == 0xffff) {
        tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1);
        return jz;
    }
    if (arg2 == 0xffffffffu) {
        tcg_out_modrm(s, OPC_TESTL, arg1, arg1);
        return jz;
    }

    if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) {
        int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE);
        int sh = ctz64(arg2);

        rexw = (sh & 32 ? P_REXW : 0);
        if ((sh & 31) == 31) {
            tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1);
            return js;
        } else {
            tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1);
            tcg_out8(s, sh);
            return jc;
        }
    }

    if (rexw) {
        if (arg2 == (uint32_t)arg2) {
            rexw = 0;
        } else {
            tcg_debug_assert(arg2 == (int32_t)arg2);
        }
    }
    tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1);
    tcg_out32(s, arg2);
    return jz;
}

static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
                           TCGArg arg1, TCGArg arg2, int const_arg2,
                           TCGLabel *label, bool small)
{
    int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw);
    tcg_out_jxx(s, jcc, label, small);
}

static void tgen_brcond(TCGContext *s, TCGType type, TCGCond cond,
                        TCGReg arg1, TCGReg arg2, TCGLabel *label)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    tcg_out_brcond(s, rexw, cond, arg1, arg2, false, label, false);
}

static void tgen_brcondi(TCGContext *s, TCGType type, TCGCond cond,
                         TCGReg arg1, tcg_target_long arg2, TCGLabel *label)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    tcg_out_brcond(s, rexw, cond, arg1, arg2, true, label, false);
}

static const TCGOutOpBrcond outop_brcond = {
    .base.static_constraint = C_O0_I2(r, reT),
    .out_rr = tgen_brcond,
    .out_ri = tgen_brcondi,
};

#if TCG_TARGET_REG_BITS == 32
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, bool small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);
    TCGCond cond = args[4];

    switch (cond) {
    case TCG_COND_EQ:
    case TCG_COND_TSTEQ:
        tcg_out_brcond(s, 0, tcg_invert_cond(cond),
                       args[0], args[2], const_args[2], label_next, 1);
        tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
                       label_this, small);
        break;

    case TCG_COND_NE:
    case TCG_COND_TSTNE:
        tcg_out_brcond(s, 0, cond, args[0], args[2], const_args[2],
                       label_this, small);
        tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
                       label_this, small);
        break;

    default:
        tcg_out_brcond(s, 0, tcg_high_cond(cond), args[1],
                       args[3], const_args[3], label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, tcg_unsigned_cond(cond), args[0],
                       args[2], const_args[2], label_this, small);
        break;
    }
    tcg_out_label(s, label_next);
}
#endif

static void tcg_out_setcond(TCGContext *s, TCGType type, TCGCond cond,
                            TCGReg dest, TCGReg arg1, TCGArg arg2,
                            bool const_arg2, bool neg)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    int cmp_rexw = rexw;
    bool inv = false;
    bool cleared;
    int jcc;

    switch (cond) {
    case TCG_COND_NE:
        inv = true;
        /* fall through */
    case TCG_COND_EQ:
        /* If arg2 is 0, convert to LTU/GEU vs 1. */
        if (const_arg2 && arg2 == 0) {
            arg2 = 1;
            goto do_ltu;
        }
        break;

    case TCG_COND_TSTNE:
        inv = true;
        /* fall through */
    case TCG_COND_TSTEQ:
        /* If arg2 is -1, convert to LTU/GEU vs 1. */
        if (const_arg2 && arg2 == 0xffffffffu) {
            arg2 = 1;
            cmp_rexw = 0;
            goto do_ltu;
        }
        break;

    case TCG_COND_LEU:
        inv = true;
        /* fall through */
    case TCG_COND_GTU:
        /* If arg2 is a register, swap for LTU/GEU. */
        if (!const_arg2) {
            TCGReg t = arg1;
            arg1 = arg2;
            arg2 = t;
            goto do_ltu;
        }
        break;

    case TCG_COND_GEU:
        inv = true;
        /* fall through */
    case TCG_COND_LTU:
    do_ltu:
        /*
         * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
         * We can then use NEG or INC to produce the desired result.
         * This is always smaller than the SETCC expansion.
         */
        tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw);

        /* X - X - C = -C = (C ? -1 : 0) */
        tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
        if (inv && neg) {
            /* ~(C ? -1 : 0) = (C ? 0 : -1) */
            tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
        } else if (inv) {
            /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
            tgen_arithi(s, ARITH_ADD, dest, 1, 0);
        } else if (!neg) {
            /* -(C ? -1 : 0) = (C ? 1 : 0) */
            tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
        }
        return;

    case TCG_COND_GE:
        inv = true;
        /* fall through */
    case TCG_COND_LT:
        /* If arg2 is 0, extract the sign bit. */
        if (const_arg2 && arg2 == 0) {
            tcg_out_mov(s, type, dest, arg1);
            if (inv) {
                tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
            }
            tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
                           dest, rexw ? 63 : 31);
            return;
        }
        break;

    default:
        break;
    }

    /*
     * If dest does not overlap the inputs, clearing it first is preferred.
     * The XOR breaks any false dependency for the low-byte write to dest,
     * and is also one byte smaller than MOVZBL.
     */
    cleared = false;
    if (dest != arg1 && (const_arg2 || dest != arg2)) {
        tgen_arithr(s, ARITH_XOR, dest, dest);
        cleared = true;
    }

    jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw);
    tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest);

    if (!cleared) {
        tcg_out_ext8u(s, dest, dest);
    }
    if (neg) {
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
    }
}

static void tgen_setcond(TCGContext *s, TCGType type, TCGCond cond,
                         TCGReg dest, TCGReg arg1, TCGReg arg2)
{
    tcg_out_setcond(s, type, cond, dest, arg1, arg2, false, false);
}

static void tgen_setcondi(TCGContext *s, TCGType type, TCGCond cond,
                          TCGReg dest, TCGReg arg1, tcg_target_long arg2)
{
    tcg_out_setcond(s, type, cond, dest, arg1, arg2, true, false);
}

static const TCGOutOpSetcond outop_setcond = {
    .base.static_constraint = C_O1_I2(q, r, reT),
    .out_rrr = tgen_setcond,
    .out_rri = tgen_setcondi,
};

static void tgen_negsetcond(TCGContext *s, TCGType type, TCGCond cond,
                            TCGReg dest, TCGReg arg1, TCGReg arg2)
{
    tcg_out_setcond(s, type, cond, dest, arg1, arg2, false, true);
}

static void tgen_negsetcondi(TCGContext *s, TCGType type, TCGCond cond,
                             TCGReg dest, TCGReg arg1, tcg_target_long arg2)
{
    tcg_out_setcond(s, type, cond, dest, arg1, arg2, true, true);
}

static const TCGOutOpSetcond outop_negsetcond = {
    .base.static_constraint = C_O1_I2(q, r, reT),
    .out_rrr = tgen_negsetcond,
    .out_rri = tgen_negsetcondi,
};

#if TCG_TARGET_REG_BITS == 32
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg new_args[6];
    TCGLabel *label_true, *label_over;

    memcpy(new_args, args+1, 5*sizeof(TCGArg));

    if (args[0] == args[1] || args[0] == args[2]
        || (!const_args[3] && args[0] == args[3])
        || (!const_args[4] && args[0] == args[4])) {
        /* When the destination overlaps with one of the argument
           registers, don't do anything tricky. */
        label_true = gen_new_label();
        label_over = gen_new_label();

        new_args[5] = label_arg(label_true);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
        tcg_out_jxx(s, JCC_JMP, label_over, 1);
        tcg_out_label(s, label_true);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
        tcg_out_label(s, label_over);
    } else {
        /* When the destination does not overlap one of the arguments,
           clear the destination first, jump if cond false, and emit an
           increment in the true case.  This results in smaller code. */

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        new_args[4] = tcg_invert_cond(new_args[4]);
        new_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over);
    }
}
#endif

static void tcg_out_cmov(TCGContext *s, int jcc, int rexw,
                         TCGReg dest, TCGReg v1)
{
    tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1);
}

static void tgen_movcond(TCGContext *s, TCGType type, TCGCond cond,
                         TCGReg dest, TCGReg c1, TCGArg c2, bool const_c2,
                         TCGArg vt, bool const_vt,
                         TCGArg vf, bool const_vf)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
0 : P_REXW; 1909 int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw); 1910 tcg_out_cmov(s, jcc, rexw, dest, vt); 1911} 1912 1913static const TCGOutOpMovcond outop_movcond = { 1914 .base.static_constraint = C_O1_I4(r, r, reT, r, 0), 1915 .out = tgen_movcond, 1916}; 1917 1918static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest) 1919{ 1920 intptr_t disp = tcg_pcrel_diff(s, dest) - 5; 1921 1922 if (disp == (int32_t)disp) { 1923 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0); 1924 tcg_out32(s, disp); 1925 } else { 1926 /* rip-relative addressing into the constant pool. 1927 This is 6 + 8 = 14 bytes, as compared to using an 1928 immediate load 10 + 6 = 16 bytes, plus we may 1929 be able to re-use the pool constant for more calls. */ 1930 tcg_out_opc(s, OPC_GRP5, 0, 0, 0); 1931 tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5); 1932 new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4); 1933 tcg_out32(s, 0); 1934 } 1935} 1936 1937static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest, 1938 const TCGHelperInfo *info) 1939{ 1940 tcg_out_branch(s, 1, dest); 1941 1942#ifndef _WIN32 1943 if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) { 1944 /* 1945 * The sysv i386 abi for struct return places a reference as the 1946 * first argument on the stack, and pops that argument with the 1947 * return statement. Since we want to retain the aligned stack 1948 * pointer for the callee, we do not want to actually push that 1949 * argument before the call but rely on the normal store to the 1950 * stack slot. But we do need to compensate for the pop in order 1951 * to reset our correct stack pointer value. 1952 * Pushing a garbage value back onto the stack is quickest. 1953 */ 1954 tcg_out_push(s, TCG_REG_EAX); 1955 } 1956#endif 1957} 1958 1959static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest) 1960{ 1961 tcg_out_branch(s, 0, dest); 1962} 1963 1964static void tcg_out_nopn(TCGContext *s, int n) 1965{ 1966 int i; 1967 /* Emit 1 or 2 operand size prefixes for the standard one byte nop, 1968 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the 1969 * duplicate prefix, and all of the interesting recent cores can 1970 * decode and discard the duplicates in a single cycle. 1971 */ 1972 tcg_debug_assert(n >= 1); 1973 for (i = 1; i < n; ++i) { 1974 tcg_out8(s, 0x66); 1975 } 1976 tcg_out8(s, 0x90); 1977} 1978 1979typedef struct { 1980 TCGReg base; 1981 int index; 1982 int ofs; 1983 int seg; 1984 TCGAtomAlign aa; 1985} HostAddress; 1986 1987bool tcg_target_has_memory_bswap(MemOp memop) 1988{ 1989 TCGAtomAlign aa; 1990 1991 if (!have_movbe) { 1992 return false; 1993 } 1994 if ((memop & MO_SIZE) < MO_128) { 1995 return true; 1996 } 1997 1998 /* 1999 * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA, 2000 * but do allow a pair of 64-bit operations, i.e. MOVBEQ. 2001 */ 2002 aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true); 2003 return aa.atom < MO_128; 2004} 2005 2006/* 2007 * Because i686 has no register parameters and because x86_64 has xchg 2008 * to handle addr/data register overlap, we have placed all input arguments 2009 * before we might need a scratch reg. 2010 * 2011 * Even then, a scratch is only needed for l->raddr. Rather than expose 2012 * a general-purpose scratch when we don't actually know it's available, 2013 * use the ra_gen hook to load into RAX if needed.
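 * (As an explanatory note: RAX is a safe fallback because it is
 * call-clobbered and is not an argument register in either the SysV or
 * Win64 calling convention, so loading l->raddr into it cannot disturb
 * helper arguments that have already been set up.)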
2014 */ 2015#if TCG_TARGET_REG_BITS == 64 2016static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg) 2017{ 2018 if (arg < 0) { 2019 arg = TCG_REG_RAX; 2020 } 2021 tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr); 2022 return arg; 2023} 2024static const TCGLdstHelperParam ldst_helper_param = { 2025 .ra_gen = ldst_ra_gen 2026}; 2027#else 2028static const TCGLdstHelperParam ldst_helper_param = { }; 2029#endif 2030 2031static void tcg_out_vec_to_pair(TCGContext *s, TCGType type, 2032 TCGReg l, TCGReg h, TCGReg v) 2033{ 2034 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2035 2036 /* vpmov{d,q} %v, %l */ 2037 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l); 2038 /* vpextr{d,q} $1, %v, %h */ 2039 tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h); 2040 tcg_out8(s, 1); 2041} 2042 2043static void tcg_out_pair_to_vec(TCGContext *s, TCGType type, 2044 TCGReg v, TCGReg l, TCGReg h) 2045{ 2046 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2047 2048 /* vmov{d,q} %l, %v */ 2049 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l); 2050 /* vpinsr{d,q} $1, %h, %v, %v */ 2051 tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h); 2052 tcg_out8(s, 1); 2053} 2054 2055/* 2056 * Generate code for the slow path for a load at the end of block 2057 */ 2058static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2059{ 2060 MemOp opc = get_memop(l->oi); 2061 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2062 2063 /* resolve label address */ 2064 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2065 if (label_ptr[1]) { 2066 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2067 } 2068 2069 tcg_out_ld_helper_args(s, l, &ldst_helper_param); 2070 tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]); 2071 tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param); 2072 2073 tcg_out_jmp(s, l->raddr); 2074 return true; 2075} 2076 2077/* 2078 * Generate code for the slow path for a store at the end of block 2079 */ 2080static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2081{ 2082 MemOp opc = get_memop(l->oi); 2083 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2084 2085 /* resolve label address */ 2086 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2087 if (label_ptr[1]) { 2088 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2089 } 2090 2091 tcg_out_st_helper_args(s, l, &ldst_helper_param); 2092 tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]); 2093 2094 tcg_out_jmp(s, l->raddr); 2095 return true; 2096} 2097 2098#ifdef CONFIG_USER_ONLY 2099static HostAddress x86_guest_base = { 2100 .index = -1 2101}; 2102 2103#if defined(__x86_64__) && defined(__linux__) 2104# include <asm/prctl.h> 2105# include <sys/prctl.h> 2106int arch_prctl(int code, unsigned long addr); 2107static inline int setup_guest_base_seg(void) 2108{ 2109 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) { 2110 return P_GS; 2111 } 2112 return 0; 2113} 2114#define setup_guest_base_seg setup_guest_base_seg 2115#elif defined(__x86_64__) && \ 2116 (defined (__FreeBSD__) || defined (__FreeBSD_kernel__)) 2117# include <machine/sysarch.h> 2118static inline int setup_guest_base_seg(void) 2119{ 2120 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) { 2121 return P_GS; 2122 } 2123 return 0; 2124} 2125#define setup_guest_base_seg setup_guest_base_seg 2126#endif 2127#else 2128# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; })) 2129#endif /* CONFIG_USER_ONLY */ 2130#ifndef setup_guest_base_seg 2131# define setup_guest_base_seg() 0 2132#endif 2133 
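/*
 * Note on the user-only fast path above: when a segment base can be pointed
 * at guest_base (%gs via arch_prctl or sysarch), the returned P_GS prefix
 * lets guest accesses fold guest_base into the address via the segment
 * base, so no explicit add is needed when forming the host address.
 */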
2134#define MIN_TLB_MASK_TABLE_OFS INT_MIN 2135 2136/* 2137 * For softmmu, perform the TLB load and compare. 2138 * For useronly, perform any required alignment tests. 2139 * In both cases, return a TCGLabelQemuLdst structure if the slow path 2140 * is required and fill in @h with the host address for the fast path. 2141 */ 2142static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, 2143 TCGReg addr, MemOpIdx oi, bool is_ld) 2144{ 2145 TCGLabelQemuLdst *ldst = NULL; 2146 MemOp opc = get_memop(oi); 2147 MemOp s_bits = opc & MO_SIZE; 2148 unsigned a_mask; 2149 2150 if (tcg_use_softmmu) { 2151 h->index = TCG_REG_L0; 2152 h->ofs = 0; 2153 h->seg = 0; 2154 } else { 2155 *h = x86_guest_base; 2156 } 2157 h->base = addr; 2158 h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128); 2159 a_mask = (1 << h->aa.align) - 1; 2160 2161 if (tcg_use_softmmu) { 2162 int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read) 2163 : offsetof(CPUTLBEntry, addr_write); 2164 TCGType ttype = TCG_TYPE_I32; 2165 TCGType tlbtype = TCG_TYPE_I32; 2166 int trexw = 0, hrexw = 0, tlbrexw = 0; 2167 unsigned mem_index = get_mmuidx(oi); 2168 unsigned s_mask = (1 << s_bits) - 1; 2169 int fast_ofs = tlb_mask_table_ofs(s, mem_index); 2170 int tlb_mask; 2171 2172 ldst = new_ldst_label(s); 2173 ldst->is_ld = is_ld; 2174 ldst->oi = oi; 2175 ldst->addr_reg = addr; 2176 2177 if (TCG_TARGET_REG_BITS == 64) { 2178 ttype = s->addr_type; 2179 trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW); 2180 if (TCG_TYPE_PTR == TCG_TYPE_I64) { 2181 hrexw = P_REXW; 2182 if (s->page_bits + s->tlb_dyn_max_bits > 32) { 2183 tlbtype = TCG_TYPE_I64; 2184 tlbrexw = P_REXW; 2185 } 2186 } 2187 } 2188 2189 tcg_out_mov(s, tlbtype, TCG_REG_L0, addr); 2190 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0, 2191 s->page_bits - CPU_TLB_ENTRY_BITS); 2192 2193 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0, 2194 fast_ofs + offsetof(CPUTLBDescFast, mask)); 2195 2196 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0, 2197 fast_ofs + offsetof(CPUTLBDescFast, table)); 2198 2199 /* 2200 * If the required alignment is at least as large as the access, 2201 * simply copy the address and mask. For lesser alignments, 2202 * check that we don't cross pages for the complete access. 2203 */ 2204 if (a_mask >= s_mask) { 2205 tcg_out_mov(s, ttype, TCG_REG_L1, addr); 2206 } else { 2207 tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1, 2208 addr, s_mask - a_mask); 2209 } 2210 tlb_mask = s->page_mask | a_mask; 2211 tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0); 2212 2213 /* cmp 0(TCG_REG_L0), TCG_REG_L1 */ 2214 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, 2215 TCG_REG_L1, TCG_REG_L0, cmp_ofs); 2216 2217 /* jne slow_path */ 2218 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2219 ldst->label_ptr[0] = s->code_ptr; 2220 s->code_ptr += 4; 2221 2222 /* TLB Hit. 
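 * As a rough sketch, the fast path emitted above is
 *     mov  addr, L0
 *     shr  $(page_bits - CPU_TLB_ENTRY_BITS), L0
 *     and  mask(AREG0), L0
 *     add  table(AREG0), L0
 *     lea  s_mask-a_mask(addr), L1        (plain mov if a_mask >= s_mask)
 *     and  $(page_mask | a_mask), L1
 *     cmp  addr_read/addr_write(L0), L1
 *     jne  slow_path
 * and on a hit the addend loaded below, used as the SIB index with the
 * guest address as base, yields the host address.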
*/ 2223 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0, 2224 offsetof(CPUTLBEntry, addend)); 2225 } else if (a_mask) { 2226 int jcc; 2227 2228 ldst = new_ldst_label(s); 2229 ldst->is_ld = is_ld; 2230 ldst->oi = oi; 2231 ldst->addr_reg = addr; 2232 2233 /* jne slow_path */ 2234 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addr, a_mask, true, false); 2235 tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0); 2236 ldst->label_ptr[0] = s->code_ptr; 2237 s->code_ptr += 4; 2238 } 2239 2240 return ldst; 2241} 2242 2243static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2244 HostAddress h, TCGType type, MemOp memop) 2245{ 2246 bool use_movbe = false; 2247 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 2248 int movop = OPC_MOVL_GvEv; 2249 2250 /* Do big-endian loads with movbe. */ 2251 if (memop & MO_BSWAP) { 2252 tcg_debug_assert(have_movbe); 2253 use_movbe = true; 2254 movop = OPC_MOVBE_GyMy; 2255 } 2256 2257 switch (memop & MO_SSIZE) { 2258 case MO_UB: 2259 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo, 2260 h.base, h.index, 0, h.ofs); 2261 break; 2262 case MO_SB: 2263 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo, 2264 h.base, h.index, 0, h.ofs); 2265 break; 2266 case MO_UW: 2267 if (use_movbe) { 2268 /* There is no extending movbe; only low 16-bits are modified. */ 2269 if (datalo != h.base && datalo != h.index) { 2270 /* XOR breaks dependency chains. */ 2271 tgen_arithr(s, ARITH_XOR, datalo, datalo); 2272 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2273 datalo, h.base, h.index, 0, h.ofs); 2274 } else { 2275 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2276 datalo, h.base, h.index, 0, h.ofs); 2277 tcg_out_ext16u(s, datalo, datalo); 2278 } 2279 } else { 2280 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo, 2281 h.base, h.index, 0, h.ofs); 2282 } 2283 break; 2284 case MO_SW: 2285 if (use_movbe) { 2286 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2287 datalo, h.base, h.index, 0, h.ofs); 2288 tcg_out_ext16s(s, type, datalo, datalo); 2289 } else { 2290 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg, 2291 datalo, h.base, h.index, 0, h.ofs); 2292 } 2293 break; 2294 case MO_UL: 2295 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2296 h.base, h.index, 0, h.ofs); 2297 break; 2298#if TCG_TARGET_REG_BITS == 64 2299 case MO_SL: 2300 if (use_movbe) { 2301 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo, 2302 h.base, h.index, 0, h.ofs); 2303 tcg_out_ext32s(s, datalo, datalo); 2304 } else { 2305 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo, 2306 h.base, h.index, 0, h.ofs); 2307 } 2308 break; 2309#endif 2310 case MO_UQ: 2311 if (TCG_TARGET_REG_BITS == 64) { 2312 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2313 h.base, h.index, 0, h.ofs); 2314 break; 2315 } 2316 if (use_movbe) { 2317 TCGReg t = datalo; 2318 datalo = datahi; 2319 datahi = t; 2320 } 2321 if (h.base == datalo || h.index == datalo) { 2322 tcg_out_modrm_sib_offset(s, OPC_LEA, datahi, 2323 h.base, h.index, 0, h.ofs); 2324 tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0); 2325 tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4); 2326 } else { 2327 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2328 h.base, h.index, 0, h.ofs); 2329 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2330 h.base, h.index, 0, h.ofs + 4); 2331 } 2332 break; 2333 2334 case MO_128: 2335 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2336 2337 /* 2338 * Without 16-byte atomicity, use integer regs. 
2339 * That is where we want the data, and it allows bswaps. 2340 */ 2341 if (h.aa.atom < MO_128) { 2342 if (use_movbe) { 2343 TCGReg t = datalo; 2344 datalo = datahi; 2345 datahi = t; 2346 } 2347 if (h.base == datalo || h.index == datalo) { 2348 tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi, 2349 h.base, h.index, 0, h.ofs); 2350 tcg_out_modrm_offset(s, movop + P_REXW + h.seg, 2351 datalo, datahi, 0); 2352 tcg_out_modrm_offset(s, movop + P_REXW + h.seg, 2353 datahi, datahi, 8); 2354 } else { 2355 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2356 h.base, h.index, 0, h.ofs); 2357 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, 2358 h.base, h.index, 0, h.ofs + 8); 2359 } 2360 break; 2361 } 2362 2363 /* 2364 * With 16-byte atomicity, a vector load is required. 2365 * If we already have 16-byte alignment, then VMOVDQA always works. 2366 * Else if VMOVDQU has atomicity with dynamic alignment, use that. 2367 * Else we require a runtime test for alignment for VMOVDQA; 2368 * use VMOVDQU on the unaligned nonatomic path for simplicity. 2369 */ 2370 if (h.aa.align >= MO_128) { 2371 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2372 TCG_TMP_VEC, 0, 2373 h.base, h.index, 0, h.ofs); 2374 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2375 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2376 TCG_TMP_VEC, 0, 2377 h.base, h.index, 0, h.ofs); 2378 } else { 2379 TCGLabel *l1 = gen_new_label(); 2380 TCGLabel *l2 = gen_new_label(); 2381 int jcc; 2382 2383 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2384 tcg_out_jxx(s, jcc, l1, true); 2385 2386 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2387 TCG_TMP_VEC, 0, 2388 h.base, h.index, 0, h.ofs); 2389 tcg_out_jxx(s, JCC_JMP, l2, true); 2390 2391 tcg_out_label(s, l1); 2392 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2393 TCG_TMP_VEC, 0, 2394 h.base, h.index, 0, h.ofs); 2395 tcg_out_label(s, l2); 2396 } 2397 tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC); 2398 break; 2399 2400 default: 2401 g_assert_not_reached(); 2402 } 2403} 2404 2405static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi, 2406 TCGReg addr, MemOpIdx oi, TCGType data_type) 2407{ 2408 TCGLabelQemuLdst *ldst; 2409 HostAddress h; 2410 2411 ldst = prepare_host_addr(s, &h, addr, oi, true); 2412 tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi)); 2413 2414 if (ldst) { 2415 ldst->type = data_type; 2416 ldst->datalo_reg = datalo; 2417 ldst->datahi_reg = datahi; 2418 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2419 } 2420} 2421 2422static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2423 HostAddress h, MemOp memop) 2424{ 2425 bool use_movbe = false; 2426 int movop = OPC_MOVL_EvGv; 2427 2428 /* 2429 * Do big-endian stores with movbe or system-mode. 2430 * User-only without movbe will have its swapping done generically. 2431 */ 2432 if (memop & MO_BSWAP) { 2433 tcg_debug_assert(have_movbe); 2434 use_movbe = true; 2435 movop = OPC_MOVBE_MyGy; 2436 } 2437 2438 switch (memop & MO_SIZE) { 2439 case MO_8: 2440 /* This is handled with constraints on INDEX_op_qemu_st8_i32.
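 * On i386 only %eax/%ecx/%edx/%ebx have byte-register encodings, so the
 * 's' constraint keeps the store value in one of those; on x86_64 any
 * register works once a REX prefix is emitted.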
*/ 2441 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4); 2442 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg, 2443 datalo, h.base, h.index, 0, h.ofs); 2444 break; 2445 case MO_16: 2446 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo, 2447 h.base, h.index, 0, h.ofs); 2448 break; 2449 case MO_32: 2450 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2451 h.base, h.index, 0, h.ofs); 2452 break; 2453 case MO_64: 2454 if (TCG_TARGET_REG_BITS == 64) { 2455 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2456 h.base, h.index, 0, h.ofs); 2457 } else { 2458 if (use_movbe) { 2459 TCGReg t = datalo; 2460 datalo = datahi; 2461 datahi = t; 2462 } 2463 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2464 h.base, h.index, 0, h.ofs); 2465 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2466 h.base, h.index, 0, h.ofs + 4); 2467 } 2468 break; 2469 2470 case MO_128: 2471 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2472 2473 /* 2474 * Without 16-byte atomicity, use integer regs. 2475 * That is where we have the data, and it allows bswaps. 2476 */ 2477 if (h.aa.atom < MO_128) { 2478 if (use_movbe) { 2479 TCGReg t = datalo; 2480 datalo = datahi; 2481 datahi = t; 2482 } 2483 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2484 h.base, h.index, 0, h.ofs); 2485 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, 2486 h.base, h.index, 0, h.ofs + 8); 2487 break; 2488 } 2489 2490 /* 2491 * With 16-byte atomicity, a vector store is required. 2492 * If we already have 16-byte alignment, then VMOVDQA always works. 2493 * Else if VMOVDQU has atomicity with dynamic alignment, use that. 2494 * Else we require a runtime test for alignment for VMOVDQA; 2495 * use VMOVDQU on the unaligned nonatomic path for simplicity. 2496 */ 2497 tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi); 2498 if (h.aa.align >= MO_128) { 2499 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2500 TCG_TMP_VEC, 0, 2501 h.base, h.index, 0, h.ofs); 2502 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2503 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2504 TCG_TMP_VEC, 0, 2505 h.base, h.index, 0, h.ofs); 2506 } else { 2507 TCGLabel *l1 = gen_new_label(); 2508 TCGLabel *l2 = gen_new_label(); 2509 int jcc; 2510 2511 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2512 tcg_out_jxx(s, jcc, l1, true); 2513 2514 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2515 TCG_TMP_VEC, 0, 2516 h.base, h.index, 0, h.ofs); 2517 tcg_out_jxx(s, JCC_JMP, l2, true); 2518 2519 tcg_out_label(s, l1); 2520 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2521 TCG_TMP_VEC, 0, 2522 h.base, h.index, 0, h.ofs); 2523 tcg_out_label(s, l2); 2524 } 2525 break; 2526 2527 default: 2528 g_assert_not_reached(); 2529 } 2530} 2531 2532static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi, 2533 TCGReg addr, MemOpIdx oi, TCGType data_type) 2534{ 2535 TCGLabelQemuLdst *ldst; 2536 HostAddress h; 2537 2538 ldst = prepare_host_addr(s, &h, addr, oi, false); 2539 tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi)); 2540 2541 if (ldst) { 2542 ldst->type = data_type; 2543 ldst->datalo_reg = datalo; 2544 ldst->datahi_reg = datahi; 2545 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2546 } 2547} 2548 2549static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0) 2550{ 2551 /* Reuse the zeroing that exists for goto_ptr.
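 * That is, a zero return value can jump straight to tcg_code_gen_epilogue,
 * whose goto_ptr entry point already clears EAX; only a non-zero value
 * needs the explicit movi before branching to the common return path.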
*/ 2552 if (a0 == 0) { 2553 tcg_out_jmp(s, tcg_code_gen_epilogue); 2554 } else { 2555 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0); 2556 tcg_out_jmp(s, tb_ret_addr); 2557 } 2558} 2559 2560static void tcg_out_goto_tb(TCGContext *s, int which) 2561{ 2562 /* 2563 * Jump displacement must be aligned for atomic patching; 2564 * see if we need to add extra nops before jump 2565 */ 2566 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr; 2567 if (gap != 1) { 2568 tcg_out_nopn(s, gap - 1); 2569 } 2570 tcg_out8(s, OPC_JMP_long); /* jmp im */ 2571 set_jmp_insn_offset(s, which); 2572 tcg_out32(s, 0); 2573 set_jmp_reset_offset(s, which); 2574} 2575 2576void tb_target_set_jmp_target(const TranslationBlock *tb, int n, 2577 uintptr_t jmp_rx, uintptr_t jmp_rw) 2578{ 2579 /* patch the branch destination */ 2580 uintptr_t addr = tb->jmp_target_addr[n]; 2581 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4)); 2582 /* no need to flush icache explicitly */ 2583} 2584 2585 2586static void tgen_add(TCGContext *s, TCGType type, 2587 TCGReg a0, TCGReg a1, TCGReg a2) 2588{ 2589 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2590 2591 if (a0 == a1) { 2592 tgen_arithr(s, ARITH_ADD + rexw, a0, a2); 2593 } else if (a0 == a2) { 2594 tgen_arithr(s, ARITH_ADD + rexw, a0, a1); 2595 } else { 2596 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, 0); 2597 } 2598} 2599 2600static void tgen_addi(TCGContext *s, TCGType type, 2601 TCGReg a0, TCGReg a1, tcg_target_long a2) 2602{ 2603 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2604 2605 if (a0 == a1) { 2606 tgen_arithi(s, ARITH_ADD + rexw, a0, a2, false); 2607 } else { 2608 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, -1, 0, a2); 2609 } 2610} 2611 2612static const TCGOutOpBinary outop_add = { 2613 .base.static_constraint = C_O1_I2(r, r, re), 2614 .out_rrr = tgen_add, 2615 .out_rri = tgen_addi, 2616}; 2617 2618static void tgen_and(TCGContext *s, TCGType type, 2619 TCGReg a0, TCGReg a1, TCGReg a2) 2620{ 2621 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2622 tgen_arithr(s, ARITH_AND + rexw, a0, a2); 2623} 2624 2625static void tgen_andi(TCGContext *s, TCGType type, 2626 TCGReg a0, TCGReg a1, tcg_target_long a2) 2627{ 2628 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2629 tgen_arithi(s, ARITH_AND + rexw, a0, a2, false); 2630} 2631 2632static const TCGOutOpBinary outop_and = { 2633 .base.static_constraint = C_O1_I2(r, 0, reZ), 2634 .out_rrr = tgen_and, 2635 .out_rri = tgen_andi, 2636}; 2637 2638static void tgen_andc(TCGContext *s, TCGType type, 2639 TCGReg a0, TCGReg a1, TCGReg a2) 2640{ 2641 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2642 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1); 2643} 2644 2645static TCGConstraintSetIndex cset_andc(TCGType type, unsigned flags) 2646{ 2647 return have_bmi1 ? C_O1_I2(r, r, r) : C_NotImplemented; 2648} 2649 2650static const TCGOutOpBinary outop_andc = { 2651 .base.static_constraint = C_Dynamic, 2652 .base.dynamic_constraint = cset_andc, 2653 .out_rrr = tgen_andc, 2654}; 2655 2656static void tgen_clz(TCGContext *s, TCGType type, 2657 TCGReg a0, TCGReg a1, TCGReg a2) 2658{ 2659 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2660 int jcc; 2661 2662 if (have_lzcnt) { 2663 tcg_out_modrm(s, OPC_LZCNT + rexw, a0, a1); 2664 jcc = JCC_JB; 2665 } else { 2666 /* Recall that the output of BSR is the index not the count. */ 2667 tcg_out_modrm(s, OPC_BSR + rexw, a0, a1); 2668 tgen_arithi(s, ARITH_XOR + rexw, a0, rexw ? 63 : 31, 0); 2669 2670 /* Since we have destroyed the flags from BSR, we have to re-test. 
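 * (To expand: BSR yields the index of the highest set bit, so the XOR with
 * 31 or 63 above turns that index into a leading-zero count; that XOR also
 * clobbers ZF, hence the fresh compare of a1 against zero before the CMOV
 * that substitutes a2 for the undefined zero-input case.)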
*/ 2671 jcc = tcg_out_cmp(s, TCG_COND_EQ, a1, 0, 1, rexw); 2672 } 2673 tcg_out_cmov(s, jcc, rexw, a0, a2); 2674} 2675 2676static void tgen_clzi(TCGContext *s, TCGType type, 2677 TCGReg a0, TCGReg a1, tcg_target_long a2) 2678{ 2679 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2680 tcg_out_modrm(s, OPC_LZCNT + rexw, a0, a1); 2681} 2682 2683static TCGConstraintSetIndex cset_clz(TCGType type, unsigned flags) 2684{ 2685 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 2686} 2687 2688static const TCGOutOpBinary outop_clz = { 2689 .base.static_constraint = C_Dynamic, 2690 .base.dynamic_constraint = cset_clz, 2691 .out_rrr = tgen_clz, 2692 .out_rri = tgen_clzi, 2693}; 2694 2695static void tgen_ctpop(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 2696{ 2697 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2698 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1); 2699} 2700 2701static TCGConstraintSetIndex cset_ctpop(TCGType type, unsigned flags) 2702{ 2703 return have_popcnt ? C_O1_I1(r, r) : C_NotImplemented; 2704} 2705 2706static const TCGOutOpUnary outop_ctpop = { 2707 .base.static_constraint = C_Dynamic, 2708 .base.dynamic_constraint = cset_ctpop, 2709 .out_rr = tgen_ctpop, 2710}; 2711 2712static void tgen_ctz(TCGContext *s, TCGType type, 2713 TCGReg a0, TCGReg a1, TCGReg a2) 2714{ 2715 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2716 int jcc; 2717 2718 if (have_bmi1) { 2719 tcg_out_modrm(s, OPC_TZCNT + rexw, a0, a1); 2720 jcc = JCC_JB; 2721 } else { 2722 tcg_out_modrm(s, OPC_BSF + rexw, a0, a1); 2723 jcc = JCC_JE; 2724 } 2725 tcg_out_cmov(s, jcc, rexw, a0, a2); 2726} 2727 2728static void tgen_ctzi(TCGContext *s, TCGType type, 2729 TCGReg a0, TCGReg a1, tcg_target_long a2) 2730{ 2731 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2732 tcg_out_modrm(s, OPC_TZCNT + rexw, a0, a1); 2733} 2734 2735static TCGConstraintSetIndex cset_ctz(TCGType type, unsigned flags) 2736{ 2737 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 2738} 2739 2740static const TCGOutOpBinary outop_ctz = { 2741 .base.static_constraint = C_Dynamic, 2742 .base.dynamic_constraint = cset_ctz, 2743 .out_rrr = tgen_ctz, 2744 .out_rri = tgen_ctzi, 2745}; 2746 2747static const TCGOutOpBinary outop_divs = { 2748 .base.static_constraint = C_NotImplemented, 2749}; 2750 2751static void tgen_divs2(TCGContext *s, TCGType type, 2752 TCGReg a0, TCGReg a1, TCGReg a4) 2753{ 2754 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2755 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, a4); 2756} 2757 2758static const TCGOutOpDivRem outop_divs2 = { 2759 .base.static_constraint = C_O2_I3(a, d, 0, 1, r), 2760 .out_rr01r = tgen_divs2, 2761}; 2762 2763static const TCGOutOpBinary outop_divu = { 2764 .base.static_constraint = C_NotImplemented, 2765}; 2766 2767static void tgen_divu2(TCGContext *s, TCGType type, 2768 TCGReg a0, TCGReg a1, TCGReg a4) 2769{ 2770 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2771 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, a4); 2772} 2773 2774static const TCGOutOpDivRem outop_divu2 = { 2775 .base.static_constraint = C_O2_I3(a, d, 0, 1, r), 2776 .out_rr01r = tgen_divu2, 2777}; 2778 2779static const TCGOutOpBinary outop_eqv = { 2780 .base.static_constraint = C_NotImplemented, 2781}; 2782 2783static void tgen_mul(TCGContext *s, TCGType type, 2784 TCGReg a0, TCGReg a1, TCGReg a2) 2785{ 2786 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 2787 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2); 2788} 2789 2790static void tgen_muli(TCGContext *s, TCGType type, 2791 TCGReg a0, TCGReg a1, tcg_target_long a2) 2792{ 2793 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2794 2795 if (a2 == (int8_t)a2) { 2796 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0); 2797 tcg_out8(s, a2); 2798 } else { 2799 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0); 2800 tcg_out32(s, a2); 2801 } 2802} 2803 2804static const TCGOutOpBinary outop_mul = { 2805 .base.static_constraint = C_O1_I2(r, 0, re), 2806 .out_rrr = tgen_mul, 2807 .out_rri = tgen_muli, 2808}; 2809 2810static void tgen_muls2(TCGContext *s, TCGType type, 2811 TCGReg a0, TCGReg a1, TCGReg a2, TCGReg a3) 2812{ 2813 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2814 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, a3); 2815} 2816 2817static const TCGOutOpMul2 outop_muls2 = { 2818 .base.static_constraint = C_O2_I2(a, d, a, r), 2819 .out_rrrr = tgen_muls2, 2820}; 2821 2822static const TCGOutOpBinary outop_mulsh = { 2823 .base.static_constraint = C_NotImplemented, 2824}; 2825 2826static const TCGOutOpBinary outop_muluh = { 2827 .base.static_constraint = C_NotImplemented, 2828}; 2829 2830static void tgen_mulu2(TCGContext *s, TCGType type, 2831 TCGReg a0, TCGReg a1, TCGReg a2, TCGReg a3) 2832{ 2833 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2834 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, a3); 2835} 2836 2837static const TCGOutOpMul2 outop_mulu2 = { 2838 .base.static_constraint = C_O2_I2(a, d, a, r), 2839 .out_rrrr = tgen_mulu2, 2840}; 2841 2842static const TCGOutOpBinary outop_nand = { 2843 .base.static_constraint = C_NotImplemented, 2844}; 2845 2846static const TCGOutOpBinary outop_nor = { 2847 .base.static_constraint = C_NotImplemented, 2848}; 2849 2850static void tgen_or(TCGContext *s, TCGType type, 2851 TCGReg a0, TCGReg a1, TCGReg a2) 2852{ 2853 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2854 tgen_arithr(s, ARITH_OR + rexw, a0, a2); 2855} 2856 2857static void tgen_ori(TCGContext *s, TCGType type, 2858 TCGReg a0, TCGReg a1, tcg_target_long a2) 2859{ 2860 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2861 tgen_arithi(s, ARITH_OR + rexw, a0, a2, false); 2862} 2863 2864static const TCGOutOpBinary outop_or = { 2865 .base.static_constraint = C_O1_I2(r, 0, re), 2866 .out_rrr = tgen_or, 2867 .out_rri = tgen_ori, 2868}; 2869 2870static const TCGOutOpBinary outop_orc = { 2871 .base.static_constraint = C_NotImplemented, 2872}; 2873 2874static const TCGOutOpBinary outop_rems = { 2875 .base.static_constraint = C_NotImplemented, 2876}; 2877 2878static const TCGOutOpBinary outop_remu = { 2879 .base.static_constraint = C_NotImplemented, 2880}; 2881 2882static void tgen_rotl(TCGContext *s, TCGType type, 2883 TCGReg a0, TCGReg a1, TCGReg a2) 2884{ 2885 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2886 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_ROL, a0); 2887} 2888 2889static void tgen_rotli(TCGContext *s, TCGType type, 2890 TCGReg a0, TCGReg a1, tcg_target_long a2) 2891{ 2892 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2893 tcg_out_shifti(s, SHIFT_ROL + rexw, a0, a2); 2894} 2895 2896static const TCGOutOpBinary outop_rotl = { 2897 .base.static_constraint = C_O1_I2(r, 0, ci), 2898 .out_rrr = tgen_rotl, 2899 .out_rri = tgen_rotli, 2900}; 2901 2902static void tgen_rotr(TCGContext *s, TCGType type, 2903 TCGReg a0, TCGReg a1, TCGReg a2) 2904{ 2905 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 2906 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_ROR, a0); 2907} 2908 2909static void tgen_rotri(TCGContext *s, TCGType type, 2910 TCGReg a0, TCGReg a1, tcg_target_long a2) 2911{ 2912 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2913 tcg_out_shifti(s, SHIFT_ROR + rexw, a0, a2); 2914} 2915 2916static const TCGOutOpBinary outop_rotr = { 2917 .base.static_constraint = C_O1_I2(r, 0, ci), 2918 .out_rrr = tgen_rotr, 2919 .out_rri = tgen_rotri, 2920}; 2921 2922static TCGConstraintSetIndex cset_shift(TCGType type, unsigned flags) 2923{ 2924 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci); 2925} 2926 2927static void tgen_sar(TCGContext *s, TCGType type, 2928 TCGReg a0, TCGReg a1, TCGReg a2) 2929{ 2930 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2931 if (have_bmi2) { 2932 tcg_out_vex_modrm(s, OPC_SARX + rexw, a0, a2, a1); 2933 } else { 2934 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_SAR, a0); 2935 } 2936} 2937 2938static void tgen_sari(TCGContext *s, TCGType type, 2939 TCGReg a0, TCGReg a1, tcg_target_long a2) 2940{ 2941 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2942 2943 tcg_out_mov(s, type, a0, a1); 2944 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, a2); 2945} 2946 2947static const TCGOutOpBinary outop_sar = { 2948 .base.static_constraint = C_Dynamic, 2949 .base.dynamic_constraint = cset_shift, 2950 .out_rrr = tgen_sar, 2951 .out_rri = tgen_sari, 2952}; 2953 2954static void tgen_shl(TCGContext *s, TCGType type, 2955 TCGReg a0, TCGReg a1, TCGReg a2) 2956{ 2957 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2958 if (have_bmi2) { 2959 tcg_out_vex_modrm(s, OPC_SHLX + rexw, a0, a2, a1); 2960 } else { 2961 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_SHL, a0); 2962 } 2963} 2964 2965static void tgen_shli(TCGContext *s, TCGType type, 2966 TCGReg a0, TCGReg a1, tcg_target_long a2) 2967{ 2968 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2969 2970 /* For small constant 3-operand shift, use LEA. */ 2971 if (a0 != a1 && a2 >= 1 && a2 <= 3) { 2972 if (a2 == 1) { 2973 /* shl $1,a1,a0 -> lea (a1,a1),a0 */ 2974 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0); 2975 } else { 2976 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */ 2977 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0); 2978 } 2979 return; 2980 } 2981 tcg_out_mov(s, type, a0, a1); 2982 tcg_out_shifti(s, SHIFT_SHL + rexw, a0, a2); 2983} 2984 2985static const TCGOutOpBinary outop_shl = { 2986 .base.static_constraint = C_Dynamic, 2987 .base.dynamic_constraint = cset_shift, 2988 .out_rrr = tgen_shl, 2989 .out_rri = tgen_shli, 2990}; 2991 2992static void tgen_shr(TCGContext *s, TCGType type, 2993 TCGReg a0, TCGReg a1, TCGReg a2) 2994{ 2995 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2996 if (have_bmi2) { 2997 tcg_out_vex_modrm(s, OPC_SHRX + rexw, a0, a2, a1); 2998 } else { 2999 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_SHR, a0); 3000 } 3001} 3002 3003static void tgen_shri(TCGContext *s, TCGType type, 3004 TCGReg a0, TCGReg a1, tcg_target_long a2) 3005{ 3006 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3007 3008 tcg_out_mov(s, type, a0, a1); 3009 tcg_out_shifti(s, SHIFT_SHR + rexw, a0, a2); 3010} 3011 3012static const TCGOutOpBinary outop_shr = { 3013 .base.static_constraint = C_Dynamic, 3014 .base.dynamic_constraint = cset_shift, 3015 .out_rrr = tgen_shr, 3016 .out_rri = tgen_shri, 3017}; 3018 3019static void tgen_sub(TCGContext *s, TCGType type, 3020 TCGReg a0, TCGReg a1, TCGReg a2) 3021{ 3022 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 3023 tgen_arithr(s, ARITH_SUB + rexw, a0, a2); 3024} 3025 3026static const TCGOutOpSubtract outop_sub = { 3027 .base.static_constraint = C_O1_I2(r, 0, r), 3028 .out_rrr = tgen_sub, 3029}; 3030 3031static void tgen_xor(TCGContext *s, TCGType type, 3032 TCGReg a0, TCGReg a1, TCGReg a2) 3033{ 3034 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3035 tgen_arithr(s, ARITH_XOR + rexw, a0, a2); 3036} 3037 3038static void tgen_xori(TCGContext *s, TCGType type, 3039 TCGReg a0, TCGReg a1, tcg_target_long a2) 3040{ 3041 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3042 tgen_arithi(s, ARITH_XOR + rexw, a0, a2, false); 3043} 3044 3045static const TCGOutOpBinary outop_xor = { 3046 .base.static_constraint = C_O1_I2(r, 0, re), 3047 .out_rrr = tgen_xor, 3048 .out_rri = tgen_xori, 3049}; 3050 3051static void tgen_neg(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 3052{ 3053 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3054 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0); 3055} 3056 3057static const TCGOutOpUnary outop_neg = { 3058 .base.static_constraint = C_O1_I1(r, 0), 3059 .out_rr = tgen_neg, 3060}; 3061 3062static void tgen_not(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 3063{ 3064 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3065 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0); 3066} 3067 3068static const TCGOutOpUnary outop_not = { 3069 .base.static_constraint = C_O1_I1(r, 0), 3070 .out_rr = tgen_not, 3071}; 3072 3073 3074static void tcg_out_op(TCGContext *s, TCGOpcode opc, TCGType type, 3075 const TCGArg args[TCG_MAX_OP_ARGS], 3076 const int const_args[TCG_MAX_OP_ARGS]) 3077{ 3078 TCGArg a0, a1, a2; 3079 int const_a2, rexw; 3080 3081#if TCG_TARGET_REG_BITS == 64 3082# define OP_32_64(x) \ 3083 case glue(glue(INDEX_op_, x), _i64): \ 3084 case glue(glue(INDEX_op_, x), _i32) 3085#else 3086# define OP_32_64(x) \ 3087 case glue(glue(INDEX_op_, x), _i32) 3088#endif 3089 3090 /* Hoist the loads of the most common arguments. */ 3091 a0 = args[0]; 3092 a1 = args[1]; 3093 a2 = args[2]; 3094 const_a2 = const_args[2]; 3095 rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3096 3097 switch (opc) { 3098 case INDEX_op_goto_ptr: 3099 /* jmp to the given host address (could be epilogue) */ 3100 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0); 3101 break; 3102 case INDEX_op_br: 3103 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0); 3104 break; 3105 OP_32_64(ld8u): 3106 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 3107 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2); 3108 break; 3109 OP_32_64(ld8s): 3110 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2); 3111 break; 3112 OP_32_64(ld16u): 3113 /* Note that we can ignore REXW for the zero-extend to 64-bit. 
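 * Writing a 32-bit register on x86-64 implicitly zeroes bits 63:32, so
 * MOVZWL without REX.W already yields the zero-extended 64-bit value.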
*/ 3114 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2); 3115 break; 3116 OP_32_64(ld16s): 3117 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2); 3118 break; 3119#if TCG_TARGET_REG_BITS == 64 3120 case INDEX_op_ld32u_i64: 3121#endif 3122 case INDEX_op_ld_i32: 3123 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2); 3124 break; 3125 3126 OP_32_64(st8): 3127 if (const_args[0]) { 3128 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2); 3129 tcg_out8(s, a0); 3130 } else { 3131 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2); 3132 } 3133 break; 3134 OP_32_64(st16): 3135 if (const_args[0]) { 3136 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2); 3137 tcg_out16(s, a0); 3138 } else { 3139 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2); 3140 } 3141 break; 3142#if TCG_TARGET_REG_BITS == 64 3143 case INDEX_op_st32_i64: 3144#endif 3145 case INDEX_op_st_i32: 3146 if (const_args[0]) { 3147 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2); 3148 tcg_out32(s, a0); 3149 } else { 3150 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2); 3151 } 3152 break; 3153 3154 OP_32_64(bswap16): 3155 if (a2 & TCG_BSWAP_OS) { 3156 /* Output must be sign-extended. */ 3157 if (rexw) { 3158 tcg_out_bswap64(s, a0); 3159 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48); 3160 } else { 3161 tcg_out_bswap32(s, a0); 3162 tcg_out_shifti(s, SHIFT_SAR, a0, 16); 3163 } 3164 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) { 3165 /* Output must be zero-extended, but input isn't. */ 3166 tcg_out_bswap32(s, a0); 3167 tcg_out_shifti(s, SHIFT_SHR, a0, 16); 3168 } else { 3169 tcg_out_rolw_8(s, a0); 3170 } 3171 break; 3172 OP_32_64(bswap32): 3173 tcg_out_bswap32(s, a0); 3174 if (rexw && (a2 & TCG_BSWAP_OS)) { 3175 tcg_out_ext32s(s, a0, a0); 3176 } 3177 break; 3178 3179 case INDEX_op_qemu_ld_i32: 3180 tcg_out_qemu_ld(s, a0, -1, a1, a2, TCG_TYPE_I32); 3181 break; 3182 case INDEX_op_qemu_ld_i64: 3183 if (TCG_TARGET_REG_BITS == 64) { 3184 tcg_out_qemu_ld(s, a0, -1, a1, a2, TCG_TYPE_I64); 3185 } else { 3186 tcg_out_qemu_ld(s, a0, a1, a2, args[3], TCG_TYPE_I64); 3187 } 3188 break; 3189 case INDEX_op_qemu_ld_i128: 3190 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3191 tcg_out_qemu_ld(s, a0, a1, a2, args[3], TCG_TYPE_I128); 3192 break; 3193 3194 case INDEX_op_qemu_st_i32: 3195 case INDEX_op_qemu_st8_i32: 3196 tcg_out_qemu_st(s, a0, -1, a1, a2, TCG_TYPE_I32); 3197 break; 3198 case INDEX_op_qemu_st_i64: 3199 if (TCG_TARGET_REG_BITS == 64) { 3200 tcg_out_qemu_st(s, a0, -1, a1, a2, TCG_TYPE_I64); 3201 } else { 3202 tcg_out_qemu_st(s, a0, a1, a2, args[3], TCG_TYPE_I64); 3203 } 3204 break; 3205 case INDEX_op_qemu_st_i128: 3206 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3207 tcg_out_qemu_st(s, a0, a1, a2, args[3], TCG_TYPE_I128); 3208 break; 3209 3210 OP_32_64(add2): 3211 if (const_args[4]) { 3212 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1); 3213 } else { 3214 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]); 3215 } 3216 if (const_args[5]) { 3217 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1); 3218 } else { 3219 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]); 3220 } 3221 break; 3222 OP_32_64(sub2): 3223 if (const_args[4]) { 3224 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1); 3225 } else { 3226 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]); 3227 } 3228 if (const_args[5]) { 3229 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1); 3230 } else { 3231 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]); 3232 } 3233 break; 3234 3235#if TCG_TARGET_REG_BITS == 32 3236 case INDEX_op_brcond2_i32: 3237 tcg_out_brcond2(s, args, 
const_args, 0); 3238 break; 3239 case INDEX_op_setcond2_i32: 3240 tcg_out_setcond2(s, args, const_args); 3241 break; 3242#else /* TCG_TARGET_REG_BITS == 64 */ 3243 case INDEX_op_ld32s_i64: 3244 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2); 3245 break; 3246 case INDEX_op_ld_i64: 3247 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2); 3248 break; 3249 case INDEX_op_st_i64: 3250 if (const_args[0]) { 3251 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2); 3252 tcg_out32(s, a0); 3253 } else { 3254 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2); 3255 } 3256 break; 3257 3258 case INDEX_op_bswap64_i64: 3259 tcg_out_bswap64(s, a0); 3260 break; 3261 case INDEX_op_extrh_i64_i32: 3262 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32); 3263 break; 3264#endif 3265 3266 OP_32_64(deposit): 3267 if (args[3] == 0 && args[4] == 8) { 3268 /* load bits 0..7 */ 3269 if (const_a2) { 3270 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0), 3271 0, a0, 0); 3272 tcg_out8(s, a2); 3273 } else { 3274 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0); 3275 } 3276 } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) { 3277 /* load bits 8..15 */ 3278 if (const_a2) { 3279 tcg_out8(s, OPC_MOVB_Ib + a0 + 4); 3280 tcg_out8(s, a2); 3281 } else { 3282 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4); 3283 } 3284 } else if (args[3] == 0 && args[4] == 16) { 3285 /* load bits 0..15 */ 3286 if (const_a2) { 3287 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0), 3288 0, a0, 0); 3289 tcg_out16(s, a2); 3290 } else { 3291 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0); 3292 } 3293 } else { 3294 g_assert_not_reached(); 3295 } 3296 break; 3297 3298 case INDEX_op_extract_i64: 3299 if (a2 + args[3] == 32) { 3300 if (a2 == 0) { 3301 tcg_out_ext32u(s, a0, a1); 3302 break; 3303 } 3304 /* This is a 32-bit zero-extending right shift. */ 3305 tcg_out_mov(s, TCG_TYPE_I32, a0, a1); 3306 tcg_out_shifti(s, SHIFT_SHR, a0, a2); 3307 break; 3308 } 3309 /* FALLTHRU */ 3310 case INDEX_op_extract_i32: 3311 if (a2 == 0 && args[3] == 8) { 3312 tcg_out_ext8u(s, a0, a1); 3313 } else if (a2 == 0 && args[3] == 16) { 3314 tcg_out_ext16u(s, a0, a1); 3315 } else if (a2 == 8 && args[3] == 8) { 3316 /* 3317 * On the off-chance that we can use the high-byte registers. 3318 * Otherwise we emit the same ext16 + shift pattern that we 3319 * would have gotten from the normal tcg-op.c expansion. 3320 */ 3321 if (a1 < 4 && a0 < 8) { 3322 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4); 3323 } else { 3324 tcg_out_ext16u(s, a0, a1); 3325 tcg_out_shifti(s, SHIFT_SHR, a0, 8); 3326 } 3327 } else { 3328 g_assert_not_reached(); 3329 } 3330 break; 3331 3332 case INDEX_op_sextract_i64: 3333 if (a2 == 0 && args[3] == 8) { 3334 tcg_out_ext8s(s, TCG_TYPE_I64, a0, a1); 3335 } else if (a2 == 0 && args[3] == 16) { 3336 tcg_out_ext16s(s, TCG_TYPE_I64, a0, a1); 3337 } else if (a2 == 0 && args[3] == 32) { 3338 tcg_out_ext32s(s, a0, a1); 3339 } else { 3340 g_assert_not_reached(); 3341 } 3342 break; 3343 3344 case INDEX_op_sextract_i32: 3345 if (a2 == 0 && args[3] == 8) { 3346 tcg_out_ext8s(s, TCG_TYPE_I32, a0, a1); 3347 } else if (a2 == 0 && args[3] == 16) { 3348 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 3349 } else if (a2 == 8 && args[3] == 8) { 3350 if (a1 < 4 && a0 < 8) { 3351 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4); 3352 } else { 3353 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 3354 tcg_out_shifti(s, SHIFT_SAR, a0, 8); 3355 } 3356 } else { 3357 g_assert_not_reached(); 3358 } 3359 break; 3360 3361 OP_32_64(extract2): 3362 /* Note that SHRD outputs to the r/m operand. 
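 * Here that r/m operand is a0, which the C_O1_I2(r, 0, r) constraint ties
 * to the low-part input; a2 supplies the high part whose bits are shifted
 * in from the top as the pair is shifted right by args[3].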
*/ 3363 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0); 3364 tcg_out8(s, args[3]); 3365 break; 3366 3367 case INDEX_op_mb: 3368 tcg_out_mb(s, a0); 3369 break; 3370 case INDEX_op_call: /* Always emitted via tcg_out_call. */ 3371 case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */ 3372 case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */ 3373 case INDEX_op_ext_i32_i64: /* Always emitted via tcg_reg_alloc_op. */ 3374 case INDEX_op_extu_i32_i64: 3375 case INDEX_op_extrl_i64_i32: 3376 default: 3377 g_assert_not_reached(); 3378 } 3379 3380#undef OP_32_64 3381} 3382 3383static int const umin_insn[4] = { 3384 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ 3385}; 3386 3387static int const umax_insn[4] = { 3388 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ 3389}; 3390 3391static bool tcg_out_cmp_vec_noinv(TCGContext *s, TCGType type, unsigned vece, 3392 TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond) 3393{ 3394 static int const cmpeq_insn[4] = { 3395 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ 3396 }; 3397 static int const cmpgt_insn[4] = { 3398 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ 3399 }; 3400 3401 enum { 3402 NEED_INV = 1, 3403 NEED_SWAP = 2, 3404 NEED_UMIN = 4, 3405 NEED_UMAX = 8, 3406 INVALID = 16, 3407 }; 3408 static const uint8_t cond_fixup[16] = { 3409 [0 ... 15] = INVALID, 3410 [TCG_COND_EQ] = 0, 3411 [TCG_COND_GT] = 0, 3412 [TCG_COND_NE] = NEED_INV, 3413 [TCG_COND_LE] = NEED_INV, 3414 [TCG_COND_LT] = NEED_SWAP, 3415 [TCG_COND_GE] = NEED_SWAP | NEED_INV, 3416 [TCG_COND_LEU] = NEED_UMIN, 3417 [TCG_COND_GTU] = NEED_UMIN | NEED_INV, 3418 [TCG_COND_GEU] = NEED_UMAX, 3419 [TCG_COND_LTU] = NEED_UMAX | NEED_INV, 3420 }; 3421 int fixup = cond_fixup[cond]; 3422 3423 assert(!(fixup & INVALID)); 3424 3425 if (fixup & NEED_INV) { 3426 cond = tcg_invert_cond(cond); 3427 } 3428 3429 if (fixup & NEED_SWAP) { 3430 TCGReg swap = v1; 3431 v1 = v2; 3432 v2 = swap; 3433 cond = tcg_swap_cond(cond); 3434 } 3435 3436 if (fixup & (NEED_UMIN | NEED_UMAX)) { 3437 int op = (fixup & NEED_UMIN ? umin_insn[vece] : umax_insn[vece]); 3438 3439 /* avx2 does not have 64-bit min/max; adjusted during expand. 
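 * For reference, the unsigned cases above are realized as
 *     LEU: umin(v1,v2) == v1        GEU: umax(v1,v2) == v1
 * with GTU/LTU obtained by inverting that result, since SSE/AVX provide
 * only signed PCMPGT compares.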
*/ 3440 assert(vece <= MO_32); 3441 3442 tcg_out_vex_modrm_type(s, op, TCG_TMP_VEC, v1, v2, type); 3443 v2 = TCG_TMP_VEC; 3444 cond = TCG_COND_EQ; 3445 } 3446 3447 switch (cond) { 3448 case TCG_COND_EQ: 3449 tcg_out_vex_modrm_type(s, cmpeq_insn[vece], v0, v1, v2, type); 3450 break; 3451 case TCG_COND_GT: 3452 tcg_out_vex_modrm_type(s, cmpgt_insn[vece], v0, v1, v2, type); 3453 break; 3454 default: 3455 g_assert_not_reached(); 3456 } 3457 return fixup & NEED_INV; 3458} 3459 3460static void tcg_out_cmp_vec_k1(TCGContext *s, TCGType type, unsigned vece, 3461 TCGReg v1, TCGReg v2, TCGCond cond) 3462{ 3463 static const int cmpm_insn[2][4] = { 3464 { OPC_VPCMPB, OPC_VPCMPW, OPC_VPCMPD, OPC_VPCMPQ }, 3465 { OPC_VPCMPUB, OPC_VPCMPUW, OPC_VPCMPUD, OPC_VPCMPUQ } 3466 }; 3467 static const int testm_insn[4] = { 3468 OPC_VPTESTMB, OPC_VPTESTMW, OPC_VPTESTMD, OPC_VPTESTMQ 3469 }; 3470 static const int testnm_insn[4] = { 3471 OPC_VPTESTNMB, OPC_VPTESTNMW, OPC_VPTESTNMD, OPC_VPTESTNMQ 3472 }; 3473 3474 static const int cond_ext[16] = { 3475 [TCG_COND_EQ] = 0, 3476 [TCG_COND_NE] = 4, 3477 [TCG_COND_LT] = 1, 3478 [TCG_COND_LTU] = 1, 3479 [TCG_COND_LE] = 2, 3480 [TCG_COND_LEU] = 2, 3481 [TCG_COND_NEVER] = 3, 3482 [TCG_COND_GE] = 5, 3483 [TCG_COND_GEU] = 5, 3484 [TCG_COND_GT] = 6, 3485 [TCG_COND_GTU] = 6, 3486 [TCG_COND_ALWAYS] = 7, 3487 }; 3488 3489 switch (cond) { 3490 case TCG_COND_TSTNE: 3491 tcg_out_vex_modrm_type(s, testm_insn[vece], /* k1 */ 1, v1, v2, type); 3492 break; 3493 case TCG_COND_TSTEQ: 3494 tcg_out_vex_modrm_type(s, testnm_insn[vece], /* k1 */ 1, v1, v2, type); 3495 break; 3496 default: 3497 tcg_out_vex_modrm_type(s, cmpm_insn[is_unsigned_cond(cond)][vece], 3498 /* k1 */ 1, v1, v2, type); 3499 tcg_out8(s, cond_ext[cond]); 3500 break; 3501 } 3502} 3503 3504static void tcg_out_k1_to_vec(TCGContext *s, TCGType type, 3505 unsigned vece, TCGReg dest) 3506{ 3507 static const int movm_insn[] = { 3508 OPC_VPMOVM2B, OPC_VPMOVM2W, OPC_VPMOVM2D, OPC_VPMOVM2Q 3509 }; 3510 tcg_out_vex_modrm_type(s, movm_insn[vece], dest, 0, /* k1 */ 1, type); 3511} 3512 3513static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece, 3514 TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond) 3515{ 3516 /* 3517 * With avx512, we have a complete set of comparisons into mask. 3518 * Unless there's a single insn expansion for the comparision, 3519 * expand via a mask in k1. 3520 */ 3521 if ((vece <= MO_16 ? have_avx512bw : have_avx512dq) 3522 && cond != TCG_COND_EQ 3523 && cond != TCG_COND_LT 3524 && cond != TCG_COND_GT) { 3525 tcg_out_cmp_vec_k1(s, type, vece, v1, v2, cond); 3526 tcg_out_k1_to_vec(s, type, vece, v0); 3527 return; 3528 } 3529 3530 if (tcg_out_cmp_vec_noinv(s, type, vece, v0, v1, v2, cond)) { 3531 tcg_out_dupi_vec(s, type, vece, TCG_TMP_VEC, -1); 3532 tcg_out_vex_modrm_type(s, OPC_PXOR, v0, v0, TCG_TMP_VEC, type); 3533 } 3534} 3535 3536static void tcg_out_cmpsel_vec_k1(TCGContext *s, TCGType type, unsigned vece, 3537 TCGReg v0, TCGReg c1, TCGReg c2, 3538 TCGReg v3, TCGReg v4, TCGCond cond) 3539{ 3540 static const int vpblendm_insn[] = { 3541 OPC_VPBLENDMB, OPC_VPBLENDMW, OPC_VPBLENDMD, OPC_VPBLENDMQ 3542 }; 3543 bool z = false; 3544 3545 /* Swap to place constant in V4 to take advantage of zero-masking. 
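 * (Explanatory note: with EVEX zero-masking, elements whose predicate bit
 * is clear are written as zero, so a constant-zero "true" operand costs
 * nothing -- invert the condition so that the zero case lands on the
 * masked-off side and let VPBLENDM supply the remaining operand.)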
*/ 3546 if (!v3) { 3547 z = true; 3548 v3 = v4; 3549 cond = tcg_invert_cond(cond); 3550 } 3551 3552 tcg_out_cmp_vec_k1(s, type, vece, c1, c2, cond); 3553 tcg_out_evex_modrm_type(s, vpblendm_insn[vece], v0, v4, v3, 3554 /* k1 */1, z, type); 3555} 3556 3557static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece, 3558 TCGReg v0, TCGReg c1, TCGReg c2, 3559 TCGReg v3, TCGReg v4, TCGCond cond) 3560{ 3561 bool inv; 3562 3563 if (vece <= MO_16 ? have_avx512bw : have_avx512vl) { 3564 tcg_out_cmpsel_vec_k1(s, type, vece, v0, c1, c2, v3, v4, cond); 3565 return; 3566 } 3567 3568 inv = tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond); 3569 3570 /* 3571 * Since XMM0 is 16, the only way we get 0 into V3 3572 * is via the constant zero constraint. 3573 */ 3574 if (!v3) { 3575 if (inv) { 3576 tcg_out_vex_modrm_type(s, OPC_PAND, v0, TCG_TMP_VEC, v4, type); 3577 } else { 3578 tcg_out_vex_modrm_type(s, OPC_PANDN, v0, TCG_TMP_VEC, v4, type); 3579 } 3580 } else { 3581 if (inv) { 3582 TCGReg swap = v3; 3583 v3 = v4; 3584 v4 = swap; 3585 } 3586 tcg_out_vex_modrm_type(s, OPC_VPBLENDVB, v0, v4, v3, type); 3587 tcg_out8(s, (TCG_TMP_VEC - TCG_REG_XMM0) << 4); 3588 } 3589} 3590 3591static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, 3592 unsigned vecl, unsigned vece, 3593 const TCGArg args[TCG_MAX_OP_ARGS], 3594 const int const_args[TCG_MAX_OP_ARGS]) 3595{ 3596 static int const add_insn[4] = { 3597 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ 3598 }; 3599 static int const ssadd_insn[4] = { 3600 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 3601 }; 3602 static int const usadd_insn[4] = { 3603 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2 3604 }; 3605 static int const sub_insn[4] = { 3606 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ 3607 }; 3608 static int const sssub_insn[4] = { 3609 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 3610 }; 3611 static int const ussub_insn[4] = { 3612 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2 3613 }; 3614 static int const mul_insn[4] = { 3615 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ 3616 }; 3617 static int const shift_imm_insn[4] = { 3618 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib 3619 }; 3620 static int const punpckl_insn[4] = { 3621 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ 3622 }; 3623 static int const punpckh_insn[4] = { 3624 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ 3625 }; 3626 static int const packss_insn[4] = { 3627 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 3628 }; 3629 static int const packus_insn[4] = { 3630 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 3631 }; 3632 static int const smin_insn[4] = { 3633 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ 3634 }; 3635 static int const smax_insn[4] = { 3636 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ 3637 }; 3638 static int const rotlv_insn[4] = { 3639 OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ 3640 }; 3641 static int const rotrv_insn[4] = { 3642 OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ 3643 }; 3644 static int const shlv_insn[4] = { 3645 OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ 3646 }; 3647 static int const shrv_insn[4] = { 3648 OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ 3649 }; 3650 static int const sarv_insn[4] = { 3651 OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ 3652 }; 3653 static int const shls_insn[4] = { 3654 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ 3655 }; 3656 static int const shrs_insn[4] = { 3657 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ 3658 }; 3659 static int const sars_insn[4] = { 3660 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ 3661 
}; 3662 static int const vpshldi_insn[4] = { 3663 OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ 3664 }; 3665 static int const vpshldv_insn[4] = { 3666 OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ 3667 }; 3668 static int const vpshrdv_insn[4] = { 3669 OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ 3670 }; 3671 static int const abs_insn[4] = { 3672 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ 3673 }; 3674 3675 TCGType type = vecl + TCG_TYPE_V64; 3676 int insn, sub; 3677 TCGArg a0, a1, a2, a3; 3678 3679 a0 = args[0]; 3680 a1 = args[1]; 3681 a2 = args[2]; 3682 3683 switch (opc) { 3684 case INDEX_op_add_vec: 3685 insn = add_insn[vece]; 3686 goto gen_simd; 3687 case INDEX_op_ssadd_vec: 3688 insn = ssadd_insn[vece]; 3689 goto gen_simd; 3690 case INDEX_op_usadd_vec: 3691 insn = usadd_insn[vece]; 3692 goto gen_simd; 3693 case INDEX_op_sub_vec: 3694 insn = sub_insn[vece]; 3695 goto gen_simd; 3696 case INDEX_op_sssub_vec: 3697 insn = sssub_insn[vece]; 3698 goto gen_simd; 3699 case INDEX_op_ussub_vec: 3700 insn = ussub_insn[vece]; 3701 goto gen_simd; 3702 case INDEX_op_mul_vec: 3703 insn = mul_insn[vece]; 3704 goto gen_simd; 3705 case INDEX_op_and_vec: 3706 insn = OPC_PAND; 3707 goto gen_simd; 3708 case INDEX_op_or_vec: 3709 insn = OPC_POR; 3710 goto gen_simd; 3711 case INDEX_op_xor_vec: 3712 insn = OPC_PXOR; 3713 goto gen_simd; 3714 case INDEX_op_smin_vec: 3715 insn = smin_insn[vece]; 3716 goto gen_simd; 3717 case INDEX_op_umin_vec: 3718 insn = umin_insn[vece]; 3719 goto gen_simd; 3720 case INDEX_op_smax_vec: 3721 insn = smax_insn[vece]; 3722 goto gen_simd; 3723 case INDEX_op_umax_vec: 3724 insn = umax_insn[vece]; 3725 goto gen_simd; 3726 case INDEX_op_shlv_vec: 3727 insn = shlv_insn[vece]; 3728 goto gen_simd; 3729 case INDEX_op_shrv_vec: 3730 insn = shrv_insn[vece]; 3731 goto gen_simd; 3732 case INDEX_op_sarv_vec: 3733 insn = sarv_insn[vece]; 3734 goto gen_simd; 3735 case INDEX_op_rotlv_vec: 3736 insn = rotlv_insn[vece]; 3737 goto gen_simd; 3738 case INDEX_op_rotrv_vec: 3739 insn = rotrv_insn[vece]; 3740 goto gen_simd; 3741 case INDEX_op_shls_vec: 3742 insn = shls_insn[vece]; 3743 goto gen_simd; 3744 case INDEX_op_shrs_vec: 3745 insn = shrs_insn[vece]; 3746 goto gen_simd; 3747 case INDEX_op_sars_vec: 3748 insn = sars_insn[vece]; 3749 goto gen_simd; 3750 case INDEX_op_x86_punpckl_vec: 3751 insn = punpckl_insn[vece]; 3752 goto gen_simd; 3753 case INDEX_op_x86_punpckh_vec: 3754 insn = punpckh_insn[vece]; 3755 goto gen_simd; 3756 case INDEX_op_x86_packss_vec: 3757 insn = packss_insn[vece]; 3758 goto gen_simd; 3759 case INDEX_op_x86_packus_vec: 3760 insn = packus_insn[vece]; 3761 goto gen_simd; 3762 case INDEX_op_x86_vpshldv_vec: 3763 insn = vpshldv_insn[vece]; 3764 a1 = a2; 3765 a2 = args[3]; 3766 goto gen_simd; 3767 case INDEX_op_x86_vpshrdv_vec: 3768 insn = vpshrdv_insn[vece]; 3769 a1 = a2; 3770 a2 = args[3]; 3771 goto gen_simd; 3772#if TCG_TARGET_REG_BITS == 32 3773 case INDEX_op_dup2_vec: 3774 /* First merge the two 32-bit inputs to a single 64-bit element. */ 3775 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2); 3776 /* Then replicate the 64-bit elements across the rest of the vector. 
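 * (VPUNPCKLDQ above interleaves the low 32-bit elements of the two inputs,
 * forming the desired 64-bit pair in the low lane; tcg_out_dup_vec then
 * broadcasts that lane when the vector type is wider than V64.)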
*/ 3777 if (type != TCG_TYPE_V64) { 3778 tcg_out_dup_vec(s, type, MO_64, a0, a0); 3779 } 3780 break; 3781#endif 3782 case INDEX_op_abs_vec: 3783 insn = abs_insn[vece]; 3784 a2 = a1; 3785 a1 = 0; 3786 goto gen_simd; 3787 gen_simd: 3788 tcg_debug_assert(insn != OPC_UD2); 3789 tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type); 3790 break; 3791 3792 case INDEX_op_cmp_vec: 3793 tcg_out_cmp_vec(s, type, vece, a0, a1, a2, args[3]); 3794 break; 3795 3796 case INDEX_op_cmpsel_vec: 3797 tcg_out_cmpsel_vec(s, type, vece, a0, a1, a2, 3798 args[3], args[4], args[5]); 3799 break; 3800 3801 case INDEX_op_andc_vec: 3802 insn = OPC_PANDN; 3803 tcg_out_vex_modrm_type(s, insn, a0, a2, a1, type); 3804 break; 3805 3806 case INDEX_op_shli_vec: 3807 insn = shift_imm_insn[vece]; 3808 sub = 6; 3809 goto gen_shift; 3810 case INDEX_op_shri_vec: 3811 insn = shift_imm_insn[vece]; 3812 sub = 2; 3813 goto gen_shift; 3814 case INDEX_op_sari_vec: 3815 if (vece == MO_64) { 3816 insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX; 3817 } else { 3818 insn = shift_imm_insn[vece]; 3819 } 3820 sub = 4; 3821 goto gen_shift; 3822 case INDEX_op_rotli_vec: 3823 insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ 3824 if (vece == MO_64) { 3825 insn |= P_VEXW; 3826 } 3827 sub = 1; 3828 goto gen_shift; 3829 gen_shift: 3830 tcg_debug_assert(vece != MO_8); 3831 tcg_out_vex_modrm_type(s, insn, sub, a0, a1, type); 3832 tcg_out8(s, a2); 3833 break; 3834 3835 case INDEX_op_ld_vec: 3836 tcg_out_ld(s, type, a0, a1, a2); 3837 break; 3838 case INDEX_op_st_vec: 3839 tcg_out_st(s, type, a0, a1, a2); 3840 break; 3841 case INDEX_op_dupm_vec: 3842 tcg_out_dupm_vec(s, type, vece, a0, a1, a2); 3843 break; 3844 3845 case INDEX_op_x86_shufps_vec: 3846 insn = OPC_SHUFPS; 3847 sub = args[3]; 3848 goto gen_simd_imm8; 3849 case INDEX_op_x86_blend_vec: 3850 if (vece == MO_16) { 3851 insn = OPC_PBLENDW; 3852 } else if (vece == MO_32) { 3853 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS); 3854 } else { 3855 g_assert_not_reached(); 3856 } 3857 sub = args[3]; 3858 goto gen_simd_imm8; 3859 case INDEX_op_x86_vperm2i128_vec: 3860 insn = OPC_VPERM2I128; 3861 sub = args[3]; 3862 goto gen_simd_imm8; 3863 case INDEX_op_x86_vpshldi_vec: 3864 insn = vpshldi_insn[vece]; 3865 sub = args[3]; 3866 goto gen_simd_imm8; 3867 3868 case INDEX_op_not_vec: 3869 insn = OPC_VPTERNLOGQ; 3870 a2 = a1; 3871 sub = 0x33; /* !B */ 3872 goto gen_simd_imm8; 3873 case INDEX_op_nor_vec: 3874 insn = OPC_VPTERNLOGQ; 3875 sub = 0x11; /* norCB */ 3876 goto gen_simd_imm8; 3877 case INDEX_op_nand_vec: 3878 insn = OPC_VPTERNLOGQ; 3879 sub = 0x77; /* nandCB */ 3880 goto gen_simd_imm8; 3881 case INDEX_op_eqv_vec: 3882 insn = OPC_VPTERNLOGQ; 3883 sub = 0x99; /* xnorCB */ 3884 goto gen_simd_imm8; 3885 case INDEX_op_orc_vec: 3886 insn = OPC_VPTERNLOGQ; 3887 sub = 0xdd; /* orB!C */ 3888 goto gen_simd_imm8; 3889 3890 case INDEX_op_bitsel_vec: 3891 insn = OPC_VPTERNLOGQ; 3892 a3 = args[3]; 3893 if (a0 == a1) { 3894 a1 = a2; 3895 a2 = a3; 3896 sub = 0xca; /* A?B:C */ 3897 } else if (a0 == a2) { 3898 a2 = a3; 3899 sub = 0xe2; /* B?A:C */ 3900 } else { 3901 tcg_out_mov(s, type, a0, a3); 3902 sub = 0xb8; /* B?C:A */ 3903 } 3904 goto gen_simd_imm8; 3905 3906 gen_simd_imm8: 3907 tcg_debug_assert(insn != OPC_UD2); 3908 tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type); 3909 tcg_out8(s, sub); 3910 break; 3911 3912 case INDEX_op_x86_psrldq_vec: 3913 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1); 3914 tcg_out8(s, a2); 3915 break; 3916 3917 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. 
*/ 3918 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */ 3919 default: 3920 g_assert_not_reached(); 3921 } 3922} 3923 3924static TCGConstraintSetIndex 3925tcg_target_op_def(TCGOpcode op, TCGType type, unsigned flags) 3926{ 3927 switch (op) { 3928 case INDEX_op_goto_ptr: 3929 return C_O0_I1(r); 3930 3931 case INDEX_op_ld8u_i32: 3932 case INDEX_op_ld8u_i64: 3933 case INDEX_op_ld8s_i32: 3934 case INDEX_op_ld8s_i64: 3935 case INDEX_op_ld16u_i32: 3936 case INDEX_op_ld16u_i64: 3937 case INDEX_op_ld16s_i32: 3938 case INDEX_op_ld16s_i64: 3939 case INDEX_op_ld_i32: 3940 case INDEX_op_ld32u_i64: 3941 case INDEX_op_ld32s_i64: 3942 case INDEX_op_ld_i64: 3943 return C_O1_I1(r, r); 3944 3945 case INDEX_op_st8_i32: 3946 case INDEX_op_st8_i64: 3947 return C_O0_I2(qi, r); 3948 3949 case INDEX_op_st16_i32: 3950 case INDEX_op_st16_i64: 3951 case INDEX_op_st_i32: 3952 case INDEX_op_st32_i64: 3953 return C_O0_I2(ri, r); 3954 3955 case INDEX_op_st_i64: 3956 return C_O0_I2(re, r); 3957 3958 case INDEX_op_bswap16_i32: 3959 case INDEX_op_bswap16_i64: 3960 case INDEX_op_bswap32_i32: 3961 case INDEX_op_bswap32_i64: 3962 case INDEX_op_bswap64_i64: 3963 case INDEX_op_extrh_i64_i32: 3964 return C_O1_I1(r, 0); 3965 3966 case INDEX_op_ext_i32_i64: 3967 case INDEX_op_extu_i32_i64: 3968 case INDEX_op_extrl_i64_i32: 3969 case INDEX_op_extract_i32: 3970 case INDEX_op_extract_i64: 3971 case INDEX_op_sextract_i32: 3972 case INDEX_op_sextract_i64: 3973 return C_O1_I1(r, r); 3974 3975 case INDEX_op_extract2_i32: 3976 case INDEX_op_extract2_i64: 3977 return C_O1_I2(r, 0, r); 3978 3979 case INDEX_op_deposit_i32: 3980 case INDEX_op_deposit_i64: 3981 return C_O1_I2(q, 0, qi); 3982 3983 case INDEX_op_add2_i32: 3984 case INDEX_op_add2_i64: 3985 case INDEX_op_sub2_i32: 3986 case INDEX_op_sub2_i64: 3987 return C_N1_O1_I4(r, r, 0, 1, re, re); 3988 3989 case INDEX_op_qemu_ld_i32: 3990 return C_O1_I1(r, L); 3991 3992 case INDEX_op_qemu_st_i32: 3993 return C_O0_I2(L, L); 3994 case INDEX_op_qemu_st8_i32: 3995 return C_O0_I2(s, L); 3996 3997 case INDEX_op_qemu_ld_i64: 3998 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L); 3999 4000 case INDEX_op_qemu_st_i64: 4001 return TCG_TARGET_REG_BITS == 64 ? 
C_O0_I2(L, L) : C_O0_I3(L, L, L); 4002 4003 case INDEX_op_qemu_ld_i128: 4004 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 4005 return C_O2_I1(r, r, L); 4006 case INDEX_op_qemu_st_i128: 4007 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 4008 return C_O0_I3(L, L, L); 4009 4010 case INDEX_op_brcond2_i32: 4011 return C_O0_I4(r, r, ri, ri); 4012 4013 case INDEX_op_setcond2_i32: 4014 return C_O1_I4(r, r, r, ri, ri); 4015 4016 case INDEX_op_ld_vec: 4017 case INDEX_op_dupm_vec: 4018 return C_O1_I1(x, r); 4019 4020 case INDEX_op_st_vec: 4021 return C_O0_I2(x, r); 4022 4023 case INDEX_op_add_vec: 4024 case INDEX_op_sub_vec: 4025 case INDEX_op_mul_vec: 4026 case INDEX_op_and_vec: 4027 case INDEX_op_or_vec: 4028 case INDEX_op_xor_vec: 4029 case INDEX_op_andc_vec: 4030 case INDEX_op_orc_vec: 4031 case INDEX_op_nand_vec: 4032 case INDEX_op_nor_vec: 4033 case INDEX_op_eqv_vec: 4034 case INDEX_op_ssadd_vec: 4035 case INDEX_op_usadd_vec: 4036 case INDEX_op_sssub_vec: 4037 case INDEX_op_ussub_vec: 4038 case INDEX_op_smin_vec: 4039 case INDEX_op_umin_vec: 4040 case INDEX_op_smax_vec: 4041 case INDEX_op_umax_vec: 4042 case INDEX_op_shlv_vec: 4043 case INDEX_op_shrv_vec: 4044 case INDEX_op_sarv_vec: 4045 case INDEX_op_rotlv_vec: 4046 case INDEX_op_rotrv_vec: 4047 case INDEX_op_shls_vec: 4048 case INDEX_op_shrs_vec: 4049 case INDEX_op_sars_vec: 4050 case INDEX_op_cmp_vec: 4051 case INDEX_op_x86_shufps_vec: 4052 case INDEX_op_x86_blend_vec: 4053 case INDEX_op_x86_packss_vec: 4054 case INDEX_op_x86_packus_vec: 4055 case INDEX_op_x86_vperm2i128_vec: 4056 case INDEX_op_x86_punpckl_vec: 4057 case INDEX_op_x86_punpckh_vec: 4058 case INDEX_op_x86_vpshldi_vec: 4059#if TCG_TARGET_REG_BITS == 32 4060 case INDEX_op_dup2_vec: 4061#endif 4062 return C_O1_I2(x, x, x); 4063 4064 case INDEX_op_abs_vec: 4065 case INDEX_op_dup_vec: 4066 case INDEX_op_not_vec: 4067 case INDEX_op_shli_vec: 4068 case INDEX_op_shri_vec: 4069 case INDEX_op_sari_vec: 4070 case INDEX_op_rotli_vec: 4071 case INDEX_op_x86_psrldq_vec: 4072 return C_O1_I1(x, x); 4073 4074 case INDEX_op_x86_vpshldv_vec: 4075 case INDEX_op_x86_vpshrdv_vec: 4076 return C_O1_I3(x, 0, x, x); 4077 4078 case INDEX_op_bitsel_vec: 4079 return C_O1_I3(x, x, x, x); 4080 case INDEX_op_cmpsel_vec: 4081 return C_O1_I4(x, x, x, xO, x); 4082 4083 default: 4084 return C_NotImplemented; 4085 } 4086} 4087 4088int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) 4089{ 4090 switch (opc) { 4091 case INDEX_op_add_vec: 4092 case INDEX_op_sub_vec: 4093 case INDEX_op_and_vec: 4094 case INDEX_op_or_vec: 4095 case INDEX_op_xor_vec: 4096 case INDEX_op_andc_vec: 4097 case INDEX_op_orc_vec: 4098 case INDEX_op_nand_vec: 4099 case INDEX_op_nor_vec: 4100 case INDEX_op_eqv_vec: 4101 case INDEX_op_not_vec: 4102 case INDEX_op_bitsel_vec: 4103 return 1; 4104 case INDEX_op_cmp_vec: 4105 case INDEX_op_cmpsel_vec: 4106 return -1; 4107 4108 case INDEX_op_rotli_vec: 4109 return have_avx512vl && vece >= MO_32 ? 1 : -1; 4110 4111 case INDEX_op_shli_vec: 4112 case INDEX_op_shri_vec: 4113 /* We must expand the operation for MO_8. */ 4114 return vece == MO_8 ? -1 : 1; 4115 4116 case INDEX_op_sari_vec: 4117 switch (vece) { 4118 case MO_8: 4119 return -1; 4120 case MO_16: 4121 case MO_32: 4122 return 1; 4123 case MO_64: 4124 if (have_avx512vl) { 4125 return 1; 4126 } 4127 /* 4128 * We can emulate this for MO_64, but it does not pay off 4129 * unless we're producing at least 4 values. 4130 */ 4131 return type >= TCG_TYPE_V256 ? 
-1 : 0; 4132 } 4133 return 0; 4134 4135 case INDEX_op_shls_vec: 4136 case INDEX_op_shrs_vec: 4137 return vece >= MO_16; 4138 case INDEX_op_sars_vec: 4139 switch (vece) { 4140 case MO_16: 4141 case MO_32: 4142 return 1; 4143 case MO_64: 4144 return have_avx512vl; 4145 } 4146 return 0; 4147 case INDEX_op_rotls_vec: 4148 return vece >= MO_16 ? -1 : 0; 4149 4150 case INDEX_op_shlv_vec: 4151 case INDEX_op_shrv_vec: 4152 switch (vece) { 4153 case MO_16: 4154 return have_avx512bw; 4155 case MO_32: 4156 case MO_64: 4157 return have_avx2; 4158 } 4159 return 0; 4160 case INDEX_op_sarv_vec: 4161 switch (vece) { 4162 case MO_16: 4163 return have_avx512bw; 4164 case MO_32: 4165 return have_avx2; 4166 case MO_64: 4167 return have_avx512vl; 4168 } 4169 return 0; 4170 case INDEX_op_rotlv_vec: 4171 case INDEX_op_rotrv_vec: 4172 switch (vece) { 4173 case MO_16: 4174 return have_avx512vbmi2 ? -1 : 0; 4175 case MO_32: 4176 case MO_64: 4177 return have_avx512vl ? 1 : have_avx2 ? -1 : 0; 4178 } 4179 return 0; 4180 4181 case INDEX_op_mul_vec: 4182 switch (vece) { 4183 case MO_8: 4184 return -1; 4185 case MO_64: 4186 return have_avx512dq; 4187 } 4188 return 1; 4189 4190 case INDEX_op_ssadd_vec: 4191 case INDEX_op_usadd_vec: 4192 case INDEX_op_sssub_vec: 4193 case INDEX_op_ussub_vec: 4194 return vece <= MO_16; 4195 case INDEX_op_smin_vec: 4196 case INDEX_op_smax_vec: 4197 case INDEX_op_umin_vec: 4198 case INDEX_op_umax_vec: 4199 case INDEX_op_abs_vec: 4200 return vece <= MO_32 || have_avx512vl; 4201 4202 default: 4203 return 0; 4204 } 4205} 4206 4207static void expand_vec_shi(TCGType type, unsigned vece, bool right, 4208 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 4209{ 4210 uint8_t mask; 4211 4212 tcg_debug_assert(vece == MO_8); 4213 if (right) { 4214 mask = 0xff >> imm; 4215 tcg_gen_shri_vec(MO_16, v0, v1, imm); 4216 } else { 4217 mask = 0xff << imm; 4218 tcg_gen_shli_vec(MO_16, v0, v1, imm); 4219 } 4220 tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask)); 4221} 4222 4223static void expand_vec_sari(TCGType type, unsigned vece, 4224 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 4225{ 4226 TCGv_vec t1, t2; 4227 4228 switch (vece) { 4229 case MO_8: 4230 /* Unpack to 16-bit, shift, and repack. */ 4231 t1 = tcg_temp_new_vec(type); 4232 t2 = tcg_temp_new_vec(type); 4233 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 4234 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 4235 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 4236 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 4237 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8); 4238 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8); 4239 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, 4240 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 4241 tcg_temp_free_vec(t1); 4242 tcg_temp_free_vec(t2); 4243 break; 4244 4245 case MO_64: 4246 t1 = tcg_temp_new_vec(type); 4247 if (imm <= 32) { 4248 /* 4249 * We can emulate a small sign extend by performing an arithmetic 4250 * 32-bit shift and overwriting the high half of a 64-bit logical 4251 * shift. Note that the ISA says shift of 32 is valid, but TCG 4252 * does not, so we have to bound the smaller shift -- we get the 4253 * same result in the high half either way. 4254 */ 4255 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31)); 4256 tcg_gen_shri_vec(MO_64, v0, v1, imm); 4257 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32, 4258 tcgv_vec_arg(v0), tcgv_vec_arg(v0), 4259 tcgv_vec_arg(t1), 0xaa); 4260 } else { 4261 /* Otherwise we will need to use a compare vs 0 to produce 4262 * the sign-extend, shift and merge. 
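             * For example, with imm = 40: the compare leaves t1 all-ones in
             * lanes where v1 is negative and zero elsewhere; the logical
             * shift puts the value bits of v1 in positions 0..23 of v0; the
             * shift of t1 by 64 - 40 = 24 supplies the sign copies in
             * positions 24..63; and the final OR merges the two halves.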
4263 */ 4264 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, 4265 tcg_constant_vec(type, MO_64, 0), v1); 4266 tcg_gen_shri_vec(MO_64, v0, v1, imm); 4267 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm); 4268 tcg_gen_or_vec(MO_64, v0, v0, t1); 4269 } 4270 tcg_temp_free_vec(t1); 4271 break; 4272 4273 default: 4274 g_assert_not_reached(); 4275 } 4276} 4277 4278static void expand_vec_rotli(TCGType type, unsigned vece, 4279 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 4280{ 4281 TCGv_vec t; 4282 4283 if (vece != MO_8 && have_avx512vbmi2) { 4284 vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece, 4285 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm); 4286 return; 4287 } 4288 4289 t = tcg_temp_new_vec(type); 4290 tcg_gen_shli_vec(vece, t, v1, imm); 4291 tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm); 4292 tcg_gen_or_vec(vece, v0, v0, t); 4293 tcg_temp_free_vec(t); 4294} 4295 4296static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0, 4297 TCGv_vec v1, TCGv_vec sh, bool right) 4298{ 4299 TCGv_vec t; 4300 4301 if (have_avx512vbmi2) { 4302 vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec, 4303 type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1), 4304 tcgv_vec_arg(v1), tcgv_vec_arg(sh)); 4305 return; 4306 } 4307 4308 t = tcg_temp_new_vec(type); 4309 tcg_gen_dupi_vec(vece, t, 8 << vece); 4310 tcg_gen_sub_vec(vece, t, t, sh); 4311 if (right) { 4312 tcg_gen_shlv_vec(vece, t, v1, t); 4313 tcg_gen_shrv_vec(vece, v0, v1, sh); 4314 } else { 4315 tcg_gen_shrv_vec(vece, t, v1, t); 4316 tcg_gen_shlv_vec(vece, v0, v1, sh); 4317 } 4318 tcg_gen_or_vec(vece, v0, v0, t); 4319 tcg_temp_free_vec(t); 4320} 4321 4322static void expand_vec_rotls(TCGType type, unsigned vece, 4323 TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh) 4324{ 4325 TCGv_vec t = tcg_temp_new_vec(type); 4326 4327 tcg_debug_assert(vece != MO_8); 4328 4329 if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) { 4330 tcg_gen_dup_i32_vec(vece, t, lsh); 4331 if (vece >= MO_32) { 4332 tcg_gen_rotlv_vec(vece, v0, v1, t); 4333 } else { 4334 expand_vec_rotv(type, vece, v0, v1, t, false); 4335 } 4336 } else { 4337 TCGv_i32 rsh = tcg_temp_new_i32(); 4338 4339 tcg_gen_neg_i32(rsh, lsh); 4340 tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1); 4341 tcg_gen_shls_vec(vece, t, v1, lsh); 4342 tcg_gen_shrs_vec(vece, v0, v1, rsh); 4343 tcg_gen_or_vec(vece, v0, v0, t); 4344 4345 tcg_temp_free_i32(rsh); 4346 } 4347 4348 tcg_temp_free_vec(t); 4349} 4350 4351static void expand_vec_mul(TCGType type, unsigned vece, 4352 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2) 4353{ 4354 TCGv_vec t1, t2, t3, t4, zero; 4355 4356 tcg_debug_assert(vece == MO_8); 4357 4358 /* 4359 * Unpack v1 bytes to words, 0 | x. 4360 * Unpack v2 bytes to words, y | 0. 4361 * This leaves the 8-bit result, x * y, with 8 bits of right padding. 4362 * Shift logical right by 8 bits to clear the high 8 bytes before 4363 * using an unsigned saturated pack. 4364 * 4365 * The difference between the V64, V128 and V256 cases is merely how 4366 * we distribute the expansion between temporaries. 
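     *
     * Per 16-bit lane, the unpacked operands are x (zero-extended) and
     * y << 8, so the low 16 bits of their product are (x * y) << 8; the
     * shift right by 8 leaves (x * y) & 0xff with a clear high byte, which
     * the unsigned saturating pack narrows back to bytes without saturating.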
4367 */ 4368 switch (type) { 4369 case TCG_TYPE_V64: 4370 t1 = tcg_temp_new_vec(TCG_TYPE_V128); 4371 t2 = tcg_temp_new_vec(TCG_TYPE_V128); 4372 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 4373 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 4374 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 4375 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 4376 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 4377 tcg_gen_mul_vec(MO_16, t1, t1, t2); 4378 tcg_gen_shri_vec(MO_16, t1, t1, 8); 4379 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, 4380 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1)); 4381 tcg_temp_free_vec(t1); 4382 tcg_temp_free_vec(t2); 4383 break; 4384 4385 case TCG_TYPE_V128: 4386 case TCG_TYPE_V256: 4387 t1 = tcg_temp_new_vec(type); 4388 t2 = tcg_temp_new_vec(type); 4389 t3 = tcg_temp_new_vec(type); 4390 t4 = tcg_temp_new_vec(type); 4391 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 4392 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 4393 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 4394 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 4395 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 4396 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 4397 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 4398 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 4399 tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 4400 tcg_gen_mul_vec(MO_16, t1, t1, t2); 4401 tcg_gen_mul_vec(MO_16, t3, t3, t4); 4402 tcg_gen_shri_vec(MO_16, t1, t1, 8); 4403 tcg_gen_shri_vec(MO_16, t3, t3, 8); 4404 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, 4405 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3)); 4406 tcg_temp_free_vec(t1); 4407 tcg_temp_free_vec(t2); 4408 tcg_temp_free_vec(t3); 4409 tcg_temp_free_vec(t4); 4410 break; 4411 4412 default: 4413 g_assert_not_reached(); 4414 } 4415} 4416 4417static TCGCond expand_vec_cond(TCGType type, unsigned vece, 4418 TCGArg *a1, TCGArg *a2, TCGCond cond) 4419{ 4420 /* 4421 * Without AVX512, there are no 64-bit unsigned comparisons. 4422 * We must bias the inputs so that they become signed. 4423 * All other swapping and inversion are handled during code generation. 4424 */ 4425 if (vece == MO_64 && !have_avx512dq && is_unsigned_cond(cond)) { 4426 TCGv_vec v1 = temp_tcgv_vec(arg_temp(*a1)); 4427 TCGv_vec v2 = temp_tcgv_vec(arg_temp(*a2)); 4428 TCGv_vec t1 = tcg_temp_new_vec(type); 4429 TCGv_vec t2 = tcg_temp_new_vec(type); 4430 TCGv_vec t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1)); 4431 4432 tcg_gen_sub_vec(vece, t1, v1, t3); 4433 tcg_gen_sub_vec(vece, t2, v2, t3); 4434 *a1 = tcgv_vec_arg(t1); 4435 *a2 = tcgv_vec_arg(t2); 4436 cond = tcg_signed_cond(cond); 4437 } 4438 return cond; 4439} 4440 4441static void expand_vec_cmp(TCGType type, unsigned vece, TCGArg a0, 4442 TCGArg a1, TCGArg a2, TCGCond cond) 4443{ 4444 cond = expand_vec_cond(type, vece, &a1, &a2, cond); 4445 /* Expand directly; do not recurse. */ 4446 vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond); 4447} 4448 4449static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGArg a0, 4450 TCGArg a1, TCGArg a2, 4451 TCGArg a3, TCGArg a4, TCGCond cond) 4452{ 4453 cond = expand_vec_cond(type, vece, &a1, &a2, cond); 4454 /* Expand directly; do not recurse. */ 4455 vec_gen_6(INDEX_op_cmpsel_vec, type, vece, a0, a1, a2, a3, a4, cond); 4456} 4457 4458void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, 4459 TCGArg a0, ...) 
{
    va_list va;
    TCGArg a1, a2, a3, a4, a5;
    TCGv_vec v0, v1, v2;

    va_start(va, a0);
    a1 = va_arg(va, TCGArg);
    a2 = va_arg(va, TCGArg);
    v0 = temp_tcgv_vec(arg_temp(a0));
    v1 = temp_tcgv_vec(arg_temp(a1));

    switch (opc) {
    case INDEX_op_shli_vec:
        expand_vec_shi(type, vece, false, v0, v1, a2);
        break;
    case INDEX_op_shri_vec:
        expand_vec_shi(type, vece, true, v0, v1, a2);
        break;
    case INDEX_op_sari_vec:
        expand_vec_sari(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotli_vec:
        expand_vec_rotli(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotls_vec:
        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
        break;

    case INDEX_op_rotlv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, false);
        break;
    case INDEX_op_rotrv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, true);
        break;

    case INDEX_op_mul_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_mul(type, vece, v0, v1, v2);
        break;

    case INDEX_op_cmp_vec:
        a3 = va_arg(va, TCGArg);
        expand_vec_cmp(type, vece, a0, a1, a2, a3);
        break;

    case INDEX_op_cmpsel_vec:
        a3 = va_arg(va, TCGArg);
        a4 = va_arg(va, TCGArg);
        a5 = va_arg(va, TCGArg);
        expand_vec_cmpsel(type, vece, a0, a1, a2, a3, a4, a5);
        break;

    default:
        break;
    }

    va_end(va);
}

static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};

/* Compute frame size via macros, to share between tcg_target_qemu_prologue
   and tcg_register_jit. */

#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))

/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* TB prologue */

    /* Reserve some stack space, also for TCG temps. */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers. */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

    if (!tcg_use_softmmu && guest_base) {
        int seg = setup_guest_base_seg();
        if (seg != 0) {
            x86_guest_base.seg = seg;
        } else if (guest_base == (int32_t)guest_base) {
            x86_guest_base.ofs = guest_base;
        } else {
            assert(TCG_TARGET_REG_BITS == 64);
            /* Choose R12 because, as a base, it requires a SIB byte.
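               (R12, like RSP, always needs a SIB byte when it is the base
               register, so it is presumably the cheapest general register
               to give up for this purpose.)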
             */
            x86_guest_base.index = TCG_REG_R12;
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
        }
    }

    if (TCG_TARGET_REG_BITS == 32) {
        tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
                   (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb. */
        tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                             (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                             + stack_addend);
    } else {
        tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb. */
        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
    }

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);

    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
}

static void tcg_out_tb_start(TCGContext *s)
{
    /* nothing to do */
}

static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
    memset(p, 0x90, count);
}

static void tcg_target_init(TCGContext *s)
{
    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
    if (TCG_TARGET_REG_BITS == 64) {
        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
    }
    if (have_avx1) {
        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
    }
    if (have_avx2) {
        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
    }

    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
    if (TCG_TARGET_REG_BITS == 64) {
#if !defined(_WIN64)
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
#endif
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
    }

    s->reserved_regs = 0;
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
#ifdef _WIN64
    /* These are call saved, and we don't save them, so don't use them.
     */
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
#endif
}

typedef struct {
    DebugFrameHeader h;
    uint8_t fde_def_cfa[4];
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/* We're expecting a 2 byte uleb128 encoded value. */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));

#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member. */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs. */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member. */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs. */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif

#if defined(ELF_HOST_MACHINE)
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif
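
/*
 * Note on the .fde_def_cfa encoding above: FRAME_SIZE is emitted as a
 * two-byte uleb128 value, the first byte carrying the low 7 bits with the
 * continuation bit set and the second byte carrying the remaining bits.
 * As a hypothetical example, a FRAME_SIZE of 424 (0x1a8) would encode as
 * 0xa8, 0x03.  The QEMU_BUILD_BUG_ON above guarantees that FRAME_SIZE
 * fits in 14 bits, i.e. in this two-byte form.
 */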