/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Used for function call generation. */
#define TCG_TARGET_STACK_ALIGN 16
#if defined(_WIN64)
#define TCG_TARGET_CALL_STACK_OFFSET 32
#else
#define TCG_TARGET_CALL_STACK_OFFSET 0
#endif
#define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_NORMAL
#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
#if defined(_WIN64)
# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_BY_REF
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_BY_VEC
#elif TCG_TARGET_REG_BITS == 64
# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_NORMAL
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_NORMAL
#else
# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_NORMAL
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_BY_REF
#endif

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.
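       (xmm0-xmm5 are the volatile half of the xmm register file in that
       convention, so no prologue save/restore is needed for them.)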
*/ 96 TCG_REG_XMM6, 97 TCG_REG_XMM7, 98#if TCG_TARGET_REG_BITS == 64 99 TCG_REG_XMM8, 100 TCG_REG_XMM9, 101 TCG_REG_XMM10, 102 TCG_REG_XMM11, 103 TCG_REG_XMM12, 104 TCG_REG_XMM13, 105 TCG_REG_XMM14, 106 TCG_REG_XMM15, 107#endif 108#endif 109}; 110 111#define TCG_TMP_VEC TCG_REG_XMM5 112 113static const int tcg_target_call_iarg_regs[] = { 114#if TCG_TARGET_REG_BITS == 64 115#if defined(_WIN64) 116 TCG_REG_RCX, 117 TCG_REG_RDX, 118#else 119 TCG_REG_RDI, 120 TCG_REG_RSI, 121 TCG_REG_RDX, 122 TCG_REG_RCX, 123#endif 124 TCG_REG_R8, 125 TCG_REG_R9, 126#else 127 /* 32 bit mode uses stack based calling convention (GCC default). */ 128#endif 129}; 130 131static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot) 132{ 133 switch (kind) { 134 case TCG_CALL_RET_NORMAL: 135 tcg_debug_assert(slot >= 0 && slot <= 1); 136 return slot ? TCG_REG_EDX : TCG_REG_EAX; 137#ifdef _WIN64 138 case TCG_CALL_RET_BY_VEC: 139 tcg_debug_assert(slot == 0); 140 return TCG_REG_XMM0; 141#endif 142 default: 143 g_assert_not_reached(); 144 } 145} 146 147/* Constants we accept. */ 148#define TCG_CT_CONST_S32 0x100 149#define TCG_CT_CONST_U32 0x200 150#define TCG_CT_CONST_I32 0x400 151#define TCG_CT_CONST_WSZ 0x800 152#define TCG_CT_CONST_TST 0x1000 153#define TCG_CT_CONST_ZERO 0x2000 154 155/* Registers used with L constraint, which are the first argument 156 registers on x86_64, and two random call clobbered registers on 157 i386. */ 158#if TCG_TARGET_REG_BITS == 64 159# define TCG_REG_L0 tcg_target_call_iarg_regs[0] 160# define TCG_REG_L1 tcg_target_call_iarg_regs[1] 161#else 162# define TCG_REG_L0 TCG_REG_EAX 163# define TCG_REG_L1 TCG_REG_EDX 164#endif 165 166#if TCG_TARGET_REG_BITS == 64 167# define ALL_GENERAL_REGS 0x0000ffffu 168# define ALL_VECTOR_REGS 0xffff0000u 169# define ALL_BYTEL_REGS ALL_GENERAL_REGS 170#else 171# define ALL_GENERAL_REGS 0x000000ffu 172# define ALL_VECTOR_REGS 0x00ff0000u 173# define ALL_BYTEL_REGS 0x0000000fu 174#endif 175#define SOFTMMU_RESERVE_REGS \ 176 (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0) 177 178#define have_bmi2 (cpuinfo & CPUINFO_BMI2) 179#define have_lzcnt (cpuinfo & CPUINFO_LZCNT) 180 181static const tcg_insn_unit *tb_ret_addr; 182 183static bool patch_reloc(tcg_insn_unit *code_ptr, int type, 184 intptr_t value, intptr_t addend) 185{ 186 value += addend; 187 switch(type) { 188 case R_386_PC32: 189 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 190 if (value != (int32_t)value) { 191 return false; 192 } 193 /* FALLTHRU */ 194 case R_386_32: 195 tcg_patch32(code_ptr, value); 196 break; 197 case R_386_PC8: 198 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 199 if (value != (int8_t)value) { 200 return false; 201 } 202 tcg_patch8(code_ptr, value); 203 break; 204 default: 205 g_assert_not_reached(); 206 } 207 return true; 208} 209 210/* test if a constant matches the constraint */ 211static bool tcg_target_const_match(int64_t val, int ct, 212 TCGType type, TCGCond cond, int vece) 213{ 214 if (ct & TCG_CT_CONST) { 215 return 1; 216 } 217 if (type == TCG_TYPE_I32) { 218 if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | 219 TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) { 220 return 1; 221 } 222 } else { 223 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) { 224 return 1; 225 } 226 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) { 227 return 1; 228 } 229 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) { 230 return 1; 231 } 232 /* 233 * This will be used in combination with TCG_CT_CONST_S32, 234 * so "normal" TESTQ is already matched. 
Also accept: 235 * TESTQ -> TESTL (uint32_t) 236 * TESTQ -> BT (is_power_of_2) 237 */ 238 if ((ct & TCG_CT_CONST_TST) 239 && is_tst_cond(cond) 240 && (val == (uint32_t)val || is_power_of_2(val))) { 241 return 1; 242 } 243 } 244 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) { 245 return 1; 246 } 247 if ((ct & TCG_CT_CONST_ZERO) && val == 0) { 248 return 1; 249 } 250 return 0; 251} 252 253# define LOWREGMASK(x) ((x) & 7) 254 255#define P_EXT 0x100 /* 0x0f opcode prefix */ 256#define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */ 257#define P_DATA16 0x400 /* 0x66 opcode prefix */ 258#define P_VEXW 0x1000 /* Set VEX.W = 1 */ 259#if TCG_TARGET_REG_BITS == 64 260# define P_REXW P_VEXW /* Set REX.W = 1; match VEXW */ 261# define P_REXB_R 0x2000 /* REG field as byte register */ 262# define P_REXB_RM 0x4000 /* R/M field as byte register */ 263# define P_GS 0x8000 /* gs segment override */ 264#else 265# define P_REXW 0 266# define P_REXB_R 0 267# define P_REXB_RM 0 268# define P_GS 0 269#endif 270#define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */ 271#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */ 272#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */ 273#define P_VEXL 0x80000 /* Set VEX.L = 1 */ 274#define P_EVEX 0x100000 /* Requires EVEX encoding */ 275 276#define OPC_ARITH_EbIb (0x80) 277#define OPC_ARITH_EvIz (0x81) 278#define OPC_ARITH_EvIb (0x83) 279#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */ 280#define OPC_ANDN (0xf2 | P_EXT38) 281#define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3)) 282#define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3)) 283#define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16) 284#define OPC_BSF (0xbc | P_EXT) 285#define OPC_BSR (0xbd | P_EXT) 286#define OPC_BSWAP (0xc8 | P_EXT) 287#define OPC_CALL_Jz (0xe8) 288#define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */ 289#define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3)) 290#define OPC_DEC_r32 (0x48) 291#define OPC_IMUL_GvEv (0xaf | P_EXT) 292#define OPC_IMUL_GvEvIb (0x6b) 293#define OPC_IMUL_GvEvIz (0x69) 294#define OPC_INC_r32 (0x40) 295#define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */ 296#define OPC_JCC_short (0x70) /* ... 
plus condition code */ 297#define OPC_JMP_long (0xe9) 298#define OPC_JMP_short (0xeb) 299#define OPC_LEA (0x8d) 300#define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3) 301#define OPC_MOVB_EvGv (0x88) /* stores, more or less */ 302#define OPC_MOVL_EvGv (0x89) /* stores, more or less */ 303#define OPC_MOVL_GvEv (0x8b) /* loads, more or less */ 304#define OPC_MOVB_EvIz (0xc6) 305#define OPC_MOVL_EvIz (0xc7) 306#define OPC_MOVB_Ib (0xb0) 307#define OPC_MOVL_Iv (0xb8) 308#define OPC_MOVBE_GyMy (0xf0 | P_EXT38) 309#define OPC_MOVBE_MyGy (0xf1 | P_EXT38) 310#define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16) 311#define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16) 312#define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2) 313#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16) 314#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16) 315#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3) 316#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3) 317#define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3) 318#define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16) 319#define OPC_MOVSBL (0xbe | P_EXT) 320#define OPC_MOVSWL (0xbf | P_EXT) 321#define OPC_MOVSLQ (0x63 | P_REXW) 322#define OPC_MOVZBL (0xb6 | P_EXT) 323#define OPC_MOVZWL (0xb7 | P_EXT) 324#define OPC_PABSB (0x1c | P_EXT38 | P_DATA16) 325#define OPC_PABSW (0x1d | P_EXT38 | P_DATA16) 326#define OPC_PABSD (0x1e | P_EXT38 | P_DATA16) 327#define OPC_VPABSQ (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 328#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16) 329#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16) 330#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16) 331#define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16) 332#define OPC_PADDB (0xfc | P_EXT | P_DATA16) 333#define OPC_PADDW (0xfd | P_EXT | P_DATA16) 334#define OPC_PADDD (0xfe | P_EXT | P_DATA16) 335#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16) 336#define OPC_PADDSB (0xec | P_EXT | P_DATA16) 337#define OPC_PADDSW (0xed | P_EXT | P_DATA16) 338#define OPC_PADDUB (0xdc | P_EXT | P_DATA16) 339#define OPC_PADDUW (0xdd | P_EXT | P_DATA16) 340#define OPC_PAND (0xdb | P_EXT | P_DATA16) 341#define OPC_PANDN (0xdf | P_EXT | P_DATA16) 342#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16) 343#define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16) 344#define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16) 345#define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16) 346#define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16) 347#define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16) 348#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16) 349#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16) 350#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16) 351#define OPC_PEXTRD (0x16 | P_EXT3A | P_DATA16) 352#define OPC_PINSRD (0x22 | P_EXT3A | P_DATA16) 353#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16) 354#define OPC_PMAXSW (0xee | P_EXT | P_DATA16) 355#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16) 356#define OPC_VPMAXSQ (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 357#define OPC_PMAXUB (0xde | P_EXT | P_DATA16) 358#define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16) 359#define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16) 360#define OPC_VPMAXUQ (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 361#define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16) 362#define OPC_PMINSW (0xea | P_EXT | P_DATA16) 363#define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16) 364#define OPC_VPMINSQ (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 365#define OPC_PMINUB (0xda | P_EXT | P_DATA16) 366#define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16) 367#define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16) 368#define OPC_VPMINUQ (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 369#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16) 
370#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16) 371#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16) 372#define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16) 373#define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16) 374#define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16) 375#define OPC_PMULLW (0xd5 | P_EXT | P_DATA16) 376#define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16) 377#define OPC_VPMULLQ (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 378#define OPC_POR (0xeb | P_EXT | P_DATA16) 379#define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16) 380#define OPC_PSHUFD (0x70 | P_EXT | P_DATA16) 381#define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2) 382#define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3) 383#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */ 384#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */ 385#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */ 386#define OPC_PSLLW (0xf1 | P_EXT | P_DATA16) 387#define OPC_PSLLD (0xf2 | P_EXT | P_DATA16) 388#define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16) 389#define OPC_PSRAW (0xe1 | P_EXT | P_DATA16) 390#define OPC_PSRAD (0xe2 | P_EXT | P_DATA16) 391#define OPC_VPSRAQ (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX) 392#define OPC_PSRLW (0xd1 | P_EXT | P_DATA16) 393#define OPC_PSRLD (0xd2 | P_EXT | P_DATA16) 394#define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16) 395#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16) 396#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16) 397#define OPC_PSUBD (0xfa | P_EXT | P_DATA16) 398#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16) 399#define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16) 400#define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16) 401#define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16) 402#define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16) 403#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16) 404#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16) 405#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16) 406#define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16) 407#define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16) 408#define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16) 409#define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16) 410#define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16) 411#define OPC_PXOR (0xef | P_EXT | P_DATA16) 412#define OPC_POP_r32 (0x58) 413#define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3) 414#define OPC_PUSH_r32 (0x50) 415#define OPC_PUSH_Iv (0x68) 416#define OPC_PUSH_Ib (0x6a) 417#define OPC_RET (0xc3) 418#define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... 
plus cc */ 419#define OPC_SHIFT_1 (0xd1) 420#define OPC_SHIFT_Ib (0xc1) 421#define OPC_SHIFT_cl (0xd3) 422#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3) 423#define OPC_SHUFPS (0xc6 | P_EXT) 424#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16) 425#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2) 426#define OPC_SHRD_Ib (0xac | P_EXT) 427#define OPC_TESTB (0x84) 428#define OPC_TESTL (0x85) 429#define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3) 430#define OPC_UD2 (0x0b | P_EXT) 431#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16) 432#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16) 433#define OPC_VPBLENDMB (0x66 | P_EXT38 | P_DATA16 | P_EVEX) 434#define OPC_VPBLENDMW (0x66 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 435#define OPC_VPBLENDMD (0x64 | P_EXT38 | P_DATA16 | P_EVEX) 436#define OPC_VPBLENDMQ (0x64 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 437#define OPC_VPCMPB (0x3f | P_EXT3A | P_DATA16 | P_EVEX) 438#define OPC_VPCMPUB (0x3e | P_EXT3A | P_DATA16 | P_EVEX) 439#define OPC_VPCMPW (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 440#define OPC_VPCMPUW (0x3e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 441#define OPC_VPCMPD (0x1f | P_EXT3A | P_DATA16 | P_EVEX) 442#define OPC_VPCMPUD (0x1e | P_EXT3A | P_DATA16 | P_EVEX) 443#define OPC_VPCMPQ (0x1f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 444#define OPC_VPCMPUQ (0x1e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 445#define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16) 446#define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16) 447#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16) 448#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16) 449#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16) 450#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16) 451#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16) 452#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16) 453#define OPC_VPMOVM2B (0x28 | P_EXT38 | P_SIMDF3 | P_EVEX) 454#define OPC_VPMOVM2W (0x28 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 455#define OPC_VPMOVM2D (0x38 | P_EXT38 | P_SIMDF3 | P_EVEX) 456#define OPC_VPMOVM2Q (0x38 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 457#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW) 458#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL) 459#define OPC_VPROLVD (0x15 | P_EXT38 | P_DATA16 | P_EVEX) 460#define OPC_VPROLVQ (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 461#define OPC_VPRORVD (0x14 | P_EXT38 | P_DATA16 | P_EVEX) 462#define OPC_VPRORVQ (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 463#define OPC_VPSHLDW (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 464#define OPC_VPSHLDD (0x71 | P_EXT3A | P_DATA16 | P_EVEX) 465#define OPC_VPSHLDQ (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 466#define OPC_VPSHLDVW (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 467#define OPC_VPSHLDVD (0x71 | P_EXT38 | P_DATA16 | P_EVEX) 468#define OPC_VPSHLDVQ (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 469#define OPC_VPSHRDVW (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 470#define OPC_VPSHRDVD (0x73 | P_EXT38 | P_DATA16 | P_EVEX) 471#define OPC_VPSHRDVQ (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 472#define OPC_VPSLLVW (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 473#define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16) 474#define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW) 475#define OPC_VPSRAVW (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 476#define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16) 477#define OPC_VPSRAVQ (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 478#define OPC_VPSRLVW (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 479#define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16) 480#define OPC_VPSRLVQ (0x45 | P_EXT38 | 
P_DATA16 | P_VEXW) 481#define OPC_VPTERNLOGQ (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 482#define OPC_VPTESTMB (0x26 | P_EXT38 | P_DATA16 | P_EVEX) 483#define OPC_VPTESTMW (0x26 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 484#define OPC_VPTESTMD (0x27 | P_EXT38 | P_DATA16 | P_EVEX) 485#define OPC_VPTESTMQ (0x27 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 486#define OPC_VPTESTNMB (0x26 | P_EXT38 | P_SIMDF3 | P_EVEX) 487#define OPC_VPTESTNMW (0x26 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 488#define OPC_VPTESTNMD (0x27 | P_EXT38 | P_SIMDF3 | P_EVEX) 489#define OPC_VPTESTNMQ (0x27 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 490#define OPC_VZEROUPPER (0x77 | P_EXT) 491#define OPC_XCHG_ax_r32 (0x90) 492#define OPC_XCHG_EvGv (0x87) 493 494#define OPC_GRP3_Eb (0xf6) 495#define OPC_GRP3_Ev (0xf7) 496#define OPC_GRP5 (0xff) 497#define OPC_GRP14 (0x73 | P_EXT | P_DATA16) 498#define OPC_GRPBT (0xba | P_EXT) 499 500#define OPC_GRPBT_BT 4 501#define OPC_GRPBT_BTS 5 502#define OPC_GRPBT_BTR 6 503#define OPC_GRPBT_BTC 7 504 505/* Group 1 opcode extensions for 0x80-0x83. 506 These are also used as modifiers for OPC_ARITH. */ 507#define ARITH_ADD 0 508#define ARITH_OR 1 509#define ARITH_ADC 2 510#define ARITH_SBB 3 511#define ARITH_AND 4 512#define ARITH_SUB 5 513#define ARITH_XOR 6 514#define ARITH_CMP 7 515 516/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */ 517#define SHIFT_ROL 0 518#define SHIFT_ROR 1 519#define SHIFT_SHL 4 520#define SHIFT_SHR 5 521#define SHIFT_SAR 7 522 523/* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */ 524#define EXT3_TESTi 0 525#define EXT3_NOT 2 526#define EXT3_NEG 3 527#define EXT3_MUL 4 528#define EXT3_IMUL 5 529#define EXT3_DIV 6 530#define EXT3_IDIV 7 531 532/* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */ 533#define EXT5_INC_Ev 0 534#define EXT5_DEC_Ev 1 535#define EXT5_CALLN_Ev 2 536#define EXT5_JMPN_Ev 4 537 538/* Condition codes to be added to OPC_JCC_{long,short}. */ 539#define JCC_JMP (-1) 540#define JCC_JO 0x0 541#define JCC_JNO 0x1 542#define JCC_JB 0x2 543#define JCC_JAE 0x3 544#define JCC_JE 0x4 545#define JCC_JNE 0x5 546#define JCC_JBE 0x6 547#define JCC_JA 0x7 548#define JCC_JS 0x8 549#define JCC_JNS 0x9 550#define JCC_JP 0xa 551#define JCC_JNP 0xb 552#define JCC_JL 0xc 553#define JCC_JGE 0xd 554#define JCC_JLE 0xe 555#define JCC_JG 0xf 556 557static const uint8_t tcg_cond_to_jcc[] = { 558 [TCG_COND_EQ] = JCC_JE, 559 [TCG_COND_NE] = JCC_JNE, 560 [TCG_COND_LT] = JCC_JL, 561 [TCG_COND_GE] = JCC_JGE, 562 [TCG_COND_LE] = JCC_JLE, 563 [TCG_COND_GT] = JCC_JG, 564 [TCG_COND_LTU] = JCC_JB, 565 [TCG_COND_GEU] = JCC_JAE, 566 [TCG_COND_LEU] = JCC_JBE, 567 [TCG_COND_GTU] = JCC_JA, 568 [TCG_COND_TSTEQ] = JCC_JE, 569 [TCG_COND_TSTNE] = JCC_JNE, 570}; 571 572#if TCG_TARGET_REG_BITS == 64 573static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x) 574{ 575 int rex; 576 577 if (opc & P_GS) { 578 tcg_out8(s, 0x65); 579 } 580 if (opc & P_DATA16) { 581 /* We should never be asking for both 16 and 64-bit operation. */ 582 tcg_debug_assert((opc & P_REXW) == 0); 583 tcg_out8(s, 0x66); 584 } 585 if (opc & P_SIMDF3) { 586 tcg_out8(s, 0xf3); 587 } else if (opc & P_SIMDF2) { 588 tcg_out8(s, 0xf2); 589 } 590 591 rex = 0; 592 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */ 593 rex |= (r & 8) >> 1; /* REX.R */ 594 rex |= (x & 8) >> 2; /* REX.X */ 595 rex |= (rm & 8) >> 3; /* REX.B */ 596 597 /* P_REXB_{R,RM} indicates that the given register is the low byte. 
598 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do, 599 as otherwise the encoding indicates %[abcd]h. Note that the values 600 that are ORed in merely indicate that the REX byte must be present; 601 those bits get discarded in output. */ 602 rex |= opc & (r >= 4 ? P_REXB_R : 0); 603 rex |= opc & (rm >= 4 ? P_REXB_RM : 0); 604 605 if (rex) { 606 tcg_out8(s, (uint8_t)(rex | 0x40)); 607 } 608 609 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 610 tcg_out8(s, 0x0f); 611 if (opc & P_EXT38) { 612 tcg_out8(s, 0x38); 613 } else if (opc & P_EXT3A) { 614 tcg_out8(s, 0x3a); 615 } 616 } 617 618 tcg_out8(s, opc); 619} 620#else 621static void tcg_out_opc(TCGContext *s, int opc) 622{ 623 if (opc & P_DATA16) { 624 tcg_out8(s, 0x66); 625 } 626 if (opc & P_SIMDF3) { 627 tcg_out8(s, 0xf3); 628 } else if (opc & P_SIMDF2) { 629 tcg_out8(s, 0xf2); 630 } 631 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 632 tcg_out8(s, 0x0f); 633 if (opc & P_EXT38) { 634 tcg_out8(s, 0x38); 635 } else if (opc & P_EXT3A) { 636 tcg_out8(s, 0x3a); 637 } 638 } 639 tcg_out8(s, opc); 640} 641/* Discard the register arguments to tcg_out_opc early, so as not to penalize 642 the 32-bit compilation paths. This method works with all versions of gcc, 643 whereas relying on optimization may not be able to exclude them. */ 644#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc) 645#endif 646 647static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) 648{ 649 tcg_out_opc(s, opc, r, rm, 0); 650 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 651} 652 653static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v, 654 int rm, int index) 655{ 656 int tmp; 657 658 if (opc & P_GS) { 659 tcg_out8(s, 0x65); 660 } 661 /* Use the two byte form if possible, which cannot encode 662 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */ 663 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT 664 && ((rm | index) & 8) == 0) { 665 /* Two byte VEX prefix. */ 666 tcg_out8(s, 0xc5); 667 668 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ 669 } else { 670 /* Three byte VEX prefix. */ 671 tcg_out8(s, 0xc4); 672 673 /* VEX.m-mmmm */ 674 if (opc & P_EXT3A) { 675 tmp = 3; 676 } else if (opc & P_EXT38) { 677 tmp = 2; 678 } else if (opc & P_EXT) { 679 tmp = 1; 680 } else { 681 g_assert_not_reached(); 682 } 683 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ 684 tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */ 685 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ 686 tcg_out8(s, tmp); 687 688 tmp = (opc & P_VEXW ? 0x80 : 0); /* VEX.W */ 689 } 690 691 tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */ 692 /* VEX.pp */ 693 if (opc & P_DATA16) { 694 tmp |= 1; /* 0x66 */ 695 } else if (opc & P_SIMDF3) { 696 tmp |= 2; /* 0xf3 */ 697 } else if (opc & P_SIMDF2) { 698 tmp |= 3; /* 0xf2 */ 699 } 700 tmp |= (~v & 15) << 3; /* VEX.vvvv */ 701 tcg_out8(s, tmp); 702 tcg_out8(s, opc); 703} 704 705static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v, 706 int rm, int index, int aaa, bool z) 707{ 708 /* The entire 4-byte evex prefix; with R' and V' set. 
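       The value is stored little-endian: byte 0 is the 0x62 escape byte,
       and the preset bits are the inverted R' and V' fields plus the
       fixed must-be-one bit of byte 2; the variable fields are filled
       in with deposit32() below.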
*/ 709 uint32_t p = 0x08041062; 710 int mm, pp; 711 712 tcg_debug_assert(have_avx512vl); 713 714 /* EVEX.mm */ 715 if (opc & P_EXT3A) { 716 mm = 3; 717 } else if (opc & P_EXT38) { 718 mm = 2; 719 } else if (opc & P_EXT) { 720 mm = 1; 721 } else { 722 g_assert_not_reached(); 723 } 724 725 /* EVEX.pp */ 726 if (opc & P_DATA16) { 727 pp = 1; /* 0x66 */ 728 } else if (opc & P_SIMDF3) { 729 pp = 2; /* 0xf3 */ 730 } else if (opc & P_SIMDF2) { 731 pp = 3; /* 0xf2 */ 732 } else { 733 pp = 0; 734 } 735 736 p = deposit32(p, 8, 2, mm); 737 p = deposit32(p, 13, 1, (rm & 8) == 0); /* EVEX.RXB.B */ 738 p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */ 739 p = deposit32(p, 15, 1, (r & 8) == 0); /* EVEX.RXB.R */ 740 p = deposit32(p, 16, 2, pp); 741 p = deposit32(p, 19, 4, ~v); 742 p = deposit32(p, 23, 1, (opc & P_VEXW) != 0); 743 p = deposit32(p, 24, 3, aaa); 744 p = deposit32(p, 29, 2, (opc & P_VEXL) != 0); 745 p = deposit32(p, 31, 1, z); 746 747 tcg_out32(s, p); 748 tcg_out8(s, opc); 749} 750 751static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) 752{ 753 if (opc & P_EVEX) { 754 tcg_out_evex_opc(s, opc, r, v, rm, 0, 0, false); 755 } else { 756 tcg_out_vex_opc(s, opc, r, v, rm, 0); 757 } 758 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 759} 760 761static void tcg_out_vex_modrm_type(TCGContext *s, int opc, 762 int r, int v, int rm, TCGType type) 763{ 764 if (type == TCG_TYPE_V256) { 765 opc |= P_VEXL; 766 } 767 tcg_out_vex_modrm(s, opc, r, v, rm); 768} 769 770static void tcg_out_evex_modrm_type(TCGContext *s, int opc, int r, int v, 771 int rm, int aaa, bool z, TCGType type) 772{ 773 if (type == TCG_TYPE_V256) { 774 opc |= P_VEXL; 775 } 776 tcg_out_evex_opc(s, opc, r, v, rm, 0, aaa, z); 777 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 778} 779 780/* Output an opcode with a full "rm + (index<<shift) + offset" address mode. 781 We handle either RM and INDEX missing with a negative value. In 64-bit 782 mode for absolute addresses, ~RM is the size of the immediate operand 783 that will follow the instruction. */ 784 785static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index, 786 int shift, intptr_t offset) 787{ 788 int mod, len; 789 790 if (index < 0 && rm < 0) { 791 if (TCG_TARGET_REG_BITS == 64) { 792 /* Try for a rip-relative addressing mode. This has replaced 793 the 32-bit-mode absolute addressing encoding. */ 794 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm; 795 intptr_t disp = offset - pc; 796 if (disp == (int32_t)disp) { 797 tcg_out8(s, (LOWREGMASK(r) << 3) | 5); 798 tcg_out32(s, disp); 799 return; 800 } 801 802 /* Try for an absolute address encoding. This requires the 803 use of the MODRM+SIB encoding and is therefore larger than 804 rip-relative addressing. */ 805 if (offset == (int32_t)offset) { 806 tcg_out8(s, (LOWREGMASK(r) << 3) | 4); 807 tcg_out8(s, (4 << 3) | 5); 808 tcg_out32(s, offset); 809 return; 810 } 811 812 /* ??? The memory isn't directly addressable. */ 813 g_assert_not_reached(); 814 } else { 815 /* Absolute address. */ 816 tcg_out8(s, (r << 3) | 5); 817 tcg_out32(s, offset); 818 return; 819 } 820 } 821 822 /* Find the length of the immediate addend. Note that the encoding 823 that would be used for (%ebp) indicates absolute addressing. 
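       Hence a zero offset from %ebp (or %r13, which has the same low
       three bits) still gets an explicit 8-bit displacement of zero.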
*/ 824 if (rm < 0) { 825 mod = 0, len = 4, rm = 5; 826 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) { 827 mod = 0, len = 0; 828 } else if (offset == (int8_t)offset) { 829 mod = 0x40, len = 1; 830 } else { 831 mod = 0x80, len = 4; 832 } 833 834 /* Use a single byte MODRM format if possible. Note that the encoding 835 that would be used for %esp is the escape to the two byte form. */ 836 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) { 837 /* Single byte MODRM format. */ 838 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 839 } else { 840 /* Two byte MODRM+SIB format. */ 841 842 /* Note that the encoding that would place %esp into the index 843 field indicates no index register. In 64-bit mode, the REX.X 844 bit counts, so %r12 can be used as the index. */ 845 if (index < 0) { 846 index = 4; 847 } else { 848 tcg_debug_assert(index != TCG_REG_ESP); 849 } 850 851 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4); 852 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm)); 853 } 854 855 if (len == 1) { 856 tcg_out8(s, offset); 857 } else if (len == 4) { 858 tcg_out32(s, offset); 859 } 860} 861 862static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, 863 int index, int shift, intptr_t offset) 864{ 865 tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 866 tcg_out_sib_offset(s, r, rm, index, shift, offset); 867} 868 869static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v, 870 int rm, int index, int shift, 871 intptr_t offset) 872{ 873 tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 874 tcg_out_sib_offset(s, r, rm, index, shift, offset); 875} 876 877/* A simplification of the above with no index or shift. */ 878static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, 879 int rm, intptr_t offset) 880{ 881 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset); 882} 883 884static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r, 885 int v, int rm, intptr_t offset) 886{ 887 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset); 888} 889 890/* Output an opcode with an expected reference to the constant pool. */ 891static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r) 892{ 893 tcg_out_opc(s, opc, r, 0, 0); 894 /* Absolute for 32-bit, pc-relative for 64-bit. */ 895 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 896 tcg_out32(s, 0); 897} 898 899/* Output an opcode with an expected reference to the constant pool. */ 900static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r) 901{ 902 tcg_out_vex_opc(s, opc, r, 0, 0, 0); 903 /* Absolute for 32-bit, pc-relative for 64-bit. */ 904 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 905 tcg_out32(s, 0); 906} 907 908/* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */ 909static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) 910{ 911 /* Propagate an opcode prefix, such as P_REXW. 
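       The low three bits of subop select the ARITH_* operation, which
       is merged into OPC_ARITH_GvEv as bits 3..5 of the opcode byte.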
*/ 912 int ext = subop & ~0x7; 913 subop &= 0x7; 914 915 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src); 916} 917 918static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) 919{ 920 int rexw = 0; 921 922 if (arg == ret) { 923 return true; 924 } 925 switch (type) { 926 case TCG_TYPE_I64: 927 rexw = P_REXW; 928 /* fallthru */ 929 case TCG_TYPE_I32: 930 if (ret < 16) { 931 if (arg < 16) { 932 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg); 933 } else { 934 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret); 935 } 936 } else { 937 if (arg < 16) { 938 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg); 939 } else { 940 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 941 } 942 } 943 break; 944 945 case TCG_TYPE_V64: 946 tcg_debug_assert(ret >= 16 && arg >= 16); 947 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 948 break; 949 case TCG_TYPE_V128: 950 tcg_debug_assert(ret >= 16 && arg >= 16); 951 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg); 952 break; 953 case TCG_TYPE_V256: 954 tcg_debug_assert(ret >= 16 && arg >= 16); 955 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg); 956 break; 957 958 default: 959 g_assert_not_reached(); 960 } 961 return true; 962} 963 964static const int avx2_dup_insn[4] = { 965 OPC_VPBROADCASTB, OPC_VPBROADCASTW, 966 OPC_VPBROADCASTD, OPC_VPBROADCASTQ, 967}; 968 969static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, 970 TCGReg r, TCGReg a) 971{ 972 if (have_avx2) { 973 tcg_out_vex_modrm_type(s, avx2_dup_insn[vece], r, 0, a, type); 974 } else { 975 switch (vece) { 976 case MO_8: 977 /* ??? With zero in a register, use PSHUFB. */ 978 tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a); 979 a = r; 980 /* FALLTHRU */ 981 case MO_16: 982 tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a); 983 a = r; 984 /* FALLTHRU */ 985 case MO_32: 986 tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a); 987 /* imm8 operand: all output lanes selected from input lane 0. */ 988 tcg_out8(s, 0); 989 break; 990 case MO_64: 991 tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a); 992 break; 993 default: 994 g_assert_not_reached(); 995 } 996 } 997 return true; 998} 999 1000static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece, 1001 TCGReg r, TCGReg base, intptr_t offset) 1002{ 1003 if (have_avx2) { 1004 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0); 1005 tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l, 1006 r, 0, base, offset); 1007 } else { 1008 switch (vece) { 1009 case MO_64: 1010 tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset); 1011 break; 1012 case MO_32: 1013 tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset); 1014 break; 1015 case MO_16: 1016 tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset); 1017 tcg_out8(s, 0); /* imm8 */ 1018 tcg_out_dup_vec(s, type, vece, r, r); 1019 break; 1020 case MO_8: 1021 tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset); 1022 tcg_out8(s, 0); /* imm8 */ 1023 tcg_out_dup_vec(s, type, vece, r, r); 1024 break; 1025 default: 1026 g_assert_not_reached(); 1027 } 1028 } 1029 return true; 1030} 1031 1032static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, 1033 TCGReg ret, int64_t arg) 1034{ 1035 int vex_l = (type == TCG_TYPE_V256 ? 
P_VEXL : 0); 1036 1037 if (arg == 0) { 1038 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 1039 return; 1040 } 1041 if (arg == -1) { 1042 tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret); 1043 return; 1044 } 1045 1046 if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) { 1047 if (have_avx2) { 1048 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret); 1049 } else { 1050 tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret); 1051 } 1052 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 1053 } else { 1054 if (type == TCG_TYPE_V64) { 1055 tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret); 1056 } else if (have_avx2) { 1057 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret); 1058 } else { 1059 tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret); 1060 } 1061 if (TCG_TARGET_REG_BITS == 64) { 1062 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 1063 } else { 1064 new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32); 1065 } 1066 } 1067} 1068 1069static void tcg_out_movi_vec(TCGContext *s, TCGType type, 1070 TCGReg ret, tcg_target_long arg) 1071{ 1072 if (arg == 0) { 1073 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 1074 return; 1075 } 1076 if (arg == -1) { 1077 tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret); 1078 return; 1079 } 1080 1081 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 1082 tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret); 1083 if (TCG_TARGET_REG_BITS == 64) { 1084 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 1085 } else { 1086 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 1087 } 1088} 1089 1090static void tcg_out_movi_int(TCGContext *s, TCGType type, 1091 TCGReg ret, tcg_target_long arg) 1092{ 1093 tcg_target_long diff; 1094 1095 if (arg == 0) { 1096 tgen_arithr(s, ARITH_XOR, ret, ret); 1097 return; 1098 } 1099 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) { 1100 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0); 1101 tcg_out32(s, arg); 1102 return; 1103 } 1104 if (arg == (int32_t)arg) { 1105 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret); 1106 tcg_out32(s, arg); 1107 return; 1108 } 1109 1110 /* Try a 7 byte pc-relative lea before the 10 byte movq. */ 1111 diff = tcg_pcrel_diff(s, (const void *)arg) - 7; 1112 if (diff == (int32_t)diff) { 1113 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0); 1114 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5); 1115 tcg_out32(s, diff); 1116 return; 1117 } 1118 1119 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0); 1120 tcg_out64(s, arg); 1121} 1122 1123static void tcg_out_movi(TCGContext *s, TCGType type, 1124 TCGReg ret, tcg_target_long arg) 1125{ 1126 switch (type) { 1127 case TCG_TYPE_I32: 1128#if TCG_TARGET_REG_BITS == 64 1129 case TCG_TYPE_I64: 1130#endif 1131 if (ret < 16) { 1132 tcg_out_movi_int(s, type, ret, arg); 1133 } else { 1134 tcg_out_movi_vec(s, type, ret, arg); 1135 } 1136 break; 1137 default: 1138 g_assert_not_reached(); 1139 } 1140} 1141 1142static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2) 1143{ 1144 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1145 tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2); 1146 return true; 1147} 1148 1149static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs, 1150 tcg_target_long imm) 1151{ 1152 /* This function is only used for passing structs by reference. 
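       The addend is materialized with LEA, whose displacement is limited
       to 32 bits, hence the range assert below.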
     */
    tcg_debug_assert(imm == (int32_t)imm);
    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        g_assert_not_reached();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we only need to care
       about store-load ordering.  Experimentally, "lock orl $0,0(%esp)"
       is faster than "mfence", so don't bother with the sse insn. */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}

static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
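         * (An aligned MOVDQA access faults on a misaligned address,
         * whereas MOVDQU would silently accept it.)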
1264 * 1265 * This specific instance is also used by TCG_CALL_RET_BY_VEC, 1266 * for _WIN64, which must have SSE2 but may not have AVX. 1267 */ 1268 tcg_debug_assert(arg >= 16); 1269 if (have_avx1) { 1270 tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2); 1271 } else { 1272 tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2); 1273 } 1274 break; 1275 case TCG_TYPE_V256: 1276 /* 1277 * The gvec infrastructure only requires 16-byte alignment, 1278 * so here we must use an unaligned store. 1279 */ 1280 tcg_debug_assert(arg >= 16); 1281 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL, 1282 arg, 0, arg1, arg2); 1283 break; 1284 default: 1285 g_assert_not_reached(); 1286 } 1287} 1288 1289static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, 1290 TCGReg base, intptr_t ofs) 1291{ 1292 int rexw = 0; 1293 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) { 1294 if (val != (int32_t)val) { 1295 return false; 1296 } 1297 rexw = P_REXW; 1298 } else if (type != TCG_TYPE_I32) { 1299 return false; 1300 } 1301 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs); 1302 tcg_out32(s, val); 1303 return true; 1304} 1305 1306static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count) 1307{ 1308 /* Propagate an opcode prefix, such as P_DATA16. */ 1309 int ext = subopc & ~0x7; 1310 subopc &= 0x7; 1311 1312 if (count == 1) { 1313 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg); 1314 } else { 1315 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg); 1316 tcg_out8(s, count); 1317 } 1318} 1319 1320static inline void tcg_out_bswap32(TCGContext *s, int reg) 1321{ 1322 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0); 1323} 1324 1325static inline void tcg_out_rolw_8(TCGContext *s, int reg) 1326{ 1327 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8); 1328} 1329 1330static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src) 1331{ 1332 if (TCG_TARGET_REG_BITS == 32 && src >= 4) { 1333 tcg_out_mov(s, TCG_TYPE_I32, dest, src); 1334 if (dest >= 4) { 1335 tcg_out_modrm(s, OPC_ARITH_EvIz, ARITH_AND, dest); 1336 tcg_out32(s, 0xff); 1337 return; 1338 } 1339 src = dest; 1340 } 1341 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src); 1342} 1343 1344static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src) 1345{ 1346 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1347 1348 if (TCG_TARGET_REG_BITS == 32 && src >= 4) { 1349 tcg_out_mov(s, TCG_TYPE_I32, dest, src); 1350 if (dest >= 4) { 1351 tcg_out_shifti(s, SHIFT_SHL, dest, 24); 1352 tcg_out_shifti(s, SHIFT_SAR, dest, 24); 1353 return; 1354 } 1355 src = dest; 1356 } 1357 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src); 1358} 1359 1360static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src) 1361{ 1362 /* movzwl */ 1363 tcg_out_modrm(s, OPC_MOVZWL, dest, src); 1364} 1365 1366static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src) 1367{ 1368 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1369 /* movsw[lq] */ 1370 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src); 1371} 1372 1373static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src) 1374{ 1375 /* 32-bit mov zero extends. 
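       Writing a 32-bit register on x86-64 clears bits 63:32 of the
       destination, so a plain movl is sufficient here.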
*/ 1376 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src); 1377} 1378 1379static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src) 1380{ 1381 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 1382 tcg_out_modrm(s, OPC_MOVSLQ, dest, src); 1383} 1384 1385static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1386{ 1387 tcg_out_ext32s(s, dest, src); 1388} 1389 1390static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1391{ 1392 if (dest != src) { 1393 tcg_out_ext32u(s, dest, src); 1394 } 1395} 1396 1397static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src) 1398{ 1399 tcg_out_ext32u(s, dest, src); 1400} 1401 1402static inline void tcg_out_bswap64(TCGContext *s, int reg) 1403{ 1404 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0); 1405} 1406 1407static void tgen_arithi(TCGContext *s, int c, int r0, 1408 tcg_target_long val, int cf) 1409{ 1410 int rexw = 0; 1411 1412 if (TCG_TARGET_REG_BITS == 64) { 1413 rexw = c & -8; 1414 c &= 7; 1415 } 1416 1417 switch (c) { 1418 case ARITH_ADD: 1419 case ARITH_SUB: 1420 if (!cf) { 1421 /* 1422 * ??? While INC is 2 bytes shorter than ADDL $1, they also induce 1423 * partial flags update stalls on Pentium4 and are not recommended 1424 * by current Intel optimization manuals. 1425 */ 1426 if (val == 1 || val == -1) { 1427 int is_inc = (c == ARITH_ADD) ^ (val < 0); 1428 if (TCG_TARGET_REG_BITS == 64) { 1429 /* 1430 * The single-byte increment encodings are re-tasked 1431 * as the REX prefixes. Use the MODRM encoding. 1432 */ 1433 tcg_out_modrm(s, OPC_GRP5 + rexw, 1434 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0); 1435 } else { 1436 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0); 1437 } 1438 return; 1439 } 1440 if (val == 128) { 1441 /* 1442 * Facilitate using an 8-bit immediate. Carry is inverted 1443 * by this transformation, so do it only if cf == 0. 1444 */ 1445 c ^= ARITH_ADD ^ ARITH_SUB; 1446 val = -128; 1447 } 1448 } 1449 break; 1450 1451 case ARITH_AND: 1452 if (TCG_TARGET_REG_BITS == 64) { 1453 if (val == 0xffffffffu) { 1454 tcg_out_ext32u(s, r0, r0); 1455 return; 1456 } 1457 if (val == (uint32_t)val) { 1458 /* AND with no high bits set can use a 32-bit operation. */ 1459 rexw = 0; 1460 } 1461 } 1462 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1463 tcg_out_ext8u(s, r0, r0); 1464 return; 1465 } 1466 if (val == 0xffffu) { 1467 tcg_out_ext16u(s, r0, r0); 1468 return; 1469 } 1470 break; 1471 1472 case ARITH_OR: 1473 case ARITH_XOR: 1474 if (val >= 0x80 && val <= 0xff 1475 && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1476 tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0); 1477 tcg_out8(s, val); 1478 return; 1479 } 1480 break; 1481 } 1482 1483 if (val == (int8_t)val) { 1484 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0); 1485 tcg_out8(s, val); 1486 return; 1487 } 1488 if (rexw == 0 || val == (int32_t)val) { 1489 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0); 1490 tcg_out32(s, val); 1491 return; 1492 } 1493 1494 g_assert_not_reached(); 1495} 1496 1497static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val) 1498{ 1499 if (val != 0) { 1500 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0); 1501 } 1502} 1503 1504/* Set SMALL to force a short forward branch. 
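   This selects the rel8 forms of JMP/Jcc, so the branch target must end
   up within the signed 8-bit displacement range.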
*/ 1505static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small) 1506{ 1507 int32_t val, val1; 1508 1509 if (l->has_value) { 1510 val = tcg_pcrel_diff(s, l->u.value_ptr); 1511 val1 = val - 2; 1512 if ((int8_t)val1 == val1) { 1513 if (opc == -1) { 1514 tcg_out8(s, OPC_JMP_short); 1515 } else { 1516 tcg_out8(s, OPC_JCC_short + opc); 1517 } 1518 tcg_out8(s, val1); 1519 } else { 1520 tcg_debug_assert(!small); 1521 if (opc == -1) { 1522 tcg_out8(s, OPC_JMP_long); 1523 tcg_out32(s, val - 5); 1524 } else { 1525 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1526 tcg_out32(s, val - 6); 1527 } 1528 } 1529 } else if (small) { 1530 if (opc == -1) { 1531 tcg_out8(s, OPC_JMP_short); 1532 } else { 1533 tcg_out8(s, OPC_JCC_short + opc); 1534 } 1535 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1); 1536 s->code_ptr += 1; 1537 } else { 1538 if (opc == -1) { 1539 tcg_out8(s, OPC_JMP_long); 1540 } else { 1541 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1542 } 1543 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4); 1544 s->code_ptr += 4; 1545 } 1546} 1547 1548static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1, 1549 TCGArg arg2, int const_arg2, int rexw) 1550{ 1551 int jz, js; 1552 1553 if (!is_tst_cond(cond)) { 1554 if (!const_arg2) { 1555 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2); 1556 } else if (arg2 == 0) { 1557 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1); 1558 } else { 1559 tcg_debug_assert(!rexw || arg2 == (int32_t)arg2); 1560 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0); 1561 } 1562 return tcg_cond_to_jcc[cond]; 1563 } 1564 1565 jz = tcg_cond_to_jcc[cond]; 1566 js = (cond == TCG_COND_TSTNE ? JCC_JS : JCC_JNS); 1567 1568 if (!const_arg2) { 1569 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2); 1570 return jz; 1571 } 1572 1573 if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) { 1574 if (arg2 == 0x80) { 1575 tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1); 1576 return js; 1577 } 1578 if (arg2 == 0xff) { 1579 tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1); 1580 return jz; 1581 } 1582 tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1); 1583 tcg_out8(s, arg2); 1584 return jz; 1585 } 1586 1587 if ((arg2 & ~0xff00) == 0 && arg1 < 4) { 1588 if (arg2 == 0x8000) { 1589 tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4); 1590 return js; 1591 } 1592 if (arg2 == 0xff00) { 1593 tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4); 1594 return jz; 1595 } 1596 tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4); 1597 tcg_out8(s, arg2 >> 8); 1598 return jz; 1599 } 1600 1601 if (arg2 == 0xffff) { 1602 tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1); 1603 return jz; 1604 } 1605 if (arg2 == 0xffffffffu) { 1606 tcg_out_modrm(s, OPC_TESTL, arg1, arg1); 1607 return jz; 1608 } 1609 1610 if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) { 1611 int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE); 1612 int sh = ctz64(arg2); 1613 1614 rexw = (sh & 32 ? 
P_REXW : 0); 1615 if ((sh & 31) == 31) { 1616 tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1); 1617 return js; 1618 } else { 1619 tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1); 1620 tcg_out8(s, sh); 1621 return jc; 1622 } 1623 } 1624 1625 if (rexw) { 1626 if (arg2 == (uint32_t)arg2) { 1627 rexw = 0; 1628 } else { 1629 tcg_debug_assert(arg2 == (int32_t)arg2); 1630 } 1631 } 1632 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1); 1633 tcg_out32(s, arg2); 1634 return jz; 1635} 1636 1637static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond, 1638 TCGArg arg1, TCGArg arg2, int const_arg2, 1639 TCGLabel *label, bool small) 1640{ 1641 int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw); 1642 tcg_out_jxx(s, jcc, label, small); 1643} 1644 1645#if TCG_TARGET_REG_BITS == 32 1646static void tcg_out_brcond2(TCGContext *s, const TCGArg *args, 1647 const int *const_args, bool small) 1648{ 1649 TCGLabel *label_next = gen_new_label(); 1650 TCGLabel *label_this = arg_label(args[5]); 1651 TCGCond cond = args[4]; 1652 1653 switch (cond) { 1654 case TCG_COND_EQ: 1655 case TCG_COND_TSTEQ: 1656 tcg_out_brcond(s, 0, tcg_invert_cond(cond), 1657 args[0], args[2], const_args[2], label_next, 1); 1658 tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3], 1659 label_this, small); 1660 break; 1661 case TCG_COND_NE: 1662 case TCG_COND_TSTNE: 1663 tcg_out_brcond(s, 0, cond, args[0], args[2], const_args[2], 1664 label_this, small); 1665 tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3], 1666 label_this, small); 1667 break; 1668 case TCG_COND_LT: 1669 tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3], 1670 label_this, small); 1671 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1672 tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2], 1673 label_this, small); 1674 break; 1675 case TCG_COND_LE: 1676 tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3], 1677 label_this, small); 1678 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1679 tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2], 1680 label_this, small); 1681 break; 1682 case TCG_COND_GT: 1683 tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3], 1684 label_this, small); 1685 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1686 tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2], 1687 label_this, small); 1688 break; 1689 case TCG_COND_GE: 1690 tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3], 1691 label_this, small); 1692 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1693 tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2], 1694 label_this, small); 1695 break; 1696 case TCG_COND_LTU: 1697 tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3], 1698 label_this, small); 1699 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1700 tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2], 1701 label_this, small); 1702 break; 1703 case TCG_COND_LEU: 1704 tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3], 1705 label_this, small); 1706 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1707 tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2], 1708 label_this, small); 1709 break; 1710 case TCG_COND_GTU: 1711 tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3], 1712 label_this, small); 1713 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1714 tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2], 1715 label_this, small); 1716 break; 1717 case TCG_COND_GEU: 1718 tcg_out_brcond(s, 0, 
TCG_COND_GTU, args[1], args[3], const_args[3], 1719 label_this, small); 1720 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1721 tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2], 1722 label_this, small); 1723 break; 1724 default: 1725 g_assert_not_reached(); 1726 } 1727 tcg_out_label(s, label_next); 1728} 1729#endif 1730 1731static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond, 1732 TCGArg dest, TCGArg arg1, TCGArg arg2, 1733 int const_arg2, bool neg) 1734{ 1735 int cmp_rexw = rexw; 1736 bool inv = false; 1737 bool cleared; 1738 int jcc; 1739 1740 switch (cond) { 1741 case TCG_COND_NE: 1742 inv = true; 1743 /* fall through */ 1744 case TCG_COND_EQ: 1745 /* If arg2 is 0, convert to LTU/GEU vs 1. */ 1746 if (const_arg2 && arg2 == 0) { 1747 arg2 = 1; 1748 goto do_ltu; 1749 } 1750 break; 1751 1752 case TCG_COND_TSTNE: 1753 inv = true; 1754 /* fall through */ 1755 case TCG_COND_TSTEQ: 1756 /* If arg2 is -1, convert to LTU/GEU vs 1. */ 1757 if (const_arg2 && arg2 == 0xffffffffu) { 1758 arg2 = 1; 1759 cmp_rexw = 0; 1760 goto do_ltu; 1761 } 1762 break; 1763 1764 case TCG_COND_LEU: 1765 inv = true; 1766 /* fall through */ 1767 case TCG_COND_GTU: 1768 /* If arg2 is a register, swap for LTU/GEU. */ 1769 if (!const_arg2) { 1770 TCGReg t = arg1; 1771 arg1 = arg2; 1772 arg2 = t; 1773 goto do_ltu; 1774 } 1775 break; 1776 1777 case TCG_COND_GEU: 1778 inv = true; 1779 /* fall through */ 1780 case TCG_COND_LTU: 1781 do_ltu: 1782 /* 1783 * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU. 1784 * We can then use NEG or INC to produce the desired result. 1785 * This is always smaller than the SETCC expansion. 1786 */ 1787 tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw); 1788 1789 /* X - X - C = -C = (C ? -1 : 0) */ 1790 tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest); 1791 if (inv && neg) { 1792 /* ~(C ? -1 : 0) = (C ? 0 : -1) */ 1793 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest); 1794 } else if (inv) { 1795 /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */ 1796 tgen_arithi(s, ARITH_ADD, dest, 1, 0); 1797 } else if (!neg) { 1798 /* -(C ? -1 : 0) = (C ? 1 : 0) */ 1799 tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest); 1800 } 1801 return; 1802 1803 case TCG_COND_GE: 1804 inv = true; 1805 /* fall through */ 1806 case TCG_COND_LT: 1807 /* If arg2 is 0, extract the sign bit. */ 1808 if (const_arg2 && arg2 == 0) { 1809 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1); 1810 if (inv) { 1811 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest); 1812 } 1813 tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw, 1814 dest, rexw ? 63 : 31); 1815 return; 1816 } 1817 break; 1818 1819 default: 1820 break; 1821 } 1822 1823 /* 1824 * If dest does not overlap the inputs, clearing it first is preferred. 1825 * The XOR breaks any false dependency for the low-byte write to dest, 1826 * and is also one byte smaller than MOVZBL. 
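     * (2-byte "xor %reg,%reg" versus the 3-byte "movzbl" encoding.)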
1827 */ 1828 cleared = false; 1829 if (dest != arg1 && (const_arg2 || dest != arg2)) { 1830 tgen_arithr(s, ARITH_XOR, dest, dest); 1831 cleared = true; 1832 } 1833 1834 jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw); 1835 tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest); 1836 1837 if (!cleared) { 1838 tcg_out_ext8u(s, dest, dest); 1839 } 1840 if (neg) { 1841 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest); 1842 } 1843} 1844 1845#if TCG_TARGET_REG_BITS == 32 1846static void tcg_out_setcond2(TCGContext *s, const TCGArg *args, 1847 const int *const_args) 1848{ 1849 TCGArg new_args[6]; 1850 TCGLabel *label_true, *label_over; 1851 1852 memcpy(new_args, args+1, 5*sizeof(TCGArg)); 1853 1854 if (args[0] == args[1] || args[0] == args[2] 1855 || (!const_args[3] && args[0] == args[3]) 1856 || (!const_args[4] && args[0] == args[4])) { 1857 /* When the destination overlaps with one of the argument 1858 registers, don't do anything tricky. */ 1859 label_true = gen_new_label(); 1860 label_over = gen_new_label(); 1861 1862 new_args[5] = label_arg(label_true); 1863 tcg_out_brcond2(s, new_args, const_args+1, 1); 1864 1865 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1866 tcg_out_jxx(s, JCC_JMP, label_over, 1); 1867 tcg_out_label(s, label_true); 1868 1869 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1); 1870 tcg_out_label(s, label_over); 1871 } else { 1872 /* When the destination does not overlap one of the arguments, 1873 clear the destination first, jump if cond false, and emit an 1874 increment in the true case. This results in smaller code. */ 1875 1876 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1877 1878 label_over = gen_new_label(); 1879 new_args[4] = tcg_invert_cond(new_args[4]); 1880 new_args[5] = label_arg(label_over); 1881 tcg_out_brcond2(s, new_args, const_args+1, 1); 1882 1883 tgen_arithi(s, ARITH_ADD, args[0], 1, 0); 1884 tcg_out_label(s, label_over); 1885 } 1886} 1887#endif 1888 1889static void tcg_out_cmov(TCGContext *s, int jcc, int rexw, 1890 TCGReg dest, TCGReg v1) 1891{ 1892 tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1); 1893} 1894 1895static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond, 1896 TCGReg dest, TCGReg c1, TCGArg c2, int const_c2, 1897 TCGReg v1) 1898{ 1899 int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw); 1900 tcg_out_cmov(s, jcc, rexw, dest, v1); 1901} 1902 1903static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1, 1904 TCGArg arg2, bool const_a2) 1905{ 1906 if (have_bmi1) { 1907 tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1); 1908 if (const_a2) { 1909 tcg_debug_assert(arg2 == (rexw ? 64 : 32)); 1910 } else { 1911 tcg_debug_assert(dest != arg2); 1912 tcg_out_cmov(s, JCC_JB, rexw, dest, arg2); 1913 } 1914 } else { 1915 tcg_debug_assert(dest != arg2); 1916 tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1); 1917 tcg_out_cmov(s, JCC_JE, rexw, dest, arg2); 1918 } 1919} 1920 1921static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1, 1922 TCGArg arg2, bool const_a2) 1923{ 1924 if (have_lzcnt) { 1925 tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1); 1926 if (const_a2) { 1927 tcg_debug_assert(arg2 == (rexw ? 64 : 32)); 1928 } else { 1929 tcg_debug_assert(dest != arg2); 1930 tcg_out_cmov(s, JCC_JB, rexw, dest, arg2); 1931 } 1932 } else { 1933 tcg_debug_assert(!const_a2); 1934 tcg_debug_assert(dest != arg1); 1935 tcg_debug_assert(dest != arg2); 1936 1937 /* Recall that the output of BSR is the index not the count. */ 1938 tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1); 1939 tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 
63 : 31, 0); 1940 1941 /* Since we have destroyed the flags from BSR, we have to re-test. */ 1942 int jcc = tcg_out_cmp(s, TCG_COND_EQ, arg1, 0, 1, rexw); 1943 tcg_out_cmov(s, jcc, rexw, dest, arg2); 1944 } 1945} 1946 1947static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest) 1948{ 1949 intptr_t disp = tcg_pcrel_diff(s, dest) - 5; 1950 1951 if (disp == (int32_t)disp) { 1952 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0); 1953 tcg_out32(s, disp); 1954 } else { 1955 /* rip-relative addressing into the constant pool. 1956 This is 6 + 8 = 14 bytes, as compared to using an 1957 immediate load 10 + 6 = 16 bytes, plus we may 1958 be able to re-use the pool constant for more calls. */ 1959 tcg_out_opc(s, OPC_GRP5, 0, 0, 0); 1960 tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5); 1961 new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4); 1962 tcg_out32(s, 0); 1963 } 1964} 1965 1966static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest, 1967 const TCGHelperInfo *info) 1968{ 1969 tcg_out_branch(s, 1, dest); 1970 1971#ifndef _WIN32 1972 if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) { 1973 /* 1974 * The sysv i386 abi for struct return places a reference as the 1975 * first argument of the stack, and pops that argument with the 1976 * return statement. Since we want to retain the aligned stack 1977 * pointer for the callee, we do not want to actually push that 1978 * argument before the call but rely on the normal store to the 1979 * stack slot. But we do need to compensate for the pop in order 1980 * to reset our correct stack pointer value. 1981 * Pushing a garbage value back onto the stack is quickest. 1982 */ 1983 tcg_out_push(s, TCG_REG_EAX); 1984 } 1985#endif 1986} 1987 1988static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest) 1989{ 1990 tcg_out_branch(s, 0, dest); 1991} 1992 1993static void tcg_out_nopn(TCGContext *s, int n) 1994{ 1995 int i; 1996 /* Emit 1 or 2 operand size prefixes for the standard one byte nop, 1997 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the 1998 * duplicate prefix, and all of the interesting recent cores can 1999 * decode and discard the duplicates in a single cycle. 2000 */ 2001 tcg_debug_assert(n >= 1); 2002 for (i = 1; i < n; ++i) { 2003 tcg_out8(s, 0x66); 2004 } 2005 tcg_out8(s, 0x90); 2006} 2007 2008typedef struct { 2009 TCGReg base; 2010 int index; 2011 int ofs; 2012 int seg; 2013 TCGAtomAlign aa; 2014} HostAddress; 2015 2016bool tcg_target_has_memory_bswap(MemOp memop) 2017{ 2018 TCGAtomAlign aa; 2019 2020 if (!have_movbe) { 2021 return false; 2022 } 2023 if ((memop & MO_SIZE) < MO_128) { 2024 return true; 2025 } 2026 2027 /* 2028 * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA, 2029 * but do allow a pair of 64-bit operations, i.e. MOVBEQ. 2030 */ 2031 aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true); 2032 return aa.atom < MO_128; 2033} 2034 2035/* 2036 * Because i686 has no register parameters and because x86_64 has xchg 2037 * to handle addr/data register overlap, we have placed all input arguments 2038 * before we need might need a scratch reg. 2039 * 2040 * Even then, a scratch is only needed for l->raddr. Rather than expose 2041 * a general-purpose scratch when we don't actually know it's available, 2042 * use the ra_gen hook to load into RAX if needed. 
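 *
 * A minimal sketch of the resulting 64-bit slow path (the exact code
 * depends on tcg_out_ld_helper_args): the guest address and MemOpIdx
 * are marshalled into the argument registers, the return address is
 * materialized by ldst_ra_gen below as roughly
 *     movq  $l->raddr, %rax
 * and after the helper call the generated code jumps back to l->raddr.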
2043 */ 2044#if TCG_TARGET_REG_BITS == 64 2045static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg) 2046{ 2047 if (arg < 0) { 2048 arg = TCG_REG_RAX; 2049 } 2050 tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr); 2051 return arg; 2052} 2053static const TCGLdstHelperParam ldst_helper_param = { 2054 .ra_gen = ldst_ra_gen 2055}; 2056#else 2057static const TCGLdstHelperParam ldst_helper_param = { }; 2058#endif 2059 2060static void tcg_out_vec_to_pair(TCGContext *s, TCGType type, 2061 TCGReg l, TCGReg h, TCGReg v) 2062{ 2063 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2064 2065 /* vpmov{d,q} %v, %l */ 2066 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l); 2067 /* vpextr{d,q} $1, %v, %h */ 2068 tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h); 2069 tcg_out8(s, 1); 2070} 2071 2072static void tcg_out_pair_to_vec(TCGContext *s, TCGType type, 2073 TCGReg v, TCGReg l, TCGReg h) 2074{ 2075 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2076 2077 /* vmov{d,q} %l, %v */ 2078 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l); 2079 /* vpinsr{d,q} $1, %h, %v, %v */ 2080 tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h); 2081 tcg_out8(s, 1); 2082} 2083 2084/* 2085 * Generate code for the slow path for a load at the end of block 2086 */ 2087static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2088{ 2089 MemOp opc = get_memop(l->oi); 2090 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2091 2092 /* resolve label address */ 2093 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2094 if (label_ptr[1]) { 2095 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2096 } 2097 2098 tcg_out_ld_helper_args(s, l, &ldst_helper_param); 2099 tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]); 2100 tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param); 2101 2102 tcg_out_jmp(s, l->raddr); 2103 return true; 2104} 2105 2106/* 2107 * Generate code for the slow path for a store at the end of block 2108 */ 2109static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2110{ 2111 MemOp opc = get_memop(l->oi); 2112 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2113 2114 /* resolve label address */ 2115 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2116 if (label_ptr[1]) { 2117 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2118 } 2119 2120 tcg_out_st_helper_args(s, l, &ldst_helper_param); 2121 tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]); 2122 2123 tcg_out_jmp(s, l->raddr); 2124 return true; 2125} 2126 2127#ifdef CONFIG_USER_ONLY 2128static HostAddress x86_guest_base = { 2129 .index = -1 2130}; 2131 2132#if defined(__x86_64__) && defined(__linux__) 2133# include <asm/prctl.h> 2134# include <sys/prctl.h> 2135int arch_prctl(int code, unsigned long addr); 2136static inline int setup_guest_base_seg(void) 2137{ 2138 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) { 2139 return P_GS; 2140 } 2141 return 0; 2142} 2143#define setup_guest_base_seg setup_guest_base_seg 2144#elif defined(__x86_64__) && \ 2145 (defined (__FreeBSD__) || defined (__FreeBSD_kernel__)) 2146# include <machine/sysarch.h> 2147static inline int setup_guest_base_seg(void) 2148{ 2149 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) { 2150 return P_GS; 2151 } 2152 return 0; 2153} 2154#define setup_guest_base_seg setup_guest_base_seg 2155#endif 2156#else 2157# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; })) 2158#endif /* CONFIG_USER_ONLY */ 2159#ifndef setup_guest_base_seg 2160# define setup_guest_base_seg() 0 2161#endif 2162 
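/*
 * Illustrative only: when one of the setup_guest_base_seg() variants
 * above succeeds, the prologue is expected to record the returned P_GS
 * in x86_guest_base.seg, so that prepare_host_addr() below produces,
 * in effect, { .base = addr, .index = -1, .ofs = 0, .seg = P_GS } and
 * a user-only 4-byte guest load from the address in %rdi comes out as
 * roughly
 *     movl  %gs:(%rdi), %eax
 * i.e. guest_base is folded into the segment base rather than costing
 * an extra register or displacement.
 */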
2163#define MIN_TLB_MASK_TABLE_OFS INT_MIN 2164 2165/* 2166 * For softmmu, perform the TLB load and compare. 2167 * For useronly, perform any required alignment tests. 2168 * In both cases, return a TCGLabelQemuLdst structure if the slow path 2169 * is required and fill in @h with the host address for the fast path. 2170 */ 2171static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, 2172 TCGReg addrlo, TCGReg addrhi, 2173 MemOpIdx oi, bool is_ld) 2174{ 2175 TCGLabelQemuLdst *ldst = NULL; 2176 MemOp opc = get_memop(oi); 2177 MemOp s_bits = opc & MO_SIZE; 2178 unsigned a_mask; 2179 2180 if (tcg_use_softmmu) { 2181 h->index = TCG_REG_L0; 2182 h->ofs = 0; 2183 h->seg = 0; 2184 } else { 2185 *h = x86_guest_base; 2186 } 2187 h->base = addrlo; 2188 h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128); 2189 a_mask = (1 << h->aa.align) - 1; 2190 2191 if (tcg_use_softmmu) { 2192 int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read) 2193 : offsetof(CPUTLBEntry, addr_write); 2194 TCGType ttype = TCG_TYPE_I32; 2195 TCGType tlbtype = TCG_TYPE_I32; 2196 int trexw = 0, hrexw = 0, tlbrexw = 0; 2197 unsigned mem_index = get_mmuidx(oi); 2198 unsigned s_mask = (1 << s_bits) - 1; 2199 int fast_ofs = tlb_mask_table_ofs(s, mem_index); 2200 int tlb_mask; 2201 2202 ldst = new_ldst_label(s); 2203 ldst->is_ld = is_ld; 2204 ldst->oi = oi; 2205 ldst->addrlo_reg = addrlo; 2206 ldst->addrhi_reg = addrhi; 2207 2208 if (TCG_TARGET_REG_BITS == 64) { 2209 ttype = s->addr_type; 2210 trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW); 2211 if (TCG_TYPE_PTR == TCG_TYPE_I64) { 2212 hrexw = P_REXW; 2213 if (s->page_bits + s->tlb_dyn_max_bits > 32) { 2214 tlbtype = TCG_TYPE_I64; 2215 tlbrexw = P_REXW; 2216 } 2217 } 2218 } 2219 2220 tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo); 2221 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0, 2222 s->page_bits - CPU_TLB_ENTRY_BITS); 2223 2224 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0, 2225 fast_ofs + offsetof(CPUTLBDescFast, mask)); 2226 2227 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0, 2228 fast_ofs + offsetof(CPUTLBDescFast, table)); 2229 2230 /* 2231 * If the required alignment is at least as large as the access, 2232 * simply copy the address and mask. For lesser alignments, 2233 * check that we don't cross pages for the complete access. 2234 */ 2235 if (a_mask >= s_mask) { 2236 tcg_out_mov(s, ttype, TCG_REG_L1, addrlo); 2237 } else { 2238 tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1, 2239 addrlo, s_mask - a_mask); 2240 } 2241 tlb_mask = s->page_mask | a_mask; 2242 tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0); 2243 2244 /* cmp 0(TCG_REG_L0), TCG_REG_L1 */ 2245 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, 2246 TCG_REG_L1, TCG_REG_L0, cmp_ofs); 2247 2248 /* jne slow_path */ 2249 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2250 ldst->label_ptr[0] = s->code_ptr; 2251 s->code_ptr += 4; 2252 2253 if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) { 2254 /* cmp 4(TCG_REG_L0), addrhi */ 2255 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, 2256 TCG_REG_L0, cmp_ofs + 4); 2257 2258 /* jne slow_path */ 2259 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2260 ldst->label_ptr[1] = s->code_ptr; 2261 s->code_ptr += 4; 2262 } 2263 2264 /* TLB Hit. 
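         *
         * Schematically (x86_64 SysV, where TCG_REG_L0 = %rdi and
         * TCG_REG_L1 = %rsi; %addr and %r_env stand for the guest
         * address register and TCG_AREG0), the fast path emitted above
         * is roughly:
         *     movq  %addr, %rdi
         *     shrq  $(page_bits - CPU_TLB_ENTRY_BITS), %rdi
         *     andq  fast_ofs+mask(%r_env), %rdi
         *     addq  fast_ofs+table(%r_env), %rdi
         *     leaq  s_mask-a_mask(%addr), %rsi   # movq if a_mask >= s_mask
         *     andq  $(page_mask | a_mask), %rsi
         *     cmpq  cmp_ofs(%rdi), %rsi
         *     jne   slow_path
         * after which the TLB addend is loaded just below into
         * TCG_REG_L0 and serves as the index register for the access.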
*/ 2265 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0, 2266 offsetof(CPUTLBEntry, addend)); 2267 } else if (a_mask) { 2268 int jcc; 2269 2270 ldst = new_ldst_label(s); 2271 ldst->is_ld = is_ld; 2272 ldst->oi = oi; 2273 ldst->addrlo_reg = addrlo; 2274 ldst->addrhi_reg = addrhi; 2275 2276 /* jne slow_path */ 2277 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addrlo, a_mask, true, false); 2278 tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0); 2279 ldst->label_ptr[0] = s->code_ptr; 2280 s->code_ptr += 4; 2281 } 2282 2283 return ldst; 2284} 2285 2286static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2287 HostAddress h, TCGType type, MemOp memop) 2288{ 2289 bool use_movbe = false; 2290 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 2291 int movop = OPC_MOVL_GvEv; 2292 2293 /* Do big-endian loads with movbe. */ 2294 if (memop & MO_BSWAP) { 2295 tcg_debug_assert(have_movbe); 2296 use_movbe = true; 2297 movop = OPC_MOVBE_GyMy; 2298 } 2299 2300 switch (memop & MO_SSIZE) { 2301 case MO_UB: 2302 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo, 2303 h.base, h.index, 0, h.ofs); 2304 break; 2305 case MO_SB: 2306 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo, 2307 h.base, h.index, 0, h.ofs); 2308 break; 2309 case MO_UW: 2310 if (use_movbe) { 2311 /* There is no extending movbe; only low 16-bits are modified. */ 2312 if (datalo != h.base && datalo != h.index) { 2313 /* XOR breaks dependency chains. */ 2314 tgen_arithr(s, ARITH_XOR, datalo, datalo); 2315 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2316 datalo, h.base, h.index, 0, h.ofs); 2317 } else { 2318 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2319 datalo, h.base, h.index, 0, h.ofs); 2320 tcg_out_ext16u(s, datalo, datalo); 2321 } 2322 } else { 2323 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo, 2324 h.base, h.index, 0, h.ofs); 2325 } 2326 break; 2327 case MO_SW: 2328 if (use_movbe) { 2329 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2330 datalo, h.base, h.index, 0, h.ofs); 2331 tcg_out_ext16s(s, type, datalo, datalo); 2332 } else { 2333 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg, 2334 datalo, h.base, h.index, 0, h.ofs); 2335 } 2336 break; 2337 case MO_UL: 2338 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2339 h.base, h.index, 0, h.ofs); 2340 break; 2341#if TCG_TARGET_REG_BITS == 64 2342 case MO_SL: 2343 if (use_movbe) { 2344 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo, 2345 h.base, h.index, 0, h.ofs); 2346 tcg_out_ext32s(s, datalo, datalo); 2347 } else { 2348 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo, 2349 h.base, h.index, 0, h.ofs); 2350 } 2351 break; 2352#endif 2353 case MO_UQ: 2354 if (TCG_TARGET_REG_BITS == 64) { 2355 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2356 h.base, h.index, 0, h.ofs); 2357 break; 2358 } 2359 if (use_movbe) { 2360 TCGReg t = datalo; 2361 datalo = datahi; 2362 datahi = t; 2363 } 2364 if (h.base == datalo || h.index == datalo) { 2365 tcg_out_modrm_sib_offset(s, OPC_LEA, datahi, 2366 h.base, h.index, 0, h.ofs); 2367 tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0); 2368 tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4); 2369 } else { 2370 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2371 h.base, h.index, 0, h.ofs); 2372 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2373 h.base, h.index, 0, h.ofs + 4); 2374 } 2375 break; 2376 2377 case MO_128: 2378 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2379 2380 /* 2381 * Without 16-byte 
atomicity, use integer regs. 2382 * That is where we want the data, and it allows bswaps. 2383 */ 2384 if (h.aa.atom < MO_128) { 2385 if (use_movbe) { 2386 TCGReg t = datalo; 2387 datalo = datahi; 2388 datahi = t; 2389 } 2390 if (h.base == datalo || h.index == datalo) { 2391 tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi, 2392 h.base, h.index, 0, h.ofs); 2393 tcg_out_modrm_offset(s, movop + P_REXW + h.seg, 2394 datalo, datahi, 0); 2395 tcg_out_modrm_offset(s, movop + P_REXW + h.seg, 2396 datahi, datahi, 8); 2397 } else { 2398 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2399 h.base, h.index, 0, h.ofs); 2400 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, 2401 h.base, h.index, 0, h.ofs + 8); 2402 } 2403 break; 2404 } 2405 2406 /* 2407 * With 16-byte atomicity, a vector load is required. 2408 * If we already have 16-byte alignment, then VMOVDQA always works. 2409 * Else if VMOVDQU has atomicity with dynamic alignment, use that. 2410 * Else use we require a runtime test for alignment for VMOVDQA; 2411 * use VMOVDQU on the unaligned nonatomic path for simplicity. 2412 */ 2413 if (h.aa.align >= MO_128) { 2414 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2415 TCG_TMP_VEC, 0, 2416 h.base, h.index, 0, h.ofs); 2417 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2418 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2419 TCG_TMP_VEC, 0, 2420 h.base, h.index, 0, h.ofs); 2421 } else { 2422 TCGLabel *l1 = gen_new_label(); 2423 TCGLabel *l2 = gen_new_label(); 2424 int jcc; 2425 2426 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2427 tcg_out_jxx(s, jcc, l1, true); 2428 2429 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2430 TCG_TMP_VEC, 0, 2431 h.base, h.index, 0, h.ofs); 2432 tcg_out_jxx(s, JCC_JMP, l2, true); 2433 2434 tcg_out_label(s, l1); 2435 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2436 TCG_TMP_VEC, 0, 2437 h.base, h.index, 0, h.ofs); 2438 tcg_out_label(s, l2); 2439 } 2440 tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC); 2441 break; 2442 2443 default: 2444 g_assert_not_reached(); 2445 } 2446} 2447 2448static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi, 2449 TCGReg addrlo, TCGReg addrhi, 2450 MemOpIdx oi, TCGType data_type) 2451{ 2452 TCGLabelQemuLdst *ldst; 2453 HostAddress h; 2454 2455 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true); 2456 tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi)); 2457 2458 if (ldst) { 2459 ldst->type = data_type; 2460 ldst->datalo_reg = datalo; 2461 ldst->datahi_reg = datahi; 2462 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2463 } 2464} 2465 2466static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2467 HostAddress h, MemOp memop) 2468{ 2469 bool use_movbe = false; 2470 int movop = OPC_MOVL_EvGv; 2471 2472 /* 2473 * Do big-endian stores with movbe or system-mode. 2474 * User-only without movbe will have its swapping done generically. 2475 */ 2476 if (memop & MO_BSWAP) { 2477 tcg_debug_assert(have_movbe); 2478 use_movbe = true; 2479 movop = OPC_MOVBE_MyGy; 2480 } 2481 2482 switch (memop & MO_SIZE) { 2483 case MO_8: 2484 /* This is handled with constraints on INDEX_op_qemu_st8_i32. 
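         *
         * Without a REX prefix there is no encoding for the low byte of
         * %esi/%edi/%ebp/%esp, hence the assert below; on i386 the data
         * register must be one of %eax..%ebx so that the store can be,
         * for example,
         *     movb  %cl, (%esi)
         * which is what the 's' constraint in tcg_target_op_def()
         * guarantees.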
*/ 2485 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4); 2486 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg, 2487 datalo, h.base, h.index, 0, h.ofs); 2488 break; 2489 case MO_16: 2490 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo, 2491 h.base, h.index, 0, h.ofs); 2492 break; 2493 case MO_32: 2494 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2495 h.base, h.index, 0, h.ofs); 2496 break; 2497 case MO_64: 2498 if (TCG_TARGET_REG_BITS == 64) { 2499 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2500 h.base, h.index, 0, h.ofs); 2501 } else { 2502 if (use_movbe) { 2503 TCGReg t = datalo; 2504 datalo = datahi; 2505 datahi = t; 2506 } 2507 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2508 h.base, h.index, 0, h.ofs); 2509 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2510 h.base, h.index, 0, h.ofs + 4); 2511 } 2512 break; 2513 2514 case MO_128: 2515 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2516 2517 /* 2518 * Without 16-byte atomicity, use integer regs. 2519 * That is where we have the data, and it allows bswaps. 2520 */ 2521 if (h.aa.atom < MO_128) { 2522 if (use_movbe) { 2523 TCGReg t = datalo; 2524 datalo = datahi; 2525 datahi = t; 2526 } 2527 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2528 h.base, h.index, 0, h.ofs); 2529 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, 2530 h.base, h.index, 0, h.ofs + 8); 2531 break; 2532 } 2533 2534 /* 2535 * With 16-byte atomicity, a vector store is required. 2536 * If we already have 16-byte alignment, then VMOVDQA always works. 2537 * Else if VMOVDQU has atomicity with dynamic alignment, use that. 2538 * Else use we require a runtime test for alignment for VMOVDQA; 2539 * use VMOVDQU on the unaligned nonatomic path for simplicity. 2540 */ 2541 tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi); 2542 if (h.aa.align >= MO_128) { 2543 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2544 TCG_TMP_VEC, 0, 2545 h.base, h.index, 0, h.ofs); 2546 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2547 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2548 TCG_TMP_VEC, 0, 2549 h.base, h.index, 0, h.ofs); 2550 } else { 2551 TCGLabel *l1 = gen_new_label(); 2552 TCGLabel *l2 = gen_new_label(); 2553 int jcc; 2554 2555 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2556 tcg_out_jxx(s, jcc, l1, true); 2557 2558 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2559 TCG_TMP_VEC, 0, 2560 h.base, h.index, 0, h.ofs); 2561 tcg_out_jxx(s, JCC_JMP, l2, true); 2562 2563 tcg_out_label(s, l1); 2564 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2565 TCG_TMP_VEC, 0, 2566 h.base, h.index, 0, h.ofs); 2567 tcg_out_label(s, l2); 2568 } 2569 break; 2570 2571 default: 2572 g_assert_not_reached(); 2573 } 2574} 2575 2576static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi, 2577 TCGReg addrlo, TCGReg addrhi, 2578 MemOpIdx oi, TCGType data_type) 2579{ 2580 TCGLabelQemuLdst *ldst; 2581 HostAddress h; 2582 2583 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false); 2584 tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi)); 2585 2586 if (ldst) { 2587 ldst->type = data_type; 2588 ldst->datalo_reg = datalo; 2589 ldst->datahi_reg = datahi; 2590 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2591 } 2592} 2593 2594static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0) 2595{ 2596 /* Reuse the zeroing that exists for goto_ptr. 
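     *
     * In other words, exit_tb(0) becomes a single
     *     jmp   tcg_code_gen_epilogue
     * while a non-zero return value is emitted as, roughly,
     *     mov   $a0, %rax        (via tcg_out_movi)
     *     jmp   tb_ret_addr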
*/ 2597 if (a0 == 0) { 2598 tcg_out_jmp(s, tcg_code_gen_epilogue); 2599 } else { 2600 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0); 2601 tcg_out_jmp(s, tb_ret_addr); 2602 } 2603} 2604 2605static void tcg_out_goto_tb(TCGContext *s, int which) 2606{ 2607 /* 2608 * Jump displacement must be aligned for atomic patching; 2609 * see if we need to add extra nops before jump 2610 */ 2611 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr; 2612 if (gap != 1) { 2613 tcg_out_nopn(s, gap - 1); 2614 } 2615 tcg_out8(s, OPC_JMP_long); /* jmp im */ 2616 set_jmp_insn_offset(s, which); 2617 tcg_out32(s, 0); 2618 set_jmp_reset_offset(s, which); 2619} 2620 2621void tb_target_set_jmp_target(const TranslationBlock *tb, int n, 2622 uintptr_t jmp_rx, uintptr_t jmp_rw) 2623{ 2624 /* patch the branch destination */ 2625 uintptr_t addr = tb->jmp_target_addr[n]; 2626 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4)); 2627 /* no need to flush icache explicitly */ 2628} 2629 2630static void tcg_out_op(TCGContext *s, TCGOpcode opc, TCGType type, 2631 const TCGArg args[TCG_MAX_OP_ARGS], 2632 const int const_args[TCG_MAX_OP_ARGS]) 2633{ 2634 TCGArg a0, a1, a2; 2635 int c, const_a2, vexop, rexw; 2636 2637#if TCG_TARGET_REG_BITS == 64 2638# define OP_32_64(x) \ 2639 case glue(glue(INDEX_op_, x), _i64): \ 2640 case glue(glue(INDEX_op_, x), _i32) 2641#else 2642# define OP_32_64(x) \ 2643 case glue(glue(INDEX_op_, x), _i32) 2644#endif 2645 2646 /* Hoist the loads of the most common arguments. */ 2647 a0 = args[0]; 2648 a1 = args[1]; 2649 a2 = args[2]; 2650 const_a2 = const_args[2]; 2651 rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2652 2653 switch (opc) { 2654 case INDEX_op_goto_ptr: 2655 /* jmp to the given host address (could be epilogue) */ 2656 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0); 2657 break; 2658 case INDEX_op_br: 2659 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0); 2660 break; 2661 OP_32_64(ld8u): 2662 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2663 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2); 2664 break; 2665 OP_32_64(ld8s): 2666 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2); 2667 break; 2668 OP_32_64(ld16u): 2669 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2670 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2); 2671 break; 2672 OP_32_64(ld16s): 2673 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2); 2674 break; 2675#if TCG_TARGET_REG_BITS == 64 2676 case INDEX_op_ld32u_i64: 2677#endif 2678 case INDEX_op_ld_i32: 2679 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2); 2680 break; 2681 2682 OP_32_64(st8): 2683 if (const_args[0]) { 2684 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2); 2685 tcg_out8(s, a0); 2686 } else { 2687 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2); 2688 } 2689 break; 2690 OP_32_64(st16): 2691 if (const_args[0]) { 2692 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2); 2693 tcg_out16(s, a0); 2694 } else { 2695 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2); 2696 } 2697 break; 2698#if TCG_TARGET_REG_BITS == 64 2699 case INDEX_op_st32_i64: 2700#endif 2701 case INDEX_op_st_i32: 2702 if (const_args[0]) { 2703 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2); 2704 tcg_out32(s, a0); 2705 } else { 2706 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2); 2707 } 2708 break; 2709 2710 OP_32_64(add): 2711 /* For 3-operand addition, use LEA. 
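         *
         * e.g.  add_i32 d,a,b    ->  lea (a,b), d
         *       add_i32 d,a,$imm ->  lea imm(a), d
         * falling back to plain ADD when the output overlaps an input.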
*/ 2712 if (a0 != a1) { 2713 TCGArg c3 = 0; 2714 if (const_a2) { 2715 c3 = a2, a2 = -1; 2716 } else if (a0 == a2) { 2717 /* Watch out for dest = src + dest, since we've removed 2718 the matching constraint on the add. */ 2719 tgen_arithr(s, ARITH_ADD + rexw, a0, a1); 2720 break; 2721 } 2722 2723 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3); 2724 break; 2725 } 2726 c = ARITH_ADD; 2727 goto gen_arith; 2728 OP_32_64(sub): 2729 c = ARITH_SUB; 2730 goto gen_arith; 2731 OP_32_64(and): 2732 c = ARITH_AND; 2733 goto gen_arith; 2734 OP_32_64(or): 2735 c = ARITH_OR; 2736 goto gen_arith; 2737 OP_32_64(xor): 2738 c = ARITH_XOR; 2739 goto gen_arith; 2740 gen_arith: 2741 if (const_a2) { 2742 tgen_arithi(s, c + rexw, a0, a2, 0); 2743 } else { 2744 tgen_arithr(s, c + rexw, a0, a2); 2745 } 2746 break; 2747 2748 OP_32_64(andc): 2749 if (const_a2) { 2750 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2751 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0); 2752 } else { 2753 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1); 2754 } 2755 break; 2756 2757 OP_32_64(mul): 2758 if (const_a2) { 2759 int32_t val; 2760 val = a2; 2761 if (val == (int8_t)val) { 2762 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0); 2763 tcg_out8(s, val); 2764 } else { 2765 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0); 2766 tcg_out32(s, val); 2767 } 2768 } else { 2769 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2); 2770 } 2771 break; 2772 2773 OP_32_64(div2): 2774 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]); 2775 break; 2776 OP_32_64(divu2): 2777 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]); 2778 break; 2779 2780 OP_32_64(shl): 2781 /* For small constant 3-operand shift, use LEA. */ 2782 if (const_a2 && a0 != a1 && (a2 - 1) < 3) { 2783 if (a2 - 1 == 0) { 2784 /* shl $1,a1,a0 -> lea (a1,a1),a0 */ 2785 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0); 2786 } else { 2787 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */ 2788 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0); 2789 } 2790 break; 2791 } 2792 c = SHIFT_SHL; 2793 vexop = OPC_SHLX; 2794 goto gen_shift_maybe_vex; 2795 OP_32_64(shr): 2796 c = SHIFT_SHR; 2797 vexop = OPC_SHRX; 2798 goto gen_shift_maybe_vex; 2799 OP_32_64(sar): 2800 c = SHIFT_SAR; 2801 vexop = OPC_SARX; 2802 goto gen_shift_maybe_vex; 2803 OP_32_64(rotl): 2804 c = SHIFT_ROL; 2805 goto gen_shift; 2806 OP_32_64(rotr): 2807 c = SHIFT_ROR; 2808 goto gen_shift; 2809 gen_shift_maybe_vex: 2810 if (have_bmi2) { 2811 if (!const_a2) { 2812 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1); 2813 break; 2814 } 2815 tcg_out_mov(s, rexw ? 
TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2816 } 2817 /* FALLTHRU */ 2818 gen_shift: 2819 if (const_a2) { 2820 tcg_out_shifti(s, c + rexw, a0, a2); 2821 } else { 2822 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0); 2823 } 2824 break; 2825 2826 OP_32_64(ctz): 2827 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]); 2828 break; 2829 OP_32_64(clz): 2830 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]); 2831 break; 2832 OP_32_64(ctpop): 2833 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1); 2834 break; 2835 2836 OP_32_64(brcond): 2837 tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1], 2838 arg_label(args[3]), 0); 2839 break; 2840 OP_32_64(setcond): 2841 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false); 2842 break; 2843 OP_32_64(negsetcond): 2844 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true); 2845 break; 2846 OP_32_64(movcond): 2847 tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]); 2848 break; 2849 2850 OP_32_64(bswap16): 2851 if (a2 & TCG_BSWAP_OS) { 2852 /* Output must be sign-extended. */ 2853 if (rexw) { 2854 tcg_out_bswap64(s, a0); 2855 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48); 2856 } else { 2857 tcg_out_bswap32(s, a0); 2858 tcg_out_shifti(s, SHIFT_SAR, a0, 16); 2859 } 2860 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) { 2861 /* Output must be zero-extended, but input isn't. */ 2862 tcg_out_bswap32(s, a0); 2863 tcg_out_shifti(s, SHIFT_SHR, a0, 16); 2864 } else { 2865 tcg_out_rolw_8(s, a0); 2866 } 2867 break; 2868 OP_32_64(bswap32): 2869 tcg_out_bswap32(s, a0); 2870 if (rexw && (a2 & TCG_BSWAP_OS)) { 2871 tcg_out_ext32s(s, a0, a0); 2872 } 2873 break; 2874 2875 OP_32_64(neg): 2876 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0); 2877 break; 2878 OP_32_64(not): 2879 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0); 2880 break; 2881 2882 case INDEX_op_qemu_ld_a64_i32: 2883 if (TCG_TARGET_REG_BITS == 32) { 2884 tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32); 2885 break; 2886 } 2887 /* fall through */ 2888 case INDEX_op_qemu_ld_a32_i32: 2889 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); 2890 break; 2891 case INDEX_op_qemu_ld_a32_i64: 2892 if (TCG_TARGET_REG_BITS == 64) { 2893 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2894 } else { 2895 tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64); 2896 } 2897 break; 2898 case INDEX_op_qemu_ld_a64_i64: 2899 if (TCG_TARGET_REG_BITS == 64) { 2900 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2901 } else { 2902 tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); 2903 } 2904 break; 2905 case INDEX_op_qemu_ld_a32_i128: 2906 case INDEX_op_qemu_ld_a64_i128: 2907 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2908 tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128); 2909 break; 2910 2911 case INDEX_op_qemu_st_a64_i32: 2912 case INDEX_op_qemu_st8_a64_i32: 2913 if (TCG_TARGET_REG_BITS == 32) { 2914 tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32); 2915 break; 2916 } 2917 /* fall through */ 2918 case INDEX_op_qemu_st_a32_i32: 2919 case INDEX_op_qemu_st8_a32_i32: 2920 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); 2921 break; 2922 case INDEX_op_qemu_st_a32_i64: 2923 if (TCG_TARGET_REG_BITS == 64) { 2924 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2925 } else { 2926 tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64); 2927 } 2928 break; 2929 case INDEX_op_qemu_st_a64_i64: 2930 if (TCG_TARGET_REG_BITS == 64) { 2931 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2932 } else { 2933 
tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); 2934 } 2935 break; 2936 case INDEX_op_qemu_st_a32_i128: 2937 case INDEX_op_qemu_st_a64_i128: 2938 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2939 tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128); 2940 break; 2941 2942 OP_32_64(mulu2): 2943 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]); 2944 break; 2945 OP_32_64(muls2): 2946 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]); 2947 break; 2948 OP_32_64(add2): 2949 if (const_args[4]) { 2950 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1); 2951 } else { 2952 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]); 2953 } 2954 if (const_args[5]) { 2955 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1); 2956 } else { 2957 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]); 2958 } 2959 break; 2960 OP_32_64(sub2): 2961 if (const_args[4]) { 2962 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1); 2963 } else { 2964 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]); 2965 } 2966 if (const_args[5]) { 2967 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1); 2968 } else { 2969 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]); 2970 } 2971 break; 2972 2973#if TCG_TARGET_REG_BITS == 32 2974 case INDEX_op_brcond2_i32: 2975 tcg_out_brcond2(s, args, const_args, 0); 2976 break; 2977 case INDEX_op_setcond2_i32: 2978 tcg_out_setcond2(s, args, const_args); 2979 break; 2980#else /* TCG_TARGET_REG_BITS == 64 */ 2981 case INDEX_op_ld32s_i64: 2982 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2); 2983 break; 2984 case INDEX_op_ld_i64: 2985 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2); 2986 break; 2987 case INDEX_op_st_i64: 2988 if (const_args[0]) { 2989 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2); 2990 tcg_out32(s, a0); 2991 } else { 2992 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2); 2993 } 2994 break; 2995 2996 case INDEX_op_bswap64_i64: 2997 tcg_out_bswap64(s, a0); 2998 break; 2999 case INDEX_op_extrh_i64_i32: 3000 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32); 3001 break; 3002#endif 3003 3004 OP_32_64(deposit): 3005 if (args[3] == 0 && args[4] == 8) { 3006 /* load bits 0..7 */ 3007 if (const_a2) { 3008 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0), 3009 0, a0, 0); 3010 tcg_out8(s, a2); 3011 } else { 3012 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0); 3013 } 3014 } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) { 3015 /* load bits 8..15 */ 3016 if (const_a2) { 3017 tcg_out8(s, OPC_MOVB_Ib + a0 + 4); 3018 tcg_out8(s, a2); 3019 } else { 3020 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4); 3021 } 3022 } else if (args[3] == 0 && args[4] == 16) { 3023 /* load bits 0..15 */ 3024 if (const_a2) { 3025 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0), 3026 0, a0, 0); 3027 tcg_out16(s, a2); 3028 } else { 3029 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0); 3030 } 3031 } else { 3032 g_assert_not_reached(); 3033 } 3034 break; 3035 3036 case INDEX_op_extract_i64: 3037 if (a2 + args[3] == 32) { 3038 if (a2 == 0) { 3039 tcg_out_ext32u(s, a0, a1); 3040 break; 3041 } 3042 /* This is a 32-bit zero-extending right shift. */ 3043 tcg_out_mov(s, TCG_TYPE_I32, a0, a1); 3044 tcg_out_shifti(s, SHIFT_SHR, a0, a2); 3045 break; 3046 } 3047 /* FALLTHRU */ 3048 case INDEX_op_extract_i32: 3049 if (a2 == 0 && args[3] == 8) { 3050 tcg_out_ext8u(s, a0, a1); 3051 } else if (a2 == 0 && args[3] == 16) { 3052 tcg_out_ext16u(s, a0, a1); 3053 } else if (a2 == 8 && args[3] == 8) { 3054 /* 3055 * On the off-chance that we can use the high-byte registers. 
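             * (e.g. extracting bits 8..15 of %eax is then just
             * "movzbl %ah, dest".)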
3056 * Otherwise we emit the same ext16 + shift pattern that we 3057 * would have gotten from the normal tcg-op.c expansion. 3058 */ 3059 if (a1 < 4 && a0 < 8) { 3060 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4); 3061 } else { 3062 tcg_out_ext16u(s, a0, a1); 3063 tcg_out_shifti(s, SHIFT_SHR, a0, 8); 3064 } 3065 } else { 3066 g_assert_not_reached(); 3067 } 3068 break; 3069 3070 case INDEX_op_sextract_i64: 3071 if (a2 == 0 && args[3] == 8) { 3072 tcg_out_ext8s(s, TCG_TYPE_I64, a0, a1); 3073 } else if (a2 == 0 && args[3] == 16) { 3074 tcg_out_ext16s(s, TCG_TYPE_I64, a0, a1); 3075 } else if (a2 == 0 && args[3] == 32) { 3076 tcg_out_ext32s(s, a0, a1); 3077 } else { 3078 g_assert_not_reached(); 3079 } 3080 break; 3081 3082 case INDEX_op_sextract_i32: 3083 if (a2 == 0 && args[3] == 8) { 3084 tcg_out_ext8s(s, TCG_TYPE_I32, a0, a1); 3085 } else if (a2 == 0 && args[3] == 16) { 3086 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 3087 } else if (a2 == 8 && args[3] == 8) { 3088 if (a1 < 4 && a0 < 8) { 3089 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4); 3090 } else { 3091 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 3092 tcg_out_shifti(s, SHIFT_SAR, a0, 8); 3093 } 3094 } else { 3095 g_assert_not_reached(); 3096 } 3097 break; 3098 3099 OP_32_64(extract2): 3100 /* Note that SHRD outputs to the r/m operand. */ 3101 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0); 3102 tcg_out8(s, args[3]); 3103 break; 3104 3105 case INDEX_op_mb: 3106 tcg_out_mb(s, a0); 3107 break; 3108 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ 3109 case INDEX_op_mov_i64: 3110 case INDEX_op_call: /* Always emitted via tcg_out_call. */ 3111 case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */ 3112 case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */ 3113 case INDEX_op_ext8s_i32: /* Always emitted via tcg_reg_alloc_op. */ 3114 case INDEX_op_ext8s_i64: 3115 case INDEX_op_ext8u_i32: 3116 case INDEX_op_ext8u_i64: 3117 case INDEX_op_ext16s_i32: 3118 case INDEX_op_ext16s_i64: 3119 case INDEX_op_ext16u_i32: 3120 case INDEX_op_ext16u_i64: 3121 case INDEX_op_ext32s_i64: 3122 case INDEX_op_ext32u_i64: 3123 case INDEX_op_ext_i32_i64: 3124 case INDEX_op_extu_i32_i64: 3125 case INDEX_op_extrl_i64_i32: 3126 default: 3127 g_assert_not_reached(); 3128 } 3129 3130#undef OP_32_64 3131} 3132 3133static int const umin_insn[4] = { 3134 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ 3135}; 3136 3137static int const umax_insn[4] = { 3138 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ 3139}; 3140 3141static bool tcg_out_cmp_vec_noinv(TCGContext *s, TCGType type, unsigned vece, 3142 TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond) 3143{ 3144 static int const cmpeq_insn[4] = { 3145 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ 3146 }; 3147 static int const cmpgt_insn[4] = { 3148 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ 3149 }; 3150 3151 enum { 3152 NEED_INV = 1, 3153 NEED_SWAP = 2, 3154 NEED_UMIN = 4, 3155 NEED_UMAX = 8, 3156 INVALID = 16, 3157 }; 3158 static const uint8_t cond_fixup[16] = { 3159 [0 ... 
15] = INVALID, 3160 [TCG_COND_EQ] = 0, 3161 [TCG_COND_GT] = 0, 3162 [TCG_COND_NE] = NEED_INV, 3163 [TCG_COND_LE] = NEED_INV, 3164 [TCG_COND_LT] = NEED_SWAP, 3165 [TCG_COND_GE] = NEED_SWAP | NEED_INV, 3166 [TCG_COND_LEU] = NEED_UMIN, 3167 [TCG_COND_GTU] = NEED_UMIN | NEED_INV, 3168 [TCG_COND_GEU] = NEED_UMAX, 3169 [TCG_COND_LTU] = NEED_UMAX | NEED_INV, 3170 }; 3171 int fixup = cond_fixup[cond]; 3172 3173 assert(!(fixup & INVALID)); 3174 3175 if (fixup & NEED_INV) { 3176 cond = tcg_invert_cond(cond); 3177 } 3178 3179 if (fixup & NEED_SWAP) { 3180 TCGReg swap = v1; 3181 v1 = v2; 3182 v2 = swap; 3183 cond = tcg_swap_cond(cond); 3184 } 3185 3186 if (fixup & (NEED_UMIN | NEED_UMAX)) { 3187 int op = (fixup & NEED_UMIN ? umin_insn[vece] : umax_insn[vece]); 3188 3189 /* avx2 does not have 64-bit min/max; adjusted during expand. */ 3190 assert(vece <= MO_32); 3191 3192 tcg_out_vex_modrm_type(s, op, TCG_TMP_VEC, v1, v2, type); 3193 v2 = TCG_TMP_VEC; 3194 cond = TCG_COND_EQ; 3195 } 3196 3197 switch (cond) { 3198 case TCG_COND_EQ: 3199 tcg_out_vex_modrm_type(s, cmpeq_insn[vece], v0, v1, v2, type); 3200 break; 3201 case TCG_COND_GT: 3202 tcg_out_vex_modrm_type(s, cmpgt_insn[vece], v0, v1, v2, type); 3203 break; 3204 default: 3205 g_assert_not_reached(); 3206 } 3207 return fixup & NEED_INV; 3208} 3209 3210static void tcg_out_cmp_vec_k1(TCGContext *s, TCGType type, unsigned vece, 3211 TCGReg v1, TCGReg v2, TCGCond cond) 3212{ 3213 static const int cmpm_insn[2][4] = { 3214 { OPC_VPCMPB, OPC_VPCMPW, OPC_VPCMPD, OPC_VPCMPQ }, 3215 { OPC_VPCMPUB, OPC_VPCMPUW, OPC_VPCMPUD, OPC_VPCMPUQ } 3216 }; 3217 static const int testm_insn[4] = { 3218 OPC_VPTESTMB, OPC_VPTESTMW, OPC_VPTESTMD, OPC_VPTESTMQ 3219 }; 3220 static const int testnm_insn[4] = { 3221 OPC_VPTESTNMB, OPC_VPTESTNMW, OPC_VPTESTNMD, OPC_VPTESTNMQ 3222 }; 3223 3224 static const int cond_ext[16] = { 3225 [TCG_COND_EQ] = 0, 3226 [TCG_COND_NE] = 4, 3227 [TCG_COND_LT] = 1, 3228 [TCG_COND_LTU] = 1, 3229 [TCG_COND_LE] = 2, 3230 [TCG_COND_LEU] = 2, 3231 [TCG_COND_NEVER] = 3, 3232 [TCG_COND_GE] = 5, 3233 [TCG_COND_GEU] = 5, 3234 [TCG_COND_GT] = 6, 3235 [TCG_COND_GTU] = 6, 3236 [TCG_COND_ALWAYS] = 7, 3237 }; 3238 3239 switch (cond) { 3240 case TCG_COND_TSTNE: 3241 tcg_out_vex_modrm_type(s, testm_insn[vece], /* k1 */ 1, v1, v2, type); 3242 break; 3243 case TCG_COND_TSTEQ: 3244 tcg_out_vex_modrm_type(s, testnm_insn[vece], /* k1 */ 1, v1, v2, type); 3245 break; 3246 default: 3247 tcg_out_vex_modrm_type(s, cmpm_insn[is_unsigned_cond(cond)][vece], 3248 /* k1 */ 1, v1, v2, type); 3249 tcg_out8(s, cond_ext[cond]); 3250 break; 3251 } 3252} 3253 3254static void tcg_out_k1_to_vec(TCGContext *s, TCGType type, 3255 unsigned vece, TCGReg dest) 3256{ 3257 static const int movm_insn[] = { 3258 OPC_VPMOVM2B, OPC_VPMOVM2W, OPC_VPMOVM2D, OPC_VPMOVM2Q 3259 }; 3260 tcg_out_vex_modrm_type(s, movm_insn[vece], dest, 0, /* k1 */ 1, type); 3261} 3262 3263static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece, 3264 TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond) 3265{ 3266 /* 3267 * With avx512, we have a complete set of comparisons into mask. 3268 * Unless there's a single insn expansion for the comparision, 3269 * expand via a mask in k1. 3270 */ 3271 if ((vece <= MO_16 ? 
have_avx512bw : have_avx512dq) 3272 && cond != TCG_COND_EQ 3273 && cond != TCG_COND_LT 3274 && cond != TCG_COND_GT) { 3275 tcg_out_cmp_vec_k1(s, type, vece, v1, v2, cond); 3276 tcg_out_k1_to_vec(s, type, vece, v0); 3277 return; 3278 } 3279 3280 if (tcg_out_cmp_vec_noinv(s, type, vece, v0, v1, v2, cond)) { 3281 tcg_out_dupi_vec(s, type, vece, TCG_TMP_VEC, -1); 3282 tcg_out_vex_modrm_type(s, OPC_PXOR, v0, v0, TCG_TMP_VEC, type); 3283 } 3284} 3285 3286static void tcg_out_cmpsel_vec_k1(TCGContext *s, TCGType type, unsigned vece, 3287 TCGReg v0, TCGReg c1, TCGReg c2, 3288 TCGReg v3, TCGReg v4, TCGCond cond) 3289{ 3290 static const int vpblendm_insn[] = { 3291 OPC_VPBLENDMB, OPC_VPBLENDMW, OPC_VPBLENDMD, OPC_VPBLENDMQ 3292 }; 3293 bool z = false; 3294 3295 /* Swap to place constant in V4 to take advantage of zero-masking. */ 3296 if (!v3) { 3297 z = true; 3298 v3 = v4; 3299 cond = tcg_invert_cond(cond); 3300 } 3301 3302 tcg_out_cmp_vec_k1(s, type, vece, c1, c2, cond); 3303 tcg_out_evex_modrm_type(s, vpblendm_insn[vece], v0, v4, v3, 3304 /* k1 */1, z, type); 3305} 3306 3307static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece, 3308 TCGReg v0, TCGReg c1, TCGReg c2, 3309 TCGReg v3, TCGReg v4, TCGCond cond) 3310{ 3311 bool inv; 3312 3313 if (vece <= MO_16 ? have_avx512bw : have_avx512vl) { 3314 tcg_out_cmpsel_vec_k1(s, type, vece, v0, c1, c2, v3, v4, cond); 3315 return; 3316 } 3317 3318 inv = tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond); 3319 3320 /* 3321 * Since XMM0 is 16, the only way we get 0 into V3 3322 * is via the constant zero constraint. 3323 */ 3324 if (!v3) { 3325 if (inv) { 3326 tcg_out_vex_modrm_type(s, OPC_PAND, v0, TCG_TMP_VEC, v4, type); 3327 } else { 3328 tcg_out_vex_modrm_type(s, OPC_PANDN, v0, TCG_TMP_VEC, v4, type); 3329 } 3330 } else { 3331 if (inv) { 3332 TCGReg swap = v3; 3333 v3 = v4; 3334 v4 = swap; 3335 } 3336 tcg_out_vex_modrm_type(s, OPC_VPBLENDVB, v0, v4, v3, type); 3337 tcg_out8(s, (TCG_TMP_VEC - TCG_REG_XMM0) << 4); 3338 } 3339} 3340 3341static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, 3342 unsigned vecl, unsigned vece, 3343 const TCGArg args[TCG_MAX_OP_ARGS], 3344 const int const_args[TCG_MAX_OP_ARGS]) 3345{ 3346 static int const add_insn[4] = { 3347 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ 3348 }; 3349 static int const ssadd_insn[4] = { 3350 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 3351 }; 3352 static int const usadd_insn[4] = { 3353 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2 3354 }; 3355 static int const sub_insn[4] = { 3356 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ 3357 }; 3358 static int const sssub_insn[4] = { 3359 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 3360 }; 3361 static int const ussub_insn[4] = { 3362 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2 3363 }; 3364 static int const mul_insn[4] = { 3365 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ 3366 }; 3367 static int const shift_imm_insn[4] = { 3368 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib 3369 }; 3370 static int const punpckl_insn[4] = { 3371 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ 3372 }; 3373 static int const punpckh_insn[4] = { 3374 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ 3375 }; 3376 static int const packss_insn[4] = { 3377 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 3378 }; 3379 static int const packus_insn[4] = { 3380 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 3381 }; 3382 static int const smin_insn[4] = { 3383 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ 3384 }; 3385 static int const smax_insn[4] = 
{ 3386 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ 3387 }; 3388 static int const rotlv_insn[4] = { 3389 OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ 3390 }; 3391 static int const rotrv_insn[4] = { 3392 OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ 3393 }; 3394 static int const shlv_insn[4] = { 3395 OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ 3396 }; 3397 static int const shrv_insn[4] = { 3398 OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ 3399 }; 3400 static int const sarv_insn[4] = { 3401 OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ 3402 }; 3403 static int const shls_insn[4] = { 3404 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ 3405 }; 3406 static int const shrs_insn[4] = { 3407 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ 3408 }; 3409 static int const sars_insn[4] = { 3410 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ 3411 }; 3412 static int const vpshldi_insn[4] = { 3413 OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ 3414 }; 3415 static int const vpshldv_insn[4] = { 3416 OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ 3417 }; 3418 static int const vpshrdv_insn[4] = { 3419 OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ 3420 }; 3421 static int const abs_insn[4] = { 3422 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ 3423 }; 3424 3425 TCGType type = vecl + TCG_TYPE_V64; 3426 int insn, sub; 3427 TCGArg a0, a1, a2, a3; 3428 3429 a0 = args[0]; 3430 a1 = args[1]; 3431 a2 = args[2]; 3432 3433 switch (opc) { 3434 case INDEX_op_add_vec: 3435 insn = add_insn[vece]; 3436 goto gen_simd; 3437 case INDEX_op_ssadd_vec: 3438 insn = ssadd_insn[vece]; 3439 goto gen_simd; 3440 case INDEX_op_usadd_vec: 3441 insn = usadd_insn[vece]; 3442 goto gen_simd; 3443 case INDEX_op_sub_vec: 3444 insn = sub_insn[vece]; 3445 goto gen_simd; 3446 case INDEX_op_sssub_vec: 3447 insn = sssub_insn[vece]; 3448 goto gen_simd; 3449 case INDEX_op_ussub_vec: 3450 insn = ussub_insn[vece]; 3451 goto gen_simd; 3452 case INDEX_op_mul_vec: 3453 insn = mul_insn[vece]; 3454 goto gen_simd; 3455 case INDEX_op_and_vec: 3456 insn = OPC_PAND; 3457 goto gen_simd; 3458 case INDEX_op_or_vec: 3459 insn = OPC_POR; 3460 goto gen_simd; 3461 case INDEX_op_xor_vec: 3462 insn = OPC_PXOR; 3463 goto gen_simd; 3464 case INDEX_op_smin_vec: 3465 insn = smin_insn[vece]; 3466 goto gen_simd; 3467 case INDEX_op_umin_vec: 3468 insn = umin_insn[vece]; 3469 goto gen_simd; 3470 case INDEX_op_smax_vec: 3471 insn = smax_insn[vece]; 3472 goto gen_simd; 3473 case INDEX_op_umax_vec: 3474 insn = umax_insn[vece]; 3475 goto gen_simd; 3476 case INDEX_op_shlv_vec: 3477 insn = shlv_insn[vece]; 3478 goto gen_simd; 3479 case INDEX_op_shrv_vec: 3480 insn = shrv_insn[vece]; 3481 goto gen_simd; 3482 case INDEX_op_sarv_vec: 3483 insn = sarv_insn[vece]; 3484 goto gen_simd; 3485 case INDEX_op_rotlv_vec: 3486 insn = rotlv_insn[vece]; 3487 goto gen_simd; 3488 case INDEX_op_rotrv_vec: 3489 insn = rotrv_insn[vece]; 3490 goto gen_simd; 3491 case INDEX_op_shls_vec: 3492 insn = shls_insn[vece]; 3493 goto gen_simd; 3494 case INDEX_op_shrs_vec: 3495 insn = shrs_insn[vece]; 3496 goto gen_simd; 3497 case INDEX_op_sars_vec: 3498 insn = sars_insn[vece]; 3499 goto gen_simd; 3500 case INDEX_op_x86_punpckl_vec: 3501 insn = punpckl_insn[vece]; 3502 goto gen_simd; 3503 case INDEX_op_x86_punpckh_vec: 3504 insn = punpckh_insn[vece]; 3505 goto gen_simd; 3506 case INDEX_op_x86_packss_vec: 3507 insn = packss_insn[vece]; 3508 goto gen_simd; 3509 case INDEX_op_x86_packus_vec: 3510 insn = packus_insn[vece]; 3511 goto gen_simd; 3512 case INDEX_op_x86_vpshldv_vec: 3513 insn = vpshldv_insn[vece]; 3514 a1 = a2; 3515 a2 
= args[3]; 3516 goto gen_simd; 3517 case INDEX_op_x86_vpshrdv_vec: 3518 insn = vpshrdv_insn[vece]; 3519 a1 = a2; 3520 a2 = args[3]; 3521 goto gen_simd; 3522#if TCG_TARGET_REG_BITS == 32 3523 case INDEX_op_dup2_vec: 3524 /* First merge the two 32-bit inputs to a single 64-bit element. */ 3525 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2); 3526 /* Then replicate the 64-bit elements across the rest of the vector. */ 3527 if (type != TCG_TYPE_V64) { 3528 tcg_out_dup_vec(s, type, MO_64, a0, a0); 3529 } 3530 break; 3531#endif 3532 case INDEX_op_abs_vec: 3533 insn = abs_insn[vece]; 3534 a2 = a1; 3535 a1 = 0; 3536 goto gen_simd; 3537 gen_simd: 3538 tcg_debug_assert(insn != OPC_UD2); 3539 tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type); 3540 break; 3541 3542 case INDEX_op_cmp_vec: 3543 tcg_out_cmp_vec(s, type, vece, a0, a1, a2, args[3]); 3544 break; 3545 3546 case INDEX_op_cmpsel_vec: 3547 tcg_out_cmpsel_vec(s, type, vece, a0, a1, a2, 3548 args[3], args[4], args[5]); 3549 break; 3550 3551 case INDEX_op_andc_vec: 3552 insn = OPC_PANDN; 3553 tcg_out_vex_modrm_type(s, insn, a0, a2, a1, type); 3554 break; 3555 3556 case INDEX_op_shli_vec: 3557 insn = shift_imm_insn[vece]; 3558 sub = 6; 3559 goto gen_shift; 3560 case INDEX_op_shri_vec: 3561 insn = shift_imm_insn[vece]; 3562 sub = 2; 3563 goto gen_shift; 3564 case INDEX_op_sari_vec: 3565 if (vece == MO_64) { 3566 insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX; 3567 } else { 3568 insn = shift_imm_insn[vece]; 3569 } 3570 sub = 4; 3571 goto gen_shift; 3572 case INDEX_op_rotli_vec: 3573 insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ 3574 if (vece == MO_64) { 3575 insn |= P_VEXW; 3576 } 3577 sub = 1; 3578 goto gen_shift; 3579 gen_shift: 3580 tcg_debug_assert(vece != MO_8); 3581 tcg_out_vex_modrm_type(s, insn, sub, a0, a1, type); 3582 tcg_out8(s, a2); 3583 break; 3584 3585 case INDEX_op_ld_vec: 3586 tcg_out_ld(s, type, a0, a1, a2); 3587 break; 3588 case INDEX_op_st_vec: 3589 tcg_out_st(s, type, a0, a1, a2); 3590 break; 3591 case INDEX_op_dupm_vec: 3592 tcg_out_dupm_vec(s, type, vece, a0, a1, a2); 3593 break; 3594 3595 case INDEX_op_x86_shufps_vec: 3596 insn = OPC_SHUFPS; 3597 sub = args[3]; 3598 goto gen_simd_imm8; 3599 case INDEX_op_x86_blend_vec: 3600 if (vece == MO_16) { 3601 insn = OPC_PBLENDW; 3602 } else if (vece == MO_32) { 3603 insn = (have_avx2 ? 
OPC_VPBLENDD : OPC_BLENDPS); 3604 } else { 3605 g_assert_not_reached(); 3606 } 3607 sub = args[3]; 3608 goto gen_simd_imm8; 3609 case INDEX_op_x86_vperm2i128_vec: 3610 insn = OPC_VPERM2I128; 3611 sub = args[3]; 3612 goto gen_simd_imm8; 3613 case INDEX_op_x86_vpshldi_vec: 3614 insn = vpshldi_insn[vece]; 3615 sub = args[3]; 3616 goto gen_simd_imm8; 3617 3618 case INDEX_op_not_vec: 3619 insn = OPC_VPTERNLOGQ; 3620 a2 = a1; 3621 sub = 0x33; /* !B */ 3622 goto gen_simd_imm8; 3623 case INDEX_op_nor_vec: 3624 insn = OPC_VPTERNLOGQ; 3625 sub = 0x11; /* norCB */ 3626 goto gen_simd_imm8; 3627 case INDEX_op_nand_vec: 3628 insn = OPC_VPTERNLOGQ; 3629 sub = 0x77; /* nandCB */ 3630 goto gen_simd_imm8; 3631 case INDEX_op_eqv_vec: 3632 insn = OPC_VPTERNLOGQ; 3633 sub = 0x99; /* xnorCB */ 3634 goto gen_simd_imm8; 3635 case INDEX_op_orc_vec: 3636 insn = OPC_VPTERNLOGQ; 3637 sub = 0xdd; /* orB!C */ 3638 goto gen_simd_imm8; 3639 3640 case INDEX_op_bitsel_vec: 3641 insn = OPC_VPTERNLOGQ; 3642 a3 = args[3]; 3643 if (a0 == a1) { 3644 a1 = a2; 3645 a2 = a3; 3646 sub = 0xca; /* A?B:C */ 3647 } else if (a0 == a2) { 3648 a2 = a3; 3649 sub = 0xe2; /* B?A:C */ 3650 } else { 3651 tcg_out_mov(s, type, a0, a3); 3652 sub = 0xb8; /* B?C:A */ 3653 } 3654 goto gen_simd_imm8; 3655 3656 gen_simd_imm8: 3657 tcg_debug_assert(insn != OPC_UD2); 3658 tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type); 3659 tcg_out8(s, sub); 3660 break; 3661 3662 case INDEX_op_x86_psrldq_vec: 3663 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1); 3664 tcg_out8(s, a2); 3665 break; 3666 3667 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */ 3668 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */ 3669 default: 3670 g_assert_not_reached(); 3671 } 3672} 3673 3674static TCGConstraintSetIndex 3675tcg_target_op_def(TCGOpcode op, TCGType type, unsigned flags) 3676{ 3677 switch (op) { 3678 case INDEX_op_goto_ptr: 3679 return C_O0_I1(r); 3680 3681 case INDEX_op_ld8u_i32: 3682 case INDEX_op_ld8u_i64: 3683 case INDEX_op_ld8s_i32: 3684 case INDEX_op_ld8s_i64: 3685 case INDEX_op_ld16u_i32: 3686 case INDEX_op_ld16u_i64: 3687 case INDEX_op_ld16s_i32: 3688 case INDEX_op_ld16s_i64: 3689 case INDEX_op_ld_i32: 3690 case INDEX_op_ld32u_i64: 3691 case INDEX_op_ld32s_i64: 3692 case INDEX_op_ld_i64: 3693 return C_O1_I1(r, r); 3694 3695 case INDEX_op_st8_i32: 3696 case INDEX_op_st8_i64: 3697 return C_O0_I2(qi, r); 3698 3699 case INDEX_op_st16_i32: 3700 case INDEX_op_st16_i64: 3701 case INDEX_op_st_i32: 3702 case INDEX_op_st32_i64: 3703 return C_O0_I2(ri, r); 3704 3705 case INDEX_op_st_i64: 3706 return C_O0_I2(re, r); 3707 3708 case INDEX_op_add_i32: 3709 case INDEX_op_add_i64: 3710 return C_O1_I2(r, r, re); 3711 3712 case INDEX_op_sub_i32: 3713 case INDEX_op_sub_i64: 3714 case INDEX_op_mul_i32: 3715 case INDEX_op_mul_i64: 3716 case INDEX_op_or_i32: 3717 case INDEX_op_or_i64: 3718 case INDEX_op_xor_i32: 3719 case INDEX_op_xor_i64: 3720 return C_O1_I2(r, 0, re); 3721 3722 case INDEX_op_and_i32: 3723 case INDEX_op_and_i64: 3724 return C_O1_I2(r, 0, reZ); 3725 3726 case INDEX_op_andc_i32: 3727 case INDEX_op_andc_i64: 3728 return C_O1_I2(r, r, rI); 3729 3730 case INDEX_op_shl_i32: 3731 case INDEX_op_shl_i64: 3732 case INDEX_op_shr_i32: 3733 case INDEX_op_shr_i64: 3734 case INDEX_op_sar_i32: 3735 case INDEX_op_sar_i64: 3736 return have_bmi2 ? 
C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci); 3737 3738 case INDEX_op_rotl_i32: 3739 case INDEX_op_rotl_i64: 3740 case INDEX_op_rotr_i32: 3741 case INDEX_op_rotr_i64: 3742 return C_O1_I2(r, 0, ci); 3743 3744 case INDEX_op_brcond_i32: 3745 case INDEX_op_brcond_i64: 3746 return C_O0_I2(r, reT); 3747 3748 case INDEX_op_bswap16_i32: 3749 case INDEX_op_bswap16_i64: 3750 case INDEX_op_bswap32_i32: 3751 case INDEX_op_bswap32_i64: 3752 case INDEX_op_bswap64_i64: 3753 case INDEX_op_neg_i32: 3754 case INDEX_op_neg_i64: 3755 case INDEX_op_not_i32: 3756 case INDEX_op_not_i64: 3757 case INDEX_op_extrh_i64_i32: 3758 return C_O1_I1(r, 0); 3759 3760 case INDEX_op_ext8s_i32: 3761 case INDEX_op_ext8s_i64: 3762 case INDEX_op_ext8u_i32: 3763 case INDEX_op_ext8u_i64: 3764 return C_O1_I1(r, q); 3765 3766 case INDEX_op_ext16s_i32: 3767 case INDEX_op_ext16s_i64: 3768 case INDEX_op_ext16u_i32: 3769 case INDEX_op_ext16u_i64: 3770 case INDEX_op_ext32s_i64: 3771 case INDEX_op_ext32u_i64: 3772 case INDEX_op_ext_i32_i64: 3773 case INDEX_op_extu_i32_i64: 3774 case INDEX_op_extrl_i64_i32: 3775 case INDEX_op_extract_i32: 3776 case INDEX_op_extract_i64: 3777 case INDEX_op_sextract_i32: 3778 case INDEX_op_sextract_i64: 3779 case INDEX_op_ctpop_i32: 3780 case INDEX_op_ctpop_i64: 3781 return C_O1_I1(r, r); 3782 3783 case INDEX_op_extract2_i32: 3784 case INDEX_op_extract2_i64: 3785 return C_O1_I2(r, 0, r); 3786 3787 case INDEX_op_deposit_i32: 3788 case INDEX_op_deposit_i64: 3789 return C_O1_I2(q, 0, qi); 3790 3791 case INDEX_op_setcond_i32: 3792 case INDEX_op_setcond_i64: 3793 case INDEX_op_negsetcond_i32: 3794 case INDEX_op_negsetcond_i64: 3795 return C_O1_I2(q, r, reT); 3796 3797 case INDEX_op_movcond_i32: 3798 case INDEX_op_movcond_i64: 3799 return C_O1_I4(r, r, reT, r, 0); 3800 3801 case INDEX_op_div2_i32: 3802 case INDEX_op_div2_i64: 3803 case INDEX_op_divu2_i32: 3804 case INDEX_op_divu2_i64: 3805 return C_O2_I3(a, d, 0, 1, r); 3806 3807 case INDEX_op_mulu2_i32: 3808 case INDEX_op_mulu2_i64: 3809 case INDEX_op_muls2_i32: 3810 case INDEX_op_muls2_i64: 3811 return C_O2_I2(a, d, a, r); 3812 3813 case INDEX_op_add2_i32: 3814 case INDEX_op_add2_i64: 3815 case INDEX_op_sub2_i32: 3816 case INDEX_op_sub2_i64: 3817 return C_N1_O1_I4(r, r, 0, 1, re, re); 3818 3819 case INDEX_op_ctz_i32: 3820 case INDEX_op_ctz_i64: 3821 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3822 3823 case INDEX_op_clz_i32: 3824 case INDEX_op_clz_i64: 3825 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3826 3827 case INDEX_op_qemu_ld_a32_i32: 3828 return C_O1_I1(r, L); 3829 case INDEX_op_qemu_ld_a64_i32: 3830 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L); 3831 3832 case INDEX_op_qemu_st_a32_i32: 3833 return C_O0_I2(L, L); 3834 case INDEX_op_qemu_st_a64_i32: 3835 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L); 3836 case INDEX_op_qemu_st8_a32_i32: 3837 return C_O0_I2(s, L); 3838 case INDEX_op_qemu_st8_a64_i32: 3839 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L); 3840 3841 case INDEX_op_qemu_ld_a32_i64: 3842 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L); 3843 case INDEX_op_qemu_ld_a64_i64: 3844 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L); 3845 3846 case INDEX_op_qemu_st_a32_i64: 3847 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L); 3848 case INDEX_op_qemu_st_a64_i64: 3849 return TCG_TARGET_REG_BITS == 64 ? 
C_O0_I2(L, L) : C_O0_I4(L, L, L, L); 3850 3851 case INDEX_op_qemu_ld_a32_i128: 3852 case INDEX_op_qemu_ld_a64_i128: 3853 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3854 return C_O2_I1(r, r, L); 3855 case INDEX_op_qemu_st_a32_i128: 3856 case INDEX_op_qemu_st_a64_i128: 3857 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3858 return C_O0_I3(L, L, L); 3859 3860 case INDEX_op_brcond2_i32: 3861 return C_O0_I4(r, r, ri, ri); 3862 3863 case INDEX_op_setcond2_i32: 3864 return C_O1_I4(r, r, r, ri, ri); 3865 3866 case INDEX_op_ld_vec: 3867 case INDEX_op_dupm_vec: 3868 return C_O1_I1(x, r); 3869 3870 case INDEX_op_st_vec: 3871 return C_O0_I2(x, r); 3872 3873 case INDEX_op_add_vec: 3874 case INDEX_op_sub_vec: 3875 case INDEX_op_mul_vec: 3876 case INDEX_op_and_vec: 3877 case INDEX_op_or_vec: 3878 case INDEX_op_xor_vec: 3879 case INDEX_op_andc_vec: 3880 case INDEX_op_orc_vec: 3881 case INDEX_op_nand_vec: 3882 case INDEX_op_nor_vec: 3883 case INDEX_op_eqv_vec: 3884 case INDEX_op_ssadd_vec: 3885 case INDEX_op_usadd_vec: 3886 case INDEX_op_sssub_vec: 3887 case INDEX_op_ussub_vec: 3888 case INDEX_op_smin_vec: 3889 case INDEX_op_umin_vec: 3890 case INDEX_op_smax_vec: 3891 case INDEX_op_umax_vec: 3892 case INDEX_op_shlv_vec: 3893 case INDEX_op_shrv_vec: 3894 case INDEX_op_sarv_vec: 3895 case INDEX_op_rotlv_vec: 3896 case INDEX_op_rotrv_vec: 3897 case INDEX_op_shls_vec: 3898 case INDEX_op_shrs_vec: 3899 case INDEX_op_sars_vec: 3900 case INDEX_op_cmp_vec: 3901 case INDEX_op_x86_shufps_vec: 3902 case INDEX_op_x86_blend_vec: 3903 case INDEX_op_x86_packss_vec: 3904 case INDEX_op_x86_packus_vec: 3905 case INDEX_op_x86_vperm2i128_vec: 3906 case INDEX_op_x86_punpckl_vec: 3907 case INDEX_op_x86_punpckh_vec: 3908 case INDEX_op_x86_vpshldi_vec: 3909#if TCG_TARGET_REG_BITS == 32 3910 case INDEX_op_dup2_vec: 3911#endif 3912 return C_O1_I2(x, x, x); 3913 3914 case INDEX_op_abs_vec: 3915 case INDEX_op_dup_vec: 3916 case INDEX_op_not_vec: 3917 case INDEX_op_shli_vec: 3918 case INDEX_op_shri_vec: 3919 case INDEX_op_sari_vec: 3920 case INDEX_op_rotli_vec: 3921 case INDEX_op_x86_psrldq_vec: 3922 return C_O1_I1(x, x); 3923 3924 case INDEX_op_x86_vpshldv_vec: 3925 case INDEX_op_x86_vpshrdv_vec: 3926 return C_O1_I3(x, 0, x, x); 3927 3928 case INDEX_op_bitsel_vec: 3929 return C_O1_I3(x, x, x, x); 3930 case INDEX_op_cmpsel_vec: 3931 return C_O1_I4(x, x, x, xO, x); 3932 3933 default: 3934 return C_NotImplemented; 3935 } 3936} 3937 3938int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) 3939{ 3940 switch (opc) { 3941 case INDEX_op_add_vec: 3942 case INDEX_op_sub_vec: 3943 case INDEX_op_and_vec: 3944 case INDEX_op_or_vec: 3945 case INDEX_op_xor_vec: 3946 case INDEX_op_andc_vec: 3947 case INDEX_op_orc_vec: 3948 case INDEX_op_nand_vec: 3949 case INDEX_op_nor_vec: 3950 case INDEX_op_eqv_vec: 3951 case INDEX_op_not_vec: 3952 case INDEX_op_bitsel_vec: 3953 return 1; 3954 case INDEX_op_cmp_vec: 3955 case INDEX_op_cmpsel_vec: 3956 return -1; 3957 3958 case INDEX_op_rotli_vec: 3959 return have_avx512vl && vece >= MO_32 ? 1 : -1; 3960 3961 case INDEX_op_shli_vec: 3962 case INDEX_op_shri_vec: 3963 /* We must expand the operation for MO_8. */ 3964 return vece == MO_8 ? -1 : 1; 3965 3966 case INDEX_op_sari_vec: 3967 switch (vece) { 3968 case MO_8: 3969 return -1; 3970 case MO_16: 3971 case MO_32: 3972 return 1; 3973 case MO_64: 3974 if (have_avx512vl) { 3975 return 1; 3976 } 3977 /* 3978 * We can emulate this for MO_64, but it does not pay off 3979 * unless we're producing at least 4 values. 
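             * (A V256 vector holds four MO_64 lanes; for V64/V128 the
             * compare/shift/merge sequence in expand_vec_sari costs more
             * than it saves, so report the operation as unsupported.)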
3980 */ 3981 return type >= TCG_TYPE_V256 ? -1 : 0; 3982 } 3983 return 0; 3984 3985 case INDEX_op_shls_vec: 3986 case INDEX_op_shrs_vec: 3987 return vece >= MO_16; 3988 case INDEX_op_sars_vec: 3989 switch (vece) { 3990 case MO_16: 3991 case MO_32: 3992 return 1; 3993 case MO_64: 3994 return have_avx512vl; 3995 } 3996 return 0; 3997 case INDEX_op_rotls_vec: 3998 return vece >= MO_16 ? -1 : 0; 3999 4000 case INDEX_op_shlv_vec: 4001 case INDEX_op_shrv_vec: 4002 switch (vece) { 4003 case MO_16: 4004 return have_avx512bw; 4005 case MO_32: 4006 case MO_64: 4007 return have_avx2; 4008 } 4009 return 0; 4010 case INDEX_op_sarv_vec: 4011 switch (vece) { 4012 case MO_16: 4013 return have_avx512bw; 4014 case MO_32: 4015 return have_avx2; 4016 case MO_64: 4017 return have_avx512vl; 4018 } 4019 return 0; 4020 case INDEX_op_rotlv_vec: 4021 case INDEX_op_rotrv_vec: 4022 switch (vece) { 4023 case MO_16: 4024 return have_avx512vbmi2 ? -1 : 0; 4025 case MO_32: 4026 case MO_64: 4027 return have_avx512vl ? 1 : have_avx2 ? -1 : 0; 4028 } 4029 return 0; 4030 4031 case INDEX_op_mul_vec: 4032 switch (vece) { 4033 case MO_8: 4034 return -1; 4035 case MO_64: 4036 return have_avx512dq; 4037 } 4038 return 1; 4039 4040 case INDEX_op_ssadd_vec: 4041 case INDEX_op_usadd_vec: 4042 case INDEX_op_sssub_vec: 4043 case INDEX_op_ussub_vec: 4044 return vece <= MO_16; 4045 case INDEX_op_smin_vec: 4046 case INDEX_op_smax_vec: 4047 case INDEX_op_umin_vec: 4048 case INDEX_op_umax_vec: 4049 case INDEX_op_abs_vec: 4050 return vece <= MO_32 || have_avx512vl; 4051 4052 default: 4053 return 0; 4054 } 4055} 4056 4057static void expand_vec_shi(TCGType type, unsigned vece, bool right, 4058 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 4059{ 4060 uint8_t mask; 4061 4062 tcg_debug_assert(vece == MO_8); 4063 if (right) { 4064 mask = 0xff >> imm; 4065 tcg_gen_shri_vec(MO_16, v0, v1, imm); 4066 } else { 4067 mask = 0xff << imm; 4068 tcg_gen_shli_vec(MO_16, v0, v1, imm); 4069 } 4070 tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask)); 4071} 4072 4073static void expand_vec_sari(TCGType type, unsigned vece, 4074 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 4075{ 4076 TCGv_vec t1, t2; 4077 4078 switch (vece) { 4079 case MO_8: 4080 /* Unpack to 16-bit, shift, and repack. */ 4081 t1 = tcg_temp_new_vec(type); 4082 t2 = tcg_temp_new_vec(type); 4083 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 4084 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 4085 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 4086 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 4087 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8); 4088 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8); 4089 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, 4090 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 4091 tcg_temp_free_vec(t1); 4092 tcg_temp_free_vec(t2); 4093 break; 4094 4095 case MO_64: 4096 t1 = tcg_temp_new_vec(type); 4097 if (imm <= 32) { 4098 /* 4099 * We can emulate a small sign extend by performing an arithmetic 4100 * 32-bit shift and overwriting the high half of a 64-bit logical 4101 * shift. Note that the ISA says shift of 32 is valid, but TCG 4102 * does not, so we have to bound the smaller shift -- we get the 4103 * same result in the high half either way. 
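             * The 0xaa blend below keeps the odd (high) 32-bit lanes from
             * the arithmetic shift and the even (low) lanes from the 64-bit
             * logical shift, yielding a properly sign-extended result.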
4104 */ 4105 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31)); 4106 tcg_gen_shri_vec(MO_64, v0, v1, imm); 4107 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32, 4108 tcgv_vec_arg(v0), tcgv_vec_arg(v0), 4109 tcgv_vec_arg(t1), 0xaa); 4110 } else { 4111 /* Otherwise we will need to use a compare vs 0 to produce 4112 * the sign-extend, shift and merge. 4113 */ 4114 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, 4115 tcg_constant_vec(type, MO_64, 0), v1); 4116 tcg_gen_shri_vec(MO_64, v0, v1, imm); 4117 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm); 4118 tcg_gen_or_vec(MO_64, v0, v0, t1); 4119 } 4120 tcg_temp_free_vec(t1); 4121 break; 4122 4123 default: 4124 g_assert_not_reached(); 4125 } 4126} 4127 4128static void expand_vec_rotli(TCGType type, unsigned vece, 4129 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 4130{ 4131 TCGv_vec t; 4132 4133 if (vece != MO_8 && have_avx512vbmi2) { 4134 vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece, 4135 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm); 4136 return; 4137 } 4138 4139 t = tcg_temp_new_vec(type); 4140 tcg_gen_shli_vec(vece, t, v1, imm); 4141 tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm); 4142 tcg_gen_or_vec(vece, v0, v0, t); 4143 tcg_temp_free_vec(t); 4144} 4145 4146static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0, 4147 TCGv_vec v1, TCGv_vec sh, bool right) 4148{ 4149 TCGv_vec t; 4150 4151 if (have_avx512vbmi2) { 4152 vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec, 4153 type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1), 4154 tcgv_vec_arg(v1), tcgv_vec_arg(sh)); 4155 return; 4156 } 4157 4158 t = tcg_temp_new_vec(type); 4159 tcg_gen_dupi_vec(vece, t, 8 << vece); 4160 tcg_gen_sub_vec(vece, t, t, sh); 4161 if (right) { 4162 tcg_gen_shlv_vec(vece, t, v1, t); 4163 tcg_gen_shrv_vec(vece, v0, v1, sh); 4164 } else { 4165 tcg_gen_shrv_vec(vece, t, v1, t); 4166 tcg_gen_shlv_vec(vece, v0, v1, sh); 4167 } 4168 tcg_gen_or_vec(vece, v0, v0, t); 4169 tcg_temp_free_vec(t); 4170} 4171 4172static void expand_vec_rotls(TCGType type, unsigned vece, 4173 TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh) 4174{ 4175 TCGv_vec t = tcg_temp_new_vec(type); 4176 4177 tcg_debug_assert(vece != MO_8); 4178 4179 if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) { 4180 tcg_gen_dup_i32_vec(vece, t, lsh); 4181 if (vece >= MO_32) { 4182 tcg_gen_rotlv_vec(vece, v0, v1, t); 4183 } else { 4184 expand_vec_rotv(type, vece, v0, v1, t, false); 4185 } 4186 } else { 4187 TCGv_i32 rsh = tcg_temp_new_i32(); 4188 4189 tcg_gen_neg_i32(rsh, lsh); 4190 tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1); 4191 tcg_gen_shls_vec(vece, t, v1, lsh); 4192 tcg_gen_shrs_vec(vece, v0, v1, rsh); 4193 tcg_gen_or_vec(vece, v0, v0, t); 4194 4195 tcg_temp_free_i32(rsh); 4196 } 4197 4198 tcg_temp_free_vec(t); 4199} 4200 4201static void expand_vec_mul(TCGType type, unsigned vece, 4202 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2) 4203{ 4204 TCGv_vec t1, t2, t3, t4, zero; 4205 4206 tcg_debug_assert(vece == MO_8); 4207 4208 /* 4209 * Unpack v1 bytes to words, 0 | x. 4210 * Unpack v2 bytes to words, y | 0. 4211 * This leaves the 8-bit result, x * y, with 8 bits of right padding. 4212 * Shift logical right by 8 bits to clear the high 8 bytes before 4213 * using an unsigned saturated pack. 4214 * 4215 * The difference between the V64, V128 and V256 cases is merely how 4216 * we distribute the expansion between temporaries. 
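     * Per 16-bit lane: (0x00:x) * (y:0x00) == (x * y) << 8, so the logical
     * shift right by 8 leaves the low 8 bits of the product zero-extended,
     * and the unsigned saturating pack then narrows without clamping.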
4217 */ 4218 switch (type) { 4219 case TCG_TYPE_V64: 4220 t1 = tcg_temp_new_vec(TCG_TYPE_V128); 4221 t2 = tcg_temp_new_vec(TCG_TYPE_V128); 4222 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 4223 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 4224 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 4225 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 4226 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 4227 tcg_gen_mul_vec(MO_16, t1, t1, t2); 4228 tcg_gen_shri_vec(MO_16, t1, t1, 8); 4229 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, 4230 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1)); 4231 tcg_temp_free_vec(t1); 4232 tcg_temp_free_vec(t2); 4233 break; 4234 4235 case TCG_TYPE_V128: 4236 case TCG_TYPE_V256: 4237 t1 = tcg_temp_new_vec(type); 4238 t2 = tcg_temp_new_vec(type); 4239 t3 = tcg_temp_new_vec(type); 4240 t4 = tcg_temp_new_vec(type); 4241 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 4242 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 4243 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 4244 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 4245 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 4246 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 4247 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 4248 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 4249 tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 4250 tcg_gen_mul_vec(MO_16, t1, t1, t2); 4251 tcg_gen_mul_vec(MO_16, t3, t3, t4); 4252 tcg_gen_shri_vec(MO_16, t1, t1, 8); 4253 tcg_gen_shri_vec(MO_16, t3, t3, 8); 4254 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, 4255 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3)); 4256 tcg_temp_free_vec(t1); 4257 tcg_temp_free_vec(t2); 4258 tcg_temp_free_vec(t3); 4259 tcg_temp_free_vec(t4); 4260 break; 4261 4262 default: 4263 g_assert_not_reached(); 4264 } 4265} 4266 4267static TCGCond expand_vec_cond(TCGType type, unsigned vece, 4268 TCGArg *a1, TCGArg *a2, TCGCond cond) 4269{ 4270 /* 4271 * Without AVX512, there are no 64-bit unsigned comparisons. 4272 * We must bias the inputs so that they become signed. 4273 * All other swapping and inversion are handled during code generation. 4274 */ 4275 if (vece == MO_64 && !have_avx512dq && is_unsigned_cond(cond)) { 4276 TCGv_vec v1 = temp_tcgv_vec(arg_temp(*a1)); 4277 TCGv_vec v2 = temp_tcgv_vec(arg_temp(*a2)); 4278 TCGv_vec t1 = tcg_temp_new_vec(type); 4279 TCGv_vec t2 = tcg_temp_new_vec(type); 4280 TCGv_vec t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1)); 4281 4282 tcg_gen_sub_vec(vece, t1, v1, t3); 4283 tcg_gen_sub_vec(vece, t2, v2, t3); 4284 *a1 = tcgv_vec_arg(t1); 4285 *a2 = tcgv_vec_arg(t2); 4286 cond = tcg_signed_cond(cond); 4287 } 4288 return cond; 4289} 4290 4291static void expand_vec_cmp(TCGType type, unsigned vece, TCGArg a0, 4292 TCGArg a1, TCGArg a2, TCGCond cond) 4293{ 4294 cond = expand_vec_cond(type, vece, &a1, &a2, cond); 4295 /* Expand directly; do not recurse. */ 4296 vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond); 4297} 4298 4299static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGArg a0, 4300 TCGArg a1, TCGArg a2, 4301 TCGArg a3, TCGArg a4, TCGCond cond) 4302{ 4303 cond = expand_vec_cond(type, vece, &a1, &a2, cond); 4304 /* Expand directly; do not recurse. */ 4305 vec_gen_6(INDEX_op_cmpsel_vec, type, vece, a0, a1, a2, a3, a4, cond); 4306} 4307 4308void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, 4309 TCGArg a0, ...) 
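/*
 * Operands a1 and a2 are always present in the va_list; cmp_vec additionally
 * passes the comparison condition, and cmpsel_vec passes two further value
 * operands followed by the condition.
 */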
4310{ 4311 va_list va; 4312 TCGArg a1, a2, a3, a4, a5; 4313 TCGv_vec v0, v1, v2; 4314 4315 va_start(va, a0); 4316 a1 = va_arg(va, TCGArg); 4317 a2 = va_arg(va, TCGArg); 4318 v0 = temp_tcgv_vec(arg_temp(a0)); 4319 v1 = temp_tcgv_vec(arg_temp(a1)); 4320 4321 switch (opc) { 4322 case INDEX_op_shli_vec: 4323 expand_vec_shi(type, vece, false, v0, v1, a2); 4324 break; 4325 case INDEX_op_shri_vec: 4326 expand_vec_shi(type, vece, true, v0, v1, a2); 4327 break; 4328 case INDEX_op_sari_vec: 4329 expand_vec_sari(type, vece, v0, v1, a2); 4330 break; 4331 4332 case INDEX_op_rotli_vec: 4333 expand_vec_rotli(type, vece, v0, v1, a2); 4334 break; 4335 4336 case INDEX_op_rotls_vec: 4337 expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2))); 4338 break; 4339 4340 case INDEX_op_rotlv_vec: 4341 v2 = temp_tcgv_vec(arg_temp(a2)); 4342 expand_vec_rotv(type, vece, v0, v1, v2, false); 4343 break; 4344 case INDEX_op_rotrv_vec: 4345 v2 = temp_tcgv_vec(arg_temp(a2)); 4346 expand_vec_rotv(type, vece, v0, v1, v2, true); 4347 break; 4348 4349 case INDEX_op_mul_vec: 4350 v2 = temp_tcgv_vec(arg_temp(a2)); 4351 expand_vec_mul(type, vece, v0, v1, v2); 4352 break; 4353 4354 case INDEX_op_cmp_vec: 4355 a3 = va_arg(va, TCGArg); 4356 expand_vec_cmp(type, vece, a0, a1, a2, a3); 4357 break; 4358 4359 case INDEX_op_cmpsel_vec: 4360 a3 = va_arg(va, TCGArg); 4361 a4 = va_arg(va, TCGArg); 4362 a5 = va_arg(va, TCGArg); 4363 expand_vec_cmpsel(type, vece, a0, a1, a2, a3, a4, a5); 4364 break; 4365 4366 default: 4367 break; 4368 } 4369 4370 va_end(va); 4371} 4372 4373static const int tcg_target_callee_save_regs[] = { 4374#if TCG_TARGET_REG_BITS == 64 4375 TCG_REG_RBP, 4376 TCG_REG_RBX, 4377#if defined(_WIN64) 4378 TCG_REG_RDI, 4379 TCG_REG_RSI, 4380#endif 4381 TCG_REG_R12, 4382 TCG_REG_R13, 4383 TCG_REG_R14, /* Currently used for the global env. */ 4384 TCG_REG_R15, 4385#else 4386 TCG_REG_EBP, /* Currently used for the global env. */ 4387 TCG_REG_EBX, 4388 TCG_REG_ESI, 4389 TCG_REG_EDI, 4390#endif 4391}; 4392 4393/* Compute frame size via macros, to share between tcg_target_qemu_prologue 4394 and tcg_register_jit. */ 4395 4396#define PUSH_SIZE \ 4397 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \ 4398 * (TCG_TARGET_REG_BITS / 8)) 4399 4400#define FRAME_SIZE \ 4401 ((PUSH_SIZE \ 4402 + TCG_STATIC_CALL_ARGS_SIZE \ 4403 + CPU_TEMP_BUF_NLONGS * sizeof(long) \ 4404 + TCG_TARGET_STACK_ALIGN - 1) \ 4405 & ~(TCG_TARGET_STACK_ALIGN - 1)) 4406 4407/* Generate global QEMU prologue and epilogue code */ 4408static void tcg_target_qemu_prologue(TCGContext *s) 4409{ 4410 int i, stack_addend; 4411 4412 /* TB prologue */ 4413 4414 /* Reserve some stack space, also for TCG temps. */ 4415 stack_addend = FRAME_SIZE - PUSH_SIZE; 4416 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE, 4417 CPU_TEMP_BUF_NLONGS * sizeof(long)); 4418 4419 /* Save all callee saved registers. */ 4420 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) { 4421 tcg_out_push(s, tcg_target_callee_save_regs[i]); 4422 } 4423 4424 if (!tcg_use_softmmu && guest_base) { 4425 int seg = setup_guest_base_seg(); 4426 if (seg != 0) { 4427 x86_guest_base.seg = seg; 4428 } else if (guest_base == (int32_t)guest_base) { 4429 x86_guest_base.ofs = guest_base; 4430 } else { 4431 assert(TCG_TARGET_REG_BITS == 64); 4432 /* Choose R12 because, as a base, it requires a SIB byte. 
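               R12 encodes like RSP (rm = 0b100), so using it as a base
               already requires the SIB byte; reserving it for the guest
               base therefore costs the least in code size.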
*/ 4433 x86_guest_base.index = TCG_REG_R12; 4434 tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base); 4435 tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index); 4436 } 4437 } 4438 4439 if (TCG_TARGET_REG_BITS == 32) { 4440 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, 4441 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4); 4442 tcg_out_addi(s, TCG_REG_ESP, -stack_addend); 4443 /* jmp *tb. */ 4444 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP, 4445 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4 4446 + stack_addend); 4447 } else { 4448 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]); 4449 tcg_out_addi(s, TCG_REG_ESP, -stack_addend); 4450 /* jmp *tb. */ 4451 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]); 4452 } 4453 4454 /* 4455 * Return path for goto_ptr. Set return value to 0, a-la exit_tb, 4456 * and fall through to the rest of the epilogue. 4457 */ 4458 tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr); 4459 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0); 4460 4461 /* TB epilogue */ 4462 tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr); 4463 4464 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend); 4465 4466 if (have_avx2) { 4467 tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0); 4468 } 4469 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) { 4470 tcg_out_pop(s, tcg_target_callee_save_regs[i]); 4471 } 4472 tcg_out_opc(s, OPC_RET, 0, 0, 0); 4473} 4474 4475static void tcg_out_tb_start(TCGContext *s) 4476{ 4477 /* nothing to do */ 4478} 4479 4480static void tcg_out_nop_fill(tcg_insn_unit *p, int count) 4481{ 4482 memset(p, 0x90, count); 4483} 4484 4485static void tcg_target_init(TCGContext *s) 4486{ 4487 tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS; 4488 if (TCG_TARGET_REG_BITS == 64) { 4489 tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS; 4490 } 4491 if (have_avx1) { 4492 tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS; 4493 tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS; 4494 } 4495 if (have_avx2) { 4496 tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS; 4497 } 4498 4499 tcg_target_call_clobber_regs = ALL_VECTOR_REGS; 4500 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX); 4501 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX); 4502 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX); 4503 if (TCG_TARGET_REG_BITS == 64) { 4504#if !defined(_WIN64) 4505 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI); 4506 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI); 4507#endif 4508 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8); 4509 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9); 4510 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10); 4511 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11); 4512 } 4513 4514 s->reserved_regs = 0; 4515 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK); 4516 tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC); 4517#ifdef _WIN64 4518 /* These are call saved, and we don't save them, so don't use them. 
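       The Win64 calling convention makes xmm6-xmm15 non-volatile, and the
       prologue above does not spill them, so the allocator must never hand
       them out.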
*/ 4519 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6); 4520 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7); 4521 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8); 4522 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9); 4523 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10); 4524 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11); 4525 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12); 4526 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13); 4527 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14); 4528 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15); 4529#endif 4530} 4531 4532typedef struct { 4533 DebugFrameHeader h; 4534 uint8_t fde_def_cfa[4]; 4535 uint8_t fde_reg_ofs[14]; 4536} DebugFrame; 4537 4538/* We're expecting a 2 byte uleb128 encoded value. */ 4539QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14)); 4540 4541#if !defined(__ELF__) 4542 /* Host machine without ELF. */ 4543#elif TCG_TARGET_REG_BITS == 64 4544#define ELF_HOST_MACHINE EM_X86_64 4545static const DebugFrame debug_frame = { 4546 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */ 4547 .h.cie.id = -1, 4548 .h.cie.version = 1, 4549 .h.cie.code_align = 1, 4550 .h.cie.data_align = 0x78, /* sleb128 -8 */ 4551 .h.cie.return_column = 16, 4552 4553 /* Total FDE size does not include the "len" member. */ 4554 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset), 4555 4556 .fde_def_cfa = { 4557 12, 7, /* DW_CFA_def_cfa %rsp, ... */ 4558 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */ 4559 (FRAME_SIZE >> 7) 4560 }, 4561 .fde_reg_ofs = { 4562 0x90, 1, /* DW_CFA_offset, %rip, -8 */ 4563 /* The following ordering must match tcg_target_callee_save_regs. */ 4564 0x86, 2, /* DW_CFA_offset, %rbp, -16 */ 4565 0x83, 3, /* DW_CFA_offset, %rbx, -24 */ 4566 0x8c, 4, /* DW_CFA_offset, %r12, -32 */ 4567 0x8d, 5, /* DW_CFA_offset, %r13, -40 */ 4568 0x8e, 6, /* DW_CFA_offset, %r14, -48 */ 4569 0x8f, 7, /* DW_CFA_offset, %r15, -56 */ 4570 } 4571}; 4572#else 4573#define ELF_HOST_MACHINE EM_386 4574static const DebugFrame debug_frame = { 4575 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */ 4576 .h.cie.id = -1, 4577 .h.cie.version = 1, 4578 .h.cie.code_align = 1, 4579 .h.cie.data_align = 0x7c, /* sleb128 -4 */ 4580 .h.cie.return_column = 8, 4581 4582 /* Total FDE size does not include the "len" member. */ 4583 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset), 4584 4585 .fde_def_cfa = { 4586 12, 4, /* DW_CFA_def_cfa %esp, ... */ 4587 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */ 4588 (FRAME_SIZE >> 7) 4589 }, 4590 .fde_reg_ofs = { 4591 0x88, 1, /* DW_CFA_offset, %eip, -4 */ 4592 /* The following ordering must match tcg_target_callee_save_regs. */ 4593 0x85, 2, /* DW_CFA_offset, %ebp, -8 */ 4594 0x83, 3, /* DW_CFA_offset, %ebx, -12 */ 4595 0x86, 4, /* DW_CFA_offset, %esi, -16 */ 4596 0x87, 5, /* DW_CFA_offset, %edi, -20 */ 4597 } 4598}; 4599#endif 4600 4601#if defined(ELF_HOST_MACHINE) 4602void tcg_register_jit(const void *buf, size_t buf_size) 4603{ 4604 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame)); 4605} 4606#endif 4607