1/* 2 * Tiny Code Generator for QEMU 3 * 4 * Copyright (c) 2008 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25/* Used for function call generation. */ 26#define TCG_TARGET_STACK_ALIGN 16 27#if defined(_WIN64) 28#define TCG_TARGET_CALL_STACK_OFFSET 32 29#else 30#define TCG_TARGET_CALL_STACK_OFFSET 0 31#endif 32#define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_NORMAL 33#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL 34#if defined(_WIN64) 35# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_BY_REF 36# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_BY_VEC 37#elif TCG_TARGET_REG_BITS == 64 38# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_NORMAL 39# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_NORMAL 40#else 41# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_NORMAL 42# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_BY_REF 43#endif 44 45#ifdef CONFIG_DEBUG_TCG 46static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { 47#if TCG_TARGET_REG_BITS == 64 48 "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi", 49#else 50 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi", 51#endif 52 "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", 53 "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", 54#if TCG_TARGET_REG_BITS == 64 55 "%xmm8", "%xmm9", "%xmm10", "%xmm11", 56 "%xmm12", "%xmm13", "%xmm14", "%xmm15", 57#endif 58}; 59#endif 60 61static const int tcg_target_reg_alloc_order[] = { 62#if TCG_TARGET_REG_BITS == 64 63 TCG_REG_RBP, 64 TCG_REG_RBX, 65 TCG_REG_R12, 66 TCG_REG_R13, 67 TCG_REG_R14, 68 TCG_REG_R15, 69 TCG_REG_R10, 70 TCG_REG_R11, 71 TCG_REG_R9, 72 TCG_REG_R8, 73 TCG_REG_RCX, 74 TCG_REG_RDX, 75 TCG_REG_RSI, 76 TCG_REG_RDI, 77 TCG_REG_RAX, 78#else 79 TCG_REG_EBX, 80 TCG_REG_ESI, 81 TCG_REG_EDI, 82 TCG_REG_EBP, 83 TCG_REG_ECX, 84 TCG_REG_EDX, 85 TCG_REG_EAX, 86#endif 87 TCG_REG_XMM0, 88 TCG_REG_XMM1, 89 TCG_REG_XMM2, 90 TCG_REG_XMM3, 91 TCG_REG_XMM4, 92 TCG_REG_XMM5, 93#ifndef _WIN64 94 /* The Win64 ABI has xmm6-xmm15 as callee-saved, and we do not save 95 any of them. Therefore only allow xmm0-xmm5 to be allocated. 
*/ 96 TCG_REG_XMM6, 97 TCG_REG_XMM7, 98#if TCG_TARGET_REG_BITS == 64 99 TCG_REG_XMM8, 100 TCG_REG_XMM9, 101 TCG_REG_XMM10, 102 TCG_REG_XMM11, 103 TCG_REG_XMM12, 104 TCG_REG_XMM13, 105 TCG_REG_XMM14, 106 TCG_REG_XMM15, 107#endif 108#endif 109}; 110 111#define TCG_TMP_VEC TCG_REG_XMM5 112 113static const int tcg_target_call_iarg_regs[] = { 114#if TCG_TARGET_REG_BITS == 64 115#if defined(_WIN64) 116 TCG_REG_RCX, 117 TCG_REG_RDX, 118#else 119 TCG_REG_RDI, 120 TCG_REG_RSI, 121 TCG_REG_RDX, 122 TCG_REG_RCX, 123#endif 124 TCG_REG_R8, 125 TCG_REG_R9, 126#else 127 /* 32 bit mode uses stack based calling convention (GCC default). */ 128#endif 129}; 130 131static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot) 132{ 133 switch (kind) { 134 case TCG_CALL_RET_NORMAL: 135 tcg_debug_assert(slot >= 0 && slot <= 1); 136 return slot ? TCG_REG_EDX : TCG_REG_EAX; 137#ifdef _WIN64 138 case TCG_CALL_RET_BY_VEC: 139 tcg_debug_assert(slot == 0); 140 return TCG_REG_XMM0; 141#endif 142 default: 143 g_assert_not_reached(); 144 } 145} 146 147/* Constants we accept. */ 148#define TCG_CT_CONST_S32 0x100 149#define TCG_CT_CONST_U32 0x200 150#define TCG_CT_CONST_I32 0x400 151#define TCG_CT_CONST_WSZ 0x800 152#define TCG_CT_CONST_TST 0x1000 153#define TCG_CT_CONST_ZERO 0x2000 154 155/* Registers used with L constraint, which are the first argument 156 registers on x86_64, and two random call clobbered registers on 157 i386. */ 158#if TCG_TARGET_REG_BITS == 64 159# define TCG_REG_L0 tcg_target_call_iarg_regs[0] 160# define TCG_REG_L1 tcg_target_call_iarg_regs[1] 161#else 162# define TCG_REG_L0 TCG_REG_EAX 163# define TCG_REG_L1 TCG_REG_EDX 164#endif 165 166#if TCG_TARGET_REG_BITS == 64 167# define ALL_GENERAL_REGS 0x0000ffffu 168# define ALL_VECTOR_REGS 0xffff0000u 169# define ALL_BYTEL_REGS ALL_GENERAL_REGS 170#else 171# define ALL_GENERAL_REGS 0x000000ffu 172# define ALL_VECTOR_REGS 0x00ff0000u 173# define ALL_BYTEL_REGS 0x0000000fu 174#endif 175#define SOFTMMU_RESERVE_REGS \ 176 (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0) 177 178#define have_bmi2 (cpuinfo & CPUINFO_BMI2) 179#define have_lzcnt (cpuinfo & CPUINFO_LZCNT) 180 181static const tcg_insn_unit *tb_ret_addr; 182 183static bool patch_reloc(tcg_insn_unit *code_ptr, int type, 184 intptr_t value, intptr_t addend) 185{ 186 value += addend; 187 switch(type) { 188 case R_386_PC32: 189 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 190 if (value != (int32_t)value) { 191 return false; 192 } 193 /* FALLTHRU */ 194 case R_386_32: 195 tcg_patch32(code_ptr, value); 196 break; 197 case R_386_PC8: 198 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 199 if (value != (int8_t)value) { 200 return false; 201 } 202 tcg_patch8(code_ptr, value); 203 break; 204 default: 205 g_assert_not_reached(); 206 } 207 return true; 208} 209 210/* test if a constant matches the constraint */ 211static bool tcg_target_const_match(int64_t val, int ct, 212 TCGType type, TCGCond cond, int vece) 213{ 214 if (ct & TCG_CT_CONST) { 215 return 1; 216 } 217 if (type == TCG_TYPE_I32) { 218 if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | 219 TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) { 220 return 1; 221 } 222 } else { 223 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) { 224 return 1; 225 } 226 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) { 227 return 1; 228 } 229 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) { 230 return 1; 231 } 232 /* 233 * This will be used in combination with TCG_CT_CONST_S32, 234 * so "normal" TESTQ is already matched. 
Also accept: 235 * TESTQ -> TESTL (uint32_t) 236 * TESTQ -> BT (is_power_of_2) 237 */ 238 if ((ct & TCG_CT_CONST_TST) 239 && is_tst_cond(cond) 240 && (val == (uint32_t)val || is_power_of_2(val))) { 241 return 1; 242 } 243 } 244 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) { 245 return 1; 246 } 247 if ((ct & TCG_CT_CONST_ZERO) && val == 0) { 248 return 1; 249 } 250 return 0; 251} 252 253# define LOWREGMASK(x) ((x) & 7) 254 255#define P_EXT 0x100 /* 0x0f opcode prefix */ 256#define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */ 257#define P_DATA16 0x400 /* 0x66 opcode prefix */ 258#define P_VEXW 0x1000 /* Set VEX.W = 1 */ 259#if TCG_TARGET_REG_BITS == 64 260# define P_REXW P_VEXW /* Set REX.W = 1; match VEXW */ 261# define P_REXB_R 0x2000 /* REG field as byte register */ 262# define P_REXB_RM 0x4000 /* R/M field as byte register */ 263# define P_GS 0x8000 /* gs segment override */ 264#else 265# define P_REXW 0 266# define P_REXB_R 0 267# define P_REXB_RM 0 268# define P_GS 0 269#endif 270#define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */ 271#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */ 272#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */ 273#define P_VEXL 0x80000 /* Set VEX.L = 1 */ 274#define P_EVEX 0x100000 /* Requires EVEX encoding */ 275 276#define OPC_ARITH_EbIb (0x80) 277#define OPC_ARITH_EvIz (0x81) 278#define OPC_ARITH_EvIb (0x83) 279#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */ 280#define OPC_ANDN (0xf2 | P_EXT38) 281#define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3)) 282#define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3)) 283#define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16) 284#define OPC_BSF (0xbc | P_EXT) 285#define OPC_BSR (0xbd | P_EXT) 286#define OPC_BSWAP (0xc8 | P_EXT) 287#define OPC_CALL_Jz (0xe8) 288#define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */ 289#define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3)) 290#define OPC_DEC_r32 (0x48) 291#define OPC_IMUL_GvEv (0xaf | P_EXT) 292#define OPC_IMUL_GvEvIb (0x6b) 293#define OPC_IMUL_GvEvIz (0x69) 294#define OPC_INC_r32 (0x40) 295#define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */ 296#define OPC_JCC_short (0x70) /* ... 
plus condition code */ 297#define OPC_JMP_long (0xe9) 298#define OPC_JMP_short (0xeb) 299#define OPC_LEA (0x8d) 300#define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3) 301#define OPC_MOVB_EvGv (0x88) /* stores, more or less */ 302#define OPC_MOVL_EvGv (0x89) /* stores, more or less */ 303#define OPC_MOVL_GvEv (0x8b) /* loads, more or less */ 304#define OPC_MOVB_EvIz (0xc6) 305#define OPC_MOVL_EvIz (0xc7) 306#define OPC_MOVB_Ib (0xb0) 307#define OPC_MOVL_Iv (0xb8) 308#define OPC_MOVBE_GyMy (0xf0 | P_EXT38) 309#define OPC_MOVBE_MyGy (0xf1 | P_EXT38) 310#define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16) 311#define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16) 312#define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2) 313#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16) 314#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16) 315#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3) 316#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3) 317#define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3) 318#define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16) 319#define OPC_MOVSBL (0xbe | P_EXT) 320#define OPC_MOVSWL (0xbf | P_EXT) 321#define OPC_MOVSLQ (0x63 | P_REXW) 322#define OPC_MOVZBL (0xb6 | P_EXT) 323#define OPC_MOVZWL (0xb7 | P_EXT) 324#define OPC_PABSB (0x1c | P_EXT38 | P_DATA16) 325#define OPC_PABSW (0x1d | P_EXT38 | P_DATA16) 326#define OPC_PABSD (0x1e | P_EXT38 | P_DATA16) 327#define OPC_VPABSQ (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 328#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16) 329#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16) 330#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16) 331#define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16) 332#define OPC_PADDB (0xfc | P_EXT | P_DATA16) 333#define OPC_PADDW (0xfd | P_EXT | P_DATA16) 334#define OPC_PADDD (0xfe | P_EXT | P_DATA16) 335#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16) 336#define OPC_PADDSB (0xec | P_EXT | P_DATA16) 337#define OPC_PADDSW (0xed | P_EXT | P_DATA16) 338#define OPC_PADDUB (0xdc | P_EXT | P_DATA16) 339#define OPC_PADDUW (0xdd | P_EXT | P_DATA16) 340#define OPC_PAND (0xdb | P_EXT | P_DATA16) 341#define OPC_PANDN (0xdf | P_EXT | P_DATA16) 342#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16) 343#define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16) 344#define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16) 345#define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16) 346#define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16) 347#define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16) 348#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16) 349#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16) 350#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16) 351#define OPC_PEXTRD (0x16 | P_EXT3A | P_DATA16) 352#define OPC_PINSRD (0x22 | P_EXT3A | P_DATA16) 353#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16) 354#define OPC_PMAXSW (0xee | P_EXT | P_DATA16) 355#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16) 356#define OPC_VPMAXSQ (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 357#define OPC_PMAXUB (0xde | P_EXT | P_DATA16) 358#define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16) 359#define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16) 360#define OPC_VPMAXUQ (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 361#define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16) 362#define OPC_PMINSW (0xea | P_EXT | P_DATA16) 363#define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16) 364#define OPC_VPMINSQ (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 365#define OPC_PMINUB (0xda | P_EXT | P_DATA16) 366#define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16) 367#define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16) 368#define OPC_VPMINUQ (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 369#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16) 
370#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16) 371#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16) 372#define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16) 373#define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16) 374#define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16) 375#define OPC_PMULLW (0xd5 | P_EXT | P_DATA16) 376#define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16) 377#define OPC_VPMULLQ (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 378#define OPC_POR (0xeb | P_EXT | P_DATA16) 379#define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16) 380#define OPC_PSHUFD (0x70 | P_EXT | P_DATA16) 381#define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2) 382#define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3) 383#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */ 384#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */ 385#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */ 386#define OPC_PSLLW (0xf1 | P_EXT | P_DATA16) 387#define OPC_PSLLD (0xf2 | P_EXT | P_DATA16) 388#define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16) 389#define OPC_PSRAW (0xe1 | P_EXT | P_DATA16) 390#define OPC_PSRAD (0xe2 | P_EXT | P_DATA16) 391#define OPC_VPSRAQ (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX) 392#define OPC_PSRLW (0xd1 | P_EXT | P_DATA16) 393#define OPC_PSRLD (0xd2 | P_EXT | P_DATA16) 394#define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16) 395#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16) 396#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16) 397#define OPC_PSUBD (0xfa | P_EXT | P_DATA16) 398#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16) 399#define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16) 400#define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16) 401#define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16) 402#define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16) 403#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16) 404#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16) 405#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16) 406#define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16) 407#define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16) 408#define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16) 409#define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16) 410#define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16) 411#define OPC_PXOR (0xef | P_EXT | P_DATA16) 412#define OPC_POP_r32 (0x58) 413#define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3) 414#define OPC_PUSH_r32 (0x50) 415#define OPC_PUSH_Iv (0x68) 416#define OPC_PUSH_Ib (0x6a) 417#define OPC_RET (0xc3) 418#define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... 
plus cc */ 419#define OPC_SHIFT_1 (0xd1) 420#define OPC_SHIFT_Ib (0xc1) 421#define OPC_SHIFT_cl (0xd3) 422#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3) 423#define OPC_SHUFPS (0xc6 | P_EXT) 424#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16) 425#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2) 426#define OPC_SHRD_Ib (0xac | P_EXT) 427#define OPC_TESTB (0x84) 428#define OPC_TESTL (0x85) 429#define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3) 430#define OPC_UD2 (0x0b | P_EXT) 431#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16) 432#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16) 433#define OPC_VPBLENDMB (0x66 | P_EXT38 | P_DATA16 | P_EVEX) 434#define OPC_VPBLENDMW (0x66 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 435#define OPC_VPBLENDMD (0x64 | P_EXT38 | P_DATA16 | P_EVEX) 436#define OPC_VPBLENDMQ (0x64 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 437#define OPC_VPCMPB (0x3f | P_EXT3A | P_DATA16 | P_EVEX) 438#define OPC_VPCMPUB (0x3e | P_EXT3A | P_DATA16 | P_EVEX) 439#define OPC_VPCMPW (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 440#define OPC_VPCMPUW (0x3e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 441#define OPC_VPCMPD (0x1f | P_EXT3A | P_DATA16 | P_EVEX) 442#define OPC_VPCMPUD (0x1e | P_EXT3A | P_DATA16 | P_EVEX) 443#define OPC_VPCMPQ (0x1f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 444#define OPC_VPCMPUQ (0x1e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 445#define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16) 446#define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16) 447#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16) 448#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16) 449#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16) 450#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16) 451#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16) 452#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16) 453#define OPC_VPMOVM2B (0x28 | P_EXT38 | P_SIMDF3 | P_EVEX) 454#define OPC_VPMOVM2W (0x28 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 455#define OPC_VPMOVM2D (0x38 | P_EXT38 | P_SIMDF3 | P_EVEX) 456#define OPC_VPMOVM2Q (0x38 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 457#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW) 458#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL) 459#define OPC_VPROLVD (0x15 | P_EXT38 | P_DATA16 | P_EVEX) 460#define OPC_VPROLVQ (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 461#define OPC_VPRORVD (0x14 | P_EXT38 | P_DATA16 | P_EVEX) 462#define OPC_VPRORVQ (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 463#define OPC_VPSHLDW (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 464#define OPC_VPSHLDD (0x71 | P_EXT3A | P_DATA16 | P_EVEX) 465#define OPC_VPSHLDQ (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 466#define OPC_VPSHLDVW (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 467#define OPC_VPSHLDVD (0x71 | P_EXT38 | P_DATA16 | P_EVEX) 468#define OPC_VPSHLDVQ (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 469#define OPC_VPSHRDVW (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 470#define OPC_VPSHRDVD (0x73 | P_EXT38 | P_DATA16 | P_EVEX) 471#define OPC_VPSHRDVQ (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 472#define OPC_VPSLLVW (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 473#define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16) 474#define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW) 475#define OPC_VPSRAVW (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 476#define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16) 477#define OPC_VPSRAVQ (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 478#define OPC_VPSRLVW (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 479#define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16) 480#define OPC_VPSRLVQ (0x45 | P_EXT38 | 
P_DATA16 | P_VEXW) 481#define OPC_VPTERNLOGQ (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 482#define OPC_VPTESTMB (0x26 | P_EXT38 | P_DATA16 | P_EVEX) 483#define OPC_VPTESTMW (0x26 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 484#define OPC_VPTESTMD (0x27 | P_EXT38 | P_DATA16 | P_EVEX) 485#define OPC_VPTESTMQ (0x27 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 486#define OPC_VPTESTNMB (0x26 | P_EXT38 | P_SIMDF3 | P_EVEX) 487#define OPC_VPTESTNMW (0x26 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 488#define OPC_VPTESTNMD (0x27 | P_EXT38 | P_SIMDF3 | P_EVEX) 489#define OPC_VPTESTNMQ (0x27 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 490#define OPC_VZEROUPPER (0x77 | P_EXT) 491#define OPC_XCHG_ax_r32 (0x90) 492#define OPC_XCHG_EvGv (0x87) 493 494#define OPC_GRP3_Eb (0xf6) 495#define OPC_GRP3_Ev (0xf7) 496#define OPC_GRP5 (0xff) 497#define OPC_GRP14 (0x73 | P_EXT | P_DATA16) 498#define OPC_GRPBT (0xba | P_EXT) 499 500#define OPC_GRPBT_BT 4 501#define OPC_GRPBT_BTS 5 502#define OPC_GRPBT_BTR 6 503#define OPC_GRPBT_BTC 7 504 505/* Group 1 opcode extensions for 0x80-0x83. 506 These are also used as modifiers for OPC_ARITH. */ 507#define ARITH_ADD 0 508#define ARITH_OR 1 509#define ARITH_ADC 2 510#define ARITH_SBB 3 511#define ARITH_AND 4 512#define ARITH_SUB 5 513#define ARITH_XOR 6 514#define ARITH_CMP 7 515 516/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */ 517#define SHIFT_ROL 0 518#define SHIFT_ROR 1 519#define SHIFT_SHL 4 520#define SHIFT_SHR 5 521#define SHIFT_SAR 7 522 523/* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */ 524#define EXT3_TESTi 0 525#define EXT3_NOT 2 526#define EXT3_NEG 3 527#define EXT3_MUL 4 528#define EXT3_IMUL 5 529#define EXT3_DIV 6 530#define EXT3_IDIV 7 531 532/* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */ 533#define EXT5_INC_Ev 0 534#define EXT5_DEC_Ev 1 535#define EXT5_CALLN_Ev 2 536#define EXT5_JMPN_Ev 4 537 538/* Condition codes to be added to OPC_JCC_{long,short}. */ 539#define JCC_JMP (-1) 540#define JCC_JO 0x0 541#define JCC_JNO 0x1 542#define JCC_JB 0x2 543#define JCC_JAE 0x3 544#define JCC_JE 0x4 545#define JCC_JNE 0x5 546#define JCC_JBE 0x6 547#define JCC_JA 0x7 548#define JCC_JS 0x8 549#define JCC_JNS 0x9 550#define JCC_JP 0xa 551#define JCC_JNP 0xb 552#define JCC_JL 0xc 553#define JCC_JGE 0xd 554#define JCC_JLE 0xe 555#define JCC_JG 0xf 556 557static const uint8_t tcg_cond_to_jcc[] = { 558 [TCG_COND_EQ] = JCC_JE, 559 [TCG_COND_NE] = JCC_JNE, 560 [TCG_COND_LT] = JCC_JL, 561 [TCG_COND_GE] = JCC_JGE, 562 [TCG_COND_LE] = JCC_JLE, 563 [TCG_COND_GT] = JCC_JG, 564 [TCG_COND_LTU] = JCC_JB, 565 [TCG_COND_GEU] = JCC_JAE, 566 [TCG_COND_LEU] = JCC_JBE, 567 [TCG_COND_GTU] = JCC_JA, 568 [TCG_COND_TSTEQ] = JCC_JE, 569 [TCG_COND_TSTNE] = JCC_JNE, 570}; 571 572#if TCG_TARGET_REG_BITS == 64 573static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x) 574{ 575 int rex; 576 577 if (opc & P_GS) { 578 tcg_out8(s, 0x65); 579 } 580 if (opc & P_DATA16) { 581 /* We should never be asking for both 16 and 64-bit operation. */ 582 tcg_debug_assert((opc & P_REXW) == 0); 583 tcg_out8(s, 0x66); 584 } 585 if (opc & P_SIMDF3) { 586 tcg_out8(s, 0xf3); 587 } else if (opc & P_SIMDF2) { 588 tcg_out8(s, 0xf2); 589 } 590 591 rex = 0; 592 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */ 593 rex |= (r & 8) >> 1; /* REX.R */ 594 rex |= (x & 8) >> 2; /* REX.X */ 595 rex |= (rm & 8) >> 3; /* REX.B */ 596 597 /* P_REXB_{R,RM} indicates that the given register is the low byte. 
598 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do, 599 as otherwise the encoding indicates %[abcd]h. Note that the values 600 that are ORed in merely indicate that the REX byte must be present; 601 those bits get discarded in output. */ 602 rex |= opc & (r >= 4 ? P_REXB_R : 0); 603 rex |= opc & (rm >= 4 ? P_REXB_RM : 0); 604 605 if (rex) { 606 tcg_out8(s, (uint8_t)(rex | 0x40)); 607 } 608 609 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 610 tcg_out8(s, 0x0f); 611 if (opc & P_EXT38) { 612 tcg_out8(s, 0x38); 613 } else if (opc & P_EXT3A) { 614 tcg_out8(s, 0x3a); 615 } 616 } 617 618 tcg_out8(s, opc); 619} 620#else 621static void tcg_out_opc(TCGContext *s, int opc) 622{ 623 if (opc & P_DATA16) { 624 tcg_out8(s, 0x66); 625 } 626 if (opc & P_SIMDF3) { 627 tcg_out8(s, 0xf3); 628 } else if (opc & P_SIMDF2) { 629 tcg_out8(s, 0xf2); 630 } 631 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 632 tcg_out8(s, 0x0f); 633 if (opc & P_EXT38) { 634 tcg_out8(s, 0x38); 635 } else if (opc & P_EXT3A) { 636 tcg_out8(s, 0x3a); 637 } 638 } 639 tcg_out8(s, opc); 640} 641/* Discard the register arguments to tcg_out_opc early, so as not to penalize 642 the 32-bit compilation paths. This method works with all versions of gcc, 643 whereas relying on optimization may not be able to exclude them. */ 644#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc) 645#endif 646 647static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) 648{ 649 tcg_out_opc(s, opc, r, rm, 0); 650 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 651} 652 653static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v, 654 int rm, int index) 655{ 656 int tmp; 657 658 if (opc & P_GS) { 659 tcg_out8(s, 0x65); 660 } 661 /* Use the two byte form if possible, which cannot encode 662 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */ 663 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT 664 && ((rm | index) & 8) == 0) { 665 /* Two byte VEX prefix. */ 666 tcg_out8(s, 0xc5); 667 668 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ 669 } else { 670 /* Three byte VEX prefix. */ 671 tcg_out8(s, 0xc4); 672 673 /* VEX.m-mmmm */ 674 if (opc & P_EXT3A) { 675 tmp = 3; 676 } else if (opc & P_EXT38) { 677 tmp = 2; 678 } else if (opc & P_EXT) { 679 tmp = 1; 680 } else { 681 g_assert_not_reached(); 682 } 683 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ 684 tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */ 685 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ 686 tcg_out8(s, tmp); 687 688 tmp = (opc & P_VEXW ? 0x80 : 0); /* VEX.W */ 689 } 690 691 tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */ 692 /* VEX.pp */ 693 if (opc & P_DATA16) { 694 tmp |= 1; /* 0x66 */ 695 } else if (opc & P_SIMDF3) { 696 tmp |= 2; /* 0xf3 */ 697 } else if (opc & P_SIMDF2) { 698 tmp |= 3; /* 0xf2 */ 699 } 700 tmp |= (~v & 15) << 3; /* VEX.vvvv */ 701 tcg_out8(s, tmp); 702 tcg_out8(s, opc); 703} 704 705static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v, 706 int rm, int index, int aaa, bool z) 707{ 708 /* The entire 4-byte evex prefix; with R' and V' set. 
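   For reference: byte 0 is the 0x62 escape byte; the preset bits in 0x08041062
   are the mandatory '1' in P1 (bit 18) and the inverted R' (bit 12) and V' (bit 27)
   fields, which stay set because TCG never uses registers above 15.  The
   deposit32() calls below fill in mm, B/X/R, pp, vvvv, W, aaa, L'L and z.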
*/ 709 uint32_t p = 0x08041062; 710 int mm, pp; 711 712 tcg_debug_assert(have_avx512vl); 713 714 /* EVEX.mm */ 715 if (opc & P_EXT3A) { 716 mm = 3; 717 } else if (opc & P_EXT38) { 718 mm = 2; 719 } else if (opc & P_EXT) { 720 mm = 1; 721 } else { 722 g_assert_not_reached(); 723 } 724 725 /* EVEX.pp */ 726 if (opc & P_DATA16) { 727 pp = 1; /* 0x66 */ 728 } else if (opc & P_SIMDF3) { 729 pp = 2; /* 0xf3 */ 730 } else if (opc & P_SIMDF2) { 731 pp = 3; /* 0xf2 */ 732 } else { 733 pp = 0; 734 } 735 736 p = deposit32(p, 8, 2, mm); 737 p = deposit32(p, 13, 1, (rm & 8) == 0); /* EVEX.RXB.B */ 738 p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */ 739 p = deposit32(p, 15, 1, (r & 8) == 0); /* EVEX.RXB.R */ 740 p = deposit32(p, 16, 2, pp); 741 p = deposit32(p, 19, 4, ~v); 742 p = deposit32(p, 23, 1, (opc & P_VEXW) != 0); 743 p = deposit32(p, 24, 3, aaa); 744 p = deposit32(p, 29, 2, (opc & P_VEXL) != 0); 745 p = deposit32(p, 31, 1, z); 746 747 tcg_out32(s, p); 748 tcg_out8(s, opc); 749} 750 751static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) 752{ 753 if (opc & P_EVEX) { 754 tcg_out_evex_opc(s, opc, r, v, rm, 0, 0, false); 755 } else { 756 tcg_out_vex_opc(s, opc, r, v, rm, 0); 757 } 758 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 759} 760 761static void tcg_out_vex_modrm_type(TCGContext *s, int opc, 762 int r, int v, int rm, TCGType type) 763{ 764 if (type == TCG_TYPE_V256) { 765 opc |= P_VEXL; 766 } 767 tcg_out_vex_modrm(s, opc, r, v, rm); 768} 769 770static void tcg_out_evex_modrm_type(TCGContext *s, int opc, int r, int v, 771 int rm, int aaa, bool z, TCGType type) 772{ 773 if (type == TCG_TYPE_V256) { 774 opc |= P_VEXL; 775 } 776 tcg_out_evex_opc(s, opc, r, v, rm, 0, aaa, z); 777 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 778} 779 780/* Output an opcode with a full "rm + (index<<shift) + offset" address mode. 781 We handle either RM and INDEX missing with a negative value. In 64-bit 782 mode for absolute addresses, ~RM is the size of the immediate operand 783 that will follow the instruction. */ 784 785static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index, 786 int shift, intptr_t offset) 787{ 788 int mod, len; 789 790 if (index < 0 && rm < 0) { 791 if (TCG_TARGET_REG_BITS == 64) { 792 /* Try for a rip-relative addressing mode. This has replaced 793 the 32-bit-mode absolute addressing encoding. */ 794 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm; 795 intptr_t disp = offset - pc; 796 if (disp == (int32_t)disp) { 797 tcg_out8(s, (LOWREGMASK(r) << 3) | 5); 798 tcg_out32(s, disp); 799 return; 800 } 801 802 /* Try for an absolute address encoding. This requires the 803 use of the MODRM+SIB encoding and is therefore larger than 804 rip-relative addressing. */ 805 if (offset == (int32_t)offset) { 806 tcg_out8(s, (LOWREGMASK(r) << 3) | 4); 807 tcg_out8(s, (4 << 3) | 5); 808 tcg_out32(s, offset); 809 return; 810 } 811 812 /* ??? The memory isn't directly addressable. */ 813 g_assert_not_reached(); 814 } else { 815 /* Absolute address. */ 816 tcg_out8(s, (r << 3) | 5); 817 tcg_out32(s, offset); 818 return; 819 } 820 } 821 822 /* Find the length of the immediate addend. Note that the encoding 823 that would be used for (%ebp) indicates absolute addressing. 
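   That is, mod=00 with rm=101 selects disp32 (rip-relative in 64-bit mode)
   with no base register, so a base of %ebp or %r13 always needs an explicit
   displacement byte, even when the offset is zero.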
*/ 824 if (rm < 0) { 825 mod = 0, len = 4, rm = 5; 826 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) { 827 mod = 0, len = 0; 828 } else if (offset == (int8_t)offset) { 829 mod = 0x40, len = 1; 830 } else { 831 mod = 0x80, len = 4; 832 } 833 834 /* Use a single byte MODRM format if possible. Note that the encoding 835 that would be used for %esp is the escape to the two byte form. */ 836 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) { 837 /* Single byte MODRM format. */ 838 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 839 } else { 840 /* Two byte MODRM+SIB format. */ 841 842 /* Note that the encoding that would place %esp into the index 843 field indicates no index register. In 64-bit mode, the REX.X 844 bit counts, so %r12 can be used as the index. */ 845 if (index < 0) { 846 index = 4; 847 } else { 848 tcg_debug_assert(index != TCG_REG_ESP); 849 } 850 851 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4); 852 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm)); 853 } 854 855 if (len == 1) { 856 tcg_out8(s, offset); 857 } else if (len == 4) { 858 tcg_out32(s, offset); 859 } 860} 861 862static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, 863 int index, int shift, intptr_t offset) 864{ 865 tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 866 tcg_out_sib_offset(s, r, rm, index, shift, offset); 867} 868 869static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v, 870 int rm, int index, int shift, 871 intptr_t offset) 872{ 873 tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 874 tcg_out_sib_offset(s, r, rm, index, shift, offset); 875} 876 877/* A simplification of the above with no index or shift. */ 878static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, 879 int rm, intptr_t offset) 880{ 881 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset); 882} 883 884static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r, 885 int v, int rm, intptr_t offset) 886{ 887 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset); 888} 889 890/* Output an opcode with an expected reference to the constant pool. */ 891static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r) 892{ 893 tcg_out_opc(s, opc, r, 0, 0); 894 /* Absolute for 32-bit, pc-relative for 64-bit. */ 895 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 896 tcg_out32(s, 0); 897} 898 899/* Output an opcode with an expected reference to the constant pool. */ 900static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r) 901{ 902 tcg_out_vex_opc(s, opc, r, 0, 0, 0); 903 /* Absolute for 32-bit, pc-relative for 64-bit. */ 904 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 905 tcg_out32(s, 0); 906} 907 908/* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */ 909static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) 910{ 911 /* Propagate an opcode prefix, such as P_REXW. 
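   For example, tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_RAX, TCG_REG_RBX)
   resolves to OPC_ARITH_GvEv + (ARITH_ADD << 3) + P_REXW and emits 48 03 c3,
   i.e. addq %rbx, %rax.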
*/ 912 int ext = subop & ~0x7; 913 subop &= 0x7; 914 915 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src); 916} 917 918static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) 919{ 920 int rexw = 0; 921 922 if (arg == ret) { 923 return true; 924 } 925 switch (type) { 926 case TCG_TYPE_I64: 927 rexw = P_REXW; 928 /* fallthru */ 929 case TCG_TYPE_I32: 930 if (ret < 16) { 931 if (arg < 16) { 932 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg); 933 } else { 934 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret); 935 } 936 } else { 937 if (arg < 16) { 938 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg); 939 } else { 940 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 941 } 942 } 943 break; 944 945 case TCG_TYPE_V64: 946 tcg_debug_assert(ret >= 16 && arg >= 16); 947 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 948 break; 949 case TCG_TYPE_V128: 950 tcg_debug_assert(ret >= 16 && arg >= 16); 951 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg); 952 break; 953 case TCG_TYPE_V256: 954 tcg_debug_assert(ret >= 16 && arg >= 16); 955 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg); 956 break; 957 958 default: 959 g_assert_not_reached(); 960 } 961 return true; 962} 963 964static const int avx2_dup_insn[4] = { 965 OPC_VPBROADCASTB, OPC_VPBROADCASTW, 966 OPC_VPBROADCASTD, OPC_VPBROADCASTQ, 967}; 968 969static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, 970 TCGReg r, TCGReg a) 971{ 972 if (have_avx2) { 973 tcg_out_vex_modrm_type(s, avx2_dup_insn[vece], r, 0, a, type); 974 } else { 975 switch (vece) { 976 case MO_8: 977 /* ??? With zero in a register, use PSHUFB. */ 978 tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a); 979 a = r; 980 /* FALLTHRU */ 981 case MO_16: 982 tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a); 983 a = r; 984 /* FALLTHRU */ 985 case MO_32: 986 tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a); 987 /* imm8 operand: all output lanes selected from input lane 0. */ 988 tcg_out8(s, 0); 989 break; 990 case MO_64: 991 tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a); 992 break; 993 default: 994 g_assert_not_reached(); 995 } 996 } 997 return true; 998} 999 1000static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece, 1001 TCGReg r, TCGReg base, intptr_t offset) 1002{ 1003 if (have_avx2) { 1004 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0); 1005 tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l, 1006 r, 0, base, offset); 1007 } else { 1008 switch (vece) { 1009 case MO_64: 1010 tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset); 1011 break; 1012 case MO_32: 1013 tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset); 1014 break; 1015 case MO_16: 1016 tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset); 1017 tcg_out8(s, 0); /* imm8 */ 1018 tcg_out_dup_vec(s, type, vece, r, r); 1019 break; 1020 case MO_8: 1021 tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset); 1022 tcg_out8(s, 0); /* imm8 */ 1023 tcg_out_dup_vec(s, type, vece, r, r); 1024 break; 1025 default: 1026 g_assert_not_reached(); 1027 } 1028 } 1029 return true; 1030} 1031 1032static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, 1033 TCGReg ret, int64_t arg) 1034{ 1035 int vex_l = (type == TCG_TYPE_V256 ? 
P_VEXL : 0); 1036 1037 if (arg == 0) { 1038 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 1039 return; 1040 } 1041 if (arg == -1) { 1042 tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret); 1043 return; 1044 } 1045 1046 if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) { 1047 if (have_avx2) { 1048 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret); 1049 } else { 1050 tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret); 1051 } 1052 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 1053 } else { 1054 if (type == TCG_TYPE_V64) { 1055 tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret); 1056 } else if (have_avx2) { 1057 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret); 1058 } else { 1059 tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret); 1060 } 1061 if (TCG_TARGET_REG_BITS == 64) { 1062 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 1063 } else { 1064 new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32); 1065 } 1066 } 1067} 1068 1069static void tcg_out_movi_vec(TCGContext *s, TCGType type, 1070 TCGReg ret, tcg_target_long arg) 1071{ 1072 if (arg == 0) { 1073 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 1074 return; 1075 } 1076 if (arg == -1) { 1077 tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret); 1078 return; 1079 } 1080 1081 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 1082 tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret); 1083 if (TCG_TARGET_REG_BITS == 64) { 1084 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 1085 } else { 1086 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 1087 } 1088} 1089 1090static void tcg_out_movi_int(TCGContext *s, TCGType type, 1091 TCGReg ret, tcg_target_long arg) 1092{ 1093 tcg_target_long diff; 1094 1095 if (arg == 0) { 1096 tgen_arithr(s, ARITH_XOR, ret, ret); 1097 return; 1098 } 1099 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) { 1100 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0); 1101 tcg_out32(s, arg); 1102 return; 1103 } 1104 if (arg == (int32_t)arg) { 1105 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret); 1106 tcg_out32(s, arg); 1107 return; 1108 } 1109 1110 /* Try a 7 byte pc-relative lea before the 10 byte movq. */ 1111 diff = tcg_pcrel_diff(s, (const void *)arg) - 7; 1112 if (diff == (int32_t)diff) { 1113 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0); 1114 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5); 1115 tcg_out32(s, diff); 1116 return; 1117 } 1118 1119 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0); 1120 tcg_out64(s, arg); 1121} 1122 1123static void tcg_out_movi(TCGContext *s, TCGType type, 1124 TCGReg ret, tcg_target_long arg) 1125{ 1126 switch (type) { 1127 case TCG_TYPE_I32: 1128#if TCG_TARGET_REG_BITS == 64 1129 case TCG_TYPE_I64: 1130#endif 1131 if (ret < 16) { 1132 tcg_out_movi_int(s, type, ret, arg); 1133 } else { 1134 tcg_out_movi_vec(s, type, ret, arg); 1135 } 1136 break; 1137 default: 1138 g_assert_not_reached(); 1139 } 1140} 1141 1142static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2) 1143{ 1144 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1145 tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2); 1146 return true; 1147} 1148 1149static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs, 1150 tcg_target_long imm) 1151{ 1152 /* This function is only used for passing structs by reference. 
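   It emits a single LEA, computing rd = rs + imm without touching the flags.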
*/ 1153 tcg_debug_assert(imm == (int32_t)imm); 1154 tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm); 1155} 1156 1157static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val) 1158{ 1159 if (val == (int8_t)val) { 1160 tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0); 1161 tcg_out8(s, val); 1162 } else if (val == (int32_t)val) { 1163 tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0); 1164 tcg_out32(s, val); 1165 } else { 1166 g_assert_not_reached(); 1167 } 1168} 1169 1170static inline void tcg_out_mb(TCGContext *s, TCGArg a0) 1171{ 1172 /* Given the strength of x86 memory ordering, we only need care for 1173 store-load ordering. Experimentally, "lock orl $0,0(%esp)" is 1174 faster than "mfence", so don't bother with the sse insn. */ 1175 if (a0 & TCG_MO_ST_LD) { 1176 tcg_out8(s, 0xf0); 1177 tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0); 1178 tcg_out8(s, 0); 1179 } 1180} 1181 1182static inline void tcg_out_push(TCGContext *s, int reg) 1183{ 1184 tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0); 1185} 1186 1187static inline void tcg_out_pop(TCGContext *s, int reg) 1188{ 1189 tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0); 1190} 1191 1192static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, 1193 TCGReg arg1, intptr_t arg2) 1194{ 1195 switch (type) { 1196 case TCG_TYPE_I32: 1197 if (ret < 16) { 1198 tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2); 1199 } else { 1200 tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2); 1201 } 1202 break; 1203 case TCG_TYPE_I64: 1204 if (ret < 16) { 1205 tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2); 1206 break; 1207 } 1208 /* FALLTHRU */ 1209 case TCG_TYPE_V64: 1210 /* There is no instruction that can validate 8-byte alignment. */ 1211 tcg_debug_assert(ret >= 16); 1212 tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2); 1213 break; 1214 case TCG_TYPE_V128: 1215 /* 1216 * The gvec infrastructure asserts that v128 vector loads 1217 * and stores use a 16-byte aligned offset. Validate that the 1218 * final pointer is aligned by using an insn that will SIGSEGV. 1219 */ 1220 tcg_debug_assert(ret >= 16); 1221 tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2); 1222 break; 1223 case TCG_TYPE_V256: 1224 /* 1225 * The gvec infrastructure only requires 16-byte alignment, 1226 * so here we must use an unaligned load. 1227 */ 1228 tcg_debug_assert(ret >= 16); 1229 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL, 1230 ret, 0, arg1, arg2); 1231 break; 1232 default: 1233 g_assert_not_reached(); 1234 } 1235} 1236 1237static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, 1238 TCGReg arg1, intptr_t arg2) 1239{ 1240 switch (type) { 1241 case TCG_TYPE_I32: 1242 if (arg < 16) { 1243 tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2); 1244 } else { 1245 tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2); 1246 } 1247 break; 1248 case TCG_TYPE_I64: 1249 if (arg < 16) { 1250 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2); 1251 break; 1252 } 1253 /* FALLTHRU */ 1254 case TCG_TYPE_V64: 1255 /* There is no instruction that can validate 8-byte alignment. */ 1256 tcg_debug_assert(arg >= 16); 1257 tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2); 1258 break; 1259 case TCG_TYPE_V128: 1260 /* 1261 * The gvec infrastructure asserts that v128 vector loads 1262 * and stores use a 16-byte aligned offset. Validate that the 1263 * final pointer is aligned by using an insn that will SIGSEGV. 
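 * (MOVDQA faults on a misaligned operand, whereas MOVDQU would silently succeed.)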
1264 * 1265 * This specific instance is also used by TCG_CALL_RET_BY_VEC, 1266 * for _WIN64, which must have SSE2 but may not have AVX. 1267 */ 1268 tcg_debug_assert(arg >= 16); 1269 if (have_avx1) { 1270 tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2); 1271 } else { 1272 tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2); 1273 } 1274 break; 1275 case TCG_TYPE_V256: 1276 /* 1277 * The gvec infrastructure only requires 16-byte alignment, 1278 * so here we must use an unaligned store. 1279 */ 1280 tcg_debug_assert(arg >= 16); 1281 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL, 1282 arg, 0, arg1, arg2); 1283 break; 1284 default: 1285 g_assert_not_reached(); 1286 } 1287} 1288 1289static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, 1290 TCGReg base, intptr_t ofs) 1291{ 1292 int rexw = 0; 1293 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) { 1294 if (val != (int32_t)val) { 1295 return false; 1296 } 1297 rexw = P_REXW; 1298 } else if (type != TCG_TYPE_I32) { 1299 return false; 1300 } 1301 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs); 1302 tcg_out32(s, val); 1303 return true; 1304} 1305 1306static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count) 1307{ 1308 /* Propagate an opcode prefix, such as P_DATA16. */ 1309 int ext = subopc & ~0x7; 1310 subopc &= 0x7; 1311 1312 if (count == 1) { 1313 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg); 1314 } else { 1315 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg); 1316 tcg_out8(s, count); 1317 } 1318} 1319 1320static inline void tcg_out_bswap32(TCGContext *s, int reg) 1321{ 1322 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0); 1323} 1324 1325static inline void tcg_out_rolw_8(TCGContext *s, int reg) 1326{ 1327 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8); 1328} 1329 1330static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src) 1331{ 1332 if (TCG_TARGET_REG_BITS == 32 && src >= 4) { 1333 tcg_out_mov(s, TCG_TYPE_I32, dest, src); 1334 if (dest >= 4) { 1335 tcg_out_modrm(s, OPC_ARITH_EvIz, ARITH_AND, dest); 1336 tcg_out32(s, 0xff); 1337 return; 1338 } 1339 src = dest; 1340 } 1341 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src); 1342} 1343 1344static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src) 1345{ 1346 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1347 1348 if (TCG_TARGET_REG_BITS == 32 && src >= 4) { 1349 tcg_out_mov(s, TCG_TYPE_I32, dest, src); 1350 if (dest >= 4) { 1351 tcg_out_shifti(s, SHIFT_SHL, dest, 24); 1352 tcg_out_shifti(s, SHIFT_SAR, dest, 24); 1353 return; 1354 } 1355 src = dest; 1356 } 1357 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src); 1358} 1359 1360static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src) 1361{ 1362 /* movzwl */ 1363 tcg_out_modrm(s, OPC_MOVZWL, dest, src); 1364} 1365 1366static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src) 1367{ 1368 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1369 /* movsw[lq] */ 1370 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src); 1371} 1372 1373static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src) 1374{ 1375 /* 32-bit mov zero extends. 
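   In 64-bit mode, writing a 32-bit register implicitly clears bits 63:32 of
   the destination, so a plain movl suffices.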
*/ 1376 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src); 1377} 1378 1379static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src) 1380{ 1381 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 1382 tcg_out_modrm(s, OPC_MOVSLQ, dest, src); 1383} 1384 1385static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1386{ 1387 tcg_out_ext32s(s, dest, src); 1388} 1389 1390static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1391{ 1392 if (dest != src) { 1393 tcg_out_ext32u(s, dest, src); 1394 } 1395} 1396 1397static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src) 1398{ 1399 tcg_out_ext32u(s, dest, src); 1400} 1401 1402static inline void tcg_out_bswap64(TCGContext *s, int reg) 1403{ 1404 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0); 1405} 1406 1407static void tgen_arithi(TCGContext *s, int c, int r0, 1408 tcg_target_long val, int cf) 1409{ 1410 int rexw = 0; 1411 1412 if (TCG_TARGET_REG_BITS == 64) { 1413 rexw = c & -8; 1414 c &= 7; 1415 } 1416 1417 switch (c) { 1418 case ARITH_ADD: 1419 case ARITH_SUB: 1420 if (!cf) { 1421 /* 1422 * ??? While INC is 2 bytes shorter than ADDL $1, they also induce 1423 * partial flags update stalls on Pentium4 and are not recommended 1424 * by current Intel optimization manuals. 1425 */ 1426 if (val == 1 || val == -1) { 1427 int is_inc = (c == ARITH_ADD) ^ (val < 0); 1428 if (TCG_TARGET_REG_BITS == 64) { 1429 /* 1430 * The single-byte increment encodings are re-tasked 1431 * as the REX prefixes. Use the MODRM encoding. 1432 */ 1433 tcg_out_modrm(s, OPC_GRP5 + rexw, 1434 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0); 1435 } else { 1436 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0); 1437 } 1438 return; 1439 } 1440 if (val == 128) { 1441 /* 1442 * Facilitate using an 8-bit immediate. Carry is inverted 1443 * by this transformation, so do it only if cf == 0. 1444 */ 1445 c ^= ARITH_ADD ^ ARITH_SUB; 1446 val = -128; 1447 } 1448 } 1449 break; 1450 1451 case ARITH_AND: 1452 if (TCG_TARGET_REG_BITS == 64) { 1453 if (val == 0xffffffffu) { 1454 tcg_out_ext32u(s, r0, r0); 1455 return; 1456 } 1457 if (val == (uint32_t)val) { 1458 /* AND with no high bits set can use a 32-bit operation. */ 1459 rexw = 0; 1460 } 1461 } 1462 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1463 tcg_out_ext8u(s, r0, r0); 1464 return; 1465 } 1466 if (val == 0xffffu) { 1467 tcg_out_ext16u(s, r0, r0); 1468 return; 1469 } 1470 break; 1471 1472 case ARITH_OR: 1473 case ARITH_XOR: 1474 if (val >= 0x80 && val <= 0xff 1475 && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1476 tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0); 1477 tcg_out8(s, val); 1478 return; 1479 } 1480 break; 1481 } 1482 1483 if (val == (int8_t)val) { 1484 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0); 1485 tcg_out8(s, val); 1486 return; 1487 } 1488 if (rexw == 0 || val == (int32_t)val) { 1489 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0); 1490 tcg_out32(s, val); 1491 return; 1492 } 1493 1494 g_assert_not_reached(); 1495} 1496 1497static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val) 1498{ 1499 if (val != 0) { 1500 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0); 1501 } 1502} 1503 1504/* Set SMALL to force a short forward branch. 
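   A short branch uses a one-byte rel8 displacement (two bytes total) instead of
   five bytes for JMP rel32 or six for Jcc rel32; hence the val - 2, val - 5 and
   val - 6 adjustments below.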
*/ 1505static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small) 1506{ 1507 int32_t val, val1; 1508 1509 if (l->has_value) { 1510 val = tcg_pcrel_diff(s, l->u.value_ptr); 1511 val1 = val - 2; 1512 if ((int8_t)val1 == val1) { 1513 if (opc == -1) { 1514 tcg_out8(s, OPC_JMP_short); 1515 } else { 1516 tcg_out8(s, OPC_JCC_short + opc); 1517 } 1518 tcg_out8(s, val1); 1519 } else { 1520 tcg_debug_assert(!small); 1521 if (opc == -1) { 1522 tcg_out8(s, OPC_JMP_long); 1523 tcg_out32(s, val - 5); 1524 } else { 1525 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1526 tcg_out32(s, val - 6); 1527 } 1528 } 1529 } else if (small) { 1530 if (opc == -1) { 1531 tcg_out8(s, OPC_JMP_short); 1532 } else { 1533 tcg_out8(s, OPC_JCC_short + opc); 1534 } 1535 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1); 1536 s->code_ptr += 1; 1537 } else { 1538 if (opc == -1) { 1539 tcg_out8(s, OPC_JMP_long); 1540 } else { 1541 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1542 } 1543 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4); 1544 s->code_ptr += 4; 1545 } 1546} 1547 1548static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1, 1549 TCGArg arg2, int const_arg2, int rexw) 1550{ 1551 int jz, js; 1552 1553 if (!is_tst_cond(cond)) { 1554 if (!const_arg2) { 1555 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2); 1556 } else if (arg2 == 0) { 1557 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1); 1558 } else { 1559 tcg_debug_assert(!rexw || arg2 == (int32_t)arg2); 1560 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0); 1561 } 1562 return tcg_cond_to_jcc[cond]; 1563 } 1564 1565 jz = tcg_cond_to_jcc[cond]; 1566 js = (cond == TCG_COND_TSTNE ? JCC_JS : JCC_JNS); 1567 1568 if (!const_arg2) { 1569 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2); 1570 return jz; 1571 } 1572 1573 if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) { 1574 if (arg2 == 0x80) { 1575 tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1); 1576 return js; 1577 } 1578 if (arg2 == 0xff) { 1579 tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1); 1580 return jz; 1581 } 1582 tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1); 1583 tcg_out8(s, arg2); 1584 return jz; 1585 } 1586 1587 if ((arg2 & ~0xff00) == 0 && arg1 < 4) { 1588 if (arg2 == 0x8000) { 1589 tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4); 1590 return js; 1591 } 1592 if (arg2 == 0xff00) { 1593 tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4); 1594 return jz; 1595 } 1596 tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4); 1597 tcg_out8(s, arg2 >> 8); 1598 return jz; 1599 } 1600 1601 if (arg2 == 0xffff) { 1602 tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1); 1603 return jz; 1604 } 1605 if (arg2 == 0xffffffffu) { 1606 tcg_out_modrm(s, OPC_TESTL, arg1, arg1); 1607 return jz; 1608 } 1609 1610 if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) { 1611 int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE); 1612 int sh = ctz64(arg2); 1613 1614 rexw = (sh & 32 ? 
P_REXW : 0); 1615 if ((sh & 31) == 31) { 1616 tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1); 1617 return js; 1618 } else { 1619 tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1); 1620 tcg_out8(s, sh); 1621 return jc; 1622 } 1623 } 1624 1625 if (rexw) { 1626 if (arg2 == (uint32_t)arg2) { 1627 rexw = 0; 1628 } else { 1629 tcg_debug_assert(arg2 == (int32_t)arg2); 1630 } 1631 } 1632 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1); 1633 tcg_out32(s, arg2); 1634 return jz; 1635} 1636 1637static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond, 1638 TCGArg arg1, TCGArg arg2, int const_arg2, 1639 TCGLabel *label, bool small) 1640{ 1641 int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw); 1642 tcg_out_jxx(s, jcc, label, small); 1643} 1644 1645#if TCG_TARGET_REG_BITS == 32 1646static void tcg_out_brcond2(TCGContext *s, const TCGArg *args, 1647 const int *const_args, bool small) 1648{ 1649 TCGLabel *label_next = gen_new_label(); 1650 TCGLabel *label_this = arg_label(args[5]); 1651 TCGCond cond = args[4]; 1652 1653 switch (cond) { 1654 case TCG_COND_EQ: 1655 case TCG_COND_TSTEQ: 1656 tcg_out_brcond(s, 0, tcg_invert_cond(cond), 1657 args[0], args[2], const_args[2], label_next, 1); 1658 tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3], 1659 label_this, small); 1660 break; 1661 1662 case TCG_COND_NE: 1663 case TCG_COND_TSTNE: 1664 tcg_out_brcond(s, 0, cond, args[0], args[2], const_args[2], 1665 label_this, small); 1666 tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3], 1667 label_this, small); 1668 break; 1669 1670 default: 1671 tcg_out_brcond(s, 0, tcg_high_cond(cond), args[1], 1672 args[3], const_args[3], label_this, small); 1673 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1674 tcg_out_brcond(s, 0, tcg_unsigned_cond(cond), args[0], 1675 args[2], const_args[2], label_this, small); 1676 break; 1677 } 1678 tcg_out_label(s, label_next); 1679} 1680#endif 1681 1682static void tcg_out_setcond(TCGContext *s, TCGType type, TCGCond cond, 1683 TCGReg dest, TCGReg arg1, TCGArg arg2, 1684 bool const_arg2, bool neg) 1685{ 1686 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1687 int cmp_rexw = rexw; 1688 bool inv = false; 1689 bool cleared; 1690 int jcc; 1691 1692 switch (cond) { 1693 case TCG_COND_NE: 1694 inv = true; 1695 /* fall through */ 1696 case TCG_COND_EQ: 1697 /* If arg2 is 0, convert to LTU/GEU vs 1. */ 1698 if (const_arg2 && arg2 == 0) { 1699 arg2 = 1; 1700 goto do_ltu; 1701 } 1702 break; 1703 1704 case TCG_COND_TSTNE: 1705 inv = true; 1706 /* fall through */ 1707 case TCG_COND_TSTEQ: 1708 /* If arg2 is -1, convert to LTU/GEU vs 1. */ 1709 if (const_arg2 && arg2 == 0xffffffffu) { 1710 arg2 = 1; 1711 cmp_rexw = 0; 1712 goto do_ltu; 1713 } 1714 break; 1715 1716 case TCG_COND_LEU: 1717 inv = true; 1718 /* fall through */ 1719 case TCG_COND_GTU: 1720 /* If arg2 is a register, swap for LTU/GEU. */ 1721 if (!const_arg2) { 1722 TCGReg t = arg1; 1723 arg1 = arg2; 1724 arg2 = t; 1725 goto do_ltu; 1726 } 1727 break; 1728 1729 case TCG_COND_GEU: 1730 inv = true; 1731 /* fall through */ 1732 case TCG_COND_LTU: 1733 do_ltu: 1734 /* 1735 * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU. 1736 * We can then use NEG or INC to produce the desired result. 1737 * This is always smaller than the SETCC expansion. 1738 */ 1739 tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw); 1740 1741 /* X - X - C = -C = (C ? -1 : 0) */ 1742 tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest); 1743 if (inv && neg) { 1744 /* ~(C ? -1 : 0) = (C ? 
0 : -1) */ 1745 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest); 1746 } else if (inv) { 1747 /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */ 1748 tgen_arithi(s, ARITH_ADD, dest, 1, 0); 1749 } else if (!neg) { 1750 /* -(C ? -1 : 0) = (C ? 1 : 0) */ 1751 tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest); 1752 } 1753 return; 1754 1755 case TCG_COND_GE: 1756 inv = true; 1757 /* fall through */ 1758 case TCG_COND_LT: 1759 /* If arg2 is 0, extract the sign bit. */ 1760 if (const_arg2 && arg2 == 0) { 1761 tcg_out_mov(s, type, dest, arg1); 1762 if (inv) { 1763 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest); 1764 } 1765 tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw, 1766 dest, rexw ? 63 : 31); 1767 return; 1768 } 1769 break; 1770 1771 default: 1772 break; 1773 } 1774 1775 /* 1776 * If dest does not overlap the inputs, clearing it first is preferred. 1777 * The XOR breaks any false dependency for the low-byte write to dest, 1778 * and is also one byte smaller than MOVZBL. 1779 */ 1780 cleared = false; 1781 if (dest != arg1 && (const_arg2 || dest != arg2)) { 1782 tgen_arithr(s, ARITH_XOR, dest, dest); 1783 cleared = true; 1784 } 1785 1786 jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw); 1787 tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest); 1788 1789 if (!cleared) { 1790 tcg_out_ext8u(s, dest, dest); 1791 } 1792 if (neg) { 1793 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest); 1794 } 1795} 1796 1797static void tgen_setcond(TCGContext *s, TCGType type, TCGCond cond, 1798 TCGReg dest, TCGReg arg1, TCGReg arg2) 1799{ 1800 tcg_out_setcond(s, type, cond, dest, arg1, arg2, false, false); 1801} 1802 1803static void tgen_setcondi(TCGContext *s, TCGType type, TCGCond cond, 1804 TCGReg dest, TCGReg arg1, tcg_target_long arg2) 1805{ 1806 tcg_out_setcond(s, type, cond, dest, arg1, arg2, true, false); 1807} 1808 1809static const TCGOutOpSetcond outop_setcond = { 1810 .base.static_constraint = C_O1_I2(q, r, reT), 1811 .out_rrr = tgen_setcond, 1812 .out_rri = tgen_setcondi, 1813}; 1814 1815static void tgen_negsetcond(TCGContext *s, TCGType type, TCGCond cond, 1816 TCGReg dest, TCGReg arg1, TCGReg arg2) 1817{ 1818 tcg_out_setcond(s, type, cond, dest, arg1, arg2, false, true); 1819} 1820 1821static void tgen_negsetcondi(TCGContext *s, TCGType type, TCGCond cond, 1822 TCGReg dest, TCGReg arg1, tcg_target_long arg2) 1823{ 1824 tcg_out_setcond(s, type, cond, dest, arg1, arg2, true, true); 1825} 1826 1827static const TCGOutOpSetcond outop_negsetcond = { 1828 .base.static_constraint = C_O1_I2(q, r, reT), 1829 .out_rrr = tgen_negsetcond, 1830 .out_rri = tgen_negsetcondi, 1831}; 1832 1833#if TCG_TARGET_REG_BITS == 32 1834static void tcg_out_setcond2(TCGContext *s, const TCGArg *args, 1835 const int *const_args) 1836{ 1837 TCGArg new_args[6]; 1838 TCGLabel *label_true, *label_over; 1839 1840 memcpy(new_args, args+1, 5*sizeof(TCGArg)); 1841 1842 if (args[0] == args[1] || args[0] == args[2] 1843 || (!const_args[3] && args[0] == args[3]) 1844 || (!const_args[4] && args[0] == args[4])) { 1845 /* When the destination overlaps with one of the argument 1846 registers, don't do anything tricky. 
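   Instead take the branchy path below, which writes the destination only after
   both 32-bit halves have been compared.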
*/ 1847 label_true = gen_new_label(); 1848 label_over = gen_new_label(); 1849 1850 new_args[5] = label_arg(label_true); 1851 tcg_out_brcond2(s, new_args, const_args+1, 1); 1852 1853 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1854 tcg_out_jxx(s, JCC_JMP, label_over, 1); 1855 tcg_out_label(s, label_true); 1856 1857 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1); 1858 tcg_out_label(s, label_over); 1859 } else { 1860 /* When the destination does not overlap one of the arguments, 1861 clear the destination first, jump if cond false, and emit an 1862 increment in the true case. This results in smaller code. */ 1863 1864 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1865 1866 label_over = gen_new_label(); 1867 new_args[4] = tcg_invert_cond(new_args[4]); 1868 new_args[5] = label_arg(label_over); 1869 tcg_out_brcond2(s, new_args, const_args+1, 1); 1870 1871 tgen_arithi(s, ARITH_ADD, args[0], 1, 0); 1872 tcg_out_label(s, label_over); 1873 } 1874} 1875#endif 1876 1877static void tcg_out_cmov(TCGContext *s, int jcc, int rexw, 1878 TCGReg dest, TCGReg v1) 1879{ 1880 tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1); 1881} 1882 1883static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond, 1884 TCGReg dest, TCGReg c1, TCGArg c2, int const_c2, 1885 TCGReg v1) 1886{ 1887 int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw); 1888 tcg_out_cmov(s, jcc, rexw, dest, v1); 1889} 1890 1891static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest) 1892{ 1893 intptr_t disp = tcg_pcrel_diff(s, dest) - 5; 1894 1895 if (disp == (int32_t)disp) { 1896 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0); 1897 tcg_out32(s, disp); 1898 } else { 1899 /* rip-relative addressing into the constant pool. 1900 This is 6 + 8 = 14 bytes, as compared to using an 1901 immediate load 10 + 6 = 16 bytes, plus we may 1902 be able to re-use the pool constant for more calls. */ 1903 tcg_out_opc(s, OPC_GRP5, 0, 0, 0); 1904 tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5); 1905 new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4); 1906 tcg_out32(s, 0); 1907 } 1908} 1909 1910static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest, 1911 const TCGHelperInfo *info) 1912{ 1913 tcg_out_branch(s, 1, dest); 1914 1915#ifndef _WIN32 1916 if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) { 1917 /* 1918 * The sysv i386 abi for struct return places a reference as the 1919 * first argument of the stack, and pops that argument with the 1920 * return statement. Since we want to retain the aligned stack 1921 * pointer for the callee, we do not want to actually push that 1922 * argument before the call but rely on the normal store to the 1923 * stack slot. But we do need to compensate for the pop in order 1924 * to reset our correct stack pointer value. 1925 * Pushing a garbage value back onto the stack is quickest. 1926 */ 1927 tcg_out_push(s, TCG_REG_EAX); 1928 } 1929#endif 1930} 1931 1932static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest) 1933{ 1934 tcg_out_branch(s, 0, dest); 1935} 1936 1937static void tcg_out_nopn(TCGContext *s, int n) 1938{ 1939 int i; 1940 /* Emit 1 or 2 operand size prefixes for the standard one byte nop, 1941 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the 1942 * duplicate prefix, and all of the interesting recent cores can 1943 * decode and discard the duplicates in a single cycle. 
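     * For example, n == 3 emits 66 66 90, a single three-byte nop.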
     */
    tcg_debug_assert(n >= 1);
    for (i = 1; i < n; ++i) {
        tcg_out8(s, 0x66);
    }
    tcg_out8(s, 0x90);
}

typedef struct {
    TCGReg base;
    int index;
    int ofs;
    int seg;
    TCGAtomAlign aa;
} HostAddress;

bool tcg_target_has_memory_bswap(MemOp memop)
{
    TCGAtomAlign aa;

    if (!have_movbe) {
        return false;
    }
    if ((memop & MO_SIZE) < MO_128) {
        return true;
    }

    /*
     * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
     * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
     */
    aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
    return aa.atom < MO_128;
}

/*
 * Because i686 has no register parameters and because x86_64 has xchg
 * to handle addr/data register overlap, we have placed all input arguments
 * before we might need a scratch reg.
 *
 * Even then, a scratch is only needed for l->raddr.  Rather than expose
 * a general-purpose scratch when we don't actually know it's available,
 * use the ra_gen hook to load into RAX if needed.
 */
#if TCG_TARGET_REG_BITS == 64
static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
{
    if (arg < 0) {
        arg = TCG_REG_RAX;
    }
    tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
    return arg;
}
static const TCGLdstHelperParam ldst_helper_param = {
    .ra_gen = ldst_ra_gen
};
#else
static const TCGLdstHelperParam ldst_helper_param = { };
#endif

static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
                                TCGReg l, TCGReg h, TCGReg v)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;

    /* vpmov{d,q} %v, %l */
    tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
    /* vpextr{d,q} $1, %v, %h */
    tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
    tcg_out8(s, 1);
}

static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
                                TCGReg v, TCGReg l, TCGReg h)
{
    int rexw = type == TCG_TYPE_I32 ?
0 : P_REXW; 2020 2021 /* vmov{d,q} %l, %v */ 2022 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l); 2023 /* vpinsr{d,q} $1, %h, %v, %v */ 2024 tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h); 2025 tcg_out8(s, 1); 2026} 2027 2028/* 2029 * Generate code for the slow path for a load at the end of block 2030 */ 2031static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2032{ 2033 MemOp opc = get_memop(l->oi); 2034 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2035 2036 /* resolve label address */ 2037 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2038 if (label_ptr[1]) { 2039 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2040 } 2041 2042 tcg_out_ld_helper_args(s, l, &ldst_helper_param); 2043 tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]); 2044 tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param); 2045 2046 tcg_out_jmp(s, l->raddr); 2047 return true; 2048} 2049 2050/* 2051 * Generate code for the slow path for a store at the end of block 2052 */ 2053static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2054{ 2055 MemOp opc = get_memop(l->oi); 2056 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2057 2058 /* resolve label address */ 2059 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2060 if (label_ptr[1]) { 2061 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2062 } 2063 2064 tcg_out_st_helper_args(s, l, &ldst_helper_param); 2065 tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]); 2066 2067 tcg_out_jmp(s, l->raddr); 2068 return true; 2069} 2070 2071#ifdef CONFIG_USER_ONLY 2072static HostAddress x86_guest_base = { 2073 .index = -1 2074}; 2075 2076#if defined(__x86_64__) && defined(__linux__) 2077# include <asm/prctl.h> 2078# include <sys/prctl.h> 2079int arch_prctl(int code, unsigned long addr); 2080static inline int setup_guest_base_seg(void) 2081{ 2082 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) { 2083 return P_GS; 2084 } 2085 return 0; 2086} 2087#define setup_guest_base_seg setup_guest_base_seg 2088#elif defined(__x86_64__) && \ 2089 (defined (__FreeBSD__) || defined (__FreeBSD_kernel__)) 2090# include <machine/sysarch.h> 2091static inline int setup_guest_base_seg(void) 2092{ 2093 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) { 2094 return P_GS; 2095 } 2096 return 0; 2097} 2098#define setup_guest_base_seg setup_guest_base_seg 2099#endif 2100#else 2101# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; })) 2102#endif /* CONFIG_USER_ONLY */ 2103#ifndef setup_guest_base_seg 2104# define setup_guest_base_seg() 0 2105#endif 2106 2107#define MIN_TLB_MASK_TABLE_OFS INT_MIN 2108 2109/* 2110 * For softmmu, perform the TLB load and compare. 2111 * For useronly, perform any required alignment tests. 2112 * In both cases, return a TCGLabelQemuLdst structure if the slow path 2113 * is required and fill in @h with the host address for the fast path. 2114 */ 2115static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, 2116 TCGReg addr, MemOpIdx oi, bool is_ld) 2117{ 2118 TCGLabelQemuLdst *ldst = NULL; 2119 MemOp opc = get_memop(oi); 2120 MemOp s_bits = opc & MO_SIZE; 2121 unsigned a_mask; 2122 2123 if (tcg_use_softmmu) { 2124 h->index = TCG_REG_L0; 2125 h->ofs = 0; 2126 h->seg = 0; 2127 } else { 2128 *h = x86_guest_base; 2129 } 2130 h->base = addr; 2131 h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128); 2132 a_mask = (1 << h->aa.align) - 1; 2133 2134 if (tcg_use_softmmu) { 2135 int cmp_ofs = is_ld ? 
offsetof(CPUTLBEntry, addr_read) 2136 : offsetof(CPUTLBEntry, addr_write); 2137 TCGType ttype = TCG_TYPE_I32; 2138 TCGType tlbtype = TCG_TYPE_I32; 2139 int trexw = 0, hrexw = 0, tlbrexw = 0; 2140 unsigned mem_index = get_mmuidx(oi); 2141 unsigned s_mask = (1 << s_bits) - 1; 2142 int fast_ofs = tlb_mask_table_ofs(s, mem_index); 2143 int tlb_mask; 2144 2145 ldst = new_ldst_label(s); 2146 ldst->is_ld = is_ld; 2147 ldst->oi = oi; 2148 ldst->addr_reg = addr; 2149 2150 if (TCG_TARGET_REG_BITS == 64) { 2151 ttype = s->addr_type; 2152 trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW); 2153 if (TCG_TYPE_PTR == TCG_TYPE_I64) { 2154 hrexw = P_REXW; 2155 if (s->page_bits + s->tlb_dyn_max_bits > 32) { 2156 tlbtype = TCG_TYPE_I64; 2157 tlbrexw = P_REXW; 2158 } 2159 } 2160 } 2161 2162 tcg_out_mov(s, tlbtype, TCG_REG_L0, addr); 2163 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0, 2164 s->page_bits - CPU_TLB_ENTRY_BITS); 2165 2166 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0, 2167 fast_ofs + offsetof(CPUTLBDescFast, mask)); 2168 2169 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0, 2170 fast_ofs + offsetof(CPUTLBDescFast, table)); 2171 2172 /* 2173 * If the required alignment is at least as large as the access, 2174 * simply copy the address and mask. For lesser alignments, 2175 * check that we don't cross pages for the complete access. 2176 */ 2177 if (a_mask >= s_mask) { 2178 tcg_out_mov(s, ttype, TCG_REG_L1, addr); 2179 } else { 2180 tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1, 2181 addr, s_mask - a_mask); 2182 } 2183 tlb_mask = s->page_mask | a_mask; 2184 tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0); 2185 2186 /* cmp 0(TCG_REG_L0), TCG_REG_L1 */ 2187 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, 2188 TCG_REG_L1, TCG_REG_L0, cmp_ofs); 2189 2190 /* jne slow_path */ 2191 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2192 ldst->label_ptr[0] = s->code_ptr; 2193 s->code_ptr += 4; 2194 2195 /* TLB Hit. */ 2196 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0, 2197 offsetof(CPUTLBEntry, addend)); 2198 } else if (a_mask) { 2199 int jcc; 2200 2201 ldst = new_ldst_label(s); 2202 ldst->is_ld = is_ld; 2203 ldst->oi = oi; 2204 ldst->addr_reg = addr; 2205 2206 /* jne slow_path */ 2207 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addr, a_mask, true, false); 2208 tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0); 2209 ldst->label_ptr[0] = s->code_ptr; 2210 s->code_ptr += 4; 2211 } 2212 2213 return ldst; 2214} 2215 2216static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2217 HostAddress h, TCGType type, MemOp memop) 2218{ 2219 bool use_movbe = false; 2220 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 2221 int movop = OPC_MOVL_GvEv; 2222 2223 /* Do big-endian loads with movbe. */ 2224 if (memop & MO_BSWAP) { 2225 tcg_debug_assert(have_movbe); 2226 use_movbe = true; 2227 movop = OPC_MOVBE_GyMy; 2228 } 2229 2230 switch (memop & MO_SSIZE) { 2231 case MO_UB: 2232 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo, 2233 h.base, h.index, 0, h.ofs); 2234 break; 2235 case MO_SB: 2236 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo, 2237 h.base, h.index, 0, h.ofs); 2238 break; 2239 case MO_UW: 2240 if (use_movbe) { 2241 /* There is no extending movbe; only low 16-bits are modified. */ 2242 if (datalo != h.base && datalo != h.index) { 2243 /* XOR breaks dependency chains. 
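                   Zeroing the full register first also means the 16-bit
                   MOVBE result is already zero-extended, so the separate
                   ext16u of the other path is not needed here.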
                 */
                tgen_arithr(s, ARITH_XOR, datalo, datalo);
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                         datalo, h.base, h.index, 0, h.ofs);
            } else {
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                         datalo, h.base, h.index, 0, h.ofs);
                tcg_out_ext16u(s, datalo, datalo);
            }
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        }
        break;
    case MO_SW:
        if (use_movbe) {
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                     datalo, h.base, h.index, 0, h.ofs);
            tcg_out_ext16s(s, type, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
                                     datalo, h.base, h.index, 0, h.ofs);
        }
        break;
    case MO_UL:
        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
#if TCG_TARGET_REG_BITS == 64
    case MO_SL:
        if (use_movbe) {
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_ext32s(s, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        }
        break;
#endif
    case MO_UQ:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            break;
        }
        if (use_movbe) {
            TCGReg t = datalo;
            datalo = datahi;
            datahi = t;
        }
        if (h.base == datalo || h.index == datalo) {
            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
        } else {
            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 4);
        }
        break;

    case MO_128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);

        /*
         * Without 16-byte atomicity, use integer regs.
         * That is where we want the data, and it allows bswaps.
         */
        if (h.aa.atom < MO_128) {
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            if (h.base == datalo || h.index == datalo) {
                tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datalo, datahi, 0);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datahi, datahi, 8);
            } else {
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
                                         h.base, h.index, 0, h.ofs + 8);
            }
            break;
        }

        /*
         * With 16-byte atomicity, a vector load is required.
         * If we already have 16-byte alignment, then VMOVDQA always works.
         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
         * Else we require a runtime test for alignment for VMOVDQA;
         * use VMOVDQU on the unaligned nonatomic path for simplicity.
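         * The test below checks the low four bits of the address register
         * and takes the VMOVDQU path when any of them are set.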
2342 */ 2343 if (h.aa.align >= MO_128) { 2344 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2345 TCG_TMP_VEC, 0, 2346 h.base, h.index, 0, h.ofs); 2347 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2348 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2349 TCG_TMP_VEC, 0, 2350 h.base, h.index, 0, h.ofs); 2351 } else { 2352 TCGLabel *l1 = gen_new_label(); 2353 TCGLabel *l2 = gen_new_label(); 2354 int jcc; 2355 2356 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2357 tcg_out_jxx(s, jcc, l1, true); 2358 2359 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2360 TCG_TMP_VEC, 0, 2361 h.base, h.index, 0, h.ofs); 2362 tcg_out_jxx(s, JCC_JMP, l2, true); 2363 2364 tcg_out_label(s, l1); 2365 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2366 TCG_TMP_VEC, 0, 2367 h.base, h.index, 0, h.ofs); 2368 tcg_out_label(s, l2); 2369 } 2370 tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC); 2371 break; 2372 2373 default: 2374 g_assert_not_reached(); 2375 } 2376} 2377 2378static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi, 2379 TCGReg addr, MemOpIdx oi, TCGType data_type) 2380{ 2381 TCGLabelQemuLdst *ldst; 2382 HostAddress h; 2383 2384 ldst = prepare_host_addr(s, &h, addr, oi, true); 2385 tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi)); 2386 2387 if (ldst) { 2388 ldst->type = data_type; 2389 ldst->datalo_reg = datalo; 2390 ldst->datahi_reg = datahi; 2391 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2392 } 2393} 2394 2395static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2396 HostAddress h, MemOp memop) 2397{ 2398 bool use_movbe = false; 2399 int movop = OPC_MOVL_EvGv; 2400 2401 /* 2402 * Do big-endian stores with movbe or system-mode. 2403 * User-only without movbe will have its swapping done generically. 2404 */ 2405 if (memop & MO_BSWAP) { 2406 tcg_debug_assert(have_movbe); 2407 use_movbe = true; 2408 movop = OPC_MOVBE_MyGy; 2409 } 2410 2411 switch (memop & MO_SIZE) { 2412 case MO_8: 2413 /* This is handled with constraints on INDEX_op_qemu_st8_i32. */ 2414 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4); 2415 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg, 2416 datalo, h.base, h.index, 0, h.ofs); 2417 break; 2418 case MO_16: 2419 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo, 2420 h.base, h.index, 0, h.ofs); 2421 break; 2422 case MO_32: 2423 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2424 h.base, h.index, 0, h.ofs); 2425 break; 2426 case MO_64: 2427 if (TCG_TARGET_REG_BITS == 64) { 2428 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2429 h.base, h.index, 0, h.ofs); 2430 } else { 2431 if (use_movbe) { 2432 TCGReg t = datalo; 2433 datalo = datahi; 2434 datahi = t; 2435 } 2436 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2437 h.base, h.index, 0, h.ofs); 2438 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2439 h.base, h.index, 0, h.ofs + 4); 2440 } 2441 break; 2442 2443 case MO_128: 2444 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2445 2446 /* 2447 * Without 16-byte atomicity, use integer regs. 2448 * That is where we have the data, and it allows bswaps. 
2449 */ 2450 if (h.aa.atom < MO_128) { 2451 if (use_movbe) { 2452 TCGReg t = datalo; 2453 datalo = datahi; 2454 datahi = t; 2455 } 2456 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2457 h.base, h.index, 0, h.ofs); 2458 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, 2459 h.base, h.index, 0, h.ofs + 8); 2460 break; 2461 } 2462 2463 /* 2464 * With 16-byte atomicity, a vector store is required. 2465 * If we already have 16-byte alignment, then VMOVDQA always works. 2466 * Else if VMOVDQU has atomicity with dynamic alignment, use that. 2467 * Else use we require a runtime test for alignment for VMOVDQA; 2468 * use VMOVDQU on the unaligned nonatomic path for simplicity. 2469 */ 2470 tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi); 2471 if (h.aa.align >= MO_128) { 2472 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2473 TCG_TMP_VEC, 0, 2474 h.base, h.index, 0, h.ofs); 2475 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2476 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2477 TCG_TMP_VEC, 0, 2478 h.base, h.index, 0, h.ofs); 2479 } else { 2480 TCGLabel *l1 = gen_new_label(); 2481 TCGLabel *l2 = gen_new_label(); 2482 int jcc; 2483 2484 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2485 tcg_out_jxx(s, jcc, l1, true); 2486 2487 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2488 TCG_TMP_VEC, 0, 2489 h.base, h.index, 0, h.ofs); 2490 tcg_out_jxx(s, JCC_JMP, l2, true); 2491 2492 tcg_out_label(s, l1); 2493 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2494 TCG_TMP_VEC, 0, 2495 h.base, h.index, 0, h.ofs); 2496 tcg_out_label(s, l2); 2497 } 2498 break; 2499 2500 default: 2501 g_assert_not_reached(); 2502 } 2503} 2504 2505static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi, 2506 TCGReg addr, MemOpIdx oi, TCGType data_type) 2507{ 2508 TCGLabelQemuLdst *ldst; 2509 HostAddress h; 2510 2511 ldst = prepare_host_addr(s, &h, addr, oi, false); 2512 tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi)); 2513 2514 if (ldst) { 2515 ldst->type = data_type; 2516 ldst->datalo_reg = datalo; 2517 ldst->datahi_reg = datahi; 2518 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2519 } 2520} 2521 2522static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0) 2523{ 2524 /* Reuse the zeroing that exists for goto_ptr. */ 2525 if (a0 == 0) { 2526 tcg_out_jmp(s, tcg_code_gen_epilogue); 2527 } else { 2528 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0); 2529 tcg_out_jmp(s, tb_ret_addr); 2530 } 2531} 2532 2533static void tcg_out_goto_tb(TCGContext *s, int which) 2534{ 2535 /* 2536 * Jump displacement must be aligned for atomic patching; 2537 * see if we need to add extra nops before jump 2538 */ 2539 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr; 2540 if (gap != 1) { 2541 tcg_out_nopn(s, gap - 1); 2542 } 2543 tcg_out8(s, OPC_JMP_long); /* jmp im */ 2544 set_jmp_insn_offset(s, which); 2545 tcg_out32(s, 0); 2546 set_jmp_reset_offset(s, which); 2547} 2548 2549void tb_target_set_jmp_target(const TranslationBlock *tb, int n, 2550 uintptr_t jmp_rx, uintptr_t jmp_rw) 2551{ 2552 /* patch the branch destination */ 2553 uintptr_t addr = tb->jmp_target_addr[n]; 2554 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4)); 2555 /* no need to flush icache explicitly */ 2556} 2557 2558 2559static void tgen_add(TCGContext *s, TCGType type, 2560 TCGReg a0, TCGReg a1, TCGReg a2) 2561{ 2562 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 2563 2564 if (a0 == a1) { 2565 tgen_arithr(s, ARITH_ADD + rexw, a0, a2); 2566 } else if (a0 == a2) { 2567 tgen_arithr(s, ARITH_ADD + rexw, a0, a1); 2568 } else { 2569 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, 0); 2570 } 2571} 2572 2573static void tgen_addi(TCGContext *s, TCGType type, 2574 TCGReg a0, TCGReg a1, tcg_target_long a2) 2575{ 2576 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2577 2578 if (a0 == a1) { 2579 tgen_arithi(s, ARITH_ADD + rexw, a0, a2, false); 2580 } else { 2581 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, -1, 0, a2); 2582 } 2583} 2584 2585static const TCGOutOpBinary outop_add = { 2586 .base.static_constraint = C_O1_I2(r, r, re), 2587 .out_rrr = tgen_add, 2588 .out_rri = tgen_addi, 2589}; 2590 2591static void tgen_and(TCGContext *s, TCGType type, 2592 TCGReg a0, TCGReg a1, TCGReg a2) 2593{ 2594 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2595 tgen_arithr(s, ARITH_AND + rexw, a0, a2); 2596} 2597 2598static void tgen_andi(TCGContext *s, TCGType type, 2599 TCGReg a0, TCGReg a1, tcg_target_long a2) 2600{ 2601 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2602 tgen_arithi(s, ARITH_AND + rexw, a0, a2, false); 2603} 2604 2605static const TCGOutOpBinary outop_and = { 2606 .base.static_constraint = C_O1_I2(r, 0, reZ), 2607 .out_rrr = tgen_and, 2608 .out_rri = tgen_andi, 2609}; 2610 2611static void tgen_andc(TCGContext *s, TCGType type, 2612 TCGReg a0, TCGReg a1, TCGReg a2) 2613{ 2614 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2615 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1); 2616} 2617 2618static TCGConstraintSetIndex cset_andc(TCGType type, unsigned flags) 2619{ 2620 return have_bmi1 ? C_O1_I2(r, r, r) : C_NotImplemented; 2621} 2622 2623static const TCGOutOpBinary outop_andc = { 2624 .base.static_constraint = C_Dynamic, 2625 .base.dynamic_constraint = cset_andc, 2626 .out_rrr = tgen_andc, 2627}; 2628 2629static void tgen_clz(TCGContext *s, TCGType type, 2630 TCGReg a0, TCGReg a1, TCGReg a2) 2631{ 2632 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2633 int jcc; 2634 2635 if (have_lzcnt) { 2636 tcg_out_modrm(s, OPC_LZCNT + rexw, a0, a1); 2637 jcc = JCC_JB; 2638 } else { 2639 /* Recall that the output of BSR is the index not the count. */ 2640 tcg_out_modrm(s, OPC_BSR + rexw, a0, a1); 2641 tgen_arithi(s, ARITH_XOR + rexw, a0, rexw ? 63 : 31, 0); 2642 2643 /* Since we have destroyed the flags from BSR, we have to re-test. */ 2644 jcc = tcg_out_cmp(s, TCG_COND_EQ, a1, 0, 1, rexw); 2645 } 2646 tcg_out_cmov(s, jcc, rexw, a0, a2); 2647} 2648 2649static void tgen_clzi(TCGContext *s, TCGType type, 2650 TCGReg a0, TCGReg a1, tcg_target_long a2) 2651{ 2652 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2653 tcg_out_modrm(s, OPC_LZCNT + rexw, a0, a1); 2654} 2655 2656static TCGConstraintSetIndex cset_clz(TCGType type, unsigned flags) 2657{ 2658 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 2659} 2660 2661static const TCGOutOpBinary outop_clz = { 2662 .base.static_constraint = C_Dynamic, 2663 .base.dynamic_constraint = cset_clz, 2664 .out_rrr = tgen_clz, 2665 .out_rri = tgen_clzi, 2666}; 2667 2668static void tgen_ctpop(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 2669{ 2670 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2671 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1); 2672} 2673 2674static TCGConstraintSetIndex cset_ctpop(TCGType type, unsigned flags) 2675{ 2676 return have_popcnt ? 
C_O1_I1(r, r) : C_NotImplemented; 2677} 2678 2679static const TCGOutOpUnary outop_ctpop = { 2680 .base.static_constraint = C_Dynamic, 2681 .base.dynamic_constraint = cset_ctpop, 2682 .out_rr = tgen_ctpop, 2683}; 2684 2685static void tgen_ctz(TCGContext *s, TCGType type, 2686 TCGReg a0, TCGReg a1, TCGReg a2) 2687{ 2688 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2689 int jcc; 2690 2691 if (have_bmi1) { 2692 tcg_out_modrm(s, OPC_TZCNT + rexw, a0, a1); 2693 jcc = JCC_JB; 2694 } else { 2695 tcg_out_modrm(s, OPC_BSF + rexw, a0, a1); 2696 jcc = JCC_JE; 2697 } 2698 tcg_out_cmov(s, jcc, rexw, a0, a2); 2699} 2700 2701static void tgen_ctzi(TCGContext *s, TCGType type, 2702 TCGReg a0, TCGReg a1, tcg_target_long a2) 2703{ 2704 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2705 tcg_out_modrm(s, OPC_TZCNT + rexw, a0, a1); 2706} 2707 2708static TCGConstraintSetIndex cset_ctz(TCGType type, unsigned flags) 2709{ 2710 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 2711} 2712 2713static const TCGOutOpBinary outop_ctz = { 2714 .base.static_constraint = C_Dynamic, 2715 .base.dynamic_constraint = cset_ctz, 2716 .out_rrr = tgen_ctz, 2717 .out_rri = tgen_ctzi, 2718}; 2719 2720static const TCGOutOpBinary outop_divs = { 2721 .base.static_constraint = C_NotImplemented, 2722}; 2723 2724static void tgen_divs2(TCGContext *s, TCGType type, 2725 TCGReg a0, TCGReg a1, TCGReg a4) 2726{ 2727 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2728 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, a4); 2729} 2730 2731static const TCGOutOpDivRem outop_divs2 = { 2732 .base.static_constraint = C_O2_I3(a, d, 0, 1, r), 2733 .out_rr01r = tgen_divs2, 2734}; 2735 2736static const TCGOutOpBinary outop_divu = { 2737 .base.static_constraint = C_NotImplemented, 2738}; 2739 2740static void tgen_divu2(TCGContext *s, TCGType type, 2741 TCGReg a0, TCGReg a1, TCGReg a4) 2742{ 2743 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2744 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, a4); 2745} 2746 2747static const TCGOutOpDivRem outop_divu2 = { 2748 .base.static_constraint = C_O2_I3(a, d, 0, 1, r), 2749 .out_rr01r = tgen_divu2, 2750}; 2751 2752static const TCGOutOpBinary outop_eqv = { 2753 .base.static_constraint = C_NotImplemented, 2754}; 2755 2756static void tgen_mul(TCGContext *s, TCGType type, 2757 TCGReg a0, TCGReg a1, TCGReg a2) 2758{ 2759 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2760 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2); 2761} 2762 2763static void tgen_muli(TCGContext *s, TCGType type, 2764 TCGReg a0, TCGReg a1, tcg_target_long a2) 2765{ 2766 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2767 2768 if (a2 == (int8_t)a2) { 2769 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0); 2770 tcg_out8(s, a2); 2771 } else { 2772 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0); 2773 tcg_out32(s, a2); 2774 } 2775} 2776 2777static const TCGOutOpBinary outop_mul = { 2778 .base.static_constraint = C_O1_I2(r, 0, re), 2779 .out_rrr = tgen_mul, 2780 .out_rri = tgen_muli, 2781}; 2782 2783static void tgen_muls2(TCGContext *s, TCGType type, 2784 TCGReg a0, TCGReg a1, TCGReg a2, TCGReg a3) 2785{ 2786 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 2787 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, a3); 2788} 2789 2790static const TCGOutOpMul2 outop_muls2 = { 2791 .base.static_constraint = C_O2_I2(a, d, a, r), 2792 .out_rrrr = tgen_muls2, 2793}; 2794 2795static const TCGOutOpBinary outop_mulsh = { 2796 .base.static_constraint = C_NotImplemented, 2797}; 2798 2799static const TCGOutOpBinary outop_muluh = { 2800 .base.static_constraint = C_NotImplemented, 2801}; 2802 2803static void tgen_mulu2(TCGContext *s, TCGType type, 2804 TCGReg a0, TCGReg a1, TCGReg a2, TCGReg a3) 2805{ 2806 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2807 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, a3); 2808} 2809 2810static const TCGOutOpMul2 outop_mulu2 = { 2811 .base.static_constraint = C_O2_I2(a, d, a, r), 2812 .out_rrrr = tgen_mulu2, 2813}; 2814 2815static const TCGOutOpBinary outop_nand = { 2816 .base.static_constraint = C_NotImplemented, 2817}; 2818 2819static const TCGOutOpBinary outop_nor = { 2820 .base.static_constraint = C_NotImplemented, 2821}; 2822 2823static void tgen_or(TCGContext *s, TCGType type, 2824 TCGReg a0, TCGReg a1, TCGReg a2) 2825{ 2826 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2827 tgen_arithr(s, ARITH_OR + rexw, a0, a2); 2828} 2829 2830static void tgen_ori(TCGContext *s, TCGType type, 2831 TCGReg a0, TCGReg a1, tcg_target_long a2) 2832{ 2833 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2834 tgen_arithi(s, ARITH_OR + rexw, a0, a2, false); 2835} 2836 2837static const TCGOutOpBinary outop_or = { 2838 .base.static_constraint = C_O1_I2(r, 0, re), 2839 .out_rrr = tgen_or, 2840 .out_rri = tgen_ori, 2841}; 2842 2843static const TCGOutOpBinary outop_orc = { 2844 .base.static_constraint = C_NotImplemented, 2845}; 2846 2847static const TCGOutOpBinary outop_rems = { 2848 .base.static_constraint = C_NotImplemented, 2849}; 2850 2851static const TCGOutOpBinary outop_remu = { 2852 .base.static_constraint = C_NotImplemented, 2853}; 2854 2855static void tgen_rotl(TCGContext *s, TCGType type, 2856 TCGReg a0, TCGReg a1, TCGReg a2) 2857{ 2858 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2859 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_ROL, a0); 2860} 2861 2862static void tgen_rotli(TCGContext *s, TCGType type, 2863 TCGReg a0, TCGReg a1, tcg_target_long a2) 2864{ 2865 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2866 tcg_out_shifti(s, SHIFT_ROL + rexw, a0, a2); 2867} 2868 2869static const TCGOutOpBinary outop_rotl = { 2870 .base.static_constraint = C_O1_I2(r, 0, ci), 2871 .out_rrr = tgen_rotl, 2872 .out_rri = tgen_rotli, 2873}; 2874 2875static void tgen_rotr(TCGContext *s, TCGType type, 2876 TCGReg a0, TCGReg a1, TCGReg a2) 2877{ 2878 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2879 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_ROR, a0); 2880} 2881 2882static void tgen_rotri(TCGContext *s, TCGType type, 2883 TCGReg a0, TCGReg a1, tcg_target_long a2) 2884{ 2885 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2886 tcg_out_shifti(s, SHIFT_ROR + rexw, a0, a2); 2887} 2888 2889static const TCGOutOpBinary outop_rotr = { 2890 .base.static_constraint = C_O1_I2(r, 0, ci), 2891 .out_rrr = tgen_rotr, 2892 .out_rri = tgen_rotri, 2893}; 2894 2895static TCGConstraintSetIndex cset_shift(TCGType type, unsigned flags) 2896{ 2897 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci); 2898} 2899 2900static void tgen_sar(TCGContext *s, TCGType type, 2901 TCGReg a0, TCGReg a1, TCGReg a2) 2902{ 2903 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 2904 if (have_bmi2) { 2905 tcg_out_vex_modrm(s, OPC_SARX + rexw, a0, a2, a1); 2906 } else { 2907 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_SAR, a0); 2908 } 2909} 2910 2911static void tgen_sari(TCGContext *s, TCGType type, 2912 TCGReg a0, TCGReg a1, tcg_target_long a2) 2913{ 2914 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2915 2916 tcg_out_mov(s, type, a0, a1); 2917 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, a2); 2918} 2919 2920static const TCGOutOpBinary outop_sar = { 2921 .base.static_constraint = C_Dynamic, 2922 .base.dynamic_constraint = cset_shift, 2923 .out_rrr = tgen_sar, 2924 .out_rri = tgen_sari, 2925}; 2926 2927static void tgen_shl(TCGContext *s, TCGType type, 2928 TCGReg a0, TCGReg a1, TCGReg a2) 2929{ 2930 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2931 if (have_bmi2) { 2932 tcg_out_vex_modrm(s, OPC_SHLX + rexw, a0, a2, a1); 2933 } else { 2934 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_SHL, a0); 2935 } 2936} 2937 2938static void tgen_shli(TCGContext *s, TCGType type, 2939 TCGReg a0, TCGReg a1, tcg_target_long a2) 2940{ 2941 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2942 2943 /* For small constant 3-operand shift, use LEA. */ 2944 if (a0 != a1 && a2 >= 1 && a2 <= 3) { 2945 if (a2 == 1) { 2946 /* shl $1,a1,a0 -> lea (a1,a1),a0 */ 2947 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0); 2948 } else { 2949 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */ 2950 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0); 2951 } 2952 return; 2953 } 2954 tcg_out_mov(s, type, a0, a1); 2955 tcg_out_shifti(s, SHIFT_SHL + rexw, a0, a2); 2956} 2957 2958static const TCGOutOpBinary outop_shl = { 2959 .base.static_constraint = C_Dynamic, 2960 .base.dynamic_constraint = cset_shift, 2961 .out_rrr = tgen_shl, 2962 .out_rri = tgen_shli, 2963}; 2964 2965static void tgen_shr(TCGContext *s, TCGType type, 2966 TCGReg a0, TCGReg a1, TCGReg a2) 2967{ 2968 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2969 if (have_bmi2) { 2970 tcg_out_vex_modrm(s, OPC_SHRX + rexw, a0, a2, a1); 2971 } else { 2972 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_SHR, a0); 2973 } 2974} 2975 2976static void tgen_shri(TCGContext *s, TCGType type, 2977 TCGReg a0, TCGReg a1, tcg_target_long a2) 2978{ 2979 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2980 2981 tcg_out_mov(s, type, a0, a1); 2982 tcg_out_shifti(s, SHIFT_SHR + rexw, a0, a2); 2983} 2984 2985static const TCGOutOpBinary outop_shr = { 2986 .base.static_constraint = C_Dynamic, 2987 .base.dynamic_constraint = cset_shift, 2988 .out_rrr = tgen_shr, 2989 .out_rri = tgen_shri, 2990}; 2991 2992static void tgen_sub(TCGContext *s, TCGType type, 2993 TCGReg a0, TCGReg a1, TCGReg a2) 2994{ 2995 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2996 tgen_arithr(s, ARITH_SUB + rexw, a0, a2); 2997} 2998 2999static const TCGOutOpSubtract outop_sub = { 3000 .base.static_constraint = C_O1_I2(r, 0, r), 3001 .out_rrr = tgen_sub, 3002}; 3003 3004static void tgen_xor(TCGContext *s, TCGType type, 3005 TCGReg a0, TCGReg a1, TCGReg a2) 3006{ 3007 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3008 tgen_arithr(s, ARITH_XOR + rexw, a0, a2); 3009} 3010 3011static void tgen_xori(TCGContext *s, TCGType type, 3012 TCGReg a0, TCGReg a1, tcg_target_long a2) 3013{ 3014 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 3015 tgen_arithi(s, ARITH_XOR + rexw, a0, a2, false); 3016} 3017 3018static const TCGOutOpBinary outop_xor = { 3019 .base.static_constraint = C_O1_I2(r, 0, re), 3020 .out_rrr = tgen_xor, 3021 .out_rri = tgen_xori, 3022}; 3023 3024static void tgen_neg(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 3025{ 3026 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3027 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0); 3028} 3029 3030static const TCGOutOpUnary outop_neg = { 3031 .base.static_constraint = C_O1_I1(r, 0), 3032 .out_rr = tgen_neg, 3033}; 3034 3035static void tgen_not(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 3036{ 3037 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3038 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0); 3039} 3040 3041static const TCGOutOpUnary outop_not = { 3042 .base.static_constraint = C_O1_I1(r, 0), 3043 .out_rr = tgen_not, 3044}; 3045 3046 3047static void tcg_out_op(TCGContext *s, TCGOpcode opc, TCGType type, 3048 const TCGArg args[TCG_MAX_OP_ARGS], 3049 const int const_args[TCG_MAX_OP_ARGS]) 3050{ 3051 TCGArg a0, a1, a2; 3052 int const_a2, rexw; 3053 3054#if TCG_TARGET_REG_BITS == 64 3055# define OP_32_64(x) \ 3056 case glue(glue(INDEX_op_, x), _i64): \ 3057 case glue(glue(INDEX_op_, x), _i32) 3058#else 3059# define OP_32_64(x) \ 3060 case glue(glue(INDEX_op_, x), _i32) 3061#endif 3062 3063 /* Hoist the loads of the most common arguments. */ 3064 a0 = args[0]; 3065 a1 = args[1]; 3066 a2 = args[2]; 3067 const_a2 = const_args[2]; 3068 rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3069 3070 switch (opc) { 3071 case INDEX_op_goto_ptr: 3072 /* jmp to the given host address (could be epilogue) */ 3073 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0); 3074 break; 3075 case INDEX_op_br: 3076 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0); 3077 break; 3078 OP_32_64(ld8u): 3079 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 3080 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2); 3081 break; 3082 OP_32_64(ld8s): 3083 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2); 3084 break; 3085 OP_32_64(ld16u): 3086 /* Note that we can ignore REXW for the zero-extend to 64-bit. 
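           Any write to a 32-bit register already zero-extends into the
           upper 32 bits on x86-64, so MOVZWL alone suffices.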
*/ 3087 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2); 3088 break; 3089 OP_32_64(ld16s): 3090 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2); 3091 break; 3092#if TCG_TARGET_REG_BITS == 64 3093 case INDEX_op_ld32u_i64: 3094#endif 3095 case INDEX_op_ld_i32: 3096 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2); 3097 break; 3098 3099 OP_32_64(st8): 3100 if (const_args[0]) { 3101 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2); 3102 tcg_out8(s, a0); 3103 } else { 3104 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2); 3105 } 3106 break; 3107 OP_32_64(st16): 3108 if (const_args[0]) { 3109 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2); 3110 tcg_out16(s, a0); 3111 } else { 3112 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2); 3113 } 3114 break; 3115#if TCG_TARGET_REG_BITS == 64 3116 case INDEX_op_st32_i64: 3117#endif 3118 case INDEX_op_st_i32: 3119 if (const_args[0]) { 3120 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2); 3121 tcg_out32(s, a0); 3122 } else { 3123 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2); 3124 } 3125 break; 3126 3127 OP_32_64(brcond): 3128 tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1], 3129 arg_label(args[3]), 0); 3130 break; 3131 OP_32_64(movcond): 3132 tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]); 3133 break; 3134 3135 OP_32_64(bswap16): 3136 if (a2 & TCG_BSWAP_OS) { 3137 /* Output must be sign-extended. */ 3138 if (rexw) { 3139 tcg_out_bswap64(s, a0); 3140 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48); 3141 } else { 3142 tcg_out_bswap32(s, a0); 3143 tcg_out_shifti(s, SHIFT_SAR, a0, 16); 3144 } 3145 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) { 3146 /* Output must be zero-extended, but input isn't. */ 3147 tcg_out_bswap32(s, a0); 3148 tcg_out_shifti(s, SHIFT_SHR, a0, 16); 3149 } else { 3150 tcg_out_rolw_8(s, a0); 3151 } 3152 break; 3153 OP_32_64(bswap32): 3154 tcg_out_bswap32(s, a0); 3155 if (rexw && (a2 & TCG_BSWAP_OS)) { 3156 tcg_out_ext32s(s, a0, a0); 3157 } 3158 break; 3159 3160 case INDEX_op_qemu_ld_i32: 3161 tcg_out_qemu_ld(s, a0, -1, a1, a2, TCG_TYPE_I32); 3162 break; 3163 case INDEX_op_qemu_ld_i64: 3164 if (TCG_TARGET_REG_BITS == 64) { 3165 tcg_out_qemu_ld(s, a0, -1, a1, a2, TCG_TYPE_I64); 3166 } else { 3167 tcg_out_qemu_ld(s, a0, a1, a2, args[3], TCG_TYPE_I64); 3168 } 3169 break; 3170 case INDEX_op_qemu_ld_i128: 3171 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3172 tcg_out_qemu_ld(s, a0, a1, a2, args[3], TCG_TYPE_I128); 3173 break; 3174 3175 case INDEX_op_qemu_st_i32: 3176 case INDEX_op_qemu_st8_i32: 3177 tcg_out_qemu_st(s, a0, -1, a1, a2, TCG_TYPE_I32); 3178 break; 3179 case INDEX_op_qemu_st_i64: 3180 if (TCG_TARGET_REG_BITS == 64) { 3181 tcg_out_qemu_st(s, a0, -1, a1, a2, TCG_TYPE_I64); 3182 } else { 3183 tcg_out_qemu_st(s, a0, a1, a2, args[3], TCG_TYPE_I64); 3184 } 3185 break; 3186 case INDEX_op_qemu_st_i128: 3187 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3188 tcg_out_qemu_st(s, a0, a1, a2, args[3], TCG_TYPE_I128); 3189 break; 3190 3191 OP_32_64(add2): 3192 if (const_args[4]) { 3193 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1); 3194 } else { 3195 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]); 3196 } 3197 if (const_args[5]) { 3198 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1); 3199 } else { 3200 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]); 3201 } 3202 break; 3203 OP_32_64(sub2): 3204 if (const_args[4]) { 3205 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1); 3206 } else { 3207 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]); 3208 } 3209 if (const_args[5]) { 3210 
tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1); 3211 } else { 3212 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]); 3213 } 3214 break; 3215 3216#if TCG_TARGET_REG_BITS == 32 3217 case INDEX_op_brcond2_i32: 3218 tcg_out_brcond2(s, args, const_args, 0); 3219 break; 3220 case INDEX_op_setcond2_i32: 3221 tcg_out_setcond2(s, args, const_args); 3222 break; 3223#else /* TCG_TARGET_REG_BITS == 64 */ 3224 case INDEX_op_ld32s_i64: 3225 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2); 3226 break; 3227 case INDEX_op_ld_i64: 3228 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2); 3229 break; 3230 case INDEX_op_st_i64: 3231 if (const_args[0]) { 3232 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2); 3233 tcg_out32(s, a0); 3234 } else { 3235 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2); 3236 } 3237 break; 3238 3239 case INDEX_op_bswap64_i64: 3240 tcg_out_bswap64(s, a0); 3241 break; 3242 case INDEX_op_extrh_i64_i32: 3243 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32); 3244 break; 3245#endif 3246 3247 OP_32_64(deposit): 3248 if (args[3] == 0 && args[4] == 8) { 3249 /* load bits 0..7 */ 3250 if (const_a2) { 3251 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0), 3252 0, a0, 0); 3253 tcg_out8(s, a2); 3254 } else { 3255 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0); 3256 } 3257 } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) { 3258 /* load bits 8..15 */ 3259 if (const_a2) { 3260 tcg_out8(s, OPC_MOVB_Ib + a0 + 4); 3261 tcg_out8(s, a2); 3262 } else { 3263 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4); 3264 } 3265 } else if (args[3] == 0 && args[4] == 16) { 3266 /* load bits 0..15 */ 3267 if (const_a2) { 3268 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0), 3269 0, a0, 0); 3270 tcg_out16(s, a2); 3271 } else { 3272 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0); 3273 } 3274 } else { 3275 g_assert_not_reached(); 3276 } 3277 break; 3278 3279 case INDEX_op_extract_i64: 3280 if (a2 + args[3] == 32) { 3281 if (a2 == 0) { 3282 tcg_out_ext32u(s, a0, a1); 3283 break; 3284 } 3285 /* This is a 32-bit zero-extending right shift. */ 3286 tcg_out_mov(s, TCG_TYPE_I32, a0, a1); 3287 tcg_out_shifti(s, SHIFT_SHR, a0, a2); 3288 break; 3289 } 3290 /* FALLTHRU */ 3291 case INDEX_op_extract_i32: 3292 if (a2 == 0 && args[3] == 8) { 3293 tcg_out_ext8u(s, a0, a1); 3294 } else if (a2 == 0 && args[3] == 16) { 3295 tcg_out_ext16u(s, a0, a1); 3296 } else if (a2 == 8 && args[3] == 8) { 3297 /* 3298 * On the off-chance that we can use the high-byte registers. 3299 * Otherwise we emit the same ext16 + shift pattern that we 3300 * would have gotten from the normal tcg-op.c expansion. 
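             * The check a1 < 4 limits the source to EAX/ECX/EDX/EBX, whose
             * high bytes (%ah and friends) are encodable; a0 < 8 avoids a
             * REX prefix, which would turn that encoding into %spl/%bpl/
             * %sil/%dil instead.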
3301 */ 3302 if (a1 < 4 && a0 < 8) { 3303 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4); 3304 } else { 3305 tcg_out_ext16u(s, a0, a1); 3306 tcg_out_shifti(s, SHIFT_SHR, a0, 8); 3307 } 3308 } else { 3309 g_assert_not_reached(); 3310 } 3311 break; 3312 3313 case INDEX_op_sextract_i64: 3314 if (a2 == 0 && args[3] == 8) { 3315 tcg_out_ext8s(s, TCG_TYPE_I64, a0, a1); 3316 } else if (a2 == 0 && args[3] == 16) { 3317 tcg_out_ext16s(s, TCG_TYPE_I64, a0, a1); 3318 } else if (a2 == 0 && args[3] == 32) { 3319 tcg_out_ext32s(s, a0, a1); 3320 } else { 3321 g_assert_not_reached(); 3322 } 3323 break; 3324 3325 case INDEX_op_sextract_i32: 3326 if (a2 == 0 && args[3] == 8) { 3327 tcg_out_ext8s(s, TCG_TYPE_I32, a0, a1); 3328 } else if (a2 == 0 && args[3] == 16) { 3329 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 3330 } else if (a2 == 8 && args[3] == 8) { 3331 if (a1 < 4 && a0 < 8) { 3332 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4); 3333 } else { 3334 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 3335 tcg_out_shifti(s, SHIFT_SAR, a0, 8); 3336 } 3337 } else { 3338 g_assert_not_reached(); 3339 } 3340 break; 3341 3342 OP_32_64(extract2): 3343 /* Note that SHRD outputs to the r/m operand. */ 3344 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0); 3345 tcg_out8(s, args[3]); 3346 break; 3347 3348 case INDEX_op_mb: 3349 tcg_out_mb(s, a0); 3350 break; 3351 case INDEX_op_call: /* Always emitted via tcg_out_call. */ 3352 case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */ 3353 case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */ 3354 case INDEX_op_ext_i32_i64: /* Always emitted via tcg_reg_alloc_op. */ 3355 case INDEX_op_extu_i32_i64: 3356 case INDEX_op_extrl_i64_i32: 3357 default: 3358 g_assert_not_reached(); 3359 } 3360 3361#undef OP_32_64 3362} 3363 3364static int const umin_insn[4] = { 3365 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ 3366}; 3367 3368static int const umax_insn[4] = { 3369 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ 3370}; 3371 3372static bool tcg_out_cmp_vec_noinv(TCGContext *s, TCGType type, unsigned vece, 3373 TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond) 3374{ 3375 static int const cmpeq_insn[4] = { 3376 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ 3377 }; 3378 static int const cmpgt_insn[4] = { 3379 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ 3380 }; 3381 3382 enum { 3383 NEED_INV = 1, 3384 NEED_SWAP = 2, 3385 NEED_UMIN = 4, 3386 NEED_UMAX = 8, 3387 INVALID = 16, 3388 }; 3389 static const uint8_t cond_fixup[16] = { 3390 [0 ... 15] = INVALID, 3391 [TCG_COND_EQ] = 0, 3392 [TCG_COND_GT] = 0, 3393 [TCG_COND_NE] = NEED_INV, 3394 [TCG_COND_LE] = NEED_INV, 3395 [TCG_COND_LT] = NEED_SWAP, 3396 [TCG_COND_GE] = NEED_SWAP | NEED_INV, 3397 [TCG_COND_LEU] = NEED_UMIN, 3398 [TCG_COND_GTU] = NEED_UMIN | NEED_INV, 3399 [TCG_COND_GEU] = NEED_UMAX, 3400 [TCG_COND_LTU] = NEED_UMAX | NEED_INV, 3401 }; 3402 int fixup = cond_fixup[cond]; 3403 3404 assert(!(fixup & INVALID)); 3405 3406 if (fixup & NEED_INV) { 3407 cond = tcg_invert_cond(cond); 3408 } 3409 3410 if (fixup & NEED_SWAP) { 3411 TCGReg swap = v1; 3412 v1 = v2; 3413 v2 = swap; 3414 cond = tcg_swap_cond(cond); 3415 } 3416 3417 if (fixup & (NEED_UMIN | NEED_UMAX)) { 3418 int op = (fixup & NEED_UMIN ? umin_insn[vece] : umax_insn[vece]); 3419 3420 /* avx2 does not have 64-bit min/max; adjusted during expand. 
         */
        assert(vece <= MO_32);

        tcg_out_vex_modrm_type(s, op, TCG_TMP_VEC, v1, v2, type);
        v2 = TCG_TMP_VEC;
        cond = TCG_COND_EQ;
    }

    switch (cond) {
    case TCG_COND_EQ:
        tcg_out_vex_modrm_type(s, cmpeq_insn[vece], v0, v1, v2, type);
        break;
    case TCG_COND_GT:
        tcg_out_vex_modrm_type(s, cmpgt_insn[vece], v0, v1, v2, type);
        break;
    default:
        g_assert_not_reached();
    }
    return fixup & NEED_INV;
}

static void tcg_out_cmp_vec_k1(TCGContext *s, TCGType type, unsigned vece,
                               TCGReg v1, TCGReg v2, TCGCond cond)
{
    static const int cmpm_insn[2][4] = {
        { OPC_VPCMPB, OPC_VPCMPW, OPC_VPCMPD, OPC_VPCMPQ },
        { OPC_VPCMPUB, OPC_VPCMPUW, OPC_VPCMPUD, OPC_VPCMPUQ }
    };
    static const int testm_insn[4] = {
        OPC_VPTESTMB, OPC_VPTESTMW, OPC_VPTESTMD, OPC_VPTESTMQ
    };
    static const int testnm_insn[4] = {
        OPC_VPTESTNMB, OPC_VPTESTNMW, OPC_VPTESTNMD, OPC_VPTESTNMQ
    };

    static const int cond_ext[16] = {
        [TCG_COND_EQ] = 0,
        [TCG_COND_NE] = 4,
        [TCG_COND_LT] = 1,
        [TCG_COND_LTU] = 1,
        [TCG_COND_LE] = 2,
        [TCG_COND_LEU] = 2,
        [TCG_COND_NEVER] = 3,
        [TCG_COND_GE] = 5,
        [TCG_COND_GEU] = 5,
        [TCG_COND_GT] = 6,
        [TCG_COND_GTU] = 6,
        [TCG_COND_ALWAYS] = 7,
    };

    switch (cond) {
    case TCG_COND_TSTNE:
        tcg_out_vex_modrm_type(s, testm_insn[vece], /* k1 */ 1, v1, v2, type);
        break;
    case TCG_COND_TSTEQ:
        tcg_out_vex_modrm_type(s, testnm_insn[vece], /* k1 */ 1, v1, v2, type);
        break;
    default:
        tcg_out_vex_modrm_type(s, cmpm_insn[is_unsigned_cond(cond)][vece],
                               /* k1 */ 1, v1, v2, type);
        tcg_out8(s, cond_ext[cond]);
        break;
    }
}

static void tcg_out_k1_to_vec(TCGContext *s, TCGType type,
                              unsigned vece, TCGReg dest)
{
    static const int movm_insn[] = {
        OPC_VPMOVM2B, OPC_VPMOVM2W, OPC_VPMOVM2D, OPC_VPMOVM2Q
    };
    tcg_out_vex_modrm_type(s, movm_insn[vece], dest, 0, /* k1 */ 1, type);
}

static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond)
{
    /*
     * With avx512, we have a complete set of comparisons into mask.
     * Unless there's a single insn expansion for the comparison,
     * expand via a mask in k1.
     */
    if ((vece <= MO_16 ? have_avx512bw : have_avx512dq)
        && cond != TCG_COND_EQ
        && cond != TCG_COND_LT
        && cond != TCG_COND_GT) {
        tcg_out_cmp_vec_k1(s, type, vece, v1, v2, cond);
        tcg_out_k1_to_vec(s, type, vece, v0);
        return;
    }

    if (tcg_out_cmp_vec_noinv(s, type, vece, v0, v1, v2, cond)) {
        tcg_out_dupi_vec(s, type, vece, TCG_TMP_VEC, -1);
        tcg_out_vex_modrm_type(s, OPC_PXOR, v0, v0, TCG_TMP_VEC, type);
    }
}

static void tcg_out_cmpsel_vec_k1(TCGContext *s, TCGType type, unsigned vece,
                                  TCGReg v0, TCGReg c1, TCGReg c2,
                                  TCGReg v3, TCGReg v4, TCGCond cond)
{
    static const int vpblendm_insn[] = {
        OPC_VPBLENDMB, OPC_VPBLENDMW, OPC_VPBLENDMD, OPC_VPBLENDMQ
    };
    bool z = false;

    /* Swap to place constant in V4 to take advantage of zero-masking.
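       With the EVEX zero-masking form of VPBLENDM, lanes whose mask bit is
       clear are zeroed rather than taken from a source register, so the
       constant-zero operand needs no register of its own; inverting the
       condition lets the remaining register operand supply the other lanes.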
*/ 3527 if (!v3) { 3528 z = true; 3529 v3 = v4; 3530 cond = tcg_invert_cond(cond); 3531 } 3532 3533 tcg_out_cmp_vec_k1(s, type, vece, c1, c2, cond); 3534 tcg_out_evex_modrm_type(s, vpblendm_insn[vece], v0, v4, v3, 3535 /* k1 */1, z, type); 3536} 3537 3538static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece, 3539 TCGReg v0, TCGReg c1, TCGReg c2, 3540 TCGReg v3, TCGReg v4, TCGCond cond) 3541{ 3542 bool inv; 3543 3544 if (vece <= MO_16 ? have_avx512bw : have_avx512vl) { 3545 tcg_out_cmpsel_vec_k1(s, type, vece, v0, c1, c2, v3, v4, cond); 3546 return; 3547 } 3548 3549 inv = tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond); 3550 3551 /* 3552 * Since XMM0 is 16, the only way we get 0 into V3 3553 * is via the constant zero constraint. 3554 */ 3555 if (!v3) { 3556 if (inv) { 3557 tcg_out_vex_modrm_type(s, OPC_PAND, v0, TCG_TMP_VEC, v4, type); 3558 } else { 3559 tcg_out_vex_modrm_type(s, OPC_PANDN, v0, TCG_TMP_VEC, v4, type); 3560 } 3561 } else { 3562 if (inv) { 3563 TCGReg swap = v3; 3564 v3 = v4; 3565 v4 = swap; 3566 } 3567 tcg_out_vex_modrm_type(s, OPC_VPBLENDVB, v0, v4, v3, type); 3568 tcg_out8(s, (TCG_TMP_VEC - TCG_REG_XMM0) << 4); 3569 } 3570} 3571 3572static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, 3573 unsigned vecl, unsigned vece, 3574 const TCGArg args[TCG_MAX_OP_ARGS], 3575 const int const_args[TCG_MAX_OP_ARGS]) 3576{ 3577 static int const add_insn[4] = { 3578 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ 3579 }; 3580 static int const ssadd_insn[4] = { 3581 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 3582 }; 3583 static int const usadd_insn[4] = { 3584 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2 3585 }; 3586 static int const sub_insn[4] = { 3587 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ 3588 }; 3589 static int const sssub_insn[4] = { 3590 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 3591 }; 3592 static int const ussub_insn[4] = { 3593 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2 3594 }; 3595 static int const mul_insn[4] = { 3596 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ 3597 }; 3598 static int const shift_imm_insn[4] = { 3599 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib 3600 }; 3601 static int const punpckl_insn[4] = { 3602 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ 3603 }; 3604 static int const punpckh_insn[4] = { 3605 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ 3606 }; 3607 static int const packss_insn[4] = { 3608 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 3609 }; 3610 static int const packus_insn[4] = { 3611 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 3612 }; 3613 static int const smin_insn[4] = { 3614 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ 3615 }; 3616 static int const smax_insn[4] = { 3617 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ 3618 }; 3619 static int const rotlv_insn[4] = { 3620 OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ 3621 }; 3622 static int const rotrv_insn[4] = { 3623 OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ 3624 }; 3625 static int const shlv_insn[4] = { 3626 OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ 3627 }; 3628 static int const shrv_insn[4] = { 3629 OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ 3630 }; 3631 static int const sarv_insn[4] = { 3632 OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ 3633 }; 3634 static int const shls_insn[4] = { 3635 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ 3636 }; 3637 static int const shrs_insn[4] = { 3638 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ 3639 }; 3640 static int const sars_insn[4] = { 3641 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ 3642 
}; 3643 static int const vpshldi_insn[4] = { 3644 OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ 3645 }; 3646 static int const vpshldv_insn[4] = { 3647 OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ 3648 }; 3649 static int const vpshrdv_insn[4] = { 3650 OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ 3651 }; 3652 static int const abs_insn[4] = { 3653 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ 3654 }; 3655 3656 TCGType type = vecl + TCG_TYPE_V64; 3657 int insn, sub; 3658 TCGArg a0, a1, a2, a3; 3659 3660 a0 = args[0]; 3661 a1 = args[1]; 3662 a2 = args[2]; 3663 3664 switch (opc) { 3665 case INDEX_op_add_vec: 3666 insn = add_insn[vece]; 3667 goto gen_simd; 3668 case INDEX_op_ssadd_vec: 3669 insn = ssadd_insn[vece]; 3670 goto gen_simd; 3671 case INDEX_op_usadd_vec: 3672 insn = usadd_insn[vece]; 3673 goto gen_simd; 3674 case INDEX_op_sub_vec: 3675 insn = sub_insn[vece]; 3676 goto gen_simd; 3677 case INDEX_op_sssub_vec: 3678 insn = sssub_insn[vece]; 3679 goto gen_simd; 3680 case INDEX_op_ussub_vec: 3681 insn = ussub_insn[vece]; 3682 goto gen_simd; 3683 case INDEX_op_mul_vec: 3684 insn = mul_insn[vece]; 3685 goto gen_simd; 3686 case INDEX_op_and_vec: 3687 insn = OPC_PAND; 3688 goto gen_simd; 3689 case INDEX_op_or_vec: 3690 insn = OPC_POR; 3691 goto gen_simd; 3692 case INDEX_op_xor_vec: 3693 insn = OPC_PXOR; 3694 goto gen_simd; 3695 case INDEX_op_smin_vec: 3696 insn = smin_insn[vece]; 3697 goto gen_simd; 3698 case INDEX_op_umin_vec: 3699 insn = umin_insn[vece]; 3700 goto gen_simd; 3701 case INDEX_op_smax_vec: 3702 insn = smax_insn[vece]; 3703 goto gen_simd; 3704 case INDEX_op_umax_vec: 3705 insn = umax_insn[vece]; 3706 goto gen_simd; 3707 case INDEX_op_shlv_vec: 3708 insn = shlv_insn[vece]; 3709 goto gen_simd; 3710 case INDEX_op_shrv_vec: 3711 insn = shrv_insn[vece]; 3712 goto gen_simd; 3713 case INDEX_op_sarv_vec: 3714 insn = sarv_insn[vece]; 3715 goto gen_simd; 3716 case INDEX_op_rotlv_vec: 3717 insn = rotlv_insn[vece]; 3718 goto gen_simd; 3719 case INDEX_op_rotrv_vec: 3720 insn = rotrv_insn[vece]; 3721 goto gen_simd; 3722 case INDEX_op_shls_vec: 3723 insn = shls_insn[vece]; 3724 goto gen_simd; 3725 case INDEX_op_shrs_vec: 3726 insn = shrs_insn[vece]; 3727 goto gen_simd; 3728 case INDEX_op_sars_vec: 3729 insn = sars_insn[vece]; 3730 goto gen_simd; 3731 case INDEX_op_x86_punpckl_vec: 3732 insn = punpckl_insn[vece]; 3733 goto gen_simd; 3734 case INDEX_op_x86_punpckh_vec: 3735 insn = punpckh_insn[vece]; 3736 goto gen_simd; 3737 case INDEX_op_x86_packss_vec: 3738 insn = packss_insn[vece]; 3739 goto gen_simd; 3740 case INDEX_op_x86_packus_vec: 3741 insn = packus_insn[vece]; 3742 goto gen_simd; 3743 case INDEX_op_x86_vpshldv_vec: 3744 insn = vpshldv_insn[vece]; 3745 a1 = a2; 3746 a2 = args[3]; 3747 goto gen_simd; 3748 case INDEX_op_x86_vpshrdv_vec: 3749 insn = vpshrdv_insn[vece]; 3750 a1 = a2; 3751 a2 = args[3]; 3752 goto gen_simd; 3753#if TCG_TARGET_REG_BITS == 32 3754 case INDEX_op_dup2_vec: 3755 /* First merge the two 32-bit inputs to a single 64-bit element. */ 3756 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2); 3757 /* Then replicate the 64-bit elements across the rest of the vector. 
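           For a V64 vector the single element already fills the vector,
           so there is nothing further to do.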
*/ 3758 if (type != TCG_TYPE_V64) { 3759 tcg_out_dup_vec(s, type, MO_64, a0, a0); 3760 } 3761 break; 3762#endif 3763 case INDEX_op_abs_vec: 3764 insn = abs_insn[vece]; 3765 a2 = a1; 3766 a1 = 0; 3767 goto gen_simd; 3768 gen_simd: 3769 tcg_debug_assert(insn != OPC_UD2); 3770 tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type); 3771 break; 3772 3773 case INDEX_op_cmp_vec: 3774 tcg_out_cmp_vec(s, type, vece, a0, a1, a2, args[3]); 3775 break; 3776 3777 case INDEX_op_cmpsel_vec: 3778 tcg_out_cmpsel_vec(s, type, vece, a0, a1, a2, 3779 args[3], args[4], args[5]); 3780 break; 3781 3782 case INDEX_op_andc_vec: 3783 insn = OPC_PANDN; 3784 tcg_out_vex_modrm_type(s, insn, a0, a2, a1, type); 3785 break; 3786 3787 case INDEX_op_shli_vec: 3788 insn = shift_imm_insn[vece]; 3789 sub = 6; 3790 goto gen_shift; 3791 case INDEX_op_shri_vec: 3792 insn = shift_imm_insn[vece]; 3793 sub = 2; 3794 goto gen_shift; 3795 case INDEX_op_sari_vec: 3796 if (vece == MO_64) { 3797 insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX; 3798 } else { 3799 insn = shift_imm_insn[vece]; 3800 } 3801 sub = 4; 3802 goto gen_shift; 3803 case INDEX_op_rotli_vec: 3804 insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ 3805 if (vece == MO_64) { 3806 insn |= P_VEXW; 3807 } 3808 sub = 1; 3809 goto gen_shift; 3810 gen_shift: 3811 tcg_debug_assert(vece != MO_8); 3812 tcg_out_vex_modrm_type(s, insn, sub, a0, a1, type); 3813 tcg_out8(s, a2); 3814 break; 3815 3816 case INDEX_op_ld_vec: 3817 tcg_out_ld(s, type, a0, a1, a2); 3818 break; 3819 case INDEX_op_st_vec: 3820 tcg_out_st(s, type, a0, a1, a2); 3821 break; 3822 case INDEX_op_dupm_vec: 3823 tcg_out_dupm_vec(s, type, vece, a0, a1, a2); 3824 break; 3825 3826 case INDEX_op_x86_shufps_vec: 3827 insn = OPC_SHUFPS; 3828 sub = args[3]; 3829 goto gen_simd_imm8; 3830 case INDEX_op_x86_blend_vec: 3831 if (vece == MO_16) { 3832 insn = OPC_PBLENDW; 3833 } else if (vece == MO_32) { 3834 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS); 3835 } else { 3836 g_assert_not_reached(); 3837 } 3838 sub = args[3]; 3839 goto gen_simd_imm8; 3840 case INDEX_op_x86_vperm2i128_vec: 3841 insn = OPC_VPERM2I128; 3842 sub = args[3]; 3843 goto gen_simd_imm8; 3844 case INDEX_op_x86_vpshldi_vec: 3845 insn = vpshldi_insn[vece]; 3846 sub = args[3]; 3847 goto gen_simd_imm8; 3848 3849 case INDEX_op_not_vec: 3850 insn = OPC_VPTERNLOGQ; 3851 a2 = a1; 3852 sub = 0x33; /* !B */ 3853 goto gen_simd_imm8; 3854 case INDEX_op_nor_vec: 3855 insn = OPC_VPTERNLOGQ; 3856 sub = 0x11; /* norCB */ 3857 goto gen_simd_imm8; 3858 case INDEX_op_nand_vec: 3859 insn = OPC_VPTERNLOGQ; 3860 sub = 0x77; /* nandCB */ 3861 goto gen_simd_imm8; 3862 case INDEX_op_eqv_vec: 3863 insn = OPC_VPTERNLOGQ; 3864 sub = 0x99; /* xnorCB */ 3865 goto gen_simd_imm8; 3866 case INDEX_op_orc_vec: 3867 insn = OPC_VPTERNLOGQ; 3868 sub = 0xdd; /* orB!C */ 3869 goto gen_simd_imm8; 3870 3871 case INDEX_op_bitsel_vec: 3872 insn = OPC_VPTERNLOGQ; 3873 a3 = args[3]; 3874 if (a0 == a1) { 3875 a1 = a2; 3876 a2 = a3; 3877 sub = 0xca; /* A?B:C */ 3878 } else if (a0 == a2) { 3879 a2 = a3; 3880 sub = 0xe2; /* B?A:C */ 3881 } else { 3882 tcg_out_mov(s, type, a0, a3); 3883 sub = 0xb8; /* B?C:A */ 3884 } 3885 goto gen_simd_imm8; 3886 3887 gen_simd_imm8: 3888 tcg_debug_assert(insn != OPC_UD2); 3889 tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type); 3890 tcg_out8(s, sub); 3891 break; 3892 3893 case INDEX_op_x86_psrldq_vec: 3894 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1); 3895 tcg_out8(s, a2); 3896 break; 3897 3898 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. 
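     * (Likewise INDEX_op_dup_vec just below.  Both are emitted directly by
     * the generic code via tcg_out_mov and tcg_out_dup_vec, so reaching
     * either case here, or the default, indicates an internal error.)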
*/ 3899 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */ 3900 default: 3901 g_assert_not_reached(); 3902 } 3903} 3904 3905static TCGConstraintSetIndex 3906tcg_target_op_def(TCGOpcode op, TCGType type, unsigned flags) 3907{ 3908 switch (op) { 3909 case INDEX_op_goto_ptr: 3910 return C_O0_I1(r); 3911 3912 case INDEX_op_ld8u_i32: 3913 case INDEX_op_ld8u_i64: 3914 case INDEX_op_ld8s_i32: 3915 case INDEX_op_ld8s_i64: 3916 case INDEX_op_ld16u_i32: 3917 case INDEX_op_ld16u_i64: 3918 case INDEX_op_ld16s_i32: 3919 case INDEX_op_ld16s_i64: 3920 case INDEX_op_ld_i32: 3921 case INDEX_op_ld32u_i64: 3922 case INDEX_op_ld32s_i64: 3923 case INDEX_op_ld_i64: 3924 return C_O1_I1(r, r); 3925 3926 case INDEX_op_st8_i32: 3927 case INDEX_op_st8_i64: 3928 return C_O0_I2(qi, r); 3929 3930 case INDEX_op_st16_i32: 3931 case INDEX_op_st16_i64: 3932 case INDEX_op_st_i32: 3933 case INDEX_op_st32_i64: 3934 return C_O0_I2(ri, r); 3935 3936 case INDEX_op_st_i64: 3937 return C_O0_I2(re, r); 3938 3939 case INDEX_op_brcond_i32: 3940 case INDEX_op_brcond_i64: 3941 return C_O0_I2(r, reT); 3942 3943 case INDEX_op_bswap16_i32: 3944 case INDEX_op_bswap16_i64: 3945 case INDEX_op_bswap32_i32: 3946 case INDEX_op_bswap32_i64: 3947 case INDEX_op_bswap64_i64: 3948 case INDEX_op_extrh_i64_i32: 3949 return C_O1_I1(r, 0); 3950 3951 case INDEX_op_ext_i32_i64: 3952 case INDEX_op_extu_i32_i64: 3953 case INDEX_op_extrl_i64_i32: 3954 case INDEX_op_extract_i32: 3955 case INDEX_op_extract_i64: 3956 case INDEX_op_sextract_i32: 3957 case INDEX_op_sextract_i64: 3958 return C_O1_I1(r, r); 3959 3960 case INDEX_op_extract2_i32: 3961 case INDEX_op_extract2_i64: 3962 return C_O1_I2(r, 0, r); 3963 3964 case INDEX_op_deposit_i32: 3965 case INDEX_op_deposit_i64: 3966 return C_O1_I2(q, 0, qi); 3967 3968 case INDEX_op_movcond_i32: 3969 case INDEX_op_movcond_i64: 3970 return C_O1_I4(r, r, reT, r, 0); 3971 3972 case INDEX_op_add2_i32: 3973 case INDEX_op_add2_i64: 3974 case INDEX_op_sub2_i32: 3975 case INDEX_op_sub2_i64: 3976 return C_N1_O1_I4(r, r, 0, 1, re, re); 3977 3978 case INDEX_op_qemu_ld_i32: 3979 return C_O1_I1(r, L); 3980 3981 case INDEX_op_qemu_st_i32: 3982 return C_O0_I2(L, L); 3983 case INDEX_op_qemu_st8_i32: 3984 return C_O0_I2(s, L); 3985 3986 case INDEX_op_qemu_ld_i64: 3987 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L); 3988 3989 case INDEX_op_qemu_st_i64: 3990 return TCG_TARGET_REG_BITS == 64 ? 
C_O0_I2(L, L) : C_O0_I3(L, L, L); 3991 3992 case INDEX_op_qemu_ld_i128: 3993 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3994 return C_O2_I1(r, r, L); 3995 case INDEX_op_qemu_st_i128: 3996 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3997 return C_O0_I3(L, L, L); 3998 3999 case INDEX_op_brcond2_i32: 4000 return C_O0_I4(r, r, ri, ri); 4001 4002 case INDEX_op_setcond2_i32: 4003 return C_O1_I4(r, r, r, ri, ri); 4004 4005 case INDEX_op_ld_vec: 4006 case INDEX_op_dupm_vec: 4007 return C_O1_I1(x, r); 4008 4009 case INDEX_op_st_vec: 4010 return C_O0_I2(x, r); 4011 4012 case INDEX_op_add_vec: 4013 case INDEX_op_sub_vec: 4014 case INDEX_op_mul_vec: 4015 case INDEX_op_and_vec: 4016 case INDEX_op_or_vec: 4017 case INDEX_op_xor_vec: 4018 case INDEX_op_andc_vec: 4019 case INDEX_op_orc_vec: 4020 case INDEX_op_nand_vec: 4021 case INDEX_op_nor_vec: 4022 case INDEX_op_eqv_vec: 4023 case INDEX_op_ssadd_vec: 4024 case INDEX_op_usadd_vec: 4025 case INDEX_op_sssub_vec: 4026 case INDEX_op_ussub_vec: 4027 case INDEX_op_smin_vec: 4028 case INDEX_op_umin_vec: 4029 case INDEX_op_smax_vec: 4030 case INDEX_op_umax_vec: 4031 case INDEX_op_shlv_vec: 4032 case INDEX_op_shrv_vec: 4033 case INDEX_op_sarv_vec: 4034 case INDEX_op_rotlv_vec: 4035 case INDEX_op_rotrv_vec: 4036 case INDEX_op_shls_vec: 4037 case INDEX_op_shrs_vec: 4038 case INDEX_op_sars_vec: 4039 case INDEX_op_cmp_vec: 4040 case INDEX_op_x86_shufps_vec: 4041 case INDEX_op_x86_blend_vec: 4042 case INDEX_op_x86_packss_vec: 4043 case INDEX_op_x86_packus_vec: 4044 case INDEX_op_x86_vperm2i128_vec: 4045 case INDEX_op_x86_punpckl_vec: 4046 case INDEX_op_x86_punpckh_vec: 4047 case INDEX_op_x86_vpshldi_vec: 4048#if TCG_TARGET_REG_BITS == 32 4049 case INDEX_op_dup2_vec: 4050#endif 4051 return C_O1_I2(x, x, x); 4052 4053 case INDEX_op_abs_vec: 4054 case INDEX_op_dup_vec: 4055 case INDEX_op_not_vec: 4056 case INDEX_op_shli_vec: 4057 case INDEX_op_shri_vec: 4058 case INDEX_op_sari_vec: 4059 case INDEX_op_rotli_vec: 4060 case INDEX_op_x86_psrldq_vec: 4061 return C_O1_I1(x, x); 4062 4063 case INDEX_op_x86_vpshldv_vec: 4064 case INDEX_op_x86_vpshrdv_vec: 4065 return C_O1_I3(x, 0, x, x); 4066 4067 case INDEX_op_bitsel_vec: 4068 return C_O1_I3(x, x, x, x); 4069 case INDEX_op_cmpsel_vec: 4070 return C_O1_I4(x, x, x, xO, x); 4071 4072 default: 4073 return C_NotImplemented; 4074 } 4075} 4076 4077int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) 4078{ 4079 switch (opc) { 4080 case INDEX_op_add_vec: 4081 case INDEX_op_sub_vec: 4082 case INDEX_op_and_vec: 4083 case INDEX_op_or_vec: 4084 case INDEX_op_xor_vec: 4085 case INDEX_op_andc_vec: 4086 case INDEX_op_orc_vec: 4087 case INDEX_op_nand_vec: 4088 case INDEX_op_nor_vec: 4089 case INDEX_op_eqv_vec: 4090 case INDEX_op_not_vec: 4091 case INDEX_op_bitsel_vec: 4092 return 1; 4093 case INDEX_op_cmp_vec: 4094 case INDEX_op_cmpsel_vec: 4095 return -1; 4096 4097 case INDEX_op_rotli_vec: 4098 return have_avx512vl && vece >= MO_32 ? 1 : -1; 4099 4100 case INDEX_op_shli_vec: 4101 case INDEX_op_shri_vec: 4102 /* We must expand the operation for MO_8. */ 4103 return vece == MO_8 ? -1 : 1; 4104 4105 case INDEX_op_sari_vec: 4106 switch (vece) { 4107 case MO_8: 4108 return -1; 4109 case MO_16: 4110 case MO_32: 4111 return 1; 4112 case MO_64: 4113 if (have_avx512vl) { 4114 return 1; 4115 } 4116 /* 4117 * We can emulate this for MO_64, but it does not pay off 4118 * unless we're producing at least 4 values. 4119 */ 4120 return type >= TCG_TYPE_V256 ? 
-1 : 0; 4121 } 4122 return 0; 4123 4124 case INDEX_op_shls_vec: 4125 case INDEX_op_shrs_vec: 4126 return vece >= MO_16; 4127 case INDEX_op_sars_vec: 4128 switch (vece) { 4129 case MO_16: 4130 case MO_32: 4131 return 1; 4132 case MO_64: 4133 return have_avx512vl; 4134 } 4135 return 0; 4136 case INDEX_op_rotls_vec: 4137 return vece >= MO_16 ? -1 : 0; 4138 4139 case INDEX_op_shlv_vec: 4140 case INDEX_op_shrv_vec: 4141 switch (vece) { 4142 case MO_16: 4143 return have_avx512bw; 4144 case MO_32: 4145 case MO_64: 4146 return have_avx2; 4147 } 4148 return 0; 4149 case INDEX_op_sarv_vec: 4150 switch (vece) { 4151 case MO_16: 4152 return have_avx512bw; 4153 case MO_32: 4154 return have_avx2; 4155 case MO_64: 4156 return have_avx512vl; 4157 } 4158 return 0; 4159 case INDEX_op_rotlv_vec: 4160 case INDEX_op_rotrv_vec: 4161 switch (vece) { 4162 case MO_16: 4163 return have_avx512vbmi2 ? -1 : 0; 4164 case MO_32: 4165 case MO_64: 4166 return have_avx512vl ? 1 : have_avx2 ? -1 : 0; 4167 } 4168 return 0; 4169 4170 case INDEX_op_mul_vec: 4171 switch (vece) { 4172 case MO_8: 4173 return -1; 4174 case MO_64: 4175 return have_avx512dq; 4176 } 4177 return 1; 4178 4179 case INDEX_op_ssadd_vec: 4180 case INDEX_op_usadd_vec: 4181 case INDEX_op_sssub_vec: 4182 case INDEX_op_ussub_vec: 4183 return vece <= MO_16; 4184 case INDEX_op_smin_vec: 4185 case INDEX_op_smax_vec: 4186 case INDEX_op_umin_vec: 4187 case INDEX_op_umax_vec: 4188 case INDEX_op_abs_vec: 4189 return vece <= MO_32 || have_avx512vl; 4190 4191 default: 4192 return 0; 4193 } 4194} 4195 4196static void expand_vec_shi(TCGType type, unsigned vece, bool right, 4197 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 4198{ 4199 uint8_t mask; 4200 4201 tcg_debug_assert(vece == MO_8); 4202 if (right) { 4203 mask = 0xff >> imm; 4204 tcg_gen_shri_vec(MO_16, v0, v1, imm); 4205 } else { 4206 mask = 0xff << imm; 4207 tcg_gen_shli_vec(MO_16, v0, v1, imm); 4208 } 4209 tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask)); 4210} 4211 4212static void expand_vec_sari(TCGType type, unsigned vece, 4213 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 4214{ 4215 TCGv_vec t1, t2; 4216 4217 switch (vece) { 4218 case MO_8: 4219 /* Unpack to 16-bit, shift, and repack. */ 4220 t1 = tcg_temp_new_vec(type); 4221 t2 = tcg_temp_new_vec(type); 4222 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 4223 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 4224 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 4225 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 4226 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8); 4227 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8); 4228 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, 4229 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 4230 tcg_temp_free_vec(t1); 4231 tcg_temp_free_vec(t2); 4232 break; 4233 4234 case MO_64: 4235 t1 = tcg_temp_new_vec(type); 4236 if (imm <= 32) { 4237 /* 4238 * We can emulate a small sign extend by performing an arithmetic 4239 * 32-bit shift and overwriting the high half of a 64-bit logical 4240 * shift. Note that the ISA says shift of 32 is valid, but TCG 4241 * does not, so we have to bound the smaller shift -- we get the 4242 * same result in the high half either way. 4243 */ 4244 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31)); 4245 tcg_gen_shri_vec(MO_64, v0, v1, imm); 4246 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32, 4247 tcgv_vec_arg(v0), tcgv_vec_arg(v0), 4248 tcgv_vec_arg(t1), 0xaa); 4249 } else { 4250 /* Otherwise we will need to use a compare vs 0 to produce 4251 * the sign-extend, shift and merge. 
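             * For illustration only, take imm = 40: the compare sets t1 to
             * all-ones for negative elements and zero otherwise, v0 gets the
             * logical shift v1 >> 40, and t1 << 24 re-creates the 40 sign
             * bits that the logical shift discarded, so the final OR equals
             * an arithmetic shift right by 40.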
4252 */ 4253 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, 4254 tcg_constant_vec(type, MO_64, 0), v1); 4255 tcg_gen_shri_vec(MO_64, v0, v1, imm); 4256 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm); 4257 tcg_gen_or_vec(MO_64, v0, v0, t1); 4258 } 4259 tcg_temp_free_vec(t1); 4260 break; 4261 4262 default: 4263 g_assert_not_reached(); 4264 } 4265} 4266 4267static void expand_vec_rotli(TCGType type, unsigned vece, 4268 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 4269{ 4270 TCGv_vec t; 4271 4272 if (vece != MO_8 && have_avx512vbmi2) { 4273 vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece, 4274 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm); 4275 return; 4276 } 4277 4278 t = tcg_temp_new_vec(type); 4279 tcg_gen_shli_vec(vece, t, v1, imm); 4280 tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm); 4281 tcg_gen_or_vec(vece, v0, v0, t); 4282 tcg_temp_free_vec(t); 4283} 4284 4285static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0, 4286 TCGv_vec v1, TCGv_vec sh, bool right) 4287{ 4288 TCGv_vec t; 4289 4290 if (have_avx512vbmi2) { 4291 vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec, 4292 type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1), 4293 tcgv_vec_arg(v1), tcgv_vec_arg(sh)); 4294 return; 4295 } 4296 4297 t = tcg_temp_new_vec(type); 4298 tcg_gen_dupi_vec(vece, t, 8 << vece); 4299 tcg_gen_sub_vec(vece, t, t, sh); 4300 if (right) { 4301 tcg_gen_shlv_vec(vece, t, v1, t); 4302 tcg_gen_shrv_vec(vece, v0, v1, sh); 4303 } else { 4304 tcg_gen_shrv_vec(vece, t, v1, t); 4305 tcg_gen_shlv_vec(vece, v0, v1, sh); 4306 } 4307 tcg_gen_or_vec(vece, v0, v0, t); 4308 tcg_temp_free_vec(t); 4309} 4310 4311static void expand_vec_rotls(TCGType type, unsigned vece, 4312 TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh) 4313{ 4314 TCGv_vec t = tcg_temp_new_vec(type); 4315 4316 tcg_debug_assert(vece != MO_8); 4317 4318 if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) { 4319 tcg_gen_dup_i32_vec(vece, t, lsh); 4320 if (vece >= MO_32) { 4321 tcg_gen_rotlv_vec(vece, v0, v1, t); 4322 } else { 4323 expand_vec_rotv(type, vece, v0, v1, t, false); 4324 } 4325 } else { 4326 TCGv_i32 rsh = tcg_temp_new_i32(); 4327 4328 tcg_gen_neg_i32(rsh, lsh); 4329 tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1); 4330 tcg_gen_shls_vec(vece, t, v1, lsh); 4331 tcg_gen_shrs_vec(vece, v0, v1, rsh); 4332 tcg_gen_or_vec(vece, v0, v0, t); 4333 4334 tcg_temp_free_i32(rsh); 4335 } 4336 4337 tcg_temp_free_vec(t); 4338} 4339 4340static void expand_vec_mul(TCGType type, unsigned vece, 4341 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2) 4342{ 4343 TCGv_vec t1, t2, t3, t4, zero; 4344 4345 tcg_debug_assert(vece == MO_8); 4346 4347 /* 4348 * Unpack v1 bytes to words, 0 | x. 4349 * Unpack v2 bytes to words, y | 0. 4350 * This leaves the 8-bit result, x * y, with 8 bits of right padding. 4351 * Shift logical right by 8 bits to clear the high 8 bytes before 4352 * using an unsigned saturated pack. 4353 * 4354 * The difference between the V64, V128 and V256 cases is merely how 4355 * we distribute the expansion between temporaries. 
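     *
     * Tracing one 16-bit lane (example values only): the unpacks produce
     * 0x00xx and 0xyy00, so the 16-bit product is (x * y) << 8 truncated to
     * 16 bits; the logical shift right by 8 leaves 0x00zz with zz the low
     * byte of x * y, and the saturating pack then stores zz unchanged since
     * it already fits in a byte.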
 */
    switch (type) {
    case TCG_TYPE_V64:
        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case TCG_TYPE_V128:
    case TCG_TYPE_V256:
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        t3 = tcg_temp_new_vec(type);
        t4 = tcg_temp_new_vec(type);
        zero = tcg_constant_vec(type, MO_8, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_mul_vec(MO_16, t3, t3, t4);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        tcg_gen_shri_vec(MO_16, t3, t3, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        tcg_temp_free_vec(t3);
        tcg_temp_free_vec(t4);
        break;

    default:
        g_assert_not_reached();
    }
}

static TCGCond expand_vec_cond(TCGType type, unsigned vece,
                               TCGArg *a1, TCGArg *a2, TCGCond cond)
{
    /*
     * Without AVX512, there are no 64-bit unsigned comparisons.
     * We must bias the inputs so that they become signed.
     * All other swapping and inversion are handled during code generation.
     */
    if (vece == MO_64 && !have_avx512dq && is_unsigned_cond(cond)) {
        TCGv_vec v1 = temp_tcgv_vec(arg_temp(*a1));
        TCGv_vec v2 = temp_tcgv_vec(arg_temp(*a2));
        TCGv_vec t1 = tcg_temp_new_vec(type);
        TCGv_vec t2 = tcg_temp_new_vec(type);
        TCGv_vec t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));

        tcg_gen_sub_vec(vece, t1, v1, t3);
        tcg_gen_sub_vec(vece, t2, v2, t3);
        *a1 = tcgv_vec_arg(t1);
        *a2 = tcgv_vec_arg(t2);
        cond = tcg_signed_cond(cond);
    }
    return cond;
}

static void expand_vec_cmp(TCGType type, unsigned vece, TCGArg a0,
                           TCGArg a1, TCGArg a2, TCGCond cond)
{
    cond = expand_vec_cond(type, vece, &a1, &a2, cond);
    /* Expand directly; do not recurse. */
    vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
}

static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGArg a0,
                              TCGArg a1, TCGArg a2,
                              TCGArg a3, TCGArg a4, TCGCond cond)
{
    cond = expand_vec_cond(type, vece, &a1, &a2, cond);
    /* Expand directly; do not recurse. */
    vec_gen_6(INDEX_op_cmpsel_vec, type, vece, a0, a1, a2, a3, a4, cond);
}

void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                       TCGArg a0, ...)
4449{ 4450 va_list va; 4451 TCGArg a1, a2, a3, a4, a5; 4452 TCGv_vec v0, v1, v2; 4453 4454 va_start(va, a0); 4455 a1 = va_arg(va, TCGArg); 4456 a2 = va_arg(va, TCGArg); 4457 v0 = temp_tcgv_vec(arg_temp(a0)); 4458 v1 = temp_tcgv_vec(arg_temp(a1)); 4459 4460 switch (opc) { 4461 case INDEX_op_shli_vec: 4462 expand_vec_shi(type, vece, false, v0, v1, a2); 4463 break; 4464 case INDEX_op_shri_vec: 4465 expand_vec_shi(type, vece, true, v0, v1, a2); 4466 break; 4467 case INDEX_op_sari_vec: 4468 expand_vec_sari(type, vece, v0, v1, a2); 4469 break; 4470 4471 case INDEX_op_rotli_vec: 4472 expand_vec_rotli(type, vece, v0, v1, a2); 4473 break; 4474 4475 case INDEX_op_rotls_vec: 4476 expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2))); 4477 break; 4478 4479 case INDEX_op_rotlv_vec: 4480 v2 = temp_tcgv_vec(arg_temp(a2)); 4481 expand_vec_rotv(type, vece, v0, v1, v2, false); 4482 break; 4483 case INDEX_op_rotrv_vec: 4484 v2 = temp_tcgv_vec(arg_temp(a2)); 4485 expand_vec_rotv(type, vece, v0, v1, v2, true); 4486 break; 4487 4488 case INDEX_op_mul_vec: 4489 v2 = temp_tcgv_vec(arg_temp(a2)); 4490 expand_vec_mul(type, vece, v0, v1, v2); 4491 break; 4492 4493 case INDEX_op_cmp_vec: 4494 a3 = va_arg(va, TCGArg); 4495 expand_vec_cmp(type, vece, a0, a1, a2, a3); 4496 break; 4497 4498 case INDEX_op_cmpsel_vec: 4499 a3 = va_arg(va, TCGArg); 4500 a4 = va_arg(va, TCGArg); 4501 a5 = va_arg(va, TCGArg); 4502 expand_vec_cmpsel(type, vece, a0, a1, a2, a3, a4, a5); 4503 break; 4504 4505 default: 4506 break; 4507 } 4508 4509 va_end(va); 4510} 4511 4512static const int tcg_target_callee_save_regs[] = { 4513#if TCG_TARGET_REG_BITS == 64 4514 TCG_REG_RBP, 4515 TCG_REG_RBX, 4516#if defined(_WIN64) 4517 TCG_REG_RDI, 4518 TCG_REG_RSI, 4519#endif 4520 TCG_REG_R12, 4521 TCG_REG_R13, 4522 TCG_REG_R14, /* Currently used for the global env. */ 4523 TCG_REG_R15, 4524#else 4525 TCG_REG_EBP, /* Currently used for the global env. */ 4526 TCG_REG_EBX, 4527 TCG_REG_ESI, 4528 TCG_REG_EDI, 4529#endif 4530}; 4531 4532/* Compute frame size via macros, to share between tcg_target_qemu_prologue 4533 and tcg_register_jit. */ 4534 4535#define PUSH_SIZE \ 4536 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \ 4537 * (TCG_TARGET_REG_BITS / 8)) 4538 4539#define FRAME_SIZE \ 4540 ((PUSH_SIZE \ 4541 + TCG_STATIC_CALL_ARGS_SIZE \ 4542 + CPU_TEMP_BUF_NLONGS * sizeof(long) \ 4543 + TCG_TARGET_STACK_ALIGN - 1) \ 4544 & ~(TCG_TARGET_STACK_ALIGN - 1)) 4545 4546/* Generate global QEMU prologue and epilogue code */ 4547static void tcg_target_qemu_prologue(TCGContext *s) 4548{ 4549 int i, stack_addend; 4550 4551 /* TB prologue */ 4552 4553 /* Reserve some stack space, also for TCG temps. */ 4554 stack_addend = FRAME_SIZE - PUSH_SIZE; 4555 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE, 4556 CPU_TEMP_BUF_NLONGS * sizeof(long)); 4557 4558 /* Save all callee saved registers. */ 4559 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) { 4560 tcg_out_push(s, tcg_target_callee_save_regs[i]); 4561 } 4562 4563 if (!tcg_use_softmmu && guest_base) { 4564 int seg = setup_guest_base_seg(); 4565 if (seg != 0) { 4566 x86_guest_base.seg = seg; 4567 } else if (guest_base == (int32_t)guest_base) { 4568 x86_guest_base.ofs = guest_base; 4569 } else { 4570 assert(TCG_TARGET_REG_BITS == 64); 4571 /* Choose R12 because, as a base, it requires a SIB byte. 
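             * Like RSP, R12 always needs a SIB byte when encoded as a base
             * register, which makes it the least useful general register to
             * keep free; reserving it for the guest_base index is therefore
             * the cheapest choice.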
*/ 4572 x86_guest_base.index = TCG_REG_R12; 4573 tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base); 4574 tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index); 4575 } 4576 } 4577 4578 if (TCG_TARGET_REG_BITS == 32) { 4579 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, 4580 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4); 4581 tcg_out_addi(s, TCG_REG_ESP, -stack_addend); 4582 /* jmp *tb. */ 4583 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP, 4584 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4 4585 + stack_addend); 4586 } else { 4587 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]); 4588 tcg_out_addi(s, TCG_REG_ESP, -stack_addend); 4589 /* jmp *tb. */ 4590 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]); 4591 } 4592 4593 /* 4594 * Return path for goto_ptr. Set return value to 0, a-la exit_tb, 4595 * and fall through to the rest of the epilogue. 4596 */ 4597 tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr); 4598 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0); 4599 4600 /* TB epilogue */ 4601 tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr); 4602 4603 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend); 4604 4605 if (have_avx2) { 4606 tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0); 4607 } 4608 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) { 4609 tcg_out_pop(s, tcg_target_callee_save_regs[i]); 4610 } 4611 tcg_out_opc(s, OPC_RET, 0, 0, 0); 4612} 4613 4614static void tcg_out_tb_start(TCGContext *s) 4615{ 4616 /* nothing to do */ 4617} 4618 4619static void tcg_out_nop_fill(tcg_insn_unit *p, int count) 4620{ 4621 memset(p, 0x90, count); 4622} 4623 4624static void tcg_target_init(TCGContext *s) 4625{ 4626 tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS; 4627 if (TCG_TARGET_REG_BITS == 64) { 4628 tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS; 4629 } 4630 if (have_avx1) { 4631 tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS; 4632 tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS; 4633 } 4634 if (have_avx2) { 4635 tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS; 4636 } 4637 4638 tcg_target_call_clobber_regs = ALL_VECTOR_REGS; 4639 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX); 4640 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX); 4641 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX); 4642 if (TCG_TARGET_REG_BITS == 64) { 4643#if !defined(_WIN64) 4644 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI); 4645 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI); 4646#endif 4647 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8); 4648 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9); 4649 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10); 4650 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11); 4651 } 4652 4653 s->reserved_regs = 0; 4654 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK); 4655 tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC); 4656#ifdef _WIN64 4657 /* These are call saved, and we don't save them, so don't use them. 
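     * The Win64 calling convention treats xmm6-xmm15 as callee-saved, and
     * the prologue above does not spill them, so reserving them keeps the
     * register allocator from ever touching them.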
*/ 4658 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6); 4659 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7); 4660 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8); 4661 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9); 4662 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10); 4663 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11); 4664 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12); 4665 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13); 4666 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14); 4667 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15); 4668#endif 4669} 4670 4671typedef struct { 4672 DebugFrameHeader h; 4673 uint8_t fde_def_cfa[4]; 4674 uint8_t fde_reg_ofs[14]; 4675} DebugFrame; 4676 4677/* We're expecting a 2 byte uleb128 encoded value. */ 4678QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14)); 4679 4680#if !defined(__ELF__) 4681 /* Host machine without ELF. */ 4682#elif TCG_TARGET_REG_BITS == 64 4683#define ELF_HOST_MACHINE EM_X86_64 4684static const DebugFrame debug_frame = { 4685 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */ 4686 .h.cie.id = -1, 4687 .h.cie.version = 1, 4688 .h.cie.code_align = 1, 4689 .h.cie.data_align = 0x78, /* sleb128 -8 */ 4690 .h.cie.return_column = 16, 4691 4692 /* Total FDE size does not include the "len" member. */ 4693 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset), 4694 4695 .fde_def_cfa = { 4696 12, 7, /* DW_CFA_def_cfa %rsp, ... */ 4697 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */ 4698 (FRAME_SIZE >> 7) 4699 }, 4700 .fde_reg_ofs = { 4701 0x90, 1, /* DW_CFA_offset, %rip, -8 */ 4702 /* The following ordering must match tcg_target_callee_save_regs. */ 4703 0x86, 2, /* DW_CFA_offset, %rbp, -16 */ 4704 0x83, 3, /* DW_CFA_offset, %rbx, -24 */ 4705 0x8c, 4, /* DW_CFA_offset, %r12, -32 */ 4706 0x8d, 5, /* DW_CFA_offset, %r13, -40 */ 4707 0x8e, 6, /* DW_CFA_offset, %r14, -48 */ 4708 0x8f, 7, /* DW_CFA_offset, %r15, -56 */ 4709 } 4710}; 4711#else 4712#define ELF_HOST_MACHINE EM_386 4713static const DebugFrame debug_frame = { 4714 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */ 4715 .h.cie.id = -1, 4716 .h.cie.version = 1, 4717 .h.cie.code_align = 1, 4718 .h.cie.data_align = 0x7c, /* sleb128 -4 */ 4719 .h.cie.return_column = 8, 4720 4721 /* Total FDE size does not include the "len" member. */ 4722 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset), 4723 4724 .fde_def_cfa = { 4725 12, 4, /* DW_CFA_def_cfa %esp, ... */ 4726 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */ 4727 (FRAME_SIZE >> 7) 4728 }, 4729 .fde_reg_ofs = { 4730 0x88, 1, /* DW_CFA_offset, %eip, -4 */ 4731 /* The following ordering must match tcg_target_callee_save_regs. */ 4732 0x85, 2, /* DW_CFA_offset, %ebp, -8 */ 4733 0x83, 3, /* DW_CFA_offset, %ebx, -12 */ 4734 0x86, 4, /* DW_CFA_offset, %esi, -16 */ 4735 0x87, 5, /* DW_CFA_offset, %edi, -20 */ 4736 } 4737}; 4738#endif 4739 4740#if defined(ELF_HOST_MACHINE) 4741void tcg_register_jit(const void *buf, size_t buf_size) 4742{ 4743 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame)); 4744} 4745#endif 4746
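
/*
 * Note on the .fde_def_cfa encodings above: the CFA offset is emitted as a
 * two-byte uleb128, low seven bits first with the continuation bit set,
 * which is why QEMU_BUILD_BUG_ON() insists that FRAME_SIZE < (1 << 14).
 * As a purely illustrative example, a FRAME_SIZE of 0x208 would be encoded
 * as the bytes 0x88, 0x04 (0x08 plus the continuation bit, then 0x208 >> 7).
 */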