/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Used for function call generation. */
#define TCG_TARGET_STACK_ALIGN 16
#if defined(_WIN64)
#define TCG_TARGET_CALL_STACK_OFFSET 32
#else
#define TCG_TARGET_CALL_STACK_OFFSET 0
#endif
#define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_NORMAL
#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
#if defined(_WIN64)
# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_BY_REF
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_BY_VEC
#elif TCG_TARGET_REG_BITS == 64
# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_NORMAL
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_NORMAL
#else
# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_NORMAL
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_BY_REF
#endif

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them. Therefore only allow xmm0-xmm5 to be allocated.
*/ 96 TCG_REG_XMM6, 97 TCG_REG_XMM7, 98#if TCG_TARGET_REG_BITS == 64 99 TCG_REG_XMM8, 100 TCG_REG_XMM9, 101 TCG_REG_XMM10, 102 TCG_REG_XMM11, 103 TCG_REG_XMM12, 104 TCG_REG_XMM13, 105 TCG_REG_XMM14, 106 TCG_REG_XMM15, 107#endif 108#endif 109}; 110 111#define TCG_TMP_VEC TCG_REG_XMM5 112 113static const int tcg_target_call_iarg_regs[] = { 114#if TCG_TARGET_REG_BITS == 64 115#if defined(_WIN64) 116 TCG_REG_RCX, 117 TCG_REG_RDX, 118#else 119 TCG_REG_RDI, 120 TCG_REG_RSI, 121 TCG_REG_RDX, 122 TCG_REG_RCX, 123#endif 124 TCG_REG_R8, 125 TCG_REG_R9, 126#else 127 /* 32 bit mode uses stack based calling convention (GCC default). */ 128#endif 129}; 130 131static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot) 132{ 133 switch (kind) { 134 case TCG_CALL_RET_NORMAL: 135 tcg_debug_assert(slot >= 0 && slot <= 1); 136 return slot ? TCG_REG_EDX : TCG_REG_EAX; 137#ifdef _WIN64 138 case TCG_CALL_RET_BY_VEC: 139 tcg_debug_assert(slot == 0); 140 return TCG_REG_XMM0; 141#endif 142 default: 143 g_assert_not_reached(); 144 } 145} 146 147/* Constants we accept. */ 148#define TCG_CT_CONST_S32 0x100 149#define TCG_CT_CONST_U32 0x200 150#define TCG_CT_CONST_I32 0x400 151#define TCG_CT_CONST_WSZ 0x800 152#define TCG_CT_CONST_TST 0x1000 153#define TCG_CT_CONST_ZERO 0x2000 154 155/* Registers used with L constraint, which are the first argument 156 registers on x86_64, and two random call clobbered registers on 157 i386. */ 158#if TCG_TARGET_REG_BITS == 64 159# define TCG_REG_L0 tcg_target_call_iarg_regs[0] 160# define TCG_REG_L1 tcg_target_call_iarg_regs[1] 161#else 162# define TCG_REG_L0 TCG_REG_EAX 163# define TCG_REG_L1 TCG_REG_EDX 164#endif 165 166#if TCG_TARGET_REG_BITS == 64 167# define ALL_GENERAL_REGS 0x0000ffffu 168# define ALL_VECTOR_REGS 0xffff0000u 169# define ALL_BYTEL_REGS ALL_GENERAL_REGS 170#else 171# define ALL_GENERAL_REGS 0x000000ffu 172# define ALL_VECTOR_REGS 0x00ff0000u 173# define ALL_BYTEL_REGS 0x0000000fu 174#endif 175#define SOFTMMU_RESERVE_REGS \ 176 (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0) 177 178#define have_bmi2 (cpuinfo & CPUINFO_BMI2) 179#define have_lzcnt (cpuinfo & CPUINFO_LZCNT) 180 181static const tcg_insn_unit *tb_ret_addr; 182 183static bool patch_reloc(tcg_insn_unit *code_ptr, int type, 184 intptr_t value, intptr_t addend) 185{ 186 value += addend; 187 switch(type) { 188 case R_386_PC32: 189 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 190 if (value != (int32_t)value) { 191 return false; 192 } 193 /* FALLTHRU */ 194 case R_386_32: 195 tcg_patch32(code_ptr, value); 196 break; 197 case R_386_PC8: 198 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 199 if (value != (int8_t)value) { 200 return false; 201 } 202 tcg_patch8(code_ptr, value); 203 break; 204 default: 205 g_assert_not_reached(); 206 } 207 return true; 208} 209 210/* test if a constant matches the constraint */ 211static bool tcg_target_const_match(int64_t val, int ct, 212 TCGType type, TCGCond cond, int vece) 213{ 214 if (ct & TCG_CT_CONST) { 215 return 1; 216 } 217 if (type == TCG_TYPE_I32) { 218 if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | 219 TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) { 220 return 1; 221 } 222 } else { 223 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) { 224 return 1; 225 } 226 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) { 227 return 1; 228 } 229 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) { 230 return 1; 231 } 232 /* 233 * This will be used in combination with TCG_CT_CONST_S32, 234 * so "normal" TESTQ is already matched. 
Also accept: 235 * TESTQ -> TESTL (uint32_t) 236 * TESTQ -> BT (is_power_of_2) 237 */ 238 if ((ct & TCG_CT_CONST_TST) 239 && is_tst_cond(cond) 240 && (val == (uint32_t)val || is_power_of_2(val))) { 241 return 1; 242 } 243 } 244 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) { 245 return 1; 246 } 247 if ((ct & TCG_CT_CONST_ZERO) && val == 0) { 248 return 1; 249 } 250 return 0; 251} 252 253# define LOWREGMASK(x) ((x) & 7) 254 255#define P_EXT 0x100 /* 0x0f opcode prefix */ 256#define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */ 257#define P_DATA16 0x400 /* 0x66 opcode prefix */ 258#define P_VEXW 0x1000 /* Set VEX.W = 1 */ 259#if TCG_TARGET_REG_BITS == 64 260# define P_REXW P_VEXW /* Set REX.W = 1; match VEXW */ 261# define P_REXB_R 0x2000 /* REG field as byte register */ 262# define P_REXB_RM 0x4000 /* R/M field as byte register */ 263# define P_GS 0x8000 /* gs segment override */ 264#else 265# define P_REXW 0 266# define P_REXB_R 0 267# define P_REXB_RM 0 268# define P_GS 0 269#endif 270#define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */ 271#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */ 272#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */ 273#define P_VEXL 0x80000 /* Set VEX.L = 1 */ 274#define P_EVEX 0x100000 /* Requires EVEX encoding */ 275 276#define OPC_ARITH_EbIb (0x80) 277#define OPC_ARITH_EvIz (0x81) 278#define OPC_ARITH_EvIb (0x83) 279#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */ 280#define OPC_ANDN (0xf2 | P_EXT38) 281#define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3)) 282#define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3)) 283#define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16) 284#define OPC_BSF (0xbc | P_EXT) 285#define OPC_BSR (0xbd | P_EXT) 286#define OPC_BSWAP (0xc8 | P_EXT) 287#define OPC_CALL_Jz (0xe8) 288#define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */ 289#define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3)) 290#define OPC_DEC_r32 (0x48) 291#define OPC_IMUL_GvEv (0xaf | P_EXT) 292#define OPC_IMUL_GvEvIb (0x6b) 293#define OPC_IMUL_GvEvIz (0x69) 294#define OPC_INC_r32 (0x40) 295#define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */ 296#define OPC_JCC_short (0x70) /* ... 
plus condition code */ 297#define OPC_JMP_long (0xe9) 298#define OPC_JMP_short (0xeb) 299#define OPC_LEA (0x8d) 300#define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3) 301#define OPC_MOVB_EvGv (0x88) /* stores, more or less */ 302#define OPC_MOVL_EvGv (0x89) /* stores, more or less */ 303#define OPC_MOVL_GvEv (0x8b) /* loads, more or less */ 304#define OPC_MOVB_EvIz (0xc6) 305#define OPC_MOVL_EvIz (0xc7) 306#define OPC_MOVB_Ib (0xb0) 307#define OPC_MOVL_Iv (0xb8) 308#define OPC_MOVBE_GyMy (0xf0 | P_EXT38) 309#define OPC_MOVBE_MyGy (0xf1 | P_EXT38) 310#define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16) 311#define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16) 312#define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2) 313#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16) 314#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16) 315#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3) 316#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3) 317#define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3) 318#define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16) 319#define OPC_MOVSBL (0xbe | P_EXT) 320#define OPC_MOVSWL (0xbf | P_EXT) 321#define OPC_MOVSLQ (0x63 | P_REXW) 322#define OPC_MOVZBL (0xb6 | P_EXT) 323#define OPC_MOVZWL (0xb7 | P_EXT) 324#define OPC_PABSB (0x1c | P_EXT38 | P_DATA16) 325#define OPC_PABSW (0x1d | P_EXT38 | P_DATA16) 326#define OPC_PABSD (0x1e | P_EXT38 | P_DATA16) 327#define OPC_VPABSQ (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 328#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16) 329#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16) 330#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16) 331#define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16) 332#define OPC_PADDB (0xfc | P_EXT | P_DATA16) 333#define OPC_PADDW (0xfd | P_EXT | P_DATA16) 334#define OPC_PADDD (0xfe | P_EXT | P_DATA16) 335#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16) 336#define OPC_PADDSB (0xec | P_EXT | P_DATA16) 337#define OPC_PADDSW (0xed | P_EXT | P_DATA16) 338#define OPC_PADDUB (0xdc | P_EXT | P_DATA16) 339#define OPC_PADDUW (0xdd | P_EXT | P_DATA16) 340#define OPC_PAND (0xdb | P_EXT | P_DATA16) 341#define OPC_PANDN (0xdf | P_EXT | P_DATA16) 342#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16) 343#define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16) 344#define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16) 345#define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16) 346#define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16) 347#define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16) 348#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16) 349#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16) 350#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16) 351#define OPC_PEXTRD (0x16 | P_EXT3A | P_DATA16) 352#define OPC_PINSRD (0x22 | P_EXT3A | P_DATA16) 353#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16) 354#define OPC_PMAXSW (0xee | P_EXT | P_DATA16) 355#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16) 356#define OPC_VPMAXSQ (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 357#define OPC_PMAXUB (0xde | P_EXT | P_DATA16) 358#define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16) 359#define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16) 360#define OPC_VPMAXUQ (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 361#define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16) 362#define OPC_PMINSW (0xea | P_EXT | P_DATA16) 363#define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16) 364#define OPC_VPMINSQ (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 365#define OPC_PMINUB (0xda | P_EXT | P_DATA16) 366#define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16) 367#define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16) 368#define OPC_VPMINUQ (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 369#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16) 
370#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16) 371#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16) 372#define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16) 373#define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16) 374#define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16) 375#define OPC_PMULLW (0xd5 | P_EXT | P_DATA16) 376#define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16) 377#define OPC_VPMULLQ (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 378#define OPC_POR (0xeb | P_EXT | P_DATA16) 379#define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16) 380#define OPC_PSHUFD (0x70 | P_EXT | P_DATA16) 381#define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2) 382#define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3) 383#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */ 384#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */ 385#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */ 386#define OPC_PSLLW (0xf1 | P_EXT | P_DATA16) 387#define OPC_PSLLD (0xf2 | P_EXT | P_DATA16) 388#define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16) 389#define OPC_PSRAW (0xe1 | P_EXT | P_DATA16) 390#define OPC_PSRAD (0xe2 | P_EXT | P_DATA16) 391#define OPC_VPSRAQ (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX) 392#define OPC_PSRLW (0xd1 | P_EXT | P_DATA16) 393#define OPC_PSRLD (0xd2 | P_EXT | P_DATA16) 394#define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16) 395#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16) 396#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16) 397#define OPC_PSUBD (0xfa | P_EXT | P_DATA16) 398#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16) 399#define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16) 400#define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16) 401#define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16) 402#define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16) 403#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16) 404#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16) 405#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16) 406#define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16) 407#define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16) 408#define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16) 409#define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16) 410#define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16) 411#define OPC_PXOR (0xef | P_EXT | P_DATA16) 412#define OPC_POP_r32 (0x58) 413#define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3) 414#define OPC_PUSH_r32 (0x50) 415#define OPC_PUSH_Iv (0x68) 416#define OPC_PUSH_Ib (0x6a) 417#define OPC_RET (0xc3) 418#define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... 
plus cc */ 419#define OPC_SHIFT_1 (0xd1) 420#define OPC_SHIFT_Ib (0xc1) 421#define OPC_SHIFT_cl (0xd3) 422#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3) 423#define OPC_SHUFPS (0xc6 | P_EXT) 424#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16) 425#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2) 426#define OPC_SHRD_Ib (0xac | P_EXT) 427#define OPC_TESTB (0x84) 428#define OPC_TESTL (0x85) 429#define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3) 430#define OPC_UD2 (0x0b | P_EXT) 431#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16) 432#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16) 433#define OPC_VPBLENDMB (0x66 | P_EXT38 | P_DATA16 | P_EVEX) 434#define OPC_VPBLENDMW (0x66 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 435#define OPC_VPBLENDMD (0x64 | P_EXT38 | P_DATA16 | P_EVEX) 436#define OPC_VPBLENDMQ (0x64 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 437#define OPC_VPCMPB (0x3f | P_EXT3A | P_DATA16 | P_EVEX) 438#define OPC_VPCMPUB (0x3e | P_EXT3A | P_DATA16 | P_EVEX) 439#define OPC_VPCMPW (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 440#define OPC_VPCMPUW (0x3e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 441#define OPC_VPCMPD (0x1f | P_EXT3A | P_DATA16 | P_EVEX) 442#define OPC_VPCMPUD (0x1e | P_EXT3A | P_DATA16 | P_EVEX) 443#define OPC_VPCMPQ (0x1f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 444#define OPC_VPCMPUQ (0x1e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 445#define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16) 446#define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16) 447#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16) 448#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16) 449#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16) 450#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16) 451#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16) 452#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16) 453#define OPC_VPMOVM2B (0x28 | P_EXT38 | P_SIMDF3 | P_EVEX) 454#define OPC_VPMOVM2W (0x28 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 455#define OPC_VPMOVM2D (0x38 | P_EXT38 | P_SIMDF3 | P_EVEX) 456#define OPC_VPMOVM2Q (0x38 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 457#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW) 458#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL) 459#define OPC_VPROLVD (0x15 | P_EXT38 | P_DATA16 | P_EVEX) 460#define OPC_VPROLVQ (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 461#define OPC_VPRORVD (0x14 | P_EXT38 | P_DATA16 | P_EVEX) 462#define OPC_VPRORVQ (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 463#define OPC_VPSHLDW (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 464#define OPC_VPSHLDD (0x71 | P_EXT3A | P_DATA16 | P_EVEX) 465#define OPC_VPSHLDQ (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 466#define OPC_VPSHLDVW (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 467#define OPC_VPSHLDVD (0x71 | P_EXT38 | P_DATA16 | P_EVEX) 468#define OPC_VPSHLDVQ (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 469#define OPC_VPSHRDVW (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 470#define OPC_VPSHRDVD (0x73 | P_EXT38 | P_DATA16 | P_EVEX) 471#define OPC_VPSHRDVQ (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 472#define OPC_VPSLLVW (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 473#define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16) 474#define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW) 475#define OPC_VPSRAVW (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 476#define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16) 477#define OPC_VPSRAVQ (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 478#define OPC_VPSRLVW (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 479#define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16) 480#define OPC_VPSRLVQ (0x45 | P_EXT38 | 
P_DATA16 | P_VEXW) 481#define OPC_VPTERNLOGQ (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 482#define OPC_VPTESTMB (0x26 | P_EXT38 | P_DATA16 | P_EVEX) 483#define OPC_VPTESTMW (0x26 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 484#define OPC_VPTESTMD (0x27 | P_EXT38 | P_DATA16 | P_EVEX) 485#define OPC_VPTESTMQ (0x27 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 486#define OPC_VPTESTNMB (0x26 | P_EXT38 | P_SIMDF3 | P_EVEX) 487#define OPC_VPTESTNMW (0x26 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 488#define OPC_VPTESTNMD (0x27 | P_EXT38 | P_SIMDF3 | P_EVEX) 489#define OPC_VPTESTNMQ (0x27 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 490#define OPC_VZEROUPPER (0x77 | P_EXT) 491#define OPC_XCHG_ax_r32 (0x90) 492#define OPC_XCHG_EvGv (0x87) 493 494#define OPC_GRP3_Eb (0xf6) 495#define OPC_GRP3_Ev (0xf7) 496#define OPC_GRP5 (0xff) 497#define OPC_GRP14 (0x73 | P_EXT | P_DATA16) 498#define OPC_GRPBT (0xba | P_EXT) 499 500#define OPC_GRPBT_BT 4 501#define OPC_GRPBT_BTS 5 502#define OPC_GRPBT_BTR 6 503#define OPC_GRPBT_BTC 7 504 505/* Group 1 opcode extensions for 0x80-0x83. 506 These are also used as modifiers for OPC_ARITH. */ 507#define ARITH_ADD 0 508#define ARITH_OR 1 509#define ARITH_ADC 2 510#define ARITH_SBB 3 511#define ARITH_AND 4 512#define ARITH_SUB 5 513#define ARITH_XOR 6 514#define ARITH_CMP 7 515 516/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */ 517#define SHIFT_ROL 0 518#define SHIFT_ROR 1 519#define SHIFT_SHL 4 520#define SHIFT_SHR 5 521#define SHIFT_SAR 7 522 523/* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */ 524#define EXT3_TESTi 0 525#define EXT3_NOT 2 526#define EXT3_NEG 3 527#define EXT3_MUL 4 528#define EXT3_IMUL 5 529#define EXT3_DIV 6 530#define EXT3_IDIV 7 531 532/* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */ 533#define EXT5_INC_Ev 0 534#define EXT5_DEC_Ev 1 535#define EXT5_CALLN_Ev 2 536#define EXT5_JMPN_Ev 4 537 538/* Condition codes to be added to OPC_JCC_{long,short}. */ 539#define JCC_JMP (-1) 540#define JCC_JO 0x0 541#define JCC_JNO 0x1 542#define JCC_JB 0x2 543#define JCC_JAE 0x3 544#define JCC_JE 0x4 545#define JCC_JNE 0x5 546#define JCC_JBE 0x6 547#define JCC_JA 0x7 548#define JCC_JS 0x8 549#define JCC_JNS 0x9 550#define JCC_JP 0xa 551#define JCC_JNP 0xb 552#define JCC_JL 0xc 553#define JCC_JGE 0xd 554#define JCC_JLE 0xe 555#define JCC_JG 0xf 556 557static const uint8_t tcg_cond_to_jcc[] = { 558 [TCG_COND_EQ] = JCC_JE, 559 [TCG_COND_NE] = JCC_JNE, 560 [TCG_COND_LT] = JCC_JL, 561 [TCG_COND_GE] = JCC_JGE, 562 [TCG_COND_LE] = JCC_JLE, 563 [TCG_COND_GT] = JCC_JG, 564 [TCG_COND_LTU] = JCC_JB, 565 [TCG_COND_GEU] = JCC_JAE, 566 [TCG_COND_LEU] = JCC_JBE, 567 [TCG_COND_GTU] = JCC_JA, 568 [TCG_COND_TSTEQ] = JCC_JE, 569 [TCG_COND_TSTNE] = JCC_JNE, 570}; 571 572#if TCG_TARGET_REG_BITS == 64 573static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x) 574{ 575 int rex; 576 577 if (opc & P_GS) { 578 tcg_out8(s, 0x65); 579 } 580 if (opc & P_DATA16) { 581 /* We should never be asking for both 16 and 64-bit operation. */ 582 tcg_debug_assert((opc & P_REXW) == 0); 583 tcg_out8(s, 0x66); 584 } 585 if (opc & P_SIMDF3) { 586 tcg_out8(s, 0xf3); 587 } else if (opc & P_SIMDF2) { 588 tcg_out8(s, 0xf2); 589 } 590 591 rex = 0; 592 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */ 593 rex |= (r & 8) >> 1; /* REX.R */ 594 rex |= (x & 8) >> 2; /* REX.X */ 595 rex |= (rm & 8) >> 3; /* REX.B */ 596 597 /* P_REXB_{R,RM} indicates that the given register is the low byte. 
598 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do, 599 as otherwise the encoding indicates %[abcd]h. Note that the values 600 that are ORed in merely indicate that the REX byte must be present; 601 those bits get discarded in output. */ 602 rex |= opc & (r >= 4 ? P_REXB_R : 0); 603 rex |= opc & (rm >= 4 ? P_REXB_RM : 0); 604 605 if (rex) { 606 tcg_out8(s, (uint8_t)(rex | 0x40)); 607 } 608 609 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 610 tcg_out8(s, 0x0f); 611 if (opc & P_EXT38) { 612 tcg_out8(s, 0x38); 613 } else if (opc & P_EXT3A) { 614 tcg_out8(s, 0x3a); 615 } 616 } 617 618 tcg_out8(s, opc); 619} 620#else 621static void tcg_out_opc(TCGContext *s, int opc) 622{ 623 if (opc & P_DATA16) { 624 tcg_out8(s, 0x66); 625 } 626 if (opc & P_SIMDF3) { 627 tcg_out8(s, 0xf3); 628 } else if (opc & P_SIMDF2) { 629 tcg_out8(s, 0xf2); 630 } 631 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 632 tcg_out8(s, 0x0f); 633 if (opc & P_EXT38) { 634 tcg_out8(s, 0x38); 635 } else if (opc & P_EXT3A) { 636 tcg_out8(s, 0x3a); 637 } 638 } 639 tcg_out8(s, opc); 640} 641/* Discard the register arguments to tcg_out_opc early, so as not to penalize 642 the 32-bit compilation paths. This method works with all versions of gcc, 643 whereas relying on optimization may not be able to exclude them. */ 644#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc) 645#endif 646 647static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) 648{ 649 tcg_out_opc(s, opc, r, rm, 0); 650 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 651} 652 653static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v, 654 int rm, int index) 655{ 656 int tmp; 657 658 if (opc & P_GS) { 659 tcg_out8(s, 0x65); 660 } 661 /* Use the two byte form if possible, which cannot encode 662 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */ 663 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT 664 && ((rm | index) & 8) == 0) { 665 /* Two byte VEX prefix. */ 666 tcg_out8(s, 0xc5); 667 668 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ 669 } else { 670 /* Three byte VEX prefix. */ 671 tcg_out8(s, 0xc4); 672 673 /* VEX.m-mmmm */ 674 if (opc & P_EXT3A) { 675 tmp = 3; 676 } else if (opc & P_EXT38) { 677 tmp = 2; 678 } else if (opc & P_EXT) { 679 tmp = 1; 680 } else { 681 g_assert_not_reached(); 682 } 683 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ 684 tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */ 685 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ 686 tcg_out8(s, tmp); 687 688 tmp = (opc & P_VEXW ? 0x80 : 0); /* VEX.W */ 689 } 690 691 tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */ 692 /* VEX.pp */ 693 if (opc & P_DATA16) { 694 tmp |= 1; /* 0x66 */ 695 } else if (opc & P_SIMDF3) { 696 tmp |= 2; /* 0xf3 */ 697 } else if (opc & P_SIMDF2) { 698 tmp |= 3; /* 0xf2 */ 699 } 700 tmp |= (~v & 15) << 3; /* VEX.vvvv */ 701 tcg_out8(s, tmp); 702 tcg_out8(s, opc); 703} 704 705static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v, 706 int rm, int index, int aaa, bool z) 707{ 708 /* The entire 4-byte evex prefix; with R' and V' set. 
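       The preset constant below is stored little-endian and emitted with
       tcg_out32, so byte 0 is the 0x62 escape, byte 1 (0x10) presets the
       inverted R' bit, byte 2 (0x04) carries the fixed '1' bit of the
       second payload byte, and byte 3 (0x08) presets the inverted V' bit;
       all remaining fields are filled in by the deposit32 calls below.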
*/ 709 uint32_t p = 0x08041062; 710 int mm, pp; 711 712 tcg_debug_assert(have_avx512vl); 713 714 /* EVEX.mm */ 715 if (opc & P_EXT3A) { 716 mm = 3; 717 } else if (opc & P_EXT38) { 718 mm = 2; 719 } else if (opc & P_EXT) { 720 mm = 1; 721 } else { 722 g_assert_not_reached(); 723 } 724 725 /* EVEX.pp */ 726 if (opc & P_DATA16) { 727 pp = 1; /* 0x66 */ 728 } else if (opc & P_SIMDF3) { 729 pp = 2; /* 0xf3 */ 730 } else if (opc & P_SIMDF2) { 731 pp = 3; /* 0xf2 */ 732 } else { 733 pp = 0; 734 } 735 736 p = deposit32(p, 8, 2, mm); 737 p = deposit32(p, 13, 1, (rm & 8) == 0); /* EVEX.RXB.B */ 738 p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */ 739 p = deposit32(p, 15, 1, (r & 8) == 0); /* EVEX.RXB.R */ 740 p = deposit32(p, 16, 2, pp); 741 p = deposit32(p, 19, 4, ~v); 742 p = deposit32(p, 23, 1, (opc & P_VEXW) != 0); 743 p = deposit32(p, 24, 3, aaa); 744 p = deposit32(p, 29, 2, (opc & P_VEXL) != 0); 745 p = deposit32(p, 31, 1, z); 746 747 tcg_out32(s, p); 748 tcg_out8(s, opc); 749} 750 751static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) 752{ 753 if (opc & P_EVEX) { 754 tcg_out_evex_opc(s, opc, r, v, rm, 0, 0, false); 755 } else { 756 tcg_out_vex_opc(s, opc, r, v, rm, 0); 757 } 758 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 759} 760 761static void tcg_out_vex_modrm_type(TCGContext *s, int opc, 762 int r, int v, int rm, TCGType type) 763{ 764 if (type == TCG_TYPE_V256) { 765 opc |= P_VEXL; 766 } 767 tcg_out_vex_modrm(s, opc, r, v, rm); 768} 769 770static void tcg_out_evex_modrm_type(TCGContext *s, int opc, int r, int v, 771 int rm, int aaa, bool z, TCGType type) 772{ 773 if (type == TCG_TYPE_V256) { 774 opc |= P_VEXL; 775 } 776 tcg_out_evex_opc(s, opc, r, v, rm, 0, aaa, z); 777 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 778} 779 780/* Output an opcode with a full "rm + (index<<shift) + offset" address mode. 781 We handle either RM and INDEX missing with a negative value. In 64-bit 782 mode for absolute addresses, ~RM is the size of the immediate operand 783 that will follow the instruction. */ 784 785static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index, 786 int shift, intptr_t offset) 787{ 788 int mod, len; 789 790 if (index < 0 && rm < 0) { 791 if (TCG_TARGET_REG_BITS == 64) { 792 /* Try for a rip-relative addressing mode. This has replaced 793 the 32-bit-mode absolute addressing encoding. */ 794 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm; 795 intptr_t disp = offset - pc; 796 if (disp == (int32_t)disp) { 797 tcg_out8(s, (LOWREGMASK(r) << 3) | 5); 798 tcg_out32(s, disp); 799 return; 800 } 801 802 /* Try for an absolute address encoding. This requires the 803 use of the MODRM+SIB encoding and is therefore larger than 804 rip-relative addressing. */ 805 if (offset == (int32_t)offset) { 806 tcg_out8(s, (LOWREGMASK(r) << 3) | 4); 807 tcg_out8(s, (4 << 3) | 5); 808 tcg_out32(s, offset); 809 return; 810 } 811 812 /* ??? The memory isn't directly addressable. */ 813 g_assert_not_reached(); 814 } else { 815 /* Absolute address. */ 816 tcg_out8(s, (r << 3) | 5); 817 tcg_out32(s, offset); 818 return; 819 } 820 } 821 822 /* Find the length of the immediate addend. Note that the encoding 823 that would be used for (%ebp) indicates absolute addressing. 
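       (With mod=00, an rm field of 5 selects the disp32 / rip-relative
       form rather than the base register, so %ebp/%rbp, and %r13 which has
       the same low three bits, must instead be encoded with an explicit
       displacement; that is why EBP is excluded from the len == 0 case
       below.)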
*/ 824 if (rm < 0) { 825 mod = 0, len = 4, rm = 5; 826 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) { 827 mod = 0, len = 0; 828 } else if (offset == (int8_t)offset) { 829 mod = 0x40, len = 1; 830 } else { 831 mod = 0x80, len = 4; 832 } 833 834 /* Use a single byte MODRM format if possible. Note that the encoding 835 that would be used for %esp is the escape to the two byte form. */ 836 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) { 837 /* Single byte MODRM format. */ 838 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 839 } else { 840 /* Two byte MODRM+SIB format. */ 841 842 /* Note that the encoding that would place %esp into the index 843 field indicates no index register. In 64-bit mode, the REX.X 844 bit counts, so %r12 can be used as the index. */ 845 if (index < 0) { 846 index = 4; 847 } else { 848 tcg_debug_assert(index != TCG_REG_ESP); 849 } 850 851 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4); 852 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm)); 853 } 854 855 if (len == 1) { 856 tcg_out8(s, offset); 857 } else if (len == 4) { 858 tcg_out32(s, offset); 859 } 860} 861 862static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, 863 int index, int shift, intptr_t offset) 864{ 865 tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 866 tcg_out_sib_offset(s, r, rm, index, shift, offset); 867} 868 869static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v, 870 int rm, int index, int shift, 871 intptr_t offset) 872{ 873 tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 874 tcg_out_sib_offset(s, r, rm, index, shift, offset); 875} 876 877/* A simplification of the above with no index or shift. */ 878static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, 879 int rm, intptr_t offset) 880{ 881 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset); 882} 883 884static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r, 885 int v, int rm, intptr_t offset) 886{ 887 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset); 888} 889 890/* Output an opcode with an expected reference to the constant pool. */ 891static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r) 892{ 893 tcg_out_opc(s, opc, r, 0, 0); 894 /* Absolute for 32-bit, pc-relative for 64-bit. */ 895 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 896 tcg_out32(s, 0); 897} 898 899/* Output an opcode with an expected reference to the constant pool. */ 900static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r) 901{ 902 tcg_out_vex_opc(s, opc, r, 0, 0, 0); 903 /* Absolute for 32-bit, pc-relative for 64-bit. */ 904 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 905 tcg_out32(s, 0); 906} 907 908/* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */ 909static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) 910{ 911 /* Propagate an opcode prefix, such as P_REXW. 
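       For example, on x86_64, tgen_arithr(s, ARITH_ADD + P_REXW, dest, src)
       resolves to opcode 0x03 (ADD Gv,Ev) with a REX.W prefix, i.e. the
       64-bit register-to-register add "addq %src, %dest".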
*/ 912 int ext = subop & ~0x7; 913 subop &= 0x7; 914 915 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src); 916} 917 918static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) 919{ 920 int rexw = 0; 921 922 if (arg == ret) { 923 return true; 924 } 925 switch (type) { 926 case TCG_TYPE_I64: 927 rexw = P_REXW; 928 /* fallthru */ 929 case TCG_TYPE_I32: 930 if (ret < 16) { 931 if (arg < 16) { 932 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg); 933 } else { 934 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret); 935 } 936 } else { 937 if (arg < 16) { 938 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg); 939 } else { 940 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 941 } 942 } 943 break; 944 945 case TCG_TYPE_V64: 946 tcg_debug_assert(ret >= 16 && arg >= 16); 947 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 948 break; 949 case TCG_TYPE_V128: 950 tcg_debug_assert(ret >= 16 && arg >= 16); 951 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg); 952 break; 953 case TCG_TYPE_V256: 954 tcg_debug_assert(ret >= 16 && arg >= 16); 955 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg); 956 break; 957 958 default: 959 g_assert_not_reached(); 960 } 961 return true; 962} 963 964static const int avx2_dup_insn[4] = { 965 OPC_VPBROADCASTB, OPC_VPBROADCASTW, 966 OPC_VPBROADCASTD, OPC_VPBROADCASTQ, 967}; 968 969static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, 970 TCGReg r, TCGReg a) 971{ 972 if (have_avx2) { 973 tcg_out_vex_modrm_type(s, avx2_dup_insn[vece], r, 0, a, type); 974 } else { 975 switch (vece) { 976 case MO_8: 977 /* ??? With zero in a register, use PSHUFB. */ 978 tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a); 979 a = r; 980 /* FALLTHRU */ 981 case MO_16: 982 tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a); 983 a = r; 984 /* FALLTHRU */ 985 case MO_32: 986 tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a); 987 /* imm8 operand: all output lanes selected from input lane 0. */ 988 tcg_out8(s, 0); 989 break; 990 case MO_64: 991 tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a); 992 break; 993 default: 994 g_assert_not_reached(); 995 } 996 } 997 return true; 998} 999 1000static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece, 1001 TCGReg r, TCGReg base, intptr_t offset) 1002{ 1003 if (have_avx2) { 1004 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0); 1005 tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l, 1006 r, 0, base, offset); 1007 } else { 1008 switch (vece) { 1009 case MO_64: 1010 tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset); 1011 break; 1012 case MO_32: 1013 tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset); 1014 break; 1015 case MO_16: 1016 tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset); 1017 tcg_out8(s, 0); /* imm8 */ 1018 tcg_out_dup_vec(s, type, vece, r, r); 1019 break; 1020 case MO_8: 1021 tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset); 1022 tcg_out8(s, 0); /* imm8 */ 1023 tcg_out_dup_vec(s, type, vece, r, r); 1024 break; 1025 default: 1026 g_assert_not_reached(); 1027 } 1028 } 1029 return true; 1030} 1031 1032static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, 1033 TCGReg ret, int64_t arg) 1034{ 1035 int vex_l = (type == TCG_TYPE_V256 ? 
P_VEXL : 0); 1036 1037 if (arg == 0) { 1038 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 1039 return; 1040 } 1041 if (arg == -1) { 1042 tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret); 1043 return; 1044 } 1045 1046 if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) { 1047 if (have_avx2) { 1048 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret); 1049 } else { 1050 tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret); 1051 } 1052 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 1053 } else { 1054 if (type == TCG_TYPE_V64) { 1055 tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret); 1056 } else if (have_avx2) { 1057 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret); 1058 } else { 1059 tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret); 1060 } 1061 if (TCG_TARGET_REG_BITS == 64) { 1062 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 1063 } else { 1064 new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32); 1065 } 1066 } 1067} 1068 1069static void tcg_out_movi_vec(TCGContext *s, TCGType type, 1070 TCGReg ret, tcg_target_long arg) 1071{ 1072 if (arg == 0) { 1073 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 1074 return; 1075 } 1076 if (arg == -1) { 1077 tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret); 1078 return; 1079 } 1080 1081 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 1082 tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret); 1083 if (TCG_TARGET_REG_BITS == 64) { 1084 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 1085 } else { 1086 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 1087 } 1088} 1089 1090static void tcg_out_movi_int(TCGContext *s, TCGType type, 1091 TCGReg ret, tcg_target_long arg) 1092{ 1093 tcg_target_long diff; 1094 1095 if (arg == 0) { 1096 tgen_arithr(s, ARITH_XOR, ret, ret); 1097 return; 1098 } 1099 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) { 1100 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0); 1101 tcg_out32(s, arg); 1102 return; 1103 } 1104 if (arg == (int32_t)arg) { 1105 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret); 1106 tcg_out32(s, arg); 1107 return; 1108 } 1109 1110 /* Try a 7 byte pc-relative lea before the 10 byte movq. */ 1111 diff = tcg_pcrel_diff(s, (const void *)arg) - 7; 1112 if (diff == (int32_t)diff) { 1113 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0); 1114 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5); 1115 tcg_out32(s, diff); 1116 return; 1117 } 1118 1119 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0); 1120 tcg_out64(s, arg); 1121} 1122 1123static void tcg_out_movi(TCGContext *s, TCGType type, 1124 TCGReg ret, tcg_target_long arg) 1125{ 1126 switch (type) { 1127 case TCG_TYPE_I32: 1128#if TCG_TARGET_REG_BITS == 64 1129 case TCG_TYPE_I64: 1130#endif 1131 if (ret < 16) { 1132 tcg_out_movi_int(s, type, ret, arg); 1133 } else { 1134 tcg_out_movi_vec(s, type, ret, arg); 1135 } 1136 break; 1137 default: 1138 g_assert_not_reached(); 1139 } 1140} 1141 1142static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2) 1143{ 1144 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1145 tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2); 1146 return true; 1147} 1148 1149static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs, 1150 tcg_target_long imm) 1151{ 1152 /* This function is only used for passing structs by reference. 
 */
    tcg_debug_assert(imm == (int32_t)imm);
    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        g_assert_not_reached();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we need only care for
       store-load ordering. Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the sse insn. */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}

static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset. Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset. Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
1264 * 1265 * This specific instance is also used by TCG_CALL_RET_BY_VEC, 1266 * for _WIN64, which must have SSE2 but may not have AVX. 1267 */ 1268 tcg_debug_assert(arg >= 16); 1269 if (have_avx1) { 1270 tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2); 1271 } else { 1272 tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2); 1273 } 1274 break; 1275 case TCG_TYPE_V256: 1276 /* 1277 * The gvec infrastructure only requires 16-byte alignment, 1278 * so here we must use an unaligned store. 1279 */ 1280 tcg_debug_assert(arg >= 16); 1281 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL, 1282 arg, 0, arg1, arg2); 1283 break; 1284 default: 1285 g_assert_not_reached(); 1286 } 1287} 1288 1289static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, 1290 TCGReg base, intptr_t ofs) 1291{ 1292 int rexw = 0; 1293 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) { 1294 if (val != (int32_t)val) { 1295 return false; 1296 } 1297 rexw = P_REXW; 1298 } else if (type != TCG_TYPE_I32) { 1299 return false; 1300 } 1301 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs); 1302 tcg_out32(s, val); 1303 return true; 1304} 1305 1306static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count) 1307{ 1308 /* Propagate an opcode prefix, such as P_DATA16. */ 1309 int ext = subopc & ~0x7; 1310 subopc &= 0x7; 1311 1312 if (count == 1) { 1313 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg); 1314 } else { 1315 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg); 1316 tcg_out8(s, count); 1317 } 1318} 1319 1320static inline void tcg_out_bswap32(TCGContext *s, int reg) 1321{ 1322 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0); 1323} 1324 1325static inline void tcg_out_rolw_8(TCGContext *s, int reg) 1326{ 1327 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8); 1328} 1329 1330static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src) 1331{ 1332 if (TCG_TARGET_REG_BITS == 32 && src >= 4) { 1333 tcg_out_mov(s, TCG_TYPE_I32, dest, src); 1334 if (dest >= 4) { 1335 tcg_out_modrm(s, OPC_ARITH_EvIz, ARITH_AND, dest); 1336 tcg_out32(s, 0xff); 1337 return; 1338 } 1339 src = dest; 1340 } 1341 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src); 1342} 1343 1344static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src) 1345{ 1346 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1347 1348 if (TCG_TARGET_REG_BITS == 32 && src >= 4) { 1349 tcg_out_mov(s, TCG_TYPE_I32, dest, src); 1350 if (dest >= 4) { 1351 tcg_out_shifti(s, SHIFT_SHL, dest, 24); 1352 tcg_out_shifti(s, SHIFT_SAR, dest, 24); 1353 return; 1354 } 1355 src = dest; 1356 } 1357 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src); 1358} 1359 1360static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src) 1361{ 1362 /* movzwl */ 1363 tcg_out_modrm(s, OPC_MOVZWL, dest, src); 1364} 1365 1366static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src) 1367{ 1368 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1369 /* movsw[lq] */ 1370 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src); 1371} 1372 1373static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src) 1374{ 1375 /* 32-bit mov zero extends. 
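       On x86-64, writing a 32-bit general register implicitly clears
       bits 63:32 of the full register, so a plain MOVL suffices as a
       32-to-64-bit zero extension.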
*/ 1376 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src); 1377} 1378 1379static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src) 1380{ 1381 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 1382 tcg_out_modrm(s, OPC_MOVSLQ, dest, src); 1383} 1384 1385static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1386{ 1387 tcg_out_ext32s(s, dest, src); 1388} 1389 1390static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1391{ 1392 if (dest != src) { 1393 tcg_out_ext32u(s, dest, src); 1394 } 1395} 1396 1397static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src) 1398{ 1399 tcg_out_ext32u(s, dest, src); 1400} 1401 1402static inline void tcg_out_bswap64(TCGContext *s, int reg) 1403{ 1404 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0); 1405} 1406 1407static void tgen_arithi(TCGContext *s, int c, int r0, 1408 tcg_target_long val, int cf) 1409{ 1410 int rexw = 0; 1411 1412 if (TCG_TARGET_REG_BITS == 64) { 1413 rexw = c & -8; 1414 c &= 7; 1415 } 1416 1417 switch (c) { 1418 case ARITH_ADD: 1419 case ARITH_SUB: 1420 if (!cf) { 1421 /* 1422 * ??? While INC is 2 bytes shorter than ADDL $1, they also induce 1423 * partial flags update stalls on Pentium4 and are not recommended 1424 * by current Intel optimization manuals. 1425 */ 1426 if (val == 1 || val == -1) { 1427 int is_inc = (c == ARITH_ADD) ^ (val < 0); 1428 if (TCG_TARGET_REG_BITS == 64) { 1429 /* 1430 * The single-byte increment encodings are re-tasked 1431 * as the REX prefixes. Use the MODRM encoding. 1432 */ 1433 tcg_out_modrm(s, OPC_GRP5 + rexw, 1434 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0); 1435 } else { 1436 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0); 1437 } 1438 return; 1439 } 1440 if (val == 128) { 1441 /* 1442 * Facilitate using an 8-bit immediate. Carry is inverted 1443 * by this transformation, so do it only if cf == 0. 1444 */ 1445 c ^= ARITH_ADD ^ ARITH_SUB; 1446 val = -128; 1447 } 1448 } 1449 break; 1450 1451 case ARITH_AND: 1452 if (TCG_TARGET_REG_BITS == 64) { 1453 if (val == 0xffffffffu) { 1454 tcg_out_ext32u(s, r0, r0); 1455 return; 1456 } 1457 if (val == (uint32_t)val) { 1458 /* AND with no high bits set can use a 32-bit operation. */ 1459 rexw = 0; 1460 } 1461 } 1462 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1463 tcg_out_ext8u(s, r0, r0); 1464 return; 1465 } 1466 if (val == 0xffffu) { 1467 tcg_out_ext16u(s, r0, r0); 1468 return; 1469 } 1470 break; 1471 1472 case ARITH_OR: 1473 case ARITH_XOR: 1474 if (val >= 0x80 && val <= 0xff 1475 && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1476 tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0); 1477 tcg_out8(s, val); 1478 return; 1479 } 1480 break; 1481 } 1482 1483 if (val == (int8_t)val) { 1484 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0); 1485 tcg_out8(s, val); 1486 return; 1487 } 1488 if (rexw == 0 || val == (int32_t)val) { 1489 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0); 1490 tcg_out32(s, val); 1491 return; 1492 } 1493 1494 g_assert_not_reached(); 1495} 1496 1497static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val) 1498{ 1499 if (val != 0) { 1500 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0); 1501 } 1502} 1503 1504/* Set SMALL to force a short forward branch. 
*/ 1505static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small) 1506{ 1507 int32_t val, val1; 1508 1509 if (l->has_value) { 1510 val = tcg_pcrel_diff(s, l->u.value_ptr); 1511 val1 = val - 2; 1512 if ((int8_t)val1 == val1) { 1513 if (opc == -1) { 1514 tcg_out8(s, OPC_JMP_short); 1515 } else { 1516 tcg_out8(s, OPC_JCC_short + opc); 1517 } 1518 tcg_out8(s, val1); 1519 } else { 1520 tcg_debug_assert(!small); 1521 if (opc == -1) { 1522 tcg_out8(s, OPC_JMP_long); 1523 tcg_out32(s, val - 5); 1524 } else { 1525 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1526 tcg_out32(s, val - 6); 1527 } 1528 } 1529 } else if (small) { 1530 if (opc == -1) { 1531 tcg_out8(s, OPC_JMP_short); 1532 } else { 1533 tcg_out8(s, OPC_JCC_short + opc); 1534 } 1535 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1); 1536 s->code_ptr += 1; 1537 } else { 1538 if (opc == -1) { 1539 tcg_out8(s, OPC_JMP_long); 1540 } else { 1541 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1542 } 1543 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4); 1544 s->code_ptr += 4; 1545 } 1546} 1547 1548static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1, 1549 TCGArg arg2, int const_arg2, int rexw) 1550{ 1551 int jz, js; 1552 1553 if (!is_tst_cond(cond)) { 1554 if (!const_arg2) { 1555 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2); 1556 } else if (arg2 == 0) { 1557 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1); 1558 } else { 1559 tcg_debug_assert(!rexw || arg2 == (int32_t)arg2); 1560 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0); 1561 } 1562 return tcg_cond_to_jcc[cond]; 1563 } 1564 1565 jz = tcg_cond_to_jcc[cond]; 1566 js = (cond == TCG_COND_TSTNE ? JCC_JS : JCC_JNS); 1567 1568 if (!const_arg2) { 1569 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2); 1570 return jz; 1571 } 1572 1573 if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) { 1574 if (arg2 == 0x80) { 1575 tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1); 1576 return js; 1577 } 1578 if (arg2 == 0xff) { 1579 tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1); 1580 return jz; 1581 } 1582 tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1); 1583 tcg_out8(s, arg2); 1584 return jz; 1585 } 1586 1587 if ((arg2 & ~0xff00) == 0 && arg1 < 4) { 1588 if (arg2 == 0x8000) { 1589 tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4); 1590 return js; 1591 } 1592 if (arg2 == 0xff00) { 1593 tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4); 1594 return jz; 1595 } 1596 tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4); 1597 tcg_out8(s, arg2 >> 8); 1598 return jz; 1599 } 1600 1601 if (arg2 == 0xffff) { 1602 tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1); 1603 return jz; 1604 } 1605 if (arg2 == 0xffffffffu) { 1606 tcg_out_modrm(s, OPC_TESTL, arg1, arg1); 1607 return jz; 1608 } 1609 1610 if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) { 1611 int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE); 1612 int sh = ctz64(arg2); 1613 1614 rexw = (sh & 32 ? 
P_REXW : 0); 1615 if ((sh & 31) == 31) { 1616 tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1); 1617 return js; 1618 } else { 1619 tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1); 1620 tcg_out8(s, sh); 1621 return jc; 1622 } 1623 } 1624 1625 if (rexw) { 1626 if (arg2 == (uint32_t)arg2) { 1627 rexw = 0; 1628 } else { 1629 tcg_debug_assert(arg2 == (int32_t)arg2); 1630 } 1631 } 1632 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1); 1633 tcg_out32(s, arg2); 1634 return jz; 1635} 1636 1637static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond, 1638 TCGArg arg1, TCGArg arg2, int const_arg2, 1639 TCGLabel *label, bool small) 1640{ 1641 int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw); 1642 tcg_out_jxx(s, jcc, label, small); 1643} 1644 1645#if TCG_TARGET_REG_BITS == 32 1646static void tcg_out_brcond2(TCGContext *s, const TCGArg *args, 1647 const int *const_args, bool small) 1648{ 1649 TCGLabel *label_next = gen_new_label(); 1650 TCGLabel *label_this = arg_label(args[5]); 1651 TCGCond cond = args[4]; 1652 1653 switch (cond) { 1654 case TCG_COND_EQ: 1655 case TCG_COND_TSTEQ: 1656 tcg_out_brcond(s, 0, tcg_invert_cond(cond), 1657 args[0], args[2], const_args[2], label_next, 1); 1658 tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3], 1659 label_this, small); 1660 break; 1661 1662 case TCG_COND_NE: 1663 case TCG_COND_TSTNE: 1664 tcg_out_brcond(s, 0, cond, args[0], args[2], const_args[2], 1665 label_this, small); 1666 tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3], 1667 label_this, small); 1668 break; 1669 1670 default: 1671 tcg_out_brcond(s, 0, tcg_high_cond(cond), args[1], 1672 args[3], const_args[3], label_this, small); 1673 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1674 tcg_out_brcond(s, 0, tcg_unsigned_cond(cond), args[0], 1675 args[2], const_args[2], label_this, small); 1676 break; 1677 } 1678 tcg_out_label(s, label_next); 1679} 1680#endif 1681 1682static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond, 1683 TCGArg dest, TCGArg arg1, TCGArg arg2, 1684 int const_arg2, bool neg) 1685{ 1686 int cmp_rexw = rexw; 1687 bool inv = false; 1688 bool cleared; 1689 int jcc; 1690 1691 switch (cond) { 1692 case TCG_COND_NE: 1693 inv = true; 1694 /* fall through */ 1695 case TCG_COND_EQ: 1696 /* If arg2 is 0, convert to LTU/GEU vs 1. */ 1697 if (const_arg2 && arg2 == 0) { 1698 arg2 = 1; 1699 goto do_ltu; 1700 } 1701 break; 1702 1703 case TCG_COND_TSTNE: 1704 inv = true; 1705 /* fall through */ 1706 case TCG_COND_TSTEQ: 1707 /* If arg2 is -1, convert to LTU/GEU vs 1. */ 1708 if (const_arg2 && arg2 == 0xffffffffu) { 1709 arg2 = 1; 1710 cmp_rexw = 0; 1711 goto do_ltu; 1712 } 1713 break; 1714 1715 case TCG_COND_LEU: 1716 inv = true; 1717 /* fall through */ 1718 case TCG_COND_GTU: 1719 /* If arg2 is a register, swap for LTU/GEU. */ 1720 if (!const_arg2) { 1721 TCGReg t = arg1; 1722 arg1 = arg2; 1723 arg2 = t; 1724 goto do_ltu; 1725 } 1726 break; 1727 1728 case TCG_COND_GEU: 1729 inv = true; 1730 /* fall through */ 1731 case TCG_COND_LTU: 1732 do_ltu: 1733 /* 1734 * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU. 1735 * We can then use NEG or INC to produce the desired result. 1736 * This is always smaller than the SETCC expansion. 1737 */ 1738 tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw); 1739 1740 /* X - X - C = -C = (C ? -1 : 0) */ 1741 tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest); 1742 if (inv && neg) { 1743 /* ~(C ? -1 : 0) = (C ? 
0 : -1) */ 1744 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest); 1745 } else if (inv) { 1746 /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */ 1747 tgen_arithi(s, ARITH_ADD, dest, 1, 0); 1748 } else if (!neg) { 1749 /* -(C ? -1 : 0) = (C ? 1 : 0) */ 1750 tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest); 1751 } 1752 return; 1753 1754 case TCG_COND_GE: 1755 inv = true; 1756 /* fall through */ 1757 case TCG_COND_LT: 1758 /* If arg2 is 0, extract the sign bit. */ 1759 if (const_arg2 && arg2 == 0) { 1760 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1); 1761 if (inv) { 1762 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest); 1763 } 1764 tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw, 1765 dest, rexw ? 63 : 31); 1766 return; 1767 } 1768 break; 1769 1770 default: 1771 break; 1772 } 1773 1774 /* 1775 * If dest does not overlap the inputs, clearing it first is preferred. 1776 * The XOR breaks any false dependency for the low-byte write to dest, 1777 * and is also one byte smaller than MOVZBL. 1778 */ 1779 cleared = false; 1780 if (dest != arg1 && (const_arg2 || dest != arg2)) { 1781 tgen_arithr(s, ARITH_XOR, dest, dest); 1782 cleared = true; 1783 } 1784 1785 jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw); 1786 tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest); 1787 1788 if (!cleared) { 1789 tcg_out_ext8u(s, dest, dest); 1790 } 1791 if (neg) { 1792 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest); 1793 } 1794} 1795 1796#if TCG_TARGET_REG_BITS == 32 1797static void tcg_out_setcond2(TCGContext *s, const TCGArg *args, 1798 const int *const_args) 1799{ 1800 TCGArg new_args[6]; 1801 TCGLabel *label_true, *label_over; 1802 1803 memcpy(new_args, args+1, 5*sizeof(TCGArg)); 1804 1805 if (args[0] == args[1] || args[0] == args[2] 1806 || (!const_args[3] && args[0] == args[3]) 1807 || (!const_args[4] && args[0] == args[4])) { 1808 /* When the destination overlaps with one of the argument 1809 registers, don't do anything tricky. */ 1810 label_true = gen_new_label(); 1811 label_over = gen_new_label(); 1812 1813 new_args[5] = label_arg(label_true); 1814 tcg_out_brcond2(s, new_args, const_args+1, 1); 1815 1816 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1817 tcg_out_jxx(s, JCC_JMP, label_over, 1); 1818 tcg_out_label(s, label_true); 1819 1820 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1); 1821 tcg_out_label(s, label_over); 1822 } else { 1823 /* When the destination does not overlap one of the arguments, 1824 clear the destination first, jump if cond false, and emit an 1825 increment in the true case. This results in smaller code. 
*/ 1826 1827 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1828 1829 label_over = gen_new_label(); 1830 new_args[4] = tcg_invert_cond(new_args[4]); 1831 new_args[5] = label_arg(label_over); 1832 tcg_out_brcond2(s, new_args, const_args+1, 1); 1833 1834 tgen_arithi(s, ARITH_ADD, args[0], 1, 0); 1835 tcg_out_label(s, label_over); 1836 } 1837} 1838#endif 1839 1840static void tcg_out_cmov(TCGContext *s, int jcc, int rexw, 1841 TCGReg dest, TCGReg v1) 1842{ 1843 tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1); 1844} 1845 1846static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond, 1847 TCGReg dest, TCGReg c1, TCGArg c2, int const_c2, 1848 TCGReg v1) 1849{ 1850 int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw); 1851 tcg_out_cmov(s, jcc, rexw, dest, v1); 1852} 1853 1854static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest) 1855{ 1856 intptr_t disp = tcg_pcrel_diff(s, dest) - 5; 1857 1858 if (disp == (int32_t)disp) { 1859 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0); 1860 tcg_out32(s, disp); 1861 } else { 1862 /* rip-relative addressing into the constant pool. 1863 This is 6 + 8 = 14 bytes, as compared to using an 1864 immediate load 10 + 6 = 16 bytes, plus we may 1865 be able to re-use the pool constant for more calls. */ 1866 tcg_out_opc(s, OPC_GRP5, 0, 0, 0); 1867 tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5); 1868 new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4); 1869 tcg_out32(s, 0); 1870 } 1871} 1872 1873static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest, 1874 const TCGHelperInfo *info) 1875{ 1876 tcg_out_branch(s, 1, dest); 1877 1878#ifndef _WIN32 1879 if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) { 1880 /* 1881 * The sysv i386 abi for struct return places a reference as the 1882 * first argument of the stack, and pops that argument with the 1883 * return statement. Since we want to retain the aligned stack 1884 * pointer for the callee, we do not want to actually push that 1885 * argument before the call but rely on the normal store to the 1886 * stack slot. But we do need to compensate for the pop in order 1887 * to reset our correct stack pointer value. 1888 * Pushing a garbage value back onto the stack is quickest. 1889 */ 1890 tcg_out_push(s, TCG_REG_EAX); 1891 } 1892#endif 1893} 1894 1895static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest) 1896{ 1897 tcg_out_branch(s, 0, dest); 1898} 1899 1900static void tcg_out_nopn(TCGContext *s, int n) 1901{ 1902 int i; 1903 /* Emit 1 or 2 operand size prefixes for the standard one byte nop, 1904 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the 1905 * duplicate prefix, and all of the interesting recent cores can 1906 * decode and discard the duplicates in a single cycle. 1907 */ 1908 tcg_debug_assert(n >= 1); 1909 for (i = 1; i < n; ++i) { 1910 tcg_out8(s, 0x66); 1911 } 1912 tcg_out8(s, 0x90); 1913} 1914 1915typedef struct { 1916 TCGReg base; 1917 int index; 1918 int ofs; 1919 int seg; 1920 TCGAtomAlign aa; 1921} HostAddress; 1922 1923bool tcg_target_has_memory_bswap(MemOp memop) 1924{ 1925 TCGAtomAlign aa; 1926 1927 if (!have_movbe) { 1928 return false; 1929 } 1930 if ((memop & MO_SIZE) < MO_128) { 1931 return true; 1932 } 1933 1934 /* 1935 * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA, 1936 * but do allow a pair of 64-bit operations, i.e. MOVBEQ. 
1937 */ 1938 aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true); 1939 return aa.atom < MO_128; 1940} 1941 1942/* 1943 * Because i686 has no register parameters and because x86_64 has xchg 1944 * to handle addr/data register overlap, we have placed all input arguments 1945 * before we might need a scratch reg. 1946 * 1947 * Even then, a scratch is only needed for l->raddr. Rather than expose 1948 * a general-purpose scratch when we don't actually know it's available, 1949 * use the ra_gen hook to load into RAX if needed. 1950 */ 1951#if TCG_TARGET_REG_BITS == 64 1952static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg) 1953{ 1954 if (arg < 0) { 1955 arg = TCG_REG_RAX; 1956 } 1957 tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr); 1958 return arg; 1959} 1960static const TCGLdstHelperParam ldst_helper_param = { 1961 .ra_gen = ldst_ra_gen 1962}; 1963#else 1964static const TCGLdstHelperParam ldst_helper_param = { }; 1965#endif 1966 1967static void tcg_out_vec_to_pair(TCGContext *s, TCGType type, 1968 TCGReg l, TCGReg h, TCGReg v) 1969{ 1970 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1971 1972 /* vpmov{d,q} %v, %l */ 1973 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l); 1974 /* vpextr{d,q} $1, %v, %h */ 1975 tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h); 1976 tcg_out8(s, 1); 1977} 1978 1979static void tcg_out_pair_to_vec(TCGContext *s, TCGType type, 1980 TCGReg v, TCGReg l, TCGReg h) 1981{ 1982 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1983 1984 /* vmov{d,q} %l, %v */ 1985 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l); 1986 /* vpinsr{d,q} $1, %h, %v, %v */ 1987 tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h); 1988 tcg_out8(s, 1); 1989} 1990 1991/* 1992 * Generate code for the slow path for a load at the end of block 1993 */ 1994static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 1995{ 1996 MemOp opc = get_memop(l->oi); 1997 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 1998 1999 /* resolve label address */ 2000 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2001 if (label_ptr[1]) { 2002 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2003 } 2004 2005 tcg_out_ld_helper_args(s, l, &ldst_helper_param); 2006 tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]); 2007 tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param); 2008 2009 tcg_out_jmp(s, l->raddr); 2010 return true; 2011} 2012 2013/* 2014 * Generate code for the slow path for a store at the end of block 2015 */ 2016static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2017{ 2018 MemOp opc = get_memop(l->oi); 2019 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2020 2021 /* resolve label address */ 2022 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2023 if (label_ptr[1]) { 2024 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2025 } 2026 2027 tcg_out_st_helper_args(s, l, &ldst_helper_param); 2028 tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]); 2029 2030 tcg_out_jmp(s, l->raddr); 2031 return true; 2032} 2033 2034#ifdef CONFIG_USER_ONLY 2035static HostAddress x86_guest_base = { 2036 .index = -1 2037}; 2038 2039#if defined(__x86_64__) && defined(__linux__) 2040# include <asm/prctl.h> 2041# include <sys/prctl.h> 2042int arch_prctl(int code, unsigned long addr); 2043static inline int setup_guest_base_seg(void) 2044{ 2045 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) { 2046 return P_GS; 2047 } 2048 return 0; 2049} 2050#define setup_guest_base_seg setup_guest_base_seg 2051#elif
defined(__x86_64__) && \ 2052 (defined (__FreeBSD__) || defined (__FreeBSD_kernel__)) 2053# include <machine/sysarch.h> 2054static inline int setup_guest_base_seg(void) 2055{ 2056 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) { 2057 return P_GS; 2058 } 2059 return 0; 2060} 2061#define setup_guest_base_seg setup_guest_base_seg 2062#endif 2063#else 2064# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; })) 2065#endif /* CONFIG_USER_ONLY */ 2066#ifndef setup_guest_base_seg 2067# define setup_guest_base_seg() 0 2068#endif 2069 2070#define MIN_TLB_MASK_TABLE_OFS INT_MIN 2071 2072/* 2073 * For softmmu, perform the TLB load and compare. 2074 * For useronly, perform any required alignment tests. 2075 * In both cases, return a TCGLabelQemuLdst structure if the slow path 2076 * is required and fill in @h with the host address for the fast path. 2077 */ 2078static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, 2079 TCGReg addr, MemOpIdx oi, bool is_ld) 2080{ 2081 TCGLabelQemuLdst *ldst = NULL; 2082 MemOp opc = get_memop(oi); 2083 MemOp s_bits = opc & MO_SIZE; 2084 unsigned a_mask; 2085 2086 if (tcg_use_softmmu) { 2087 h->index = TCG_REG_L0; 2088 h->ofs = 0; 2089 h->seg = 0; 2090 } else { 2091 *h = x86_guest_base; 2092 } 2093 h->base = addr; 2094 h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128); 2095 a_mask = (1 << h->aa.align) - 1; 2096 2097 if (tcg_use_softmmu) { 2098 int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read) 2099 : offsetof(CPUTLBEntry, addr_write); 2100 TCGType ttype = TCG_TYPE_I32; 2101 TCGType tlbtype = TCG_TYPE_I32; 2102 int trexw = 0, hrexw = 0, tlbrexw = 0; 2103 unsigned mem_index = get_mmuidx(oi); 2104 unsigned s_mask = (1 << s_bits) - 1; 2105 int fast_ofs = tlb_mask_table_ofs(s, mem_index); 2106 int tlb_mask; 2107 2108 ldst = new_ldst_label(s); 2109 ldst->is_ld = is_ld; 2110 ldst->oi = oi; 2111 ldst->addr_reg = addr; 2112 2113 if (TCG_TARGET_REG_BITS == 64) { 2114 ttype = s->addr_type; 2115 trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW); 2116 if (TCG_TYPE_PTR == TCG_TYPE_I64) { 2117 hrexw = P_REXW; 2118 if (s->page_bits + s->tlb_dyn_max_bits > 32) { 2119 tlbtype = TCG_TYPE_I64; 2120 tlbrexw = P_REXW; 2121 } 2122 } 2123 } 2124 2125 tcg_out_mov(s, tlbtype, TCG_REG_L0, addr); 2126 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0, 2127 s->page_bits - CPU_TLB_ENTRY_BITS); 2128 2129 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0, 2130 fast_ofs + offsetof(CPUTLBDescFast, mask)); 2131 2132 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0, 2133 fast_ofs + offsetof(CPUTLBDescFast, table)); 2134 2135 /* 2136 * If the required alignment is at least as large as the access, 2137 * simply copy the address and mask. For lesser alignments, 2138 * check that we don't cross pages for the complete access. 2139 */ 2140 if (a_mask >= s_mask) { 2141 tcg_out_mov(s, ttype, TCG_REG_L1, addr); 2142 } else { 2143 tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1, 2144 addr, s_mask - a_mask); 2145 } 2146 tlb_mask = s->page_mask | a_mask; 2147 tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0); 2148 2149 /* cmp 0(TCG_REG_L0), TCG_REG_L1 */ 2150 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, 2151 TCG_REG_L1, TCG_REG_L0, cmp_ofs); 2152 2153 /* jne slow_path */ 2154 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2155 ldst->label_ptr[0] = s->code_ptr; 2156 s->code_ptr += 4; 2157 2158 /* TLB Hit. 
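       For reference, the softmmu fast path emitted above amounts to the
       following sketch on a 64-bit host (registers shown schematically):
           mov   %addr, %L0
           shr   $(page_bits - CPU_TLB_ENTRY_BITS), %L0
           and   fast_ofs+mask(%AREG0), %L0
           add   fast_ofs+table(%AREG0), %L0
           lea   (s_mask - a_mask)(%addr), %L1    ; plain mov when a_mask >= s_mask
           and   $(page_mask | a_mask), %L1
           cmp   cmp_ofs(%L0), %L1
           jne   slow_path
       On a hit, the addend loaded below is added to %addr to form the
       host address.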
*/ 2159 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0, 2160 offsetof(CPUTLBEntry, addend)); 2161 } else if (a_mask) { 2162 int jcc; 2163 2164 ldst = new_ldst_label(s); 2165 ldst->is_ld = is_ld; 2166 ldst->oi = oi; 2167 ldst->addr_reg = addr; 2168 2169 /* jne slow_path */ 2170 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addr, a_mask, true, false); 2171 tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0); 2172 ldst->label_ptr[0] = s->code_ptr; 2173 s->code_ptr += 4; 2174 } 2175 2176 return ldst; 2177} 2178 2179static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2180 HostAddress h, TCGType type, MemOp memop) 2181{ 2182 bool use_movbe = false; 2183 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 2184 int movop = OPC_MOVL_GvEv; 2185 2186 /* Do big-endian loads with movbe. */ 2187 if (memop & MO_BSWAP) { 2188 tcg_debug_assert(have_movbe); 2189 use_movbe = true; 2190 movop = OPC_MOVBE_GyMy; 2191 } 2192 2193 switch (memop & MO_SSIZE) { 2194 case MO_UB: 2195 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo, 2196 h.base, h.index, 0, h.ofs); 2197 break; 2198 case MO_SB: 2199 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo, 2200 h.base, h.index, 0, h.ofs); 2201 break; 2202 case MO_UW: 2203 if (use_movbe) { 2204 /* There is no extending movbe; only low 16-bits are modified. */ 2205 if (datalo != h.base && datalo != h.index) { 2206 /* XOR breaks dependency chains. */ 2207 tgen_arithr(s, ARITH_XOR, datalo, datalo); 2208 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2209 datalo, h.base, h.index, 0, h.ofs); 2210 } else { 2211 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2212 datalo, h.base, h.index, 0, h.ofs); 2213 tcg_out_ext16u(s, datalo, datalo); 2214 } 2215 } else { 2216 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo, 2217 h.base, h.index, 0, h.ofs); 2218 } 2219 break; 2220 case MO_SW: 2221 if (use_movbe) { 2222 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2223 datalo, h.base, h.index, 0, h.ofs); 2224 tcg_out_ext16s(s, type, datalo, datalo); 2225 } else { 2226 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg, 2227 datalo, h.base, h.index, 0, h.ofs); 2228 } 2229 break; 2230 case MO_UL: 2231 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2232 h.base, h.index, 0, h.ofs); 2233 break; 2234#if TCG_TARGET_REG_BITS == 64 2235 case MO_SL: 2236 if (use_movbe) { 2237 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo, 2238 h.base, h.index, 0, h.ofs); 2239 tcg_out_ext32s(s, datalo, datalo); 2240 } else { 2241 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo, 2242 h.base, h.index, 0, h.ofs); 2243 } 2244 break; 2245#endif 2246 case MO_UQ: 2247 if (TCG_TARGET_REG_BITS == 64) { 2248 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2249 h.base, h.index, 0, h.ofs); 2250 break; 2251 } 2252 if (use_movbe) { 2253 TCGReg t = datalo; 2254 datalo = datahi; 2255 datahi = t; 2256 } 2257 if (h.base == datalo || h.index == datalo) { 2258 tcg_out_modrm_sib_offset(s, OPC_LEA, datahi, 2259 h.base, h.index, 0, h.ofs); 2260 tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0); 2261 tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4); 2262 } else { 2263 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2264 h.base, h.index, 0, h.ofs); 2265 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2266 h.base, h.index, 0, h.ofs + 4); 2267 } 2268 break; 2269 2270 case MO_128: 2271 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2272 2273 /* 2274 * Without 16-byte atomicity, use integer regs. 
2275 * That is where we want the data, and it allows bswaps. 2276 */ 2277 if (h.aa.atom < MO_128) { 2278 if (use_movbe) { 2279 TCGReg t = datalo; 2280 datalo = datahi; 2281 datahi = t; 2282 } 2283 if (h.base == datalo || h.index == datalo) { 2284 tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi, 2285 h.base, h.index, 0, h.ofs); 2286 tcg_out_modrm_offset(s, movop + P_REXW + h.seg, 2287 datalo, datahi, 0); 2288 tcg_out_modrm_offset(s, movop + P_REXW + h.seg, 2289 datahi, datahi, 8); 2290 } else { 2291 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2292 h.base, h.index, 0, h.ofs); 2293 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, 2294 h.base, h.index, 0, h.ofs + 8); 2295 } 2296 break; 2297 } 2298 2299 /* 2300 * With 16-byte atomicity, a vector load is required. 2301 * If we already have 16-byte alignment, then VMOVDQA always works. 2302 * Else if VMOVDQU has atomicity with dynamic alignment, use that. 2303 * Else we require a runtime test for alignment for VMOVDQA; 2304 * use VMOVDQU on the unaligned nonatomic path for simplicity. 2305 */ 2306 if (h.aa.align >= MO_128) { 2307 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2308 TCG_TMP_VEC, 0, 2309 h.base, h.index, 0, h.ofs); 2310 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2311 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2312 TCG_TMP_VEC, 0, 2313 h.base, h.index, 0, h.ofs); 2314 } else { 2315 TCGLabel *l1 = gen_new_label(); 2316 TCGLabel *l2 = gen_new_label(); 2317 int jcc; 2318 2319 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2320 tcg_out_jxx(s, jcc, l1, true); 2321 2322 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2323 TCG_TMP_VEC, 0, 2324 h.base, h.index, 0, h.ofs); 2325 tcg_out_jxx(s, JCC_JMP, l2, true); 2326 2327 tcg_out_label(s, l1); 2328 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2329 TCG_TMP_VEC, 0, 2330 h.base, h.index, 0, h.ofs); 2331 tcg_out_label(s, l2); 2332 } 2333 tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC); 2334 break; 2335 2336 default: 2337 g_assert_not_reached(); 2338 } 2339} 2340 2341static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi, 2342 TCGReg addr, MemOpIdx oi, TCGType data_type) 2343{ 2344 TCGLabelQemuLdst *ldst; 2345 HostAddress h; 2346 2347 ldst = prepare_host_addr(s, &h, addr, oi, true); 2348 tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi)); 2349 2350 if (ldst) { 2351 ldst->type = data_type; 2352 ldst->datalo_reg = datalo; 2353 ldst->datahi_reg = datahi; 2354 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2355 } 2356} 2357 2358static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2359 HostAddress h, MemOp memop) 2360{ 2361 bool use_movbe = false; 2362 int movop = OPC_MOVL_EvGv; 2363 2364 /* 2365 * Do big-endian stores with movbe or system-mode. 2366 * User-only without movbe will have its swapping done generically. 2367 */ 2368 if (memop & MO_BSWAP) { 2369 tcg_debug_assert(have_movbe); 2370 use_movbe = true; 2371 movop = OPC_MOVBE_MyGy; 2372 } 2373 2374 switch (memop & MO_SIZE) { 2375 case MO_8: 2376 /* This is handled with constraints on INDEX_op_qemu_st8_i32.
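          On a 32-bit host only %al, %cl, %dl and %bl are encodable as byte
          operands, hence the datalo < 4 assertion below; x86_64 can address
          the low byte of any register via the REX prefix.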
*/ 2377 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4); 2378 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg, 2379 datalo, h.base, h.index, 0, h.ofs); 2380 break; 2381 case MO_16: 2382 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo, 2383 h.base, h.index, 0, h.ofs); 2384 break; 2385 case MO_32: 2386 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2387 h.base, h.index, 0, h.ofs); 2388 break; 2389 case MO_64: 2390 if (TCG_TARGET_REG_BITS == 64) { 2391 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2392 h.base, h.index, 0, h.ofs); 2393 } else { 2394 if (use_movbe) { 2395 TCGReg t = datalo; 2396 datalo = datahi; 2397 datahi = t; 2398 } 2399 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2400 h.base, h.index, 0, h.ofs); 2401 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2402 h.base, h.index, 0, h.ofs + 4); 2403 } 2404 break; 2405 2406 case MO_128: 2407 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2408 2409 /* 2410 * Without 16-byte atomicity, use integer regs. 2411 * That is where we have the data, and it allows bswaps. 2412 */ 2413 if (h.aa.atom < MO_128) { 2414 if (use_movbe) { 2415 TCGReg t = datalo; 2416 datalo = datahi; 2417 datahi = t; 2418 } 2419 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2420 h.base, h.index, 0, h.ofs); 2421 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, 2422 h.base, h.index, 0, h.ofs + 8); 2423 break; 2424 } 2425 2426 /* 2427 * With 16-byte atomicity, a vector store is required. 2428 * If we already have 16-byte alignment, then VMOVDQA always works. 2429 * Else if VMOVDQU has atomicity with dynamic alignment, use that. 2430 * Else we require a runtime test for alignment for VMOVDQA; 2431 * use VMOVDQU on the unaligned nonatomic path for simplicity. 2432 */ 2433 tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi); 2434 if (h.aa.align >= MO_128) { 2435 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2436 TCG_TMP_VEC, 0, 2437 h.base, h.index, 0, h.ofs); 2438 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2439 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2440 TCG_TMP_VEC, 0, 2441 h.base, h.index, 0, h.ofs); 2442 } else { 2443 TCGLabel *l1 = gen_new_label(); 2444 TCGLabel *l2 = gen_new_label(); 2445 int jcc; 2446 2447 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2448 tcg_out_jxx(s, jcc, l1, true); 2449 2450 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2451 TCG_TMP_VEC, 0, 2452 h.base, h.index, 0, h.ofs); 2453 tcg_out_jxx(s, JCC_JMP, l2, true); 2454 2455 tcg_out_label(s, l1); 2456 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2457 TCG_TMP_VEC, 0, 2458 h.base, h.index, 0, h.ofs); 2459 tcg_out_label(s, l2); 2460 } 2461 break; 2462 2463 default: 2464 g_assert_not_reached(); 2465 } 2466} 2467 2468static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi, 2469 TCGReg addr, MemOpIdx oi, TCGType data_type) 2470{ 2471 TCGLabelQemuLdst *ldst; 2472 HostAddress h; 2473 2474 ldst = prepare_host_addr(s, &h, addr, oi, false); 2475 tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi)); 2476 2477 if (ldst) { 2478 ldst->type = data_type; 2479 ldst->datalo_reg = datalo; 2480 ldst->datahi_reg = datahi; 2481 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2482 } 2483} 2484 2485static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0) 2486{ 2487 /* Reuse the zeroing that exists for goto_ptr.
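       A zero return value can jump straight to tcg_code_gen_epilogue, which
       already leaves zero in EAX; any other value is loaded into EAX before
       jumping to the common tb_ret_addr return path.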
*/ 2488 if (a0 == 0) { 2489 tcg_out_jmp(s, tcg_code_gen_epilogue); 2490 } else { 2491 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0); 2492 tcg_out_jmp(s, tb_ret_addr); 2493 } 2494} 2495 2496static void tcg_out_goto_tb(TCGContext *s, int which) 2497{ 2498 /* 2499 * Jump displacement must be aligned for atomic patching; 2500 * see if we need to add extra nops before jump 2501 */ 2502 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr; 2503 if (gap != 1) { 2504 tcg_out_nopn(s, gap - 1); 2505 } 2506 tcg_out8(s, OPC_JMP_long); /* jmp im */ 2507 set_jmp_insn_offset(s, which); 2508 tcg_out32(s, 0); 2509 set_jmp_reset_offset(s, which); 2510} 2511 2512void tb_target_set_jmp_target(const TranslationBlock *tb, int n, 2513 uintptr_t jmp_rx, uintptr_t jmp_rw) 2514{ 2515 /* patch the branch destination */ 2516 uintptr_t addr = tb->jmp_target_addr[n]; 2517 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4)); 2518 /* no need to flush icache explicitly */ 2519} 2520 2521 2522static void tgen_add(TCGContext *s, TCGType type, 2523 TCGReg a0, TCGReg a1, TCGReg a2) 2524{ 2525 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2526 2527 if (a0 == a1) { 2528 tgen_arithr(s, ARITH_ADD + rexw, a0, a2); 2529 } else if (a0 == a2) { 2530 tgen_arithr(s, ARITH_ADD + rexw, a0, a1); 2531 } else { 2532 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, 0); 2533 } 2534} 2535 2536static void tgen_addi(TCGContext *s, TCGType type, 2537 TCGReg a0, TCGReg a1, tcg_target_long a2) 2538{ 2539 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2540 2541 if (a0 == a1) { 2542 tgen_arithi(s, ARITH_ADD + rexw, a0, a2, false); 2543 } else { 2544 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, -1, 0, a2); 2545 } 2546} 2547 2548static const TCGOutOpBinary outop_add = { 2549 .base.static_constraint = C_O1_I2(r, r, re), 2550 .out_rrr = tgen_add, 2551 .out_rri = tgen_addi, 2552}; 2553 2554static void tgen_and(TCGContext *s, TCGType type, 2555 TCGReg a0, TCGReg a1, TCGReg a2) 2556{ 2557 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2558 tgen_arithr(s, ARITH_AND + rexw, a0, a2); 2559} 2560 2561static void tgen_andi(TCGContext *s, TCGType type, 2562 TCGReg a0, TCGReg a1, tcg_target_long a2) 2563{ 2564 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2565 tgen_arithi(s, ARITH_AND + rexw, a0, a2, false); 2566} 2567 2568static const TCGOutOpBinary outop_and = { 2569 .base.static_constraint = C_O1_I2(r, 0, reZ), 2570 .out_rrr = tgen_and, 2571 .out_rri = tgen_andi, 2572}; 2573 2574static void tgen_andc(TCGContext *s, TCGType type, 2575 TCGReg a0, TCGReg a1, TCGReg a2) 2576{ 2577 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2578 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1); 2579} 2580 2581static TCGConstraintSetIndex cset_andc(TCGType type, unsigned flags) 2582{ 2583 return have_bmi1 ? C_O1_I2(r, r, r) : C_NotImplemented; 2584} 2585 2586static const TCGOutOpBinary outop_andc = { 2587 .base.static_constraint = C_Dynamic, 2588 .base.dynamic_constraint = cset_andc, 2589 .out_rrr = tgen_andc, 2590}; 2591 2592static void tgen_clz(TCGContext *s, TCGType type, 2593 TCGReg a0, TCGReg a1, TCGReg a2) 2594{ 2595 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2596 int jcc; 2597 2598 if (have_lzcnt) { 2599 tcg_out_modrm(s, OPC_LZCNT + rexw, a0, a1); 2600 jcc = JCC_JB; 2601 } else { 2602 /* Recall that the output of BSR is the index not the count. */ 2603 tcg_out_modrm(s, OPC_BSR + rexw, a0, a1); 2604 tgen_arithi(s, ARITH_XOR + rexw, a0, rexw ? 63 : 31, 0); 2605 2606 /* Since we have destroyed the flags from BSR, we have to re-test. 
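       BSR leaves its destination undefined for a zero input, and the XOR
       above turns its bit index into a leading-zero count, so test the
       original operand again and let the CMOV below substitute a2 when it
       was zero.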
*/ 2607 jcc = tcg_out_cmp(s, TCG_COND_EQ, a1, 0, 1, rexw); 2608 } 2609 tcg_out_cmov(s, jcc, rexw, a0, a2); 2610} 2611 2612static void tgen_clzi(TCGContext *s, TCGType type, 2613 TCGReg a0, TCGReg a1, tcg_target_long a2) 2614{ 2615 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2616 tcg_out_modrm(s, OPC_LZCNT + rexw, a0, a1); 2617} 2618 2619static TCGConstraintSetIndex cset_clz(TCGType type, unsigned flags) 2620{ 2621 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 2622} 2623 2624static const TCGOutOpBinary outop_clz = { 2625 .base.static_constraint = C_Dynamic, 2626 .base.dynamic_constraint = cset_clz, 2627 .out_rrr = tgen_clz, 2628 .out_rri = tgen_clzi, 2629}; 2630 2631static void tgen_ctpop(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 2632{ 2633 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2634 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1); 2635} 2636 2637static TCGConstraintSetIndex cset_ctpop(TCGType type, unsigned flags) 2638{ 2639 return have_popcnt ? C_O1_I1(r, r) : C_NotImplemented; 2640} 2641 2642static const TCGOutOpUnary outop_ctpop = { 2643 .base.static_constraint = C_Dynamic, 2644 .base.dynamic_constraint = cset_ctpop, 2645 .out_rr = tgen_ctpop, 2646}; 2647 2648static void tgen_ctz(TCGContext *s, TCGType type, 2649 TCGReg a0, TCGReg a1, TCGReg a2) 2650{ 2651 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2652 int jcc; 2653 2654 if (have_bmi1) { 2655 tcg_out_modrm(s, OPC_TZCNT + rexw, a0, a1); 2656 jcc = JCC_JB; 2657 } else { 2658 tcg_out_modrm(s, OPC_BSF + rexw, a0, a1); 2659 jcc = JCC_JE; 2660 } 2661 tcg_out_cmov(s, jcc, rexw, a0, a2); 2662} 2663 2664static void tgen_ctzi(TCGContext *s, TCGType type, 2665 TCGReg a0, TCGReg a1, tcg_target_long a2) 2666{ 2667 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2668 tcg_out_modrm(s, OPC_TZCNT + rexw, a0, a1); 2669} 2670 2671static TCGConstraintSetIndex cset_ctz(TCGType type, unsigned flags) 2672{ 2673 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 2674} 2675 2676static const TCGOutOpBinary outop_ctz = { 2677 .base.static_constraint = C_Dynamic, 2678 .base.dynamic_constraint = cset_ctz, 2679 .out_rrr = tgen_ctz, 2680 .out_rri = tgen_ctzi, 2681}; 2682 2683static const TCGOutOpBinary outop_divs = { 2684 .base.static_constraint = C_NotImplemented, 2685}; 2686 2687static void tgen_divs2(TCGContext *s, TCGType type, 2688 TCGReg a0, TCGReg a1, TCGReg a4) 2689{ 2690 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2691 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, a4); 2692} 2693 2694static const TCGOutOpDivRem outop_divs2 = { 2695 .base.static_constraint = C_O2_I3(a, d, 0, 1, r), 2696 .out_rr01r = tgen_divs2, 2697}; 2698 2699static const TCGOutOpBinary outop_divu = { 2700 .base.static_constraint = C_NotImplemented, 2701}; 2702 2703static void tgen_divu2(TCGContext *s, TCGType type, 2704 TCGReg a0, TCGReg a1, TCGReg a4) 2705{ 2706 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2707 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, a4); 2708} 2709 2710static const TCGOutOpDivRem outop_divu2 = { 2711 .base.static_constraint = C_O2_I3(a, d, 0, 1, r), 2712 .out_rr01r = tgen_divu2, 2713}; 2714 2715static const TCGOutOpBinary outop_eqv = { 2716 .base.static_constraint = C_NotImplemented, 2717}; 2718 2719static void tgen_mul(TCGContext *s, TCGType type, 2720 TCGReg a0, TCGReg a1, TCGReg a2) 2721{ 2722 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 2723 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2); 2724} 2725 2726static void tgen_muli(TCGContext *s, TCGType type, 2727 TCGReg a0, TCGReg a1, tcg_target_long a2) 2728{ 2729 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2730 2731 if (a2 == (int8_t)a2) { 2732 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0); 2733 tcg_out8(s, a2); 2734 } else { 2735 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0); 2736 tcg_out32(s, a2); 2737 } 2738} 2739 2740static const TCGOutOpBinary outop_mul = { 2741 .base.static_constraint = C_O1_I2(r, 0, re), 2742 .out_rrr = tgen_mul, 2743 .out_rri = tgen_muli, 2744}; 2745 2746static void tgen_muls2(TCGContext *s, TCGType type, 2747 TCGReg a0, TCGReg a1, TCGReg a2, TCGReg a3) 2748{ 2749 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2750 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, a3); 2751} 2752 2753static const TCGOutOpMul2 outop_muls2 = { 2754 .base.static_constraint = C_O2_I2(a, d, a, r), 2755 .out_rrrr = tgen_muls2, 2756}; 2757 2758static const TCGOutOpBinary outop_mulsh = { 2759 .base.static_constraint = C_NotImplemented, 2760}; 2761 2762static const TCGOutOpBinary outop_muluh = { 2763 .base.static_constraint = C_NotImplemented, 2764}; 2765 2766static void tgen_mulu2(TCGContext *s, TCGType type, 2767 TCGReg a0, TCGReg a1, TCGReg a2, TCGReg a3) 2768{ 2769 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2770 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, a3); 2771} 2772 2773static const TCGOutOpMul2 outop_mulu2 = { 2774 .base.static_constraint = C_O2_I2(a, d, a, r), 2775 .out_rrrr = tgen_mulu2, 2776}; 2777 2778static const TCGOutOpBinary outop_nand = { 2779 .base.static_constraint = C_NotImplemented, 2780}; 2781 2782static const TCGOutOpBinary outop_nor = { 2783 .base.static_constraint = C_NotImplemented, 2784}; 2785 2786static void tgen_or(TCGContext *s, TCGType type, 2787 TCGReg a0, TCGReg a1, TCGReg a2) 2788{ 2789 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2790 tgen_arithr(s, ARITH_OR + rexw, a0, a2); 2791} 2792 2793static void tgen_ori(TCGContext *s, TCGType type, 2794 TCGReg a0, TCGReg a1, tcg_target_long a2) 2795{ 2796 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2797 tgen_arithi(s, ARITH_OR + rexw, a0, a2, false); 2798} 2799 2800static const TCGOutOpBinary outop_or = { 2801 .base.static_constraint = C_O1_I2(r, 0, re), 2802 .out_rrr = tgen_or, 2803 .out_rri = tgen_ori, 2804}; 2805 2806static const TCGOutOpBinary outop_orc = { 2807 .base.static_constraint = C_NotImplemented, 2808}; 2809 2810static const TCGOutOpBinary outop_rems = { 2811 .base.static_constraint = C_NotImplemented, 2812}; 2813 2814static const TCGOutOpBinary outop_remu = { 2815 .base.static_constraint = C_NotImplemented, 2816}; 2817 2818static void tgen_rotl(TCGContext *s, TCGType type, 2819 TCGReg a0, TCGReg a1, TCGReg a2) 2820{ 2821 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2822 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_ROL, a0); 2823} 2824 2825static void tgen_rotli(TCGContext *s, TCGType type, 2826 TCGReg a0, TCGReg a1, tcg_target_long a2) 2827{ 2828 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2829 tcg_out_shifti(s, SHIFT_ROL + rexw, a0, a2); 2830} 2831 2832static const TCGOutOpBinary outop_rotl = { 2833 .base.static_constraint = C_O1_I2(r, 0, ci), 2834 .out_rrr = tgen_rotl, 2835 .out_rri = tgen_rotli, 2836}; 2837 2838static void tgen_rotr(TCGContext *s, TCGType type, 2839 TCGReg a0, TCGReg a1, TCGReg a2) 2840{ 2841 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 2842 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_ROR, a0); 2843} 2844 2845static void tgen_rotri(TCGContext *s, TCGType type, 2846 TCGReg a0, TCGReg a1, tcg_target_long a2) 2847{ 2848 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2849 tcg_out_shifti(s, SHIFT_ROR + rexw, a0, a2); 2850} 2851 2852static const TCGOutOpBinary outop_rotr = { 2853 .base.static_constraint = C_O1_I2(r, 0, ci), 2854 .out_rrr = tgen_rotr, 2855 .out_rri = tgen_rotri, 2856}; 2857 2858static TCGConstraintSetIndex cset_shift(TCGType type, unsigned flags) 2859{ 2860 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci); 2861} 2862 2863static void tgen_sar(TCGContext *s, TCGType type, 2864 TCGReg a0, TCGReg a1, TCGReg a2) 2865{ 2866 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2867 if (have_bmi2) { 2868 tcg_out_vex_modrm(s, OPC_SARX + rexw, a0, a2, a1); 2869 } else { 2870 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_SAR, a0); 2871 } 2872} 2873 2874static void tgen_sari(TCGContext *s, TCGType type, 2875 TCGReg a0, TCGReg a1, tcg_target_long a2) 2876{ 2877 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2878 2879 tcg_out_mov(s, type, a0, a1); 2880 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, a2); 2881} 2882 2883static const TCGOutOpBinary outop_sar = { 2884 .base.static_constraint = C_Dynamic, 2885 .base.dynamic_constraint = cset_shift, 2886 .out_rrr = tgen_sar, 2887 .out_rri = tgen_sari, 2888}; 2889 2890static void tgen_shl(TCGContext *s, TCGType type, 2891 TCGReg a0, TCGReg a1, TCGReg a2) 2892{ 2893 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2894 if (have_bmi2) { 2895 tcg_out_vex_modrm(s, OPC_SHLX + rexw, a0, a2, a1); 2896 } else { 2897 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_SHL, a0); 2898 } 2899} 2900 2901static void tgen_shli(TCGContext *s, TCGType type, 2902 TCGReg a0, TCGReg a1, tcg_target_long a2) 2903{ 2904 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2905 2906 /* For small constant 3-operand shift, use LEA. */ 2907 if (a0 != a1 && a2 >= 1 && a2 <= 3) { 2908 if (a2 == 1) { 2909 /* shl $1,a1,a0 -> lea (a1,a1),a0 */ 2910 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0); 2911 } else { 2912 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */ 2913 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0); 2914 } 2915 return; 2916 } 2917 tcg_out_mov(s, type, a0, a1); 2918 tcg_out_shifti(s, SHIFT_SHL + rexw, a0, a2); 2919} 2920 2921static const TCGOutOpBinary outop_shl = { 2922 .base.static_constraint = C_Dynamic, 2923 .base.dynamic_constraint = cset_shift, 2924 .out_rrr = tgen_shl, 2925 .out_rri = tgen_shli, 2926}; 2927 2928static void tgen_shr(TCGContext *s, TCGType type, 2929 TCGReg a0, TCGReg a1, TCGReg a2) 2930{ 2931 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2932 if (have_bmi2) { 2933 tcg_out_vex_modrm(s, OPC_SHRX + rexw, a0, a2, a1); 2934 } else { 2935 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_SHR, a0); 2936 } 2937} 2938 2939static void tgen_shri(TCGContext *s, TCGType type, 2940 TCGReg a0, TCGReg a1, tcg_target_long a2) 2941{ 2942 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2943 2944 tcg_out_mov(s, type, a0, a1); 2945 tcg_out_shifti(s, SHIFT_SHR + rexw, a0, a2); 2946} 2947 2948static const TCGOutOpBinary outop_shr = { 2949 .base.static_constraint = C_Dynamic, 2950 .base.dynamic_constraint = cset_shift, 2951 .out_rrr = tgen_shr, 2952 .out_rri = tgen_shri, 2953}; 2954 2955static void tgen_sub(TCGContext *s, TCGType type, 2956 TCGReg a0, TCGReg a1, TCGReg a2) 2957{ 2958 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 2959 tgen_arithr(s, ARITH_SUB + rexw, a0, a2); 2960} 2961 2962static const TCGOutOpSubtract outop_sub = { 2963 .base.static_constraint = C_O1_I2(r, 0, r), 2964 .out_rrr = tgen_sub, 2965}; 2966 2967static void tgen_xor(TCGContext *s, TCGType type, 2968 TCGReg a0, TCGReg a1, TCGReg a2) 2969{ 2970 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2971 tgen_arithr(s, ARITH_XOR + rexw, a0, a2); 2972} 2973 2974static void tgen_xori(TCGContext *s, TCGType type, 2975 TCGReg a0, TCGReg a1, tcg_target_long a2) 2976{ 2977 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2978 tgen_arithi(s, ARITH_XOR + rexw, a0, a2, false); 2979} 2980 2981static const TCGOutOpBinary outop_xor = { 2982 .base.static_constraint = C_O1_I2(r, 0, re), 2983 .out_rrr = tgen_xor, 2984 .out_rri = tgen_xori, 2985}; 2986 2987static void tgen_neg(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 2988{ 2989 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2990 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0); 2991} 2992 2993static const TCGOutOpUnary outop_neg = { 2994 .base.static_constraint = C_O1_I1(r, 0), 2995 .out_rr = tgen_neg, 2996}; 2997 2998static void tgen_not(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 2999{ 3000 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3001 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0); 3002} 3003 3004static const TCGOutOpUnary outop_not = { 3005 .base.static_constraint = C_O1_I1(r, 0), 3006 .out_rr = tgen_not, 3007}; 3008 3009 3010static void tcg_out_op(TCGContext *s, TCGOpcode opc, TCGType type, 3011 const TCGArg args[TCG_MAX_OP_ARGS], 3012 const int const_args[TCG_MAX_OP_ARGS]) 3013{ 3014 TCGArg a0, a1, a2; 3015 int const_a2, rexw; 3016 3017#if TCG_TARGET_REG_BITS == 64 3018# define OP_32_64(x) \ 3019 case glue(glue(INDEX_op_, x), _i64): \ 3020 case glue(glue(INDEX_op_, x), _i32) 3021#else 3022# define OP_32_64(x) \ 3023 case glue(glue(INDEX_op_, x), _i32) 3024#endif 3025 3026 /* Hoist the loads of the most common arguments. */ 3027 a0 = args[0]; 3028 a1 = args[1]; 3029 a2 = args[2]; 3030 const_a2 = const_args[2]; 3031 rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3032 3033 switch (opc) { 3034 case INDEX_op_goto_ptr: 3035 /* jmp to the given host address (could be epilogue) */ 3036 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0); 3037 break; 3038 case INDEX_op_br: 3039 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0); 3040 break; 3041 OP_32_64(ld8u): 3042 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 3043 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2); 3044 break; 3045 OP_32_64(ld8s): 3046 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2); 3047 break; 3048 OP_32_64(ld16u): 3049 /* Note that we can ignore REXW for the zero-extend to 64-bit. 
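       Writes to a 32-bit register implicitly zero the upper 32 bits on
       x86_64, so MOVZBL/MOVZWL without REX.W already produce the full
       64-bit zero-extension.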
*/ 3050 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2); 3051 break; 3052 OP_32_64(ld16s): 3053 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2); 3054 break; 3055#if TCG_TARGET_REG_BITS == 64 3056 case INDEX_op_ld32u_i64: 3057#endif 3058 case INDEX_op_ld_i32: 3059 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2); 3060 break; 3061 3062 OP_32_64(st8): 3063 if (const_args[0]) { 3064 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2); 3065 tcg_out8(s, a0); 3066 } else { 3067 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2); 3068 } 3069 break; 3070 OP_32_64(st16): 3071 if (const_args[0]) { 3072 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2); 3073 tcg_out16(s, a0); 3074 } else { 3075 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2); 3076 } 3077 break; 3078#if TCG_TARGET_REG_BITS == 64 3079 case INDEX_op_st32_i64: 3080#endif 3081 case INDEX_op_st_i32: 3082 if (const_args[0]) { 3083 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2); 3084 tcg_out32(s, a0); 3085 } else { 3086 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2); 3087 } 3088 break; 3089 3090 OP_32_64(brcond): 3091 tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1], 3092 arg_label(args[3]), 0); 3093 break; 3094 OP_32_64(setcond): 3095 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false); 3096 break; 3097 OP_32_64(negsetcond): 3098 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true); 3099 break; 3100 OP_32_64(movcond): 3101 tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]); 3102 break; 3103 3104 OP_32_64(bswap16): 3105 if (a2 & TCG_BSWAP_OS) { 3106 /* Output must be sign-extended. */ 3107 if (rexw) { 3108 tcg_out_bswap64(s, a0); 3109 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48); 3110 } else { 3111 tcg_out_bswap32(s, a0); 3112 tcg_out_shifti(s, SHIFT_SAR, a0, 16); 3113 } 3114 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) { 3115 /* Output must be zero-extended, but input isn't. 
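          bswap32 moves the byte-swapped 16-bit value into the upper half of
          the register and whatever garbage was in the uncleared input into
          the lower half; the logical shift right by 16 then discards the
          garbage and zero-fills the high bits.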
*/ 3116 tcg_out_bswap32(s, a0); 3117 tcg_out_shifti(s, SHIFT_SHR, a0, 16); 3118 } else { 3119 tcg_out_rolw_8(s, a0); 3120 } 3121 break; 3122 OP_32_64(bswap32): 3123 tcg_out_bswap32(s, a0); 3124 if (rexw && (a2 & TCG_BSWAP_OS)) { 3125 tcg_out_ext32s(s, a0, a0); 3126 } 3127 break; 3128 3129 case INDEX_op_qemu_ld_i32: 3130 tcg_out_qemu_ld(s, a0, -1, a1, a2, TCG_TYPE_I32); 3131 break; 3132 case INDEX_op_qemu_ld_i64: 3133 if (TCG_TARGET_REG_BITS == 64) { 3134 tcg_out_qemu_ld(s, a0, -1, a1, a2, TCG_TYPE_I64); 3135 } else { 3136 tcg_out_qemu_ld(s, a0, a1, a2, args[3], TCG_TYPE_I64); 3137 } 3138 break; 3139 case INDEX_op_qemu_ld_i128: 3140 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3141 tcg_out_qemu_ld(s, a0, a1, a2, args[3], TCG_TYPE_I128); 3142 break; 3143 3144 case INDEX_op_qemu_st_i32: 3145 case INDEX_op_qemu_st8_i32: 3146 tcg_out_qemu_st(s, a0, -1, a1, a2, TCG_TYPE_I32); 3147 break; 3148 case INDEX_op_qemu_st_i64: 3149 if (TCG_TARGET_REG_BITS == 64) { 3150 tcg_out_qemu_st(s, a0, -1, a1, a2, TCG_TYPE_I64); 3151 } else { 3152 tcg_out_qemu_st(s, a0, a1, a2, args[3], TCG_TYPE_I64); 3153 } 3154 break; 3155 case INDEX_op_qemu_st_i128: 3156 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3157 tcg_out_qemu_st(s, a0, a1, a2, args[3], TCG_TYPE_I128); 3158 break; 3159 3160 OP_32_64(add2): 3161 if (const_args[4]) { 3162 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1); 3163 } else { 3164 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]); 3165 } 3166 if (const_args[5]) { 3167 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1); 3168 } else { 3169 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]); 3170 } 3171 break; 3172 OP_32_64(sub2): 3173 if (const_args[4]) { 3174 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1); 3175 } else { 3176 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]); 3177 } 3178 if (const_args[5]) { 3179 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1); 3180 } else { 3181 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]); 3182 } 3183 break; 3184 3185#if TCG_TARGET_REG_BITS == 32 3186 case INDEX_op_brcond2_i32: 3187 tcg_out_brcond2(s, args, const_args, 0); 3188 break; 3189 case INDEX_op_setcond2_i32: 3190 tcg_out_setcond2(s, args, const_args); 3191 break; 3192#else /* TCG_TARGET_REG_BITS == 64 */ 3193 case INDEX_op_ld32s_i64: 3194 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2); 3195 break; 3196 case INDEX_op_ld_i64: 3197 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2); 3198 break; 3199 case INDEX_op_st_i64: 3200 if (const_args[0]) { 3201 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2); 3202 tcg_out32(s, a0); 3203 } else { 3204 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2); 3205 } 3206 break; 3207 3208 case INDEX_op_bswap64_i64: 3209 tcg_out_bswap64(s, a0); 3210 break; 3211 case INDEX_op_extrh_i64_i32: 3212 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32); 3213 break; 3214#endif 3215 3216 OP_32_64(deposit): 3217 if (args[3] == 0 && args[4] == 8) { 3218 /* load bits 0..7 */ 3219 if (const_a2) { 3220 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0), 3221 0, a0, 0); 3222 tcg_out8(s, a2); 3223 } else { 3224 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0); 3225 } 3226 } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) { 3227 /* load bits 8..15 */ 3228 if (const_a2) { 3229 tcg_out8(s, OPC_MOVB_Ib + a0 + 4); 3230 tcg_out8(s, a2); 3231 } else { 3232 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4); 3233 } 3234 } else if (args[3] == 0 && args[4] == 16) { 3235 /* load bits 0..15 */ 3236 if (const_a2) { 3237 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0), 3238 0, a0, 0); 3239 
tcg_out16(s, a2); 3240 } else { 3241 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0); 3242 } 3243 } else { 3244 g_assert_not_reached(); 3245 } 3246 break; 3247 3248 case INDEX_op_extract_i64: 3249 if (a2 + args[3] == 32) { 3250 if (a2 == 0) { 3251 tcg_out_ext32u(s, a0, a1); 3252 break; 3253 } 3254 /* This is a 32-bit zero-extending right shift. */ 3255 tcg_out_mov(s, TCG_TYPE_I32, a0, a1); 3256 tcg_out_shifti(s, SHIFT_SHR, a0, a2); 3257 break; 3258 } 3259 /* FALLTHRU */ 3260 case INDEX_op_extract_i32: 3261 if (a2 == 0 && args[3] == 8) { 3262 tcg_out_ext8u(s, a0, a1); 3263 } else if (a2 == 0 && args[3] == 16) { 3264 tcg_out_ext16u(s, a0, a1); 3265 } else if (a2 == 8 && args[3] == 8) { 3266 /* 3267 * On the off-chance that we can use the high-byte registers. 3268 * Otherwise we emit the same ext16 + shift pattern that we 3269 * would have gotten from the normal tcg-op.c expansion. 3270 */ 3271 if (a1 < 4 && a0 < 8) { 3272 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4); 3273 } else { 3274 tcg_out_ext16u(s, a0, a1); 3275 tcg_out_shifti(s, SHIFT_SHR, a0, 8); 3276 } 3277 } else { 3278 g_assert_not_reached(); 3279 } 3280 break; 3281 3282 case INDEX_op_sextract_i64: 3283 if (a2 == 0 && args[3] == 8) { 3284 tcg_out_ext8s(s, TCG_TYPE_I64, a0, a1); 3285 } else if (a2 == 0 && args[3] == 16) { 3286 tcg_out_ext16s(s, TCG_TYPE_I64, a0, a1); 3287 } else if (a2 == 0 && args[3] == 32) { 3288 tcg_out_ext32s(s, a0, a1); 3289 } else { 3290 g_assert_not_reached(); 3291 } 3292 break; 3293 3294 case INDEX_op_sextract_i32: 3295 if (a2 == 0 && args[3] == 8) { 3296 tcg_out_ext8s(s, TCG_TYPE_I32, a0, a1); 3297 } else if (a2 == 0 && args[3] == 16) { 3298 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 3299 } else if (a2 == 8 && args[3] == 8) { 3300 if (a1 < 4 && a0 < 8) { 3301 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4); 3302 } else { 3303 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 3304 tcg_out_shifti(s, SHIFT_SAR, a0, 8); 3305 } 3306 } else { 3307 g_assert_not_reached(); 3308 } 3309 break; 3310 3311 OP_32_64(extract2): 3312 /* Note that SHRD outputs to the r/m operand. */ 3313 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0); 3314 tcg_out8(s, args[3]); 3315 break; 3316 3317 case INDEX_op_mb: 3318 tcg_out_mb(s, a0); 3319 break; 3320 case INDEX_op_call: /* Always emitted via tcg_out_call. */ 3321 case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */ 3322 case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */ 3323 case INDEX_op_ext_i32_i64: /* Always emitted via tcg_reg_alloc_op. */ 3324 case INDEX_op_extu_i32_i64: 3325 case INDEX_op_extrl_i64_i32: 3326 default: 3327 g_assert_not_reached(); 3328 } 3329 3330#undef OP_32_64 3331} 3332 3333static int const umin_insn[4] = { 3334 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ 3335}; 3336 3337static int const umax_insn[4] = { 3338 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ 3339}; 3340 3341static bool tcg_out_cmp_vec_noinv(TCGContext *s, TCGType type, unsigned vece, 3342 TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond) 3343{ 3344 static int const cmpeq_insn[4] = { 3345 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ 3346 }; 3347 static int const cmpgt_insn[4] = { 3348 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ 3349 }; 3350 3351 enum { 3352 NEED_INV = 1, 3353 NEED_SWAP = 2, 3354 NEED_UMIN = 4, 3355 NEED_UMAX = 8, 3356 INVALID = 16, 3357 }; 3358 static const uint8_t cond_fixup[16] = { 3359 [0 ... 
15] = INVALID, 3360 [TCG_COND_EQ] = 0, 3361 [TCG_COND_GT] = 0, 3362 [TCG_COND_NE] = NEED_INV, 3363 [TCG_COND_LE] = NEED_INV, 3364 [TCG_COND_LT] = NEED_SWAP, 3365 [TCG_COND_GE] = NEED_SWAP | NEED_INV, 3366 [TCG_COND_LEU] = NEED_UMIN, 3367 [TCG_COND_GTU] = NEED_UMIN | NEED_INV, 3368 [TCG_COND_GEU] = NEED_UMAX, 3369 [TCG_COND_LTU] = NEED_UMAX | NEED_INV, 3370 }; 3371 int fixup = cond_fixup[cond]; 3372 3373 assert(!(fixup & INVALID)); 3374 3375 if (fixup & NEED_INV) { 3376 cond = tcg_invert_cond(cond); 3377 } 3378 3379 if (fixup & NEED_SWAP) { 3380 TCGReg swap = v1; 3381 v1 = v2; 3382 v2 = swap; 3383 cond = tcg_swap_cond(cond); 3384 } 3385 3386 if (fixup & (NEED_UMIN | NEED_UMAX)) { 3387 int op = (fixup & NEED_UMIN ? umin_insn[vece] : umax_insn[vece]); 3388 3389 /* avx2 does not have 64-bit min/max; adjusted during expand. */ 3390 assert(vece <= MO_32); 3391 3392 tcg_out_vex_modrm_type(s, op, TCG_TMP_VEC, v1, v2, type); 3393 v2 = TCG_TMP_VEC; 3394 cond = TCG_COND_EQ; 3395 } 3396 3397 switch (cond) { 3398 case TCG_COND_EQ: 3399 tcg_out_vex_modrm_type(s, cmpeq_insn[vece], v0, v1, v2, type); 3400 break; 3401 case TCG_COND_GT: 3402 tcg_out_vex_modrm_type(s, cmpgt_insn[vece], v0, v1, v2, type); 3403 break; 3404 default: 3405 g_assert_not_reached(); 3406 } 3407 return fixup & NEED_INV; 3408} 3409 3410static void tcg_out_cmp_vec_k1(TCGContext *s, TCGType type, unsigned vece, 3411 TCGReg v1, TCGReg v2, TCGCond cond) 3412{ 3413 static const int cmpm_insn[2][4] = { 3414 { OPC_VPCMPB, OPC_VPCMPW, OPC_VPCMPD, OPC_VPCMPQ }, 3415 { OPC_VPCMPUB, OPC_VPCMPUW, OPC_VPCMPUD, OPC_VPCMPUQ } 3416 }; 3417 static const int testm_insn[4] = { 3418 OPC_VPTESTMB, OPC_VPTESTMW, OPC_VPTESTMD, OPC_VPTESTMQ 3419 }; 3420 static const int testnm_insn[4] = { 3421 OPC_VPTESTNMB, OPC_VPTESTNMW, OPC_VPTESTNMD, OPC_VPTESTNMQ 3422 }; 3423 3424 static const int cond_ext[16] = { 3425 [TCG_COND_EQ] = 0, 3426 [TCG_COND_NE] = 4, 3427 [TCG_COND_LT] = 1, 3428 [TCG_COND_LTU] = 1, 3429 [TCG_COND_LE] = 2, 3430 [TCG_COND_LEU] = 2, 3431 [TCG_COND_NEVER] = 3, 3432 [TCG_COND_GE] = 5, 3433 [TCG_COND_GEU] = 5, 3434 [TCG_COND_GT] = 6, 3435 [TCG_COND_GTU] = 6, 3436 [TCG_COND_ALWAYS] = 7, 3437 }; 3438 3439 switch (cond) { 3440 case TCG_COND_TSTNE: 3441 tcg_out_vex_modrm_type(s, testm_insn[vece], /* k1 */ 1, v1, v2, type); 3442 break; 3443 case TCG_COND_TSTEQ: 3444 tcg_out_vex_modrm_type(s, testnm_insn[vece], /* k1 */ 1, v1, v2, type); 3445 break; 3446 default: 3447 tcg_out_vex_modrm_type(s, cmpm_insn[is_unsigned_cond(cond)][vece], 3448 /* k1 */ 1, v1, v2, type); 3449 tcg_out8(s, cond_ext[cond]); 3450 break; 3451 } 3452} 3453 3454static void tcg_out_k1_to_vec(TCGContext *s, TCGType type, 3455 unsigned vece, TCGReg dest) 3456{ 3457 static const int movm_insn[] = { 3458 OPC_VPMOVM2B, OPC_VPMOVM2W, OPC_VPMOVM2D, OPC_VPMOVM2Q 3459 }; 3460 tcg_out_vex_modrm_type(s, movm_insn[vece], dest, 0, /* k1 */ 1, type); 3461} 3462 3463static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece, 3464 TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond) 3465{ 3466 /* 3467 * With avx512, we have a complete set of comparisons into mask. 3468 * Unless there's a single insn expansion for the comparison, 3469 * expand via a mask in k1. 3470 */ 3471 if ((vece <= MO_16 ?
have_avx512bw : have_avx512dq) 3472 && cond != TCG_COND_EQ 3473 && cond != TCG_COND_LT 3474 && cond != TCG_COND_GT) { 3475 tcg_out_cmp_vec_k1(s, type, vece, v1, v2, cond); 3476 tcg_out_k1_to_vec(s, type, vece, v0); 3477 return; 3478 } 3479 3480 if (tcg_out_cmp_vec_noinv(s, type, vece, v0, v1, v2, cond)) { 3481 tcg_out_dupi_vec(s, type, vece, TCG_TMP_VEC, -1); 3482 tcg_out_vex_modrm_type(s, OPC_PXOR, v0, v0, TCG_TMP_VEC, type); 3483 } 3484} 3485 3486static void tcg_out_cmpsel_vec_k1(TCGContext *s, TCGType type, unsigned vece, 3487 TCGReg v0, TCGReg c1, TCGReg c2, 3488 TCGReg v3, TCGReg v4, TCGCond cond) 3489{ 3490 static const int vpblendm_insn[] = { 3491 OPC_VPBLENDMB, OPC_VPBLENDMW, OPC_VPBLENDMD, OPC_VPBLENDMQ 3492 }; 3493 bool z = false; 3494 3495 /* Swap to place constant in V4 to take advantage of zero-masking. */ 3496 if (!v3) { 3497 z = true; 3498 v3 = v4; 3499 cond = tcg_invert_cond(cond); 3500 } 3501 3502 tcg_out_cmp_vec_k1(s, type, vece, c1, c2, cond); 3503 tcg_out_evex_modrm_type(s, vpblendm_insn[vece], v0, v4, v3, 3504 /* k1 */1, z, type); 3505} 3506 3507static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece, 3508 TCGReg v0, TCGReg c1, TCGReg c2, 3509 TCGReg v3, TCGReg v4, TCGCond cond) 3510{ 3511 bool inv; 3512 3513 if (vece <= MO_16 ? have_avx512bw : have_avx512vl) { 3514 tcg_out_cmpsel_vec_k1(s, type, vece, v0, c1, c2, v3, v4, cond); 3515 return; 3516 } 3517 3518 inv = tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond); 3519 3520 /* 3521 * Since XMM0 is 16, the only way we get 0 into V3 3522 * is via the constant zero constraint. 3523 */ 3524 if (!v3) { 3525 if (inv) { 3526 tcg_out_vex_modrm_type(s, OPC_PAND, v0, TCG_TMP_VEC, v4, type); 3527 } else { 3528 tcg_out_vex_modrm_type(s, OPC_PANDN, v0, TCG_TMP_VEC, v4, type); 3529 } 3530 } else { 3531 if (inv) { 3532 TCGReg swap = v3; 3533 v3 = v4; 3534 v4 = swap; 3535 } 3536 tcg_out_vex_modrm_type(s, OPC_VPBLENDVB, v0, v4, v3, type); 3537 tcg_out8(s, (TCG_TMP_VEC - TCG_REG_XMM0) << 4); 3538 } 3539} 3540 3541static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, 3542 unsigned vecl, unsigned vece, 3543 const TCGArg args[TCG_MAX_OP_ARGS], 3544 const int const_args[TCG_MAX_OP_ARGS]) 3545{ 3546 static int const add_insn[4] = { 3547 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ 3548 }; 3549 static int const ssadd_insn[4] = { 3550 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 3551 }; 3552 static int const usadd_insn[4] = { 3553 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2 3554 }; 3555 static int const sub_insn[4] = { 3556 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ 3557 }; 3558 static int const sssub_insn[4] = { 3559 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 3560 }; 3561 static int const ussub_insn[4] = { 3562 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2 3563 }; 3564 static int const mul_insn[4] = { 3565 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ 3566 }; 3567 static int const shift_imm_insn[4] = { 3568 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib 3569 }; 3570 static int const punpckl_insn[4] = { 3571 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ 3572 }; 3573 static int const punpckh_insn[4] = { 3574 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ 3575 }; 3576 static int const packss_insn[4] = { 3577 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 3578 }; 3579 static int const packus_insn[4] = { 3580 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 3581 }; 3582 static int const smin_insn[4] = { 3583 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ 3584 }; 3585 static int const smax_insn[4] = 
{ 3586 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ 3587 }; 3588 static int const rotlv_insn[4] = { 3589 OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ 3590 }; 3591 static int const rotrv_insn[4] = { 3592 OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ 3593 }; 3594 static int const shlv_insn[4] = { 3595 OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ 3596 }; 3597 static int const shrv_insn[4] = { 3598 OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ 3599 }; 3600 static int const sarv_insn[4] = { 3601 OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ 3602 }; 3603 static int const shls_insn[4] = { 3604 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ 3605 }; 3606 static int const shrs_insn[4] = { 3607 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ 3608 }; 3609 static int const sars_insn[4] = { 3610 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ 3611 }; 3612 static int const vpshldi_insn[4] = { 3613 OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ 3614 }; 3615 static int const vpshldv_insn[4] = { 3616 OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ 3617 }; 3618 static int const vpshrdv_insn[4] = { 3619 OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ 3620 }; 3621 static int const abs_insn[4] = { 3622 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ 3623 }; 3624 3625 TCGType type = vecl + TCG_TYPE_V64; 3626 int insn, sub; 3627 TCGArg a0, a1, a2, a3; 3628 3629 a0 = args[0]; 3630 a1 = args[1]; 3631 a2 = args[2]; 3632 3633 switch (opc) { 3634 case INDEX_op_add_vec: 3635 insn = add_insn[vece]; 3636 goto gen_simd; 3637 case INDEX_op_ssadd_vec: 3638 insn = ssadd_insn[vece]; 3639 goto gen_simd; 3640 case INDEX_op_usadd_vec: 3641 insn = usadd_insn[vece]; 3642 goto gen_simd; 3643 case INDEX_op_sub_vec: 3644 insn = sub_insn[vece]; 3645 goto gen_simd; 3646 case INDEX_op_sssub_vec: 3647 insn = sssub_insn[vece]; 3648 goto gen_simd; 3649 case INDEX_op_ussub_vec: 3650 insn = ussub_insn[vece]; 3651 goto gen_simd; 3652 case INDEX_op_mul_vec: 3653 insn = mul_insn[vece]; 3654 goto gen_simd; 3655 case INDEX_op_and_vec: 3656 insn = OPC_PAND; 3657 goto gen_simd; 3658 case INDEX_op_or_vec: 3659 insn = OPC_POR; 3660 goto gen_simd; 3661 case INDEX_op_xor_vec: 3662 insn = OPC_PXOR; 3663 goto gen_simd; 3664 case INDEX_op_smin_vec: 3665 insn = smin_insn[vece]; 3666 goto gen_simd; 3667 case INDEX_op_umin_vec: 3668 insn = umin_insn[vece]; 3669 goto gen_simd; 3670 case INDEX_op_smax_vec: 3671 insn = smax_insn[vece]; 3672 goto gen_simd; 3673 case INDEX_op_umax_vec: 3674 insn = umax_insn[vece]; 3675 goto gen_simd; 3676 case INDEX_op_shlv_vec: 3677 insn = shlv_insn[vece]; 3678 goto gen_simd; 3679 case INDEX_op_shrv_vec: 3680 insn = shrv_insn[vece]; 3681 goto gen_simd; 3682 case INDEX_op_sarv_vec: 3683 insn = sarv_insn[vece]; 3684 goto gen_simd; 3685 case INDEX_op_rotlv_vec: 3686 insn = rotlv_insn[vece]; 3687 goto gen_simd; 3688 case INDEX_op_rotrv_vec: 3689 insn = rotrv_insn[vece]; 3690 goto gen_simd; 3691 case INDEX_op_shls_vec: 3692 insn = shls_insn[vece]; 3693 goto gen_simd; 3694 case INDEX_op_shrs_vec: 3695 insn = shrs_insn[vece]; 3696 goto gen_simd; 3697 case INDEX_op_sars_vec: 3698 insn = sars_insn[vece]; 3699 goto gen_simd; 3700 case INDEX_op_x86_punpckl_vec: 3701 insn = punpckl_insn[vece]; 3702 goto gen_simd; 3703 case INDEX_op_x86_punpckh_vec: 3704 insn = punpckh_insn[vece]; 3705 goto gen_simd; 3706 case INDEX_op_x86_packss_vec: 3707 insn = packss_insn[vece]; 3708 goto gen_simd; 3709 case INDEX_op_x86_packus_vec: 3710 insn = packus_insn[vece]; 3711 goto gen_simd; 3712 case INDEX_op_x86_vpshldv_vec: 3713 insn = vpshldv_insn[vece]; 3714 a1 = a2; 3715 a2 
= args[3]; 3716 goto gen_simd; 3717 case INDEX_op_x86_vpshrdv_vec: 3718 insn = vpshrdv_insn[vece]; 3719 a1 = a2; 3720 a2 = args[3]; 3721 goto gen_simd; 3722#if TCG_TARGET_REG_BITS == 32 3723 case INDEX_op_dup2_vec: 3724 /* First merge the two 32-bit inputs to a single 64-bit element. */ 3725 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2); 3726 /* Then replicate the 64-bit elements across the rest of the vector. */ 3727 if (type != TCG_TYPE_V64) { 3728 tcg_out_dup_vec(s, type, MO_64, a0, a0); 3729 } 3730 break; 3731#endif 3732 case INDEX_op_abs_vec: 3733 insn = abs_insn[vece]; 3734 a2 = a1; 3735 a1 = 0; 3736 goto gen_simd; 3737 gen_simd: 3738 tcg_debug_assert(insn != OPC_UD2); 3739 tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type); 3740 break; 3741 3742 case INDEX_op_cmp_vec: 3743 tcg_out_cmp_vec(s, type, vece, a0, a1, a2, args[3]); 3744 break; 3745 3746 case INDEX_op_cmpsel_vec: 3747 tcg_out_cmpsel_vec(s, type, vece, a0, a1, a2, 3748 args[3], args[4], args[5]); 3749 break; 3750 3751 case INDEX_op_andc_vec: 3752 insn = OPC_PANDN; 3753 tcg_out_vex_modrm_type(s, insn, a0, a2, a1, type); 3754 break; 3755 3756 case INDEX_op_shli_vec: 3757 insn = shift_imm_insn[vece]; 3758 sub = 6; 3759 goto gen_shift; 3760 case INDEX_op_shri_vec: 3761 insn = shift_imm_insn[vece]; 3762 sub = 2; 3763 goto gen_shift; 3764 case INDEX_op_sari_vec: 3765 if (vece == MO_64) { 3766 insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX; 3767 } else { 3768 insn = shift_imm_insn[vece]; 3769 } 3770 sub = 4; 3771 goto gen_shift; 3772 case INDEX_op_rotli_vec: 3773 insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ 3774 if (vece == MO_64) { 3775 insn |= P_VEXW; 3776 } 3777 sub = 1; 3778 goto gen_shift; 3779 gen_shift: 3780 tcg_debug_assert(vece != MO_8); 3781 tcg_out_vex_modrm_type(s, insn, sub, a0, a1, type); 3782 tcg_out8(s, a2); 3783 break; 3784 3785 case INDEX_op_ld_vec: 3786 tcg_out_ld(s, type, a0, a1, a2); 3787 break; 3788 case INDEX_op_st_vec: 3789 tcg_out_st(s, type, a0, a1, a2); 3790 break; 3791 case INDEX_op_dupm_vec: 3792 tcg_out_dupm_vec(s, type, vece, a0, a1, a2); 3793 break; 3794 3795 case INDEX_op_x86_shufps_vec: 3796 insn = OPC_SHUFPS; 3797 sub = args[3]; 3798 goto gen_simd_imm8; 3799 case INDEX_op_x86_blend_vec: 3800 if (vece == MO_16) { 3801 insn = OPC_PBLENDW; 3802 } else if (vece == MO_32) { 3803 insn = (have_avx2 ? 
OPC_VPBLENDD : OPC_BLENDPS); 3804 } else { 3805 g_assert_not_reached(); 3806 } 3807 sub = args[3]; 3808 goto gen_simd_imm8; 3809 case INDEX_op_x86_vperm2i128_vec: 3810 insn = OPC_VPERM2I128; 3811 sub = args[3]; 3812 goto gen_simd_imm8; 3813 case INDEX_op_x86_vpshldi_vec: 3814 insn = vpshldi_insn[vece]; 3815 sub = args[3]; 3816 goto gen_simd_imm8; 3817 3818 case INDEX_op_not_vec: 3819 insn = OPC_VPTERNLOGQ; 3820 a2 = a1; 3821 sub = 0x33; /* !B */ 3822 goto gen_simd_imm8; 3823 case INDEX_op_nor_vec: 3824 insn = OPC_VPTERNLOGQ; 3825 sub = 0x11; /* norCB */ 3826 goto gen_simd_imm8; 3827 case INDEX_op_nand_vec: 3828 insn = OPC_VPTERNLOGQ; 3829 sub = 0x77; /* nandCB */ 3830 goto gen_simd_imm8; 3831 case INDEX_op_eqv_vec: 3832 insn = OPC_VPTERNLOGQ; 3833 sub = 0x99; /* xnorCB */ 3834 goto gen_simd_imm8; 3835 case INDEX_op_orc_vec: 3836 insn = OPC_VPTERNLOGQ; 3837 sub = 0xdd; /* orB!C */ 3838 goto gen_simd_imm8; 3839 3840 case INDEX_op_bitsel_vec: 3841 insn = OPC_VPTERNLOGQ; 3842 a3 = args[3]; 3843 if (a0 == a1) { 3844 a1 = a2; 3845 a2 = a3; 3846 sub = 0xca; /* A?B:C */ 3847 } else if (a0 == a2) { 3848 a2 = a3; 3849 sub = 0xe2; /* B?A:C */ 3850 } else { 3851 tcg_out_mov(s, type, a0, a3); 3852 sub = 0xb8; /* B?C:A */ 3853 } 3854 goto gen_simd_imm8; 3855 3856 gen_simd_imm8: 3857 tcg_debug_assert(insn != OPC_UD2); 3858 tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type); 3859 tcg_out8(s, sub); 3860 break; 3861 3862 case INDEX_op_x86_psrldq_vec: 3863 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1); 3864 tcg_out8(s, a2); 3865 break; 3866 3867 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */ 3868 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */ 3869 default: 3870 g_assert_not_reached(); 3871 } 3872} 3873 3874static TCGConstraintSetIndex 3875tcg_target_op_def(TCGOpcode op, TCGType type, unsigned flags) 3876{ 3877 switch (op) { 3878 case INDEX_op_goto_ptr: 3879 return C_O0_I1(r); 3880 3881 case INDEX_op_ld8u_i32: 3882 case INDEX_op_ld8u_i64: 3883 case INDEX_op_ld8s_i32: 3884 case INDEX_op_ld8s_i64: 3885 case INDEX_op_ld16u_i32: 3886 case INDEX_op_ld16u_i64: 3887 case INDEX_op_ld16s_i32: 3888 case INDEX_op_ld16s_i64: 3889 case INDEX_op_ld_i32: 3890 case INDEX_op_ld32u_i64: 3891 case INDEX_op_ld32s_i64: 3892 case INDEX_op_ld_i64: 3893 return C_O1_I1(r, r); 3894 3895 case INDEX_op_st8_i32: 3896 case INDEX_op_st8_i64: 3897 return C_O0_I2(qi, r); 3898 3899 case INDEX_op_st16_i32: 3900 case INDEX_op_st16_i64: 3901 case INDEX_op_st_i32: 3902 case INDEX_op_st32_i64: 3903 return C_O0_I2(ri, r); 3904 3905 case INDEX_op_st_i64: 3906 return C_O0_I2(re, r); 3907 3908 case INDEX_op_brcond_i32: 3909 case INDEX_op_brcond_i64: 3910 return C_O0_I2(r, reT); 3911 3912 case INDEX_op_bswap16_i32: 3913 case INDEX_op_bswap16_i64: 3914 case INDEX_op_bswap32_i32: 3915 case INDEX_op_bswap32_i64: 3916 case INDEX_op_bswap64_i64: 3917 case INDEX_op_extrh_i64_i32: 3918 return C_O1_I1(r, 0); 3919 3920 case INDEX_op_ext_i32_i64: 3921 case INDEX_op_extu_i32_i64: 3922 case INDEX_op_extrl_i64_i32: 3923 case INDEX_op_extract_i32: 3924 case INDEX_op_extract_i64: 3925 case INDEX_op_sextract_i32: 3926 case INDEX_op_sextract_i64: 3927 return C_O1_I1(r, r); 3928 3929 case INDEX_op_extract2_i32: 3930 case INDEX_op_extract2_i64: 3931 return C_O1_I2(r, 0, r); 3932 3933 case INDEX_op_deposit_i32: 3934 case INDEX_op_deposit_i64: 3935 return C_O1_I2(q, 0, qi); 3936 3937 case INDEX_op_setcond_i32: 3938 case INDEX_op_setcond_i64: 3939 case INDEX_op_negsetcond_i32: 3940 case INDEX_op_negsetcond_i64: 3941 return C_O1_I2(q, r, reT); 
static TCGConstraintSetIndex
tcg_target_op_def(TCGOpcode op, TCGType type, unsigned flags)
{
    switch (op) {
    case INDEX_op_goto_ptr:
        return C_O0_I1(r);

    case INDEX_op_ld8u_i32:
    case INDEX_op_ld8u_i64:
    case INDEX_op_ld8s_i32:
    case INDEX_op_ld8s_i64:
    case INDEX_op_ld16u_i32:
    case INDEX_op_ld16u_i64:
    case INDEX_op_ld16s_i32:
    case INDEX_op_ld16s_i64:
    case INDEX_op_ld_i32:
    case INDEX_op_ld32u_i64:
    case INDEX_op_ld32s_i64:
    case INDEX_op_ld_i64:
        return C_O1_I1(r, r);

    case INDEX_op_st8_i32:
    case INDEX_op_st8_i64:
        return C_O0_I2(qi, r);

    case INDEX_op_st16_i32:
    case INDEX_op_st16_i64:
    case INDEX_op_st_i32:
    case INDEX_op_st32_i64:
        return C_O0_I2(ri, r);

    case INDEX_op_st_i64:
        return C_O0_I2(re, r);

    case INDEX_op_brcond_i32:
    case INDEX_op_brcond_i64:
        return C_O0_I2(r, reT);

    case INDEX_op_bswap16_i32:
    case INDEX_op_bswap16_i64:
    case INDEX_op_bswap32_i32:
    case INDEX_op_bswap32_i64:
    case INDEX_op_bswap64_i64:
    case INDEX_op_extrh_i64_i32:
        return C_O1_I1(r, 0);

    case INDEX_op_ext_i32_i64:
    case INDEX_op_extu_i32_i64:
    case INDEX_op_extrl_i64_i32:
    case INDEX_op_extract_i32:
    case INDEX_op_extract_i64:
    case INDEX_op_sextract_i32:
    case INDEX_op_sextract_i64:
        return C_O1_I1(r, r);

    case INDEX_op_extract2_i32:
    case INDEX_op_extract2_i64:
        return C_O1_I2(r, 0, r);

    case INDEX_op_deposit_i32:
    case INDEX_op_deposit_i64:
        return C_O1_I2(q, 0, qi);

    case INDEX_op_setcond_i32:
    case INDEX_op_setcond_i64:
    case INDEX_op_negsetcond_i32:
    case INDEX_op_negsetcond_i64:
        return C_O1_I2(q, r, reT);

    case INDEX_op_movcond_i32:
    case INDEX_op_movcond_i64:
        return C_O1_I4(r, r, reT, r, 0);

    case INDEX_op_add2_i32:
    case INDEX_op_add2_i64:
    case INDEX_op_sub2_i32:
    case INDEX_op_sub2_i64:
        return C_N1_O1_I4(r, r, 0, 1, re, re);

    case INDEX_op_qemu_ld_i32:
        return C_O1_I1(r, L);

    case INDEX_op_qemu_st_i32:
        return C_O0_I2(L, L);
    case INDEX_op_qemu_st8_i32:
        return C_O0_I2(s, L);

    case INDEX_op_qemu_ld_i64:
        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);

    case INDEX_op_qemu_st_i64:
        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);

    case INDEX_op_qemu_ld_i128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
        return C_O2_I1(r, r, L);
    case INDEX_op_qemu_st_i128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
        return C_O0_I3(L, L, L);

    case INDEX_op_brcond2_i32:
        return C_O0_I4(r, r, ri, ri);

    case INDEX_op_setcond2_i32:
        return C_O1_I4(r, r, r, ri, ri);

    case INDEX_op_ld_vec:
    case INDEX_op_dupm_vec:
        return C_O1_I1(x, r);

    case INDEX_op_st_vec:
        return C_O0_I2(x, r);

    case INDEX_op_add_vec:
    case INDEX_op_sub_vec:
    case INDEX_op_mul_vec:
    case INDEX_op_and_vec:
    case INDEX_op_or_vec:
    case INDEX_op_xor_vec:
    case INDEX_op_andc_vec:
    case INDEX_op_orc_vec:
    case INDEX_op_nand_vec:
    case INDEX_op_nor_vec:
    case INDEX_op_eqv_vec:
    case INDEX_op_ssadd_vec:
    case INDEX_op_usadd_vec:
    case INDEX_op_sssub_vec:
    case INDEX_op_ussub_vec:
    case INDEX_op_smin_vec:
    case INDEX_op_umin_vec:
    case INDEX_op_smax_vec:
    case INDEX_op_umax_vec:
    case INDEX_op_shlv_vec:
    case INDEX_op_shrv_vec:
    case INDEX_op_sarv_vec:
    case INDEX_op_rotlv_vec:
    case INDEX_op_rotrv_vec:
    case INDEX_op_shls_vec:
    case INDEX_op_shrs_vec:
    case INDEX_op_sars_vec:
    case INDEX_op_cmp_vec:
    case INDEX_op_x86_shufps_vec:
    case INDEX_op_x86_blend_vec:
    case INDEX_op_x86_packss_vec:
    case INDEX_op_x86_packus_vec:
    case INDEX_op_x86_vperm2i128_vec:
    case INDEX_op_x86_punpckl_vec:
    case INDEX_op_x86_punpckh_vec:
    case INDEX_op_x86_vpshldi_vec:
#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_dup2_vec:
#endif
        return C_O1_I2(x, x, x);

    case INDEX_op_abs_vec:
    case INDEX_op_dup_vec:
    case INDEX_op_not_vec:
    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
    case INDEX_op_sari_vec:
    case INDEX_op_rotli_vec:
    case INDEX_op_x86_psrldq_vec:
        return C_O1_I1(x, x);

    case INDEX_op_x86_vpshldv_vec:
    case INDEX_op_x86_vpshrdv_vec:
        return C_O1_I3(x, 0, x, x);

    case INDEX_op_bitsel_vec:
        return C_O1_I3(x, x, x, x);
    case INDEX_op_cmpsel_vec:
        return C_O1_I4(x, x, x, xO, x);

    default:
        return C_NotImplemented;
    }
}

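/*
 * Report how a vector operation can be performed: a positive value means
 * the opcode is supported directly for this element size, zero means it
 * is not supported, and a negative value means it can be implemented by
 * expansion via tcg_expand_vec_op() below.
 */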
int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
{
    switch (opc) {
    case INDEX_op_add_vec:
    case INDEX_op_sub_vec:
    case INDEX_op_and_vec:
    case INDEX_op_or_vec:
    case INDEX_op_xor_vec:
    case INDEX_op_andc_vec:
    case INDEX_op_orc_vec:
    case INDEX_op_nand_vec:
    case INDEX_op_nor_vec:
    case INDEX_op_eqv_vec:
    case INDEX_op_not_vec:
    case INDEX_op_bitsel_vec:
        return 1;
    case INDEX_op_cmp_vec:
    case INDEX_op_cmpsel_vec:
        return -1;

    case INDEX_op_rotli_vec:
        return have_avx512vl && vece >= MO_32 ? 1 : -1;

    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
        /* We must expand the operation for MO_8. */
        return vece == MO_8 ? -1 : 1;

    case INDEX_op_sari_vec:
        switch (vece) {
        case MO_8:
            return -1;
        case MO_16:
        case MO_32:
            return 1;
        case MO_64:
            if (have_avx512vl) {
                return 1;
            }
            /*
             * We can emulate this for MO_64, but it does not pay off
             * unless we're producing at least 4 values.
             */
            return type >= TCG_TYPE_V256 ? -1 : 0;
        }
        return 0;

    case INDEX_op_shls_vec:
    case INDEX_op_shrs_vec:
        return vece >= MO_16;
    case INDEX_op_sars_vec:
        switch (vece) {
        case MO_16:
        case MO_32:
            return 1;
        case MO_64:
            return have_avx512vl;
        }
        return 0;
    case INDEX_op_rotls_vec:
        return vece >= MO_16 ? -1 : 0;

    case INDEX_op_shlv_vec:
    case INDEX_op_shrv_vec:
        switch (vece) {
        case MO_16:
            return have_avx512bw;
        case MO_32:
        case MO_64:
            return have_avx2;
        }
        return 0;
    case INDEX_op_sarv_vec:
        switch (vece) {
        case MO_16:
            return have_avx512bw;
        case MO_32:
            return have_avx2;
        case MO_64:
            return have_avx512vl;
        }
        return 0;
    case INDEX_op_rotlv_vec:
    case INDEX_op_rotrv_vec:
        switch (vece) {
        case MO_16:
            return have_avx512vbmi2 ? -1 : 0;
        case MO_32:
        case MO_64:
            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
        }
        return 0;

    case INDEX_op_mul_vec:
        switch (vece) {
        case MO_8:
            return -1;
        case MO_64:
            return have_avx512dq;
        }
        return 1;

    case INDEX_op_ssadd_vec:
    case INDEX_op_usadd_vec:
    case INDEX_op_sssub_vec:
    case INDEX_op_ussub_vec:
        return vece <= MO_16;
    case INDEX_op_smin_vec:
    case INDEX_op_smax_vec:
    case INDEX_op_umin_vec:
    case INDEX_op_umax_vec:
    case INDEX_op_abs_vec:
        return vece <= MO_32 || have_avx512vl;

    default:
        return 0;
    }
}

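/*
 * SSE/AVX have no 8-bit element shifts, so an MO_8 shift by immediate is
 * performed as a 16-bit shift followed by a mask that clears the bits
 * shifted in from the neighbouring byte.
 */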
static void expand_vec_shi(TCGType type, unsigned vece, bool right,
                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    uint8_t mask;

    tcg_debug_assert(vece == MO_8);
    if (right) {
        mask = 0xff >> imm;
        tcg_gen_shri_vec(MO_16, v0, v1, imm);
    } else {
        mask = 0xff << imm;
        tcg_gen_shli_vec(MO_16, v0, v1, imm);
    }
    tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask));
}

static void expand_vec_sari(TCGType type, unsigned vece,
                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t1, t2;

    switch (vece) {
    case MO_8:
        /* Unpack to 16-bit, shift, and repack. */
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case MO_64:
        t1 = tcg_temp_new_vec(type);
        if (imm <= 32) {
            /*
             * We can emulate a small sign extend by performing an arithmetic
             * 32-bit shift and overwriting the high half of a 64-bit logical
             * shift.  Note that the ISA says shift of 32 is valid, but TCG
             * does not, so we have to bound the smaller shift -- we get the
             * same result in the high half either way.
             */
            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
                      tcgv_vec_arg(t1), 0xaa);
        } else {
            /* Otherwise we will need to use a compare vs 0 to produce
             * the sign-extend, shift and merge.
             */
            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
                            tcg_constant_vec(type, MO_64, 0), v1);
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
            tcg_gen_or_vec(MO_64, v0, v0, t1);
        }
        tcg_temp_free_vec(t1);
        break;

    default:
        g_assert_not_reached();
    }
}

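/*
 * Rotate by immediate.  For MO_32/MO_64 with AVX512VL the backend emits
 * VPROL[DQ] directly (tcg_can_emit_vec_op returned 1), so this expansion
 * handles the remaining cases: VPSHLDI with both sources equal when
 * AVX512-VBMI2 is available, otherwise a shift pair merged with an OR.
 */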
static void expand_vec_rotli(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t;

    if (vece != MO_8 && have_avx512vbmi2) {
        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
        return;
    }

    t = tcg_temp_new_vec(type);
    tcg_gen_shli_vec(vece, t, v1, imm);
    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
    tcg_gen_or_vec(vece, v0, v0, t);
    tcg_temp_free_vec(t);
}

static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
                            TCGv_vec v1, TCGv_vec sh, bool right)
{
    TCGv_vec t;

    if (have_avx512vbmi2) {
        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
        return;
    }

    t = tcg_temp_new_vec(type);
    tcg_gen_dupi_vec(vece, t, 8 << vece);
    tcg_gen_sub_vec(vece, t, t, sh);
    if (right) {
        tcg_gen_shlv_vec(vece, t, v1, t);
        tcg_gen_shrv_vec(vece, v0, v1, sh);
    } else {
        tcg_gen_shrv_vec(vece, t, v1, t);
        tcg_gen_shlv_vec(vece, v0, v1, sh);
    }
    tcg_gen_or_vec(vece, v0, v0, t);
    tcg_temp_free_vec(t);
}

static void expand_vec_rotls(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
{
    TCGv_vec t = tcg_temp_new_vec(type);

    tcg_debug_assert(vece != MO_8);

    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
        tcg_gen_dup_i32_vec(vece, t, lsh);
        if (vece >= MO_32) {
            tcg_gen_rotlv_vec(vece, v0, v1, t);
        } else {
            expand_vec_rotv(type, vece, v0, v1, t, false);
        }
    } else {
        TCGv_i32 rsh = tcg_temp_new_i32();

        tcg_gen_neg_i32(rsh, lsh);
        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
        tcg_gen_shls_vec(vece, t, v1, lsh);
        tcg_gen_shrs_vec(vece, v0, v1, rsh);
        tcg_gen_or_vec(vece, v0, v0, t);

        tcg_temp_free_i32(rsh);
    }

    tcg_temp_free_vec(t);
}

static void expand_vec_mul(TCGType type, unsigned vece,
                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
{
    TCGv_vec t1, t2, t3, t4, zero;

    tcg_debug_assert(vece == MO_8);

    /*
     * Unpack v1 bytes to words, 0 | x.
     * Unpack v2 bytes to words, y | 0.
     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
     * Shift logical right by 8 bits to clear the high 8 bits before
     * using an unsigned saturated pack.
     *
     * The difference between the V64, V128 and V256 cases is merely how
     * we distribute the expansion between temporaries.
     */
    switch (type) {
    case TCG_TYPE_V64:
        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case TCG_TYPE_V128:
    case TCG_TYPE_V256:
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        t3 = tcg_temp_new_vec(type);
        t4 = tcg_temp_new_vec(type);
        zero = tcg_constant_vec(type, MO_8, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_mul_vec(MO_16, t3, t3, t4);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        tcg_gen_shri_vec(MO_16, t3, t3, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        tcg_temp_free_vec(t3);
        tcg_temp_free_vec(t4);
        break;

    default:
        g_assert_not_reached();
    }
}

static TCGCond expand_vec_cond(TCGType type, unsigned vece,
                               TCGArg *a1, TCGArg *a2, TCGCond cond)
{
    /*
     * Without AVX512, there are no 64-bit unsigned comparisons.
     * We must bias the inputs so that they become signed.
     * All other swapping and inversion are handled during code generation.
     */
    if (vece == MO_64 && !have_avx512dq && is_unsigned_cond(cond)) {
        TCGv_vec v1 = temp_tcgv_vec(arg_temp(*a1));
        TCGv_vec v2 = temp_tcgv_vec(arg_temp(*a2));
        TCGv_vec t1 = tcg_temp_new_vec(type);
        TCGv_vec t2 = tcg_temp_new_vec(type);
        TCGv_vec t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));

        tcg_gen_sub_vec(vece, t1, v1, t3);
        tcg_gen_sub_vec(vece, t2, v2, t3);
        *a1 = tcgv_vec_arg(t1);
        *a2 = tcgv_vec_arg(t2);
        cond = tcg_signed_cond(cond);
    }
    return cond;
}

static void expand_vec_cmp(TCGType type, unsigned vece, TCGArg a0,
                           TCGArg a1, TCGArg a2, TCGCond cond)
{
    cond = expand_vec_cond(type, vece, &a1, &a2, cond);
    /* Expand directly; do not recurse. */
    vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
}

static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGArg a0,
                              TCGArg a1, TCGArg a2,
                              TCGArg a3, TCGArg a4, TCGCond cond)
{
    cond = expand_vec_cond(type, vece, &a1, &a2, cond);
    /* Expand directly; do not recurse. */
    vec_gen_6(INDEX_op_cmpsel_vec, type, vece, a0, a1, a2, a3, a4, cond);
}

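/*
 * Entry point for the generic vector expander: called for any opcode for
 * which tcg_can_emit_vec_op() returned a negative value, with the operand
 * arguments passed as a va_list.
 */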
void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                       TCGArg a0, ...)
{
    va_list va;
    TCGArg a1, a2, a3, a4, a5;
    TCGv_vec v0, v1, v2;

    va_start(va, a0);
    a1 = va_arg(va, TCGArg);
    a2 = va_arg(va, TCGArg);
    v0 = temp_tcgv_vec(arg_temp(a0));
    v1 = temp_tcgv_vec(arg_temp(a1));

    switch (opc) {
    case INDEX_op_shli_vec:
        expand_vec_shi(type, vece, false, v0, v1, a2);
        break;
    case INDEX_op_shri_vec:
        expand_vec_shi(type, vece, true, v0, v1, a2);
        break;
    case INDEX_op_sari_vec:
        expand_vec_sari(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotli_vec:
        expand_vec_rotli(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotls_vec:
        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
        break;

    case INDEX_op_rotlv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, false);
        break;
    case INDEX_op_rotrv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, true);
        break;

    case INDEX_op_mul_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_mul(type, vece, v0, v1, v2);
        break;

    case INDEX_op_cmp_vec:
        a3 = va_arg(va, TCGArg);
        expand_vec_cmp(type, vece, a0, a1, a2, a3);
        break;

    case INDEX_op_cmpsel_vec:
        a3 = va_arg(va, TCGArg);
        a4 = va_arg(va, TCGArg);
        a5 = va_arg(va, TCGArg);
        expand_vec_cmpsel(type, vece, a0, a1, a2, a3, a4, a5);
        break;

    default:
        break;
    }

    va_end(va);
}

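/*
 * Registers pushed by the prologue and popped by the epilogue.  The order
 * here must stay in sync with the DWARF unwind information in debug_frame
 * below, as noted there.
 */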
static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};

/* Compute frame size via macros, to share between tcg_target_qemu_prologue
   and tcg_register_jit. */

#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))

/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* TB prologue */

    /* Reserve some stack space, also for TCG temps. */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers. */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

    if (!tcg_use_softmmu && guest_base) {
        int seg = setup_guest_base_seg();
        if (seg != 0) {
            x86_guest_base.seg = seg;
        } else if (guest_base == (int32_t)guest_base) {
            x86_guest_base.ofs = guest_base;
        } else {
            assert(TCG_TARGET_REG_BITS == 64);
            /* Choose R12 because, as a base, it requires a SIB byte. */
            x86_guest_base.index = TCG_REG_R12;
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
        }
    }

    if (TCG_TARGET_REG_BITS == 32) {
        tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
                   (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb. */
        tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                             (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                             + stack_addend);
    } else {
        tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb. */
        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
    }

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);

    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
}

static void tcg_out_tb_start(TCGContext *s)
{
    /* nothing to do */
}

static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
    memset(p, 0x90, count);
}

static void tcg_target_init(TCGContext *s)
{
    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
    if (TCG_TARGET_REG_BITS == 64) {
        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
    }
    if (have_avx1) {
        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
    }
    if (have_avx2) {
        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
    }

    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
    if (TCG_TARGET_REG_BITS == 64) {
#if !defined(_WIN64)
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
#endif
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
    }

    s->reserved_regs = 0;
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
#ifdef _WIN64
    /* These are call saved, and we don't save them, so don't use them. */
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
#endif
}

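/*
 * DWARF call-frame information describing the prologue above.  It is
 * registered through tcg_register_jit() at the end of this file so that
 * debuggers can unwind through translated code.
 */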
4571 */ 4572 tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr); 4573 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0); 4574 4575 /* TB epilogue */ 4576 tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr); 4577 4578 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend); 4579 4580 if (have_avx2) { 4581 tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0); 4582 } 4583 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) { 4584 tcg_out_pop(s, tcg_target_callee_save_regs[i]); 4585 } 4586 tcg_out_opc(s, OPC_RET, 0, 0, 0); 4587} 4588 4589static void tcg_out_tb_start(TCGContext *s) 4590{ 4591 /* nothing to do */ 4592} 4593 4594static void tcg_out_nop_fill(tcg_insn_unit *p, int count) 4595{ 4596 memset(p, 0x90, count); 4597} 4598 4599static void tcg_target_init(TCGContext *s) 4600{ 4601 tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS; 4602 if (TCG_TARGET_REG_BITS == 64) { 4603 tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS; 4604 } 4605 if (have_avx1) { 4606 tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS; 4607 tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS; 4608 } 4609 if (have_avx2) { 4610 tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS; 4611 } 4612 4613 tcg_target_call_clobber_regs = ALL_VECTOR_REGS; 4614 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX); 4615 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX); 4616 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX); 4617 if (TCG_TARGET_REG_BITS == 64) { 4618#if !defined(_WIN64) 4619 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI); 4620 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI); 4621#endif 4622 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8); 4623 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9); 4624 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10); 4625 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11); 4626 } 4627 4628 s->reserved_regs = 0; 4629 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK); 4630 tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC); 4631#ifdef _WIN64 4632 /* These are call saved, and we don't save them, so don't use them. */ 4633 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6); 4634 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7); 4635 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8); 4636 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9); 4637 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10); 4638 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11); 4639 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12); 4640 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13); 4641 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14); 4642 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15); 4643#endif 4644} 4645 4646typedef struct { 4647 DebugFrameHeader h; 4648 uint8_t fde_def_cfa[4]; 4649 uint8_t fde_reg_ofs[14]; 4650} DebugFrame; 4651 4652/* We're expecting a 2 byte uleb128 encoded value. */ 4653QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14)); 4654 4655#if !defined(__ELF__) 4656 /* Host machine without ELF. */ 4657#elif TCG_TARGET_REG_BITS == 64 4658#define ELF_HOST_MACHINE EM_X86_64 4659static const DebugFrame debug_frame = { 4660 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */ 4661 .h.cie.id = -1, 4662 .h.cie.version = 1, 4663 .h.cie.code_align = 1, 4664 .h.cie.data_align = 0x78, /* sleb128 -8 */ 4665 .h.cie.return_column = 16, 4666 4667 /* Total FDE size does not include the "len" member. 
#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member. */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs. */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member. */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs. */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif

#if defined(ELF_HOST_MACHINE)
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif