/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Used for function call generation. */
#define TCG_TARGET_STACK_ALIGN 16
#if defined(_WIN64)
#define TCG_TARGET_CALL_STACK_OFFSET 32
#else
#define TCG_TARGET_CALL_STACK_OFFSET 0
#endif
#define TCG_TARGET_CALL_ARG_I32      TCG_CALL_ARG_NORMAL
#define TCG_TARGET_CALL_ARG_I64      TCG_CALL_ARG_NORMAL
#if defined(_WIN64)
# define TCG_TARGET_CALL_ARG_I128    TCG_CALL_ARG_BY_REF
# define TCG_TARGET_CALL_RET_I128    TCG_CALL_RET_BY_VEC
#elif TCG_TARGET_REG_BITS == 64
# define TCG_TARGET_CALL_ARG_I128    TCG_CALL_ARG_NORMAL
# define TCG_TARGET_CALL_RET_I128    TCG_CALL_RET_NORMAL
#else
# define TCG_TARGET_CALL_ARG_I128    TCG_CALL_ARG_NORMAL
# define TCG_TARGET_CALL_RET_I128    TCG_CALL_RET_BY_REF
#endif

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};

#define TCG_TMP_VEC  TCG_REG_XMM5

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};

static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
{
    switch (kind) {
    case TCG_CALL_RET_NORMAL:
        tcg_debug_assert(slot >= 0 && slot <= 1);
        return slot ? TCG_REG_EDX : TCG_REG_EAX;
#ifdef _WIN64
    case TCG_CALL_RET_BY_VEC:
        tcg_debug_assert(slot == 0);
        return TCG_REG_XMM0;
#endif
    default:
        g_assert_not_reached();
    }
}

/* Constants we accept. */
#define TCG_CT_CONST_S32    0x100
#define TCG_CT_CONST_U32    0x200
#define TCG_CT_CONST_I32    0x400
#define TCG_CT_CONST_WSZ    0x800
#define TCG_CT_CONST_TST    0x1000
#define TCG_CT_CONST_ZERO   0x2000

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS   0x0000ffffu
# define ALL_VECTOR_REGS    0xffff0000u
# define ALL_BYTEL_REGS     ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS   0x000000ffu
# define ALL_VECTOR_REGS    0x00ff0000u
# define ALL_BYTEL_REGS     0x0000000fu
#endif
#define SOFTMMU_RESERVE_REGS \
    (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)

#define have_bmi2       (cpuinfo & CPUINFO_BMI2)
#define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)

static const tcg_insn_unit *tb_ret_addr;

static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
                        intptr_t value, intptr_t addend)
{
    value += addend;
    switch(type) {
    case R_386_PC32:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int32_t)value) {
            return false;
        }
        /* FALLTHRU */
    case R_386_32:
        tcg_patch32(code_ptr, value);
        break;
    case R_386_PC8:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int8_t)value) {
            return false;
        }
        tcg_patch8(code_ptr, value);
        break;
    default:
        g_assert_not_reached();
    }
    return true;
}

/* test if a constant matches the constraint */
static bool tcg_target_const_match(int64_t val, int ct,
                                   TCGType type, TCGCond cond, int vece)
{
    if (ct & TCG_CT_CONST) {
        return 1;
    }
    if (type == TCG_TYPE_I32) {
        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 |
                  TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) {
            return 1;
        }
    } else {
        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
            return 1;
        }
        /*
         * This will be used in combination with TCG_CT_CONST_S32,
         * so "normal" TESTQ is already matched.  Also accept:
         *    TESTQ -> TESTL (uint32_t)
         *    TESTQ -> BT (is_power_of_2)
         */
        if ((ct & TCG_CT_CONST_TST)
            && is_tst_cond(cond)
            && (val == (uint32_t)val || is_power_of_2(val))) {
            return 1;
        }
    }
    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_ZERO) && val == 0) {
        return 1;
    }
    return 0;
}

# define LOWREGMASK(x)  ((x) & 7)

#define P_EXT           0x100       /* 0x0f opcode prefix */
#define P_EXT38         0x200       /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400       /* 0x66 opcode prefix */
#define P_VEXW          0x1000      /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW      /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000      /* REG field as byte register */
# define P_REXB_RM      0x4000      /* R/M field as byte register */
# define P_GS           0x8000      /* gs segment override */
#else
# define P_REXW         0
# define P_REXB_R       0
# define P_REXB_RM      0
# define P_GS           0
#endif
#define P_EXT3A         0x10000     /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000     /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000     /* 0xf2 opcode prefix */
#define P_VEXL          0x80000     /* Set VEX.L = 1 */
#define P_EVEX          0x100000    /* Requires EVEX encoding */

#define OPC_ARITH_EbIb  (0x80)
#define OPC_ARITH_EvIz  (0x81)
#define OPC_ARITH_EvIb  (0x83)
#define OPC_ARITH_GvEv  (0x03)      /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP       (0xc8 | P_EXT)
#define OPC_CALL_Jz     (0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32     (0x48)
#define OPC_IMUL_GvEv   (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32     (0x40)
#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short   (0x70)          /* ... plus condition code */
#define OPC_JMP_long    (0xe9)
#define OPC_JMP_short   (0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv   (0x88)      /* stores, more or less */
#define OPC_MOVL_EvGv   (0x89)      /* stores, more or less */
#define OPC_MOVL_GvEv   (0x8b)      /* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz   (0xc7)
#define OPC_MOVB_Ib     (0xb0)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL      (0xbe | P_EXT)
#define OPC_MOVSWL      (0xbf | P_EXT)
#define OPC_MOVSLQ      (0x63 | P_REXW)
#define OPC_MOVZBL      (0xb6 | P_EXT)
#define OPC_MOVZWL      (0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
#define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32     (0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32    (0x50)
#define OPC_PUSH_Iv     (0x68)
#define OPC_PUSH_Ib     (0x6a)
#define OPC_RET         (0xc3)
#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1     (0xd1)
#define OPC_SHIFT_Ib    (0xc1)
#define OPC_SHIFT_cl    (0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTB       (0x84)
#define OPC_TESTL       (0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPBLENDMB   (0x66 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPBLENDMW   (0x66 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPBLENDMD   (0x64 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPBLENDMQ   (0x64 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPCMPB      (0x3f | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPUB     (0x3e | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPW      (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPCMPUW     (0x3e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPCMPD      (0x1f | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPUD     (0x1e | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPQ      (0x1f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPCMPUQ     (0x1e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS  (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD  (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB  (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW  (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD  (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ  (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPMOVM2B    (0x28 | P_EXT38 | P_SIMDF3 | P_EVEX)
#define OPC_VPMOVM2W    (0x28 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
#define OPC_VPMOVM2D    (0x38 | P_EXT38 | P_SIMDF3 | P_EVEX)
#define OPC_VPMOVM2Q    (0x38 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPTESTMB    (0x26 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPTESTMW    (0x26 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPTESTMD    (0x27 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPTESTMQ    (0x27 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPTESTNMB   (0x26 | P_EXT38 | P_SIMDF3 | P_EVEX)
#define OPC_VPTESTNMW   (0x26 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
#define OPC_VPTESTNMD   (0x27 | P_EXT38 | P_SIMDF3 | P_EVEX)
#define OPC_VPTESTNMQ   (0x27 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)
#define OPC_XCHG_EvGv   (0x87)

#define OPC_GRP3_Eb     (0xf6)
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
#define OPC_GRPBT       (0xba | P_EXT)

#define OPC_GRPBT_BT    4
#define OPC_GRPBT_BTS   5
#define OPC_GRPBT_BTR   6
#define OPC_GRPBT_BTC   7

/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH. */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3. */
#define EXT3_TESTi 0
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5. */
#define EXT5_INC_Ev    0
#define EXT5_DEC_Ev    1
#define EXT5_CALLN_Ev  2
#define EXT5_JMPN_Ev   4

/* Condition codes to be added to OPC_JCC_{long,short}. */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf

static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
    [TCG_COND_TSTEQ] = JCC_JE,
    [TCG_COND_TSTNE] = JCC_JNE,
};

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
    int rex;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    if (opc & P_DATA16) {
        /* We should never be asking for both 16 and 64-bit operation. */
        tcg_debug_assert((opc & P_REXW) == 0);
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    rex = 0;
    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
    rex |= (r & 8) >> 1;                /* REX.R */
    rex |= (x & 8) >> 2;                /* REX.X */
    rex |= (rm & 8) >> 3;               /* REX.B */

    /* P_REXB_{R,RM} indicates that the given register is the low byte.
       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
       as otherwise the encoding indicates %[abcd]h.  Note that the values
       that are ORed in merely indicate that the REX byte must be present;
       those bits get discarded in output. */
    rex |= opc & (r >= 4 ? P_REXB_R : 0);
    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);

    if (rex) {
        tcg_out8(s, (uint8_t)(rex | 0x40));
    }

    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }

    tcg_out8(s, opc);
}
#else
static void tcg_out_opc(TCGContext *s, int opc)
{
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }
    tcg_out8(s, opc);
}
/* Discard the register arguments to tcg_out_opc early, so as not to penalize
   the 32-bit compilation paths.  This method works with all versions of gcc,
   whereas relying on optimization may not be able to exclude them. */
#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
#endif

static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
{
    tcg_out_opc(s, opc, r, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}

static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
                            int rm, int index)
{
    int tmp;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    /* Use the two byte form if possible, which cannot encode
       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */
    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
        && ((rm | index) & 8) == 0) {
        /* Two byte VEX prefix. */
        tcg_out8(s, 0xc5);

        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
    } else {
        /* Three byte VEX prefix. */
        tcg_out8(s, 0xc4);

        /* VEX.m-mmmm */
        if (opc & P_EXT3A) {
            tmp = 3;
        } else if (opc & P_EXT38) {
            tmp = 2;
        } else if (opc & P_EXT) {
            tmp = 1;
        } else {
            g_assert_not_reached();
        }
        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
        tcg_out8(s, tmp);

        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
    }

    tmp |= (opc & P_VEXL ? 0x04 : 0);          /* VEX.L */
    /* VEX.pp */
    if (opc & P_DATA16) {
        tmp |= 1;                              /* 0x66 */
    } else if (opc & P_SIMDF3) {
        tmp |= 2;                              /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        tmp |= 3;                              /* 0xf2 */
    }
    tmp |= (~v & 15) << 3;                     /* VEX.vvvv */
    tcg_out8(s, tmp);
    tcg_out8(s, opc);
}

static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
                             int rm, int index, int aaa, bool z)
{
    /* The entire 4-byte evex prefix; with R' and V' set. */
    uint32_t p = 0x08041062;
    int mm, pp;

    tcg_debug_assert(have_avx512vl);

    /* EVEX.mm */
    if (opc & P_EXT3A) {
        mm = 3;
    } else if (opc & P_EXT38) {
        mm = 2;
    } else if (opc & P_EXT) {
        mm = 1;
    } else {
        g_assert_not_reached();
    }

    /* EVEX.pp */
    if (opc & P_DATA16) {
        pp = 1;                                /* 0x66 */
    } else if (opc & P_SIMDF3) {
        pp = 2;                                /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        pp = 3;                                /* 0xf2 */
    } else {
        pp = 0;
    }

    p = deposit32(p, 8, 2, mm);
    p = deposit32(p, 13, 1, (rm & 8) == 0);    /* EVEX.RXB.B */
    p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */
    p = deposit32(p, 15, 1, (r & 8) == 0);     /* EVEX.RXB.R */
    p = deposit32(p, 16, 2, pp);
    p = deposit32(p, 19, 4, ~v);
    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
    p = deposit32(p, 24, 3, aaa);
    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
    p = deposit32(p, 31, 1, z);

    tcg_out32(s, p);
    tcg_out8(s, opc);
}

static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
{
    if (opc & P_EVEX) {
        tcg_out_evex_opc(s, opc, r, v, rm, 0, 0, false);
    } else {
        tcg_out_vex_opc(s, opc, r, v, rm, 0);
    }
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}

static void tcg_out_vex_modrm_type(TCGContext *s, int opc,
                                   int r, int v, int rm, TCGType type)
{
    if (type == TCG_TYPE_V256) {
        opc |= P_VEXL;
    }
    tcg_out_vex_modrm(s, opc, r, v, rm);
}

static void tcg_out_evex_modrm_type(TCGContext *s, int opc, int r, int v,
                                    int rm, int aaa, bool z, TCGType type)
{
    if (type == TCG_TYPE_V256) {
        opc |= P_VEXL;
    }
    tcg_out_evex_opc(s, opc, r, v, rm, 0, aaa, z);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}

/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
   We handle either RM or INDEX missing with a negative value.  In 64-bit
   mode for absolute addresses, ~RM is the size of the immediate operand
   that will follow the instruction.  */

static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
                               int shift, intptr_t offset)
{
    int mod, len;

    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding.  */
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
            if (disp == (int32_t)disp) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                tcg_out32(s, disp);
                return;
            }

            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing.  */
            if (offset == (int32_t)offset) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);
                return;
            }

            /* ??? The memory isn't directly addressable. */
            g_assert_not_reached();
        } else {
            /* Absolute address. */
            tcg_out8(s, (r << 3) | 5);
            tcg_out32(s, offset);
            return;
        }
    }

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing. */
    if (rm < 0) {
        mod = 0, len = 4, rm = 5;
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
        mod = 0, len = 0;
    } else if (offset == (int8_t)offset) {
        mod = 0x40, len = 1;
    } else {
        mod = 0x80, len = 4;
    }

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form. */
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
        /* Single byte MODRM format. */
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    } else {
        /* Two byte MODRM+SIB format. */

        /* Note that the encoding that would place %esp into the index
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index. */
        if (index < 0) {
            index = 4;
        } else {
            tcg_debug_assert(index != TCG_REG_ESP);
        }

        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
    }

    if (len == 1) {
        tcg_out8(s, offset);
    } else if (len == 4) {
        tcg_out32(s, offset);
    }
}

static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                                     int index, int shift, intptr_t offset)
{
    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
                                         int rm, int index, int shift,
                                         intptr_t offset)
{
    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

/* A simplification of the above with no index or shift. */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
{
    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}

static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
                                            int v, int rm, intptr_t offset)
{
    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
}

/* Output an opcode with an expected reference to the constant pool. */
static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_opc(s, opc, r, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit. */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}

/* Output an opcode with an expected reference to the constant pool. */
static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit. */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}

/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi. */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
    /* Propagate an opcode prefix, such as P_REXW. */
    int ext = subop & ~0x7;
    subop &= 0x7;

    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
}

static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
    int rexw = 0;

    if (arg == ret) {
        return true;
    }
    switch (type) {
    case TCG_TYPE_I64:
        rexw = P_REXW;
        /* fallthru */
    case TCG_TYPE_I32:
        if (ret < 16) {
            if (arg < 16) {
                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
            }
        } else {
            if (arg < 16) {
                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
            }
        }
        break;

    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
        break;

    default:
        g_assert_not_reached();
    }
    return true;
}

static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};

static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg r, TCGReg a)
{
    if (have_avx2) {
        tcg_out_vex_modrm_type(s, avx2_dup_insn[vece], r, 0, a, type);
    } else {
        switch (vece) {
        case MO_8:
            /* ??? With zero in a register, use PSHUFB. */
            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_16:
            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_32:
            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
            /* imm8 operand: all output lanes selected from input lane 0. */
            tcg_out8(s, 0);
            break;
        case MO_64:
            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg r, TCGReg base, intptr_t offset)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
                                 r, 0, base, offset);
    } else {
        switch (vece) {
        case MO_64:
            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
            break;
        case MO_32:
            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
            break;
        case MO_16:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        case MO_8:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg ret, int64_t arg)
{
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);

    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
        return;
    }

    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
        if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
        }
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    } else {
        if (type == TCG_TYPE_V64) {
            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
        } else if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
        }
        if (TCG_TARGET_REG_BITS == 64) {
            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
        } else {
            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
        }
    }
}

static void tcg_out_movi_vec(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
        return;
    }

    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
    if (TCG_TARGET_REG_BITS == 64) {
        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    } else {
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    }
}

static void tcg_out_movi_int(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    tcg_target_long diff;

    if (arg == 0) {
        tgen_arithr(s, ARITH_XOR, ret, ret);
        return;
    }
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
        tcg_out32(s, arg);
        return;
    }
    if (arg == (int32_t)arg) {
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
        tcg_out32(s, arg);
        return;
    }

    /* Try a 7 byte pc-relative lea before the 10 byte movq. */
    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}

static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
#endif
        if (ret < 16) {
            tcg_out_movi_int(s, type, ret, arg);
        } else {
            tcg_out_movi_vec(s, type, ret, arg);
        }
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
    return true;
}

static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
                             tcg_target_long imm)
{
    /* This function is only used for passing structs by reference. */
    tcg_debug_assert(imm == (int32_t)imm);
    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        g_assert_not_reached();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we only need care for
       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the sse insn. */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}

static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         *
         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
         * for _WIN64, which must have SSE2 but may not have AVX.
         */
        tcg_debug_assert(arg >= 16);
        if (have_avx1) {
            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
        }
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    } else if (type != TCG_TYPE_I32) {
        return false;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}

static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16. */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}

static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}

static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
{
    if (TCG_TARGET_REG_BITS == 32 && src >= 4) {
        tcg_out_mov(s, TCG_TYPE_I32, dest, src);
        if (dest >= 4) {
            tcg_out_modrm(s, OPC_ARITH_EvIz, ARITH_AND, dest);
            tcg_out32(s, 0xff);
            return;
        }
        src = dest;
    }
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}

static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;

    if (TCG_TARGET_REG_BITS == 32 && src >= 4) {
        tcg_out_mov(s, TCG_TYPE_I32, dest, src);
        if (dest >= 4) {
            tcg_out_shifti(s, SHIFT_SHL, dest, 24);
            tcg_out_shifti(s, SHIFT_SAR, dest, 24);
            return;
        }
        src = dest;
    }
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
}

static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* movzwl */
    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
}

static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    /* movsw[lq] */
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
}

static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* 32-bit mov zero extends. */
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
}

static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
}

static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32s(s, dest, src);
}

static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    if (dest != src) {
        tcg_out_ext32u(s, dest, src);
    }
}

static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32u(s, dest, src);
}

static inline void tcg_out_bswap64(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
}

static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64) {
        rexw = c & -8;
        c &= 7;
    }

    switch (c) {
    case ARITH_ADD:
    case ARITH_SUB:
        if (!cf) {
            /*
             * ??? While INC is 2 bytes shorter than ADDL $1, they also induce
             * partial flags update stalls on Pentium4 and are not recommended
             * by current Intel optimization manuals.
             */
            if (val == 1 || val == -1) {
                int is_inc = (c == ARITH_ADD) ^ (val < 0);
                if (TCG_TARGET_REG_BITS == 64) {
                    /*
                     * The single-byte increment encodings are re-tasked
                     * as the REX prefixes.  Use the MODRM encoding.
                     */
                    tcg_out_modrm(s, OPC_GRP5 + rexw,
                                  (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
                } else {
                    tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
                }
                return;
            }
            if (val == 128) {
                /*
                 * Facilitate using an 8-bit immediate.  Carry is inverted
                 * by this transformation, so do it only if cf == 0.
                 */
                c ^= ARITH_ADD ^ ARITH_SUB;
                val = -128;
            }
        }
        break;

    case ARITH_AND:
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation. */
                rexw = 0;
            }
        }
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
        break;

    case ARITH_OR:
    case ARITH_XOR:
        if (val >= 0x80 && val <= 0xff
            && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
            tcg_out8(s, val);
            return;
        }
        break;
    }

    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    g_assert_not_reached();
}

static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
{
    if (val != 0) {
        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
    }
}

/* Set SMALL to force a short forward branch. */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
{
    int32_t val, val1;

    if (l->has_value) {
        val = tcg_pcrel_diff(s, l->u.value_ptr);
        val1 = val - 2;
        if ((int8_t)val1 == val1) {
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_short);
            } else {
                tcg_out8(s, OPC_JCC_short + opc);
            }
            tcg_out8(s, val1);
        } else {
            tcg_debug_assert(!small);
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_long);
                tcg_out32(s, val - 5);
            } else {
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
                tcg_out32(s, val - 6);
            }
        }
    } else if (small) {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_short);
        } else {
            tcg_out8(s, OPC_JCC_short + opc);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
        s->code_ptr += 1;
    } else {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_long);
        } else {
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
        s->code_ptr += 4;
    }
}

static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1,
                       TCGArg arg2, int const_arg2, int rexw)
{
    int jz, js;

    if (!is_tst_cond(cond)) {
        if (!const_arg2) {
            tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
        } else if (arg2 == 0) {
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tcg_debug_assert(!rexw || arg2 == (int32_t)arg2);
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
        return tcg_cond_to_jcc[cond];
    }

    jz = tcg_cond_to_jcc[cond];
    js = (cond == TCG_COND_TSTNE ? JCC_JS : JCC_JNS);

    if (!const_arg2) {
        tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2);
        return jz;
    }

    if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) {
        if (arg2 == 0x80) {
            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
            return js;
        }
        if (arg2 == 0xff) {
            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
            return jz;
        }
        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1);
        tcg_out8(s, arg2);
        return jz;
    }

    if ((arg2 & ~0xff00) == 0 && arg1 < 4) {
        if (arg2 == 0x8000) {
            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
            return js;
        }
        if (arg2 == 0xff00) {
            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
            return jz;
        }
        tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4);
        tcg_out8(s, arg2 >> 8);
        return jz;
    }

    if (arg2 == 0xffff) {
        tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1);
        return jz;
    }
    if (arg2 == 0xffffffffu) {
        tcg_out_modrm(s, OPC_TESTL, arg1, arg1);
        return jz;
    }

    if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) {
        int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE);
        int sh = ctz64(arg2);

        rexw = (sh & 32 ? P_REXW : 0);
        if ((sh & 31) == 31) {
            tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1);
            return js;
        } else {
            tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1);
            tcg_out8(s, sh);
            return jc;
        }
    }

    if (rexw) {
        if (arg2 == (uint32_t)arg2) {
            rexw = 0;
        } else {
            tcg_debug_assert(arg2 == (int32_t)arg2);
        }
    }
    tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1);
    tcg_out32(s, arg2);
    return jz;
}

static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
                           TCGArg arg1, TCGArg arg2, int const_arg2,
                           TCGLabel *label, bool small)
{
    int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw);
    tcg_out_jxx(s, jcc, label, small);
}

static void tgen_brcond(TCGContext *s, TCGType type, TCGCond cond,
                        TCGReg arg1, TCGReg arg2, TCGLabel *label)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    tcg_out_brcond(s, rexw, cond, arg1, arg2, false, label, false);
}

static void tgen_brcondi(TCGContext *s, TCGType type, TCGCond cond,
                         TCGReg arg1, tcg_target_long arg2, TCGLabel *label)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    tcg_out_brcond(s, rexw, cond, arg1, arg2, true, label, false);
}

static const TCGOutOpBrcond outop_brcond = {
    .base.static_constraint = C_O0_I2(r, reT),
    .out_rr = tgen_brcond,
    .out_ri = tgen_brcondi,
};

static void tcg_out_brcond2(TCGContext *s, TCGCond cond, TCGReg al,
                            TCGReg ah, TCGArg bl, bool blconst,
                            TCGArg bh, bool bhconst,
                            TCGLabel *label_this, bool small)
{
    TCGLabel *label_next = gen_new_label();

    switch (cond) {
    case TCG_COND_EQ:
    case TCG_COND_TSTEQ:
        tcg_out_brcond(s, 0, tcg_invert_cond(cond),
                       al, bl, blconst, label_next, true);
        tcg_out_brcond(s, 0, cond, ah, bh, bhconst, label_this, small);
        break;

    case TCG_COND_NE:
    case TCG_COND_TSTNE:
        tcg_out_brcond(s, 0, cond, al, bl, blconst, label_this, small);
        tcg_out_brcond(s, 0, cond, ah, bh, bhconst, label_this, small);
        break;

    default:
        tcg_out_brcond(s, 0, tcg_high_cond(cond),
                       ah, bh, bhconst, label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, tcg_unsigned_cond(cond),
                       al, bl, blconst, label_this, small);
        break;
    }
    tcg_out_label(s, label_next);
}

static void tgen_brcond2(TCGContext *s, TCGCond cond, TCGReg al,
                         TCGReg ah, TCGArg bl, bool blconst,
                         TCGArg bh, bool bhconst, TCGLabel *l)
{
    tcg_out_brcond2(s, cond, al, ah, bl, blconst, bh, bhconst, l, false);
}

#if TCG_TARGET_REG_BITS != 32
__attribute__((unused))
#endif
static const TCGOutOpBrcond2 outop_brcond2 = {
    .base.static_constraint = C_O0_I4(r, r, ri, ri),
    .out = tgen_brcond2,
};

static void tcg_out_setcond(TCGContext *s, TCGType type, TCGCond cond,
                            TCGReg dest, TCGReg arg1, TCGArg arg2,
                            bool const_arg2, bool neg)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    int cmp_rexw = rexw;
    bool inv = false;
    bool cleared;
    int jcc;

    switch (cond) {
    case TCG_COND_NE:
        inv = true;
        /* fall through */
    case TCG_COND_EQ:
        /* If arg2 is 0, convert to LTU/GEU vs 1. */
        if (const_arg2 && arg2 == 0) {
            arg2 = 1;
            goto do_ltu;
        }
        break;

    case TCG_COND_TSTNE:
        inv = true;
        /* fall through */
    case TCG_COND_TSTEQ:
        /* If arg2 is -1, convert to LTU/GEU vs 1. */
        if (const_arg2 && arg2 == 0xffffffffu) {
            arg2 = 1;
            cmp_rexw = 0;
            goto do_ltu;
        }
        break;

    case TCG_COND_LEU:
        inv = true;
        /* fall through */
    case TCG_COND_GTU:
        /* If arg2 is a register, swap for LTU/GEU. */
        if (!const_arg2) {
            TCGReg t = arg1;
            arg1 = arg2;
            arg2 = t;
            goto do_ltu;
        }
        break;

    case TCG_COND_GEU:
        inv = true;
        /* fall through */
    case TCG_COND_LTU:
    do_ltu:
        /*
         * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
         * We can then use NEG or INC to produce the desired result.
         * This is always smaller than the SETCC expansion.
         */
        tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw);

        /* X - X - C = -C = (C ? -1 : 0) */
        tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
        if (inv && neg) {
            /* ~(C ? -1 : 0) = (C ? 0 : -1) */
            tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
        } else if (inv) {
            /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
            tgen_arithi(s, ARITH_ADD, dest, 1, 0);
        } else if (!neg) {
            /* -(C ? -1 : 0) = (C ? 1 : 0) */
            tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
        }
        return;

    case TCG_COND_GE:
        inv = true;
        /* fall through */
    case TCG_COND_LT:
        /* If arg2 is 0, extract the sign bit. */
        if (const_arg2 && arg2 == 0) {
            tcg_out_mov(s, type, dest, arg1);
            if (inv) {
                tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
            }
            tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
                           dest, rexw ? 63 : 31);
            return;
        }
        break;

    default:
        break;
    }

    /*
     * If dest does not overlap the inputs, clearing it first is preferred.
     * The XOR breaks any false dependency for the low-byte write to dest,
     * and is also one byte smaller than MOVZBL.
     */
    cleared = false;
    if (dest != arg1 && (const_arg2 || dest != arg2)) {
        tgen_arithr(s, ARITH_XOR, dest, dest);
        cleared = true;
    }

    jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw);
    tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest);

    if (!cleared) {
        tcg_out_ext8u(s, dest, dest);
    }
    if (neg) {
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
    }
}

static void tgen_setcond(TCGContext *s, TCGType type, TCGCond cond,
                         TCGReg dest, TCGReg arg1, TCGReg arg2)
{
    tcg_out_setcond(s, type, cond, dest, arg1, arg2, false, false);
}

static void tgen_setcondi(TCGContext *s, TCGType type, TCGCond cond,
                          TCGReg dest, TCGReg arg1, tcg_target_long arg2)
{
    tcg_out_setcond(s, type, cond, dest, arg1, arg2, true, false);
}

static const TCGOutOpSetcond outop_setcond = {
    .base.static_constraint = C_O1_I2(q, r, reT),
    .out_rrr = tgen_setcond,
    .out_rri = tgen_setcondi,
};

static void tgen_negsetcond(TCGContext *s, TCGType type, TCGCond cond,
                            TCGReg dest, TCGReg arg1, TCGReg arg2)
{
    tcg_out_setcond(s, type, cond, dest, arg1, arg2, false, true);
}

static void tgen_negsetcondi(TCGContext *s, TCGType type, TCGCond cond,
                             TCGReg dest, TCGReg arg1, tcg_target_long arg2)
{
    tcg_out_setcond(s, type, cond, dest, arg1, arg2, true, true);
}

static const TCGOutOpSetcond outop_negsetcond = {
    .base.static_constraint = C_O1_I2(q, r, reT),
    .out_rrr = tgen_negsetcond,
    .out_rri = tgen_negsetcondi,
};

static void tgen_setcond2(TCGContext *s, TCGCond cond, TCGReg ret,
                          TCGReg al, TCGReg ah,
                          TCGArg bl, bool const_bl,
                          TCGArg bh, bool const_bh)
{
    TCGLabel *label_over = gen_new_label();

    if (ret == al || ret == ah
        || (!const_bl && ret == bl)
        || (!const_bh && ret == bh)) {
        /*
         * When the destination overlaps with one of the argument
         * registers, don't do anything tricky.
         */
        TCGLabel *label_true = gen_new_label();

        tcg_out_brcond2(s, cond, al, ah, bl, const_bl,
                        bh, const_bh, label_true, true);

        tcg_out_movi(s, TCG_TYPE_I32, ret, 0);
        tcg_out_jxx(s, JCC_JMP, label_over, 1);
        tcg_out_label(s, label_true);

        tcg_out_movi(s, TCG_TYPE_I32, ret, 1);
    } else {
        /*
         * When the destination does not overlap one of the arguments,
         * clear the destination first, jump if cond false, and emit an
         * increment in the true case.  This results in smaller code.
         */
        tcg_out_movi(s, TCG_TYPE_I32, ret, 0);

        tcg_out_brcond2(s, tcg_invert_cond(cond), al, ah, bl, const_bl,
                        bh, const_bh, label_over, true);

        tgen_arithi(s, ARITH_ADD, ret, 1, 0);
    }
    tcg_out_label(s, label_over);
}

#if TCG_TARGET_REG_BITS != 32
__attribute__((unused))
#endif
static const TCGOutOpSetcond2 outop_setcond2 = {
    .base.static_constraint = C_O1_I4(r, r, r, ri, ri),
    .out = tgen_setcond2,
};

static void tcg_out_cmov(TCGContext *s, int jcc, int rexw,
                         TCGReg dest, TCGReg v1)
{
    tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1);
}

static void tgen_movcond(TCGContext *s, TCGType type, TCGCond cond,
                         TCGReg dest, TCGReg c1, TCGArg c2, bool const_c2,
                         TCGArg vt, bool const_vt,
                         TCGArg vf, bool const_vf)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
                                       0 : P_REXW;
    int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw);
    tcg_out_cmov(s, jcc, rexw, dest, vt);
}

static const TCGOutOpMovcond outop_movcond = {
    .base.static_constraint = C_O1_I4(r, r, reT, r, 0),
    .out = tgen_movcond,
};

static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
{
    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;

    if (disp == (int32_t)disp) {
        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
        tcg_out32(s, disp);
    } else {
        /* rip-relative addressing into the constant pool.
           This is 6 + 8 = 14 bytes, as compared to using an
           immediate load 10 + 6 = 16 bytes, plus we may
           be able to re-use the pool constant for more calls.  */
        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
        tcg_out32(s, 0);
    }
}

static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
                         const TCGHelperInfo *info)
{
    tcg_out_branch(s, 1, dest);

#ifndef _WIN32
    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
        /*
         * The sysv i386 abi for struct return places a reference as the
         * first argument on the stack, and pops that argument with the
         * return statement.  Since we want to retain the aligned stack
         * pointer for the callee, we do not want to actually push that
         * argument before the call but rely on the normal store to the
         * stack slot.  But we do need to compensate for the pop in order
         * to reset our correct stack pointer value.
         * Pushing a garbage value back onto the stack is quickest.
         */
        tcg_out_push(s, TCG_REG_EAX);
    }
#endif
}

static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
{
    tcg_out_branch(s, 0, dest);
}

static void tcg_out_nopn(TCGContext *s, int n)
{
    int i;
    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
     * "xchg %eax,%eax", forming "xchg %ax,%ax".  All cores accept the
     * duplicate prefix, and all of the interesting recent cores can
     * decode and discard the duplicates in a single cycle.
     */
    tcg_debug_assert(n >= 1);
    for (i = 1; i < n; ++i) {
        tcg_out8(s, 0x66);
    }
    tcg_out8(s, 0x90);
}

typedef struct {
    TCGReg base;
    int index;
    int ofs;
    int seg;
    TCGAtomAlign aa;
} HostAddress;

bool tcg_target_has_memory_bswap(MemOp memop)
{
    TCGAtomAlign aa;

    if (!have_movbe) {
        return false;
    }
    if ((memop & MO_SIZE) < MO_128) {
        return true;
    }

    /*
     * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
     * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
     */
    aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
    return aa.atom < MO_128;
}

/*
 * Because i686 has no register parameters and because x86_64 has xchg
 * to handle addr/data register overlap, we have placed all input arguments
 * before we might need a scratch reg.
 *
 * Even then, a scratch is only needed for l->raddr.  Rather than expose
 * a general-purpose scratch when we don't actually know it's available,
 * use the ra_gen hook to load into RAX if needed.
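 *
 * (For illustration: ldst_ra_gen below simply loads l->raddr into the
 * assigned register, falling back to %rax when the slot value is negative.)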
2028 */ 2029#if TCG_TARGET_REG_BITS == 64 2030static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg) 2031{ 2032 if (arg < 0) { 2033 arg = TCG_REG_RAX; 2034 } 2035 tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr); 2036 return arg; 2037} 2038static const TCGLdstHelperParam ldst_helper_param = { 2039 .ra_gen = ldst_ra_gen 2040}; 2041#else 2042static const TCGLdstHelperParam ldst_helper_param = { }; 2043#endif 2044 2045static void tcg_out_vec_to_pair(TCGContext *s, TCGType type, 2046 TCGReg l, TCGReg h, TCGReg v) 2047{ 2048 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2049 2050 /* vpmov{d,q} %v, %l */ 2051 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l); 2052 /* vpextr{d,q} $1, %v, %h */ 2053 tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h); 2054 tcg_out8(s, 1); 2055} 2056 2057static void tcg_out_pair_to_vec(TCGContext *s, TCGType type, 2058 TCGReg v, TCGReg l, TCGReg h) 2059{ 2060 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2061 2062 /* vmov{d,q} %l, %v */ 2063 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l); 2064 /* vpinsr{d,q} $1, %h, %v, %v */ 2065 tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h); 2066 tcg_out8(s, 1); 2067} 2068 2069/* 2070 * Generate code for the slow path for a load at the end of block 2071 */ 2072static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2073{ 2074 MemOp opc = get_memop(l->oi); 2075 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2076 2077 /* resolve label address */ 2078 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2079 if (label_ptr[1]) { 2080 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2081 } 2082 2083 tcg_out_ld_helper_args(s, l, &ldst_helper_param); 2084 tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]); 2085 tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param); 2086 2087 tcg_out_jmp(s, l->raddr); 2088 return true; 2089} 2090 2091/* 2092 * Generate code for the slow path for a store at the end of block 2093 */ 2094static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2095{ 2096 MemOp opc = get_memop(l->oi); 2097 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2098 2099 /* resolve label address */ 2100 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2101 if (label_ptr[1]) { 2102 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2103 } 2104 2105 tcg_out_st_helper_args(s, l, &ldst_helper_param); 2106 tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]); 2107 2108 tcg_out_jmp(s, l->raddr); 2109 return true; 2110} 2111 2112#ifdef CONFIG_USER_ONLY 2113static HostAddress x86_guest_base = { 2114 .index = -1 2115}; 2116 2117#if defined(__x86_64__) && defined(__linux__) 2118# include <asm/prctl.h> 2119# include <sys/prctl.h> 2120int arch_prctl(int code, unsigned long addr); 2121static inline int setup_guest_base_seg(void) 2122{ 2123 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) { 2124 return P_GS; 2125 } 2126 return 0; 2127} 2128#define setup_guest_base_seg setup_guest_base_seg 2129#elif defined(__x86_64__) && \ 2130 (defined (__FreeBSD__) || defined (__FreeBSD_kernel__)) 2131# include <machine/sysarch.h> 2132static inline int setup_guest_base_seg(void) 2133{ 2134 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) { 2135 return P_GS; 2136 } 2137 return 0; 2138} 2139#define setup_guest_base_seg setup_guest_base_seg 2140#endif 2141#else 2142# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; })) 2143#endif /* CONFIG_USER_ONLY */ 2144#ifndef setup_guest_base_seg 2145# define setup_guest_base_seg() 0 2146#endif 2147 
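/*
 * Illustrative sketch (not exact encodings) of the softmmu fast path
 * emitted by prepare_host_addr() below on a 64-bit host:
 *
 *     mov    addr, %L0
 *     shr    $(page_bits - CPU_TLB_ENTRY_BITS), %L0
 *     and    mask(%AREG0), %L0
 *     add    table(%AREG0), %L0              # %L0 = &CPUTLBEntry
 *     lea    (s_mask - a_mask)(addr), %L1    # plain mov if a_mask >= s_mask
 *     and    $(page_mask | a_mask), %L1
 *     cmp    addr_read/addr_write(%L0), %L1
 *     jne    slow_path
 *     mov    addend(%L0), %L0                # host = guest addr + %L0
 */
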
2148#define MIN_TLB_MASK_TABLE_OFS INT_MIN 2149 2150/* 2151 * For softmmu, perform the TLB load and compare. 2152 * For useronly, perform any required alignment tests. 2153 * In both cases, return a TCGLabelQemuLdst structure if the slow path 2154 * is required and fill in @h with the host address for the fast path. 2155 */ 2156static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, 2157 TCGReg addr, MemOpIdx oi, bool is_ld) 2158{ 2159 TCGLabelQemuLdst *ldst = NULL; 2160 MemOp opc = get_memop(oi); 2161 MemOp s_bits = opc & MO_SIZE; 2162 unsigned a_mask; 2163 2164 if (tcg_use_softmmu) { 2165 h->index = TCG_REG_L0; 2166 h->ofs = 0; 2167 h->seg = 0; 2168 } else { 2169 *h = x86_guest_base; 2170 } 2171 h->base = addr; 2172 h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128); 2173 a_mask = (1 << h->aa.align) - 1; 2174 2175 if (tcg_use_softmmu) { 2176 int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read) 2177 : offsetof(CPUTLBEntry, addr_write); 2178 TCGType ttype = TCG_TYPE_I32; 2179 TCGType tlbtype = TCG_TYPE_I32; 2180 int trexw = 0, hrexw = 0, tlbrexw = 0; 2181 unsigned mem_index = get_mmuidx(oi); 2182 unsigned s_mask = (1 << s_bits) - 1; 2183 int fast_ofs = tlb_mask_table_ofs(s, mem_index); 2184 int tlb_mask; 2185 2186 ldst = new_ldst_label(s); 2187 ldst->is_ld = is_ld; 2188 ldst->oi = oi; 2189 ldst->addr_reg = addr; 2190 2191 if (TCG_TARGET_REG_BITS == 64) { 2192 ttype = s->addr_type; 2193 trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW); 2194 if (TCG_TYPE_PTR == TCG_TYPE_I64) { 2195 hrexw = P_REXW; 2196 if (s->page_bits + s->tlb_dyn_max_bits > 32) { 2197 tlbtype = TCG_TYPE_I64; 2198 tlbrexw = P_REXW; 2199 } 2200 } 2201 } 2202 2203 tcg_out_mov(s, tlbtype, TCG_REG_L0, addr); 2204 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0, 2205 s->page_bits - CPU_TLB_ENTRY_BITS); 2206 2207 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0, 2208 fast_ofs + offsetof(CPUTLBDescFast, mask)); 2209 2210 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0, 2211 fast_ofs + offsetof(CPUTLBDescFast, table)); 2212 2213 /* 2214 * If the required alignment is at least as large as the access, 2215 * simply copy the address and mask. For lesser alignments, 2216 * check that we don't cross pages for the complete access. 2217 */ 2218 if (a_mask >= s_mask) { 2219 tcg_out_mov(s, ttype, TCG_REG_L1, addr); 2220 } else { 2221 tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1, 2222 addr, s_mask - a_mask); 2223 } 2224 tlb_mask = s->page_mask | a_mask; 2225 tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0); 2226 2227 /* cmp 0(TCG_REG_L0), TCG_REG_L1 */ 2228 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, 2229 TCG_REG_L1, TCG_REG_L0, cmp_ofs); 2230 2231 /* jne slow_path */ 2232 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2233 ldst->label_ptr[0] = s->code_ptr; 2234 s->code_ptr += 4; 2235 2236 /* TLB Hit. 
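         * The CPUTLBEntry addend is loaded into TCG_REG_L0; the fast-path
         * access then uses it as the SIB index alongside the guest address.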
*/ 2237 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0, 2238 offsetof(CPUTLBEntry, addend)); 2239 } else if (a_mask) { 2240 int jcc; 2241 2242 ldst = new_ldst_label(s); 2243 ldst->is_ld = is_ld; 2244 ldst->oi = oi; 2245 ldst->addr_reg = addr; 2246 2247 /* jne slow_path */ 2248 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addr, a_mask, true, false); 2249 tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0); 2250 ldst->label_ptr[0] = s->code_ptr; 2251 s->code_ptr += 4; 2252 } 2253 2254 return ldst; 2255} 2256 2257static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2258 HostAddress h, TCGType type, MemOp memop) 2259{ 2260 bool use_movbe = false; 2261 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 2262 int movop = OPC_MOVL_GvEv; 2263 2264 /* Do big-endian loads with movbe. */ 2265 if (memop & MO_BSWAP) { 2266 tcg_debug_assert(have_movbe); 2267 use_movbe = true; 2268 movop = OPC_MOVBE_GyMy; 2269 } 2270 2271 switch (memop & MO_SSIZE) { 2272 case MO_UB: 2273 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo, 2274 h.base, h.index, 0, h.ofs); 2275 break; 2276 case MO_SB: 2277 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo, 2278 h.base, h.index, 0, h.ofs); 2279 break; 2280 case MO_UW: 2281 if (use_movbe) { 2282 /* There is no extending movbe; only low 16-bits are modified. */ 2283 if (datalo != h.base && datalo != h.index) { 2284 /* XOR breaks dependency chains. */ 2285 tgen_arithr(s, ARITH_XOR, datalo, datalo); 2286 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2287 datalo, h.base, h.index, 0, h.ofs); 2288 } else { 2289 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2290 datalo, h.base, h.index, 0, h.ofs); 2291 tcg_out_ext16u(s, datalo, datalo); 2292 } 2293 } else { 2294 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo, 2295 h.base, h.index, 0, h.ofs); 2296 } 2297 break; 2298 case MO_SW: 2299 if (use_movbe) { 2300 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2301 datalo, h.base, h.index, 0, h.ofs); 2302 tcg_out_ext16s(s, type, datalo, datalo); 2303 } else { 2304 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg, 2305 datalo, h.base, h.index, 0, h.ofs); 2306 } 2307 break; 2308 case MO_UL: 2309 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2310 h.base, h.index, 0, h.ofs); 2311 break; 2312#if TCG_TARGET_REG_BITS == 64 2313 case MO_SL: 2314 if (use_movbe) { 2315 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo, 2316 h.base, h.index, 0, h.ofs); 2317 tcg_out_ext32s(s, datalo, datalo); 2318 } else { 2319 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo, 2320 h.base, h.index, 0, h.ofs); 2321 } 2322 break; 2323#endif 2324 case MO_UQ: 2325 if (TCG_TARGET_REG_BITS == 64) { 2326 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2327 h.base, h.index, 0, h.ofs); 2328 break; 2329 } 2330 if (use_movbe) { 2331 TCGReg t = datalo; 2332 datalo = datahi; 2333 datahi = t; 2334 } 2335 if (h.base == datalo || h.index == datalo) { 2336 tcg_out_modrm_sib_offset(s, OPC_LEA, datahi, 2337 h.base, h.index, 0, h.ofs); 2338 tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0); 2339 tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4); 2340 } else { 2341 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2342 h.base, h.index, 0, h.ofs); 2343 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2344 h.base, h.index, 0, h.ofs + 4); 2345 } 2346 break; 2347 2348 case MO_128: 2349 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2350 2351 /* 2352 * Without 16-byte atomicity, use integer regs. 
         * That is where we want the data, and it allows bswaps.
         */
        if (h.aa.atom < MO_128) {
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            if (h.base == datalo || h.index == datalo) {
                tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datalo, datahi, 0);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datahi, datahi, 8);
            } else {
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
                                         h.base, h.index, 0, h.ofs + 8);
            }
            break;
        }

        /*
         * With 16-byte atomicity, a vector load is required.
         * If we already have 16-byte alignment, then VMOVDQA always works.
         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
         * Else we require a runtime test for alignment for VMOVDQA;
         * use VMOVDQU on the unaligned nonatomic path for simplicity.
         */
        if (h.aa.align >= MO_128) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else {
            TCGLabel *l1 = gen_new_label();
            TCGLabel *l2 = gen_new_label();
            int jcc;

            jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
            tcg_out_jxx(s, jcc, l1, true);

            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_jxx(s, JCC_JMP, l2, true);

            tcg_out_label(s, l1);
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_label(s, l2);
        }
        tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
        break;

    default:
        g_assert_not_reached();
    }
}

static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
                            TCGReg addr, MemOpIdx oi, TCGType data_type)
{
    TCGLabelQemuLdst *ldst;
    HostAddress h;

    ldst = prepare_host_addr(s, &h, addr, oi, true);
    tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));

    if (ldst) {
        ldst->type = data_type;
        ldst->datalo_reg = datalo;
        ldst->datahi_reg = datahi;
        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
    }
}

static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   HostAddress h, MemOp memop)
{
    bool use_movbe = false;
    int movop = OPC_MOVL_EvGv;

    /*
     * Do big-endian stores with movbe or system-mode.
     * User-only without movbe will have its swapping done generically.
     */
    if (memop & MO_BSWAP) {
        tcg_debug_assert(have_movbe);
        use_movbe = true;
        movop = OPC_MOVBE_MyGy;
    }

    switch (memop & MO_SIZE) {
    case MO_8:
        /*
         * This is handled with constraints on INDEX_op_qemu_st8_i32.
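         * On i686 the 's' constraint restricts the data register to one with
         * a byte-addressable low half, which the assert below double-checks.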
         */
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
                                 datalo, h.base, h.index, 0, h.ofs);
        break;
    case MO_16:
        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_32:
        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_64:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        } else {
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 4);
        }
        break;

    case MO_128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);

        /*
         * Without 16-byte atomicity, use integer regs.
         * That is where we have the data, and it allows bswaps.
         */
        if (h.aa.atom < MO_128) {
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 8);
            break;
        }

        /*
         * With 16-byte atomicity, a vector store is required.
         * If we already have 16-byte alignment, then VMOVDQA always works.
         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
         * Else we require a runtime test for alignment for VMOVDQA;
         * use VMOVDQU on the unaligned nonatomic path for simplicity.
         */
        tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
        if (h.aa.align >= MO_128) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else {
            TCGLabel *l1 = gen_new_label();
            TCGLabel *l2 = gen_new_label();
            int jcc;

            jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
            tcg_out_jxx(s, jcc, l1, true);

            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_jxx(s, JCC_JMP, l2, true);

            tcg_out_label(s, l1);
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_label(s, l2);
        }
        break;

    default:
        g_assert_not_reached();
    }
}

static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
                            TCGReg addr, MemOpIdx oi, TCGType data_type)
{
    TCGLabelQemuLdst *ldst;
    HostAddress h;

    ldst = prepare_host_addr(s, &h, addr, oi, false);
    tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));

    if (ldst) {
        ldst->type = data_type;
        ldst->datalo_reg = datalo;
        ldst->datahi_reg = datahi;
        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
    }
}

static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
{
    /* Reuse the zeroing that exists for goto_ptr.
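     * The goto_ptr epilogue entry point already sets %eax to zero, so a
     * zero return value can jump straight there instead of loading it.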
*/ 2566 if (a0 == 0) { 2567 tcg_out_jmp(s, tcg_code_gen_epilogue); 2568 } else { 2569 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0); 2570 tcg_out_jmp(s, tb_ret_addr); 2571 } 2572} 2573 2574static void tcg_out_goto_tb(TCGContext *s, int which) 2575{ 2576 /* 2577 * Jump displacement must be aligned for atomic patching; 2578 * see if we need to add extra nops before jump 2579 */ 2580 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr; 2581 if (gap != 1) { 2582 tcg_out_nopn(s, gap - 1); 2583 } 2584 tcg_out8(s, OPC_JMP_long); /* jmp im */ 2585 set_jmp_insn_offset(s, which); 2586 tcg_out32(s, 0); 2587 set_jmp_reset_offset(s, which); 2588} 2589 2590void tb_target_set_jmp_target(const TranslationBlock *tb, int n, 2591 uintptr_t jmp_rx, uintptr_t jmp_rw) 2592{ 2593 /* patch the branch destination */ 2594 uintptr_t addr = tb->jmp_target_addr[n]; 2595 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4)); 2596 /* no need to flush icache explicitly */ 2597} 2598 2599 2600static void tgen_add(TCGContext *s, TCGType type, 2601 TCGReg a0, TCGReg a1, TCGReg a2) 2602{ 2603 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2604 2605 if (a0 == a1) { 2606 tgen_arithr(s, ARITH_ADD + rexw, a0, a2); 2607 } else if (a0 == a2) { 2608 tgen_arithr(s, ARITH_ADD + rexw, a0, a1); 2609 } else { 2610 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, 0); 2611 } 2612} 2613 2614static void tgen_addi(TCGContext *s, TCGType type, 2615 TCGReg a0, TCGReg a1, tcg_target_long a2) 2616{ 2617 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2618 2619 if (a0 == a1) { 2620 tgen_arithi(s, ARITH_ADD + rexw, a0, a2, false); 2621 } else { 2622 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, -1, 0, a2); 2623 } 2624} 2625 2626static const TCGOutOpBinary outop_add = { 2627 .base.static_constraint = C_O1_I2(r, r, re), 2628 .out_rrr = tgen_add, 2629 .out_rri = tgen_addi, 2630}; 2631 2632static void tgen_and(TCGContext *s, TCGType type, 2633 TCGReg a0, TCGReg a1, TCGReg a2) 2634{ 2635 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2636 tgen_arithr(s, ARITH_AND + rexw, a0, a2); 2637} 2638 2639static void tgen_andi(TCGContext *s, TCGType type, 2640 TCGReg a0, TCGReg a1, tcg_target_long a2) 2641{ 2642 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2643 tgen_arithi(s, ARITH_AND + rexw, a0, a2, false); 2644} 2645 2646static const TCGOutOpBinary outop_and = { 2647 .base.static_constraint = C_O1_I2(r, 0, reZ), 2648 .out_rrr = tgen_and, 2649 .out_rri = tgen_andi, 2650}; 2651 2652static void tgen_andc(TCGContext *s, TCGType type, 2653 TCGReg a0, TCGReg a1, TCGReg a2) 2654{ 2655 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2656 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1); 2657} 2658 2659static TCGConstraintSetIndex cset_andc(TCGType type, unsigned flags) 2660{ 2661 return have_bmi1 ? C_O1_I2(r, r, r) : C_NotImplemented; 2662} 2663 2664static const TCGOutOpBinary outop_andc = { 2665 .base.static_constraint = C_Dynamic, 2666 .base.dynamic_constraint = cset_andc, 2667 .out_rrr = tgen_andc, 2668}; 2669 2670static void tgen_clz(TCGContext *s, TCGType type, 2671 TCGReg a0, TCGReg a1, TCGReg a2) 2672{ 2673 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2674 int jcc; 2675 2676 if (have_lzcnt) { 2677 tcg_out_modrm(s, OPC_LZCNT + rexw, a0, a1); 2678 jcc = JCC_JB; 2679 } else { 2680 /* Recall that the output of BSR is the index not the count. */ 2681 tcg_out_modrm(s, OPC_BSR + rexw, a0, a1); 2682 tgen_arithi(s, ARITH_XOR + rexw, a0, rexw ? 63 : 31, 0); 2683 2684 /* Since we have destroyed the flags from BSR, we have to re-test. 
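         * BSR also leaves the destination undefined when the input is zero,
         * so the CMOV below supplies a2 for that case.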
*/ 2685 jcc = tcg_out_cmp(s, TCG_COND_EQ, a1, 0, 1, rexw); 2686 } 2687 tcg_out_cmov(s, jcc, rexw, a0, a2); 2688} 2689 2690static void tgen_clzi(TCGContext *s, TCGType type, 2691 TCGReg a0, TCGReg a1, tcg_target_long a2) 2692{ 2693 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2694 tcg_out_modrm(s, OPC_LZCNT + rexw, a0, a1); 2695} 2696 2697static TCGConstraintSetIndex cset_clz(TCGType type, unsigned flags) 2698{ 2699 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 2700} 2701 2702static const TCGOutOpBinary outop_clz = { 2703 .base.static_constraint = C_Dynamic, 2704 .base.dynamic_constraint = cset_clz, 2705 .out_rrr = tgen_clz, 2706 .out_rri = tgen_clzi, 2707}; 2708 2709static void tgen_ctpop(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 2710{ 2711 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2712 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1); 2713} 2714 2715static TCGConstraintSetIndex cset_ctpop(TCGType type, unsigned flags) 2716{ 2717 return have_popcnt ? C_O1_I1(r, r) : C_NotImplemented; 2718} 2719 2720static const TCGOutOpUnary outop_ctpop = { 2721 .base.static_constraint = C_Dynamic, 2722 .base.dynamic_constraint = cset_ctpop, 2723 .out_rr = tgen_ctpop, 2724}; 2725 2726static void tgen_ctz(TCGContext *s, TCGType type, 2727 TCGReg a0, TCGReg a1, TCGReg a2) 2728{ 2729 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2730 int jcc; 2731 2732 if (have_bmi1) { 2733 tcg_out_modrm(s, OPC_TZCNT + rexw, a0, a1); 2734 jcc = JCC_JB; 2735 } else { 2736 tcg_out_modrm(s, OPC_BSF + rexw, a0, a1); 2737 jcc = JCC_JE; 2738 } 2739 tcg_out_cmov(s, jcc, rexw, a0, a2); 2740} 2741 2742static void tgen_ctzi(TCGContext *s, TCGType type, 2743 TCGReg a0, TCGReg a1, tcg_target_long a2) 2744{ 2745 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2746 tcg_out_modrm(s, OPC_TZCNT + rexw, a0, a1); 2747} 2748 2749static TCGConstraintSetIndex cset_ctz(TCGType type, unsigned flags) 2750{ 2751 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 2752} 2753 2754static const TCGOutOpBinary outop_ctz = { 2755 .base.static_constraint = C_Dynamic, 2756 .base.dynamic_constraint = cset_ctz, 2757 .out_rrr = tgen_ctz, 2758 .out_rri = tgen_ctzi, 2759}; 2760 2761static const TCGOutOpBinary outop_divs = { 2762 .base.static_constraint = C_NotImplemented, 2763}; 2764 2765static void tgen_divs2(TCGContext *s, TCGType type, 2766 TCGReg a0, TCGReg a1, TCGReg a4) 2767{ 2768 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2769 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, a4); 2770} 2771 2772static const TCGOutOpDivRem outop_divs2 = { 2773 .base.static_constraint = C_O2_I3(a, d, 0, 1, r), 2774 .out_rr01r = tgen_divs2, 2775}; 2776 2777static const TCGOutOpBinary outop_divu = { 2778 .base.static_constraint = C_NotImplemented, 2779}; 2780 2781static void tgen_divu2(TCGContext *s, TCGType type, 2782 TCGReg a0, TCGReg a1, TCGReg a4) 2783{ 2784 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2785 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, a4); 2786} 2787 2788static const TCGOutOpDivRem outop_divu2 = { 2789 .base.static_constraint = C_O2_I3(a, d, 0, 1, r), 2790 .out_rr01r = tgen_divu2, 2791}; 2792 2793static const TCGOutOpBinary outop_eqv = { 2794 .base.static_constraint = C_NotImplemented, 2795}; 2796 2797static void tgen_mul(TCGContext *s, TCGType type, 2798 TCGReg a0, TCGReg a1, TCGReg a2) 2799{ 2800 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 2801 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2); 2802} 2803 2804static void tgen_muli(TCGContext *s, TCGType type, 2805 TCGReg a0, TCGReg a1, tcg_target_long a2) 2806{ 2807 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2808 2809 if (a2 == (int8_t)a2) { 2810 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0); 2811 tcg_out8(s, a2); 2812 } else { 2813 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0); 2814 tcg_out32(s, a2); 2815 } 2816} 2817 2818static const TCGOutOpBinary outop_mul = { 2819 .base.static_constraint = C_O1_I2(r, 0, re), 2820 .out_rrr = tgen_mul, 2821 .out_rri = tgen_muli, 2822}; 2823 2824static void tgen_muls2(TCGContext *s, TCGType type, 2825 TCGReg a0, TCGReg a1, TCGReg a2, TCGReg a3) 2826{ 2827 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2828 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, a3); 2829} 2830 2831static const TCGOutOpMul2 outop_muls2 = { 2832 .base.static_constraint = C_O2_I2(a, d, a, r), 2833 .out_rrrr = tgen_muls2, 2834}; 2835 2836static const TCGOutOpBinary outop_mulsh = { 2837 .base.static_constraint = C_NotImplemented, 2838}; 2839 2840static const TCGOutOpBinary outop_muluh = { 2841 .base.static_constraint = C_NotImplemented, 2842}; 2843 2844static void tgen_mulu2(TCGContext *s, TCGType type, 2845 TCGReg a0, TCGReg a1, TCGReg a2, TCGReg a3) 2846{ 2847 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2848 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, a3); 2849} 2850 2851static const TCGOutOpMul2 outop_mulu2 = { 2852 .base.static_constraint = C_O2_I2(a, d, a, r), 2853 .out_rrrr = tgen_mulu2, 2854}; 2855 2856static const TCGOutOpBinary outop_nand = { 2857 .base.static_constraint = C_NotImplemented, 2858}; 2859 2860static const TCGOutOpBinary outop_nor = { 2861 .base.static_constraint = C_NotImplemented, 2862}; 2863 2864static void tgen_or(TCGContext *s, TCGType type, 2865 TCGReg a0, TCGReg a1, TCGReg a2) 2866{ 2867 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2868 tgen_arithr(s, ARITH_OR + rexw, a0, a2); 2869} 2870 2871static void tgen_ori(TCGContext *s, TCGType type, 2872 TCGReg a0, TCGReg a1, tcg_target_long a2) 2873{ 2874 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2875 tgen_arithi(s, ARITH_OR + rexw, a0, a2, false); 2876} 2877 2878static const TCGOutOpBinary outop_or = { 2879 .base.static_constraint = C_O1_I2(r, 0, re), 2880 .out_rrr = tgen_or, 2881 .out_rri = tgen_ori, 2882}; 2883 2884static const TCGOutOpBinary outop_orc = { 2885 .base.static_constraint = C_NotImplemented, 2886}; 2887 2888static const TCGOutOpBinary outop_rems = { 2889 .base.static_constraint = C_NotImplemented, 2890}; 2891 2892static const TCGOutOpBinary outop_remu = { 2893 .base.static_constraint = C_NotImplemented, 2894}; 2895 2896static void tgen_rotl(TCGContext *s, TCGType type, 2897 TCGReg a0, TCGReg a1, TCGReg a2) 2898{ 2899 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2900 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_ROL, a0); 2901} 2902 2903static void tgen_rotli(TCGContext *s, TCGType type, 2904 TCGReg a0, TCGReg a1, tcg_target_long a2) 2905{ 2906 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2907 tcg_out_shifti(s, SHIFT_ROL + rexw, a0, a2); 2908} 2909 2910static const TCGOutOpBinary outop_rotl = { 2911 .base.static_constraint = C_O1_I2(r, 0, ci), 2912 .out_rrr = tgen_rotl, 2913 .out_rri = tgen_rotli, 2914}; 2915 2916static void tgen_rotr(TCGContext *s, TCGType type, 2917 TCGReg a0, TCGReg a1, TCGReg a2) 2918{ 2919 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 2920 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_ROR, a0); 2921} 2922 2923static void tgen_rotri(TCGContext *s, TCGType type, 2924 TCGReg a0, TCGReg a1, tcg_target_long a2) 2925{ 2926 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2927 tcg_out_shifti(s, SHIFT_ROR + rexw, a0, a2); 2928} 2929 2930static const TCGOutOpBinary outop_rotr = { 2931 .base.static_constraint = C_O1_I2(r, 0, ci), 2932 .out_rrr = tgen_rotr, 2933 .out_rri = tgen_rotri, 2934}; 2935 2936static TCGConstraintSetIndex cset_shift(TCGType type, unsigned flags) 2937{ 2938 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci); 2939} 2940 2941static void tgen_sar(TCGContext *s, TCGType type, 2942 TCGReg a0, TCGReg a1, TCGReg a2) 2943{ 2944 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2945 if (have_bmi2) { 2946 tcg_out_vex_modrm(s, OPC_SARX + rexw, a0, a2, a1); 2947 } else { 2948 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_SAR, a0); 2949 } 2950} 2951 2952static void tgen_sari(TCGContext *s, TCGType type, 2953 TCGReg a0, TCGReg a1, tcg_target_long a2) 2954{ 2955 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2956 2957 tcg_out_mov(s, type, a0, a1); 2958 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, a2); 2959} 2960 2961static const TCGOutOpBinary outop_sar = { 2962 .base.static_constraint = C_Dynamic, 2963 .base.dynamic_constraint = cset_shift, 2964 .out_rrr = tgen_sar, 2965 .out_rri = tgen_sari, 2966}; 2967 2968static void tgen_shl(TCGContext *s, TCGType type, 2969 TCGReg a0, TCGReg a1, TCGReg a2) 2970{ 2971 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2972 if (have_bmi2) { 2973 tcg_out_vex_modrm(s, OPC_SHLX + rexw, a0, a2, a1); 2974 } else { 2975 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_SHL, a0); 2976 } 2977} 2978 2979static void tgen_shli(TCGContext *s, TCGType type, 2980 TCGReg a0, TCGReg a1, tcg_target_long a2) 2981{ 2982 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2983 2984 /* For small constant 3-operand shift, use LEA. */ 2985 if (a0 != a1 && a2 >= 1 && a2 <= 3) { 2986 if (a2 == 1) { 2987 /* shl $1,a1,a0 -> lea (a1,a1),a0 */ 2988 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0); 2989 } else { 2990 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */ 2991 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0); 2992 } 2993 return; 2994 } 2995 tcg_out_mov(s, type, a0, a1); 2996 tcg_out_shifti(s, SHIFT_SHL + rexw, a0, a2); 2997} 2998 2999static const TCGOutOpBinary outop_shl = { 3000 .base.static_constraint = C_Dynamic, 3001 .base.dynamic_constraint = cset_shift, 3002 .out_rrr = tgen_shl, 3003 .out_rri = tgen_shli, 3004}; 3005 3006static void tgen_shr(TCGContext *s, TCGType type, 3007 TCGReg a0, TCGReg a1, TCGReg a2) 3008{ 3009 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3010 if (have_bmi2) { 3011 tcg_out_vex_modrm(s, OPC_SHRX + rexw, a0, a2, a1); 3012 } else { 3013 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_SHR, a0); 3014 } 3015} 3016 3017static void tgen_shri(TCGContext *s, TCGType type, 3018 TCGReg a0, TCGReg a1, tcg_target_long a2) 3019{ 3020 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3021 3022 tcg_out_mov(s, type, a0, a1); 3023 tcg_out_shifti(s, SHIFT_SHR + rexw, a0, a2); 3024} 3025 3026static const TCGOutOpBinary outop_shr = { 3027 .base.static_constraint = C_Dynamic, 3028 .base.dynamic_constraint = cset_shift, 3029 .out_rrr = tgen_shr, 3030 .out_rri = tgen_shri, 3031}; 3032 3033static void tgen_sub(TCGContext *s, TCGType type, 3034 TCGReg a0, TCGReg a1, TCGReg a2) 3035{ 3036 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 3037 tgen_arithr(s, ARITH_SUB + rexw, a0, a2); 3038} 3039 3040static const TCGOutOpSubtract outop_sub = { 3041 .base.static_constraint = C_O1_I2(r, 0, r), 3042 .out_rrr = tgen_sub, 3043}; 3044 3045static void tgen_xor(TCGContext *s, TCGType type, 3046 TCGReg a0, TCGReg a1, TCGReg a2) 3047{ 3048 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3049 tgen_arithr(s, ARITH_XOR + rexw, a0, a2); 3050} 3051 3052static void tgen_xori(TCGContext *s, TCGType type, 3053 TCGReg a0, TCGReg a1, tcg_target_long a2) 3054{ 3055 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3056 tgen_arithi(s, ARITH_XOR + rexw, a0, a2, false); 3057} 3058 3059static const TCGOutOpBinary outop_xor = { 3060 .base.static_constraint = C_O1_I2(r, 0, re), 3061 .out_rrr = tgen_xor, 3062 .out_rri = tgen_xori, 3063}; 3064 3065static void tgen_bswap16(TCGContext *s, TCGType type, 3066 TCGReg a0, TCGReg a1, unsigned flags) 3067{ 3068 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3069 3070 if (flags & TCG_BSWAP_OS) { 3071 /* Output must be sign-extended. */ 3072 if (rexw) { 3073 tcg_out_bswap64(s, a0); 3074 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48); 3075 } else { 3076 tcg_out_bswap32(s, a0); 3077 tcg_out_shifti(s, SHIFT_SAR, a0, 16); 3078 } 3079 } else if ((flags & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) { 3080 /* Output must be zero-extended, but input isn't. */ 3081 tcg_out_bswap32(s, a0); 3082 tcg_out_shifti(s, SHIFT_SHR, a0, 16); 3083 } else { 3084 tcg_out_rolw_8(s, a0); 3085 } 3086} 3087 3088static const TCGOutOpBswap outop_bswap16 = { 3089 .base.static_constraint = C_O1_I1(r, 0), 3090 .out_rr = tgen_bswap16, 3091}; 3092 3093static void tgen_bswap32(TCGContext *s, TCGType type, 3094 TCGReg a0, TCGReg a1, unsigned flags) 3095{ 3096 tcg_out_bswap32(s, a0); 3097 if (flags & TCG_BSWAP_OS) { 3098 tcg_out_ext32s(s, a0, a0); 3099 } 3100} 3101 3102static const TCGOutOpBswap outop_bswap32 = { 3103 .base.static_constraint = C_O1_I1(r, 0), 3104 .out_rr = tgen_bswap32, 3105}; 3106 3107#if TCG_TARGET_REG_BITS == 64 3108static void tgen_bswap64(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 3109{ 3110 tcg_out_bswap64(s, a0); 3111} 3112 3113static const TCGOutOpUnary outop_bswap64 = { 3114 .base.static_constraint = C_O1_I1(r, 0), 3115 .out_rr = tgen_bswap64, 3116}; 3117#endif 3118 3119static void tgen_neg(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 3120{ 3121 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3122 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0); 3123} 3124 3125static const TCGOutOpUnary outop_neg = { 3126 .base.static_constraint = C_O1_I1(r, 0), 3127 .out_rr = tgen_neg, 3128}; 3129 3130static void tgen_not(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 3131{ 3132 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3133 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0); 3134} 3135 3136static const TCGOutOpUnary outop_not = { 3137 .base.static_constraint = C_O1_I1(r, 0), 3138 .out_rr = tgen_not, 3139}; 3140 3141static void tgen_extract(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1, 3142 unsigned ofs, unsigned len) 3143{ 3144 if (ofs == 0) { 3145 switch (len) { 3146 case 8: 3147 tcg_out_ext8u(s, a0, a1); 3148 return; 3149 case 16: 3150 tcg_out_ext16u(s, a0, a1); 3151 return; 3152 case 32: 3153 tcg_out_ext32u(s, a0, a1); 3154 return; 3155 } 3156 } else if (TCG_TARGET_REG_BITS == 64 && ofs + len == 32) { 3157 /* This is a 32-bit zero-extending right shift. 
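         * e.g. extract(x, 8, 24) becomes "movl %a1d,%a0d; shrl $8,%a0d",
         * where the 32-bit operation implicitly clears bits 32..63.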
*/ 3158 tcg_out_mov(s, TCG_TYPE_I32, a0, a1); 3159 tcg_out_shifti(s, SHIFT_SHR, a0, ofs); 3160 return; 3161 } else if (ofs == 8 && len == 8) { 3162 /* 3163 * On the off-chance that we can use the high-byte registers. 3164 * Otherwise we emit the same ext16 + shift pattern that we 3165 * would have gotten from the normal tcg-op.c expansion. 3166 */ 3167 if (a1 < 4 && (TCG_TARGET_REG_BITS == 32 || a0 < 8)) { 3168 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4); 3169 } else { 3170 tcg_out_ext16u(s, a0, a1); 3171 tcg_out_shifti(s, SHIFT_SHR, a0, 8); 3172 } 3173 return; 3174 } 3175 g_assert_not_reached(); 3176} 3177 3178static const TCGOutOpExtract outop_extract = { 3179 .base.static_constraint = C_O1_I1(r, r), 3180 .out_rr = tgen_extract, 3181}; 3182 3183static void tcg_out_op(TCGContext *s, TCGOpcode opc, TCGType type, 3184 const TCGArg args[TCG_MAX_OP_ARGS], 3185 const int const_args[TCG_MAX_OP_ARGS]) 3186{ 3187 TCGArg a0, a1, a2; 3188 int const_a2, rexw; 3189 3190#if TCG_TARGET_REG_BITS == 64 3191# define OP_32_64(x) \ 3192 case glue(glue(INDEX_op_, x), _i64): \ 3193 case glue(glue(INDEX_op_, x), _i32) 3194#else 3195# define OP_32_64(x) \ 3196 case glue(glue(INDEX_op_, x), _i32) 3197#endif 3198 3199 /* Hoist the loads of the most common arguments. */ 3200 a0 = args[0]; 3201 a1 = args[1]; 3202 a2 = args[2]; 3203 const_a2 = const_args[2]; 3204 rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3205 3206 switch (opc) { 3207 case INDEX_op_goto_ptr: 3208 /* jmp to the given host address (could be epilogue) */ 3209 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0); 3210 break; 3211 case INDEX_op_br: 3212 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0); 3213 break; 3214 OP_32_64(ld8u): 3215 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 3216 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2); 3217 break; 3218 OP_32_64(ld8s): 3219 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2); 3220 break; 3221 OP_32_64(ld16u): 3222 /* Note that we can ignore REXW for the zero-extend to 64-bit. 
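         * The 32-bit MOVZWL already clears bits 32..63 of the destination.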
*/ 3223 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2); 3224 break; 3225 OP_32_64(ld16s): 3226 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2); 3227 break; 3228#if TCG_TARGET_REG_BITS == 64 3229 case INDEX_op_ld32u_i64: 3230#endif 3231 case INDEX_op_ld_i32: 3232 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2); 3233 break; 3234 3235 OP_32_64(st8): 3236 if (const_args[0]) { 3237 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2); 3238 tcg_out8(s, a0); 3239 } else { 3240 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2); 3241 } 3242 break; 3243 OP_32_64(st16): 3244 if (const_args[0]) { 3245 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2); 3246 tcg_out16(s, a0); 3247 } else { 3248 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2); 3249 } 3250 break; 3251#if TCG_TARGET_REG_BITS == 64 3252 case INDEX_op_st32_i64: 3253#endif 3254 case INDEX_op_st_i32: 3255 if (const_args[0]) { 3256 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2); 3257 tcg_out32(s, a0); 3258 } else { 3259 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2); 3260 } 3261 break; 3262 3263 case INDEX_op_qemu_ld_i32: 3264 tcg_out_qemu_ld(s, a0, -1, a1, a2, TCG_TYPE_I32); 3265 break; 3266 case INDEX_op_qemu_ld_i64: 3267 if (TCG_TARGET_REG_BITS == 64) { 3268 tcg_out_qemu_ld(s, a0, -1, a1, a2, TCG_TYPE_I64); 3269 } else { 3270 tcg_out_qemu_ld(s, a0, a1, a2, args[3], TCG_TYPE_I64); 3271 } 3272 break; 3273 case INDEX_op_qemu_ld_i128: 3274 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3275 tcg_out_qemu_ld(s, a0, a1, a2, args[3], TCG_TYPE_I128); 3276 break; 3277 3278 case INDEX_op_qemu_st_i32: 3279 case INDEX_op_qemu_st8_i32: 3280 tcg_out_qemu_st(s, a0, -1, a1, a2, TCG_TYPE_I32); 3281 break; 3282 case INDEX_op_qemu_st_i64: 3283 if (TCG_TARGET_REG_BITS == 64) { 3284 tcg_out_qemu_st(s, a0, -1, a1, a2, TCG_TYPE_I64); 3285 } else { 3286 tcg_out_qemu_st(s, a0, a1, a2, args[3], TCG_TYPE_I64); 3287 } 3288 break; 3289 case INDEX_op_qemu_st_i128: 3290 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3291 tcg_out_qemu_st(s, a0, a1, a2, args[3], TCG_TYPE_I128); 3292 break; 3293 3294 OP_32_64(add2): 3295 if (const_args[4]) { 3296 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1); 3297 } else { 3298 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]); 3299 } 3300 if (const_args[5]) { 3301 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1); 3302 } else { 3303 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]); 3304 } 3305 break; 3306 OP_32_64(sub2): 3307 if (const_args[4]) { 3308 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1); 3309 } else { 3310 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]); 3311 } 3312 if (const_args[5]) { 3313 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1); 3314 } else { 3315 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]); 3316 } 3317 break; 3318 3319#if TCG_TARGET_REG_BITS == 64 3320 case INDEX_op_ld32s_i64: 3321 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2); 3322 break; 3323 case INDEX_op_ld_i64: 3324 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2); 3325 break; 3326 case INDEX_op_st_i64: 3327 if (const_args[0]) { 3328 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2); 3329 tcg_out32(s, a0); 3330 } else { 3331 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2); 3332 } 3333 break; 3334 3335 case INDEX_op_extrh_i64_i32: 3336 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32); 3337 break; 3338#endif 3339 3340 OP_32_64(deposit): 3341 if (args[3] == 0 && args[4] == 8) { 3342 /* load bits 0..7 */ 3343 if (const_a2) { 3344 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0), 3345 0, a0, 0); 3346 tcg_out8(s, a2); 3347 } else { 3348 
tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0); 3349 } 3350 } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) { 3351 /* load bits 8..15 */ 3352 if (const_a2) { 3353 tcg_out8(s, OPC_MOVB_Ib + a0 + 4); 3354 tcg_out8(s, a2); 3355 } else { 3356 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4); 3357 } 3358 } else if (args[3] == 0 && args[4] == 16) { 3359 /* load bits 0..15 */ 3360 if (const_a2) { 3361 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0), 3362 0, a0, 0); 3363 tcg_out16(s, a2); 3364 } else { 3365 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0); 3366 } 3367 } else { 3368 g_assert_not_reached(); 3369 } 3370 break; 3371 3372 case INDEX_op_sextract_i64: 3373 if (a2 == 0 && args[3] == 8) { 3374 tcg_out_ext8s(s, TCG_TYPE_I64, a0, a1); 3375 } else if (a2 == 0 && args[3] == 16) { 3376 tcg_out_ext16s(s, TCG_TYPE_I64, a0, a1); 3377 } else if (a2 == 0 && args[3] == 32) { 3378 tcg_out_ext32s(s, a0, a1); 3379 } else { 3380 g_assert_not_reached(); 3381 } 3382 break; 3383 3384 case INDEX_op_sextract_i32: 3385 if (a2 == 0 && args[3] == 8) { 3386 tcg_out_ext8s(s, TCG_TYPE_I32, a0, a1); 3387 } else if (a2 == 0 && args[3] == 16) { 3388 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 3389 } else if (a2 == 8 && args[3] == 8) { 3390 if (a1 < 4 && a0 < 8) { 3391 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4); 3392 } else { 3393 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 3394 tcg_out_shifti(s, SHIFT_SAR, a0, 8); 3395 } 3396 } else { 3397 g_assert_not_reached(); 3398 } 3399 break; 3400 3401 OP_32_64(extract2): 3402 /* Note that SHRD outputs to the r/m operand. */ 3403 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0); 3404 tcg_out8(s, args[3]); 3405 break; 3406 3407 case INDEX_op_mb: 3408 tcg_out_mb(s, a0); 3409 break; 3410 case INDEX_op_call: /* Always emitted via tcg_out_call. */ 3411 case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */ 3412 case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */ 3413 case INDEX_op_ext_i32_i64: /* Always emitted via tcg_reg_alloc_op. */ 3414 case INDEX_op_extu_i32_i64: 3415 case INDEX_op_extrl_i64_i32: 3416 default: 3417 g_assert_not_reached(); 3418 } 3419 3420#undef OP_32_64 3421} 3422 3423static int const umin_insn[4] = { 3424 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ 3425}; 3426 3427static int const umax_insn[4] = { 3428 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ 3429}; 3430 3431static bool tcg_out_cmp_vec_noinv(TCGContext *s, TCGType type, unsigned vece, 3432 TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond) 3433{ 3434 static int const cmpeq_insn[4] = { 3435 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ 3436 }; 3437 static int const cmpgt_insn[4] = { 3438 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ 3439 }; 3440 3441 enum { 3442 NEED_INV = 1, 3443 NEED_SWAP = 2, 3444 NEED_UMIN = 4, 3445 NEED_UMAX = 8, 3446 INVALID = 16, 3447 }; 3448 static const uint8_t cond_fixup[16] = { 3449 [0 ... 
15] = INVALID,
        [TCG_COND_EQ] = 0,
        [TCG_COND_GT] = 0,
        [TCG_COND_NE] = NEED_INV,
        [TCG_COND_LE] = NEED_INV,
        [TCG_COND_LT] = NEED_SWAP,
        [TCG_COND_GE] = NEED_SWAP | NEED_INV,
        [TCG_COND_LEU] = NEED_UMIN,
        [TCG_COND_GTU] = NEED_UMIN | NEED_INV,
        [TCG_COND_GEU] = NEED_UMAX,
        [TCG_COND_LTU] = NEED_UMAX | NEED_INV,
    };
    int fixup = cond_fixup[cond];

    assert(!(fixup & INVALID));

    if (fixup & NEED_INV) {
        cond = tcg_invert_cond(cond);
    }

    if (fixup & NEED_SWAP) {
        TCGReg swap = v1;
        v1 = v2;
        v2 = swap;
        cond = tcg_swap_cond(cond);
    }

    if (fixup & (NEED_UMIN | NEED_UMAX)) {
        int op = (fixup & NEED_UMIN ? umin_insn[vece] : umax_insn[vece]);

        /* avx2 does not have 64-bit min/max; adjusted during expand. */
        assert(vece <= MO_32);

        tcg_out_vex_modrm_type(s, op, TCG_TMP_VEC, v1, v2, type);
        v2 = TCG_TMP_VEC;
        cond = TCG_COND_EQ;
    }

    switch (cond) {
    case TCG_COND_EQ:
        tcg_out_vex_modrm_type(s, cmpeq_insn[vece], v0, v1, v2, type);
        break;
    case TCG_COND_GT:
        tcg_out_vex_modrm_type(s, cmpgt_insn[vece], v0, v1, v2, type);
        break;
    default:
        g_assert_not_reached();
    }
    return fixup & NEED_INV;
}

static void tcg_out_cmp_vec_k1(TCGContext *s, TCGType type, unsigned vece,
                               TCGReg v1, TCGReg v2, TCGCond cond)
{
    static const int cmpm_insn[2][4] = {
        { OPC_VPCMPB, OPC_VPCMPW, OPC_VPCMPD, OPC_VPCMPQ },
        { OPC_VPCMPUB, OPC_VPCMPUW, OPC_VPCMPUD, OPC_VPCMPUQ }
    };
    static const int testm_insn[4] = {
        OPC_VPTESTMB, OPC_VPTESTMW, OPC_VPTESTMD, OPC_VPTESTMQ
    };
    static const int testnm_insn[4] = {
        OPC_VPTESTNMB, OPC_VPTESTNMW, OPC_VPTESTNMD, OPC_VPTESTNMQ
    };

    static const int cond_ext[16] = {
        [TCG_COND_EQ] = 0,
        [TCG_COND_NE] = 4,
        [TCG_COND_LT] = 1,
        [TCG_COND_LTU] = 1,
        [TCG_COND_LE] = 2,
        [TCG_COND_LEU] = 2,
        [TCG_COND_NEVER] = 3,
        [TCG_COND_GE] = 5,
        [TCG_COND_GEU] = 5,
        [TCG_COND_GT] = 6,
        [TCG_COND_GTU] = 6,
        [TCG_COND_ALWAYS] = 7,
    };

    switch (cond) {
    case TCG_COND_TSTNE:
        tcg_out_vex_modrm_type(s, testm_insn[vece], /* k1 */ 1, v1, v2, type);
        break;
    case TCG_COND_TSTEQ:
        tcg_out_vex_modrm_type(s, testnm_insn[vece], /* k1 */ 1, v1, v2, type);
        break;
    default:
        tcg_out_vex_modrm_type(s, cmpm_insn[is_unsigned_cond(cond)][vece],
                               /* k1 */ 1, v1, v2, type);
        tcg_out8(s, cond_ext[cond]);
        break;
    }
}

static void tcg_out_k1_to_vec(TCGContext *s, TCGType type,
                              unsigned vece, TCGReg dest)
{
    static const int movm_insn[] = {
        OPC_VPMOVM2B, OPC_VPMOVM2W, OPC_VPMOVM2D, OPC_VPMOVM2Q
    };
    tcg_out_vex_modrm_type(s, movm_insn[vece], dest, 0, /* k1 */ 1, type);
}

static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond)
{
    /*
     * With avx512, we have a complete set of comparisons into mask.
     * Unless there's a single insn expansion for the comparison,
     * expand via a mask in k1.
     */
    if ((vece <= MO_16 ?
have_avx512bw : have_avx512dq) 3562 && cond != TCG_COND_EQ 3563 && cond != TCG_COND_LT 3564 && cond != TCG_COND_GT) { 3565 tcg_out_cmp_vec_k1(s, type, vece, v1, v2, cond); 3566 tcg_out_k1_to_vec(s, type, vece, v0); 3567 return; 3568 } 3569 3570 if (tcg_out_cmp_vec_noinv(s, type, vece, v0, v1, v2, cond)) { 3571 tcg_out_dupi_vec(s, type, vece, TCG_TMP_VEC, -1); 3572 tcg_out_vex_modrm_type(s, OPC_PXOR, v0, v0, TCG_TMP_VEC, type); 3573 } 3574} 3575 3576static void tcg_out_cmpsel_vec_k1(TCGContext *s, TCGType type, unsigned vece, 3577 TCGReg v0, TCGReg c1, TCGReg c2, 3578 TCGReg v3, TCGReg v4, TCGCond cond) 3579{ 3580 static const int vpblendm_insn[] = { 3581 OPC_VPBLENDMB, OPC_VPBLENDMW, OPC_VPBLENDMD, OPC_VPBLENDMQ 3582 }; 3583 bool z = false; 3584 3585 /* Swap to place constant in V4 to take advantage of zero-masking. */ 3586 if (!v3) { 3587 z = true; 3588 v3 = v4; 3589 cond = tcg_invert_cond(cond); 3590 } 3591 3592 tcg_out_cmp_vec_k1(s, type, vece, c1, c2, cond); 3593 tcg_out_evex_modrm_type(s, vpblendm_insn[vece], v0, v4, v3, 3594 /* k1 */1, z, type); 3595} 3596 3597static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece, 3598 TCGReg v0, TCGReg c1, TCGReg c2, 3599 TCGReg v3, TCGReg v4, TCGCond cond) 3600{ 3601 bool inv; 3602 3603 if (vece <= MO_16 ? have_avx512bw : have_avx512vl) { 3604 tcg_out_cmpsel_vec_k1(s, type, vece, v0, c1, c2, v3, v4, cond); 3605 return; 3606 } 3607 3608 inv = tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond); 3609 3610 /* 3611 * Since XMM0 is 16, the only way we get 0 into V3 3612 * is via the constant zero constraint. 3613 */ 3614 if (!v3) { 3615 if (inv) { 3616 tcg_out_vex_modrm_type(s, OPC_PAND, v0, TCG_TMP_VEC, v4, type); 3617 } else { 3618 tcg_out_vex_modrm_type(s, OPC_PANDN, v0, TCG_TMP_VEC, v4, type); 3619 } 3620 } else { 3621 if (inv) { 3622 TCGReg swap = v3; 3623 v3 = v4; 3624 v4 = swap; 3625 } 3626 tcg_out_vex_modrm_type(s, OPC_VPBLENDVB, v0, v4, v3, type); 3627 tcg_out8(s, (TCG_TMP_VEC - TCG_REG_XMM0) << 4); 3628 } 3629} 3630 3631static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, 3632 unsigned vecl, unsigned vece, 3633 const TCGArg args[TCG_MAX_OP_ARGS], 3634 const int const_args[TCG_MAX_OP_ARGS]) 3635{ 3636 static int const add_insn[4] = { 3637 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ 3638 }; 3639 static int const ssadd_insn[4] = { 3640 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 3641 }; 3642 static int const usadd_insn[4] = { 3643 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2 3644 }; 3645 static int const sub_insn[4] = { 3646 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ 3647 }; 3648 static int const sssub_insn[4] = { 3649 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 3650 }; 3651 static int const ussub_insn[4] = { 3652 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2 3653 }; 3654 static int const mul_insn[4] = { 3655 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ 3656 }; 3657 static int const shift_imm_insn[4] = { 3658 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib 3659 }; 3660 static int const punpckl_insn[4] = { 3661 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ 3662 }; 3663 static int const punpckh_insn[4] = { 3664 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ 3665 }; 3666 static int const packss_insn[4] = { 3667 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 3668 }; 3669 static int const packus_insn[4] = { 3670 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 3671 }; 3672 static int const smin_insn[4] = { 3673 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ 3674 }; 3675 static int const smax_insn[4] = 
{ 3676 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ 3677 }; 3678 static int const rotlv_insn[4] = { 3679 OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ 3680 }; 3681 static int const rotrv_insn[4] = { 3682 OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ 3683 }; 3684 static int const shlv_insn[4] = { 3685 OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ 3686 }; 3687 static int const shrv_insn[4] = { 3688 OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ 3689 }; 3690 static int const sarv_insn[4] = { 3691 OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ 3692 }; 3693 static int const shls_insn[4] = { 3694 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ 3695 }; 3696 static int const shrs_insn[4] = { 3697 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ 3698 }; 3699 static int const sars_insn[4] = { 3700 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ 3701 }; 3702 static int const vpshldi_insn[4] = { 3703 OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ 3704 }; 3705 static int const vpshldv_insn[4] = { 3706 OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ 3707 }; 3708 static int const vpshrdv_insn[4] = { 3709 OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ 3710 }; 3711 static int const abs_insn[4] = { 3712 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ 3713 }; 3714 3715 TCGType type = vecl + TCG_TYPE_V64; 3716 int insn, sub; 3717 TCGArg a0, a1, a2, a3; 3718 3719 a0 = args[0]; 3720 a1 = args[1]; 3721 a2 = args[2]; 3722 3723 switch (opc) { 3724 case INDEX_op_add_vec: 3725 insn = add_insn[vece]; 3726 goto gen_simd; 3727 case INDEX_op_ssadd_vec: 3728 insn = ssadd_insn[vece]; 3729 goto gen_simd; 3730 case INDEX_op_usadd_vec: 3731 insn = usadd_insn[vece]; 3732 goto gen_simd; 3733 case INDEX_op_sub_vec: 3734 insn = sub_insn[vece]; 3735 goto gen_simd; 3736 case INDEX_op_sssub_vec: 3737 insn = sssub_insn[vece]; 3738 goto gen_simd; 3739 case INDEX_op_ussub_vec: 3740 insn = ussub_insn[vece]; 3741 goto gen_simd; 3742 case INDEX_op_mul_vec: 3743 insn = mul_insn[vece]; 3744 goto gen_simd; 3745 case INDEX_op_and_vec: 3746 insn = OPC_PAND; 3747 goto gen_simd; 3748 case INDEX_op_or_vec: 3749 insn = OPC_POR; 3750 goto gen_simd; 3751 case INDEX_op_xor_vec: 3752 insn = OPC_PXOR; 3753 goto gen_simd; 3754 case INDEX_op_smin_vec: 3755 insn = smin_insn[vece]; 3756 goto gen_simd; 3757 case INDEX_op_umin_vec: 3758 insn = umin_insn[vece]; 3759 goto gen_simd; 3760 case INDEX_op_smax_vec: 3761 insn = smax_insn[vece]; 3762 goto gen_simd; 3763 case INDEX_op_umax_vec: 3764 insn = umax_insn[vece]; 3765 goto gen_simd; 3766 case INDEX_op_shlv_vec: 3767 insn = shlv_insn[vece]; 3768 goto gen_simd; 3769 case INDEX_op_shrv_vec: 3770 insn = shrv_insn[vece]; 3771 goto gen_simd; 3772 case INDEX_op_sarv_vec: 3773 insn = sarv_insn[vece]; 3774 goto gen_simd; 3775 case INDEX_op_rotlv_vec: 3776 insn = rotlv_insn[vece]; 3777 goto gen_simd; 3778 case INDEX_op_rotrv_vec: 3779 insn = rotrv_insn[vece]; 3780 goto gen_simd; 3781 case INDEX_op_shls_vec: 3782 insn = shls_insn[vece]; 3783 goto gen_simd; 3784 case INDEX_op_shrs_vec: 3785 insn = shrs_insn[vece]; 3786 goto gen_simd; 3787 case INDEX_op_sars_vec: 3788 insn = sars_insn[vece]; 3789 goto gen_simd; 3790 case INDEX_op_x86_punpckl_vec: 3791 insn = punpckl_insn[vece]; 3792 goto gen_simd; 3793 case INDEX_op_x86_punpckh_vec: 3794 insn = punpckh_insn[vece]; 3795 goto gen_simd; 3796 case INDEX_op_x86_packss_vec: 3797 insn = packss_insn[vece]; 3798 goto gen_simd; 3799 case INDEX_op_x86_packus_vec: 3800 insn = packus_insn[vece]; 3801 goto gen_simd; 3802 case INDEX_op_x86_vpshldv_vec: 3803 insn = vpshldv_insn[vece]; 3804 a1 = a2; 3805 a2 
= args[3]; 3806 goto gen_simd; 3807 case INDEX_op_x86_vpshrdv_vec: 3808 insn = vpshrdv_insn[vece]; 3809 a1 = a2; 3810 a2 = args[3]; 3811 goto gen_simd; 3812#if TCG_TARGET_REG_BITS == 32 3813 case INDEX_op_dup2_vec: 3814 /* First merge the two 32-bit inputs to a single 64-bit element. */ 3815 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2); 3816 /* Then replicate the 64-bit elements across the rest of the vector. */ 3817 if (type != TCG_TYPE_V64) { 3818 tcg_out_dup_vec(s, type, MO_64, a0, a0); 3819 } 3820 break; 3821#endif 3822 case INDEX_op_abs_vec: 3823 insn = abs_insn[vece]; 3824 a2 = a1; 3825 a1 = 0; 3826 goto gen_simd; 3827 gen_simd: 3828 tcg_debug_assert(insn != OPC_UD2); 3829 tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type); 3830 break; 3831 3832 case INDEX_op_cmp_vec: 3833 tcg_out_cmp_vec(s, type, vece, a0, a1, a2, args[3]); 3834 break; 3835 3836 case INDEX_op_cmpsel_vec: 3837 tcg_out_cmpsel_vec(s, type, vece, a0, a1, a2, 3838 args[3], args[4], args[5]); 3839 break; 3840 3841 case INDEX_op_andc_vec: 3842 insn = OPC_PANDN; 3843 tcg_out_vex_modrm_type(s, insn, a0, a2, a1, type); 3844 break; 3845 3846 case INDEX_op_shli_vec: 3847 insn = shift_imm_insn[vece]; 3848 sub = 6; 3849 goto gen_shift; 3850 case INDEX_op_shri_vec: 3851 insn = shift_imm_insn[vece]; 3852 sub = 2; 3853 goto gen_shift; 3854 case INDEX_op_sari_vec: 3855 if (vece == MO_64) { 3856 insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX; 3857 } else { 3858 insn = shift_imm_insn[vece]; 3859 } 3860 sub = 4; 3861 goto gen_shift; 3862 case INDEX_op_rotli_vec: 3863 insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ 3864 if (vece == MO_64) { 3865 insn |= P_VEXW; 3866 } 3867 sub = 1; 3868 goto gen_shift; 3869 gen_shift: 3870 tcg_debug_assert(vece != MO_8); 3871 tcg_out_vex_modrm_type(s, insn, sub, a0, a1, type); 3872 tcg_out8(s, a2); 3873 break; 3874 3875 case INDEX_op_ld_vec: 3876 tcg_out_ld(s, type, a0, a1, a2); 3877 break; 3878 case INDEX_op_st_vec: 3879 tcg_out_st(s, type, a0, a1, a2); 3880 break; 3881 case INDEX_op_dupm_vec: 3882 tcg_out_dupm_vec(s, type, vece, a0, a1, a2); 3883 break; 3884 3885 case INDEX_op_x86_shufps_vec: 3886 insn = OPC_SHUFPS; 3887 sub = args[3]; 3888 goto gen_simd_imm8; 3889 case INDEX_op_x86_blend_vec: 3890 if (vece == MO_16) { 3891 insn = OPC_PBLENDW; 3892 } else if (vece == MO_32) { 3893 insn = (have_avx2 ? 
    case INDEX_op_not_vec:
        insn = OPC_VPTERNLOGQ;
        a2 = a1;
        sub = 0x33; /* !B */
        goto gen_simd_imm8;
    case INDEX_op_nor_vec:
        insn = OPC_VPTERNLOGQ;
        sub = 0x11; /* norCB */
        goto gen_simd_imm8;
    case INDEX_op_nand_vec:
        insn = OPC_VPTERNLOGQ;
        sub = 0x77; /* nandCB */
        goto gen_simd_imm8;
    case INDEX_op_eqv_vec:
        insn = OPC_VPTERNLOGQ;
        sub = 0x99; /* xnorCB */
        goto gen_simd_imm8;
    case INDEX_op_orc_vec:
        insn = OPC_VPTERNLOGQ;
        sub = 0xdd; /* orB!C */
        goto gen_simd_imm8;

    case INDEX_op_bitsel_vec:
        insn = OPC_VPTERNLOGQ;
        a3 = args[3];
        if (a0 == a1) {
            a1 = a2;
            a2 = a3;
            sub = 0xca; /* A?B:C */
        } else if (a0 == a2) {
            a2 = a3;
            sub = 0xe2; /* B?A:C */
        } else {
            tcg_out_mov(s, type, a0, a3);
            sub = 0xb8; /* B?C:A */
        }
        goto gen_simd_imm8;

    gen_simd_imm8:
        tcg_debug_assert(insn != OPC_UD2);
        tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type);
        tcg_out8(s, sub);
        break;

    case INDEX_op_x86_psrldq_vec:
        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
        tcg_out8(s, a2);
        break;

    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov. */
    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec. */
    default:
        g_assert_not_reached();
    }
}

static TCGConstraintSetIndex
tcg_target_op_def(TCGOpcode op, TCGType type, unsigned flags)
{
    switch (op) {
    case INDEX_op_goto_ptr:
        return C_O0_I1(r);

    case INDEX_op_ld8u_i32:
    case INDEX_op_ld8u_i64:
    case INDEX_op_ld8s_i32:
    case INDEX_op_ld8s_i64:
    case INDEX_op_ld16u_i32:
    case INDEX_op_ld16u_i64:
    case INDEX_op_ld16s_i32:
    case INDEX_op_ld16s_i64:
    case INDEX_op_ld_i32:
    case INDEX_op_ld32u_i64:
    case INDEX_op_ld32s_i64:
    case INDEX_op_ld_i64:
        return C_O1_I1(r, r);

    case INDEX_op_st8_i32:
    case INDEX_op_st8_i64:
        return C_O0_I2(qi, r);

    case INDEX_op_st16_i32:
    case INDEX_op_st16_i64:
    case INDEX_op_st_i32:
    case INDEX_op_st32_i64:
        return C_O0_I2(ri, r);

    case INDEX_op_st_i64:
        return C_O0_I2(re, r);

    case INDEX_op_extrh_i64_i32:
        return C_O1_I1(r, 0);

    case INDEX_op_ext_i32_i64:
    case INDEX_op_extu_i32_i64:
    case INDEX_op_extrl_i64_i32:
    case INDEX_op_sextract_i32:
    case INDEX_op_sextract_i64:
        return C_O1_I1(r, r);

    case INDEX_op_extract2_i32:
    case INDEX_op_extract2_i64:
        return C_O1_I2(r, 0, r);

    case INDEX_op_deposit_i32:
    case INDEX_op_deposit_i64:
        return C_O1_I2(q, 0, qi);

    case INDEX_op_add2_i32:
    case INDEX_op_add2_i64:
    case INDEX_op_sub2_i32:
    case INDEX_op_sub2_i64:
        return C_N1_O1_I4(r, r, 0, 1, re, re);

    case INDEX_op_qemu_ld_i32:
        return C_O1_I1(r, L);

    case INDEX_op_qemu_st_i32:
        return C_O0_I2(L, L);
    case INDEX_op_qemu_st8_i32:
        return C_O0_I2(s, L);

    case INDEX_op_qemu_ld_i64:
        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);

    case INDEX_op_qemu_st_i64:
        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);

    case INDEX_op_qemu_ld_i128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
        return C_O2_I1(r, r, L);
    case INDEX_op_qemu_st_i128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
        return C_O0_I3(L, L, L);

    case INDEX_op_ld_vec:
    case INDEX_op_dupm_vec:
        return C_O1_I1(x, r);

    case INDEX_op_st_vec:
        return C_O0_I2(x, r);

    case INDEX_op_add_vec:
    case INDEX_op_sub_vec:
    case INDEX_op_mul_vec:
    case INDEX_op_and_vec:
    case INDEX_op_or_vec:
    case INDEX_op_xor_vec:
    case INDEX_op_andc_vec:
    case INDEX_op_orc_vec:
    case INDEX_op_nand_vec:
    case INDEX_op_nor_vec:
    case INDEX_op_eqv_vec:
    case INDEX_op_ssadd_vec:
    case INDEX_op_usadd_vec:
    case INDEX_op_sssub_vec:
    case INDEX_op_ussub_vec:
    case INDEX_op_smin_vec:
    case INDEX_op_umin_vec:
    case INDEX_op_smax_vec:
    case INDEX_op_umax_vec:
    case INDEX_op_shlv_vec:
    case INDEX_op_shrv_vec:
    case INDEX_op_sarv_vec:
    case INDEX_op_rotlv_vec:
    case INDEX_op_rotrv_vec:
    case INDEX_op_shls_vec:
    case INDEX_op_shrs_vec:
    case INDEX_op_sars_vec:
    case INDEX_op_cmp_vec:
    case INDEX_op_x86_shufps_vec:
    case INDEX_op_x86_blend_vec:
    case INDEX_op_x86_packss_vec:
    case INDEX_op_x86_packus_vec:
    case INDEX_op_x86_vperm2i128_vec:
    case INDEX_op_x86_punpckl_vec:
    case INDEX_op_x86_punpckh_vec:
    case INDEX_op_x86_vpshldi_vec:
#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_dup2_vec:
#endif
        return C_O1_I2(x, x, x);

    case INDEX_op_abs_vec:
    case INDEX_op_dup_vec:
    case INDEX_op_not_vec:
    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
    case INDEX_op_sari_vec:
    case INDEX_op_rotli_vec:
    case INDEX_op_x86_psrldq_vec:
        return C_O1_I1(x, x);

    case INDEX_op_x86_vpshldv_vec:
    case INDEX_op_x86_vpshrdv_vec:
        return C_O1_I3(x, 0, x, x);

    case INDEX_op_bitsel_vec:
        return C_O1_I3(x, x, x, x);
    case INDEX_op_cmpsel_vec:
        return C_O1_I4(x, x, x, xO, x);

    default:
        return C_NotImplemented;
    }
}
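
/*
 * Report what the backend can do with a given vector opcode: a return of 1
 * means the opcode is supported directly for this element size, -1 means
 * it is supported only after being rewritten by tcg_expand_vec_op() below,
 * and 0 means it is not supported at all and generic code must fall back
 * to some other expansion.
 */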
int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
{
    switch (opc) {
    case INDEX_op_add_vec:
    case INDEX_op_sub_vec:
    case INDEX_op_and_vec:
    case INDEX_op_or_vec:
    case INDEX_op_xor_vec:
    case INDEX_op_andc_vec:
    case INDEX_op_orc_vec:
    case INDEX_op_nand_vec:
    case INDEX_op_nor_vec:
    case INDEX_op_eqv_vec:
    case INDEX_op_not_vec:
    case INDEX_op_bitsel_vec:
        return 1;
    case INDEX_op_cmp_vec:
    case INDEX_op_cmpsel_vec:
        return -1;

    case INDEX_op_rotli_vec:
        return have_avx512vl && vece >= MO_32 ? 1 : -1;

    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
        /* We must expand the operation for MO_8. */
        return vece == MO_8 ? -1 : 1;

    case INDEX_op_sari_vec:
        switch (vece) {
        case MO_8:
            return -1;
        case MO_16:
        case MO_32:
            return 1;
        case MO_64:
            if (have_avx512vl) {
                return 1;
            }
            /*
             * We can emulate this for MO_64, but it does not pay off
             * unless we're producing at least 4 values.
             */
            return type >= TCG_TYPE_V256 ? -1 : 0;
        }
        return 0;

    case INDEX_op_shls_vec:
    case INDEX_op_shrs_vec:
        return vece >= MO_16;
    case INDEX_op_sars_vec:
        switch (vece) {
        case MO_16:
        case MO_32:
            return 1;
        case MO_64:
            return have_avx512vl;
        }
        return 0;
    case INDEX_op_rotls_vec:
        return vece >= MO_16 ? -1 : 0;

    case INDEX_op_shlv_vec:
    case INDEX_op_shrv_vec:
        switch (vece) {
        case MO_16:
            return have_avx512bw;
        case MO_32:
        case MO_64:
            return have_avx2;
        }
        return 0;
    case INDEX_op_sarv_vec:
        switch (vece) {
        case MO_16:
            return have_avx512bw;
        case MO_32:
            return have_avx2;
        case MO_64:
            return have_avx512vl;
        }
        return 0;
    case INDEX_op_rotlv_vec:
    case INDEX_op_rotrv_vec:
        switch (vece) {
        case MO_16:
            return have_avx512vbmi2 ? -1 : 0;
        case MO_32:
        case MO_64:
            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
        }
        return 0;

    case INDEX_op_mul_vec:
        switch (vece) {
        case MO_8:
            return -1;
        case MO_64:
            return have_avx512dq;
        }
        return 1;

    case INDEX_op_ssadd_vec:
    case INDEX_op_usadd_vec:
    case INDEX_op_sssub_vec:
    case INDEX_op_ussub_vec:
        return vece <= MO_16;
    case INDEX_op_smin_vec:
    case INDEX_op_smax_vec:
    case INDEX_op_umin_vec:
    case INDEX_op_umax_vec:
    case INDEX_op_abs_vec:
        return vece <= MO_32 || have_avx512vl;

    default:
        return 0;
    }
}
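
/*
 * There is no x86 instruction that shifts each byte of a vector by an
 * immediate, so an MO_8 shift is expanded as the corresponding MO_16
 * shift (letting bits spill across byte boundaries) followed by a mask
 * that clears the bits that leaked in from the neighbouring byte.  For
 * example, a left shift by 3 keeps only the bits selected by
 * (uint8_t)(0xff << 3) == 0xf8 in each byte, which is exactly the mask
 * computed below.
 */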
static void expand_vec_shi(TCGType type, unsigned vece, bool right,
                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    uint8_t mask;

    tcg_debug_assert(vece == MO_8);
    if (right) {
        mask = 0xff >> imm;
        tcg_gen_shri_vec(MO_16, v0, v1, imm);
    } else {
        mask = 0xff << imm;
        tcg_gen_shli_vec(MO_16, v0, v1, imm);
    }
    tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask));
}

static void expand_vec_sari(TCGType type, unsigned vece,
                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t1, t2;

    switch (vece) {
    case MO_8:
        /* Unpack to 16-bit, shift, and repack. */
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case MO_64:
        t1 = tcg_temp_new_vec(type);
        if (imm <= 32) {
            /*
             * We can emulate a small sign extend by performing an arithmetic
             * 32-bit shift and overwriting the high half of a 64-bit logical
             * shift.  Note that the ISA says shift of 32 is valid, but TCG
             * does not, so we have to bound the smaller shift -- we get the
             * same result in the high half either way.
             */
            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
                      tcgv_vec_arg(t1), 0xaa);
        } else {
            /*
             * Otherwise we will need to use a compare vs 0 to produce
             * the sign-extend, shift and merge.
             */
            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
                            tcg_constant_vec(type, MO_64, 0), v1);
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
            tcg_gen_or_vec(MO_64, v0, v0, t1);
        }
        tcg_temp_free_vec(t1);
        break;

    default:
        g_assert_not_reached();
    }
}
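
/*
 * Only AVX-512 provides vector rotate instructions, so the expanders below
 * otherwise fall back on the usual identity
 *     rotl(x, n) == (x << n) | (x >> (width - n)),
 * with the variable-count forms deriving the complementary count from the
 * element width.  When AVX512VBMI2 is available, the double-shift
 * instructions (VPSHLD/VPSHRD) produce the rotate directly, by shifting a
 * value that has been concatenated with itself.
 */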
static void expand_vec_rotli(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t;

    if (vece != MO_8 && have_avx512vbmi2) {
        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
        return;
    }

    t = tcg_temp_new_vec(type);
    tcg_gen_shli_vec(vece, t, v1, imm);
    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
    tcg_gen_or_vec(vece, v0, v0, t);
    tcg_temp_free_vec(t);
}

static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
                            TCGv_vec v1, TCGv_vec sh, bool right)
{
    TCGv_vec t;

    if (have_avx512vbmi2) {
        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
        return;
    }

    t = tcg_temp_new_vec(type);
    tcg_gen_dupi_vec(vece, t, 8 << vece);
    tcg_gen_sub_vec(vece, t, t, sh);
    if (right) {
        tcg_gen_shlv_vec(vece, t, v1, t);
        tcg_gen_shrv_vec(vece, v0, v1, sh);
    } else {
        tcg_gen_shrv_vec(vece, t, v1, t);
        tcg_gen_shlv_vec(vece, v0, v1, sh);
    }
    tcg_gen_or_vec(vece, v0, v0, t);
    tcg_temp_free_vec(t);
}

static void expand_vec_rotls(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
{
    TCGv_vec t = tcg_temp_new_vec(type);

    tcg_debug_assert(vece != MO_8);

    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
        tcg_gen_dup_i32_vec(vece, t, lsh);
        if (vece >= MO_32) {
            tcg_gen_rotlv_vec(vece, v0, v1, t);
        } else {
            expand_vec_rotv(type, vece, v0, v1, t, false);
        }
    } else {
        TCGv_i32 rsh = tcg_temp_new_i32();

        tcg_gen_neg_i32(rsh, lsh);
        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
        tcg_gen_shls_vec(vece, t, v1, lsh);
        tcg_gen_shrs_vec(vece, v0, v1, rsh);
        tcg_gen_or_vec(vece, v0, v0, t);

        tcg_temp_free_i32(rsh);
    }

    tcg_temp_free_vec(t);
}
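
/*
 * As a concrete example of the MO_8 multiply expansion below, take one
 * byte lane with x = 0x12 and y = 0x34.  Unpacking gives the 16-bit lanes
 * 0x0012 and 0x3400; their 16-bit product is 0xa800 (the low byte of
 * 0x12 * 0x34 = 0x03a8, shifted into the high byte).  The logical right
 * shift by 8 turns that into 0x00a8, and the unsigned saturating pack
 * stores 0xa8 -- the expected low byte of the product -- back into the
 * byte lane.
 */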
static void expand_vec_mul(TCGType type, unsigned vece,
                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
{
    TCGv_vec t1, t2, t3, t4, zero;

    tcg_debug_assert(vece == MO_8);

    /*
     * Unpack v1 bytes to words, 0 | x.
     * Unpack v2 bytes to words, y | 0.
     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
     * Shift logical right by 8 bits to clear the high 8 bits before
     * using an unsigned saturated pack.
     *
     * The difference between the V64, V128 and V256 cases is merely how
     * we distribute the expansion between temporaries.
     */
    switch (type) {
    case TCG_TYPE_V64:
        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case TCG_TYPE_V128:
    case TCG_TYPE_V256:
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        t3 = tcg_temp_new_vec(type);
        t4 = tcg_temp_new_vec(type);
        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_mul_vec(MO_16, t3, t3, t4);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        tcg_gen_shri_vec(MO_16, t3, t3, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        tcg_temp_free_vec(t3);
        tcg_temp_free_vec(t4);
        break;

    default:
        g_assert_not_reached();
    }
}
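
/*
 * The bias used below is the standard trick for reducing an unsigned
 * comparison to a signed one: subtracting the sign bit (1 << 63 for MO_64)
 * from both operands flips the top bit, so the unsigned ordering of the
 * originals matches the signed ordering of the biased values.  In 8-bit
 * terms, for instance, 0xff (255, unsigned max) becomes 0x7f (signed max)
 * while 0x00 becomes 0x80 (signed min).
 */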
static TCGCond expand_vec_cond(TCGType type, unsigned vece,
                               TCGArg *a1, TCGArg *a2, TCGCond cond)
{
    /*
     * Without AVX512, there are no 64-bit unsigned comparisons.
     * We must bias the inputs so that they become signed.
     * All other swapping and inversion are handled during code generation.
     */
    if (vece == MO_64 && !have_avx512dq && is_unsigned_cond(cond)) {
        TCGv_vec v1 = temp_tcgv_vec(arg_temp(*a1));
        TCGv_vec v2 = temp_tcgv_vec(arg_temp(*a2));
        TCGv_vec t1 = tcg_temp_new_vec(type);
        TCGv_vec t2 = tcg_temp_new_vec(type);
        TCGv_vec t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));

        tcg_gen_sub_vec(vece, t1, v1, t3);
        tcg_gen_sub_vec(vece, t2, v2, t3);
        *a1 = tcgv_vec_arg(t1);
        *a2 = tcgv_vec_arg(t2);
        cond = tcg_signed_cond(cond);
    }
    return cond;
}

static void expand_vec_cmp(TCGType type, unsigned vece, TCGArg a0,
                           TCGArg a1, TCGArg a2, TCGCond cond)
{
    cond = expand_vec_cond(type, vece, &a1, &a2, cond);
    /* Expand directly; do not recurse. */
    vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
}

static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGArg a0,
                              TCGArg a1, TCGArg a2,
                              TCGArg a3, TCGArg a4, TCGCond cond)
{
    cond = expand_vec_cond(type, vece, &a1, &a2, cond);
    /* Expand directly; do not recurse. */
    vec_gen_6(INDEX_op_cmpsel_vec, type, vece, a0, a1, a2, a3, a4, cond);
}

void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                       TCGArg a0, ...)
{
    va_list va;
    TCGArg a1, a2, a3, a4, a5;
    TCGv_vec v0, v1, v2;

    va_start(va, a0);
    a1 = va_arg(va, TCGArg);
    a2 = va_arg(va, TCGArg);
    v0 = temp_tcgv_vec(arg_temp(a0));
    v1 = temp_tcgv_vec(arg_temp(a1));

    switch (opc) {
    case INDEX_op_shli_vec:
        expand_vec_shi(type, vece, false, v0, v1, a2);
        break;
    case INDEX_op_shri_vec:
        expand_vec_shi(type, vece, true, v0, v1, a2);
        break;
    case INDEX_op_sari_vec:
        expand_vec_sari(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotli_vec:
        expand_vec_rotli(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotls_vec:
        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
        break;

    case INDEX_op_rotlv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, false);
        break;
    case INDEX_op_rotrv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, true);
        break;

    case INDEX_op_mul_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_mul(type, vece, v0, v1, v2);
        break;

    case INDEX_op_cmp_vec:
        a3 = va_arg(va, TCGArg);
        expand_vec_cmp(type, vece, a0, a1, a2, a3);
        break;

    case INDEX_op_cmpsel_vec:
        a3 = va_arg(va, TCGArg);
        a4 = va_arg(va, TCGArg);
        a5 = va_arg(va, TCGArg);
        expand_vec_cmpsel(type, vece, a0, a1, a2, a3, a4, a5);
        break;

    default:
        break;
    }

    va_end(va);
}

static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};

/* Compute frame size via macros, to share between tcg_target_qemu_prologue
   and tcg_register_jit. */

#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
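
/*
 * PUSH_SIZE counts the return address plus each pushed callee-saved
 * register, and FRAME_SIZE rounds the total frame up to the next multiple
 * of TCG_TARGET_STACK_ALIGN using the usual (x + align - 1) & ~(align - 1)
 * idiom; with 16-byte alignment, for example, a raw size of 56 bytes is
 * rounded up to 64.
 */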
/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* TB prologue */

    /* Reserve some stack space, also for TCG temps. */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers. */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

    if (!tcg_use_softmmu && guest_base) {
        int seg = setup_guest_base_seg();
        if (seg != 0) {
            x86_guest_base.seg = seg;
        } else if (guest_base == (int32_t)guest_base) {
            x86_guest_base.ofs = guest_base;
        } else {
            assert(TCG_TARGET_REG_BITS == 64);
            /* Choose R12 because, as a base, it requires a SIB byte. */
            x86_guest_base.index = TCG_REG_R12;
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
        }
    }

    if (TCG_TARGET_REG_BITS == 32) {
        tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
                   (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb. */
        tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                             (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                             + stack_addend);
    } else {
        tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb. */
        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
    }

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);

    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
}

static void tcg_out_tb_start(TCGContext *s)
{
    /* nothing to do */
}

static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
    memset(p, 0x90, count);
}

static void tcg_target_init(TCGContext *s)
{
    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
    if (TCG_TARGET_REG_BITS == 64) {
        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
    }
    if (have_avx1) {
        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
    }
    if (have_avx2) {
        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
    }

    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
    if (TCG_TARGET_REG_BITS == 64) {
#if !defined(_WIN64)
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
#endif
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
    }

    s->reserved_regs = 0;
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
#ifdef _WIN64
    /* These are call saved, and we don't save them, so don't use them. */
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
#endif
}

typedef struct {
    DebugFrameHeader h;
    uint8_t fde_def_cfa[4];
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/* We're expecting a 2 byte uleb128 encoded value. */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
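
/*
 * The FDEs below encode FRAME_SIZE as that two-byte uleb128: the first
 * byte holds the low 7 bits with the continuation bit (0x80) set, the
 * second byte holds the next 7 bits.  A value of 424, for example, would
 * be emitted as the bytes 0xa8 0x03.
 */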
#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member. */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs. */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member. */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs. */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif

#if defined(ELF_HOST_MACHINE)
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif