/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Used for function call generation. */
#define TCG_TARGET_STACK_ALIGN 16
#if defined(_WIN64)
#define TCG_TARGET_CALL_STACK_OFFSET 32
#else
#define TCG_TARGET_CALL_STACK_OFFSET 0
#endif
#define TCG_TARGET_CALL_ARG_I32   TCG_CALL_ARG_NORMAL
#define TCG_TARGET_CALL_ARG_I64   TCG_CALL_ARG_NORMAL
#if defined(_WIN64)
# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_BY_REF
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_BY_VEC
#elif TCG_TARGET_REG_BITS == 64
# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_NORMAL
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_NORMAL
#else
# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_NORMAL
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_BY_REF
#endif

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saved, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};

#define TCG_TMP_VEC  TCG_REG_XMM5

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};

static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
{
    switch (kind) {
    case TCG_CALL_RET_NORMAL:
        tcg_debug_assert(slot >= 0 && slot <= 1);
        return slot ? TCG_REG_EDX : TCG_REG_EAX;
#ifdef _WIN64
    case TCG_CALL_RET_BY_VEC:
        tcg_debug_assert(slot == 0);
        return TCG_REG_XMM0;
#endif
    default:
        g_assert_not_reached();
    }
}

/* Constants we accept.  */
#define TCG_CT_CONST_S32   0x100
#define TCG_CT_CONST_U32   0x200
#define TCG_CT_CONST_I32   0x400
#define TCG_CT_CONST_WSZ   0x800
#define TCG_CT_CONST_TST   0x1000
#define TCG_CT_CONST_ZERO  0x2000

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS   0x0000ffffu
# define ALL_VECTOR_REGS    0xffff0000u
# define ALL_BYTEL_REGS     ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS   0x000000ffu
# define ALL_VECTOR_REGS    0x00ff0000u
# define ALL_BYTEL_REGS     0x0000000fu
#endif
#define SOFTMMU_RESERVE_REGS \
    (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)

#define have_bmi2       (cpuinfo & CPUINFO_BMI2)
#define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)

static const tcg_insn_unit *tb_ret_addr;

static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
                        intptr_t value, intptr_t addend)
{
    value += addend;
    switch(type) {
    case R_386_PC32:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int32_t)value) {
            return false;
        }
        /* FALLTHRU */
    case R_386_32:
        tcg_patch32(code_ptr, value);
        break;
    case R_386_PC8:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int8_t)value) {
            return false;
        }
        tcg_patch8(code_ptr, value);
        break;
    default:
        g_assert_not_reached();
    }
    return true;
}

/* test if a constant matches the constraint */
static bool tcg_target_const_match(int64_t val, int ct,
                                   TCGType type, TCGCond cond, int vece)
{
    if (ct & TCG_CT_CONST) {
        return 1;
    }
    if (type == TCG_TYPE_I32) {
        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 |
                  TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) {
            return 1;
        }
    } else {
        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
            return 1;
        }
        /*
         * This will be used in combination with TCG_CT_CONST_S32,
         * so "normal" TESTQ is already matched.  Also accept:
         *    TESTQ -> TESTL   (uint32_t)
         *    TESTQ -> BT      (is_power_of_2)
         */
        if ((ct & TCG_CT_CONST_TST)
            && is_tst_cond(cond)
            && (val == (uint32_t)val || is_power_of_2(val))) {
            return 1;
        }
    }
    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_ZERO) && val == 0) {
        return 1;
    }
    return 0;
}

# define LOWREGMASK(x)  ((x) & 7)

#define P_EXT           0x100       /* 0x0f opcode prefix */
#define P_EXT38         0x200       /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400       /* 0x66 opcode prefix */
#define P_VEXW          0x1000      /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW      /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000      /* REG field as byte register */
# define P_REXB_RM      0x4000      /* R/M field as byte register */
# define P_GS           0x8000      /* gs segment override */
#else
# define P_REXW         0
# define P_REXB_R       0
# define P_REXB_RM      0
# define P_GS           0
#endif
#define P_EXT3A         0x10000     /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000     /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000     /* 0xf2 opcode prefix */
#define P_VEXL          0x80000     /* Set VEX.L = 1 */
#define P_EVEX          0x100000    /* Requires EVEX encoding */

#define OPC_ARITH_EbIb  (0x80)
#define OPC_ARITH_EvIz  (0x81)
#define OPC_ARITH_EvIb  (0x83)
#define OPC_ARITH_GvEv  (0x03)      /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP       (0xc8 | P_EXT)
#define OPC_CALL_Jz     (0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32     (0x48)
#define OPC_IMUL_GvEv   (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32     (0x40)
#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short   (0x70)          /* ... plus condition code */
#define OPC_JMP_long    (0xe9)
#define OPC_JMP_short   (0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz   (0xc7)
#define OPC_MOVB_Ib     (0xb0)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL      (0xbe | P_EXT)
#define OPC_MOVSWL      (0xbf | P_EXT)
#define OPC_MOVSLQ      (0x63 | P_REXW)
#define OPC_MOVZBL      (0xb6 | P_EXT)
#define OPC_MOVZWL      (0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
#define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32     (0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32    (0x50)
#define OPC_PUSH_Iv     (0x68)
#define OPC_PUSH_Ib     (0x6a)
#define OPC_RET         (0xc3)
#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1     (0xd1)
#define OPC_SHIFT_Ib    (0xc1)
#define OPC_SHIFT_cl    (0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTB       (0x84)
#define OPC_TESTL       (0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPBLENDMB   (0x66 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPBLENDMW   (0x66 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPBLENDMD   (0x64 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPBLENDMQ   (0x64 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPCMPB      (0x3f | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPUB     (0x3e | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPW      (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPCMPUW     (0x3e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPCMPD      (0x1f | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPUD     (0x1e | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPQ      (0x1f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPCMPUQ     (0x1e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS  (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD  (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB  (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW  (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD  (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ  (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPMOVM2B    (0x28 | P_EXT38 | P_SIMDF3 | P_EVEX)
#define OPC_VPMOVM2W    (0x28 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
#define OPC_VPMOVM2D    (0x38 | P_EXT38 | P_SIMDF3 | P_EVEX)
#define OPC_VPMOVM2Q    (0x38 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPTESTMB    (0x26 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPTESTMW    (0x26 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPTESTMD    (0x27 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPTESTMQ    (0x27 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPTESTNMB   (0x26 | P_EXT38 | P_SIMDF3 | P_EVEX)
#define OPC_VPTESTNMW   (0x26 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
#define OPC_VPTESTNMD   (0x27 | P_EXT38 | P_SIMDF3 | P_EVEX)
#define OPC_VPTESTNMQ   (0x27 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)
#define OPC_XCHG_EvGv   (0x87)

#define OPC_GRP3_Eb     (0xf6)
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
#define OPC_GRPBT       (0xba | P_EXT)

#define OPC_GRPBT_BT    4
#define OPC_GRPBT_BTS   5
#define OPC_GRPBT_BTR   6
#define OPC_GRPBT_BTC   7

/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_TESTi 0
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev   0
#define EXT5_DEC_Ev   1
#define EXT5_CALLN_Ev 2
#define EXT5_JMPN_Ev  4

/* Condition codes to be added to OPC_JCC_{long,short}.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf

static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
    [TCG_COND_TSTEQ] = JCC_JE,
    [TCG_COND_TSTNE] = JCC_JNE,
};

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
    int rex;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    if (opc & P_DATA16) {
        /* We should never be asking for both 16 and 64-bit operation.  */
        tcg_debug_assert((opc & P_REXW) == 0);
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    rex = 0;
    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
    rex |= (r & 8) >> 1;                /* REX.R */
    rex |= (x & 8) >> 2;                /* REX.X */
    rex |= (rm & 8) >> 3;               /* REX.B */

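    /*
     * As an illustration (not emitted literally): with P_REXW set,
     * r = TCG_REG_R9 and rm = TCG_REG_RAX, the expressions above give
     * rex = 0x8 | 0x4 = 0xc, so the prefix byte written below is
     * 0x40 | 0xc = 0x4c.
     */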
    /* P_REXB_{R,RM} indicates that the given register is the low byte.
       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
       as otherwise the encoding indicates %[abcd]h.  Note that the values
       that are ORed in merely indicate that the REX byte must be present;
       those bits get discarded in output.  */
    rex |= opc & (r >= 4 ? P_REXB_R : 0);
    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);

    if (rex) {
        tcg_out8(s, (uint8_t)(rex | 0x40));
    }

    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }

    tcg_out8(s, opc);
}
#else
static void tcg_out_opc(TCGContext *s, int opc)
{
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }
    tcg_out8(s, opc);
}
/* Discard the register arguments to tcg_out_opc early, so as not to penalize
   the 32-bit compilation paths.  This method works with all versions of gcc,
   whereas relying on optimization may not be able to exclude them.  */
#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
#endif

static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
{
    tcg_out_opc(s, opc, r, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}

static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
                            int rm, int index)
{
    int tmp;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    /* Use the two byte form if possible, which cannot encode
       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
        && ((rm | index) & 8) == 0) {
        /* Two byte VEX prefix.  */
        tcg_out8(s, 0xc5);

        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
    } else {
        /* Three byte VEX prefix.  */
        tcg_out8(s, 0xc4);

        /* VEX.m-mmmm */
        if (opc & P_EXT3A) {
            tmp = 3;
        } else if (opc & P_EXT38) {
            tmp = 2;
        } else if (opc & P_EXT) {
            tmp = 1;
        } else {
            g_assert_not_reached();
        }
        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
        tcg_out8(s, tmp);

        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
    }

    tmp |= (opc & P_VEXL ? 0x04 : 0);          /* VEX.L */
    /* VEX.pp */
    if (opc & P_DATA16) {
        tmp |= 1;                              /* 0x66 */
    } else if (opc & P_SIMDF3) {
        tmp |= 2;                              /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        tmp |= 3;                              /* 0xf2 */
    }
    tmp |= (~v & 15) << 3;                     /* VEX.vvvv */
    tcg_out8(s, tmp);
    tcg_out8(s, opc);
}
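
/*
 * Worked example (illustrative, following the two byte path above):
 * OPC_PXOR is 0xef | P_EXT | P_DATA16.  Emitting it for xmm1, xmm2, xmm3
 * produces 0xc5, then a byte packing ~R, ~vvvv, L and pp (0x80 | 0x68 |
 * 0x00 | 0x01 = 0xe9), then the opcode 0xef; the caller then appends the
 * modrm byte, giving c5 e9 ef cb, i.e. vpxor %xmm3, %xmm2, %xmm1.
 */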

static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
                             int rm, int index, int aaa, bool z)
{
    /* The entire 4-byte evex prefix; with R' and V' set.  */
    uint32_t p = 0x08041062;
    int mm, pp;

    tcg_debug_assert(have_avx512vl);

    /* EVEX.mm */
    if (opc & P_EXT3A) {
        mm = 3;
    } else if (opc & P_EXT38) {
        mm = 2;
    } else if (opc & P_EXT) {
        mm = 1;
    } else {
        g_assert_not_reached();
    }

    /* EVEX.pp */
    if (opc & P_DATA16) {
        pp = 1;                          /* 0x66 */
    } else if (opc & P_SIMDF3) {
        pp = 2;                          /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        pp = 3;                          /* 0xf2 */
    } else {
        pp = 0;
    }

    p = deposit32(p, 8, 2, mm);
    p = deposit32(p, 13, 1, (rm & 8) == 0);    /* EVEX.RXB.B */
    p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */
    p = deposit32(p, 15, 1, (r & 8) == 0);     /* EVEX.RXB.R */
    p = deposit32(p, 16, 2, pp);
    p = deposit32(p, 19, 4, ~v);
    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
    p = deposit32(p, 24, 3, aaa);
    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
    p = deposit32(p, 31, 1, z);

    tcg_out32(s, p);
    tcg_out8(s, opc);
}

static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
{
    if (opc & P_EVEX) {
        tcg_out_evex_opc(s, opc, r, v, rm, 0, 0, false);
    } else {
        tcg_out_vex_opc(s, opc, r, v, rm, 0);
    }
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}

static void tcg_out_vex_modrm_type(TCGContext *s, int opc,
                                   int r, int v, int rm, TCGType type)
{
    if (type == TCG_TYPE_V256) {
        opc |= P_VEXL;
    }
    tcg_out_vex_modrm(s, opc, r, v, rm);
}

static void tcg_out_evex_modrm_type(TCGContext *s, int opc, int r, int v,
                                    int rm, int aaa, bool z, TCGType type)
{
    if (type == TCG_TYPE_V256) {
        opc |= P_VEXL;
    }
    tcg_out_evex_opc(s, opc, r, v, rm, 0, aaa, z);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}

/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
   We handle either RM and INDEX missing with a negative value.  In 64-bit
   mode for absolute addresses, ~RM is the size of the immediate operand
   that will follow the instruction.  */

static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
                               int shift, intptr_t offset)
{
    int mod, len;

    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding.  */
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
            if (disp == (int32_t)disp) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                tcg_out32(s, disp);
                return;
            }

            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing.  */
            if (offset == (int32_t)offset) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);
                return;
            }

            /* ??? The memory isn't directly addressable.  */
            g_assert_not_reached();
        } else {
            /* Absolute address.  */
            tcg_out8(s, (r << 3) | 5);
            tcg_out32(s, offset);
            return;
        }
    }

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing.  */
    if (rm < 0) {
        mod = 0, len = 4, rm = 5;
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
        mod = 0, len = 0;
    } else if (offset == (int8_t)offset) {
        mod = 0x40, len = 1;
    } else {
        mod = 0x80, len = 4;
    }

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form.  */
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
        /* Single byte MODRM format.  */
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    } else {
        /* Two byte MODRM+SIB format.  */

        /* Note that the encoding that would place %esp into the index
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index.  */
        if (index < 0) {
            index = 4;
        } else {
            tcg_debug_assert(index != TCG_REG_ESP);
        }

        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
    }

    if (len == 1) {
        tcg_out8(s, offset);
    } else if (len == 4) {
        tcg_out32(s, offset);
    }
}

static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                                     int index, int shift, intptr_t offset)
{
    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
                                         int rm, int index, int shift,
                                         intptr_t offset)
{
    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

/* A simplification of the above with no index or shift.  */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
{
    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}

static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
                                            int v, int rm, intptr_t offset)
{
    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
}

/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_opc(s, opc, r, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}

/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}
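
/*
 * Note (illustrative): the modrm byte emitted by the two pool helpers above
 * uses mod = 00 with rm = 101, which the architecture defines as disp32 on
 * i386 and RIP-relative disp32 on x86_64; the zero displacement is later
 * patched through an R_386_32 or R_386_PC32 relocation against the
 * constant pool entry created by the caller.
 */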

/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
    /* Propagate an opcode prefix, such as P_REXW.  */
    int ext = subop & ~0x7;
    subop &= 0x7;

    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
}

static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
    int rexw = 0;

    if (arg == ret) {
        return true;
    }
    switch (type) {
    case TCG_TYPE_I64:
        rexw = P_REXW;
        /* fallthru */
    case TCG_TYPE_I32:
        if (ret < 16) {
            if (arg < 16) {
                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
            }
        } else {
            if (arg < 16) {
                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
            }
        }
        break;

    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
        break;

    default:
        g_assert_not_reached();
    }
    return true;
}

static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};

static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg r, TCGReg a)
{
    if (have_avx2) {
        tcg_out_vex_modrm_type(s, avx2_dup_insn[vece], r, 0, a, type);
    } else {
        switch (vece) {
        case MO_8:
            /* ??? With zero in a register, use PSHUFB.  */
            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_16:
            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_32:
            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
            /* imm8 operand: all output lanes selected from input lane 0.  */
            tcg_out8(s, 0);
            break;
        case MO_64:
            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg r, TCGReg base, intptr_t offset)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
                                 r, 0, base, offset);
    } else {
        switch (vece) {
        case MO_64:
            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
            break;
        case MO_32:
            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
            break;
        case MO_16:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        case MO_8:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg ret, int64_t arg)
{
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
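    /*
     * Note: the two special cases below materialize all-zeros with PXOR
     * and all-ones with PCMPEQB of a register against itself; every other
     * constant is loaded from the constant pool via a broadcast or
     * MOVQ/MOVDDUP.
     */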

    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
        return;
    }

    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
        if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
        }
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    } else {
        if (type == TCG_TYPE_V64) {
            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
        } else if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
        }
        if (TCG_TARGET_REG_BITS == 64) {
            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
        } else {
            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
        }
    }
}

static void tcg_out_movi_vec(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
        return;
    }

    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
    if (TCG_TARGET_REG_BITS == 64) {
        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    } else {
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    }
}

static void tcg_out_movi_int(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    tcg_target_long diff;

    if (arg == 0) {
        tgen_arithr(s, ARITH_XOR, ret, ret);
        return;
    }
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
        tcg_out32(s, arg);
        return;
    }
    if (arg == (int32_t)arg) {
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
        tcg_out32(s, arg);
        return;
    }

    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}

static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
#endif
        if (ret < 16) {
            tcg_out_movi_int(s, type, ret, arg);
        } else {
            tcg_out_movi_vec(s, type, ret, arg);
        }
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
    return true;
}

static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
                             tcg_target_long imm)
{
    /* This function is only used for passing structs by reference.  */
    tcg_debug_assert(imm == (int32_t)imm);
    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        g_assert_not_reached();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we only need to care about
       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the sse insn.  */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}

static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         *
         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
         * for _WIN64, which must have SSE2 but may not have AVX.
         */
        tcg_debug_assert(arg >= 16);
        if (have_avx1) {
            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
        }
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    } else if (type != TCG_TYPE_I32) {
        return false;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}

static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16.  */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}

static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}

static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
{
    if (TCG_TARGET_REG_BITS == 32 && src >= 4) {
        tcg_out_mov(s, TCG_TYPE_I32, dest, src);
        if (dest >= 4) {
            tcg_out_modrm(s, OPC_ARITH_EvIz, ARITH_AND, dest);
            tcg_out32(s, 0xff);
            return;
        }
        src = dest;
    }
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}

static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;

    if (TCG_TARGET_REG_BITS == 32 && src >= 4) {
        tcg_out_mov(s, TCG_TYPE_I32, dest, src);
        if (dest >= 4) {
            tcg_out_shifti(s, SHIFT_SHL, dest, 24);
            tcg_out_shifti(s, SHIFT_SAR, dest, 24);
            return;
        }
        src = dest;
    }
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
}

static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* movzwl */
    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
}

static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    /* movsw[lq] */
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
}

static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* 32-bit mov zero extends.  */
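    /* (On x86-64, writing a 32-bit register clears bits 63:32, so a plain
       MOVL without REX.W is sufficient and no MOVZX is needed.)  */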
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
}

static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
}

static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32s(s, dest, src);
}

static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    if (dest != src) {
        tcg_out_ext32u(s, dest, src);
    }
}

static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32u(s, dest, src);
}

static inline void tcg_out_bswap64(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
}

static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64) {
        rexw = c & -8;
        c &= 7;
    }

    switch (c) {
    case ARITH_ADD:
    case ARITH_SUB:
        if (!cf) {
            /*
             * ??? While INC is 2 bytes shorter than ADDL $1, they also induce
             * partial flags update stalls on Pentium4 and are not recommended
             * by current Intel optimization manuals.
             */
            if (val == 1 || val == -1) {
                int is_inc = (c == ARITH_ADD) ^ (val < 0);
                if (TCG_TARGET_REG_BITS == 64) {
                    /*
                     * The single-byte increment encodings are re-tasked
                     * as the REX prefixes.  Use the MODRM encoding.
                     */
                    tcg_out_modrm(s, OPC_GRP5 + rexw,
                                  (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
                } else {
                    tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
                }
                return;
            }
            if (val == 128) {
                /*
                 * Facilitate using an 8-bit immediate.  Carry is inverted
                 * by this transformation, so do it only if cf == 0.
                 */
                c ^= ARITH_ADD ^ ARITH_SUB;
                val = -128;
            }
        }
        break;

    case ARITH_AND:
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation.  */
                rexw = 0;
            }
        }
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
        break;

    case ARITH_OR:
    case ARITH_XOR:
        if (val >= 0x80 && val <= 0xff
            && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
            tcg_out8(s, val);
            return;
        }
        break;
    }

    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    g_assert_not_reached();
}

static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
{
    if (val != 0) {
        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
    }
}
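
/*
 * Illustrative examples of the immediate forms chosen by tgen_arithi above,
 * assuming cf == 0: ADD/SUB of 1 or -1 become INC/DEC encodings, AND $0xff
 * can become a MOVZBL zero-extension (given a byte-addressable register),
 * and any other value that fits in int8_t uses the sign-extended
 * OPC_ARITH_EvIb (0x83) form.
 */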

/* Set SMALL to force a short forward branch.  */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
{
    int32_t val, val1;

    if (l->has_value) {
        val = tcg_pcrel_diff(s, l->u.value_ptr);
        val1 = val - 2;
        if ((int8_t)val1 == val1) {
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_short);
            } else {
                tcg_out8(s, OPC_JCC_short + opc);
            }
            tcg_out8(s, val1);
        } else {
            tcg_debug_assert(!small);
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_long);
                tcg_out32(s, val - 5);
            } else {
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
                tcg_out32(s, val - 6);
            }
        }
    } else if (small) {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_short);
        } else {
            tcg_out8(s, OPC_JCC_short + opc);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
        s->code_ptr += 1;
    } else {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_long);
        } else {
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
        s->code_ptr += 4;
    }
}

static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1,
                       TCGArg arg2, int const_arg2, int rexw)
{
    int jz, js;

    if (!is_tst_cond(cond)) {
        if (!const_arg2) {
            tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
        } else if (arg2 == 0) {
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tcg_debug_assert(!rexw || arg2 == (int32_t)arg2);
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
        return tcg_cond_to_jcc[cond];
    }

    jz = tcg_cond_to_jcc[cond];
    js = (cond == TCG_COND_TSTNE ? JCC_JS : JCC_JNS);

    if (!const_arg2) {
        tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2);
        return jz;
    }

    if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) {
        if (arg2 == 0x80) {
            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
            return js;
        }
        if (arg2 == 0xff) {
            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
            return jz;
        }
        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1);
        tcg_out8(s, arg2);
        return jz;
    }

    if ((arg2 & ~0xff00) == 0 && arg1 < 4) {
        if (arg2 == 0x8000) {
            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
            return js;
        }
        if (arg2 == 0xff00) {
            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
            return jz;
        }
        tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4);
        tcg_out8(s, arg2 >> 8);
        return jz;
    }

    if (arg2 == 0xffff) {
        tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1);
        return jz;
    }
    if (arg2 == 0xffffffffu) {
        tcg_out_modrm(s, OPC_TESTL, arg1, arg1);
        return jz;
    }

    if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) {
        int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE);
        int sh = ctz64(arg2);

        rexw = (sh & 32 ? P_REXW : 0);
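        /*
         * Example (illustrative): a 64-bit TSTNE against 0x8000000000000000
         * has sh == 63, so the single-bit test below uses TESTQ of the
         * register against itself and the sign flag, while a test against
         * e.g. 0x100 (sh == 8) uses BT $8 and the carry flag.
         */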
        if ((sh & 31) == 31) {
            tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1);
            return js;
        } else {
            tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1);
            tcg_out8(s, sh);
            return jc;
        }
    }

    if (rexw) {
        if (arg2 == (uint32_t)arg2) {
            rexw = 0;
        } else {
            tcg_debug_assert(arg2 == (int32_t)arg2);
        }
    }
    tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1);
    tcg_out32(s, arg2);
    return jz;
}

static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
                           TCGArg arg1, TCGArg arg2, int const_arg2,
                           TCGLabel *label, bool small)
{
    int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw);
    tcg_out_jxx(s, jcc, label, small);
}

#if TCG_TARGET_REG_BITS == 32
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, bool small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);
    TCGCond cond = args[4];

    switch (cond) {
    case TCG_COND_EQ:
    case TCG_COND_TSTEQ:
        tcg_out_brcond(s, 0, tcg_invert_cond(cond),
                       args[0], args[2], const_args[2], label_next, 1);
        tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
                       label_this, small);
        break;

    case TCG_COND_NE:
    case TCG_COND_TSTNE:
        tcg_out_brcond(s, 0, cond, args[0], args[2], const_args[2],
                       label_this, small);
        tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
                       label_this, small);
        break;

    default:
        tcg_out_brcond(s, 0, tcg_high_cond(cond), args[1],
                       args[3], const_args[3], label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, tcg_unsigned_cond(cond), args[0],
                       args[2], const_args[2], label_this, small);
        break;
    }
    tcg_out_label(s, label_next);
}
#endif

static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
                            TCGArg dest, TCGArg arg1, TCGArg arg2,
                            int const_arg2, bool neg)
{
    int cmp_rexw = rexw;
    bool inv = false;
    bool cleared;
    int jcc;

    switch (cond) {
    case TCG_COND_NE:
        inv = true;
        /* fall through */
    case TCG_COND_EQ:
        /* If arg2 is 0, convert to LTU/GEU vs 1. */
        if (const_arg2 && arg2 == 0) {
            arg2 = 1;
            goto do_ltu;
        }
        break;

    case TCG_COND_TSTNE:
        inv = true;
        /* fall through */
    case TCG_COND_TSTEQ:
        /* If arg2 is -1, convert to LTU/GEU vs 1. */
        if (const_arg2 && arg2 == 0xffffffffu) {
            arg2 = 1;
            cmp_rexw = 0;
            goto do_ltu;
        }
        break;

    case TCG_COND_LEU:
        inv = true;
        /* fall through */
    case TCG_COND_GTU:
        /* If arg2 is a register, swap for LTU/GEU. */
        if (!const_arg2) {
            TCGReg t = arg1;
            arg1 = arg2;
            arg2 = t;
            goto do_ltu;
        }
        break;

    case TCG_COND_GEU:
        inv = true;
        /* fall through */
    case TCG_COND_LTU:
    do_ltu:
        /*
         * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
         * We can then use NEG or INC to produce the desired result.
         * This is always smaller than the SETCC expansion.
         */
        tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw);

        /* X - X - C = -C = (C ? -1 : 0) */
        tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
        if (inv && neg) {
            /* ~(C ? -1 : 0) = (C ? 0 : -1) */
            tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
        } else if (inv) {
            /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
            tgen_arithi(s, ARITH_ADD, dest, 1, 0);
        } else if (!neg) {
            /* -(C ? -1 : 0) = (C ? 1 : 0) */
            tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
        }
        return;

    case TCG_COND_GE:
        inv = true;
        /* fall through */
    case TCG_COND_LT:
        /* If arg2 is 0, extract the sign bit. */
        if (const_arg2 && arg2 == 0) {
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1);
            if (inv) {
                tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
            }
            tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
                           dest, rexw ? 63 : 31);
            return;
        }
        break;

    default:
        break;
    }

    /*
     * If dest does not overlap the inputs, clearing it first is preferred.
     * The XOR breaks any false dependency for the low-byte write to dest,
     * and is also one byte smaller than MOVZBL.
     */
    cleared = false;
    if (dest != arg1 && (const_arg2 || dest != arg2)) {
        tgen_arithr(s, ARITH_XOR, dest, dest);
        cleared = true;
    }

    jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw);
    tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest);

    if (!cleared) {
        tcg_out_ext8u(s, dest, dest);
    }
    if (neg) {
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
    }
}

#if TCG_TARGET_REG_BITS == 32
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg new_args[6];
    TCGLabel *label_true, *label_over;

    memcpy(new_args, args+1, 5*sizeof(TCGArg));

    if (args[0] == args[1] || args[0] == args[2]
        || (!const_args[3] && args[0] == args[3])
        || (!const_args[4] && args[0] == args[4])) {
        /* When the destination overlaps with one of the argument
           registers, don't do anything tricky.  */
        label_true = gen_new_label();
        label_over = gen_new_label();

        new_args[5] = label_arg(label_true);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
        tcg_out_jxx(s, JCC_JMP, label_over, 1);
        tcg_out_label(s, label_true);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
        tcg_out_label(s, label_over);
    } else {
        /* When the destination does not overlap one of the arguments,
           clear the destination first, jump if cond false, and emit an
           increment in the true case.  This results in smaller code.  */

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        new_args[4] = tcg_invert_cond(new_args[4]);
        new_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over);
    }
}
#endif

static void tcg_out_cmov(TCGContext *s, int jcc, int rexw,
                         TCGReg dest, TCGReg v1)
{
    tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1);
}

static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
                            TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
                            TCGReg v1)
{
    int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw);
    tcg_out_cmov(s, jcc, rexw, dest, v1);
}

static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_bmi1) {
        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(dest != arg2);
        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
        tcg_out_cmov(s, JCC_JE, rexw, dest, arg2);
    }
}

static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_lzcnt) {
        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(!const_a2);
        tcg_debug_assert(dest != arg1);
        tcg_debug_assert(dest != arg2);

        /* Recall that the output of BSR is the index not the count.  */
        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);

        /* Since we have destroyed the flags from BSR, we have to re-test.  */
        int jcc = tcg_out_cmp(s, TCG_COND_EQ, arg1, 0, 1, rexw);
        tcg_out_cmov(s, jcc, rexw, dest, arg2);
    }
}

static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
{
    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;

    if (disp == (int32_t)disp) {
        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
        tcg_out32(s, disp);
    } else {
        /* rip-relative addressing into the constant pool.
           This is 6 + 8 = 14 bytes, as compared to using an
           immediate load 10 + 6 = 16 bytes, plus we may
           be able to re-use the pool constant for more calls.  */
        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
        tcg_out32(s, 0);
    }
}

static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
                         const TCGHelperInfo *info)
{
    tcg_out_branch(s, 1, dest);

#ifndef _WIN32
    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
        /*
         * The sysv i386 abi for struct return places a reference as the
         * first argument of the stack, and pops that argument with the
         * return statement.  Since we want to retain the aligned stack
         * pointer for the callee, we do not want to actually push that
         * argument before the call but rely on the normal store to the
         * stack slot.  But we do need to compensate for the pop in order
But we do need to compensate for the pop in order 1931 * to reset our correct stack pointer value. 1932 * Pushing a garbage value back onto the stack is quickest. 1933 */ 1934 tcg_out_push(s, TCG_REG_EAX); 1935 } 1936#endif 1937} 1938 1939static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest) 1940{ 1941 tcg_out_branch(s, 0, dest); 1942} 1943 1944static void tcg_out_nopn(TCGContext *s, int n) 1945{ 1946 int i; 1947 /* Emit 1 or 2 operand size prefixes for the standard one byte nop, 1948 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the 1949 * duplicate prefix, and all of the interesting recent cores can 1950 * decode and discard the duplicates in a single cycle. 1951 */ 1952 tcg_debug_assert(n >= 1); 1953 for (i = 1; i < n; ++i) { 1954 tcg_out8(s, 0x66); 1955 } 1956 tcg_out8(s, 0x90); 1957} 1958 1959typedef struct { 1960 TCGReg base; 1961 int index; 1962 int ofs; 1963 int seg; 1964 TCGAtomAlign aa; 1965} HostAddress; 1966 1967bool tcg_target_has_memory_bswap(MemOp memop) 1968{ 1969 TCGAtomAlign aa; 1970 1971 if (!have_movbe) { 1972 return false; 1973 } 1974 if ((memop & MO_SIZE) < MO_128) { 1975 return true; 1976 } 1977 1978 /* 1979 * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA, 1980 * but do allow a pair of 64-bit operations, i.e. MOVBEQ. 1981 */ 1982 aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true); 1983 return aa.atom < MO_128; 1984} 1985 1986/* 1987 * Because i686 has no register parameters and because x86_64 has xchg 1988 * to handle addr/data register overlap, we have placed all input arguments 1989 * before we need might need a scratch reg. 1990 * 1991 * Even then, a scratch is only needed for l->raddr. Rather than expose 1992 * a general-purpose scratch when we don't actually know it's available, 1993 * use the ra_gen hook to load into RAX if needed. 1994 */ 1995#if TCG_TARGET_REG_BITS == 64 1996static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg) 1997{ 1998 if (arg < 0) { 1999 arg = TCG_REG_RAX; 2000 } 2001 tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr); 2002 return arg; 2003} 2004static const TCGLdstHelperParam ldst_helper_param = { 2005 .ra_gen = ldst_ra_gen 2006}; 2007#else 2008static const TCGLdstHelperParam ldst_helper_param = { }; 2009#endif 2010 2011static void tcg_out_vec_to_pair(TCGContext *s, TCGType type, 2012 TCGReg l, TCGReg h, TCGReg v) 2013{ 2014 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2015 2016 /* vpmov{d,q} %v, %l */ 2017 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l); 2018 /* vpextr{d,q} $1, %v, %h */ 2019 tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h); 2020 tcg_out8(s, 1); 2021} 2022 2023static void tcg_out_pair_to_vec(TCGContext *s, TCGType type, 2024 TCGReg v, TCGReg l, TCGReg h) 2025{ 2026 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 2027 2028 /* vmov{d,q} %l, %v */ 2029 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l); 2030 /* vpinsr{d,q} $1, %h, %v, %v */ 2031 tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h); 2032 tcg_out8(s, 1); 2033} 2034 2035/* 2036 * Generate code for the slow path for a load at the end of block 2037 */ 2038static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2039{ 2040 MemOp opc = get_memop(l->oi); 2041 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2042 2043 /* resolve label address */ 2044 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2045 if (label_ptr[1]) { 2046 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2047 } 2048 2049 tcg_out_ld_helper_args(s, l, &ldst_helper_param); 2050 tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]); 2051 tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param); 2052 2053 tcg_out_jmp(s, l->raddr); 2054 return true; 2055} 2056 2057/* 2058 * Generate code for the slow path for a store at the end of block 2059 */ 2060static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2061{ 2062 MemOp opc = get_memop(l->oi); 2063 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2064 2065 /* resolve label address */ 2066 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2067 if (label_ptr[1]) { 2068 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2069 } 2070 2071 tcg_out_st_helper_args(s, l, &ldst_helper_param); 2072 tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]); 2073 2074 tcg_out_jmp(s, l->raddr); 2075 return true; 2076} 2077 2078#ifdef CONFIG_USER_ONLY 2079static HostAddress x86_guest_base = { 2080 .index = -1 2081}; 2082 2083#if defined(__x86_64__) && defined(__linux__) 2084# include <asm/prctl.h> 2085# include <sys/prctl.h> 2086int arch_prctl(int code, unsigned long addr); 2087static inline int setup_guest_base_seg(void) 2088{ 2089 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) { 2090 return P_GS; 2091 } 2092 return 0; 2093} 2094#define setup_guest_base_seg setup_guest_base_seg 2095#elif defined(__x86_64__) && \ 2096 (defined (__FreeBSD__) || defined (__FreeBSD_kernel__)) 2097# include <machine/sysarch.h> 2098static inline int setup_guest_base_seg(void) 2099{ 2100 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) { 2101 return P_GS; 2102 } 2103 return 0; 2104} 2105#define setup_guest_base_seg setup_guest_base_seg 2106#endif 2107#else 2108# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; })) 2109#endif /* CONFIG_USER_ONLY */ 2110#ifndef setup_guest_base_seg 2111# define setup_guest_base_seg() 0 2112#endif 2113 2114#define MIN_TLB_MASK_TABLE_OFS INT_MIN 2115 2116/* 2117 * For softmmu, perform the TLB load and compare. 2118 * For useronly, perform any required alignment tests. 2119 * In both cases, return a TCGLabelQemuLdst structure if the slow path 2120 * is required and fill in @h with the host address for the fast path. 2121 */ 2122static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, 2123 TCGReg addr, MemOpIdx oi, bool is_ld) 2124{ 2125 TCGLabelQemuLdst *ldst = NULL; 2126 MemOp opc = get_memop(oi); 2127 MemOp s_bits = opc & MO_SIZE; 2128 unsigned a_mask; 2129 2130 if (tcg_use_softmmu) { 2131 h->index = TCG_REG_L0; 2132 h->ofs = 0; 2133 h->seg = 0; 2134 } else { 2135 *h = x86_guest_base; 2136 } 2137 h->base = addr; 2138 h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128); 2139 a_mask = (1 << h->aa.align) - 1; 2140 2141 if (tcg_use_softmmu) { 2142 int cmp_ofs = is_ld ? 
offsetof(CPUTLBEntry, addr_read) 2143 : offsetof(CPUTLBEntry, addr_write); 2144 TCGType ttype = TCG_TYPE_I32; 2145 TCGType tlbtype = TCG_TYPE_I32; 2146 int trexw = 0, hrexw = 0, tlbrexw = 0; 2147 unsigned mem_index = get_mmuidx(oi); 2148 unsigned s_mask = (1 << s_bits) - 1; 2149 int fast_ofs = tlb_mask_table_ofs(s, mem_index); 2150 int tlb_mask; 2151 2152 ldst = new_ldst_label(s); 2153 ldst->is_ld = is_ld; 2154 ldst->oi = oi; 2155 ldst->addr_reg = addr; 2156 2157 if (TCG_TARGET_REG_BITS == 64) { 2158 ttype = s->addr_type; 2159 trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW); 2160 if (TCG_TYPE_PTR == TCG_TYPE_I64) { 2161 hrexw = P_REXW; 2162 if (s->page_bits + s->tlb_dyn_max_bits > 32) { 2163 tlbtype = TCG_TYPE_I64; 2164 tlbrexw = P_REXW; 2165 } 2166 } 2167 } 2168 2169 tcg_out_mov(s, tlbtype, TCG_REG_L0, addr); 2170 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0, 2171 s->page_bits - CPU_TLB_ENTRY_BITS); 2172 2173 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0, 2174 fast_ofs + offsetof(CPUTLBDescFast, mask)); 2175 2176 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0, 2177 fast_ofs + offsetof(CPUTLBDescFast, table)); 2178 2179 /* 2180 * If the required alignment is at least as large as the access, 2181 * simply copy the address and mask. For lesser alignments, 2182 * check that we don't cross pages for the complete access. 2183 */ 2184 if (a_mask >= s_mask) { 2185 tcg_out_mov(s, ttype, TCG_REG_L1, addr); 2186 } else { 2187 tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1, 2188 addr, s_mask - a_mask); 2189 } 2190 tlb_mask = s->page_mask | a_mask; 2191 tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0); 2192 2193 /* cmp 0(TCG_REG_L0), TCG_REG_L1 */ 2194 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, 2195 TCG_REG_L1, TCG_REG_L0, cmp_ofs); 2196 2197 /* jne slow_path */ 2198 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2199 ldst->label_ptr[0] = s->code_ptr; 2200 s->code_ptr += 4; 2201 2202 /* TLB Hit. */ 2203 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0, 2204 offsetof(CPUTLBEntry, addend)); 2205 } else if (a_mask) { 2206 int jcc; 2207 2208 ldst = new_ldst_label(s); 2209 ldst->is_ld = is_ld; 2210 ldst->oi = oi; 2211 ldst->addr_reg = addr; 2212 2213 /* jne slow_path */ 2214 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addr, a_mask, true, false); 2215 tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0); 2216 ldst->label_ptr[0] = s->code_ptr; 2217 s->code_ptr += 4; 2218 } 2219 2220 return ldst; 2221} 2222 2223static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2224 HostAddress h, TCGType type, MemOp memop) 2225{ 2226 bool use_movbe = false; 2227 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 2228 int movop = OPC_MOVL_GvEv; 2229 2230 /* Do big-endian loads with movbe. */ 2231 if (memop & MO_BSWAP) { 2232 tcg_debug_assert(have_movbe); 2233 use_movbe = true; 2234 movop = OPC_MOVBE_GyMy; 2235 } 2236 2237 switch (memop & MO_SSIZE) { 2238 case MO_UB: 2239 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo, 2240 h.base, h.index, 0, h.ofs); 2241 break; 2242 case MO_SB: 2243 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo, 2244 h.base, h.index, 0, h.ofs); 2245 break; 2246 case MO_UW: 2247 if (use_movbe) { 2248 /* There is no extending movbe; only low 16-bits are modified. */ 2249 if (datalo != h.base && datalo != h.index) { 2250 /* XOR breaks dependency chains. 
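                   The movbe below then writes only the low 16 bits of the
                   zeroed register, yielding a zero-extended result.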
*/ 2251 tgen_arithr(s, ARITH_XOR, datalo, datalo); 2252 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2253 datalo, h.base, h.index, 0, h.ofs); 2254 } else { 2255 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2256 datalo, h.base, h.index, 0, h.ofs); 2257 tcg_out_ext16u(s, datalo, datalo); 2258 } 2259 } else { 2260 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo, 2261 h.base, h.index, 0, h.ofs); 2262 } 2263 break; 2264 case MO_SW: 2265 if (use_movbe) { 2266 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2267 datalo, h.base, h.index, 0, h.ofs); 2268 tcg_out_ext16s(s, type, datalo, datalo); 2269 } else { 2270 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg, 2271 datalo, h.base, h.index, 0, h.ofs); 2272 } 2273 break; 2274 case MO_UL: 2275 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2276 h.base, h.index, 0, h.ofs); 2277 break; 2278#if TCG_TARGET_REG_BITS == 64 2279 case MO_SL: 2280 if (use_movbe) { 2281 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo, 2282 h.base, h.index, 0, h.ofs); 2283 tcg_out_ext32s(s, datalo, datalo); 2284 } else { 2285 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo, 2286 h.base, h.index, 0, h.ofs); 2287 } 2288 break; 2289#endif 2290 case MO_UQ: 2291 if (TCG_TARGET_REG_BITS == 64) { 2292 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2293 h.base, h.index, 0, h.ofs); 2294 break; 2295 } 2296 if (use_movbe) { 2297 TCGReg t = datalo; 2298 datalo = datahi; 2299 datahi = t; 2300 } 2301 if (h.base == datalo || h.index == datalo) { 2302 tcg_out_modrm_sib_offset(s, OPC_LEA, datahi, 2303 h.base, h.index, 0, h.ofs); 2304 tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0); 2305 tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4); 2306 } else { 2307 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2308 h.base, h.index, 0, h.ofs); 2309 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2310 h.base, h.index, 0, h.ofs + 4); 2311 } 2312 break; 2313 2314 case MO_128: 2315 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2316 2317 /* 2318 * Without 16-byte atomicity, use integer regs. 2319 * That is where we want the data, and it allows bswaps. 2320 */ 2321 if (h.aa.atom < MO_128) { 2322 if (use_movbe) { 2323 TCGReg t = datalo; 2324 datalo = datahi; 2325 datahi = t; 2326 } 2327 if (h.base == datalo || h.index == datalo) { 2328 tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi, 2329 h.base, h.index, 0, h.ofs); 2330 tcg_out_modrm_offset(s, movop + P_REXW + h.seg, 2331 datalo, datahi, 0); 2332 tcg_out_modrm_offset(s, movop + P_REXW + h.seg, 2333 datahi, datahi, 8); 2334 } else { 2335 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2336 h.base, h.index, 0, h.ofs); 2337 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, 2338 h.base, h.index, 0, h.ofs + 8); 2339 } 2340 break; 2341 } 2342 2343 /* 2344 * With 16-byte atomicity, a vector load is required. 2345 * If we already have 16-byte alignment, then VMOVDQA always works. 2346 * Else if VMOVDQU has atomicity with dynamic alignment, use that. 2347 * Else use we require a runtime test for alignment for VMOVDQA; 2348 * use VMOVDQU on the unaligned nonatomic path for simplicity. 
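         * The runtime test emitted below is roughly:
         *   test $15, base; jnz 1f; vmovdqa; jmp 2f; 1: vmovdqu; 2: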
2349 */ 2350 if (h.aa.align >= MO_128) { 2351 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2352 TCG_TMP_VEC, 0, 2353 h.base, h.index, 0, h.ofs); 2354 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2355 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2356 TCG_TMP_VEC, 0, 2357 h.base, h.index, 0, h.ofs); 2358 } else { 2359 TCGLabel *l1 = gen_new_label(); 2360 TCGLabel *l2 = gen_new_label(); 2361 int jcc; 2362 2363 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2364 tcg_out_jxx(s, jcc, l1, true); 2365 2366 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2367 TCG_TMP_VEC, 0, 2368 h.base, h.index, 0, h.ofs); 2369 tcg_out_jxx(s, JCC_JMP, l2, true); 2370 2371 tcg_out_label(s, l1); 2372 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2373 TCG_TMP_VEC, 0, 2374 h.base, h.index, 0, h.ofs); 2375 tcg_out_label(s, l2); 2376 } 2377 tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC); 2378 break; 2379 2380 default: 2381 g_assert_not_reached(); 2382 } 2383} 2384 2385static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi, 2386 TCGReg addr, MemOpIdx oi, TCGType data_type) 2387{ 2388 TCGLabelQemuLdst *ldst; 2389 HostAddress h; 2390 2391 ldst = prepare_host_addr(s, &h, addr, oi, true); 2392 tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi)); 2393 2394 if (ldst) { 2395 ldst->type = data_type; 2396 ldst->datalo_reg = datalo; 2397 ldst->datahi_reg = datahi; 2398 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2399 } 2400} 2401 2402static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2403 HostAddress h, MemOp memop) 2404{ 2405 bool use_movbe = false; 2406 int movop = OPC_MOVL_EvGv; 2407 2408 /* 2409 * Do big-endian stores with movbe or system-mode. 2410 * User-only without movbe will have its swapping done generically. 2411 */ 2412 if (memop & MO_BSWAP) { 2413 tcg_debug_assert(have_movbe); 2414 use_movbe = true; 2415 movop = OPC_MOVBE_MyGy; 2416 } 2417 2418 switch (memop & MO_SIZE) { 2419 case MO_8: 2420 /* This is handled with constraints on INDEX_op_qemu_st8_i32. */ 2421 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4); 2422 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg, 2423 datalo, h.base, h.index, 0, h.ofs); 2424 break; 2425 case MO_16: 2426 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo, 2427 h.base, h.index, 0, h.ofs); 2428 break; 2429 case MO_32: 2430 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2431 h.base, h.index, 0, h.ofs); 2432 break; 2433 case MO_64: 2434 if (TCG_TARGET_REG_BITS == 64) { 2435 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2436 h.base, h.index, 0, h.ofs); 2437 } else { 2438 if (use_movbe) { 2439 TCGReg t = datalo; 2440 datalo = datahi; 2441 datahi = t; 2442 } 2443 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2444 h.base, h.index, 0, h.ofs); 2445 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2446 h.base, h.index, 0, h.ofs + 4); 2447 } 2448 break; 2449 2450 case MO_128: 2451 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2452 2453 /* 2454 * Without 16-byte atomicity, use integer regs. 2455 * That is where we have the data, and it allows bswaps. 
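         * (A pair of 8-byte stores is emitted below; MOVBE supplies any
         * required byte swapping.)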
2456 */ 2457 if (h.aa.atom < MO_128) { 2458 if (use_movbe) { 2459 TCGReg t = datalo; 2460 datalo = datahi; 2461 datahi = t; 2462 } 2463 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2464 h.base, h.index, 0, h.ofs); 2465 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, 2466 h.base, h.index, 0, h.ofs + 8); 2467 break; 2468 } 2469 2470 /* 2471 * With 16-byte atomicity, a vector store is required. 2472 * If we already have 16-byte alignment, then VMOVDQA always works. 2473 * Else if VMOVDQU has atomicity with dynamic alignment, use that. 2474 * Else use we require a runtime test for alignment for VMOVDQA; 2475 * use VMOVDQU on the unaligned nonatomic path for simplicity. 2476 */ 2477 tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi); 2478 if (h.aa.align >= MO_128) { 2479 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2480 TCG_TMP_VEC, 0, 2481 h.base, h.index, 0, h.ofs); 2482 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2483 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2484 TCG_TMP_VEC, 0, 2485 h.base, h.index, 0, h.ofs); 2486 } else { 2487 TCGLabel *l1 = gen_new_label(); 2488 TCGLabel *l2 = gen_new_label(); 2489 int jcc; 2490 2491 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2492 tcg_out_jxx(s, jcc, l1, true); 2493 2494 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2495 TCG_TMP_VEC, 0, 2496 h.base, h.index, 0, h.ofs); 2497 tcg_out_jxx(s, JCC_JMP, l2, true); 2498 2499 tcg_out_label(s, l1); 2500 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2501 TCG_TMP_VEC, 0, 2502 h.base, h.index, 0, h.ofs); 2503 tcg_out_label(s, l2); 2504 } 2505 break; 2506 2507 default: 2508 g_assert_not_reached(); 2509 } 2510} 2511 2512static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi, 2513 TCGReg addr, MemOpIdx oi, TCGType data_type) 2514{ 2515 TCGLabelQemuLdst *ldst; 2516 HostAddress h; 2517 2518 ldst = prepare_host_addr(s, &h, addr, oi, false); 2519 tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi)); 2520 2521 if (ldst) { 2522 ldst->type = data_type; 2523 ldst->datalo_reg = datalo; 2524 ldst->datahi_reg = datahi; 2525 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2526 } 2527} 2528 2529static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0) 2530{ 2531 /* Reuse the zeroing that exists for goto_ptr. */ 2532 if (a0 == 0) { 2533 tcg_out_jmp(s, tcg_code_gen_epilogue); 2534 } else { 2535 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0); 2536 tcg_out_jmp(s, tb_ret_addr); 2537 } 2538} 2539 2540static void tcg_out_goto_tb(TCGContext *s, int which) 2541{ 2542 /* 2543 * Jump displacement must be aligned for atomic patching; 2544 * see if we need to add extra nops before jump 2545 */ 2546 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr; 2547 if (gap != 1) { 2548 tcg_out_nopn(s, gap - 1); 2549 } 2550 tcg_out8(s, OPC_JMP_long); /* jmp im */ 2551 set_jmp_insn_offset(s, which); 2552 tcg_out32(s, 0); 2553 set_jmp_reset_offset(s, which); 2554} 2555 2556void tb_target_set_jmp_target(const TranslationBlock *tb, int n, 2557 uintptr_t jmp_rx, uintptr_t jmp_rw) 2558{ 2559 /* patch the branch destination */ 2560 uintptr_t addr = tb->jmp_target_addr[n]; 2561 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4)); 2562 /* no need to flush icache explicitly */ 2563} 2564 2565 2566static void tgen_add(TCGContext *s, TCGType type, 2567 TCGReg a0, TCGReg a1, TCGReg a2) 2568{ 2569 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 2570 2571 if (a0 == a1) { 2572 tgen_arithr(s, ARITH_ADD + rexw, a0, a2); 2573 } else if (a0 == a2) { 2574 tgen_arithr(s, ARITH_ADD + rexw, a0, a1); 2575 } else { 2576 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, 0); 2577 } 2578} 2579 2580static void tgen_addi(TCGContext *s, TCGType type, 2581 TCGReg a0, TCGReg a1, tcg_target_long a2) 2582{ 2583 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2584 2585 if (a0 == a1) { 2586 tgen_arithi(s, ARITH_ADD + rexw, a0, a2, false); 2587 } else { 2588 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, -1, 0, a2); 2589 } 2590} 2591 2592static const TCGOutOpBinary outop_add = { 2593 .base.static_constraint = C_O1_I2(r, r, re), 2594 .out_rrr = tgen_add, 2595 .out_rri = tgen_addi, 2596}; 2597 2598static void tgen_and(TCGContext *s, TCGType type, 2599 TCGReg a0, TCGReg a1, TCGReg a2) 2600{ 2601 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2602 tgen_arithr(s, ARITH_AND + rexw, a0, a2); 2603} 2604 2605static void tgen_andi(TCGContext *s, TCGType type, 2606 TCGReg a0, TCGReg a1, tcg_target_long a2) 2607{ 2608 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2609 tgen_arithi(s, ARITH_AND + rexw, a0, a2, false); 2610} 2611 2612static const TCGOutOpBinary outop_and = { 2613 .base.static_constraint = C_O1_I2(r, 0, reZ), 2614 .out_rrr = tgen_and, 2615 .out_rri = tgen_andi, 2616}; 2617 2618static void tgen_andc(TCGContext *s, TCGType type, 2619 TCGReg a0, TCGReg a1, TCGReg a2) 2620{ 2621 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2622 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1); 2623} 2624 2625static TCGConstraintSetIndex cset_andc(TCGType type, unsigned flags) 2626{ 2627 return have_bmi1 ? C_O1_I2(r, r, r) : C_NotImplemented; 2628} 2629 2630static const TCGOutOpBinary outop_andc = { 2631 .base.static_constraint = C_Dynamic, 2632 .base.dynamic_constraint = cset_andc, 2633 .out_rrr = tgen_andc, 2634}; 2635 2636static const TCGOutOpBinary outop_eqv = { 2637 .base.static_constraint = C_NotImplemented, 2638}; 2639 2640static const TCGOutOpBinary outop_nand = { 2641 .base.static_constraint = C_NotImplemented, 2642}; 2643 2644static const TCGOutOpBinary outop_nor = { 2645 .base.static_constraint = C_NotImplemented, 2646}; 2647 2648static void tgen_or(TCGContext *s, TCGType type, 2649 TCGReg a0, TCGReg a1, TCGReg a2) 2650{ 2651 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2652 tgen_arithr(s, ARITH_OR + rexw, a0, a2); 2653} 2654 2655static void tgen_ori(TCGContext *s, TCGType type, 2656 TCGReg a0, TCGReg a1, tcg_target_long a2) 2657{ 2658 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2659 tgen_arithi(s, ARITH_OR + rexw, a0, a2, false); 2660} 2661 2662static const TCGOutOpBinary outop_or = { 2663 .base.static_constraint = C_O1_I2(r, 0, re), 2664 .out_rrr = tgen_or, 2665 .out_rri = tgen_ori, 2666}; 2667 2668static const TCGOutOpBinary outop_orc = { 2669 .base.static_constraint = C_NotImplemented, 2670}; 2671 2672static void tgen_sub(TCGContext *s, TCGType type, 2673 TCGReg a0, TCGReg a1, TCGReg a2) 2674{ 2675 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2676 tgen_arithr(s, ARITH_SUB + rexw, a0, a2); 2677} 2678 2679static const TCGOutOpSubtract outop_sub = { 2680 .base.static_constraint = C_O1_I2(r, 0, r), 2681 .out_rrr = tgen_sub, 2682}; 2683 2684static void tgen_xor(TCGContext *s, TCGType type, 2685 TCGReg a0, TCGReg a1, TCGReg a2) 2686{ 2687 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 2688 tgen_arithr(s, ARITH_XOR + rexw, a0, a2); 2689} 2690 2691static void tgen_xori(TCGContext *s, TCGType type, 2692 TCGReg a0, TCGReg a1, tcg_target_long a2) 2693{ 2694 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2695 tgen_arithi(s, ARITH_XOR + rexw, a0, a2, false); 2696} 2697 2698static const TCGOutOpBinary outop_xor = { 2699 .base.static_constraint = C_O1_I2(r, 0, re), 2700 .out_rrr = tgen_xor, 2701 .out_rri = tgen_xori, 2702}; 2703 2704static void tgen_neg(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 2705{ 2706 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2707 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0); 2708} 2709 2710static const TCGOutOpUnary outop_neg = { 2711 .base.static_constraint = C_O1_I1(r, 0), 2712 .out_rr = tgen_neg, 2713}; 2714 2715static void tgen_not(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 2716{ 2717 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2718 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0); 2719} 2720 2721static const TCGOutOpUnary outop_not = { 2722 .base.static_constraint = C_O1_I1(r, 0), 2723 .out_rr = tgen_not, 2724}; 2725 2726 2727static void tcg_out_op(TCGContext *s, TCGOpcode opc, TCGType type, 2728 const TCGArg args[TCG_MAX_OP_ARGS], 2729 const int const_args[TCG_MAX_OP_ARGS]) 2730{ 2731 TCGArg a0, a1, a2; 2732 int c, const_a2, vexop, rexw; 2733 2734#if TCG_TARGET_REG_BITS == 64 2735# define OP_32_64(x) \ 2736 case glue(glue(INDEX_op_, x), _i64): \ 2737 case glue(glue(INDEX_op_, x), _i32) 2738#else 2739# define OP_32_64(x) \ 2740 case glue(glue(INDEX_op_, x), _i32) 2741#endif 2742 2743 /* Hoist the loads of the most common arguments. */ 2744 a0 = args[0]; 2745 a1 = args[1]; 2746 a2 = args[2]; 2747 const_a2 = const_args[2]; 2748 rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2749 2750 switch (opc) { 2751 case INDEX_op_goto_ptr: 2752 /* jmp to the given host address (could be epilogue) */ 2753 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0); 2754 break; 2755 case INDEX_op_br: 2756 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0); 2757 break; 2758 OP_32_64(ld8u): 2759 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2760 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2); 2761 break; 2762 OP_32_64(ld8s): 2763 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2); 2764 break; 2765 OP_32_64(ld16u): 2766 /* Note that we can ignore REXW for the zero-extend to 64-bit. 
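           Any write to a 32-bit register implicitly zeroes bits 63:32.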
*/ 2767 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2); 2768 break; 2769 OP_32_64(ld16s): 2770 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2); 2771 break; 2772#if TCG_TARGET_REG_BITS == 64 2773 case INDEX_op_ld32u_i64: 2774#endif 2775 case INDEX_op_ld_i32: 2776 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2); 2777 break; 2778 2779 OP_32_64(st8): 2780 if (const_args[0]) { 2781 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2); 2782 tcg_out8(s, a0); 2783 } else { 2784 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2); 2785 } 2786 break; 2787 OP_32_64(st16): 2788 if (const_args[0]) { 2789 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2); 2790 tcg_out16(s, a0); 2791 } else { 2792 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2); 2793 } 2794 break; 2795#if TCG_TARGET_REG_BITS == 64 2796 case INDEX_op_st32_i64: 2797#endif 2798 case INDEX_op_st_i32: 2799 if (const_args[0]) { 2800 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2); 2801 tcg_out32(s, a0); 2802 } else { 2803 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2); 2804 } 2805 break; 2806 2807 OP_32_64(mul): 2808 if (const_a2) { 2809 int32_t val; 2810 val = a2; 2811 if (val == (int8_t)val) { 2812 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0); 2813 tcg_out8(s, val); 2814 } else { 2815 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0); 2816 tcg_out32(s, val); 2817 } 2818 } else { 2819 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2); 2820 } 2821 break; 2822 2823 OP_32_64(div2): 2824 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]); 2825 break; 2826 OP_32_64(divu2): 2827 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]); 2828 break; 2829 2830 OP_32_64(shl): 2831 /* For small constant 3-operand shift, use LEA. */ 2832 if (const_a2 && a0 != a1 && (a2 - 1) < 3) { 2833 if (a2 - 1 == 0) { 2834 /* shl $1,a1,a0 -> lea (a1,a1),a0 */ 2835 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0); 2836 } else { 2837 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */ 2838 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0); 2839 } 2840 break; 2841 } 2842 c = SHIFT_SHL; 2843 vexop = OPC_SHLX; 2844 goto gen_shift_maybe_vex; 2845 OP_32_64(shr): 2846 c = SHIFT_SHR; 2847 vexop = OPC_SHRX; 2848 goto gen_shift_maybe_vex; 2849 OP_32_64(sar): 2850 c = SHIFT_SAR; 2851 vexop = OPC_SARX; 2852 goto gen_shift_maybe_vex; 2853 OP_32_64(rotl): 2854 c = SHIFT_ROL; 2855 goto gen_shift; 2856 OP_32_64(rotr): 2857 c = SHIFT_ROR; 2858 goto gen_shift; 2859 gen_shift_maybe_vex: 2860 if (have_bmi2) { 2861 if (!const_a2) { 2862 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1); 2863 break; 2864 } 2865 tcg_out_mov(s, rexw ? 
TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2866 } 2867 /* FALLTHRU */ 2868 gen_shift: 2869 if (const_a2) { 2870 tcg_out_shifti(s, c + rexw, a0, a2); 2871 } else { 2872 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0); 2873 } 2874 break; 2875 2876 OP_32_64(ctz): 2877 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]); 2878 break; 2879 OP_32_64(clz): 2880 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]); 2881 break; 2882 OP_32_64(ctpop): 2883 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1); 2884 break; 2885 2886 OP_32_64(brcond): 2887 tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1], 2888 arg_label(args[3]), 0); 2889 break; 2890 OP_32_64(setcond): 2891 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false); 2892 break; 2893 OP_32_64(negsetcond): 2894 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true); 2895 break; 2896 OP_32_64(movcond): 2897 tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]); 2898 break; 2899 2900 OP_32_64(bswap16): 2901 if (a2 & TCG_BSWAP_OS) { 2902 /* Output must be sign-extended. */ 2903 if (rexw) { 2904 tcg_out_bswap64(s, a0); 2905 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48); 2906 } else { 2907 tcg_out_bswap32(s, a0); 2908 tcg_out_shifti(s, SHIFT_SAR, a0, 16); 2909 } 2910 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) { 2911 /* Output must be zero-extended, but input isn't. */ 2912 tcg_out_bswap32(s, a0); 2913 tcg_out_shifti(s, SHIFT_SHR, a0, 16); 2914 } else { 2915 tcg_out_rolw_8(s, a0); 2916 } 2917 break; 2918 OP_32_64(bswap32): 2919 tcg_out_bswap32(s, a0); 2920 if (rexw && (a2 & TCG_BSWAP_OS)) { 2921 tcg_out_ext32s(s, a0, a0); 2922 } 2923 break; 2924 2925 case INDEX_op_qemu_ld_i32: 2926 tcg_out_qemu_ld(s, a0, -1, a1, a2, TCG_TYPE_I32); 2927 break; 2928 case INDEX_op_qemu_ld_i64: 2929 if (TCG_TARGET_REG_BITS == 64) { 2930 tcg_out_qemu_ld(s, a0, -1, a1, a2, TCG_TYPE_I64); 2931 } else { 2932 tcg_out_qemu_ld(s, a0, a1, a2, args[3], TCG_TYPE_I64); 2933 } 2934 break; 2935 case INDEX_op_qemu_ld_i128: 2936 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2937 tcg_out_qemu_ld(s, a0, a1, a2, args[3], TCG_TYPE_I128); 2938 break; 2939 2940 case INDEX_op_qemu_st_i32: 2941 case INDEX_op_qemu_st8_i32: 2942 tcg_out_qemu_st(s, a0, -1, a1, a2, TCG_TYPE_I32); 2943 break; 2944 case INDEX_op_qemu_st_i64: 2945 if (TCG_TARGET_REG_BITS == 64) { 2946 tcg_out_qemu_st(s, a0, -1, a1, a2, TCG_TYPE_I64); 2947 } else { 2948 tcg_out_qemu_st(s, a0, a1, a2, args[3], TCG_TYPE_I64); 2949 } 2950 break; 2951 case INDEX_op_qemu_st_i128: 2952 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2953 tcg_out_qemu_st(s, a0, a1, a2, args[3], TCG_TYPE_I128); 2954 break; 2955 2956 OP_32_64(mulu2): 2957 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]); 2958 break; 2959 OP_32_64(muls2): 2960 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]); 2961 break; 2962 OP_32_64(add2): 2963 if (const_args[4]) { 2964 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1); 2965 } else { 2966 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]); 2967 } 2968 if (const_args[5]) { 2969 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1); 2970 } else { 2971 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]); 2972 } 2973 break; 2974 OP_32_64(sub2): 2975 if (const_args[4]) { 2976 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1); 2977 } else { 2978 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]); 2979 } 2980 if (const_args[5]) { 2981 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1); 2982 } else { 2983 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]); 2984 } 2985 break; 2986 2987#if 
TCG_TARGET_REG_BITS == 32 2988 case INDEX_op_brcond2_i32: 2989 tcg_out_brcond2(s, args, const_args, 0); 2990 break; 2991 case INDEX_op_setcond2_i32: 2992 tcg_out_setcond2(s, args, const_args); 2993 break; 2994#else /* TCG_TARGET_REG_BITS == 64 */ 2995 case INDEX_op_ld32s_i64: 2996 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2); 2997 break; 2998 case INDEX_op_ld_i64: 2999 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2); 3000 break; 3001 case INDEX_op_st_i64: 3002 if (const_args[0]) { 3003 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2); 3004 tcg_out32(s, a0); 3005 } else { 3006 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2); 3007 } 3008 break; 3009 3010 case INDEX_op_bswap64_i64: 3011 tcg_out_bswap64(s, a0); 3012 break; 3013 case INDEX_op_extrh_i64_i32: 3014 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32); 3015 break; 3016#endif 3017 3018 OP_32_64(deposit): 3019 if (args[3] == 0 && args[4] == 8) { 3020 /* load bits 0..7 */ 3021 if (const_a2) { 3022 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0), 3023 0, a0, 0); 3024 tcg_out8(s, a2); 3025 } else { 3026 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0); 3027 } 3028 } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) { 3029 /* load bits 8..15 */ 3030 if (const_a2) { 3031 tcg_out8(s, OPC_MOVB_Ib + a0 + 4); 3032 tcg_out8(s, a2); 3033 } else { 3034 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4); 3035 } 3036 } else if (args[3] == 0 && args[4] == 16) { 3037 /* load bits 0..15 */ 3038 if (const_a2) { 3039 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0), 3040 0, a0, 0); 3041 tcg_out16(s, a2); 3042 } else { 3043 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0); 3044 } 3045 } else { 3046 g_assert_not_reached(); 3047 } 3048 break; 3049 3050 case INDEX_op_extract_i64: 3051 if (a2 + args[3] == 32) { 3052 if (a2 == 0) { 3053 tcg_out_ext32u(s, a0, a1); 3054 break; 3055 } 3056 /* This is a 32-bit zero-extending right shift. */ 3057 tcg_out_mov(s, TCG_TYPE_I32, a0, a1); 3058 tcg_out_shifti(s, SHIFT_SHR, a0, a2); 3059 break; 3060 } 3061 /* FALLTHRU */ 3062 case INDEX_op_extract_i32: 3063 if (a2 == 0 && args[3] == 8) { 3064 tcg_out_ext8u(s, a0, a1); 3065 } else if (a2 == 0 && args[3] == 16) { 3066 tcg_out_ext16u(s, a0, a1); 3067 } else if (a2 == 8 && args[3] == 8) { 3068 /* 3069 * On the off-chance that we can use the high-byte registers. 3070 * Otherwise we emit the same ext16 + shift pattern that we 3071 * would have gotten from the normal tcg-op.c expansion. 
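             * Here a1 < 4 selects AL/CL/DL/BL, so a1 + 4 names the matching
             * high-byte register in the modrm byte, while a0 < 8 avoids a
             * REX prefix, which would make AH..BH unencodable.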
3072 */ 3073 if (a1 < 4 && a0 < 8) { 3074 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4); 3075 } else { 3076 tcg_out_ext16u(s, a0, a1); 3077 tcg_out_shifti(s, SHIFT_SHR, a0, 8); 3078 } 3079 } else { 3080 g_assert_not_reached(); 3081 } 3082 break; 3083 3084 case INDEX_op_sextract_i64: 3085 if (a2 == 0 && args[3] == 8) { 3086 tcg_out_ext8s(s, TCG_TYPE_I64, a0, a1); 3087 } else if (a2 == 0 && args[3] == 16) { 3088 tcg_out_ext16s(s, TCG_TYPE_I64, a0, a1); 3089 } else if (a2 == 0 && args[3] == 32) { 3090 tcg_out_ext32s(s, a0, a1); 3091 } else { 3092 g_assert_not_reached(); 3093 } 3094 break; 3095 3096 case INDEX_op_sextract_i32: 3097 if (a2 == 0 && args[3] == 8) { 3098 tcg_out_ext8s(s, TCG_TYPE_I32, a0, a1); 3099 } else if (a2 == 0 && args[3] == 16) { 3100 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 3101 } else if (a2 == 8 && args[3] == 8) { 3102 if (a1 < 4 && a0 < 8) { 3103 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4); 3104 } else { 3105 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 3106 tcg_out_shifti(s, SHIFT_SAR, a0, 8); 3107 } 3108 } else { 3109 g_assert_not_reached(); 3110 } 3111 break; 3112 3113 OP_32_64(extract2): 3114 /* Note that SHRD outputs to the r/m operand. */ 3115 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0); 3116 tcg_out8(s, args[3]); 3117 break; 3118 3119 case INDEX_op_mb: 3120 tcg_out_mb(s, a0); 3121 break; 3122 case INDEX_op_call: /* Always emitted via tcg_out_call. */ 3123 case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */ 3124 case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */ 3125 case INDEX_op_ext_i32_i64: /* Always emitted via tcg_reg_alloc_op. */ 3126 case INDEX_op_extu_i32_i64: 3127 case INDEX_op_extrl_i64_i32: 3128 default: 3129 g_assert_not_reached(); 3130 } 3131 3132#undef OP_32_64 3133} 3134 3135static int const umin_insn[4] = { 3136 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ 3137}; 3138 3139static int const umax_insn[4] = { 3140 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ 3141}; 3142 3143static bool tcg_out_cmp_vec_noinv(TCGContext *s, TCGType type, unsigned vece, 3144 TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond) 3145{ 3146 static int const cmpeq_insn[4] = { 3147 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ 3148 }; 3149 static int const cmpgt_insn[4] = { 3150 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ 3151 }; 3152 3153 enum { 3154 NEED_INV = 1, 3155 NEED_SWAP = 2, 3156 NEED_UMIN = 4, 3157 NEED_UMAX = 8, 3158 INVALID = 16, 3159 }; 3160 static const uint8_t cond_fixup[16] = { 3161 [0 ... 15] = INVALID, 3162 [TCG_COND_EQ] = 0, 3163 [TCG_COND_GT] = 0, 3164 [TCG_COND_NE] = NEED_INV, 3165 [TCG_COND_LE] = NEED_INV, 3166 [TCG_COND_LT] = NEED_SWAP, 3167 [TCG_COND_GE] = NEED_SWAP | NEED_INV, 3168 [TCG_COND_LEU] = NEED_UMIN, 3169 [TCG_COND_GTU] = NEED_UMIN | NEED_INV, 3170 [TCG_COND_GEU] = NEED_UMAX, 3171 [TCG_COND_LTU] = NEED_UMAX | NEED_INV, 3172 }; 3173 int fixup = cond_fixup[cond]; 3174 3175 assert(!(fixup & INVALID)); 3176 3177 if (fixup & NEED_INV) { 3178 cond = tcg_invert_cond(cond); 3179 } 3180 3181 if (fixup & NEED_SWAP) { 3182 TCGReg swap = v1; 3183 v1 = v2; 3184 v2 = swap; 3185 cond = tcg_swap_cond(cond); 3186 } 3187 3188 if (fixup & (NEED_UMIN | NEED_UMAX)) { 3189 int op = (fixup & NEED_UMIN ? umin_insn[vece] : umax_insn[vece]); 3190 3191 /* avx2 does not have 64-bit min/max; adjusted during expand. 
*/ 3192 assert(vece <= MO_32); 3193 3194 tcg_out_vex_modrm_type(s, op, TCG_TMP_VEC, v1, v2, type); 3195 v2 = TCG_TMP_VEC; 3196 cond = TCG_COND_EQ; 3197 } 3198 3199 switch (cond) { 3200 case TCG_COND_EQ: 3201 tcg_out_vex_modrm_type(s, cmpeq_insn[vece], v0, v1, v2, type); 3202 break; 3203 case TCG_COND_GT: 3204 tcg_out_vex_modrm_type(s, cmpgt_insn[vece], v0, v1, v2, type); 3205 break; 3206 default: 3207 g_assert_not_reached(); 3208 } 3209 return fixup & NEED_INV; 3210} 3211 3212static void tcg_out_cmp_vec_k1(TCGContext *s, TCGType type, unsigned vece, 3213 TCGReg v1, TCGReg v2, TCGCond cond) 3214{ 3215 static const int cmpm_insn[2][4] = { 3216 { OPC_VPCMPB, OPC_VPCMPW, OPC_VPCMPD, OPC_VPCMPQ }, 3217 { OPC_VPCMPUB, OPC_VPCMPUW, OPC_VPCMPUD, OPC_VPCMPUQ } 3218 }; 3219 static const int testm_insn[4] = { 3220 OPC_VPTESTMB, OPC_VPTESTMW, OPC_VPTESTMD, OPC_VPTESTMQ 3221 }; 3222 static const int testnm_insn[4] = { 3223 OPC_VPTESTNMB, OPC_VPTESTNMW, OPC_VPTESTNMD, OPC_VPTESTNMQ 3224 }; 3225 3226 static const int cond_ext[16] = { 3227 [TCG_COND_EQ] = 0, 3228 [TCG_COND_NE] = 4, 3229 [TCG_COND_LT] = 1, 3230 [TCG_COND_LTU] = 1, 3231 [TCG_COND_LE] = 2, 3232 [TCG_COND_LEU] = 2, 3233 [TCG_COND_NEVER] = 3, 3234 [TCG_COND_GE] = 5, 3235 [TCG_COND_GEU] = 5, 3236 [TCG_COND_GT] = 6, 3237 [TCG_COND_GTU] = 6, 3238 [TCG_COND_ALWAYS] = 7, 3239 }; 3240 3241 switch (cond) { 3242 case TCG_COND_TSTNE: 3243 tcg_out_vex_modrm_type(s, testm_insn[vece], /* k1 */ 1, v1, v2, type); 3244 break; 3245 case TCG_COND_TSTEQ: 3246 tcg_out_vex_modrm_type(s, testnm_insn[vece], /* k1 */ 1, v1, v2, type); 3247 break; 3248 default: 3249 tcg_out_vex_modrm_type(s, cmpm_insn[is_unsigned_cond(cond)][vece], 3250 /* k1 */ 1, v1, v2, type); 3251 tcg_out8(s, cond_ext[cond]); 3252 break; 3253 } 3254} 3255 3256static void tcg_out_k1_to_vec(TCGContext *s, TCGType type, 3257 unsigned vece, TCGReg dest) 3258{ 3259 static const int movm_insn[] = { 3260 OPC_VPMOVM2B, OPC_VPMOVM2W, OPC_VPMOVM2D, OPC_VPMOVM2Q 3261 }; 3262 tcg_out_vex_modrm_type(s, movm_insn[vece], dest, 0, /* k1 */ 1, type); 3263} 3264 3265static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece, 3266 TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond) 3267{ 3268 /* 3269 * With avx512, we have a complete set of comparisons into mask. 3270 * Unless there's a single insn expansion for the comparision, 3271 * expand via a mask in k1. 3272 */ 3273 if ((vece <= MO_16 ? have_avx512bw : have_avx512dq) 3274 && cond != TCG_COND_EQ 3275 && cond != TCG_COND_LT 3276 && cond != TCG_COND_GT) { 3277 tcg_out_cmp_vec_k1(s, type, vece, v1, v2, cond); 3278 tcg_out_k1_to_vec(s, type, vece, v0); 3279 return; 3280 } 3281 3282 if (tcg_out_cmp_vec_noinv(s, type, vece, v0, v1, v2, cond)) { 3283 tcg_out_dupi_vec(s, type, vece, TCG_TMP_VEC, -1); 3284 tcg_out_vex_modrm_type(s, OPC_PXOR, v0, v0, TCG_TMP_VEC, type); 3285 } 3286} 3287 3288static void tcg_out_cmpsel_vec_k1(TCGContext *s, TCGType type, unsigned vece, 3289 TCGReg v0, TCGReg c1, TCGReg c2, 3290 TCGReg v3, TCGReg v4, TCGCond cond) 3291{ 3292 static const int vpblendm_insn[] = { 3293 OPC_VPBLENDMB, OPC_VPBLENDMW, OPC_VPBLENDMD, OPC_VPBLENDMQ 3294 }; 3295 bool z = false; 3296 3297 /* Swap to place constant in V4 to take advantage of zero-masking. 
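       With the {z} bit set, VPBLENDM writes zero to the lanes where k1 is
       clear, so the constant-zero operand does not need a register.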
*/ 3298 if (!v3) { 3299 z = true; 3300 v3 = v4; 3301 cond = tcg_invert_cond(cond); 3302 } 3303 3304 tcg_out_cmp_vec_k1(s, type, vece, c1, c2, cond); 3305 tcg_out_evex_modrm_type(s, vpblendm_insn[vece], v0, v4, v3, 3306 /* k1 */1, z, type); 3307} 3308 3309static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece, 3310 TCGReg v0, TCGReg c1, TCGReg c2, 3311 TCGReg v3, TCGReg v4, TCGCond cond) 3312{ 3313 bool inv; 3314 3315 if (vece <= MO_16 ? have_avx512bw : have_avx512vl) { 3316 tcg_out_cmpsel_vec_k1(s, type, vece, v0, c1, c2, v3, v4, cond); 3317 return; 3318 } 3319 3320 inv = tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond); 3321 3322 /* 3323 * Since XMM0 is 16, the only way we get 0 into V3 3324 * is via the constant zero constraint. 3325 */ 3326 if (!v3) { 3327 if (inv) { 3328 tcg_out_vex_modrm_type(s, OPC_PAND, v0, TCG_TMP_VEC, v4, type); 3329 } else { 3330 tcg_out_vex_modrm_type(s, OPC_PANDN, v0, TCG_TMP_VEC, v4, type); 3331 } 3332 } else { 3333 if (inv) { 3334 TCGReg swap = v3; 3335 v3 = v4; 3336 v4 = swap; 3337 } 3338 tcg_out_vex_modrm_type(s, OPC_VPBLENDVB, v0, v4, v3, type); 3339 tcg_out8(s, (TCG_TMP_VEC - TCG_REG_XMM0) << 4); 3340 } 3341} 3342 3343static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, 3344 unsigned vecl, unsigned vece, 3345 const TCGArg args[TCG_MAX_OP_ARGS], 3346 const int const_args[TCG_MAX_OP_ARGS]) 3347{ 3348 static int const add_insn[4] = { 3349 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ 3350 }; 3351 static int const ssadd_insn[4] = { 3352 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 3353 }; 3354 static int const usadd_insn[4] = { 3355 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2 3356 }; 3357 static int const sub_insn[4] = { 3358 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ 3359 }; 3360 static int const sssub_insn[4] = { 3361 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 3362 }; 3363 static int const ussub_insn[4] = { 3364 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2 3365 }; 3366 static int const mul_insn[4] = { 3367 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ 3368 }; 3369 static int const shift_imm_insn[4] = { 3370 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib 3371 }; 3372 static int const punpckl_insn[4] = { 3373 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ 3374 }; 3375 static int const punpckh_insn[4] = { 3376 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ 3377 }; 3378 static int const packss_insn[4] = { 3379 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 3380 }; 3381 static int const packus_insn[4] = { 3382 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 3383 }; 3384 static int const smin_insn[4] = { 3385 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ 3386 }; 3387 static int const smax_insn[4] = { 3388 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ 3389 }; 3390 static int const rotlv_insn[4] = { 3391 OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ 3392 }; 3393 static int const rotrv_insn[4] = { 3394 OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ 3395 }; 3396 static int const shlv_insn[4] = { 3397 OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ 3398 }; 3399 static int const shrv_insn[4] = { 3400 OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ 3401 }; 3402 static int const sarv_insn[4] = { 3403 OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ 3404 }; 3405 static int const shls_insn[4] = { 3406 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ 3407 }; 3408 static int const shrs_insn[4] = { 3409 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ 3410 }; 3411 static int const sars_insn[4] = { 3412 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ 3413 
}; 3414 static int const vpshldi_insn[4] = { 3415 OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ 3416 }; 3417 static int const vpshldv_insn[4] = { 3418 OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ 3419 }; 3420 static int const vpshrdv_insn[4] = { 3421 OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ 3422 }; 3423 static int const abs_insn[4] = { 3424 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ 3425 }; 3426 3427 TCGType type = vecl + TCG_TYPE_V64; 3428 int insn, sub; 3429 TCGArg a0, a1, a2, a3; 3430 3431 a0 = args[0]; 3432 a1 = args[1]; 3433 a2 = args[2]; 3434 3435 switch (opc) { 3436 case INDEX_op_add_vec: 3437 insn = add_insn[vece]; 3438 goto gen_simd; 3439 case INDEX_op_ssadd_vec: 3440 insn = ssadd_insn[vece]; 3441 goto gen_simd; 3442 case INDEX_op_usadd_vec: 3443 insn = usadd_insn[vece]; 3444 goto gen_simd; 3445 case INDEX_op_sub_vec: 3446 insn = sub_insn[vece]; 3447 goto gen_simd; 3448 case INDEX_op_sssub_vec: 3449 insn = sssub_insn[vece]; 3450 goto gen_simd; 3451 case INDEX_op_ussub_vec: 3452 insn = ussub_insn[vece]; 3453 goto gen_simd; 3454 case INDEX_op_mul_vec: 3455 insn = mul_insn[vece]; 3456 goto gen_simd; 3457 case INDEX_op_and_vec: 3458 insn = OPC_PAND; 3459 goto gen_simd; 3460 case INDEX_op_or_vec: 3461 insn = OPC_POR; 3462 goto gen_simd; 3463 case INDEX_op_xor_vec: 3464 insn = OPC_PXOR; 3465 goto gen_simd; 3466 case INDEX_op_smin_vec: 3467 insn = smin_insn[vece]; 3468 goto gen_simd; 3469 case INDEX_op_umin_vec: 3470 insn = umin_insn[vece]; 3471 goto gen_simd; 3472 case INDEX_op_smax_vec: 3473 insn = smax_insn[vece]; 3474 goto gen_simd; 3475 case INDEX_op_umax_vec: 3476 insn = umax_insn[vece]; 3477 goto gen_simd; 3478 case INDEX_op_shlv_vec: 3479 insn = shlv_insn[vece]; 3480 goto gen_simd; 3481 case INDEX_op_shrv_vec: 3482 insn = shrv_insn[vece]; 3483 goto gen_simd; 3484 case INDEX_op_sarv_vec: 3485 insn = sarv_insn[vece]; 3486 goto gen_simd; 3487 case INDEX_op_rotlv_vec: 3488 insn = rotlv_insn[vece]; 3489 goto gen_simd; 3490 case INDEX_op_rotrv_vec: 3491 insn = rotrv_insn[vece]; 3492 goto gen_simd; 3493 case INDEX_op_shls_vec: 3494 insn = shls_insn[vece]; 3495 goto gen_simd; 3496 case INDEX_op_shrs_vec: 3497 insn = shrs_insn[vece]; 3498 goto gen_simd; 3499 case INDEX_op_sars_vec: 3500 insn = sars_insn[vece]; 3501 goto gen_simd; 3502 case INDEX_op_x86_punpckl_vec: 3503 insn = punpckl_insn[vece]; 3504 goto gen_simd; 3505 case INDEX_op_x86_punpckh_vec: 3506 insn = punpckh_insn[vece]; 3507 goto gen_simd; 3508 case INDEX_op_x86_packss_vec: 3509 insn = packss_insn[vece]; 3510 goto gen_simd; 3511 case INDEX_op_x86_packus_vec: 3512 insn = packus_insn[vece]; 3513 goto gen_simd; 3514 case INDEX_op_x86_vpshldv_vec: 3515 insn = vpshldv_insn[vece]; 3516 a1 = a2; 3517 a2 = args[3]; 3518 goto gen_simd; 3519 case INDEX_op_x86_vpshrdv_vec: 3520 insn = vpshrdv_insn[vece]; 3521 a1 = a2; 3522 a2 = args[3]; 3523 goto gen_simd; 3524#if TCG_TARGET_REG_BITS == 32 3525 case INDEX_op_dup2_vec: 3526 /* First merge the two 32-bit inputs to a single 64-bit element. */ 3527 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2); 3528 /* Then replicate the 64-bit elements across the rest of the vector. 
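           (tcg_out_dup_vec below broadcasts the low 64-bit element across
           the wider vector.)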
*/ 3529 if (type != TCG_TYPE_V64) { 3530 tcg_out_dup_vec(s, type, MO_64, a0, a0); 3531 } 3532 break; 3533#endif 3534 case INDEX_op_abs_vec: 3535 insn = abs_insn[vece]; 3536 a2 = a1; 3537 a1 = 0; 3538 goto gen_simd; 3539 gen_simd: 3540 tcg_debug_assert(insn != OPC_UD2); 3541 tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type); 3542 break; 3543 3544 case INDEX_op_cmp_vec: 3545 tcg_out_cmp_vec(s, type, vece, a0, a1, a2, args[3]); 3546 break; 3547 3548 case INDEX_op_cmpsel_vec: 3549 tcg_out_cmpsel_vec(s, type, vece, a0, a1, a2, 3550 args[3], args[4], args[5]); 3551 break; 3552 3553 case INDEX_op_andc_vec: 3554 insn = OPC_PANDN; 3555 tcg_out_vex_modrm_type(s, insn, a0, a2, a1, type); 3556 break; 3557 3558 case INDEX_op_shli_vec: 3559 insn = shift_imm_insn[vece]; 3560 sub = 6; 3561 goto gen_shift; 3562 case INDEX_op_shri_vec: 3563 insn = shift_imm_insn[vece]; 3564 sub = 2; 3565 goto gen_shift; 3566 case INDEX_op_sari_vec: 3567 if (vece == MO_64) { 3568 insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX; 3569 } else { 3570 insn = shift_imm_insn[vece]; 3571 } 3572 sub = 4; 3573 goto gen_shift; 3574 case INDEX_op_rotli_vec: 3575 insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ 3576 if (vece == MO_64) { 3577 insn |= P_VEXW; 3578 } 3579 sub = 1; 3580 goto gen_shift; 3581 gen_shift: 3582 tcg_debug_assert(vece != MO_8); 3583 tcg_out_vex_modrm_type(s, insn, sub, a0, a1, type); 3584 tcg_out8(s, a2); 3585 break; 3586 3587 case INDEX_op_ld_vec: 3588 tcg_out_ld(s, type, a0, a1, a2); 3589 break; 3590 case INDEX_op_st_vec: 3591 tcg_out_st(s, type, a0, a1, a2); 3592 break; 3593 case INDEX_op_dupm_vec: 3594 tcg_out_dupm_vec(s, type, vece, a0, a1, a2); 3595 break; 3596 3597 case INDEX_op_x86_shufps_vec: 3598 insn = OPC_SHUFPS; 3599 sub = args[3]; 3600 goto gen_simd_imm8; 3601 case INDEX_op_x86_blend_vec: 3602 if (vece == MO_16) { 3603 insn = OPC_PBLENDW; 3604 } else if (vece == MO_32) { 3605 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS); 3606 } else { 3607 g_assert_not_reached(); 3608 } 3609 sub = args[3]; 3610 goto gen_simd_imm8; 3611 case INDEX_op_x86_vperm2i128_vec: 3612 insn = OPC_VPERM2I128; 3613 sub = args[3]; 3614 goto gen_simd_imm8; 3615 case INDEX_op_x86_vpshldi_vec: 3616 insn = vpshldi_insn[vece]; 3617 sub = args[3]; 3618 goto gen_simd_imm8; 3619 3620 case INDEX_op_not_vec: 3621 insn = OPC_VPTERNLOGQ; 3622 a2 = a1; 3623 sub = 0x33; /* !B */ 3624 goto gen_simd_imm8; 3625 case INDEX_op_nor_vec: 3626 insn = OPC_VPTERNLOGQ; 3627 sub = 0x11; /* norCB */ 3628 goto gen_simd_imm8; 3629 case INDEX_op_nand_vec: 3630 insn = OPC_VPTERNLOGQ; 3631 sub = 0x77; /* nandCB */ 3632 goto gen_simd_imm8; 3633 case INDEX_op_eqv_vec: 3634 insn = OPC_VPTERNLOGQ; 3635 sub = 0x99; /* xnorCB */ 3636 goto gen_simd_imm8; 3637 case INDEX_op_orc_vec: 3638 insn = OPC_VPTERNLOGQ; 3639 sub = 0xdd; /* orB!C */ 3640 goto gen_simd_imm8; 3641 3642 case INDEX_op_bitsel_vec: 3643 insn = OPC_VPTERNLOGQ; 3644 a3 = args[3]; 3645 if (a0 == a1) { 3646 a1 = a2; 3647 a2 = a3; 3648 sub = 0xca; /* A?B:C */ 3649 } else if (a0 == a2) { 3650 a2 = a3; 3651 sub = 0xe2; /* B?A:C */ 3652 } else { 3653 tcg_out_mov(s, type, a0, a3); 3654 sub = 0xb8; /* B?C:A */ 3655 } 3656 goto gen_simd_imm8; 3657 3658 gen_simd_imm8: 3659 tcg_debug_assert(insn != OPC_UD2); 3660 tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type); 3661 tcg_out8(s, sub); 3662 break; 3663 3664 case INDEX_op_x86_psrldq_vec: 3665 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1); 3666 tcg_out8(s, a2); 3667 break; 3668 3669 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. 
*/ 3670 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */ 3671 default: 3672 g_assert_not_reached(); 3673 } 3674} 3675 3676static TCGConstraintSetIndex 3677tcg_target_op_def(TCGOpcode op, TCGType type, unsigned flags) 3678{ 3679 switch (op) { 3680 case INDEX_op_goto_ptr: 3681 return C_O0_I1(r); 3682 3683 case INDEX_op_ld8u_i32: 3684 case INDEX_op_ld8u_i64: 3685 case INDEX_op_ld8s_i32: 3686 case INDEX_op_ld8s_i64: 3687 case INDEX_op_ld16u_i32: 3688 case INDEX_op_ld16u_i64: 3689 case INDEX_op_ld16s_i32: 3690 case INDEX_op_ld16s_i64: 3691 case INDEX_op_ld_i32: 3692 case INDEX_op_ld32u_i64: 3693 case INDEX_op_ld32s_i64: 3694 case INDEX_op_ld_i64: 3695 return C_O1_I1(r, r); 3696 3697 case INDEX_op_st8_i32: 3698 case INDEX_op_st8_i64: 3699 return C_O0_I2(qi, r); 3700 3701 case INDEX_op_st16_i32: 3702 case INDEX_op_st16_i64: 3703 case INDEX_op_st_i32: 3704 case INDEX_op_st32_i64: 3705 return C_O0_I2(ri, r); 3706 3707 case INDEX_op_st_i64: 3708 return C_O0_I2(re, r); 3709 3710 case INDEX_op_mul_i32: 3711 case INDEX_op_mul_i64: 3712 return C_O1_I2(r, 0, re); 3713 3714 case INDEX_op_shl_i32: 3715 case INDEX_op_shl_i64: 3716 case INDEX_op_shr_i32: 3717 case INDEX_op_shr_i64: 3718 case INDEX_op_sar_i32: 3719 case INDEX_op_sar_i64: 3720 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci); 3721 3722 case INDEX_op_rotl_i32: 3723 case INDEX_op_rotl_i64: 3724 case INDEX_op_rotr_i32: 3725 case INDEX_op_rotr_i64: 3726 return C_O1_I2(r, 0, ci); 3727 3728 case INDEX_op_brcond_i32: 3729 case INDEX_op_brcond_i64: 3730 return C_O0_I2(r, reT); 3731 3732 case INDEX_op_bswap16_i32: 3733 case INDEX_op_bswap16_i64: 3734 case INDEX_op_bswap32_i32: 3735 case INDEX_op_bswap32_i64: 3736 case INDEX_op_bswap64_i64: 3737 case INDEX_op_extrh_i64_i32: 3738 return C_O1_I1(r, 0); 3739 3740 case INDEX_op_ext_i32_i64: 3741 case INDEX_op_extu_i32_i64: 3742 case INDEX_op_extrl_i64_i32: 3743 case INDEX_op_extract_i32: 3744 case INDEX_op_extract_i64: 3745 case INDEX_op_sextract_i32: 3746 case INDEX_op_sextract_i64: 3747 case INDEX_op_ctpop_i32: 3748 case INDEX_op_ctpop_i64: 3749 return C_O1_I1(r, r); 3750 3751 case INDEX_op_extract2_i32: 3752 case INDEX_op_extract2_i64: 3753 return C_O1_I2(r, 0, r); 3754 3755 case INDEX_op_deposit_i32: 3756 case INDEX_op_deposit_i64: 3757 return C_O1_I2(q, 0, qi); 3758 3759 case INDEX_op_setcond_i32: 3760 case INDEX_op_setcond_i64: 3761 case INDEX_op_negsetcond_i32: 3762 case INDEX_op_negsetcond_i64: 3763 return C_O1_I2(q, r, reT); 3764 3765 case INDEX_op_movcond_i32: 3766 case INDEX_op_movcond_i64: 3767 return C_O1_I4(r, r, reT, r, 0); 3768 3769 case INDEX_op_div2_i32: 3770 case INDEX_op_div2_i64: 3771 case INDEX_op_divu2_i32: 3772 case INDEX_op_divu2_i64: 3773 return C_O2_I3(a, d, 0, 1, r); 3774 3775 case INDEX_op_mulu2_i32: 3776 case INDEX_op_mulu2_i64: 3777 case INDEX_op_muls2_i32: 3778 case INDEX_op_muls2_i64: 3779 return C_O2_I2(a, d, a, r); 3780 3781 case INDEX_op_add2_i32: 3782 case INDEX_op_add2_i64: 3783 case INDEX_op_sub2_i32: 3784 case INDEX_op_sub2_i64: 3785 return C_N1_O1_I4(r, r, 0, 1, re, re); 3786 3787 case INDEX_op_ctz_i32: 3788 case INDEX_op_ctz_i64: 3789 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3790 3791 case INDEX_op_clz_i32: 3792 case INDEX_op_clz_i64: 3793 return have_lzcnt ? 
C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3794 3795 case INDEX_op_qemu_ld_i32: 3796 return C_O1_I1(r, L); 3797 3798 case INDEX_op_qemu_st_i32: 3799 return C_O0_I2(L, L); 3800 case INDEX_op_qemu_st8_i32: 3801 return C_O0_I2(s, L); 3802 3803 case INDEX_op_qemu_ld_i64: 3804 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L); 3805 3806 case INDEX_op_qemu_st_i64: 3807 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L); 3808 3809 case INDEX_op_qemu_ld_i128: 3810 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3811 return C_O2_I1(r, r, L); 3812 case INDEX_op_qemu_st_i128: 3813 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3814 return C_O0_I3(L, L, L); 3815 3816 case INDEX_op_brcond2_i32: 3817 return C_O0_I4(r, r, ri, ri); 3818 3819 case INDEX_op_setcond2_i32: 3820 return C_O1_I4(r, r, r, ri, ri); 3821 3822 case INDEX_op_ld_vec: 3823 case INDEX_op_dupm_vec: 3824 return C_O1_I1(x, r); 3825 3826 case INDEX_op_st_vec: 3827 return C_O0_I2(x, r); 3828 3829 case INDEX_op_add_vec: 3830 case INDEX_op_sub_vec: 3831 case INDEX_op_mul_vec: 3832 case INDEX_op_and_vec: 3833 case INDEX_op_or_vec: 3834 case INDEX_op_xor_vec: 3835 case INDEX_op_andc_vec: 3836 case INDEX_op_orc_vec: 3837 case INDEX_op_nand_vec: 3838 case INDEX_op_nor_vec: 3839 case INDEX_op_eqv_vec: 3840 case INDEX_op_ssadd_vec: 3841 case INDEX_op_usadd_vec: 3842 case INDEX_op_sssub_vec: 3843 case INDEX_op_ussub_vec: 3844 case INDEX_op_smin_vec: 3845 case INDEX_op_umin_vec: 3846 case INDEX_op_smax_vec: 3847 case INDEX_op_umax_vec: 3848 case INDEX_op_shlv_vec: 3849 case INDEX_op_shrv_vec: 3850 case INDEX_op_sarv_vec: 3851 case INDEX_op_rotlv_vec: 3852 case INDEX_op_rotrv_vec: 3853 case INDEX_op_shls_vec: 3854 case INDEX_op_shrs_vec: 3855 case INDEX_op_sars_vec: 3856 case INDEX_op_cmp_vec: 3857 case INDEX_op_x86_shufps_vec: 3858 case INDEX_op_x86_blend_vec: 3859 case INDEX_op_x86_packss_vec: 3860 case INDEX_op_x86_packus_vec: 3861 case INDEX_op_x86_vperm2i128_vec: 3862 case INDEX_op_x86_punpckl_vec: 3863 case INDEX_op_x86_punpckh_vec: 3864 case INDEX_op_x86_vpshldi_vec: 3865#if TCG_TARGET_REG_BITS == 32 3866 case INDEX_op_dup2_vec: 3867#endif 3868 return C_O1_I2(x, x, x); 3869 3870 case INDEX_op_abs_vec: 3871 case INDEX_op_dup_vec: 3872 case INDEX_op_not_vec: 3873 case INDEX_op_shli_vec: 3874 case INDEX_op_shri_vec: 3875 case INDEX_op_sari_vec: 3876 case INDEX_op_rotli_vec: 3877 case INDEX_op_x86_psrldq_vec: 3878 return C_O1_I1(x, x); 3879 3880 case INDEX_op_x86_vpshldv_vec: 3881 case INDEX_op_x86_vpshrdv_vec: 3882 return C_O1_I3(x, 0, x, x); 3883 3884 case INDEX_op_bitsel_vec: 3885 return C_O1_I3(x, x, x, x); 3886 case INDEX_op_cmpsel_vec: 3887 return C_O1_I4(x, x, x, xO, x); 3888 3889 default: 3890 return C_NotImplemented; 3891 } 3892} 3893 3894int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) 3895{ 3896 switch (opc) { 3897 case INDEX_op_add_vec: 3898 case INDEX_op_sub_vec: 3899 case INDEX_op_and_vec: 3900 case INDEX_op_or_vec: 3901 case INDEX_op_xor_vec: 3902 case INDEX_op_andc_vec: 3903 case INDEX_op_orc_vec: 3904 case INDEX_op_nand_vec: 3905 case INDEX_op_nor_vec: 3906 case INDEX_op_eqv_vec: 3907 case INDEX_op_not_vec: 3908 case INDEX_op_bitsel_vec: 3909 return 1; 3910 case INDEX_op_cmp_vec: 3911 case INDEX_op_cmpsel_vec: 3912 return -1; 3913 3914 case INDEX_op_rotli_vec: 3915 return have_avx512vl && vece >= MO_32 ? 1 : -1; 3916 3917 case INDEX_op_shli_vec: 3918 case INDEX_op_shri_vec: 3919 /* We must expand the operation for MO_8. */ 3920 return vece == MO_8 ? 
static void expand_vec_sari(TCGType type, unsigned vece,
                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t1, t2;

    switch (vece) {
    case MO_8:
        /* Unpack to 16-bit, shift, and repack. */
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case MO_64:
        t1 = tcg_temp_new_vec(type);
        if (imm <= 32) {
            /*
             * We can emulate a small sign extend by performing an arithmetic
             * 32-bit shift and overwriting the high half of a 64-bit logical
             * shift.  Note that the ISA says shift of 32 is valid, but TCG
             * does not, so we have to bound the smaller shift -- we get the
             * same result in the high half either way.
             */
            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
                      tcgv_vec_arg(t1), 0xaa);
        } else {
            /* Otherwise we will need to use a compare vs 0 to produce
             * the sign-extend, shift and merge.
             */
            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
                            tcg_constant_vec(type, MO_64, 0), v1);
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
            tcg_gen_or_vec(MO_64, v0, v0, t1);
        }
        tcg_temp_free_vec(t1);
        break;

    default:
        g_assert_not_reached();
    }
}

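/*
 * Illustrative sketch, not part of the backend: the MO_64 fallback above
 * materialises the sign as an all-ones/all-zeros word (compare against 0),
 * shifts it into the bits vacated by a logical right shift, and ORs the two
 * together.  Scalar equivalent for 1 <= n <= 63 (hypothetical helper;
 * assumes the usual two's-complement representation):
 */
#if 0
#include <stdint.h>

static int64_t sar64_model(int64_t x, unsigned n)
{
    uint64_t logical = (uint64_t)x >> n;             /* shri_vec, MO_64 */
    uint64_t sign = x < 0 ? ~UINT64_C(0) : 0;        /* cmp_vec(GT, 0, x) */
    return (int64_t)(logical | (sign << (64 - n)));  /* shli_vec + or_vec */
}
#endif
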
static void expand_vec_rotli(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t;

    if (vece != MO_8 && have_avx512vbmi2) {
        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
        return;
    }

    t = tcg_temp_new_vec(type);
    tcg_gen_shli_vec(vece, t, v1, imm);
    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
    tcg_gen_or_vec(vece, v0, v0, t);
    tcg_temp_free_vec(t);
}

static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
                            TCGv_vec v1, TCGv_vec sh, bool right)
{
    TCGv_vec t;

    if (have_avx512vbmi2) {
        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
        return;
    }

    t = tcg_temp_new_vec(type);
    tcg_gen_dupi_vec(vece, t, 8 << vece);
    tcg_gen_sub_vec(vece, t, t, sh);
    if (right) {
        tcg_gen_shlv_vec(vece, t, v1, t);
        tcg_gen_shrv_vec(vece, v0, v1, sh);
    } else {
        tcg_gen_shrv_vec(vece, t, v1, t);
        tcg_gen_shlv_vec(vece, v0, v1, sh);
    }
    tcg_gen_or_vec(vece, v0, v0, t);
    tcg_temp_free_vec(t);
}

static void expand_vec_rotls(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
{
    TCGv_vec t = tcg_temp_new_vec(type);

    tcg_debug_assert(vece != MO_8);

    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
        tcg_gen_dup_i32_vec(vece, t, lsh);
        if (vece >= MO_32) {
            tcg_gen_rotlv_vec(vece, v0, v1, t);
        } else {
            expand_vec_rotv(type, vece, v0, v1, t, false);
        }
    } else {
        TCGv_i32 rsh = tcg_temp_new_i32();

        tcg_gen_neg_i32(rsh, lsh);
        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
        tcg_gen_shls_vec(vece, t, v1, lsh);
        tcg_gen_shrs_vec(vece, v0, v1, rsh);
        tcg_gen_or_vec(vece, v0, v0, t);

        tcg_temp_free_i32(rsh);
    }

    tcg_temp_free_vec(t);
}

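/*
 * Illustrative sketch, not part of the backend: without a native vector
 * rotate, the expansions above compose rotl from a left shift and a right
 * shift by the complementary count, then OR the two halves.  One 32-bit
 * lane in scalar form (hypothetical helper; the masking mirrors how
 * expand_vec_rotls derives the right-shift count from the negated left
 * count):
 */
#if 0
#include <stdint.h>

static uint32_t rotl32_model(uint32_t x, unsigned n)
{
    unsigned l = n & 31;
    unsigned r = (32 - l) & 31;     /* == (-l) & 31; both shifts stay < 32 */
    return (x << l) | (x >> r);
}
#endif
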
static void expand_vec_mul(TCGType type, unsigned vece,
                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
{
    TCGv_vec t1, t2, t3, t4, zero;

    tcg_debug_assert(vece == MO_8);

    /*
     * Unpack v1 bytes to words, 0 | x.
     * Unpack v2 bytes to words, y | 0.
     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
     * Shift logical right by 8 bits to clear the high 8 bits before
     * using an unsigned saturated pack.
     *
     * The difference between the V64, V128 and V256 cases is merely how
     * we distribute the expansion between temporaries.
     */
    switch (type) {
    case TCG_TYPE_V64:
        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case TCG_TYPE_V128:
    case TCG_TYPE_V256:
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        t3 = tcg_temp_new_vec(type);
        t4 = tcg_temp_new_vec(type);
        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_mul_vec(MO_16, t3, t3, t4);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        tcg_gen_shri_vec(MO_16, t3, t3, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        tcg_temp_free_vec(t3);
        tcg_temp_free_vec(t4);
        break;

    default:
        g_assert_not_reached();
    }
}

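/*
 * Illustrative sketch, not part of the backend: in the expansion above, one
 * operand sits in the low byte (0 | x) and the other in the high byte
 * (y | 0) of each 16-bit lane, so the 16-bit product equals (x * y) << 8.
 * A logical right shift by 8 then leaves the low 8 bits of x * y with a
 * zero high byte, ready for the unsigned saturating pack.  One lane in
 * scalar form (hypothetical helper):
 */
#if 0
#include <stdint.h>

static uint8_t mul8_model(uint8_t x, uint8_t y)
{
    uint16_t a = x;                     /* 0 | x */
    uint16_t b = (uint16_t)(y << 8);    /* y | 0 */
    uint16_t prod = (uint16_t)(a * b);  /* == (x * y) << 8, mod 2^16 */
    return (uint8_t)(prod >> 8);        /* low 8 bits of x * y */
}
#endif
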
static TCGCond expand_vec_cond(TCGType type, unsigned vece,
                               TCGArg *a1, TCGArg *a2, TCGCond cond)
{
    /*
     * Without AVX512, there are no 64-bit unsigned comparisons.
     * We must bias the inputs so that they become signed.
     * All other swapping and inversion are handled during code generation.
     */
    if (vece == MO_64 && !have_avx512dq && is_unsigned_cond(cond)) {
        TCGv_vec v1 = temp_tcgv_vec(arg_temp(*a1));
        TCGv_vec v2 = temp_tcgv_vec(arg_temp(*a2));
        TCGv_vec t1 = tcg_temp_new_vec(type);
        TCGv_vec t2 = tcg_temp_new_vec(type);
        TCGv_vec t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));

        tcg_gen_sub_vec(vece, t1, v1, t3);
        tcg_gen_sub_vec(vece, t2, v2, t3);
        *a1 = tcgv_vec_arg(t1);
        *a2 = tcgv_vec_arg(t2);
        cond = tcg_signed_cond(cond);
    }
    return cond;
}

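/*
 * Illustrative sketch, not part of the backend: subtracting 2**63 (the t3
 * constant above) from both operands maps the unsigned number line onto the
 * signed one, so a signed comparison of the biased values yields the
 * unsigned result.  Scalar form (hypothetical helper; assumes the usual
 * two's-complement behaviour of the casts):
 */
#if 0
#include <stdint.h>
#include <stdbool.h>

static bool ltu64_model(uint64_t x, uint64_t y)
{
    int64_t bx = (int64_t)(x - UINT64_C(0x8000000000000000));
    int64_t by = (int64_t)(y - UINT64_C(0x8000000000000000));
    return bx < by;                     /* equals x < y as unsigned */
}
#endif
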
static void expand_vec_cmp(TCGType type, unsigned vece, TCGArg a0,
                           TCGArg a1, TCGArg a2, TCGCond cond)
{
    cond = expand_vec_cond(type, vece, &a1, &a2, cond);
    /* Expand directly; do not recurse. */
    vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
}

static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGArg a0,
                              TCGArg a1, TCGArg a2,
                              TCGArg a3, TCGArg a4, TCGCond cond)
{
    cond = expand_vec_cond(type, vece, &a1, &a2, cond);
    /* Expand directly; do not recurse. */
    vec_gen_6(INDEX_op_cmpsel_vec, type, vece, a0, a1, a2, a3, a4, cond);
}

void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                       TCGArg a0, ...)
{
    va_list va;
    TCGArg a1, a2, a3, a4, a5;
    TCGv_vec v0, v1, v2;

    va_start(va, a0);
    a1 = va_arg(va, TCGArg);
    a2 = va_arg(va, TCGArg);
    v0 = temp_tcgv_vec(arg_temp(a0));
    v1 = temp_tcgv_vec(arg_temp(a1));

    switch (opc) {
    case INDEX_op_shli_vec:
        expand_vec_shi(type, vece, false, v0, v1, a2);
        break;
    case INDEX_op_shri_vec:
        expand_vec_shi(type, vece, true, v0, v1, a2);
        break;
    case INDEX_op_sari_vec:
        expand_vec_sari(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotli_vec:
        expand_vec_rotli(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotls_vec:
        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
        break;

    case INDEX_op_rotlv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, false);
        break;
    case INDEX_op_rotrv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, true);
        break;

    case INDEX_op_mul_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_mul(type, vece, v0, v1, v2);
        break;

    case INDEX_op_cmp_vec:
        a3 = va_arg(va, TCGArg);
        expand_vec_cmp(type, vece, a0, a1, a2, a3);
        break;

    case INDEX_op_cmpsel_vec:
        a3 = va_arg(va, TCGArg);
        a4 = va_arg(va, TCGArg);
        a5 = va_arg(va, TCGArg);
        expand_vec_cmpsel(type, vece, a0, a1, a2, a3, a4, a5);
        break;

    default:
        break;
    }

    va_end(va);
}

static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};

/* Compute frame size via macros, to share between tcg_target_qemu_prologue
   and tcg_register_jit. */

#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))

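/*
 * Illustrative sketch, not part of the backend: FRAME_SIZE above uses the
 * standard round-up-to-alignment pattern, adding (align - 1) and clearing
 * the low bits.  Generic scalar form (hypothetical helper; for example,
 * align_up_model(264, 16) == 272):
 */
#if 0
#include <stddef.h>

static size_t align_up_model(size_t n, size_t align)   /* align: power of 2 */
{
    return (n + align - 1) & ~(align - 1);
}
#endif
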
/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* TB prologue */

    /* Reserve some stack space, also for TCG temps. */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers. */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

    if (!tcg_use_softmmu && guest_base) {
        int seg = setup_guest_base_seg();
        if (seg != 0) {
            x86_guest_base.seg = seg;
        } else if (guest_base == (int32_t)guest_base) {
            x86_guest_base.ofs = guest_base;
        } else {
            assert(TCG_TARGET_REG_BITS == 64);
            /* Choose R12 because, as a base, it requires a SIB byte. */
            x86_guest_base.index = TCG_REG_R12;
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
        }
    }

    if (TCG_TARGET_REG_BITS == 32) {
        tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
                   (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb. */
        tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                             (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                             + stack_addend);
    } else {
        tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb. */
        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
    }

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);

    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
}

static void tcg_out_tb_start(TCGContext *s)
{
    /* nothing to do */
}

static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
    memset(p, 0x90, count);
}

static void tcg_target_init(TCGContext *s)
{
    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
    if (TCG_TARGET_REG_BITS == 64) {
        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
    }
    if (have_avx1) {
        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
    }
    if (have_avx2) {
        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
    }

    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
    if (TCG_TARGET_REG_BITS == 64) {
#if !defined(_WIN64)
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
#endif
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
    }

    s->reserved_regs = 0;
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
#ifdef _WIN64
    /* These are call saved, and we don't save them, so don't use them. */
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
#endif
}

typedef struct {
    DebugFrameHeader h;
    uint8_t fde_def_cfa[4];
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/* We're expecting a 2 byte uleb128 encoded value. */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));

#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member. */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs. */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member. */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs. */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif

#if defined(ELF_HOST_MACHINE)
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif
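
/*
 * Illustrative sketch, not part of the backend: .fde_def_cfa above hard-codes
 * FRAME_SIZE as a two-byte uleb128 (low 7 bits with the continuation bit set,
 * then the remaining bits), which is why QEMU_BUILD_BUG_ON checks that the
 * value fits in 14 bits.  Hypothetical helper showing the same encoding:
 */
#if 0
#include <stdint.h>

static void uleb128_2byte_model(uint32_t value, uint8_t out[2])
{
    /* Assumes value < (1 << 14), as the build-time assertion guarantees. */
    out[0] = (value & 0x7f) | 0x80;     /* low 7 bits, continuation bit set */
    out[1] = (uint8_t)(value >> 7);     /* remaining bits */
}
#endif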